1 /*
2 * Copyright (c) 2020-2021 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 #include <sys/sysctl.h>
30
31 #include <netinet/in.h>
32 #include <netinet/ip.h>
33 #include <netinet/ip6.h>
34 #include <netinet/ip_var.h>
35
36 #include <netinet/tcp.h>
37 #include <netinet/tcp_fsm.h>
38 #include <netinet/tcp_seq.h>
39 #include <netinet/tcp_var.h>
40 #include <netinet/tcp_cc.h>
41
42 /*
43 * This file implements a LBE congestion control algorithm
44 * to compute the receive window of a background transport
45 * which uses same algorithm as ledbat-plus-plus.
46 */
47
48 #define GAIN_CONSTANT (16)
49 #define DEFER_SLOWDOWN_DURATION (30 * 1000) /* 30s */
50
/* Toggle for using receive-side LEDBAT (enabled by default) */
SYSCTL_SKMEM_TCP_INT(OID_AUTO, rledbat, CTLFLAG_RW | CTLFLAG_LOCKED,
    int, tcp_rledbat, 1, "Use Receive LEDBAT");

/* Forward declarations of the receive-side CC callbacks defined below */
void tcp_rledbat_init(struct tcpcb *tp);
void tcp_rledbat_cleanup(struct tcpcb *tp);
void tcp_rledbat_rwnd_init(struct tcpcb *tp);
void tcp_rledbat_data_rcvd(struct tcpcb *tp, struct tcphdr *th,
    struct tcpopt *to, uint32_t segment_len);
uint32_t tcp_rledbat_get_rlwin(struct tcpcb *tp);
void tcp_rledbat_after_idle(struct tcpcb *tp);
void tcp_rledbat_switch_to(struct tcpcb *tp);

/*
 * Receive-side congestion control algorithm vector for rledbat;
 * the TCP input path dispatches through these callbacks.
 */
struct tcp_rcv_cc_algo tcp_cc_rledbat = {
	.name = "rledbat",
	.init = tcp_rledbat_init,
	.cleanup = tcp_rledbat_cleanup,
	.rwnd_init = tcp_rledbat_rwnd_init,
	.data_rcvd = tcp_rledbat_data_rcvd,
	.get_rlwin = tcp_rledbat_get_rlwin,
	.after_idle = tcp_rledbat_after_idle,
	.switch_to = tcp_rledbat_switch_to,
};
73
74 static inline void
rledbat_clear_state(struct tcpcb * tp)75 rledbat_clear_state(struct tcpcb *tp)
76 {
77 tp->t_rlstate.num_slowdown_events = 0;
78 tp->t_rlstate.slowdown_ts = 0;
79 tp->t_rlstate.slowdown_begin = 0;
80 tp->t_rlstate.rcvd_bytes = 0;
81 tp->t_rlstate.md_rcvd_bytes = 0;
82 tp->t_rlstate.drained_bytes = 0;
83 }
84
85 void
tcp_rledbat_init(struct tcpcb * tp)86 tcp_rledbat_init(struct tcpcb *tp)
87 {
88 OSIncrementAtomic((volatile int *)&tcp_cc_rledbat.num_sockets);
89 rledbat_clear_state(tp);
90
91 tp->t_rlstate.win = tp->t_maxseg * bg_ss_fltsz;
92 tp->t_rlstate.ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
93 }
94
/*
 * Per-connection teardown: drop the global rledbat socket count
 * taken in tcp_rledbat_init().
 */
void
tcp_rledbat_cleanup(struct tcpcb *tp)
{
#pragma unused(tp)
	OSDecrementAtomic((volatile int *)&tcp_cc_rledbat.num_sockets);
}
101
102 /*
103 * Initialize the receive window for a connection
104 */
105 void
tcp_rledbat_rwnd_init(struct tcpcb * tp)106 tcp_rledbat_rwnd_init(struct tcpcb *tp)
107 {
108 tp->t_rlstate.win = tp->t_maxseg * bg_ss_fltsz;
109
110 /* If the ssthresh hasn't been set, do it now */
111 if (tp->t_rlstate.ssthresh == 0) {
112 tp->t_rlstate.ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
113 }
114 }
115
116 /*
117 * Compute the denominator
118 * MIN(16, ceil(2 * TARGET / base))
119 */
120 static uint32_t
rledbat_gain(uint32_t base_rtt)121 rledbat_gain(uint32_t base_rtt)
122 {
123 return MIN(GAIN_CONSTANT, tcp_ceil(2 * target_qdelay /
124 (double)base_rtt));
125 }
126
127 /*
128 * Congestion avoidance for ledbat++
129 */
130 static void
rledbat_congestion_avd(struct tcpcb * tp,uint32_t segment_len,uint32_t base_rtt,uint32_t curr_rtt)131 rledbat_congestion_avd(struct tcpcb *tp, uint32_t segment_len,
132 uint32_t base_rtt, uint32_t curr_rtt)
133 {
134 uint32_t update = 0;
135 /*
136 * Set the next slowdown time i.e. 9 times the duration
137 * of previous slowdown except the initial slowdown.
138 */
139 if (tp->t_rlstate.slowdown_ts == 0) {
140 uint32_t slowdown_duration = 0;
141 if (tp->t_rlstate.num_slowdown_events > 0) {
142 slowdown_duration = tcp_now -
143 tp->t_rlstate.slowdown_begin;
144
145 if (tp->t_rlstate.ssthresh > tp->t_rlstate.win) {
146 /*
147 * Special case for slowdowns (other than initial)
148 * where cwnd doesn't recover fully to previous
149 * ssthresh
150 */
151 slowdown_duration *= 2;
152 }
153 }
154 tp->t_rlstate.slowdown_ts = tcp_now +
155 (9 * slowdown_duration);
156 if (slowdown_duration == 0) {
157 tp->t_rlstate.slowdown_ts += (2 * (tp->rcv_srtt >> TCP_RTT_SHIFT));
158 }
159 /* Reset the start */
160 tp->t_rlstate.slowdown_begin = 0;
161
162 /* On exit slow start due to higher qdelay, cap the ssthresh */
163 if (tp->t_rlstate.ssthresh > tp->t_rlstate.win) {
164 tp->t_rlstate.ssthresh = tp->t_rlstate.win;
165 }
166 }
167
168 if (curr_rtt <= base_rtt + (uint32_t)target_qdelay) {
169 /* Additive increase */
170 tp->t_rlstate.rcvd_bytes += segment_len;
171 if (tp->t_rlstate.rcvd_bytes >= tp->t_rlstate.win) {
172 update = tp->t_maxseg;
173 tp->t_rlstate.rcvd_bytes -= tp->t_rlstate.win;
174 /*
175 * Move background slow-start threshold to current
176 * congestion window so that the next time (after some idle
177 * period), we can attempt to do slow-start till here if there
178 * is no increase in rtt
179 */
180 if (tp->t_rlstate.ssthresh < tp->t_rlstate.win) {
181 tp->t_rlstate.ssthresh = tp->t_rlstate.win;
182 }
183 tp->t_rlstate.win += update;
184 tp->t_rlstate.win = min(tcp_round_to(tp->t_rlstate.win,
185 tp->t_maxseg), TCP_MAXWIN << tp->rcv_scale);
186 }
187 } else {
188 /*
189 * If we are still within 1 RTT of previous reduction
190 * due to loss, do nothing
191 */
192 if (tcp_now < tp->t_rlstate.reduction_end) {
193 return;
194 }
195 /*
196 * Multiplicative decrease
197 * W -= min(W * (qdelay/target - 1), W/2) (per RTT)
198 * To calculate per bytes acked, it becomes
199 * W -= min((qdelay/target - 1), 1/2) * bytes_acked
200 */
201 uint32_t qdelay = curr_rtt > base_rtt ?
202 (curr_rtt - base_rtt) : 0;
203
204 tp->t_rlstate.md_rcvd_bytes += segment_len;
205 if (tp->t_rlstate.md_rcvd_bytes >= tp->t_rlstate.win) {
206 update = (uint32_t)(MIN(((double)qdelay / target_qdelay - 1), 0.5) *
207 (double)tp->t_rlstate.win);
208 tp->t_rlstate.md_rcvd_bytes -= tp->t_rlstate.win;
209 tp->t_rlstate.win -= update;
210
211 if (tp->t_rlstate.win < bg_ss_fltsz * tp->t_maxseg) {
212 tp->t_rlstate.win = bg_ss_fltsz * tp->t_maxseg;
213 }
214
215 tp->t_rlstate.win = tcp_round_to(tp->t_rlstate.win, tp->t_maxseg);
216 /*
217 * Lower background slow-start threshold so that the connection
218 * will stay in congestion avoidance phase
219 */
220 if (tp->t_rlstate.ssthresh > tp->t_rlstate.win) {
221 tp->t_rlstate.ssthresh = tp->t_rlstate.win;
222 }
223
224 if (tp->t_rlstate.slowdown_ts != 0) {
225 /* As the window has been reduced, defer the slowdown */
226 tp->t_rlstate.slowdown_ts = tcp_now +
227 DEFER_SLOWDOWN_DURATION;
228 }
229 }
230 }
231 }
232
233 /*
234 * Update win based on ledbat++ algo
235 */
236 void
tcp_rledbat_data_rcvd(struct tcpcb * tp,struct tcphdr * th,struct tcpopt * to,uint32_t segment_len)237 tcp_rledbat_data_rcvd(struct tcpcb *tp, struct tcphdr *th,
238 struct tcpopt *to, uint32_t segment_len)
239 {
240 uint32_t update = 0;
241 const uint32_t base_rtt = get_base_rtt(tp);
242 const uint32_t curr_rtt = tcp_use_min_curr_rtt ? tp->curr_rtt_min :
243 tp->t_rttcur;
244 const uint32_t ss_target = (uint32_t)(3 * target_qdelay / 4);
245 tp->t_rlstate.drained_bytes += segment_len;
246
247 /*
248 * Slowdown period - first slowdown
249 * is 2RTT after we exit initial slow start.
250 * Subsequent slowdowns are after 9 times the
251 * previous slow down durations.
252 */
253 if (tp->t_rlstate.slowdown_ts != 0 &&
254 tcp_now >= tp->t_rlstate.slowdown_ts) {
255 if (tp->t_rlstate.slowdown_begin == 0) {
256 tp->t_rlstate.slowdown_begin = tcp_now;
257 tp->t_rlstate.num_slowdown_events++;
258 }
259 if (tcp_now < tp->t_rlstate.slowdown_ts +
260 (2 * (tp->rcv_srtt >> TCP_RTT_SHIFT))) {
261 // Set rwnd to 2 packets and return
262 if (tp->t_rlstate.win > bg_ss_fltsz * tp->t_maxseg) {
263 if (tp->t_rlstate.ssthresh < tp->t_rlstate.win) {
264 tp->t_rlstate.ssthresh = tp->t_rlstate.win;
265 }
266 tp->t_rlstate.win = bg_ss_fltsz * tp->t_maxseg;
267 /* Reset total bytes acked */
268 tp->t_rlstate.rcvd_bytes = 0;
269 }
270 return;
271 }
272 }
273
274 /*
275 * Detect retransmissions first by checking if the current
276 * received sequence is smaller than largest and its
277 * timestamp is higher than the largest so far. Reduce
278 * win based on fast recovery only once per effective RTT.
279 *
280 * Note: As we are detecting retransmissions (not packet loss),
281 * we are giving some leeway for the next window reduction.
282 */
283 if (SEQ_LT(th->th_seq + segment_len, tp->rcv_high) &&
284 TSTMP_GEQ(to->to_tsval, tp->tsv_high)) {
285 if (tcp_now < tp->t_rlstate.reduction_end) {
286 /* still need to wait for reduction end to elapse */
287 return;
288 }
289
290 uint32_t win = tp->t_rlstate.win / 2;
291 win = tcp_round_to(win, tp->t_maxseg);
292 if (win < 2 * tp->t_maxseg) {
293 win = 2 * tp->t_maxseg;
294 }
295 tp->t_rlstate.ssthresh = win;
296 tp->t_rlstate.win = win;
297
298 /* Reset the received bytes */
299 tp->t_rlstate.rcvd_bytes = 0;
300 tp->t_rlstate.md_rcvd_bytes = 0;
301
302 /* Update the reduction end time */
303 tp->t_rlstate.reduction_end = tcp_now + 2 *
304 (tp->rcv_srtt >> TCP_RTT_SHIFT);
305
306 if (tp->t_rlstate.slowdown_ts != 0) {
307 /* As the window has been halved, defer the slowdown. */
308 tp->t_rlstate.slowdown_ts = tcp_now +
309 DEFER_SLOWDOWN_DURATION;
310 }
311 return;
312 }
313
314 /* Now we can do slow start or CA */
315 if (curr_rtt == 0 || base_rtt == 0) {
316 update = MIN(segment_len, TCP_CC_CWND_INIT_PKTS *
317 tp->t_maxseg);
318 tp->t_rlstate.win += update;
319 tp->t_rlstate.win = min(tp->t_rlstate.win,
320 TCP_MAXWIN << tp->rcv_scale);
321 } else if (tp->t_rlstate.win < tp->t_rlstate.ssthresh &&
322 ((tp->t_rlstate.num_slowdown_events > 0 &&
323 curr_rtt <= (base_rtt + (uint32_t)target_qdelay)) ||
324 curr_rtt <= (base_rtt + ss_target))) {
325 /*
326 * Modified slow start with a dynamic GAIN
327 * If the queuing delay is larger than 3/4 of the target
328 * delay, exit slow start, iff, it is the initial slow start.
329 * After the initial slow start, during CA, window growth
330 * will be bound by ssthresh.
331 */
332 tp->t_rlstate.rcvd_bytes += segment_len;
333 uint32_t gain_factor = rledbat_gain(base_rtt);
334 if (tp->t_rlstate.rcvd_bytes >= tp->t_maxseg * gain_factor) {
335 update = MIN(tp->t_rlstate.rcvd_bytes / gain_factor,
336 TCP_CC_CWND_INIT_PKTS * tp->t_maxseg);
337 tp->t_rlstate.rcvd_bytes = 0;
338 tp->t_rlstate.win += update;
339 tp->t_rlstate.win = min(tcp_round_to(tp->t_rlstate.win,
340 tp->t_maxseg), TCP_MAXWIN << tp->rcv_scale);
341 }
342
343 /* Reset the next slowdown timestamp */
344 if (tp->t_rlstate.slowdown_ts != 0) {
345 tp->t_rlstate.slowdown_ts = 0;
346 }
347 } else {
348 /* Congestion avoidance */
349 rledbat_congestion_avd(tp, segment_len, base_rtt, curr_rtt);
350 }
351 }
352
353 uint32_t
tcp_rledbat_get_rlwin(struct tcpcb * tp)354 tcp_rledbat_get_rlwin(struct tcpcb *tp)
355 {
356 /* rlwin is either greater or smaller by at most drained bytes */
357 if (tp->t_rlstate.win > tp->t_rlstate.win_ws ||
358 tp->t_rlstate.win_ws - tp->t_rlstate.win <
359 tp->t_rlstate.drained_bytes) {
360 tp->t_rlstate.win_ws = tp->t_rlstate.win;
361 } else if (tp->t_rlstate.win < tp->t_rlstate.win_ws) {
362 /*
363 * rlwin is smaller, decrease the advertised window
364 * only by drained bytes at a time
365 */
366 tp->t_rlstate.win_ws = tp->t_rlstate.win_ws -
367 tp->t_rlstate.drained_bytes;
368 }
369 tp->t_rlstate.drained_bytes = 0;
370 /* Round up to the receive window scale */
371 tp->t_rlstate.win_ws = tcp_round_up(tp->t_rlstate.win_ws,
372 1 << tp->rcv_scale);
373
374 return tp->t_rlstate.win_ws;
375 }
376
377 /*
378 * Function to handle connections that have been idle for
379 * some time. Slow start to get ack "clock" running again.
380 * Clear base history after idle time.
381 */
382 void
tcp_rledbat_after_idle(struct tcpcb * tp)383 tcp_rledbat_after_idle(struct tcpcb *tp)
384 {
385 rledbat_clear_state(tp);
386 /* Reset the rledbat window */
387 tp->t_rlstate.win = tp->t_maxseg * bg_ss_fltsz;
388 }
389
390 void
tcp_rledbat_switch_to(struct tcpcb * tp)391 tcp_rledbat_switch_to(struct tcpcb *tp)
392 {
393 rledbat_clear_state(tp);
394
395 uint32_t win = 0;
396
397 if (tp->t_rlstate.win == 0) {
398 /*
399 * Use half of previous window, the algorithm
400 * will quickly reduce the window if there is still
401 * high queueing delay.
402 */
403 win = (tp->rcv_adv - tp->rcv_nxt) / 2;
404 } else {
405 /* Reduce the window by half from the previous value */
406 win = tp->t_rlstate.win / 2;
407 }
408
409 win = tcp_round_to(win, tp->t_maxseg);
410 if (win < bg_ss_fltsz * tp->t_maxseg) {
411 win = bg_ss_fltsz * tp->t_maxseg;
412 }
413 tp->t_rlstate.win = win;
414
415 /* ssthresh should be at most the inital value */
416 if (tp->t_rlstate.ssthresh == 0) {
417 tp->t_rlstate.ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
418 } else {
419 tp->t_rlstate.ssthresh = MIN(tp->t_rlstate.ssthresh,
420 TCP_MAXWIN << TCP_MAX_WINSHIFT);
421 }
422 }
423