1 /*
2 * Copyright (c) 2013-2021 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 #include "tcp_includes.h"
30
31 #include <sys/param.h>
32 #include <sys/kernel.h>
33 #include <sys/syslog.h>
34
35 #include <netinet/in.h>
36 #include <netinet/in_systm.h>
37 #include <netinet/ip.h>
38 #include <netinet/ip6.h>
39 #include <netinet/ip_var.h>
40
41 static int tcp_cubic_init(struct tcpcb *tp);
42 static int tcp_cubic_cleanup(struct tcpcb *tp);
43 static void tcp_cubic_cwnd_init_or_reset(struct tcpcb *tp);
44 static void tcp_cubic_congestion_avd(struct tcpcb *tp, struct tcphdr *th);
45 static void tcp_cubic_ack_rcvd(struct tcpcb *tp, struct tcphdr *th);
46 static void tcp_cubic_pre_fr(struct tcpcb *tp);
47 static void tcp_cubic_post_fr(struct tcpcb *tp, struct tcphdr *th);
48 static void tcp_cubic_after_timeout(struct tcpcb *tp);
49 static int tcp_cubic_delay_ack(struct tcpcb *tp, struct tcphdr *th);
50 static void tcp_cubic_switch_cc(struct tcpcb *tp);
51 static uint32_t tcp_cubic_update(struct tcpcb *tp, uint32_t rtt);
52 static inline void tcp_cubic_clear_state(struct tcpcb *tp);
53
54 extern float cbrtf(float x);
55
56 struct tcp_cc_algo tcp_cc_cubic = {
57 .name = "cubic",
58 .init = tcp_cubic_init,
59 .cleanup = tcp_cubic_cleanup,
60 .cwnd_init = tcp_cubic_cwnd_init_or_reset,
61 .congestion_avd = tcp_cubic_congestion_avd,
62 .ack_rcvd = tcp_cubic_ack_rcvd,
63 .pre_fr = tcp_cubic_pre_fr,
64 .post_fr = tcp_cubic_post_fr,
65 .after_idle = tcp_cubic_cwnd_init_or_reset,
66 .after_timeout = tcp_cubic_after_timeout,
67 .delay_ack = tcp_cubic_delay_ack,
68 .switch_to = tcp_cubic_switch_cc
69 };
70
71 static float tcp_cubic_backoff = 0.2f; /* multiplicative decrease factor */
72 static float tcp_cubic_coeff = 0.4f;
73 static float tcp_cubic_fast_convergence_factor = 0.875f;
74
75 static float tcp_cubic_beta = 0.8f;
76
77 static int
tcp_cubic_init(struct tcpcb * tp)78 tcp_cubic_init(struct tcpcb *tp)
79 {
80 os_atomic_inc(&tcp_cc_cubic.num_sockets, relaxed);
81
82 tcp_cubic_backoff = 0.3f; /* multiplicative decrease factor */
83 tcp_cubic_fast_convergence_factor = 0.85f;
84 tcp_cubic_beta = 0.7f;
85
86 VERIFY(tp->t_ccstate != NULL);
87 tcp_cubic_clear_state(tp);
88 return 0;
89 }
90
91 static int
tcp_cubic_cleanup(struct tcpcb * tp)92 tcp_cubic_cleanup(struct tcpcb *tp)
93 {
94 #pragma unused(tp)
95 os_atomic_dec(&tcp_cc_cubic.num_sockets, relaxed);
96 return 0;
97 }
98
99 /*
100 * Initialize the congestion window at the beginning of a connection or
101 * after idle time
102 */
103 static void
tcp_cubic_cwnd_init_or_reset(struct tcpcb * tp)104 tcp_cubic_cwnd_init_or_reset(struct tcpcb *tp)
105 {
106 VERIFY(tp->t_ccstate != NULL);
107
108 tcp_cubic_clear_state(tp);
109 tcp_cc_cwnd_init_or_reset(tp);
110 tp->t_pipeack = 0;
111 tcp_clear_pipeack_state(tp);
112
113 /* Start counting bytes for RFC 3465 again */
114 tp->t_bytes_acked = 0;
115
116 /*
117 * slow start threshold could get initialized to a lower value
118 * when there is a cached value in the route metrics. In this case,
119 * the connection can enter congestion avoidance without any packet
120 * loss and Cubic will enter steady-state too early. It is better
121 * to always probe to find the initial slow-start threshold.
122 */
123 if (tp->t_inpcb->inp_mstat.ms_total.ts_txbytes <= tcp_initial_cwnd(tp) &&
124 tp->snd_ssthresh < (TCP_MAXWIN << TCP_MAX_WINSHIFT)) {
125 tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
126 }
127
128 /* Initialize cubic last max to be same as ssthresh */
129 tp->t_ccstate->cub_last_max = tp->snd_ssthresh;
130
131 /* Set initial pacer state */
132 tcp_update_pacer_state(tp);
133 }
134
135 /*
136 * Compute the target congestion window for the next RTT according to
137 * cubic equation when an ack is received.
138 *
139 * W(t) = C(t-K)^3 + W(last_max)
140 */
141 static uint32_t
tcp_cubic_update(struct tcpcb * tp,uint32_t rtt)142 tcp_cubic_update(struct tcpcb *tp, uint32_t rtt)
143 {
144 struct tcp_globals *globals = tcp_get_globals(tp);
145 float K, var;
146 uint32_t elapsed_time, win;
147
148 win = min(tp->snd_cwnd, tp->snd_wnd);
149 if (tp->t_ccstate->cub_last_max == 0) {
150 tp->t_ccstate->cub_last_max = tp->snd_ssthresh;
151 }
152
153 if (tp->t_ccstate->cub_epoch_start == 0) {
154 /*
155 * This is the beginning of a new epoch, initialize some of
156 * the variables that we need to use for computing the
157 * congestion window later.
158 */
159 tp->t_ccstate->cub_epoch_start = tcp_globals_now(globals);
160 if (tp->t_ccstate->cub_epoch_start == 0) {
161 tp->t_ccstate->cub_epoch_start = 1;
162 }
163 if (win < tp->t_ccstate->cub_last_max) {
164 /*
165 * Compute cubic epoch period, this is the time
166 * period that the window will take to increase to
167 * last_max again after backoff due to loss.
168 */
169 K = ((float)tp->t_ccstate->cub_last_max - win) / tp->t_maxseg / tcp_cubic_coeff;
170 K = cbrtf(K);
171 tp->t_ccstate->cub_epoch_period = K * TCP_RETRANSHZ;
172 /* Origin point */
173 tp->t_ccstate->cub_origin_point = tp->t_ccstate->cub_last_max;
174 } else {
175 tp->t_ccstate->cub_epoch_period = 0;
176 tp->t_ccstate->cub_origin_point = win;
177 }
178 }
179
180 VERIFY(tp->t_ccstate->cub_origin_point > 0);
181 /*
182 * Compute the target window for the next RTT using smoothed RTT
183 * as an estimate for next RTT.
184 */
185 elapsed_time = timer_diff(tcp_globals_now(globals), 0, tp->t_ccstate->cub_epoch_start, 0);
186
187 if (tcp_cubic_use_minrtt) {
188 elapsed_time += max(tcp_cubic_use_minrtt, rtt);
189 } else {
190 elapsed_time += rtt;
191 }
192 var = (elapsed_time - tp->t_ccstate->cub_epoch_period) / TCP_RETRANSHZ;
193 var = var * var * var * (tcp_cubic_coeff * tp->t_maxseg);
194
195 return (uint32_t)(tp->t_ccstate->cub_origin_point + var);
196 }
197
198 /*
199 * Standard TCP utilizes bandwidth well in low RTT and low BDP connections
200 * even when there is some packet loss. Enabling TCP mode will help Cubic
201 * to achieve this kind of utilization.
202 *
203 * But if there is a bottleneck link in the path with a fixed size queue
204 * and fixed bandwidth, TCP Cubic will help to reduce packet loss at this
205 * link because of the steady-state behavior. Using average and mean
206 * absolute deviation of W(lastmax), we try to detect if the congestion
207 * window is close to the bottleneck bandwidth. In that case, disabling
208 * TCP mode will help to minimize packet loss at this link.
209 *
210 * Disable TCP mode if the W(lastmax) (the window where previous packet
211 * loss happened) is within a small range from the average last max
212 * calculated.
213 */
214 #define TCP_CUBIC_ENABLE_TCPMODE(_tp_) \
215 ((!soissrcrealtime((_tp_)->t_inpcb->inp_socket) && \
216 (_tp_)->t_ccstate->cub_mean_dev > (tp->t_maxseg << 1)) ? 1 : 0)
217
218 /*
219 * Compute the window growth if standard TCP (AIMD) was used with
220 * a backoff of 0.5 and additive increase of 1 packet per RTT.
221 *
222 * TCP window at time t can be calculated using the following equation
223 * with tcp_beta_cubic
224 *
225 * W(t) <- Wmax * tcp_beta_cubic + 3 * ((1 - tcp_beta_cubic)/(1 + tcp_beta_cubic)) * t/RTT
226 *
227 */
228 static uint32_t
tcp_cubic_tcpwin(struct tcpcb * tp,struct tcphdr * th)229 tcp_cubic_tcpwin(struct tcpcb *tp, struct tcphdr *th)
230 {
231 if (tp->t_ccstate->cub_tcp_win == 0) {
232 /* Start of the epoch, we set the tcp_win to whatever Cubic decided
233 * at the beginning of the epoch.
234 */
235 tp->t_ccstate->cub_tcp_win = min(tp->snd_cwnd, tp->snd_wnd);
236 tp->t_ccstate->cub_tcp_bytes_acked = BYTES_ACKED(th, tp);
237 } else {
238 tp->t_ccstate->cub_tcp_bytes_acked += BYTES_ACKED(th, tp);
239
240 /*
241 * Increase by ai_factor * MSS, once per RTT. Counting bytes_acked
242 * against the snd_cwnd represents exactly one RTT at full rate.
243 */
244 while (tp->t_ccstate->cub_tcp_bytes_acked >= tp->snd_cwnd) {
245 /* Enough bytes have been ACK'd for TCP to do AIMD */
246 tp->t_ccstate->cub_tcp_bytes_acked -= tp->snd_cwnd;
247
248 if (tp->snd_cwnd >= tp->t_ccstate->cub_last_max) {
249 tp->t_ccstate->cub_tcp_win += tp->t_maxseg;
250 } else {
251 /* Increase-rate from Section 4.2, RFC 8312 */
252 float ai_factor = (float)3 * (1 - tcp_cubic_beta) / (1 + tcp_cubic_beta);
253
254 tp->t_ccstate->cub_tcp_win += (uint32_t)(tp->t_maxseg * ai_factor);
255 }
256 }
257 }
258 return tp->t_ccstate->cub_tcp_win;
259 }
260
261 /*
262 * Handle an in-sequence ack during congestion avoidance phase.
263 */
264 static void
tcp_cubic_congestion_avd(struct tcpcb * tp,struct tcphdr * th)265 tcp_cubic_congestion_avd(struct tcpcb *tp, struct tcphdr *th)
266 {
267 uint32_t cubic_target_win, tcp_win, rtt;
268 uint64_t incr_win = UINT32_MAX;
269
270 /* Do not increase congestion window in non-validated phase */
271 if (tcp_cc_is_cwnd_nonvalidated(tp) != 0) {
272 return;
273 }
274
275 tp->t_bytes_acked += BYTES_ACKED(th, tp);
276
277 rtt = get_base_rtt(tp);
278 /*
279 * First compute cubic window. If cubic variables are not
280 * initialized (after coming out of recovery), this call will
281 * initialize them.
282 */
283 cubic_target_win = tcp_cubic_update(tp, rtt);
284
285 /* Compute TCP window if a multiplicative decrease of 0.2 is used */
286 tcp_win = tcp_cubic_tcpwin(tp, th);
287
288 if (cubic_target_win > tp->snd_cwnd) {
289 /*
290 * The target win is computed for the next RTT.
291 * To reach this value, cwnd will have to be updated
292 * one segment at a time. Compute how many bytes
293 * need to be acknowledged before we can increase
294 * the cwnd by one segment.
295 */
296 incr_win = (uint64_t)tp->snd_cwnd * tp->t_maxseg;
297 incr_win /= (cubic_target_win - tp->snd_cwnd);
298 }
299
300 tcp_win = tcp_round_to(tcp_win, tp->t_maxseg);
301
302 if (tp->snd_cwnd < tcp_win) {
303 uint64_t tcp_incr_win;
304
305 tcp_incr_win = (uint64_t)tp->snd_cwnd * tp->t_maxseg;
306 tcp_incr_win /= (tcp_win - tp->snd_cwnd);
307
308 if (tcp_incr_win < incr_win) {
309 /* this connection is in TCP-friendly region */
310 incr_win = tcp_incr_win;
311 }
312 }
313
314 if (incr_win > 0 && tp->t_bytes_acked >= incr_win) {
315 tp->t_bytes_acked -= incr_win;
316 tp->snd_cwnd = min(tp->snd_cwnd + tp->t_maxseg, TCP_MAXWIN << tp->snd_scale);
317 }
318
319 tcp_update_pacer_state(tp);
320 }
321
322 static void
tcp_cubic_ack_rcvd(struct tcpcb * tp,struct tcphdr * th)323 tcp_cubic_ack_rcvd(struct tcpcb *tp, struct tcphdr *th)
324 {
325 /* Do not increase the congestion window in non-validated phase */
326 if (tcp_cc_is_cwnd_nonvalidated(tp) != 0) {
327 return;
328 }
329
330 if (tp->snd_cwnd >= tp->snd_ssthresh) {
331 /* Congestion avoidance phase */
332 tcp_cubic_congestion_avd(tp, th);
333 } else {
334 /*
335 * Use 2*SMSS as limit on increment as suggested
336 * by RFC 3465 section 2.3
337 */
338 uint32_t acked, abc_lim, incr;
339
340 acked = BYTES_ACKED(th, tp);
341 /*
342 * Maximum burst-size is limited to the initial congestion-window.
343 * We know that the network can survive this kind of burst.
344 */
345 abc_lim = tcp_initial_cwnd(tp);
346 incr = min(acked, abc_lim);
347
348 tp->snd_cwnd += incr;
349 tp->snd_cwnd = min(tp->snd_cwnd, TCP_MAXWIN << tp->snd_scale);
350
351 tcp_update_pacer_state(tp);
352 }
353 }
354
355 static void
tcp_cubic_pre_fr(struct tcpcb * tp)356 tcp_cubic_pre_fr(struct tcpcb *tp)
357 {
358 uint32_t win, avg;
359 int32_t dev;
360 tp->t_ccstate->cub_epoch_start = 0;
361 tp->t_ccstate->cub_tcp_win = 0;
362 tp->t_ccstate->cub_tcp_bytes_acked = 0;
363
364 win = min(tp->snd_cwnd, tp->snd_wnd);
365 if (tp->t_flagsext & TF_CWND_NONVALIDATED) {
366 tp->t_lossflightsize = tp->snd_max - tp->snd_una;
367 win = max(tp->t_pipeack, tp->t_lossflightsize);
368 } else {
369 tp->t_lossflightsize = 0;
370 }
371 /*
372 * Note the congestion window at which packet loss occurred as
373 * cub_last_max.
374 *
375 * If the congestion window is less than the last max window when
376 * loss occurred, it indicates that capacity available in the
377 * network has gone down. This can happen if a new flow has started
378 * and it is capturing some of the bandwidth. To reach convergence
379 * quickly, backoff a little more.
380 */
381 if (win < tp->t_ccstate->cub_last_max) {
382 tp->t_ccstate->cub_last_max = (uint32_t)((float)win * tcp_cubic_fast_convergence_factor);
383 } else {
384 tp->t_ccstate->cub_last_max = win;
385 }
386
387 if (tp->t_ccstate->cub_last_max == 0) {
388 /*
389 * If last_max is zero because snd_wnd is zero or for
390 * any other reason, initialize it to the amount of data
391 * in flight
392 */
393 tp->t_ccstate->cub_last_max = tp->snd_max - tp->snd_una;
394 }
395
396 /*
397 * Compute average and mean absolute deviation of the
398 * window at which packet loss occurred.
399 */
400 if (tp->t_ccstate->cub_avg_lastmax == 0) {
401 tp->t_ccstate->cub_avg_lastmax = tp->t_ccstate->cub_last_max;
402 } else {
403 /*
404 * Average is computed by taking 63 parts of
405 * history and one part of the most recent value
406 */
407 avg = tp->t_ccstate->cub_avg_lastmax;
408 avg = (avg << 6) - avg;
409 tp->t_ccstate->cub_avg_lastmax =
410 (avg + tp->t_ccstate->cub_last_max) >> 6;
411 }
412
413 /* caluclate deviation from average */
414 dev = tp->t_ccstate->cub_avg_lastmax - tp->t_ccstate->cub_last_max;
415
416 /* Take the absolute value */
417 if (dev < 0) {
418 dev = -dev;
419 }
420
421 if (tp->t_ccstate->cub_mean_dev == 0) {
422 tp->t_ccstate->cub_mean_dev = dev;
423 } else {
424 dev = dev + ((tp->t_ccstate->cub_mean_dev << 4)
425 - tp->t_ccstate->cub_mean_dev);
426 tp->t_ccstate->cub_mean_dev = dev >> 4;
427 }
428
429 /* Backoff congestion window by tcp_cubic_backoff factor */
430 win = (uint32_t)(win - (win * tcp_cubic_backoff));
431 win = tcp_round_to(win, tp->t_maxseg);
432 if (win < 2 * tp->t_maxseg) {
433 win = 2 * tp->t_maxseg;
434 }
435 tp->snd_ssthresh = win;
436 tcp_cc_resize_sndbuf(tp);
437 }
438
439 static void
tcp_cubic_post_fr(struct tcpcb * tp,struct tcphdr * th)440 tcp_cubic_post_fr(struct tcpcb *tp, struct tcphdr *th)
441 {
442 uint32_t flight_size = 0;
443 uint32_t ack;
444
445 if (th != NULL) {
446 ack = th->th_ack;
447 } else {
448 ack = tp->snd_una;
449 }
450
451 VERIFY(SEQ_LEQ(ack, tp->snd_max));
452 flight_size = tp->snd_max - ack;
453
454 /*
455 * Complete ack. The current window was inflated for fast recovery.
456 * It has to be deflated post recovery.
457 *
458 * Window inflation should have left us with approx snd_ssthresh
459 * outstanding data. If the flight size is zero or one segment,
460 * make congestion window to be at least as big as 2 segments to
461 * avoid delayed acknowledgements. This is according to RFC 6582.
462 */
463 if (flight_size < tp->snd_ssthresh) {
464 tp->snd_cwnd = max(flight_size, tp->t_maxseg) + tp->t_maxseg;
465 } else {
466 tp->snd_cwnd = tp->snd_ssthresh;
467 }
468
469 tp->t_ccstate->cub_tcp_win = 0;
470 tp->t_ccstate->cub_tcp_bytes_acked = 0;
471
472 tcp_update_pacer_state(tp);
473 }
474
475 static void
tcp_cubic_after_timeout(struct tcpcb * tp)476 tcp_cubic_after_timeout(struct tcpcb *tp)
477 {
478 VERIFY(tp->t_ccstate != NULL);
479
480 /*
481 * Avoid adjusting congestion window due to SYN retransmissions.
482 * If more than one byte (SYN) is outstanding then it is still
483 * needed to adjust the window.
484 */
485 if (tp->t_state < TCPS_ESTABLISHED &&
486 ((int)(tp->snd_max - tp->snd_una) <= 1)) {
487 return;
488 }
489
490 if (!IN_FASTRECOVERY(tp)) {
491 tcp_cubic_clear_state(tp);
492 tcp_cubic_pre_fr(tp);
493 }
494
495 /*
496 * Close the congestion window down to one segment as a retransmit
497 * timeout might indicate severe congestion.
498 */
499 tp->snd_cwnd = tp->t_maxseg;
500
501 tcp_update_pacer_state(tp);
502 }
503
504 static int
tcp_cubic_delay_ack(struct tcpcb * tp,struct tcphdr * th)505 tcp_cubic_delay_ack(struct tcpcb *tp, struct tcphdr *th)
506 {
507 return tcp_cc_delay_ack(tp, th);
508 }
509
510 /*
511 * When switching from a different CC it is better for Cubic to start
512 * fresh. The state required for Cubic calculation might be stale and it
513 * might not represent the current state of the network. If it starts as
514 * a new connection it will probe and learn the existing network conditions.
515 */
516 static void
tcp_cubic_switch_cc(struct tcpcb * tp)517 tcp_cubic_switch_cc(struct tcpcb *tp)
518 {
519 tcp_cubic_cwnd_init_or_reset(tp);
520
521 os_atomic_inc(&tcp_cc_cubic.num_sockets, relaxed);
522 }
523
524 static inline void
tcp_cubic_clear_state(struct tcpcb * tp)525 tcp_cubic_clear_state(struct tcpcb *tp)
526 {
527 tp->t_ccstate->cub_last_max = 0;
528 tp->t_ccstate->cub_epoch_start = 0;
529 tp->t_ccstate->cub_origin_point = 0;
530 tp->t_ccstate->cub_tcp_win = 0;
531 tp->t_ccstate->cub_tcp_bytes_acked = 0;
532 tp->t_ccstate->cub_epoch_period = 0;
533 }
534