/*
 * Copyright (c) 2013-2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include "tcp_includes.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/syslog.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/ip_var.h>

static int tcp_cubic_init(struct tcpcb *tp);
static int tcp_cubic_cleanup(struct tcpcb *tp);
static void tcp_cubic_cwnd_init_or_reset(struct tcpcb *tp);
static void tcp_cubic_congestion_avd(struct tcpcb *tp, struct tcphdr *th);
static void tcp_cubic_ack_rcvd(struct tcpcb *tp, struct tcphdr *th);
static void tcp_cubic_pre_fr(struct tcpcb *tp);
static void tcp_cubic_post_fr(struct tcpcb *tp, struct tcphdr *th);
static void tcp_cubic_after_timeout(struct tcpcb *tp);
static int tcp_cubic_delay_ack(struct tcpcb *tp, struct tcphdr *th);
static void tcp_cubic_switch_cc(struct tcpcb *tp);
static uint32_t tcp_cubic_update(struct tcpcb *tp, uint32_t rtt);
static inline void tcp_cubic_clear_state(struct tcpcb *tp);

extern float cbrtf(float x);

struct tcp_cc_algo tcp_cc_cubic = {
	.name = "cubic",
	.init = tcp_cubic_init,
	.cleanup = tcp_cubic_cleanup,
	.cwnd_init = tcp_cubic_cwnd_init_or_reset,
	.congestion_avd = tcp_cubic_congestion_avd,
	.ack_rcvd = tcp_cubic_ack_rcvd,
	.pre_fr = tcp_cubic_pre_fr,
	.post_fr = tcp_cubic_post_fr,
	.after_idle = tcp_cubic_cwnd_init_or_reset,
	.after_timeout = tcp_cubic_after_timeout,
	.delay_ack = tcp_cubic_delay_ack,
	.switch_to = tcp_cubic_switch_cc
};

static float tcp_cubic_backoff = 0.2f; /* multiplicative decrease factor */
static float tcp_cubic_coeff = 0.4f;
static float tcp_cubic_fast_convergence_factor = 0.875f;

static float tcp_cubic_beta = 0.8f;
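/*
 * tcp_cubic_init() below resets these to backoff = 0.3, fast convergence
 * factor = 0.85 and beta = 0.7, matching the beta_cubic = 0.7 default of
 * RFC 8312; tcp_cubic_coeff = 0.4 is the RFC's constant C.
 */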

static int
tcp_cubic_init(struct tcpcb *tp)
{
	os_atomic_inc(&tcp_cc_cubic.num_sockets, relaxed);

	tcp_cubic_backoff = 0.3f; /* multiplicative decrease factor */
	tcp_cubic_fast_convergence_factor = 0.85f;
	tcp_cubic_beta = 0.7f;

	VERIFY(tp->t_ccstate != NULL);
	tcp_cubic_clear_state(tp);
	return 0;
}

static int
tcp_cubic_cleanup(struct tcpcb *tp)
{
#pragma unused(tp)
	os_atomic_dec(&tcp_cc_cubic.num_sockets, relaxed);
	return 0;
}

/*
 * Initialize the congestion window at the beginning of a connection or
 * after idle time
 */
static void
tcp_cubic_cwnd_init_or_reset(struct tcpcb *tp)
{
	VERIFY(tp->t_ccstate != NULL);

	tcp_cubic_clear_state(tp);
	tcp_cc_cwnd_init_or_reset(tp);
	tp->t_pipeack = 0;
	tcp_clear_pipeack_state(tp);

	/* Start counting bytes for RFC 3465 again */
	tp->t_bytes_acked = 0;

	/*
	 * The slow start threshold could get initialized to a lower value
	 * when there is a cached value in the route metrics. In this case,
	 * the connection can enter congestion avoidance without any packet
	 * loss and Cubic will enter steady-state too early. It is better
	 * to always probe to find the initial slow-start threshold.
	 */
	if (tp->t_inpcb->inp_mstat.ms_total.ts_txbytes <= tcp_initial_cwnd(tp) &&
	    tp->snd_ssthresh < (TCP_MAXWIN << TCP_MAX_WINSHIFT)) {
		tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
	}

	/* Initialize cubic last max to be same as ssthresh */
	tp->t_ccstate->cub_last_max = tp->snd_ssthresh;

	/* Set initial pacer state */
	tcp_update_pacer_state(tp);
}

/*
 * Compute the target congestion window for the next RTT according to
 * the cubic equation when an ack is received.
 *
 * W(t) = C(t-K)^3 + W(last_max)
 */
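/*
 * Here the elapsed time t and K are kept in TCP_RETRANSHZ ticks and are
 * converted to seconds before cubing; C is tcp_cubic_coeff scaled by
 * t_maxseg so that W(t) comes out in bytes; and W(last_max) is tracked
 * as cub_origin_point.
 */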
static uint32_t
tcp_cubic_update(struct tcpcb *tp, uint32_t rtt)
{
	struct tcp_globals *globals = tcp_get_globals(tp);
	float K, var;
	uint32_t elapsed_time, win;

	win = min(tp->snd_cwnd, tp->snd_wnd);
	if (tp->t_ccstate->cub_last_max == 0) {
		tp->t_ccstate->cub_last_max = tp->snd_ssthresh;
	}

	if (tp->t_ccstate->cub_epoch_start == 0) {
		/*
		 * This is the beginning of a new epoch, initialize some of
		 * the variables that we need to use for computing the
		 * congestion window later.
		 */
		tp->t_ccstate->cub_epoch_start = tcp_globals_now(globals);
		if (tp->t_ccstate->cub_epoch_start == 0) {
			tp->t_ccstate->cub_epoch_start = 1;
		}
		if (win < tp->t_ccstate->cub_last_max) {
			/*
			 * Compute the cubic epoch period: the time that
			 * the window will take to increase to last_max
			 * again after backing off due to loss.
			 */
			K = ((float)tp->t_ccstate->cub_last_max - win) / tp->t_maxseg / tcp_cubic_coeff;
			K = cbrtf(K);
			tp->t_ccstate->cub_epoch_period = K * TCP_RETRANSHZ;
			/* Origin point */
			tp->t_ccstate->cub_origin_point = tp->t_ccstate->cub_last_max;
		} else {
			tp->t_ccstate->cub_epoch_period = 0;
			tp->t_ccstate->cub_origin_point = win;
		}
	}
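	/*
	 * For example, if the window must grow by 100 segments to get back
	 * to cub_last_max and tcp_cubic_coeff is 0.4, then
	 * K = cbrtf(100 / 0.4) = cbrtf(250) ~= 6.3, so cub_epoch_period is
	 * about 6.3 seconds worth of TCP_RETRANSHZ ticks.
	 */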

	VERIFY(tp->t_ccstate->cub_origin_point > 0);
	/*
	 * Compute the target window for the next RTT using the smoothed RTT
	 * as an estimate for the next RTT.
	 */
	elapsed_time = timer_diff(tcp_globals_now(globals), 0, tp->t_ccstate->cub_epoch_start, 0);

	if (tcp_cubic_use_minrtt) {
		elapsed_time += max(tcp_cubic_use_minrtt, rtt);
	} else {
		elapsed_time += rtt;
	}
	var = (elapsed_time - tp->t_ccstate->cub_epoch_period) / TCP_RETRANSHZ;
	var = var * var * var * (tcp_cubic_coeff * tp->t_maxseg);

	return (uint32_t)(tp->t_ccstate->cub_origin_point + var);
}

/*
 * Standard TCP utilizes bandwidth well in low RTT and low BDP connections
 * even when there is some packet loss. Enabling TCP mode will help Cubic
 * to achieve this kind of utilization.
 *
 * But if there is a bottleneck link in the path with a fixed size queue
 * and fixed bandwidth, TCP Cubic will help to reduce packet loss at this
 * link because of its steady-state behavior. Using the average and mean
 * absolute deviation of W(lastmax), we try to detect if the congestion
 * window is close to the bottleneck bandwidth. In that case, disabling
 * TCP mode will help to minimize packet loss at this link.
 *
 * Disable TCP mode if the W(lastmax) (the window where previous packet
 * loss happened) is within a small range from the average last max
 * calculated.
 */
#define TCP_CUBIC_ENABLE_TCPMODE(_tp_) \
	((!soissrcrealtime((_tp_)->t_inpcb->inp_socket) && \
	(_tp_)->t_ccstate->cub_mean_dev > ((_tp_)->t_maxseg << 1)) ? 1 : 0)
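/*
 * For example, with a 1448-byte MSS, TCP mode stays enabled for a
 * non-realtime socket only while the mean deviation of the last-max
 * window exceeds 2 * 1448 = 2896 bytes, i.e. while the window at which
 * losses occur still moves around by more than two segments.
 */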

/*
 * Compute the window growth if standard TCP (AIMD) was used with
 * a backoff of 0.5 and additive increase of 1 packet per RTT.
 *
 * The TCP window at time t is estimated with the TCP-friendly equation
 * from Section 4.2 of RFC 8312, using tcp_beta_cubic:
 *
 * W(t) <- Wmax * tcp_beta_cubic + 3 * ((1 - tcp_beta_cubic)/(1 + tcp_beta_cubic)) * t/RTT
 *
 */
static uint32_t
tcp_cubic_tcpwin(struct tcpcb *tp, struct tcphdr *th)
{
	if (tp->t_ccstate->cub_tcp_win == 0) {
		/*
		 * Start of the epoch, we set the tcp_win to whatever Cubic
		 * decided at the beginning of the epoch.
		 */
		tp->t_ccstate->cub_tcp_win = min(tp->snd_cwnd, tp->snd_wnd);
		tp->t_ccstate->cub_tcp_bytes_acked = BYTES_ACKED(th, tp);
	} else {
		tp->t_ccstate->cub_tcp_bytes_acked += BYTES_ACKED(th, tp);

		/*
		 * Increase by ai_factor * MSS, once per RTT. Counting bytes_acked
		 * against the snd_cwnd represents exactly one RTT at full rate.
		 */
		while (tp->t_ccstate->cub_tcp_bytes_acked >= tp->snd_cwnd) {
			/* Enough bytes have been ACK'd for TCP to do AIMD */
			tp->t_ccstate->cub_tcp_bytes_acked -= tp->snd_cwnd;

			if (tp->snd_cwnd >= tp->t_ccstate->cub_last_max) {
				tp->t_ccstate->cub_tcp_win += tp->t_maxseg;
			} else {
				/* Increase-rate from Section 4.2, RFC 8312 */
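				/*
				 * With tcp_cubic_beta = 0.7 as set in
				 * tcp_cubic_init(), ai_factor is
				 * 3 * 0.3 / 1.7 ~= 0.53, i.e. the TCP
				 * window estimate grows by about half a
				 * segment per cwnd's worth of ACKed bytes.
				 */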
				float ai_factor = (float)3 * (1 - tcp_cubic_beta) / (1 + tcp_cubic_beta);

				tp->t_ccstate->cub_tcp_win += (uint32_t)(tp->t_maxseg * ai_factor);
			}
		}
	}
	return tp->t_ccstate->cub_tcp_win;
}

/*
 * Handle an in-sequence ack during congestion avoidance phase.
 */
static void
tcp_cubic_congestion_avd(struct tcpcb *tp, struct tcphdr *th)
{
	uint32_t cubic_target_win, tcp_win, rtt;
	uint64_t incr_win = UINT32_MAX;

	/* Do not increase congestion window in non-validated phase */
	if (tcp_cc_is_cwnd_nonvalidated(tp) != 0) {
		return;
	}

	tp->t_bytes_acked += BYTES_ACKED(th, tp);

	rtt = get_base_rtt(tp);
	/*
	 * First compute the cubic window. If the cubic variables are not
	 * initialized (after coming out of recovery), this call will
	 * initialize them.
	 */
	cubic_target_win = tcp_cubic_update(tp, rtt);

	/* Compute the TCP window if a multiplicative decrease of (1 - tcp_cubic_beta) is used */
	tcp_win = tcp_cubic_tcpwin(tp, th);

	if (cubic_target_win > tp->snd_cwnd) {
		/*
		 * The target win is computed for the next RTT.
		 * To reach this value, cwnd will have to be updated
		 * one segment at a time. Compute how many bytes
		 * need to be acknowledged before we can increase
		 * the cwnd by one segment.
		 */
		incr_win = (uint64_t)tp->snd_cwnd * tp->t_maxseg;
		incr_win /= (cubic_target_win - tp->snd_cwnd);
	}
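	/*
	 * For example, if cwnd is 100 segments and the cubic target for the
	 * next RTT is 110 segments, incr_win is (100 * MSS) / 10 = 10 * MSS,
	 * so cwnd grows by one segment per ten segments ACKed and the
	 * 10-segment increase is spread over roughly one RTT.
	 */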

	tcp_win = tcp_round_to(tcp_win, tp->t_maxseg);

	if (tp->snd_cwnd < tcp_win) {
		uint64_t tcp_incr_win;

		tcp_incr_win = (uint64_t)tp->snd_cwnd * tp->t_maxseg;
		tcp_incr_win /= (tcp_win - tp->snd_cwnd);

		if (tcp_incr_win < incr_win) {
			/* this connection is in the TCP-friendly region */
			incr_win = tcp_incr_win;
		}
	}

	if (incr_win > 0 && tp->t_bytes_acked >= incr_win) {
		tp->t_bytes_acked -= incr_win;
		tp->snd_cwnd = min(tp->snd_cwnd + tp->t_maxseg, TCP_MAXWIN << tp->snd_scale);
	}

	tcp_update_pacer_state(tp);
}

static void
tcp_cubic_ack_rcvd(struct tcpcb *tp, struct tcphdr *th)
{
	/* Do not increase the congestion window in non-validated phase */
	if (tcp_cc_is_cwnd_nonvalidated(tp) != 0) {
		return;
	}

	if (tp->snd_cwnd >= tp->snd_ssthresh) {
		/* Congestion avoidance phase */
		tcp_cubic_congestion_avd(tp, th);
	} else {
		/*
		 * RFC 3465 section 2.3 suggests 2*SMSS as the limit on the
		 * increment; this implementation uses a larger limit, the
		 * initial congestion window, as explained below.
		 */
		uint32_t acked, abc_lim, incr;

		acked = BYTES_ACKED(th, tp);
		/*
		 * The maximum burst size is limited to the initial congestion
		 * window. We know that the network can survive this kind of burst.
		 */
		abc_lim = tcp_initial_cwnd(tp);
		incr = min(acked, abc_lim);
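		/*
		 * For example, with an initial window of 10 segments, even
		 * a stretch ACK covering 40 segments grows cwnd by at most
		 * 10 * MSS here.
		 */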

		tp->snd_cwnd += incr;
		tp->snd_cwnd = min(tp->snd_cwnd, TCP_MAXWIN << tp->snd_scale);

		tcp_update_pacer_state(tp);
	}
}

static void
tcp_cubic_pre_fr(struct tcpcb *tp)
{
	uint32_t win, avg;
	int32_t dev;

	tp->t_ccstate->cub_epoch_start = 0;
	tp->t_ccstate->cub_tcp_win = 0;
	tp->t_ccstate->cub_tcp_bytes_acked = 0;

	win = min(tp->snd_cwnd, tp->snd_wnd);
	if (tp->t_flagsext & TF_CWND_NONVALIDATED) {
		tp->t_lossflightsize = tp->snd_max - tp->snd_una;
		win = max(tp->t_pipeack, tp->t_lossflightsize);
	} else {
		tp->t_lossflightsize = 0;
	}
	/*
	 * Note the congestion window at which packet loss occurred as
	 * cub_last_max.
	 *
	 * If the congestion window is less than the last max window when
	 * loss occurred, it indicates that capacity available in the
	 * network has gone down. This can happen if a new flow has started
	 * and it is capturing some of the bandwidth. To reach convergence
	 * quickly, back off a little more.
	 */
	if (win < tp->t_ccstate->cub_last_max) {
		tp->t_ccstate->cub_last_max = (uint32_t)((float)win * tcp_cubic_fast_convergence_factor);
	} else {
		tp->t_ccstate->cub_last_max = win;
	}

	if (tp->t_ccstate->cub_last_max == 0) {
		/*
		 * If last_max is zero because snd_wnd is zero or for
		 * any other reason, initialize it to the amount of data
		 * in flight
		 */
		tp->t_ccstate->cub_last_max = tp->snd_max - tp->snd_una;
	}

	/*
	 * Compute the average and mean absolute deviation of the
	 * window at which packet loss occurred.
	 */
	if (tp->t_ccstate->cub_avg_lastmax == 0) {
		tp->t_ccstate->cub_avg_lastmax = tp->t_ccstate->cub_last_max;
	} else {
		/*
		 * The average is computed by taking 63 parts of
		 * history and one part of the most recent value
		 */
		avg = tp->t_ccstate->cub_avg_lastmax;
		avg = (avg << 6) - avg;
		tp->t_ccstate->cub_avg_lastmax =
		    (avg + tp->t_ccstate->cub_last_max) >> 6;
	}

	/* Calculate the deviation from the average */
	dev = tp->t_ccstate->cub_avg_lastmax - tp->t_ccstate->cub_last_max;

	/* Take the absolute value */
	if (dev < 0) {
		dev = -dev;
	}

	if (tp->t_ccstate->cub_mean_dev == 0) {
		tp->t_ccstate->cub_mean_dev = dev;
	} else {
		dev = dev + ((tp->t_ccstate->cub_mean_dev << 4)
		    - tp->t_ccstate->cub_mean_dev);
		tp->t_ccstate->cub_mean_dev = dev >> 4;
	}
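	/*
	 * The smoothing above keeps 15/16 of the previous mean deviation
	 * and adds 1/16 of the newest sample: (15 * mean_dev + dev) / 16.
	 */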

	/* Back off the congestion window by the tcp_cubic_backoff factor */
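	/*
	 * With tcp_cubic_backoff = 0.3 as set in tcp_cubic_init(), ssthresh
	 * ends up at roughly 70% of the window at loss, rounded to a
	 * segment boundary with a floor of two segments.
	 */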
	win = (uint32_t)(win - (win * tcp_cubic_backoff));
	win = tcp_round_to(win, tp->t_maxseg);
	if (win < 2 * tp->t_maxseg) {
		win = 2 * tp->t_maxseg;
	}
	tp->snd_ssthresh = win;
	tcp_cc_resize_sndbuf(tp);
}

static void
tcp_cubic_post_fr(struct tcpcb *tp, struct tcphdr *th)
{
	uint32_t flight_size = 0;
	uint32_t ack;

	if (th != NULL) {
		ack = th->th_ack;
	} else {
		ack = tp->snd_una;
	}

	VERIFY(SEQ_LEQ(ack, tp->snd_max));
	flight_size = tp->snd_max - ack;

	/*
	 * Complete ack. The current window was inflated for fast recovery.
	 * It has to be deflated post recovery.
	 *
	 * Window inflation should have left us with approximately
	 * snd_ssthresh outstanding data. If the flight size is zero or one
	 * segment, make the congestion window at least as big as 2 segments
	 * to avoid delayed acknowledgements. This is according to RFC 6582.
	 */
	if (flight_size < tp->snd_ssthresh) {
		tp->snd_cwnd = max(flight_size, tp->t_maxseg) + tp->t_maxseg;
	} else {
		tp->snd_cwnd = tp->snd_ssthresh;
	}
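	/*
	 * For example, if ssthresh is 20 segments but only 5 segments are
	 * still in flight, cwnd restarts at 5 + 1 = 6 segments rather than
	 * jumping straight to ssthresh.
	 */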

	tp->t_ccstate->cub_tcp_win = 0;
	tp->t_ccstate->cub_tcp_bytes_acked = 0;

	tcp_update_pacer_state(tp);
}

static void
tcp_cubic_after_timeout(struct tcpcb *tp)
{
	VERIFY(tp->t_ccstate != NULL);

	/*
	 * Avoid adjusting the congestion window due to SYN retransmissions.
	 * If more than one byte (the SYN) is outstanding, the window still
	 * needs to be adjusted.
	 */
	if (tp->t_state < TCPS_ESTABLISHED &&
	    ((int)(tp->snd_max - tp->snd_una) <= 1)) {
		return;
	}

	if (!IN_FASTRECOVERY(tp)) {
		tcp_cubic_clear_state(tp);
		tcp_cubic_pre_fr(tp);
	}

	/*
	 * Close the congestion window down to one segment as a retransmit
	 * timeout might indicate severe congestion.
	 */
	tp->snd_cwnd = tp->t_maxseg;

	tcp_update_pacer_state(tp);
}

static int
tcp_cubic_delay_ack(struct tcpcb *tp, struct tcphdr *th)
{
	return tcp_cc_delay_ack(tp, th);
}

/*
 * When switching from a different CC it is better for Cubic to start
 * fresh. The state required for the Cubic calculation might be stale and
 * might not represent the current state of the network. If it starts as
 * a new connection it will probe and learn the existing network conditions.
 */
static void
tcp_cubic_switch_cc(struct tcpcb *tp)
{
	tcp_cubic_cwnd_init_or_reset(tp);

	os_atomic_inc(&tcp_cc_cubic.num_sockets, relaxed);
}

static inline void
tcp_cubic_clear_state(struct tcpcb *tp)
{
	tp->t_ccstate->cub_last_max = 0;
	tp->t_ccstate->cub_epoch_start = 0;
	tp->t_ccstate->cub_origin_point = 0;
	tp->t_ccstate->cub_tcp_win = 0;
	tp->t_ccstate->cub_tcp_bytes_acked = 0;
	tp->t_ccstate->cub_epoch_period = 0;
}
534