/*
 * Copyright (c) 2020-2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <sys/sysctl.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/ip_var.h>

#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_cc.h>

/*
 * This file implements an LBE (lower-than-best-effort) congestion
 * control algorithm that computes the receive window of a background
 * transport, using the same algorithm as LEDBAT++.
 */
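
/*
 * Summary of the mechanisms implemented below (descriptive only; the
 * code is authoritative): a modified slow start with a dynamic gain
 * bounded by a receive-side ssthresh, delay-based additive increase /
 * multiplicative decrease around target_qdelay, periodic "slowdowns"
 * that shrink the window so the base RTT can be re-measured, and a
 * window reduction when retransmissions are detected.
 */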

#define GAIN_CONSTANT               (16)
#define DEFER_SLOWDOWN_DURATION     (30 * 1000) /* 30s */

SYSCTL_SKMEM_TCP_INT(OID_AUTO, rledbat, CTLFLAG_RW | CTLFLAG_LOCKED,
    int, tcp_rledbat, 1, "Use Receive LEDBAT");
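
/*
 * The knob above is expected to surface as net.inet.tcp.rledbat (name
 * assumed from the sysctl declaration); an illustrative way to disable
 * receive-side LEDBAT would be:
 *
 *   sysctl net.inet.tcp.rledbat=0
 */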

void tcp_rledbat_init(struct tcpcb *tp);
void tcp_rledbat_cleanup(struct tcpcb *tp);
void tcp_rledbat_rwnd_init(struct tcpcb *tp);
void tcp_rledbat_data_rcvd(struct tcpcb *tp, struct tcphdr *th,
    struct tcpopt *to, uint32_t segment_len);
uint32_t tcp_rledbat_get_rlwin(struct tcpcb *tp);
void tcp_rledbat_after_idle(struct tcpcb *tp);
void tcp_rledbat_switch_to(struct tcpcb *tp);

struct tcp_rcv_cc_algo tcp_cc_rledbat = {
	.name = "rledbat",
	.init = tcp_rledbat_init,
	.cleanup = tcp_rledbat_cleanup,
	.rwnd_init = tcp_rledbat_rwnd_init,
	.data_rcvd = tcp_rledbat_data_rcvd,
	.get_rlwin = tcp_rledbat_get_rlwin,
	.after_idle = tcp_rledbat_after_idle,
	.switch_to = tcp_rledbat_switch_to,
};

static inline void
rledbat_clear_state(struct tcpcb *tp)
{
	tp->t_rlstate.num_slowdown_events = 0;
	tp->t_rlstate.slowdown_ts = 0;
	tp->t_rlstate.slowdown_begin = 0;
	tp->t_rlstate.rcvd_bytes = 0;
	tp->t_rlstate.md_rcvd_bytes = 0;
	tp->t_rlstate.drained_bytes = 0;
}

void
tcp_rledbat_init(struct tcpcb *tp)
{
	OSIncrementAtomic((volatile int *)&tcp_cc_rledbat.num_sockets);
	rledbat_clear_state(tp);

	tp->t_rlstate.win = tp->t_maxseg * bg_ss_fltsz;
	tp->t_rlstate.ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
}

void
tcp_rledbat_cleanup(struct tcpcb *tp)
{
#pragma unused(tp)
	OSDecrementAtomic((volatile int *)&tcp_cc_rledbat.num_sockets);
}

/*
 * Initialize the receive window for a connection
 */
void
tcp_rledbat_rwnd_init(struct tcpcb *tp)
{
	tp->t_rlstate.win = tp->t_maxseg * bg_ss_fltsz;

	/* If the ssthresh hasn't been set, do it now */
	if (tp->t_rlstate.ssthresh == 0) {
		tp->t_rlstate.ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
	}
}

/*
 * Compute the gain denominator:
 * MIN(16, ceil(2 * TARGET / base))
 */
static uint32_t
rledbat_gain(uint32_t base_rtt)
{
	return MIN(GAIN_CONSTANT, tcp_ceil(2 * target_qdelay /
	           (double)base_rtt));
}
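
/*
 * Illustrative example (numbers assumed, not defined in this file): with
 * target_qdelay = 100 ms and base_rtt = 40 ms, the denominator is
 * MIN(16, ceil(200 / 40)) = 5, so modified slow start grows the window by
 * roughly rcvd_bytes / 5 once five segments' worth of data has arrived.
 * Smaller base RTTs yield a larger denominator (slower growth), capped at
 * GAIN_CONSTANT.
 */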

/*
 * Congestion avoidance for ledbat++
 */
static void
rledbat_congestion_avd(struct tcpcb *tp, uint32_t segment_len,
    uint32_t base_rtt, uint32_t curr_rtt)
{
	uint32_t update = 0;
	/*
	 * Set the time of the next slowdown, i.e. 9 times the duration
	 * of the previous slowdown. The initial slowdown (when there is
	 * no previous slowdown duration) is scheduled 2 RTTs from now.
	 */
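	/*
	 * Illustrative timeline (numbers assumed): if the previous slowdown
	 * took 200 ms, the next one is scheduled 9 * 200 ms = 1.8 s from now;
	 * if the window never recovered to the previous ssthresh, the measured
	 * duration is doubled before scaling. A later delay- or loss-triggered
	 * reduction pushes the scheduled slowdown out by
	 * DEFER_SLOWDOWN_DURATION (30 s).
	 */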
	if (tp->t_rlstate.slowdown_ts == 0) {
		uint32_t slowdown_duration = 0;
		if (tp->t_rlstate.num_slowdown_events > 0) {
			slowdown_duration = tcp_now -
			    tp->t_rlstate.slowdown_begin;

			if (tp->t_rlstate.ssthresh > tp->t_rlstate.win) {
				/*
				 * Special case for slowdowns (other than the initial one)
				 * where the window doesn't recover fully to the previous
				 * ssthresh
				 */
				slowdown_duration *= 2;
			}
		}
		tp->t_rlstate.slowdown_ts = tcp_now +
		    (9 * slowdown_duration);
		if (slowdown_duration == 0) {
			tp->t_rlstate.slowdown_ts += (2 * (tp->rcv_srtt >> TCP_RTT_SHIFT));
		}
		/* Reset the start */
		tp->t_rlstate.slowdown_begin = 0;

		/* On exiting slow start due to higher qdelay, cap the ssthresh */
		if (tp->t_rlstate.ssthresh > tp->t_rlstate.win) {
			tp->t_rlstate.ssthresh = tp->t_rlstate.win;
		}
	}

	if (curr_rtt <= base_rtt + (uint32_t)target_qdelay) {
		/* Additive increase */
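		/*
		 * Illustrative example (numbers assumed): with win = 10 * MSS,
		 * one MSS is added only after a full window's worth of bytes
		 * (10 MSS) has been received, i.e. roughly +1 MSS per RTT while
		 * the window is kept full.
		 */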
		tp->t_rlstate.rcvd_bytes += segment_len;
		if (tp->t_rlstate.rcvd_bytes >= tp->t_rlstate.win) {
			update = tp->t_maxseg;
			tp->t_rlstate.rcvd_bytes -= tp->t_rlstate.win;
			/*
			 * Move the background slow-start threshold to the current
			 * window so that the next time (after an idle period) we
			 * can attempt slow start up to this point if there is no
			 * increase in RTT
			 */
			if (tp->t_rlstate.ssthresh < tp->t_rlstate.win) {
				tp->t_rlstate.ssthresh = tp->t_rlstate.win;
			}
			tp->t_rlstate.win += update;
			tp->t_rlstate.win = min(tcp_round_to(tp->t_rlstate.win,
			    tp->t_maxseg), TCP_MAXWIN << tp->rcv_scale);
		}
	} else {
		/*
		 * If we are still within the reduction period of the previous
		 * loss-triggered reduction, do nothing
		 */
		if (tcp_now < tp->t_rlstate.reduction_end) {
			return;
		}
		/*
		 * Multiplicative decrease
		 * W -= min(W * (qdelay/target - 1), W/2) (per RTT)
		 * To calculate per bytes acked, it becomes
		 * W -= min((qdelay/target - 1), 1/2) * bytes_acked
		 */
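		/*
		 * Illustrative example (numbers assumed): with target_qdelay =
		 * 100 ms and a measured qdelay of 130 ms, the factor is
		 * min(130/100 - 1, 0.5) = 0.3, so after a window's worth of
		 * bytes the window shrinks by ~30%; the reduction is capped at
		 * one half and the window never drops below bg_ss_fltsz
		 * segments.
		 */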
		uint32_t qdelay = curr_rtt > base_rtt ?
		    (curr_rtt - base_rtt) : 0;

		tp->t_rlstate.md_rcvd_bytes += segment_len;
		if (tp->t_rlstate.md_rcvd_bytes >= tp->t_rlstate.win) {
			update = (uint32_t)(MIN(((double)qdelay / target_qdelay - 1), 0.5) *
			    (double)tp->t_rlstate.win);
			tp->t_rlstate.md_rcvd_bytes -= tp->t_rlstate.win;
			tp->t_rlstate.win -= update;

			if (tp->t_rlstate.win < bg_ss_fltsz * tp->t_maxseg) {
				tp->t_rlstate.win = bg_ss_fltsz * tp->t_maxseg;
			}

			tp->t_rlstate.win = tcp_round_to(tp->t_rlstate.win, tp->t_maxseg);
			/*
			 * Lower the background slow-start threshold so that the
			 * connection will stay in the congestion avoidance phase
			 */
			if (tp->t_rlstate.ssthresh > tp->t_rlstate.win) {
				tp->t_rlstate.ssthresh = tp->t_rlstate.win;
			}

			if (tp->t_rlstate.slowdown_ts != 0) {
				/* As the window has been reduced, defer the slowdown */
				tp->t_rlstate.slowdown_ts = tcp_now +
				    DEFER_SLOWDOWN_DURATION;
			}
		}
	}
}

/*
 * Update win based on the ledbat++ algorithm
 */
void
tcp_rledbat_data_rcvd(struct tcpcb *tp, struct tcphdr *th,
    struct tcpopt *to, uint32_t segment_len)
{
	uint32_t update = 0;
	const uint32_t base_rtt = get_base_rtt(tp);
	const uint32_t curr_rtt = tcp_use_min_curr_rtt ? tp->curr_rtt_min :
	    tp->t_rttcur;
	const uint32_t ss_target = (uint32_t)(3 * target_qdelay / 4);
	tp->t_rlstate.drained_bytes += segment_len;

	/*
	 * Slowdown period - the first slowdown
	 * is 2 RTTs after we exit the initial slow start.
	 * Each subsequent slowdown happens 9 times the
	 * previous slowdown's duration after it.
	 */
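	/*
	 * While a slowdown is active (the 2-RTT interval checked below), the
	 * window is clamped to bg_ss_fltsz segments so that queues can drain
	 * and the base-RTT measurement can refresh, and ssthresh is raised to
	 * the pre-slowdown window so growth can resume quickly afterwards.
	 */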
	if (tp->t_rlstate.slowdown_ts != 0 &&
	    tcp_now >= tp->t_rlstate.slowdown_ts) {
		if (tp->t_rlstate.slowdown_begin == 0) {
			tp->t_rlstate.slowdown_begin = tcp_now;
			tp->t_rlstate.num_slowdown_events++;
		}
		if (tcp_now < tp->t_rlstate.slowdown_ts +
		    (2 * (tp->rcv_srtt >> TCP_RTT_SHIFT))) {
			/* Set rwnd to 2 packets and return */
			if (tp->t_rlstate.win > bg_ss_fltsz * tp->t_maxseg) {
				if (tp->t_rlstate.ssthresh < tp->t_rlstate.win) {
					tp->t_rlstate.ssthresh = tp->t_rlstate.win;
				}
				tp->t_rlstate.win = bg_ss_fltsz * tp->t_maxseg;
				/* Reset total bytes acked */
				tp->t_rlstate.rcvd_bytes = 0;
			}
			return;
		}
	}

	/*
	 * Detect retransmissions first by checking if the current
	 * received sequence is smaller than the largest seen and its
	 * timestamp is newer than the largest so far. Reduce the
	 * window based on fast recovery only once per effective RTT.
	 *
	 * Note: As we are detecting retransmissions (not packet loss),
	 * we give some leeway before the next window reduction.
	 */
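	/*
	 * Rationale (descriptive note): a segment that ends below rcv_high but
	 * carries a TSval at least as new as tsv_high must have been sent
	 * after the original transmission of that range, i.e. it is a
	 * retransmission, which on the receive side is the best available hint
	 * of upstream loss.
	 */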
	if (SEQ_LT(th->th_seq + segment_len, tp->rcv_high) &&
	    TSTMP_GEQ(to->to_tsval, tp->tsv_high)) {
		if (tcp_now < tp->t_rlstate.reduction_end) {
			/* still need to wait for the reduction end to elapse */
			return;
		}

		uint32_t win = tp->t_rlstate.win / 2;
		win = tcp_round_to(win, tp->t_maxseg);
		if (win < 2 * tp->t_maxseg) {
			win = 2 * tp->t_maxseg;
		}
		tp->t_rlstate.ssthresh = win;
		tp->t_rlstate.win = win;

		/* Reset the received bytes */
		tp->t_rlstate.rcvd_bytes = 0;
		tp->t_rlstate.md_rcvd_bytes = 0;

		/* Update the reduction end time */
		tp->t_rlstate.reduction_end = tcp_now + 2 *
		    (tp->rcv_srtt >> TCP_RTT_SHIFT);

		if (tp->t_rlstate.slowdown_ts != 0) {
			/* As the window has been halved, defer the slowdown. */
			tp->t_rlstate.slowdown_ts = tcp_now +
			    DEFER_SLOWDOWN_DURATION;
		}
		return;
	}

	/* Now we can do slow start or CA */
	if (curr_rtt == 0 || base_rtt == 0) {
		update = MIN(segment_len, TCP_CC_CWND_INIT_PKTS *
		    tp->t_maxseg);
		tp->t_rlstate.win += update;
		tp->t_rlstate.win = min(tp->t_rlstate.win,
		    TCP_MAXWIN << tp->rcv_scale);
	} else if (tp->t_rlstate.win < tp->t_rlstate.ssthresh &&
	    ((tp->t_rlstate.num_slowdown_events > 0 &&
	    curr_rtt <= (base_rtt + (uint32_t)target_qdelay)) ||
	    curr_rtt <= (base_rtt + ss_target))) {
		/*
		 * Modified slow start with a dynamic GAIN.
		 * If the queuing delay is larger than 3/4 of the target
		 * delay, exit slow start, but only during the initial slow
		 * start. After the initial slow start, during CA, window
		 * growth will be bounded by ssthresh.
		 */
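		/*
		 * Illustrative example (numbers assumed): with target_qdelay =
		 * 100 ms, the initial slow start continues only while
		 * curr_rtt <= base_rtt + 75 ms (ss_target is 3/4 of the target);
		 * once at least one slowdown has happened, the full 100 ms of
		 * target queuing delay is tolerated while the window is below
		 * ssthresh.
		 */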
		tp->t_rlstate.rcvd_bytes += segment_len;
		uint32_t gain_factor = rledbat_gain(base_rtt);
		if (tp->t_rlstate.rcvd_bytes >= tp->t_maxseg * gain_factor) {
			update = MIN(tp->t_rlstate.rcvd_bytes / gain_factor,
			    TCP_CC_CWND_INIT_PKTS * tp->t_maxseg);
			tp->t_rlstate.rcvd_bytes = 0;
			tp->t_rlstate.win += update;
			tp->t_rlstate.win = min(tcp_round_to(tp->t_rlstate.win,
			    tp->t_maxseg), TCP_MAXWIN << tp->rcv_scale);
		}

		/* Reset the next slowdown timestamp */
		if (tp->t_rlstate.slowdown_ts != 0) {
			tp->t_rlstate.slowdown_ts = 0;
		}
	} else {
		/* Congestion avoidance */
		rledbat_congestion_avd(tp, segment_len, base_rtt, curr_rtt);
	}
}

uint32_t
tcp_rledbat_get_rlwin(struct tcpcb *tp)
{
	/* Either rlwin grew, or it shrank by less than the drained bytes */
	if (tp->t_rlstate.win > tp->t_rlstate.win_ws ||
	    tp->t_rlstate.win_ws - tp->t_rlstate.win <
	    tp->t_rlstate.drained_bytes) {
		tp->t_rlstate.win_ws = tp->t_rlstate.win;
	} else if (tp->t_rlstate.win < tp->t_rlstate.win_ws) {
		/*
		 * rlwin is smaller, decrease the advertised window
		 * only by the drained bytes at a time
		 */
		tp->t_rlstate.win_ws = tp->t_rlstate.win_ws -
		    tp->t_rlstate.drained_bytes;
	}
	tp->t_rlstate.drained_bytes = 0;
	/* Round up to the receive window scale */
	tp->t_rlstate.win_ws = tcp_round_up(tp->t_rlstate.win_ws,
	    1 << tp->rcv_scale);

	return tp->t_rlstate.win_ws;
}
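
/*
 * Illustrative example (numbers assumed): if win_ws was last advertised as
 * 64 KB, the rledbat window drops to 40 KB and the application consumed
 * 8 KB since the last advertisement, only 56 KB is advertised now and the
 * rest of the reduction is applied on later calls, so the advertised right
 * edge never has to retreat. The result is then rounded up to a multiple
 * of (1 << rcv_scale) so it stays exactly representable after the receive
 * window scale shift.
 */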

/*
 * Function to handle connections that have been idle for
 * some time. Slow start to get ack "clock" running again.
 * Clear base history after idle time.
 */
void
tcp_rledbat_after_idle(struct tcpcb *tp)
{
	rledbat_clear_state(tp);
	/* Reset the rledbat window */
	tp->t_rlstate.win = tp->t_maxseg * bg_ss_fltsz;
}

void
tcp_rledbat_switch_to(struct tcpcb *tp)
{
	rledbat_clear_state(tp);

	uint32_t win = 0;

	if (tp->t_rlstate.win == 0) {
		/*
		 * Use half of the previous window; the algorithm
		 * will quickly reduce the window if there is still
		 * high queueing delay.
		 */
		win = (tp->rcv_adv - tp->rcv_nxt) / 2;
	} else {
		/* Reduce the window by half from the previous value */
		win = tp->t_rlstate.win / 2;
	}

	win = tcp_round_to(win, tp->t_maxseg);
	if (win < bg_ss_fltsz * tp->t_maxseg) {
		win = bg_ss_fltsz * tp->t_maxseg;
	}
	tp->t_rlstate.win = win;

	/* ssthresh should be at most the initial value */
	if (tp->t_rlstate.ssthresh == 0) {
		tp->t_rlstate.ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
	} else {
		tp->t_rlstate.ssthresh = MIN(tp->t_rlstate.ssthresh,
		    TCP_MAXWIN << TCP_MAX_WINSHIFT);
	}
}