/*
 * Copyright (c) 2020-2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include "tcp_includes.h"

/*
 * This file implements an LBE (less-than-best-effort) congestion control
 * algorithm to compute the receive window of a background transport. It
 * uses the same algorithm as ledbat-plus-plus.
 */

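/*
 * GAIN_CONSTANT caps the slow-start gain divisor computed by rledbat_gain(),
 * and TCP_BASE_RTT_INTERVAL is 60 seconds expressed in TCP_RETRANSHZ units,
 * used to pace the periodic slowdowns.
 */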
#define GAIN_CONSTANT               (16)
#define TCP_BASE_RTT_INTERVAL       (60 * TCP_RETRANSHZ)

void tcp_rledbat_init(struct tcpcb *tp);
void tcp_rledbat_cleanup(struct tcpcb *tp);
void tcp_rledbat_rwnd_init(struct tcpcb *tp);
void tcp_rledbat_data_rcvd(struct tcpcb *tp, struct tcphdr *th,
    struct tcpopt *to, uint32_t segment_len);
uint32_t tcp_rledbat_get_rlwin(struct tcpcb *tp);
void tcp_rledbat_after_idle(struct tcpcb *tp);
void tcp_rledbat_switch_to(struct tcpcb *tp);

struct tcp_rcv_cc_algo tcp_cc_rledbat = {
	.name = "rledbat",
	.init = tcp_rledbat_init,
	.cleanup = tcp_rledbat_cleanup,
	.rwnd_init = tcp_rledbat_rwnd_init,
	.data_rcvd = tcp_rledbat_data_rcvd,
	.get_rlwin = tcp_rledbat_get_rlwin,
	.after_idle = tcp_rledbat_after_idle,
	.switch_to = tcp_rledbat_switch_to,
};

static inline void
rledbat_clear_state(struct tcpcb *tp)
{
	tp->t_rlstate.num_slowdown_events = 0;
	tp->t_rlstate.slowdown_ts = 0;
	tp->t_rlstate.slowdown_begin = 0;
	tp->t_rlstate.rcvd_bytes = 0;
	tp->t_rlstate.md_rcvd_bytes = 0;
	tp->t_rlstate.drained_bytes = 0;
}

void
tcp_rledbat_init(struct tcpcb *tp)
{
	os_atomic_inc(&tcp_cc_rledbat.num_sockets, relaxed);
	rledbat_clear_state(tp);

	tp->t_rlstate.win = tp->t_maxseg * bg_ss_fltsz;
	tp->t_rlstate.ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
}

void
tcp_rledbat_cleanup(struct tcpcb *tp)
{
#pragma unused(tp)
	os_atomic_dec(&tcp_cc_rledbat.num_sockets, relaxed);
}

/*
 * Initialize the receive window for a connection
 */
void
tcp_rledbat_rwnd_init(struct tcpcb *tp)
{
	tp->t_rlstate.win = tp->t_maxseg * bg_ss_fltsz;

	/* If the ssthresh hasn't been set, do it now */
	if (tp->t_rlstate.ssthresh == 0) {
		tp->t_rlstate.ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
	}
}

/*
 * Compute the gain denominator:
 * MIN(16, ceil(2 * TARGET / base))
 */
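/*
 * For example, with target_qdelay = 100 ms and base_rtt = 40 ms the gain is
 * MIN(16, ceil(2 * 100 / 40)) = 5, so during slow start the window grows by
 * roughly one segment for every 5 segments' worth of data received; very
 * small base RTTs are capped at a divisor of 16.
 */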
static uint32_t
rledbat_gain(uint32_t base_rtt)
{
	return MIN(GAIN_CONSTANT, tcp_ceil(2 * target_qdelay /
	           (double)base_rtt));
}

/*
 * Congestion avoidance for ledbat++
 */
static void
rledbat_congestion_avd(struct tcpcb *tp, uint32_t segment_len,
    uint32_t base_rtt, uint32_t curr_rtt, uint32_t now)
{
	uint32_t update = 0;
	/*
	 * Set the next slowdown time, i.e. 9 times the duration
	 * of the previous slowdown, except for the initial slowdown.
	 *
	 * Updated: we now slow down once every 60s, based on our
	 * base RTT interval.
	 */
	if (tp->t_rlstate.slowdown_ts == 0) {
		uint32_t slowdown_duration = TCP_BASE_RTT_INTERVAL;
		if (tp->t_rlstate.num_slowdown_events > 0) {
			if (tp->t_rlstate.ssthresh > tp->t_rlstate.win) {
				/*
				 * Special case for slowdowns (other than the
				 * initial one) where the window doesn't recover
				 * fully to the previous ssthresh
				 */
				slowdown_duration *= 2;
			}
		}
		tp->t_rlstate.slowdown_ts = now + slowdown_duration;

		/* Reset the slowdown start timestamp */
		tp->t_rlstate.slowdown_begin = 0;

		/* On exit from slow start due to higher qdelay, cap the ssthresh */
		if (tp->t_rlstate.ssthresh > tp->t_rlstate.win) {
			tp->t_rlstate.ssthresh = tp->t_rlstate.win;
		}
	}
	if (curr_rtt <= base_rtt + (uint32_t)target_qdelay) {
		/* Additive increase */
		tp->t_rlstate.rcvd_bytes += segment_len;
		if (tp->t_rlstate.rcvd_bytes >= tp->t_rlstate.win) {
			update = tp->t_maxseg;
			tp->t_rlstate.rcvd_bytes -= tp->t_rlstate.win;
			/*
			 * Move the background slow-start threshold up to the
			 * current window so that next time (after some idle
			 * period) we can attempt slow start up to this point
			 * if there is no increase in RTT
			 */
			if (tp->t_rlstate.ssthresh < tp->t_rlstate.win) {
				tp->t_rlstate.ssthresh = tp->t_rlstate.win;
			}
			tp->t_rlstate.win += update;
			tp->t_rlstate.win = min(tcp_round_to(tp->t_rlstate.win, tp->t_maxseg),
			    TCP_MAXWIN << tp->rcv_scale);
		}
	} else {
		/*
		 * If we are still within 1 RTT of the previous reduction
		 * due to loss, do nothing
		 */
		if (now < tp->t_rlstate.reduction_end) {
			return;
		}
		/*
		 * Multiplicative decrease
		 * W -= min(W * (qdelay/target - 1), W/2) (per RTT)
		 * To calculate per bytes acked, it becomes
		 * W -= min((qdelay/target - 1), 1/2) * bytes_acked
		 */
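		/*
		 * For example, once a full window's worth of bytes has been
		 * received with qdelay at 1.25 * target, the factor is
		 * min(1.25 - 1, 0.5) = 0.25 and the window shrinks by a quarter.
		 */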
		uint32_t qdelay = curr_rtt > base_rtt ?
		    (curr_rtt - base_rtt) : 0;

		tp->t_rlstate.md_rcvd_bytes += segment_len;
		if (tp->t_rlstate.md_rcvd_bytes >= tp->t_rlstate.win) {
			update = (uint32_t)(MIN(((double)qdelay / target_qdelay - 1), 0.5) *
			    (double)tp->t_rlstate.win);
			tp->t_rlstate.md_rcvd_bytes -= tp->t_rlstate.win;
			tp->t_rlstate.win -= update;

			if (tp->t_rlstate.win < bg_ss_fltsz * tp->t_maxseg) {
				tp->t_rlstate.win = bg_ss_fltsz * tp->t_maxseg;
			}

			tp->t_rlstate.win = tcp_round_to(tp->t_rlstate.win, tp->t_maxseg);
			/*
			 * Lower the background slow-start threshold so that the
			 * connection stays in the congestion-avoidance phase
			 */
			if (tp->t_rlstate.ssthresh > tp->t_rlstate.win) {
				tp->t_rlstate.ssthresh = tp->t_rlstate.win;
			}

			if (tp->t_rlstate.slowdown_ts != 0) {
				/* As the window has been reduced, defer the slowdown. */
				tp->t_rlstate.slowdown_ts = now + TCP_BASE_RTT_INTERVAL;
			}
		}
	}
}

/*
 * Update win based on the ledbat++ algorithm
 */
void
tcp_rledbat_data_rcvd(struct tcpcb *tp, struct tcphdr *th,
    struct tcpopt *to, uint32_t segment_len)
{
	uint32_t update = 0;
	const uint32_t base_rtt = get_base_rtt(tp);
	const uint32_t curr_rtt = tcp_use_min_curr_rtt ? tp->curr_rtt_min :
	    tp->t_rttcur;
	const uint32_t srtt = tp->rcv_srtt >> TCP_RTT_SHIFT;
	const uint32_t ss_target = (uint32_t)(3 * target_qdelay / 4);
	tp->t_rlstate.drained_bytes += segment_len;
	struct tcp_globals *globals = tcp_get_globals(tp);

	/*
	 * Slowdown period - the first slowdown
	 * is 2 RTTs after we exit the initial slow start.
	 * Subsequent slowdowns come after 9 times the
	 * previous slowdown duration.
	 *
	 * Updated: slowdowns now occur once
	 * every 60s unless they are deferred.
	 */
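	/*
	 * While a slowdown is in effect (for 2 * SRTT below), the window is
	 * clamped to bg_ss_fltsz segments so that queued data can drain;
	 * ssthresh keeps the pre-slowdown window so slow start can rebuild it.
	 */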
	if (tp->t_rlstate.slowdown_ts != 0 &&
	    tcp_globals_now(globals) >= tp->t_rlstate.slowdown_ts) {
		if (tp->t_rlstate.slowdown_begin == 0) {
			tp->t_rlstate.slowdown_begin = tcp_globals_now(globals);
			tp->t_rlstate.num_slowdown_events++;
		}
		if (tcp_globals_now(globals) < tp->t_rlstate.slowdown_ts + (2 * srtt)) {
			// Set rwnd to 2 packets and return
			if (tp->t_rlstate.win > bg_ss_fltsz * tp->t_maxseg) {
				if (tp->t_rlstate.ssthresh < tp->t_rlstate.win) {
					tp->t_rlstate.ssthresh = tp->t_rlstate.win;
				}
				tp->t_rlstate.win = bg_ss_fltsz * tp->t_maxseg;
				/* Reset the accumulated received bytes */
				tp->t_rlstate.rcvd_bytes = 0;
			}
			return;
		}
	}

	/*
	 * Detect retransmissions first by checking if the current
	 * received sequence is smaller than the largest so far while its
	 * timestamp is at least as high as the largest so far. Reduce
	 * win as in fast recovery, but only once per effective RTT.
	 *
	 * Note: As we are detecting retransmissions (not packet loss),
	 * we give some leeway before the next window reduction.
	 */
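	/*
	 * For example, a 64 KB window is halved to 32 KB (but never below
	 * two segments), and further reductions are held off for the next
	 * 2 * SRTT.
	 */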
	if (SEQ_LT(th->th_seq + segment_len, tp->rcv_high) &&
	    TSTMP_GEQ(to->to_tsval, tp->tsv_high)) {
		if (tcp_globals_now(globals) < tp->t_rlstate.reduction_end) {
			/* still need to wait for reduction end to elapse */
			return;
		}

		uint32_t win = tp->t_rlstate.win / 2;
		win = tcp_round_to(win, tp->t_maxseg);
		if (win < 2 * tp->t_maxseg) {
			win = 2 * tp->t_maxseg;
		}
		tp->t_rlstate.ssthresh = win;
		tp->t_rlstate.win = win;

		/* Reset the received bytes */
		tp->t_rlstate.rcvd_bytes = 0;
		tp->t_rlstate.md_rcvd_bytes = 0;

		/* Update the reduction end time */
		tp->t_rlstate.reduction_end = tcp_globals_now(globals) + 2 * srtt;

		if (tp->t_rlstate.slowdown_ts != 0) {
			/* As the window has been halved, defer the slowdown. */
			tp->t_rlstate.slowdown_ts = tcp_globals_now(globals) +
			    TCP_BASE_RTT_INTERVAL;
		}
		return;
	}

	/* Now we can do slow start or CA */
	if (curr_rtt == 0 || base_rtt == 0) {
		update = MIN(segment_len, TCP_CC_CWND_INIT_PKTS *
		    tp->t_maxseg);
		tp->t_rlstate.win += update;
		tp->t_rlstate.win = min(tp->t_rlstate.win,
		    TCP_MAXWIN << tp->rcv_scale);
	} else if (tp->t_rlstate.win < tp->t_rlstate.ssthresh &&
	    ((tp->t_rlstate.num_slowdown_events > 0 &&
	    curr_rtt <= (base_rtt + ((uint32_t)target_qdelay << 1))) ||
	    curr_rtt <= (base_rtt + ss_target))) {
		/*
		 * Modified slow start with a dynamic GAIN.
		 * If the queuing delay is larger than 3/4 of the target
		 * delay, exit slow start, but only if this is the initial
		 * slow start. After the initial slow start, during CA,
		 * window growth will be bounded by ssthresh.
		 *
		 * We enter slow start again only after a slowdown event
		 * and in that case we want to allow the window to grow. The
		 * check for target_qdelay is only a safety net in case
		 * the queuing delay grows to more than twice the target.
		 */
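		/*
		 * With gain_factor = G, the window below grows by roughly one
		 * maxseg for every G maxsegs received, capped at
		 * TCP_CC_CWND_INIT_PKTS segments per update.
		 */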
		tp->t_rlstate.rcvd_bytes += segment_len;
		uint32_t gain_factor = rledbat_gain(base_rtt);
		if (tp->t_rlstate.rcvd_bytes >= tp->t_maxseg * gain_factor) {
			update = MIN(tp->t_rlstate.rcvd_bytes / gain_factor,
			    TCP_CC_CWND_INIT_PKTS * tp->t_maxseg);
			tp->t_rlstate.rcvd_bytes = 0;
			tp->t_rlstate.win += update;
			tp->t_rlstate.win = min(tcp_round_to(tp->t_rlstate.win, tp->t_maxseg),
			    TCP_MAXWIN << tp->rcv_scale);
		}

		/* Reset the next slowdown timestamp */
		if (tp->t_rlstate.slowdown_ts != 0) {
			tp->t_rlstate.slowdown_ts = 0;
		}
	} else {
		/* Congestion avoidance */
		rledbat_congestion_avd(tp, segment_len, base_rtt, curr_rtt, tcp_globals_now(globals));
	}
}

uint32_t
tcp_rledbat_get_rlwin(struct tcpcb *tp)
{
	/* rlwin is either greater, or smaller by at most the drained bytes */
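	/*
	 * For example, if win drops from 128 KB to 64 KB and roughly 16 KB is
	 * drained between window updates, win_ws steps down 16 KB at a time
	 * until it reaches win.
	 */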
	if (tp->t_rlstate.win > tp->t_rlstate.win_ws ||
	    tp->t_rlstate.win_ws - tp->t_rlstate.win < tp->t_rlstate.drained_bytes) {
		tp->t_rlstate.win_ws = tp->t_rlstate.win;
	} else if (tp->t_rlstate.win < tp->t_rlstate.win_ws) {
		/*
		 * rlwin is smaller, decrease the advertised window
		 * only by drained bytes at a time
		 */
		tp->t_rlstate.win_ws = tp->t_rlstate.win_ws -
		    tp->t_rlstate.drained_bytes;
	}
	tp->t_rlstate.drained_bytes = 0;
	/* Round up to the receive window scale */
	tp->t_rlstate.win_ws = tcp_round_up(tp->t_rlstate.win_ws, 1 << tp->rcv_scale);

	return tp->t_rlstate.win_ws;
}

/*
 * Function to handle connections that have been idle for
 * some time. Slow start to get ack "clock" running again.
 * Clear base history after idle time.
 */
void
tcp_rledbat_after_idle(struct tcpcb *tp)
{
	rledbat_clear_state(tp);
	/* Reset the rledbat window */
	tp->t_rlstate.win = tp->t_maxseg * bg_ss_fltsz;
}

void
tcp_rledbat_switch_to(struct tcpcb *tp)
{
	rledbat_clear_state(tp);
	uint32_t recwin = 0;

	if (tp->t_rlstate.win == 0) {
		/*
		 * Use half of the previous window; the algorithm
		 * will quickly reduce the window if there is still
		 * high queueing delay.
		 */
		int32_t win = tcp_sbspace(tp);
		if (win < 0) {
			win = 0;
		}

		recwin = MAX(win, (int)(tp->rcv_adv - tp->rcv_nxt));
		recwin = recwin / 2;
	} else {
		/*
		 * Reduce the window to half of the previous value,
		 * but it should be at least 64K
		 */
		recwin = MAX(tp->t_rlstate.win / 2, TCP_MAXWIN);
	}

	recwin = tcp_round_to(recwin, tp->t_maxseg);
	if (recwin < bg_ss_fltsz * tp->t_maxseg) {
		recwin = bg_ss_fltsz * tp->t_maxseg;
	}
	tp->t_rlstate.win = recwin;

	/* ssthresh should be at most the initial value */
	if (tp->t_rlstate.ssthresh == 0) {
		tp->t_rlstate.ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
	} else {
		tp->t_rlstate.ssthresh = MIN(tp->t_rlstate.ssthresh,
		    TCP_MAXWIN << TCP_MAX_WINSHIFT);
	}
}