xref: /xnu-11215.1.10/bsd/netinet/tcp_rack.c (revision 8d741a5de7ff4191bf97d57b9f54c2f6d4a15585)
1 /*
2  * Copyright (c) 2020-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include "tcp_includes.h"
30 
31 #define REORDERING_WINDOW_FLOOR (2) /* If min_rtt is too small, at least wait for a reordering window of 2ms */
32 
33 /* RACK is implemented by following RFC 8985 */
34 void
tcp_rack_transmit_seg(struct tcpcb * tp,struct tcp_seg_sent * seg,tcp_seq start,tcp_seq end,uint32_t xmit_ts,uint8_t flags)35 tcp_rack_transmit_seg(struct tcpcb *tp, struct tcp_seg_sent *seg, tcp_seq start, tcp_seq end,
36     uint32_t xmit_ts, uint8_t flags)
37 {
38 	seg->start_seq = start;
39 	seg->end_seq = end;
40 	seg->xmit_ts = xmit_ts;
41 
42 	/*
43 	 * Set dsack round_end at the start of a (re) transmission
44 	 * round to the segment with the smallest sequence sent
45 	 */
46 	if (SEQ_LT(start, tp->rack.dsack_round_end)) {
47 		tp->rack.dsack_round_end = start;
48 	}
49 	seg->flags |= flags;
50 	if (seg->flags & TCP_RACK_RETRANSMITTED) {
51 		tp->bytes_retransmitted += tcp_seg_len(seg);
52 	}
53 	/*
54 	 * Set segs_retransmitted ONLY when it is not set, otherwise new segments
55 	 * can clear this even if there are retransmitted segments
56 	 */
57 	if (!tp->rack.segs_retransmitted) {
58 		tp->rack.segs_retransmitted = !!(flags & TCP_RACK_RETRANSMITTED);
59 	}
60 }
61 
62 /* If segment (t1, seq1) was sent after segment (t2, seq2) */
63 static bool
tcp_rack_sent_after(uint32_t t1,uint32_t seq1,uint32_t t2,uint32_t seq2)64 tcp_rack_sent_after(uint32_t t1, uint32_t seq1, uint32_t t2, uint32_t seq2)
65 {
66 	return (t1 > t2) || (t1 == t2 && SEQ_GT(seq1, seq2));
67 }
68 
69 void
tcp_rack_update_reordering_win_persist(struct tcpcb * tp)70 tcp_rack_update_reordering_win_persist(struct tcpcb *tp)
71 {
72 	if (tp->rack.reo_wnd_persist != 0) {
73 		tp->rack.reo_wnd_persist--;
74 	}
75 }
76 
77 void
tcp_rack_bad_rexmt_restore(struct tcpcb * tp)78 tcp_rack_bad_rexmt_restore(struct tcpcb *tp)
79 {
80 	/* Force RACK to re-examine losses */
81 	tp->rack.advanced = 1;
82 
83 	/* Restore reordering window persist value */
84 	tp->rack.reo_wnd_persist = MIN(tp->rack.reo_wnd_persist + 1,
85 	    TCP_RACK_RECOVERY_PERSIST_MAX);
86 }
87 
88 void
tcp_rack_reset_segs_retransmitted(struct tcpcb * tp)89 tcp_rack_reset_segs_retransmitted(struct tcpcb *tp)
90 {
91 	tp->rack.segs_retransmitted = false;
92 }
93 
94 /* MUST be called before we have processed dup ACKs and made a decision to enter recovery */
95 static uint32_t
tcp_rack_reordering_window(struct tcpcb * tp,uint32_t dup_acks,bool in_rto)96 tcp_rack_reordering_window(struct tcpcb *tp, uint32_t dup_acks, bool in_rto)
97 {
98 	if (tp->t_reordered_pkts == 0) {
99 		/*
100 		 * When no reordering has been observed, the RACK.reo_wnd is set
101 		 * to 0 both during fast and RTO recovery. OR if we are entering
102 		 * fast recovery due to SACKed segments/dup ACKs >= DupThresh.
103 		 * When reo_wnd is set to 0, loss is detected if RACK.RTT time has
104 		 * elapsed since packet was sent.
105 		 */
106 		if (IN_FASTRECOVERY(tp) || dup_acks >= tp->t_rexmtthresh || in_rto) {
107 			return 0;
108 		}
109 	}
110 	/*
111 	 * reordering window = N * Min_RTT/4,
112 	 * limited to a max value of SRTT.
113 	 */
114 	uint32_t reordering_window = MIN((tp->rack.reo_wnd_multi * (get_base_rtt(tp) >> 2)),
115 	    (uint32_t)(tp->t_srtt >> TCP_RTT_SHIFT));
116 	reordering_window = MAX(reordering_window, REORDERING_WINDOW_FLOOR);
117 
118 	return reordering_window;
119 }
120 
121 static uint32_t
tcp_rack_detect_segment_lost(struct tcpcb * tp,struct tcp_seg_sent * seg,uint32_t reordering_window,bool * loss_detected)122 tcp_rack_detect_segment_lost(struct tcpcb *tp, struct tcp_seg_sent *seg,
123     uint32_t reordering_window, bool *loss_detected)
124 {
125 	/* Skip already marked lost but not yet retransmitted segments */
126 	if (seg->flags & TCP_SEGMENT_LOST &&
127 	    !(seg->flags & TCP_RACK_RETRANSMITTED)) {
128 		return 0;
129 	}
130 
131 	if (seg->flags & TCP_SEGMENT_SACKED) {
132 		return 0;
133 	}
134 
135 	/* After the segment is sent, wait for (RTT + reordering window) */
136 	uint32_t wait_ts = seg->xmit_ts + tp->rack.rtt + reordering_window;
137 	if (TSTMP_GEQ(tcp_now, wait_ts)) {
138 		/*
139 		 * Segment should be marked as lost as it was sent
140 		 * (RTT + reordering window) time ago.
141 		 */
142 		tcp_mark_seg_lost(tp, seg);
143 		if (loss_detected != NULL) {
144 			*loss_detected = true;
145 		}
146 		return 0;
147 	}
148 	return wait_ts - tcp_now;
149 }
150 
/*
 * RFC 8985,
 * Step 1: Update RACK.min_RTT (done in tcp_input)
 * Step 2: Update the state for the most recently sent segment that has been delivered.
 *
 * tsecr          - echoed timestamp from the ACK's TS option (0 if unavailable)
 * xmit_ts        - transmit timestamp of the acknowledged segment
 * end_seq        - ending sequence number of the acknowledged segment
 * retransmitted  - true if any part of the segment was ever retransmitted
 */
void
tcp_rack_update_segment_acked(struct tcpcb *tp, uint32_t tsecr,
    uint32_t xmit_ts, uint32_t end_seq,
    bool retransmitted)
{
	/*
	 * Step 1: Update RACK.min_RTT - is done in tcp_input ACK processing
	 */
	uint32_t rtt = tcp_now - xmit_ts;
	if (rtt == 0) {
		/*
		 * As rtt has millisecond precision,
		 * make adjustment for sub ms RTT
		 */
		rtt = 1;
	}
	/*
	 * RFC 8985 - An ACK can acknowledge retransmitted data and because retransmissions
	 * can be spurious, ignore ACKs for such retransmitted segments.
	 * Ignore a segment if any of its sequence range has been retransmitted
	 * before and if either of two conditions is true:
	 * 1. The TSecr of the ACK's timestamp option (if available) indicates the ACK was not
	 *	acknowledging the last (re)transmission OR tsecr was invalid (greater than tcp_now)
	 * 2. If TSecr is not available or ACK arrives immediately after last retransmission,
	 *  check if the segment was last (re)transmitted less than RACK.min_rtt ago.
	 */
	if (retransmitted) {
		if ((tsecr != 0 && (TSTMP_LT(tsecr, xmit_ts) || TSTMP_GT(tsecr, tcp_now)))
		    || rtt < get_base_rtt(tp)) {
			// Spurious inference
			TCP_LOG(tp, "Spurious inference as either "
			    "tsecr (%u) doesn't lie between xmit_ts(%u) and now (%u) OR "
			    "the rtt (%u) is less than base-rtt (%u). end_seq is:%u",
			    tsecr, xmit_ts, tcp_now, rtt, get_base_rtt(tp), end_seq);
			return;
		}
	}

	/* Advance RACK state only if this segment was sent after RACK.segment */
	if (tcp_rack_sent_after(xmit_ts, end_seq, tp->rack.xmit_ts, tp->rack.end_seq)) {
		tp->rack.advanced = 1;
		tp->rack.xmit_ts = xmit_ts;
		tp->rack.end_seq = end_seq;
		tp->rack.rtt = rtt;

		/* Cancel the RACK reordering timer as we have received a new ACK */
		tp->t_timer[TCPT_REORDER] = 0;
	}
}
204 
205 /*
206  * Step 3: Reordering detection is done in tcp_sack_detect_reordering
207  * Step 4: Update the RACK reordering window.
208  */
209 void
tcp_rack_update_reordering_window(struct tcpcb * tp,tcp_seq th_ack)210 tcp_rack_update_reordering_window(struct tcpcb *tp, tcp_seq th_ack)
211 {
212 	/*
213 	 * RACK.reo_wnd starts with a value of RACK.min_RTT/4. After that, RACK
214 	 * dynamically adapts to higher degrees of reordering using DSACK
215 	 * option from the receiver.
216 	 * To deal with temporary reordering, RACK persists using the inflated
217 	 * RACK.reo_wnd for up to 16 loss recoveries, after which it resets
218 	 * RACK.reo_wnd to its starting value.
219 	 */
220 
221 	/*
222 	 * Ignore DSACK if an RTT hasn't passed as th_ack < previous dsack_round_end
223 	 */
224 	if (SEQ_LT(th_ack, tp->rack.dsack_round_end)) {
225 		tp->rack.dsack_round_seen = 0;
226 	}
227 	/*
228 	 * Start of the new dsack round.
229 	 * Grow the reordering window once per round that sees DSACK on an ACK.
230 	 * Reordering window persists for 16 loss recoveries (that don't receive DSACK).
231 	 * On receiving DSACK, we reset window persist to 16 as it
232 	 * indicates that reordering is still happening.
233 	 */
234 	if (tp->rack.dsack_round_seen == 1) {
235 		tp->rack.dsack_round_seen = 0;
236 		tp->rack.dsack_round_end = tp->snd_nxt;
237 		tp->rack.reo_wnd_multi = (uint8_t)(min(0xFF, tp->rack.reo_wnd_multi + 1));
238 		tp->rack.reo_wnd_persist = TCP_RACK_RECOVERY_PERSIST_MAX;
239 	} else if (tp->rack.reo_wnd_persist == 0) {
240 		tp->rack.reo_wnd_multi = 1;
241 	}
242 }
243 
244 /*
245  * Step 5: Detect losses
246  * Call this only after S/ACK has been processed, so that s/acked segments
247  * are either removed or marked accordingly
248  */
249 static uint32_t
tcp_rack_detect_loss(struct tcpcb * tp,uint32_t dup_acks,bool * loss_detected)250 tcp_rack_detect_loss(struct tcpcb *tp, uint32_t dup_acks, bool *loss_detected)
251 {
252 	struct tcp_seg_sent *seg = NULL;
253 	uint32_t reordering_timeout = 0;
254 	uint32_t reordering_window = tcp_rack_reordering_window(tp, dup_acks, false);
255 
256 	TAILQ_FOREACH(seg, &tp->t_segs_sent, tx_link) {
257 		/*
258 		 * No segment after this segment has been acknowledged yet,
259 		 * hence RACK.segment is not after this segment
260 		 */
261 		if (!tcp_rack_sent_after(tp->rack.xmit_ts, tp->rack.end_seq,
262 		    seg->xmit_ts, seg->end_seq)) {
263 			break;
264 		}
265 
266 		uint32_t remaining = tcp_rack_detect_segment_lost(tp, seg, reordering_window, loss_detected);
267 		if (remaining) {
268 			/*
269 			 * We only want to arm the timer at max wait time as we are
270 			 * expecting to get ACKs to do RACK processing. Only in the
271 			 * worst case, when we don't receive ACKs, we set the timeout
272 			 * to be the wait time for the most recently sent packet.
273 			 */
274 			reordering_timeout = max(remaining, reordering_timeout);
275 		}
276 	}
277 	return reordering_timeout;
278 }
279 
280 /*
281  * Call during input processing to detect loss.
282  * If loss is detected, enter_fr will be true and
283  * tcp_input will enter fast recovery
284  */
285 bool
tcp_rack_detect_loss_and_arm_timer(struct tcpcb * tp,uint32_t dup_acks)286 tcp_rack_detect_loss_and_arm_timer(struct tcpcb *tp, uint32_t dup_acks)
287 {
288 	uint32_t reordering_timeout = 0;
289 	bool loss_detected = false;
290 
291 	if (!tp->rack.advanced) {
292 		return false;
293 	}
294 
295 	/* Cancel any existing RACK reordering timer as we are going to re-fire it if needed */
296 	tp->t_timer[TCPT_REORDER] = 0;
297 
298 	reordering_timeout = tcp_rack_detect_loss(tp, dup_acks, &loss_detected);
299 	if (reordering_timeout) {
300 		tp->t_timer[TCPT_REORDER] = OFFSET_FROM_START(tp,
301 		    reordering_timeout + REORDERING_WINDOW_FLOOR);
302 		/* Since losses can be marked at future point, clear the TLP timer */
303 		tp->t_timer[TCPT_PTO] = 0;
304 	} else {
305 		/* Cancel any pending timers */
306 		tp->t_timer[TCPT_REORDER] = 0;
307 	}
308 
309 	return loss_detected;
310 }
311 
312 /* Reordering timeout has expired, detect loss and enter recovery */
313 void
tcp_rack_reordering_timeout(struct tcpcb * tp,uint32_t dup_acks)314 tcp_rack_reordering_timeout(struct tcpcb *tp, uint32_t dup_acks)
315 {
316 	bool enter_fr = false;
317 
318 	tcp_rack_detect_loss(tp, dup_acks, &enter_fr);
319 
320 	if (enter_fr) {
321 		/* Some packets have been marked as lost */
322 		if (!IN_FASTRECOVERY(tp)) {
323 			tcp_rexmt_save_state(tp);
324 			tcp_enter_fast_recovery(tp);
325 		}
326 		tcpstat.tcps_rack_reordering_timeout_recovery_episode++;
327 		tp->t_rack_reo_timeout_recovery_episode++;
328 		tcp_output(tp);
329 	}
330 }
331 
332 void
tcp_rack_loss_on_rto(struct tcpcb * tp,bool in_rto)333 tcp_rack_loss_on_rto(struct tcpcb *tp, bool in_rto)
334 {
335 	struct tcp_seg_sent *seg = NULL;
336 	uint32_t reordering_window = tcp_rack_reordering_window(tp, 0, in_rto);
337 
338 	TAILQ_FOREACH(seg, &tp->t_segs_sent, tx_link) {
339 		/* Mark the first unacknowledged segment as lost */
340 		if (seg->start_seq == tp->snd_una) {
341 			tcp_mark_seg_lost(tp, seg);
342 		}
343 		/*
344 		 * Mark any segment for which time elapsed since transmit
345 		 * is at least the sum of recent RTT and reordering window
346 		 */
347 		tcp_rack_detect_segment_lost(tp, seg, reordering_window, NULL);
348 	}
349 }
350 
351 uint32_t
tcp_rack_adjust(struct tcpcb * tp,uint32_t cwin)352 tcp_rack_adjust(struct tcpcb *tp, uint32_t cwin)
353 {
354 	uint32_t max_len = 0;
355 	struct tcp_seg_sent *seg = NULL;
356 
357 	/*
358 	 * We traverse RB tree (instead of time-ordered list)
359 	 * as it would be faster to look for a seg such that
360 	 * seg->start <= snd_nxt < seg->end
361 	 */
362 	RB_FOREACH(seg, tcp_seg_sent_tree_head, &tp->t_segs_sent_tree) {
363 		if (max_len >= cwin) {
364 			break;
365 		}
366 		if (seg->flags & TCP_SEGMENT_SACKED) {
367 			if (SEQ_LT(tp->snd_nxt, seg->end_seq) &&
368 			    SEQ_GEQ(tp->snd_nxt, seg->start_seq)) {
369 				tp->snd_nxt = seg->end_seq;
370 			}
371 			continue;
372 		}
373 		if (SEQ_LT(tp->snd_nxt, seg->end_seq)) {
374 			max_len += tcp_seg_len(seg);
375 		}
376 	}
377 
378 	return max_len;
379 }
380 
381 /* This function is only used during retransmissions. */
382 struct tcp_seg_sent *
tcp_rack_output(struct tcpcb * tp,uint32_t cwin,uint16_t * rack_seg_len)383 tcp_rack_output(struct tcpcb *tp, uint32_t cwin, uint16_t *rack_seg_len)
384 {
385 	struct tcp_seg_sent *seg = NULL;
386 
387 	TAILQ_FOREACH(seg, &tp->t_segs_sent, tx_link) {
388 		if (seg->flags & TCP_SEGMENT_SACKED) {
389 			continue;
390 		}
391 		if (seg->flags & TCP_SEGMENT_LOST && !(seg->flags & TCP_RACK_RETRANSMITTED)) {
392 			/* We don't do TSO for retransmissions and only send MSS sized segments */
393 			uint16_t allowed_size = (uint16_t)min(cwin, tp->t_maxseg);
394 			/*
395 			 * When entire segment can be retransmitted,
396 			 * lost segment is moved to the end of the time-ordered
397 			 * list in tcp_seg_sent_insert.
398 			 *
399 			 * When entire segment can't be retransmitted,
400 			 * we move the seg->start by amount of data
401 			 * retransmitted during tcp_seg_sent_insert
402 			 */
403 			*rack_seg_len = tcp_seg_len(seg) <= allowed_size ?
404 			    (uint16_t)tcp_seg_len(seg) : allowed_size;
405 
406 			break;
407 		}
408 	}
409 
410 	return seg;
411 }
412 
413 /*
414  * Check if a retransmitted segment was completed covered by received
415  * (first) DSACK block
416  */
417 void
tcp_rack_detect_reordering_dsack(struct tcpcb * tp,tcp_seq start,tcp_seq end)418 tcp_rack_detect_reordering_dsack(struct tcpcb *tp, tcp_seq start, tcp_seq end)
419 {
420 	struct tcp_seg_sent *seg = NULL;
421 
422 	TAILQ_FOREACH(seg, &tp->t_segs_sent, tx_link) {
423 		if (seg->flags & TCP_SEGMENT_RETRANSMITTED_ATLEAST_ONCE) {
424 			if (SEQ_LEQ(start, seg->start_seq) && SEQ_GEQ(end, seg->end_seq)) {
425 				tp->t_reordered_pkts++;
426 			}
427 		}
428 	}
429 }
430 
431 void
tcp_rack_detect_reordering_acked(struct tcpcb * tp,struct tcp_seg_sent * seg)432 tcp_rack_detect_reordering_acked(struct tcpcb *tp, struct tcp_seg_sent *seg)
433 {
434 	/*
435 	 * A never retransmitted segment below fack was delivered.
436 	 * Ignore the segments that have already been sacked before
437 	 */
438 	if (SEQ_LT(seg->end_seq, tp->snd_fack) &&
439 	    (seg->flags & (TCP_SEGMENT_SACKED | TCP_SEGMENT_RETRANSMITTED_ATLEAST_ONCE)) == 0) {
440 		tp->t_reordered_pkts++;
441 	}
442 }
443