1 /*
2 * Copyright (c) 2020-2021 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 #include "tcp_includes.h"
30
31 #define REORDERING_WINDOW_FLOOR (2) /* If min_rtt is too small, at least wait for a reordering window of 2ms */
32
33 /* RACK is implemented by following RFC 8985 */
34 void
tcp_rack_transmit_seg(struct tcpcb * tp,struct tcp_seg_sent * seg,tcp_seq start,tcp_seq end,uint32_t xmit_ts,uint8_t flags)35 tcp_rack_transmit_seg(struct tcpcb *tp, struct tcp_seg_sent *seg, tcp_seq start, tcp_seq end,
36 uint32_t xmit_ts, uint8_t flags)
37 {
38 seg->start_seq = start;
39 seg->end_seq = end;
40 seg->xmit_ts = xmit_ts;
41
42 /*
43 * Set dsack round_end at the start of a (re) transmission
44 * round to the segment with the smallest sequence sent
45 */
46 if (SEQ_LT(start, tp->rack.dsack_round_end)) {
47 tp->rack.dsack_round_end = start;
48 }
49 seg->flags |= flags;
50 if (seg->flags & TCP_RACK_RETRANSMITTED) {
51 tp->bytes_retransmitted += tcp_seg_len(seg);
52 }
53 /*
54 * Set segs_retransmitted ONLY when it is not set, otherwise new segments
55 * can clear this even if there are retransmitted segments
56 */
57 if (!tp->rack.segs_retransmitted) {
58 tp->rack.segs_retransmitted = !!(flags & TCP_RACK_RETRANSMITTED);
59 }
60 }
61
62 /* If segment (t1, seq1) was sent after segment (t2, seq2) */
63 static bool
tcp_rack_sent_after(uint32_t t1,uint32_t seq1,uint32_t t2,uint32_t seq2)64 tcp_rack_sent_after(uint32_t t1, uint32_t seq1, uint32_t t2, uint32_t seq2)
65 {
66 return (t1 > t2) || (t1 == t2 && SEQ_GT(seq1, seq2));
67 }
68
69 void
tcp_rack_update_reordering_win_persist(struct tcpcb * tp)70 tcp_rack_update_reordering_win_persist(struct tcpcb *tp)
71 {
72 if (tp->rack.reo_wnd_persist != 0) {
73 tp->rack.reo_wnd_persist--;
74 }
75 }
76
77 void
tcp_rack_bad_rexmt_restore(struct tcpcb * tp)78 tcp_rack_bad_rexmt_restore(struct tcpcb *tp)
79 {
80 /* Force RACK to re-examine losses */
81 tp->rack.advanced = 1;
82
83 /* Restore reordering window persist value */
84 tp->rack.reo_wnd_persist = MIN(tp->rack.reo_wnd_persist + 1,
85 TCP_RACK_RECOVERY_PERSIST_MAX);
86 }
87
88 void
tcp_rack_reset_segs_retransmitted(struct tcpcb * tp)89 tcp_rack_reset_segs_retransmitted(struct tcpcb *tp)
90 {
91 tp->rack.segs_retransmitted = false;
92 }
93
94 /* MUST be called before we have processed dup ACKs and made a decision to enter recovery */
95 static uint32_t
tcp_rack_reordering_window(struct tcpcb * tp,uint32_t dup_acks,bool in_rto)96 tcp_rack_reordering_window(struct tcpcb *tp, uint32_t dup_acks, bool in_rto)
97 {
98 if (tp->t_reordered_pkts == 0) {
99 /*
100 * When no reordering has been observed, the RACK.reo_wnd is set
101 * to 0 both during fast and RTO recovery. OR if we are entering
102 * fast recovery due to SACKed segments/dup ACKs >= DupThresh.
103 * When reo_wnd is set to 0, loss is detected if RACK.RTT time has
104 * elapsed since packet was sent.
105 */
106 if (IN_FASTRECOVERY(tp) || dup_acks >= tp->t_rexmtthresh || in_rto) {
107 return 0;
108 }
109 }
110 /*
111 * reordering window = N * Min_RTT/4,
112 * limited to a max value of 2*SRTT.
113 */
114 uint32_t srtt = (uint32_t)tp->t_srtt >> TCP_RTT_SHIFT;
115 uint32_t reordering_window = (tp->rack.reo_wnd_multi * get_base_rtt(tp)) >> 2;
116 if (reordering_window > 2 * srtt) {
117 reordering_window = 2 * srtt;
118 }
119 reordering_window = MAX(reordering_window, REORDERING_WINDOW_FLOOR);
120
121 return reordering_window;
122 }
123
124 static uint32_t
tcp_rack_detect_segment_lost(struct tcpcb * tp,struct tcp_seg_sent * seg,uint32_t reordering_window,bool * loss_detected)125 tcp_rack_detect_segment_lost(struct tcpcb *tp, struct tcp_seg_sent *seg,
126 uint32_t reordering_window, bool *loss_detected)
127 {
128 /* After the segment is sent, wait for (RTT + reordering window) */
129 uint32_t wait_ts = seg->xmit_ts + tp->rack.rtt + reordering_window;
130 if (TSTMP_GEQ(tcp_now, wait_ts)) {
131 /*
132 * Segment should be marked as lost as it was sent
133 * (RTT + reordering window) time ago.
134 */
135 tcp_mark_seg_lost(tp, seg);
136 if (loss_detected != NULL) {
137 *loss_detected = true;
138 }
139 return 0;
140 }
141 return wait_ts - tcp_now;
142 }
143
/*
 * RFC 8985,
 * Step 1: Update RACK.min_RTT (done in tcp_input)
 * Step 2: Update the state for the most recently sent segment that has been delivered.
 *
 * tsecr         - TSecr echoed by the ACK's timestamp option (0 when absent)
 * xmit_ts       - last (re)transmit timestamp of the newly delivered segment
 * end_seq       - ending sequence number of that segment
 * retransmitted - true if any part of the segment's range was ever retransmitted
 */
void
tcp_rack_update_segment_acked(struct tcpcb *tp, uint32_t tsecr,
    uint32_t xmit_ts, uint32_t end_seq,
    bool retransmitted)
{
	/*
	 * Step 1: Update RACK.min_RTT - is done in tcp_input ACK processing
	 */
	uint32_t rtt = tcp_now - xmit_ts;
	if (rtt == 0) {
		/*
		 * As rtt has millisecond precision,
		 * make adjustment for sub ms RTT
		 */
		rtt = 1;
	}
	/*
	 * RFC 8985 - An ACK can acknowledge retransmitted data and because retransmissions
	 * can be spurious, ignore ACKs for such retransmitted segments.
	 * Ignore a segment if any of its sequence range has been retransmitted
	 * before and if either of two conditions is true:
	 * 1. The TSecr of the ACK's timestamp option (if available) indicates the ACK was not
	 *    acknowledging the last (re)transmission OR tsecr was invalid (greater than tcp_now)
	 * 2. If TSecr is not available or ACK arrives immediately after last retransmission,
	 *    check if the segment was last (re)transmitted less than RACK.min_rtt ago.
	 */
	if (retransmitted) {
		if ((tsecr != 0 && (TSTMP_LT(tsecr, xmit_ts) || TSTMP_GT(tsecr, tcp_now)))
		    || rtt < get_base_rtt(tp)) {
			/* This is a spurious inference as either
			 * tsecr doesn't lie between xmit_ts and now OR
			 * the rtt computed using the xmit_ts of this segment
			 * is less than base-rtt.
			 */
			return;
		}
	}

	/* Advance RACK state only when this delivery is the most recently sent one seen */
	if (tcp_rack_sent_after(xmit_ts, end_seq, tp->rack.xmit_ts, tp->rack.end_seq)) {
		tp->rack.advanced = 1;
		tp->rack.xmit_ts = xmit_ts;
		tp->rack.end_seq = end_seq;
		tp->rack.rtt = rtt;

		/* Cancel the RACK reordering timer as we have received a new ACK */
		tp->t_timer[TCPT_REORDER] = 0;
	}
}
197
/*
 * Step 3: Reordering detection is done in tcp_sack_detect_reordering
 * Step 4: Update the RACK reordering window.
 *
 * highest_acked_sacked - highest sequence cumulatively ACKed or SACKed
 *                        by the ACK being processed
 */
void
tcp_rack_update_reordering_window(struct tcpcb *tp, tcp_seq highest_acked_sacked)
{
	/*
	 * RACK.reo_wnd starts with a value of RACK.min_RTT/4. After that, RACK
	 * dynamically adapts to higher degrees of reordering using DSACK
	 * option from the receiver.
	 * To deal with temporary reordering, RACK persists using the inflated
	 * RACK.reo_wnd for up to 16 loss recoveries, after which it resets
	 * RACK.reo_wnd to its starting value.
	 */

	/*
	 * Ignore DSACK if an RTT hasn't passed as
	 * highest_acked_sacked <= previous dsack_round_end
	 */
	if (SEQ_LEQ(highest_acked_sacked, tp->rack.dsack_round_end)) {
		tp->rack.dsack_round_seen = 0;
	}
	/*
	 * Start of the new dsack round.
	 * Grow the reordering window once per round that sees DSACK on an ACK.
	 * Reordering window persists for 16 loss recoveries (that don't receive DSACK).
	 * On receiving DSACK, we reset window persist to 16 as it
	 * indicates that reordering is still happening.
	 */
	if (tp->rack.dsack_round_seen == 1) {
		tp->rack.dsack_round_seen = 0;
		/* Next round ends when data sent beyond the current snd_nxt is covered */
		tp->rack.dsack_round_end = tp->snd_nxt;
		/* Grow the multiplier by one, saturating at 255 (uint8_t) */
		tp->rack.reo_wnd_multi = (uint8_t)(min(0xFF, tp->rack.reo_wnd_multi + 1));
		tp->rack.reo_wnd_persist = TCP_RACK_RECOVERY_PERSIST_MAX;
	} else if (tp->rack.reo_wnd_persist == 0) {
		/* Persistence budget exhausted: fall back to the starting multiplier */
		tp->rack.reo_wnd_multi = 1;
	}
}
237
/*
 * Step 5: Detect losses
 * Call this only after S/ACK has been processed, so that s/acked segments
 * are either removed or marked accordingly
 *
 * Returns 0 when no surviving segment is still inside its reordering
 * window, otherwise the longest remaining wait time (used by the caller to
 * arm the reordering timer). *loss_detected is set to true when at least
 * one segment was newly marked lost.
 */
static uint32_t
tcp_rack_detect_loss(struct tcpcb *tp, uint32_t dup_acks, bool *loss_detected)
{
	struct tcp_seg_sent *seg = NULL;
	uint32_t reordering_timeout = 0;
	uint32_t reordering_window = tcp_rack_reordering_window(tp, dup_acks, false);

	/* Walk segments in transmit-time order, oldest first */
	TAILQ_FOREACH(seg, &tp->t_segs_sent, tx_link) {
		/*
		 * No segment after this segment has been acknowledged yet,
		 * hence RACK.segment is not after this segment
		 */
		if (!tcp_rack_sent_after(tp->rack.xmit_ts, tp->rack.end_seq,
		    seg->xmit_ts, seg->end_seq)) {
			break;
		}

		/* Skip already marked lost but not yet retransmitted segments */
		if (seg->flags & TCP_SEGMENT_LOST &&
		    !(seg->flags & TCP_RACK_RETRANSMITTED)) {
			continue;
		}

		/* SACKed segments were delivered and cannot be lost */
		if (seg->flags & TCP_SEGMENT_SACKED) {
			continue;
		}

		uint32_t remaining = tcp_rack_detect_segment_lost(tp, seg, reordering_window, loss_detected);
		if (remaining) {
			/*
			 * We only want to arm the timer at max wait time as we are
			 * expecting to get ACKs to do RACK processing. Only in the
			 * worst case, when we don't receive ACKs, we set the timeout
			 * to be the wait time for the most recently sent packet.
			 */
			reordering_timeout = max(remaining, reordering_timeout);
		}
	}
	return reordering_timeout;
}
283
284 /*
285 * Call during input processing to detect loss.
286 * If loss is detected, enter_fr will be true and
287 * tcp_input will enter fast recovery
288 */
289 bool
tcp_rack_detect_loss_and_arm_timer(struct tcpcb * tp,uint32_t dup_acks)290 tcp_rack_detect_loss_and_arm_timer(struct tcpcb *tp, uint32_t dup_acks)
291 {
292 uint32_t reordering_timeout = 0;
293 bool loss_detected = false;
294
295 if (!tp->rack.advanced) {
296 return false;
297 }
298
299 /* Cancel any existing RACK reordering timer as we are going to re-fire it if needed */
300 tp->t_timer[TCPT_REORDER] = 0;
301
302 reordering_timeout = tcp_rack_detect_loss(tp, dup_acks, &loss_detected);
303 if (reordering_timeout) {
304 tp->t_timer[TCPT_REORDER] = OFFSET_FROM_START(tp,
305 reordering_timeout + REORDERING_WINDOW_FLOOR);
306 /* Since losses can be marked at future point, clear the TLP timer */
307 tp->t_timer[TCPT_PTO] = 0;
308 } else {
309 /* Cancel any pending timers */
310 tp->t_timer[TCPT_REORDER] = 0;
311 }
312
313 return loss_detected;
314 }
315
316 /* Reordering timeout has expired, detect loss and enter recovery */
317 void
tcp_rack_reordering_timeout(struct tcpcb * tp,uint32_t dup_acks)318 tcp_rack_reordering_timeout(struct tcpcb *tp, uint32_t dup_acks)
319 {
320 bool enter_fr = false;
321
322 tcp_rack_detect_loss(tp, dup_acks, &enter_fr);
323
324 if (enter_fr) {
325 /* Some packets have been marked as lost */
326 if (!IN_FASTRECOVERY(tp)) {
327 tcp_rexmt_save_state(tp);
328 tcp_enter_fast_recovery(tp);
329 }
330 tcpstat.tcps_rack_reordering_timeout_recovery_episode++;
331 tp->t_rack_reo_timeout_recovery_episode++;
332 tcp_output(tp);
333 }
334 }
335
336 void
tcp_rack_loss_on_rto(struct tcpcb * tp,bool in_rto)337 tcp_rack_loss_on_rto(struct tcpcb *tp, bool in_rto)
338 {
339 struct tcp_seg_sent *seg = NULL;
340 uint32_t reordering_window = tcp_rack_reordering_window(tp, 0, in_rto);
341
342 TAILQ_FOREACH(seg, &tp->t_segs_sent, tx_link) {
343 /* Mark the first unacknowledged segment as lost */
344 if (seg->start_seq == tp->snd_una) {
345 tcp_mark_seg_lost(tp, seg);
346 }
347 /*
348 * Mark any segment for which time elapsed since transmit
349 * is at least the sum of recent RTT and reordering window
350 */
351 tcp_rack_detect_segment_lost(tp, seg, reordering_window, NULL);
352 }
353 }
354
/*
 * Compute how much already-sent, not-yet-SACKed data from snd_nxt onward
 * may still be sent, capped around cwin; also advances snd_nxt past a
 * SACKed segment that covers it.
 *
 * Returns the accumulated length (may slightly exceed cwin by up to one
 * segment, since the check runs before each addition).
 */
uint32_t
tcp_rack_adjust(struct tcpcb *tp, uint32_t cwin)
{
	uint32_t max_len = 0;
	struct tcp_seg_sent *seg = NULL;

	/*
	 * We traverse RB tree (instead of time-ordered list)
	 * as it would be faster to look for a seg such that
	 * seg->start <= snd_nxt < seg->end
	 */
	RB_FOREACH(seg, tcp_seg_sent_tree_head, &tp->t_segs_sent_tree) {
		if (max_len >= cwin) {
			break;
		}
		if (seg->flags & TCP_SEGMENT_SACKED) {
			/* If snd_nxt lies inside this SACKed segment, skip past it */
			if (SEQ_LT(tp->snd_nxt, seg->end_seq) &&
			    SEQ_GEQ(tp->snd_nxt, seg->start_seq)) {
				tp->snd_nxt = seg->end_seq;
			}
			break;
		}
		/* Count segments that end beyond snd_nxt toward the sendable length */
		if (SEQ_LT(tp->snd_nxt, seg->end_seq)) {
			max_len += tcp_seg_len(seg);
		}
	}

	return max_len;
}
384
385 /* This function is only used during retransmissions. */
386 struct tcp_seg_sent *
tcp_rack_output(struct tcpcb * tp,uint32_t cwin,uint16_t * rack_seg_len)387 tcp_rack_output(struct tcpcb *tp, uint32_t cwin, uint16_t *rack_seg_len)
388 {
389 struct tcp_seg_sent *seg = NULL;
390
391 TAILQ_FOREACH(seg, &tp->t_segs_sent, tx_link) {
392 if (seg->flags & TCP_SEGMENT_SACKED) {
393 continue;
394 }
395 if (seg->flags & TCP_SEGMENT_LOST && !(seg->flags & TCP_RACK_RETRANSMITTED)) {
396 /* We don't do TSO for retransmissions and only send MSS sized segments */
397 uint16_t allowed_size = (uint16_t)min(cwin, tp->t_maxseg);
398 /*
399 * When entire segment can be retransmitted,
400 * lost segment is moved to the end of the time-ordered
401 * list in tcp_seg_sent_insert.
402 *
403 * When entire segment can't be retransmitted,
404 * we move the seg->start by amount of data
405 * retransmitted during tcp_seg_sent_insert
406 */
407 *rack_seg_len = tcp_seg_len(seg) <= allowed_size ?
408 (uint16_t)tcp_seg_len(seg) : allowed_size;
409
410 break;
411 }
412 }
413
414 return seg;
415 }
416
417 /*
418 * Check if a retransmitted segment was completed covered by received
419 * (first) DSACK block
420 */
421 void
tcp_rack_detect_reordering_dsack(struct tcpcb * tp,tcp_seq start,tcp_seq end)422 tcp_rack_detect_reordering_dsack(struct tcpcb *tp, tcp_seq start, tcp_seq end)
423 {
424 struct tcp_seg_sent *seg = NULL;
425
426 TAILQ_FOREACH(seg, &tp->t_segs_sent, tx_link) {
427 if (seg->flags & TCP_SEGMENT_RETRANSMITTED_ATLEAST_ONCE) {
428 if (SEQ_LEQ(start, seg->start_seq) && SEQ_GEQ(end, seg->end_seq)) {
429 tp->t_reordered_pkts++;
430 }
431 }
432 }
433 }
434
435 void
tcp_rack_detect_reordering_acked(struct tcpcb * tp,struct tcp_seg_sent * seg)436 tcp_rack_detect_reordering_acked(struct tcpcb *tp, struct tcp_seg_sent *seg)
437 {
438 /*
439 * A never retransmitted segment below fack was delivered.
440 * Ignore the segments that have already been sacked before
441 */
442 if (SEQ_LT(seg->end_seq, tp->snd_fack) &&
443 (seg->flags & (TCP_SEGMENT_SACKED | TCP_SEGMENT_RETRANSMITTED_ATLEAST_ONCE)) == 0) {
444 tp->t_reordered_pkts++;
445 }
446 }
447