1 /*
2 * Copyright (c) 2020-2021 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 #include "tcp_includes.h"
30
31 #define REORDERING_WINDOW_FLOOR (2) /* If min_rtt is too small, at least wait for a reordering window of 2ms */
32
33 /* RACK is implemented by following RFC 8985 */
34 void
tcp_rack_transmit_seg(struct tcpcb * tp,struct tcp_seg_sent * seg,tcp_seq start,tcp_seq end,uint32_t xmit_ts,uint8_t flags)35 tcp_rack_transmit_seg(struct tcpcb *tp, struct tcp_seg_sent *seg, tcp_seq start, tcp_seq end,
36 uint32_t xmit_ts, uint8_t flags)
37 {
38 seg->start_seq = start;
39 seg->end_seq = end;
40 seg->xmit_ts = xmit_ts;
41
42 /*
43 * Set dsack round_end at the start of a (re) transmission
44 * round to the segment with the smallest sequence sent
45 */
46 if (SEQ_LT(start, tp->rack.dsack_round_end)) {
47 tp->rack.dsack_round_end = start;
48 }
49 seg->flags |= flags;
50 if (seg->flags & TCP_RACK_RETRANSMITTED) {
51 tp->bytes_retransmitted += tcp_seg_len(seg);
52 }
53 /*
54 * Set segs_retransmitted ONLY when it is not set, otherwise new segments
55 * can clear this even if there are retransmitted segments
56 */
57 if (!tp->rack.segs_retransmitted) {
58 tp->rack.segs_retransmitted = !!(flags & TCP_RACK_RETRANSMITTED);
59 }
60 }
61
62 /* If segment (t1, seq1) was sent after segment (t2, seq2) */
63 static bool
tcp_rack_sent_after(uint32_t t1,uint32_t seq1,uint32_t t2,uint32_t seq2)64 tcp_rack_sent_after(uint32_t t1, uint32_t seq1, uint32_t t2, uint32_t seq2)
65 {
66 return (t1 > t2) || (t1 == t2 && SEQ_GT(seq1, seq2));
67 }
68
69 void
tcp_rack_update_reordering_win_persist(struct tcpcb * tp)70 tcp_rack_update_reordering_win_persist(struct tcpcb *tp)
71 {
72 if (tp->rack.reo_wnd_persist != 0) {
73 tp->rack.reo_wnd_persist--;
74 }
75 }
76
77 void
tcp_rack_bad_rexmt_restore(struct tcpcb * tp)78 tcp_rack_bad_rexmt_restore(struct tcpcb *tp)
79 {
80 /* Force RACK to re-examine losses */
81 tp->rack.advanced = 1;
82
83 /* Restore reordering window persist value */
84 tp->rack.reo_wnd_persist = MIN(tp->rack.reo_wnd_persist + 1,
85 TCP_RACK_RECOVERY_PERSIST_MAX);
86 }
87
88 void
tcp_rack_reset_segs_retransmitted(struct tcpcb * tp)89 tcp_rack_reset_segs_retransmitted(struct tcpcb *tp)
90 {
91 tp->rack.segs_retransmitted = false;
92 }
93
94 /* MUST be called before we have processed dup ACKs and made a decision to enter recovery */
95 static uint32_t
tcp_rack_reordering_window(struct tcpcb * tp,uint32_t dup_acks,bool in_rto)96 tcp_rack_reordering_window(struct tcpcb *tp, uint32_t dup_acks, bool in_rto)
97 {
98 if (tp->t_reordered_pkts == 0) {
99 /*
100 * When no reordering has been observed, the RACK.reo_wnd is set
101 * to 0 both during fast and RTO recovery. OR if we are entering
102 * fast recovery due to SACKed segments/dup ACKs >= DupThresh.
103 * When reo_wnd is set to 0, loss is detected if RACK.RTT time has
104 * elapsed since packet was sent.
105 */
106 if (IN_FASTRECOVERY(tp) || dup_acks >= tp->t_rexmtthresh || in_rto) {
107 return 0;
108 }
109 }
110 /*
111 * reordering window = N * Min_RTT/4,
112 * limited to a max value of SRTT.
113 */
114 uint32_t reordering_window = MIN((tp->rack.reo_wnd_multi * (get_base_rtt(tp) >> 2)),
115 (uint32_t)(tp->t_srtt >> TCP_RTT_SHIFT));
116 reordering_window = MAX(reordering_window, REORDERING_WINDOW_FLOOR);
117
118 return reordering_window;
119 }
120
121 static uint32_t
static uint32_t
tcp_rack_detect_segment_lost(struct tcpcb *tp, struct tcp_seg_sent *seg,
    uint32_t reordering_window, bool *loss_detected)
{
	/*
	 * Nothing to do for segments already marked lost that still await
	 * retransmission, nor for segments the receiver has SACKed.
	 */
	bool lost_awaiting_rexmt = (seg->flags & TCP_SEGMENT_LOST) &&
	    !(seg->flags & TCP_RACK_RETRANSMITTED);
	if (lost_awaiting_rexmt || (seg->flags & TCP_SEGMENT_SACKED)) {
		return 0;
	}

	/* A segment is lost once (RTT + reordering window) has elapsed since it was sent */
	uint32_t deadline = seg->xmit_ts + tp->rack.rtt + reordering_window;
	if (TSTMP_LT(tcp_now, deadline)) {
		/* Not yet: tell the caller how much longer to wait */
		return deadline - tcp_now;
	}

	tcp_mark_seg_lost(tp, seg);
	if (loss_detected != NULL) {
		*loss_detected = true;
	}
	return 0;
}
150
151 /*
152 * RFC 8985,
153 * Step 1: Update RACK.min_RTT (done in tcp_input)
154 * Step 2: Update the state for the most recently sent segment that has been delivered.
155 */
156 void
tcp_rack_update_segment_acked(struct tcpcb * tp,uint32_t tsecr,uint32_t xmit_ts,uint32_t end_seq,bool retransmitted)157 tcp_rack_update_segment_acked(struct tcpcb *tp, uint32_t tsecr,
158 uint32_t xmit_ts, uint32_t end_seq,
159 bool retransmitted)
160 {
161 /*
162 * Step 1: Update RACK.min_RTT - is done in tcp_input ACK processing
163 */
164 uint32_t rtt = tcp_now - xmit_ts;
165 if (rtt == 0) {
166 /*
167 * As rtt has millisecond precision,
168 * make adjustment for sub ms RTT
169 */
170 rtt = 1;
171 }
172 /*
173 * RFC 8985 - An ACK can acknowledge retransmitted data and because retransmissions
174 * can be spurious, ignore ACKs for such retransmitted segments.
175 * Ignore a segment if any of its sequence range has been retransmitted
176 * before and if either of two conditions is true:
177 * 1. The TSecr of the ACK's timestamp option (if available) indicates the ACK was not
178 * acknowledging the last (re)transmission OR tsecr was invalid (greater than tcp_now)
179 * 2. If TSecr is not available or ACK arrives immediately after last retransmission,
180 * check if the segment was last (re)transmitted less than RACK.min_rtt ago.
181 */
182 if (retransmitted) {
183 if ((tsecr != 0 && (TSTMP_LT(tsecr, xmit_ts) || TSTMP_GT(tsecr, tcp_now)))
184 || rtt < get_base_rtt(tp)) {
185 // Spurious inference
186 TCP_LOG(tp, "Spurious inference as either "
187 "tsecr (%u) doesn't lie between xmit_ts(%u) and now (%u) OR "
188 "the rtt (%u) is less than base-rtt (%u). end_seq is:%u",
189 tsecr, xmit_ts, tcp_now, rtt, get_base_rtt(tp), end_seq);
190 return;
191 }
192 }
193
194 if (tcp_rack_sent_after(xmit_ts, end_seq, tp->rack.xmit_ts, tp->rack.end_seq)) {
195 tp->rack.advanced = 1;
196 tp->rack.xmit_ts = xmit_ts;
197 tp->rack.end_seq = end_seq;
198 tp->rack.rtt = rtt;
199
200 /* Cancel the RACK reordering timer as we have received a new ACK */
201 tp->t_timer[TCPT_REORDER] = 0;
202 }
203 }
204
205 /*
206 * Step 3: Reordering detection is done in tcp_sack_detect_reordering
207 * Step 4: Update the RACK reordering window.
208 */
209 void
void
tcp_rack_update_reordering_window(struct tcpcb *tp, tcp_seq th_ack)
{
	/*
	 * RACK.reo_wnd starts at RACK.min_RTT/4 and adapts to higher degrees
	 * of reordering using DSACK feedback from the receiver.  To ride out
	 * temporary reordering, the inflated window persists for up to
	 * TCP_RACK_RECOVERY_PERSIST_MAX loss recoveries without a DSACK,
	 * after which the multiplier falls back to its starting value.
	 */

	/*
	 * A DSACK arriving before a full RTT has passed (th_ack still below
	 * the previous round's end) is ignored for window adaptation.
	 */
	if (SEQ_LT(th_ack, tp->rack.dsack_round_end)) {
		tp->rack.dsack_round_seen = 0;
	}

	if (tp->rack.dsack_round_seen == 1) {
		/*
		 * First DSACK of a new round: grow the reordering window by one
		 * multiplier step (at most once per round) and refresh the
		 * persist budget, since reordering is evidently still happening.
		 */
		tp->rack.dsack_round_seen = 0;
		tp->rack.dsack_round_end = tp->snd_nxt;
		if (tp->rack.reo_wnd_multi < 0xFF) {
			tp->rack.reo_wnd_multi++;
		}
		tp->rack.reo_wnd_persist = TCP_RACK_RECOVERY_PERSIST_MAX;
	} else if (tp->rack.reo_wnd_persist == 0) {
		/* Persist budget exhausted: reset to the default multiplier */
		tp->rack.reo_wnd_multi = 1;
	}
}
243
244 /*
245 * Step 5: Detect losses
246 * Call this only after S/ACK has been processed, so that s/acked segments
247 * are either removed or marked accordingly
248 */
249 static uint32_t
static uint32_t
tcp_rack_detect_loss(struct tcpcb *tp, uint32_t dup_acks, bool *loss_detected)
{
	uint32_t reo_wnd = tcp_rack_reordering_window(tp, dup_acks, false);
	uint32_t reordering_timeout = 0;
	struct tcp_seg_sent *seg = NULL;

	TAILQ_FOREACH(seg, &tp->t_segs_sent, tx_link) {
		/*
		 * Stop at the first segment that was not sent before
		 * RACK.segment (the most recently sent delivered segment):
		 * nothing at or beyond it has been acknowledged yet, so it
		 * cannot be declared lost.
		 */
		if (!tcp_rack_sent_after(tp->rack.xmit_ts, tp->rack.end_seq,
		    seg->xmit_ts, seg->end_seq)) {
			break;
		}

		uint32_t remaining = tcp_rack_detect_segment_lost(tp, seg, reo_wnd, loss_detected);
		/*
		 * Arm the timer only for the longest remaining wait (the most
		 * recently sent packet): normally ACKs drive RACK processing,
		 * and the timer matters only if ACKs stop arriving entirely.
		 */
		if (remaining > reordering_timeout) {
			reordering_timeout = remaining;
		}
	}
	return reordering_timeout;
}
279
280 /*
281 * Call during input processing to detect loss.
282 * If loss is detected, enter_fr will be true and
283 * tcp_input will enter fast recovery
284 */
285 bool
tcp_rack_detect_loss_and_arm_timer(struct tcpcb * tp,uint32_t dup_acks)286 tcp_rack_detect_loss_and_arm_timer(struct tcpcb *tp, uint32_t dup_acks)
287 {
288 uint32_t reordering_timeout = 0;
289 bool loss_detected = false;
290
291 if (!tp->rack.advanced) {
292 return false;
293 }
294
295 /* Cancel any existing RACK reordering timer as we are going to re-fire it if needed */
296 tp->t_timer[TCPT_REORDER] = 0;
297
298 reordering_timeout = tcp_rack_detect_loss(tp, dup_acks, &loss_detected);
299 if (reordering_timeout) {
300 tp->t_timer[TCPT_REORDER] = OFFSET_FROM_START(tp,
301 reordering_timeout + REORDERING_WINDOW_FLOOR);
302 /* Since losses can be marked at future point, clear the TLP timer */
303 tp->t_timer[TCPT_PTO] = 0;
304 } else {
305 /* Cancel any pending timers */
306 tp->t_timer[TCPT_REORDER] = 0;
307 }
308
309 return loss_detected;
310 }
311
312 /* Reordering timeout has expired, detect loss and enter recovery */
313 void
void
tcp_rack_reordering_timeout(struct tcpcb *tp, uint32_t dup_acks)
{
	bool enter_fr = false;

	tcp_rack_detect_loss(tp, dup_acks, &enter_fr);
	if (!enter_fr) {
		return;
	}

	/* Some packets were newly marked lost: enter (or stay in) fast recovery */
	if (!IN_FASTRECOVERY(tp)) {
		tcp_rexmt_save_state(tp);
		tcp_enter_fast_recovery(tp);
	}
	tcpstat.tcps_rack_reordering_timeout_recovery_episode++;
	tp->t_rack_reo_timeout_recovery_episode++;
	tcp_output(tp);
}
331
332 void
void
tcp_rack_loss_on_rto(struct tcpcb *tp, bool in_rto)
{
	uint32_t reo_wnd = tcp_rack_reordering_window(tp, 0, in_rto);
	struct tcp_seg_sent *seg = NULL;

	TAILQ_FOREACH(seg, &tp->t_segs_sent, tx_link) {
		/* The first unacknowledged segment is always treated as lost */
		if (seg->start_seq == tp->snd_una) {
			tcp_mark_seg_lost(tp, seg);
		}
		/*
		 * Also mark any segment whose (recent RTT + reordering window)
		 * wait has already elapsed since it was transmitted.
		 */
		tcp_rack_detect_segment_lost(tp, seg, reo_wnd, NULL);
	}
}
350
351 uint32_t
uint32_t
tcp_rack_adjust(struct tcpcb *tp, uint32_t cwin)
{
	struct tcp_seg_sent *seg = NULL;
	uint32_t max_len = 0;

	/*
	 * Walk the sent segments in sequence order via the RB tree (rather
	 * than the time-ordered list), which makes it quicker to locate a
	 * segment with seg->start <= snd_nxt < seg->end.
	 */
	RB_FOREACH(seg, tcp_seg_sent_tree_head, &tp->t_segs_sent_tree) {
		if (max_len >= cwin) {
			break;
		}
		if (seg->flags & TCP_SEGMENT_SACKED) {
			/* Hop snd_nxt over data the receiver already holds */
			if (SEQ_GEQ(tp->snd_nxt, seg->start_seq) &&
			    SEQ_LT(tp->snd_nxt, seg->end_seq)) {
				tp->snd_nxt = seg->end_seq;
			}
		} else if (SEQ_LT(tp->snd_nxt, seg->end_seq)) {
			/* Unsacked data at or beyond snd_nxt counts toward the budget */
			max_len += tcp_seg_len(seg);
		}
	}

	return max_len;
}
380
381 /* This function is only used during retransmissions. */
382 struct tcp_seg_sent *
struct tcp_seg_sent *
tcp_rack_output(struct tcpcb *tp, uint32_t cwin, uint16_t *rack_seg_len)
{
	struct tcp_seg_sent *seg = NULL;

	TAILQ_FOREACH(seg, &tp->t_segs_sent, tx_link) {
		if (seg->flags & TCP_SEGMENT_SACKED) {
			continue;
		}
		/* Only segments marked lost and not yet retransmitted qualify */
		if (!(seg->flags & TCP_SEGMENT_LOST) ||
		    (seg->flags & TCP_RACK_RETRANSMITTED)) {
			continue;
		}

		/* No TSO for retransmissions: send at most one MSS (and at most cwin) */
		uint16_t allowed_size = (uint16_t)min(cwin, tp->t_maxseg);
		uint32_t seg_len = tcp_seg_len(seg);
		/*
		 * When the entire segment can be retransmitted, the lost segment
		 * is moved to the end of the time-ordered list in
		 * tcp_seg_sent_insert.  When only part fits, seg->start is
		 * advanced there by the amount of data retransmitted.
		 */
		*rack_seg_len = (seg_len <= allowed_size) ?
		    (uint16_t)seg_len : allowed_size;
		break;
	}

	return seg;
}
412
/*
 * Check if a retransmitted segment was completely covered by the received
 * (first) DSACK block
 */
417 void
void
tcp_rack_detect_reordering_dsack(struct tcpcb *tp, tcp_seq start, tcp_seq end)
{
	struct tcp_seg_sent *seg = NULL;

	TAILQ_FOREACH(seg, &tp->t_segs_sent, tx_link) {
		/*
		 * A DSACK block that fully covers a segment we retransmitted at
		 * least once is evidence of reordering: count it.
		 */
		if ((seg->flags & TCP_SEGMENT_RETRANSMITTED_ATLEAST_ONCE) &&
		    SEQ_LEQ(start, seg->start_seq) &&
		    SEQ_GEQ(end, seg->end_seq)) {
			tp->t_reordered_pkts++;
		}
	}
}
430
431 void
tcp_rack_detect_reordering_acked(struct tcpcb * tp,struct tcp_seg_sent * seg)432 tcp_rack_detect_reordering_acked(struct tcpcb *tp, struct tcp_seg_sent *seg)
433 {
434 /*
435 * A never retransmitted segment below fack was delivered.
436 * Ignore the segments that have already been sacked before
437 */
438 if (SEQ_LT(seg->end_seq, tp->snd_fack) &&
439 (seg->flags & (TCP_SEGMENT_SACKED | TCP_SEGMENT_RETRANSMITTED_ATLEAST_ONCE)) == 0) {
440 tp->t_reordered_pkts++;
441 }
442 }
443