/*
 * Copyright (c) 2017-2020 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <skywalk/os_skywalk_private.h>
#include <skywalk/nexus/flowswitch/fsw_var.h>
#include <skywalk/nexus/flowswitch/flow/flow_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/udp.h>
#include <netinet/in_stat.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <sys/kdebug.h>

/* min/max linger time (in seconds) */
#define FLOWTRACK_LINGER_MIN	1
#define FLOWTRACK_LINGER_MAX	120

/* maximum allowed rate of SYNs per second */
#define FLOWTRACK_SYN_RATE	20

static int flow_track_tcp(struct flow_entry *, struct flow_track *,
    struct flow_track *, struct __kern_packet *, bool);
static int flow_track_udp(struct flow_entry *, struct flow_track *,
    struct flow_track *, struct __kern_packet *, bool);

static void
flow_track_tcp_get_wscale(struct flow_track *s, struct __kern_packet *pkt)
{
	const uint8_t *hdr = __unsafe_forge_bidi_indexable(uint8_t *,
	    pkt->pkt_flow_tcp_hdr, pkt->pkt_flow_tcp_hlen);
	int hlen = pkt->pkt_flow_tcp_hlen;
	uint8_t optlen, wscale = 0;
	const uint8_t *opt;

	_CASSERT(sizeof(s->fse_flags) == sizeof(uint16_t));
	ASSERT(hlen >= (int)sizeof(struct tcphdr));

	opt = hdr + sizeof(struct tcphdr);
	hlen -= sizeof(struct tcphdr);
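	/*
	 * Walk the TCP options: EOL and NOP are single octets, every other
	 * option carries a length octet (clamped below to 2 so the loop
	 * always makes forward progress on malformed options).
	 */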
	while (hlen >= 3) {
		switch (*opt) {
		case TCPOPT_EOL:
		case TCPOPT_NOP:
			++opt;
			--hlen;
			break;
		case TCPOPT_WINDOW:
			wscale = opt[2];
			if (wscale > TCP_MAX_WINSHIFT) {
				wscale = TCP_MAX_WINSHIFT;
			}
			os_atomic_or(&s->fse_flags, FLOWSTATEF_WSCALE, relaxed);
			OS_FALLTHROUGH;
		default:
			optlen = opt[1];
			if (optlen < 2) {
				optlen = 2;
			}
			hlen -= optlen;
			opt += optlen;
			break;
		}
	}
	s->fse_wscale = wscale;
}

static void
flow_track_tcp_init(struct flow_entry *fe, struct flow_track *src,
    struct flow_track *dst, struct __kern_packet *pkt)
{
#pragma unused(dst)
	const uint8_t tcp_flags = pkt->pkt_flow_tcp_flags;

	/*
	 * Source state initialization.
	 */
	src->fse_state = TCPS_SYN_SENT;
	src->fse_seqlo = ntohl(pkt->pkt_flow_tcp_seq);
	src->fse_seqhi = (src->fse_seqlo + pkt->pkt_flow_ulen + 1);
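	/* SYN and FIN each consume one unit of sequence space */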
	if (tcp_flags & TH_SYN) {
		src->fse_seqhi++;
		flow_track_tcp_get_wscale(src, pkt);
	}
	if (tcp_flags & TH_FIN) {
		src->fse_seqhi++;
	}

	src->fse_max_win = MAX(ntohs(pkt->pkt_flow_tcp_win), 1);
	if (src->fse_flags & FLOWSTATEF_WSCALE) {
		/* remove scale factor from initial window */
		int win = src->fse_max_win;
		ASSERT(src->fse_wscale <= TCP_MAX_WINSHIFT);
		win += (1 << src->fse_wscale);
		src->fse_max_win = (uint16_t)((win - 1) >> src->fse_wscale);
	}

	/*
	 * Destination state initialization.
	 */
	dst->fse_state = TCPS_CLOSED;
	dst->fse_seqhi = 1;
	dst->fse_max_win = 1;

	/*
	 * Linger time (in seconds).
	 */
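	/* 2*MSL, converted from TCP_RETRANSHZ ticks to seconds, then clamped */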
	fe->fe_linger_wait = (2 * tcp_msl) / TCP_RETRANSHZ;
	if (fe->fe_linger_wait < FLOWTRACK_LINGER_MIN) {
		fe->fe_linger_wait = FLOWTRACK_LINGER_MIN;
	} else if (fe->fe_linger_wait > FLOWTRACK_LINGER_MAX) {
		fe->fe_linger_wait = FLOWTRACK_LINGER_MAX;
	}

	os_atomic_or(&fe->fe_flags, FLOWENTF_INITED, relaxed);
}

/*
 * The TCP ACK RTT tracking is a coarse-grained measurement of the time it
 * takes for an endpoint to process an incoming segment and generate an ACK,
 * as seen at the point of observation. For the flowswitch, this means:
 *
 *     local end RTT  = local stack processing time
 *     remote end RTT = driver + network + remote endpoint's processing time
 *
 * Since the measurement is lightweight and sampling-based, it does not learn
 * about or distinguish ACKs of lost segments. So we could occasionally get a
 * large RTT sample from an ACK to a retransmitted segment. Thus rtt_max is
 * not particularly meaningful to us.
 */
__attribute__((always_inline))
static inline void
flow_track_tcp_rtt(struct flow_entry *fe, boolean_t input,
    struct flow_track *src, struct flow_track *dst, uint8_t tcp_flags,
    uint32_t seq, uint32_t ack, uint32_t ulen)
{
#pragma unused(fe, input) /* KDBG defined as noop in release build */
	uint64_t dst_last, src_last;
	uint64_t now, time_diff;
	uint32_t curval, oldval;
	clock_sec_t tv_sec;
	clock_usec_t tv_usec;

	src_last = src->fse_rtt.frtt_last;
	dst_last = dst->fse_rtt.frtt_last;

	/* start a new RTT tracking session under sampling rate limit */
	if (dst_last == 0 ||
	    _net_uptime - dst_last > FLOWTRACK_RTT_SAMPLE_INTERVAL) {
		if (ulen > 0 &&
		    dst->fse_rtt.frtt_timestamp == 0) {
			dst->fse_rtt.frtt_timestamp = mach_absolute_time();
			dst->fse_rtt.frtt_last = _net_uptime;
			dst->fse_rtt.frtt_seg_begin = seq;
			dst->fse_rtt.frtt_seg_end = seq + ulen;
			KDBG((SK_KTRACE_FSW_FLOW_TRACK_RTT | DBG_FUNC_START),
			    SK_KVA(fe), fe->fe_pid, ntohs(fe->fe_key.fk_sport),
			    input ? 1 : 0);
		}
	}

	/* we have an ACK, see if current tracking session matches it */
	if (tcp_flags & TH_ACK) {
		if (src->fse_rtt.frtt_timestamp != 0 &&
		    src->fse_rtt.frtt_seg_begin <= ack) {
			now = mach_absolute_time();
			time_diff = now - src->fse_rtt.frtt_timestamp;

			absolutetime_to_microtime(time_diff, &tv_sec, &tv_usec);
			curval = (uint32_t)(tv_usec + tv_sec * 1000 * 1000);
			oldval = src->fse_rtt.frtt_usec;
			if (oldval == 0) {
				src->fse_rtt.frtt_usec = curval;
			} else {
				/* same EWMA decay as TCP RTT */
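				/* i.e. frtt_usec = (15 * frtt_usec + curval) / 16 */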
				src->fse_rtt.frtt_usec =
				    ((oldval << 4) - oldval + curval) >> 4;
			}

			/* reset RTT tracking session */
			src->fse_rtt.frtt_timestamp = 0;
			src->fse_rtt.frtt_last = 0;
			KDBG((SK_KTRACE_FSW_FLOW_TRACK_RTT | DBG_FUNC_END),
			    SK_KVA(fe), fe->fe_pid, ntohs(fe->fe_key.fk_sport),
			    input ? 0 : 1);

			/* publish rtt stats into flow_stats object */
			/* just store both to avoid branch prediction etc. */
			fe->fe_stats->fs_lrtt = fe->fe_ltrack.fse_rtt_usec;
			fe->fe_stats->fs_rrtt = fe->fe_rtrack.fse_rtt_usec;
		}
	}
}

/*
 * The TCP connection tracking logic is based on Guido van Rooij's paper:
 * http://www.sane.nl/events/sane2000/papers/rooij.pdf
 *
 * In some ways, we act as a middlebox that passively tracks the TCP windows
 * of each connection on flows marked with FLOWENTF_TRACK. We never modify
 * the packet or generate any response (e.g. RST) to the sender; thus we are
 * simply a silent observer. The information we gather here is used later
 * if we need to generate a valid {FIN|RST} segment when the flow is nonviable.
 *
 * The implementation is borrowed from Packet Filter, and is further
 * simplified to cater for our use cases.
 */
#define FTF_HALFCLOSED	0x1	/* want flow to be marked as half closed */
#define FTF_WAITCLOSE	0x2	/* want flow to linger after close */
#define FTF_CLOSENOTIFY	0x4	/* want to notify NECP upon torn down */
#define FTF_WITHDRAWN	0x8	/* want flow to be torn down */
#define FTF_SYN_RLIM	0x10	/* want flow to rate limit SYN */
#define FTF_RST_RLIM	0x20	/* want flow to rate limit RST */
__attribute__((always_inline))
static inline int
flow_track_tcp(struct flow_entry *fe, struct flow_track *src,
    struct flow_track *dst, struct __kern_packet *pkt, bool input)
{
	const uint8_t tcp_flags = pkt->pkt_flow_tcp_flags;
	uint16_t win = ntohs(pkt->pkt_flow_tcp_win);
	uint32_t ack, end, seq, orig_seq;
	uint32_t ftflags = 0;
	uint8_t sws, dws;
	int ackskew, err = 0;

	if (__improbable((fe->fe_flags & FLOWENTF_INITED) == 0)) {
		flow_track_tcp_init(fe, src, dst, pkt);
	}

	flow_track_tcp_rtt(fe, input, src, dst, tcp_flags,
	    ntohl(pkt->pkt_flow_tcp_seq), ntohl(pkt->pkt_flow_tcp_ack),
	    pkt->pkt_flow_ulen);

	if (__improbable(dst->fse_state >= TCPS_FIN_WAIT_2 &&
	    src->fse_state >= TCPS_FIN_WAIT_2)) {
		if ((tcp_flags & (TH_SYN | TH_ACK)) == TH_SYN) {
			src->fse_state = dst->fse_state = TCPS_CLOSED;
			ftflags |= FTF_SYN_RLIM;
		}
		if (tcp_flags & TH_RST) {
			ftflags |= FTF_RST_RLIM;
		}
		if (input) {
			err = ENETRESET;
		}
		goto done;
	}

	if (__probable((tcp_flags & TH_SYN) == 0 &&
	    src->fse_wscale != 0 && dst->fse_wscale != 0)) {
		sws = src->fse_wscale;
		dws = dst->fse_wscale;
	} else {
		sws = dws = 0;
	}

	orig_seq = seq = ntohl(pkt->pkt_flow_tcp_seq);
	if (__probable(src->fse_seqlo != 0)) {
		ack = ntohl(pkt->pkt_flow_tcp_ack);
		end = seq + pkt->pkt_flow_ulen;
		if (tcp_flags & TH_SYN) {
			if ((tcp_flags & (TH_SYN | TH_ACK)) == TH_SYN) {
				ftflags |= FTF_SYN_RLIM;
			}
			end++;
		}
		if (tcp_flags & TH_FIN) {
			end++;
		}
		if (tcp_flags & TH_RST) {
			ftflags |= FTF_RST_RLIM;
		}
	} else {
		/* first packet from this end; set its state */
		ack = ntohl(pkt->pkt_flow_tcp_ack);

		/* We saw the first SYN, but the stack does not reply with a SYN */
		if (dst->fse_state == TCPS_SYN_SENT && ((tcp_flags & TH_SYN) == 0)) {
			/* Act as if no sequence number is set */
			seq = 0;
			/* Pretend the outgoing SYN was not ACK'ed */
			ack = dst->fse_seqlo;
		}

		end = seq + pkt->pkt_flow_ulen;
		if (tcp_flags & TH_SYN) {
			if ((tcp_flags & (TH_SYN | TH_ACK)) == TH_SYN) {
				ftflags |= FTF_SYN_RLIM;
			}
			end++;
			if (dst->fse_flags & FLOWSTATEF_WSCALE) {
				flow_track_tcp_get_wscale(src, pkt);
				if (src->fse_flags & FLOWSTATEF_WSCALE) {
					/*
					 * Remove scale factor from
					 * initial window.
					 */
					sws = src->fse_wscale;
					win = (uint16_t)(((u_int32_t)win + (1 << sws) - 1)
					    >> sws);
					dws = dst->fse_wscale;
				} else {
					/* fixup other window */
					dst->fse_max_win = (uint16_t)(dst->fse_max_win << dst->fse_wscale);
					/* in case of a retrans SYN|ACK */
					dst->fse_wscale = 0;
				}
			}
		}
		if (tcp_flags & TH_FIN) {
			end++;
		}
		if (tcp_flags & TH_RST) {
			ftflags |= FTF_RST_RLIM;
		}

		src->fse_seqlo = seq;
		if (src->fse_state < TCPS_SYN_SENT) {
			if (tcp_flags & TH_SYN) {
				src->fse_state = TCPS_SYN_SENT;
			} else {
				/* Picking up the connection in the middle */
				src->fse_state = TCPS_ESTABLISHED;
			}
		}

		/*
		 * May need to slide the window (seqhi may have been set by
		 * the crappy stack check or if we picked up the connection
		 * after establishment).
		 */
		if (src->fse_seqhi == 1 || SEQ_GEQ(end +
		    MAX(1, dst->fse_max_win << dws), src->fse_seqhi)) {
			src->fse_seqhi = end + MAX(1, dst->fse_max_win << dws);
		}
		if (win > src->fse_max_win) {
			src->fse_max_win = win;
		}
	}

	if (!(tcp_flags & TH_ACK)) {
		/* let it pass through the ack skew check */
		ack = dst->fse_seqlo;
	} else if ((ack == 0 &&
	    (tcp_flags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) ||
	    /* broken tcp stacks do not set ack */
	    (dst->fse_state < TCPS_SYN_SENT)) {
		/*
		 * Many stacks (ours included) will set the ACK number in an
		 * FIN|ACK if the SYN times out -- no sequence to ACK.
		 */
		ack = dst->fse_seqlo;
	}

	if (seq == end) {
		/* ease sequencing restrictions on no data packets */
		seq = src->fse_seqlo;
		end = seq;
	}

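	/*
	 * ackskew > 0 means the ACK trails what we have seen the other end
	 * send (dst->fse_seqlo); ackskew < 0 means it acknowledges data we
	 * have not yet observed from that end.
	 */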
	ackskew = dst->fse_seqlo - ack;

#define MAXACKWINDOW (0xffff + 1500)	/* 1500 is an arbitrary fudge factor */
	if (SEQ_GEQ(src->fse_seqhi, end) &&
	    /* last octet inside other's window space */
	    SEQ_GEQ(seq, src->fse_seqlo - (dst->fse_max_win << dws)) &&
	    /* retrans: not more than one window back */
	    (ackskew >= -MAXACKWINDOW) &&
	    /* acking not more than one reassembled fragment backwards */
	    (ackskew <= (MAXACKWINDOW << sws)) &&
	    /* acking not more than one window forward */
	    (!(tcp_flags & TH_RST) || orig_seq == src->fse_seqlo ||
	    (orig_seq == src->fse_seqlo + 1) ||
	    (orig_seq + 1 == src->fse_seqlo))) {
		/* require an exact/+1 sequence match on resets when possible */

		/* update max window */
		if (src->fse_max_win < win) {
			src->fse_max_win = win;
		}
		/* synchronize sequencing */
		if (SEQ_GT(end, src->fse_seqlo)) {
			src->fse_seqlo = end;
		}
		/* slide the window of what the other end can send */
		if (SEQ_GEQ(ack + (win << sws), dst->fse_seqhi)) {
			dst->fse_seqhi = ack + MAX((win << sws), 1);
		}

		/* update states */
		if (tcp_flags & TH_SYN) {
			if (src->fse_state < TCPS_SYN_SENT) {
				src->fse_state = TCPS_SYN_SENT;
			}
		}
		if (tcp_flags & TH_FIN) {
			if (src->fse_state < TCPS_CLOSING) {
				src->fse_seqlast = orig_seq + pkt->pkt_flow_ulen;
				src->fse_state = TCPS_CLOSING;
			}
		}
		if (tcp_flags & TH_ACK) {
			/*
			 * Avoid transitioning to ESTABLISHED when our SYN
			 * is ACK'd along with a RST. The sending TCP may
			 * still retransmit the SYN (after dropping some
			 * options like ECN, etc.)
			 */
			if (dst->fse_state == TCPS_SYN_SENT &&
			    !(tcp_flags & TH_RST)) {
				dst->fse_state = TCPS_ESTABLISHED;
				ftflags |= (FTF_WAITCLOSE | FTF_CLOSENOTIFY);
			} else if (dst->fse_state == TCPS_CLOSING &&
			    ack == dst->fse_seqlast + 1) {
				dst->fse_state = TCPS_FIN_WAIT_2;
				ftflags |= FTF_WAITCLOSE;
				if (src->fse_state >= TCPS_FIN_WAIT_2) {
					ftflags |= FTF_WITHDRAWN;
				} else {
					ftflags |= FTF_HALFCLOSED;
				}
			}
		}
		if ((tcp_flags & TH_RST) &&
		    (src->fse_state == TCPS_ESTABLISHED ||
		    dst->fse_state == TCPS_ESTABLISHED)) {
			/*
			 * If either endpoint is in ESTABLISHED, transition
			 * both to TIME_WAIT. Otherwise, keep the existing
			 * state as is, e.g. SYN_SENT.
			 */
			src->fse_state = dst->fse_state = TCPS_TIME_WAIT;
			ftflags |= (FTF_WITHDRAWN | FTF_WAITCLOSE);
		}
	} else if ((dst->fse_state < TCPS_SYN_SENT ||
	    dst->fse_state >= TCPS_FIN_WAIT_2 ||
	    src->fse_state >= TCPS_FIN_WAIT_2) &&
	    SEQ_GEQ(src->fse_seqhi + MAXACKWINDOW, end) &&
	    /* within a window forward of the originating packet */
	    SEQ_GEQ(seq, src->fse_seqlo - MAXACKWINDOW)) {
		/* within a window backward of the originating packet */

		/* BEGIN CSTYLED */
		/*
		 * This currently handles three situations:
		 *  1) Stupid stacks will shotgun SYNs before their peer
		 *     replies.
		 *  2) When flow tracking catches an already established
		 *     stream (the flow states are cleared, etc.)
		 *  3) Packets get funky immediately after the connection
		 *     closes (this should catch spurious ACK|FINs that
		 *     web servers like to spew after a close).
		 *
		 * This must be a little more careful than the above code
		 * since packet floods will also be caught here.
		 */
		/* END CSTYLED */

		/* update max window */
		if (src->fse_max_win < win) {
			src->fse_max_win = win;
		}
		/* synchronize sequencing */
		if (SEQ_GT(end, src->fse_seqlo)) {
			src->fse_seqlo = end;
		}
		/* slide the window of what the other end can send */
		if (SEQ_GEQ(ack + (win << sws), dst->fse_seqhi)) {
			dst->fse_seqhi = ack + MAX((win << sws), 1);
		}

		/*
		 * Cannot set dst->fse_seqhi here since this could be a
		 * shotgunned SYN and not an already established connection.
		 */

		if (tcp_flags & TH_FIN) {
			if (src->fse_state < TCPS_CLOSING) {
				src->fse_seqlast = orig_seq + pkt->pkt_flow_ulen;
				src->fse_state = TCPS_CLOSING;
			}
		}
		if (tcp_flags & TH_RST) {
			src->fse_state = dst->fse_state = TCPS_TIME_WAIT;
			ftflags |= FTF_WAITCLOSE;
		}
	} else {
		if (dst->fse_state == TCPS_SYN_SENT &&
		    src->fse_state == TCPS_SYN_SENT) {
			src->fse_seqlo = 0;
			src->fse_seqhi = 1;
			src->fse_max_win = 1;
		}
	}

done:
	if (__improbable((ftflags & FTF_HALFCLOSED) != 0)) {
		os_atomic_or(&fe->fe_flags, FLOWENTF_HALF_CLOSED, relaxed);
		ftflags &= ~FTF_HALFCLOSED;
	}

	/*
	 * Hold on to namespace for a while after the flow is closed.
	 */
	if (__improbable((ftflags & FTF_WAITCLOSE) != 0 &&
	    (fe->fe_flags & FLOWENTF_WAIT_CLOSE) == 0)) {
		os_atomic_or(&fe->fe_flags, FLOWENTF_WAIT_CLOSE, relaxed);
		ftflags &= ~FTF_WAITCLOSE;
	}

	/*
	 * Notify NECP upon tear down (for established flows).
	 */
	if (__improbable((ftflags & FTF_CLOSENOTIFY) != 0 &&
	    (fe->fe_flags & FLOWENTF_CLOSE_NOTIFY) == 0)) {
		os_atomic_or(&fe->fe_flags, FLOWENTF_CLOSE_NOTIFY, relaxed);
		ftflags &= ~FTF_CLOSENOTIFY;
	}

	/*
	 * Flow is withdrawn; the port we have should not be included in
	 * the list of offloaded ports, as the connection is no longer
	 * usable (we're not expecting any more data).
	 * Also clear the FLOWENTF_HALF_CLOSED flag here. It's fine if the
	 * reaper thread hasn't picked up FLOWENTF_HALF_CLOSED yet, as it
	 * will pick up FLOWENTF_WITHDRAWN and notify netns of the full
	 * withdrawal.
	 */
	if (__improbable((ftflags & FTF_WITHDRAWN) != 0)) {
		ftflags &= ~FTF_WITHDRAWN;
		if (fe->fe_flags & FLOWENTF_HALF_CLOSED) {
			os_atomic_andnot(&fe->fe_flags, FLOWENTF_HALF_CLOSED, relaxed);
		}
		fe->fe_want_withdraw = 1;
	}

	/*
	 * If no other work is needed, we're done.
	 */
	if (ftflags == 0 || input) {
		return err;
	}

	/*
	 * If we're over the rate limit for outbound SYNs, drop packet.
	 */
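	/*
	 * SYNs are counted per one-second window anchored at fse_syn_ts;
	 * exceeding FLOWTRACK_SYN_RATE within a window yields EPROTO.
	 */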
	if (__improbable((ftflags & FTF_SYN_RLIM) != 0)) {
		uint32_t now = (uint32_t)_net_uptime;
		if ((now - src->fse_syn_ts) > 1) {
			src->fse_syn_ts = now;
			src->fse_syn_cnt = 0;
		}
		if (++src->fse_syn_cnt > FLOWTRACK_SYN_RATE) {
			err = EPROTO;
		}
	}

	return err;
}
#undef FTF_WAITCLOSE
#undef FTF_CLOSENOTIFY
#undef FTF_WITHDRAWN
#undef FTF_SYN_RLIM
#undef FTF_RST_RLIM

boolean_t
flow_track_tcp_want_abort(struct flow_entry *fe)
{
	struct flow_track *src = &fe->fe_ltrack;
	struct flow_track *dst = &fe->fe_rtrack;

	if (fe->fe_key.fk_proto != IPPROTO_TCP ||
	    (fe->fe_flags & FLOWENTF_ABORTED)) {
		goto done;
	}

	/* this can be enhanced; for now rely on established state */
	if (src->fse_state == TCPS_ESTABLISHED ||
	    dst->fse_state == TCPS_ESTABLISHED) {
		src->fse_state = dst->fse_state = TCPS_TIME_WAIT;
		/* don't process more than once */
		os_atomic_or(&fe->fe_flags, FLOWENTF_ABORTED, relaxed);
		return TRUE;
	}
done:
	return FALSE;
}

static void
flow_track_udp_init(struct flow_entry *fe, struct flow_track *src,
    struct flow_track *dst, struct __kern_packet *pkt)
{
#pragma unused(pkt)
	/*
	 * Source state initialization.
	 */
	src->fse_state = FT_STATE_NO_TRAFFIC;

	/*
	 * Destination state initialization.
	 */
	dst->fse_state = FT_STATE_NO_TRAFFIC;

	os_atomic_or(&fe->fe_flags, FLOWENTF_INITED, relaxed);
}

__attribute__((always_inline))
static inline int
flow_track_udp(struct flow_entry *fe, struct flow_track *src,
    struct flow_track *dst, struct __kern_packet *pkt, bool input)
{
#pragma unused(input)
	if (__improbable((fe->fe_flags & FLOWENTF_INITED) == 0)) {
		flow_track_udp_init(fe, src, dst, pkt);
	}

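	/*
	 * The sending side moves NO_TRAFFIC -> SINGLE on its first packet;
	 * the opposite side moves SINGLE -> MULTIPLE once traffic has been
	 * observed in both directions.
	 */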
	if (__improbable(src->fse_state == FT_STATE_NO_TRAFFIC)) {
		src->fse_state = FT_STATE_SINGLE;
	}
	if (__improbable(dst->fse_state == FT_STATE_SINGLE)) {
		dst->fse_state = FT_STATE_MULTIPLE;
	}

	return 0;
}

void
flow_track_stats(struct flow_entry *fe, uint64_t bytes, uint64_t packets,
    bool active, bool in)
{
	volatile struct sk_stats_flow_track *fst;

	if (in) {
		fst = &fe->fe_stats->fs_rtrack;
	} else {
		fst = &fe->fe_stats->fs_ltrack;
	}

	fst->sft_bytes += bytes;
	fst->sft_packets += packets;

	if (__probable(active)) {
		in_stat_set_activity_bitmap(&fe->fe_stats->fs_activity,
		    _net_uptime);
	}
}

int
flow_pkt_track(struct flow_entry *fe, struct __kern_packet *pkt, bool in)
{
	struct flow_track *src, *dst;
	int ret = 0;

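	/*
	 * The flow_track FT_STATE_* values are interchangeable with both the
	 * exported SFT_STATE_* values and the TCPS_* FSM values; assert that
	 * the numeric mappings hold.
	 */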
	_CASSERT(SFT_STATE_CLOSED == FT_STATE_CLOSED);
	_CASSERT(SFT_STATE_LISTEN == FT_STATE_LISTEN);
	_CASSERT(SFT_STATE_SYN_SENT == FT_STATE_SYN_SENT);
	_CASSERT(SFT_STATE_SYN_RECEIVED == FT_STATE_SYN_RECEIVED);
	_CASSERT(SFT_STATE_ESTABLISHED == FT_STATE_ESTABLISHED);
	_CASSERT(SFT_STATE_CLOSE_WAIT == FT_STATE_CLOSE_WAIT);
	_CASSERT(SFT_STATE_FIN_WAIT_1 == FT_STATE_FIN_WAIT_1);
	_CASSERT(SFT_STATE_CLOSING == FT_STATE_CLOSING);
	_CASSERT(SFT_STATE_LAST_ACK == FT_STATE_LAST_ACK);
	_CASSERT(SFT_STATE_FIN_WAIT_2 == FT_STATE_FIN_WAIT_2);
	_CASSERT(SFT_STATE_TIME_WAIT == FT_STATE_TIME_WAIT);
	_CASSERT(SFT_STATE_NO_TRAFFIC == FT_STATE_NO_TRAFFIC);
	_CASSERT(SFT_STATE_SINGLE == FT_STATE_SINGLE);
	_CASSERT(SFT_STATE_MULTIPLE == FT_STATE_MULTIPLE);
	_CASSERT(SFT_STATE_MAX == FT_STATE_MAX);

	_CASSERT(FT_STATE_CLOSED == TCPS_CLOSED);
	_CASSERT(FT_STATE_LISTEN == TCPS_LISTEN);
	_CASSERT(FT_STATE_SYN_SENT == TCPS_SYN_SENT);
	_CASSERT(FT_STATE_SYN_RECEIVED == TCPS_SYN_RECEIVED);
	_CASSERT(FT_STATE_ESTABLISHED == TCPS_ESTABLISHED);
	_CASSERT(FT_STATE_CLOSE_WAIT == TCPS_CLOSE_WAIT);
	_CASSERT(FT_STATE_FIN_WAIT_1 == TCPS_FIN_WAIT_1);
	_CASSERT(FT_STATE_CLOSING == TCPS_CLOSING);
	_CASSERT(FT_STATE_LAST_ACK == TCPS_LAST_ACK);
	_CASSERT(FT_STATE_FIN_WAIT_2 == TCPS_FIN_WAIT_2);
	_CASSERT(FT_STATE_TIME_WAIT == TCPS_TIME_WAIT);

	ASSERT(pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED);

	if (in) {
		src = &fe->fe_rtrack;
		dst = &fe->fe_ltrack;
	} else {
		src = &fe->fe_ltrack;
		dst = &fe->fe_rtrack;
	}

	flow_track_stats(fe, (pkt->pkt_length - pkt->pkt_l2_len), 1,
	    (pkt->pkt_flow_ulen != 0), in);

	/* skip flow state tracking on non-initial fragments */
	if (pkt->pkt_flow_ip_is_frag && !pkt->pkt_flow_ip_is_first_frag) {
		return 0;
	}

	switch (pkt->pkt_flow_ip_proto) {
	case IPPROTO_TCP:
		if (__probable((fe->fe_flags & FLOWENTF_TRACK) != 0)) {
			ret = flow_track_tcp(fe, src, dst, pkt, in);
		}
		break;

	case IPPROTO_UDP:
		if (__probable((fe->fe_flags & FLOWENTF_TRACK) != 0)) {
			ret = flow_track_udp(fe, src, dst, pkt, in);
		}
		break;
	}

	return ret;
}

/*
 * @function flow_track_abort_tcp
 * @abstract send RST for a given TCP flow.
 * @param in_pkt incoming packet that triggers RST.
 * @param rst_pkt use as RST template for SEQ/ACK information.
 */
void
flow_track_abort_tcp(struct flow_entry *fe, struct __kern_packet *in_pkt,
    struct __kern_packet *rst_pkt)
{
	struct nx_flowswitch *fsw = fe->fe_fsw;
	struct flow_track *src, *dst;
	struct ip *ip;
	struct ip6_hdr *ip6;
	struct tcphdr *th;
	uint16_t len, tlen;
	struct mbuf *m;

	/* guaranteed by caller */
	ASSERT(fsw->fsw_ifp != NULL);
	ASSERT(in_pkt == NULL || rst_pkt == NULL);

	src = &fe->fe_ltrack;
	dst = &fe->fe_rtrack;

	tlen = sizeof(struct tcphdr);
	if (fe->fe_key.fk_ipver == IPVERSION) {
		len = sizeof(struct ip) + tlen;
	} else {
		ASSERT(fe->fe_key.fk_ipver == IPV6_VERSION);
		len = sizeof(struct ip6_hdr) + tlen;
	}

	m = m_gethdr(M_NOWAIT, MT_HEADER);
	if (__improbable(m == NULL)) {
		return;
	}

	m->m_pkthdr.pkt_proto = IPPROTO_TCP;
	m->m_data += max_linkhdr; /* 32-bit aligned */
	m->m_pkthdr.len = m->m_len = len;

	/* zero out for checksum */
	bzero(m_mtod_current(m), len);

	if (fe->fe_key.fk_ipver == IPVERSION) {
		ip = mtod(m, struct ip *);

		/* IP header fields included in the TCP checksum */
		ip->ip_p = IPPROTO_TCP;
		ip->ip_len = htons(tlen);
		if (rst_pkt == NULL) {
			ip->ip_src = fe->fe_key.fk_src4;
			ip->ip_dst = fe->fe_key.fk_dst4;
		} else {
			ip->ip_src = rst_pkt->pkt_flow_ipv4_src;
			ip->ip_dst = rst_pkt->pkt_flow_ipv4_dst;
		}

		th = (struct tcphdr *)(void *)((char *)ip + sizeof(*ip));
	} else {
		ip6 = mtod(m, struct ip6_hdr *);

		/* IP header fields included in the TCP checksum */
		ip6->ip6_nxt = IPPROTO_TCP;
		ip6->ip6_plen = htons(tlen);
		if (rst_pkt == NULL) {
			ip6->ip6_src = fe->fe_key.fk_src6;
			ip6->ip6_dst = fe->fe_key.fk_dst6;
		} else {
			ip6->ip6_src = rst_pkt->pkt_flow_ipv6_src;
			ip6->ip6_dst = rst_pkt->pkt_flow_ipv6_dst;
		}

		th = (struct tcphdr *)(void *)((char *)ip6 + sizeof(*ip6));
	}

	/*
	 * TCP header (fabricate a pure RST).
	 */
	if (in_pkt != NULL) {
		th->th_sport = in_pkt->pkt_flow_tcp_dst;
		th->th_dport = in_pkt->pkt_flow_tcp_src;
		if (__probable(in_pkt->pkt_flow_tcp_flags & TH_ACK)) {
			/* <SEQ=SEG.ACK><CTL=RST> */
			th->th_seq = in_pkt->pkt_flow_tcp_ack;
			th->th_ack = 0;
			th->th_flags = TH_RST;
		} else {
			/* <SEQ=0><ACK=SEG.SEQ+SEG.LEN><CTL=RST,ACK> */
			th->th_seq = 0;
			th->th_ack = in_pkt->pkt_flow_tcp_seq +
			    in_pkt->pkt_flow_ulen;
			th->th_flags = TH_RST | TH_ACK;
		}
	} else if (rst_pkt != NULL) {
		th->th_sport = rst_pkt->pkt_flow_tcp_src;
		th->th_dport = rst_pkt->pkt_flow_tcp_dst;
		th->th_seq = rst_pkt->pkt_flow_tcp_seq;
		th->th_ack = rst_pkt->pkt_flow_tcp_ack;
		th->th_flags = rst_pkt->pkt_flow_tcp_flags;
	} else {
		th->th_sport = fe->fe_key.fk_sport;
		th->th_dport = fe->fe_key.fk_dport;
		th->th_seq = htonl(src->fse_seqlo); /* peer's last ACK */
		th->th_ack = 0;
		th->th_flags = TH_RST;
	}
	th->th_off = (tlen >> 2);
	th->th_win = 0;

	FSW_STATS_INC(FSW_STATS_FLOWS_ABORTED);

	if (fe->fe_key.fk_ipver == IPVERSION) {
		struct ip_out_args ipoa;
		struct route ro;

		bzero(&ipoa, sizeof(ipoa));
		ipoa.ipoa_boundif = fsw->fsw_ifp->if_index;
		ipoa.ipoa_flags = (IPOAF_SELECT_SRCIF | IPOAF_BOUND_IF |
		    IPOAF_BOUND_SRCADDR);
		ipoa.ipoa_sotc = SO_TC_UNSPEC;
		ipoa.ipoa_netsvctype = _NET_SERVICE_TYPE_UNSPEC;

		/* TCP checksum */
		th->th_sum = in_cksum(m, len);

		ip->ip_v = IPVERSION;
		ip->ip_hl = sizeof(*ip) >> 2;
		ip->ip_tos = 0;
		/*
		 * ip_output() expects ip_len and ip_off to be in host order.
		 */
		ip->ip_len = len;
		ip->ip_off = IP_DF;
		ip->ip_ttl = (uint8_t)ip_defttl;
		ip->ip_sum = 0;

		bzero(&ro, sizeof(ro));
		(void) ip_output(m, NULL, &ro, IP_OUTARGS, NULL, &ipoa);
		ROUTE_RELEASE(&ro);
	} else {
		struct ip6_out_args ip6oa;
		struct route_in6 ro6;

		bzero(&ip6oa, sizeof(ip6oa));
		ip6oa.ip6oa_boundif = fsw->fsw_ifp->if_index;
		ip6oa.ip6oa_flags = (IP6OAF_SELECT_SRCIF | IP6OAF_BOUND_IF |
		    IP6OAF_BOUND_SRCADDR);
		ip6oa.ip6oa_sotc = SO_TC_UNSPEC;
		ip6oa.ip6oa_netsvctype = _NET_SERVICE_TYPE_UNSPEC;

		/* TCP checksum */
		th->th_sum = in6_cksum(m, IPPROTO_TCP,
		    sizeof(struct ip6_hdr), tlen);

		ip6->ip6_vfc |= IPV6_VERSION;
		ip6->ip6_hlim = IPV6_DEFHLIM;

		bzero(&ro6, sizeof(ro6));
		(void) ip6_output(m, NULL, &ro6, IPV6_OUTARGS,
		    NULL, NULL, &ip6oa);
		ROUTE_RELEASE(&ro6);
	}
}

void
flow_track_abort_quic(struct flow_entry *fe,
    uint8_t *__counted_by(QUIC_STATELESS_RESET_TOKEN_SIZE)token)
{
	struct quic_stateless_reset {
		uint8_t ssr_header[30];
		uint8_t ssr_token[QUIC_STATELESS_RESET_TOKEN_SIZE];
	};
	struct nx_flowswitch *fsw = fe->fe_fsw;
	struct ip *ip;
	struct ip6_hdr *ip6;
	struct udphdr *uh;
	struct quic_stateless_reset *qssr;
	uint16_t len, l3hlen, ulen;
	struct mbuf *__single m;
	unsigned int one = 1;
	int error;

	/* guaranteed by caller */
	ASSERT(fsw->fsw_ifp != NULL);

	/* skip zero token */
	bool is_zero_token = true;
	for (size_t i = 0; i < QUIC_STATELESS_RESET_TOKEN_SIZE; i++) {
		if (token[i] != 0) {
			is_zero_token = false;
			break;
		}
	}
	if (is_zero_token) {
		return;
	}

	ulen = sizeof(struct udphdr) + sizeof(struct quic_stateless_reset);
	if (fe->fe_key.fk_ipver == IPVERSION) {
		l3hlen = sizeof(struct ip);
	} else {
		ASSERT(fe->fe_key.fk_ipver == IPV6_VERSION);
		l3hlen = sizeof(struct ip6_hdr);
	}

	len = l3hlen + ulen;

	error = mbuf_allocpacket(MBUF_DONTWAIT, max_linkhdr + len, &one, &m);
	if (__improbable(error != 0)) {
		return;
	}
	VERIFY(m != 0);

	m->m_pkthdr.pkt_proto = IPPROTO_UDP;
	m->m_data += max_linkhdr; /* 32-bit aligned */
	m->m_pkthdr.len = m->m_len = len;

	/* zero out for checksum */
	bzero(m_mtod_current(m), len);

	if (fe->fe_key.fk_ipver == IPVERSION) {
		ip = mtod(m, struct ip *);
		ip->ip_p = IPPROTO_UDP;
		ip->ip_len = htons(ulen);
		ip->ip_src = fe->fe_key.fk_src4;
		ip->ip_dst = fe->fe_key.fk_dst4;
		uh = (struct udphdr *)(void *)((char *)ip + sizeof(*ip));
	} else {
		ip6 = mtod(m, struct ip6_hdr *);
		ip6->ip6_nxt = IPPROTO_UDP;
		ip6->ip6_plen = htons(ulen);
		ip6->ip6_src = fe->fe_key.fk_src6;
		ip6->ip6_dst = fe->fe_key.fk_dst6;
		uh = (struct udphdr *)(void *)((char *)ip6 + sizeof(*ip6));
	}

	/* UDP header */
	uh->uh_sport = fe->fe_key.fk_sport;
	uh->uh_dport = fe->fe_key.fk_dport;
	uh->uh_ulen = htons(ulen);

	/* QUIC stateless reset */
	qssr = (struct quic_stateless_reset *)(uh + 1);
	read_frandom(&qssr->ssr_header, sizeof(qssr->ssr_header));
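	/*
	 * First byte: clear the long-header bit and set the QUIC fixed bit
	 * (0x40) so the datagram parses as a short-header packet, keeping
	 * the remaining random bits.
	 */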
	qssr->ssr_header[0] = (qssr->ssr_header[0] & 0x3f) | 0x40;
	memcpy(qssr->ssr_token, token, QUIC_STATELESS_RESET_TOKEN_SIZE);

	FSW_STATS_INC(FSW_STATS_FLOWS_ABORTED);

	if (fe->fe_key.fk_ipver == IPVERSION) {
		struct ip_out_args ipoa;
		struct route ro;

		bzero(&ipoa, sizeof(ipoa));
		ipoa.ipoa_boundif = fsw->fsw_ifp->if_index;
		ipoa.ipoa_flags = (IPOAF_SELECT_SRCIF | IPOAF_BOUND_IF |
		    IPOAF_BOUND_SRCADDR);
		ipoa.ipoa_sotc = SO_TC_UNSPEC;
		ipoa.ipoa_netsvctype = _NET_SERVICE_TYPE_UNSPEC;

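		/* a computed UDP checksum of zero is sent as all ones */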
		uh->uh_sum = in_cksum(m, len);
		if (uh->uh_sum == 0) {
			uh->uh_sum = 0xffff;
		}

		ip->ip_v = IPVERSION;
		ip->ip_hl = sizeof(*ip) >> 2;
		ip->ip_tos = 0;
		/*
		 * ip_output() expects ip_len and ip_off to be in host order.
		 */
		ip->ip_len = len;
		ip->ip_off = IP_DF;
		ip->ip_ttl = (uint8_t)ip_defttl;
		ip->ip_sum = 0;

		bzero(&ro, sizeof(ro));
		(void) ip_output(m, NULL, &ro, IP_OUTARGS, NULL, &ipoa);
		ROUTE_RELEASE(&ro);
	} else {
		struct ip6_out_args ip6oa;
		struct route_in6 ro6;

		bzero(&ip6oa, sizeof(ip6oa));
		ip6oa.ip6oa_boundif = fsw->fsw_ifp->if_index;
		ip6oa.ip6oa_flags = (IP6OAF_SELECT_SRCIF | IP6OAF_BOUND_IF |
		    IP6OAF_BOUND_SRCADDR);
		ip6oa.ip6oa_sotc = SO_TC_UNSPEC;
		ip6oa.ip6oa_netsvctype = _NET_SERVICE_TYPE_UNSPEC;

		uh->uh_sum = in6_cksum(m, IPPROTO_UDP, sizeof(struct ip6_hdr),
		    ulen);
		if (uh->uh_sum == 0) {
			uh->uh_sum = 0xffff;
		}

		ip6->ip6_vfc |= IPV6_VERSION;
		ip6->ip6_hlim = IPV6_DEFHLIM;

		bzero(&ro6, sizeof(ro6));
		(void) ip6_output(m, NULL, &ro6, IPV6_OUTARGS,
		    NULL, NULL, &ip6oa);
		ROUTE_RELEASE(&ro6);
	}
}