xref: /xnu-10063.121.3/bsd/skywalk/nexus/flowswitch/flow/flow_track.c (revision 2c2f96dc2b9a4408a43d3150ae9c105355ca3daa)
1 /*
2  * Copyright (c) 2017-2020 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <skywalk/os_skywalk_private.h>
30 #include <skywalk/nexus/flowswitch/fsw_var.h>
31 #include <skywalk/nexus/flowswitch/flow/flow_var.h>
32 #include <netinet/tcp.h>
33 #include <netinet/tcp_fsm.h>
34 #include <netinet/tcp_seq.h>
35 #include <netinet/tcp_timer.h>
36 #include <netinet/tcp_var.h>
37 #include <netinet/udp.h>
38 #include <netinet/in_stat.h>
39 #include <netinet/ip.h>
40 #include <netinet/ip6.h>
41 #include <sys/kdebug.h>
42 
43 /* min/max linger time (in seconds */
44 #define FLOWTRACK_LINGER_MIN    1
45 #define FLOWTRACK_LINGER_MAX    120
46 
47 /* maximum allowed rate of SYNs per second */
48 #define FLOWTRACK_SYN_RATE      20
49 
50 static int flow_track_tcp(struct flow_entry *, struct flow_track *,
51     struct flow_track *, struct __kern_packet *, bool);
52 static int flow_track_udp(struct flow_entry *, struct flow_track *,
53     struct flow_track *, struct __kern_packet *, bool);
54 
55 static void
flow_track_tcp_get_wscale(struct flow_track * s,struct __kern_packet * pkt)56 flow_track_tcp_get_wscale(struct flow_track *s, struct __kern_packet *pkt)
57 {
58 	const uint8_t *hdr = (uint8_t *)(void *)pkt->pkt_flow_tcp_hdr;
59 	int hlen = pkt->pkt_flow_tcp_hlen;
60 	uint8_t optlen, wscale = 0;
61 	const uint8_t *opt;
62 
63 	_CASSERT(sizeof(s->fse_flags) == sizeof(uint16_t));
64 	ASSERT(hlen >= (int)sizeof(struct tcphdr));
65 
66 	opt = hdr + sizeof(struct tcphdr);
67 	hlen -= sizeof(struct tcphdr);
68 	while (hlen >= 3) {
69 		switch (*opt) {
70 		case TCPOPT_EOL:
71 		case TCPOPT_NOP:
72 			++opt;
73 			--hlen;
74 			break;
75 		case TCPOPT_WINDOW:
76 			wscale = opt[2];
77 			if (wscale > TCP_MAX_WINSHIFT) {
78 				wscale = TCP_MAX_WINSHIFT;
79 			}
80 			os_atomic_or(&s->fse_flags, FLOWSTATEF_WSCALE, relaxed);
81 			OS_FALLTHROUGH;
82 		default:
83 			optlen = opt[1];
84 			if (optlen < 2) {
85 				optlen = 2;
86 			}
87 			hlen -= optlen;
88 			opt += optlen;
89 			break;
90 		}
91 	}
92 	s->fse_wscale = wscale;
93 }
94 
95 static void
flow_track_tcp_init(struct flow_entry * fe,struct flow_track * src,struct flow_track * dst,struct __kern_packet * pkt)96 flow_track_tcp_init(struct flow_entry *fe, struct flow_track *src,
97     struct flow_track *dst, struct __kern_packet *pkt)
98 {
99 #pragma unused(dst)
100 	const uint8_t tcp_flags = pkt->pkt_flow_tcp_flags;
101 
102 	/*
103 	 * Source state initialization.
104 	 */
105 	src->fse_state = TCPS_SYN_SENT;
106 	src->fse_seqlo = ntohl(pkt->pkt_flow_tcp_seq);
107 	src->fse_seqhi = (src->fse_seqlo + pkt->pkt_flow_ulen + 1);
108 	if (tcp_flags & TH_SYN) {
109 		src->fse_seqhi++;
110 		flow_track_tcp_get_wscale(src, pkt);
111 	}
112 	if (tcp_flags & TH_FIN) {
113 		src->fse_seqhi++;
114 	}
115 
116 	src->fse_max_win = MAX(ntohs(pkt->pkt_flow_tcp_win), 1);
117 	if (src->fse_flags & FLOWSTATEF_WSCALE) {
118 		/* remove scale factor from initial window */
119 		int win = src->fse_max_win;
120 		ASSERT(src->fse_wscale <= TCP_MAX_WINSHIFT);
121 		win += (1 << src->fse_wscale);
122 		src->fse_max_win = (uint16_t)((win - 1) >> src->fse_wscale);
123 	}
124 
125 	/*
126 	 * Destination state initialization.
127 	 */
128 	dst->fse_state = TCPS_CLOSED;
129 	dst->fse_seqhi = 1;
130 	dst->fse_max_win = 1;
131 
132 	/*
133 	 * Linger time (in seconds).
134 	 */
135 	fe->fe_linger_wait = (2 * tcp_msl) / TCP_RETRANSHZ;
136 	if (fe->fe_linger_wait < FLOWTRACK_LINGER_MIN) {
137 		fe->fe_linger_wait = FLOWTRACK_LINGER_MIN;
138 	} else if (fe->fe_linger_wait > FLOWTRACK_LINGER_MAX) {
139 		fe->fe_linger_wait = FLOWTRACK_LINGER_MAX;
140 	}
141 
142 	os_atomic_or(&fe->fe_flags, FLOWENTF_INITED, relaxed);
143 }
144 
145 /*
146  * The TCP ACK RTT tracking is a coarse grain measurement of the time it takes
147  * for a endpoint to process incoming segment and generate ACK, at the point of
148  * observation. For flowswitch, it means that:
149  *
150  *     local end RTT  = local stack processing time
151  *     remote end RTT = driver + network + remote endpoint's processing time
152  *
153  * Since the measurement is lightweight and sampling based, it won't learn and
154  * distinguish lost segment's ACK.  So we could occasionally get large RTT
155  * sample from an ACK to a retransmitted segment.  Thus rtt_max is not any
156  * meaningful to us.
157  */
158 __attribute__((always_inline))
159 static inline void
flow_track_tcp_rtt(struct flow_entry * fe,boolean_t input,struct flow_track * src,struct flow_track * dst,uint8_t tcp_flags,uint32_t seq,uint32_t ack,uint32_t ulen)160 flow_track_tcp_rtt(struct flow_entry *fe, boolean_t input,
161     struct flow_track *src, struct flow_track *dst, uint8_t tcp_flags,
162     uint32_t seq, uint32_t ack, uint32_t ulen)
163 {
164 #pragma unused(fe, input) /* KDBG defined as noop in release build */
165 	uint64_t dst_last, src_last;
166 	uint64_t now, time_diff;
167 	uint32_t curval, oldval;
168 	clock_sec_t tv_sec;
169 	clock_usec_t tv_usec;
170 
171 	src_last = src->fse_rtt.frtt_last;
172 	dst_last = dst->fse_rtt.frtt_last;
173 
174 	/* start a new RTT tracking session under sampling rate limit */
175 	if (dst_last == 0 ||
176 	    _net_uptime - dst_last > FLOWTRACK_RTT_SAMPLE_INTERVAL) {
177 		if (ulen > 0 &&
178 		    dst->fse_rtt.frtt_timestamp == 0) {
179 			dst->fse_rtt.frtt_timestamp = mach_absolute_time();
180 			dst->fse_rtt.frtt_last = _net_uptime;
181 			dst->fse_rtt.frtt_seg_begin = seq;
182 			dst->fse_rtt.frtt_seg_end = seq + ulen;
183 			KDBG((SK_KTRACE_FSW_FLOW_TRACK_RTT | DBG_FUNC_START),
184 			    SK_KVA(fe), fe->fe_pid, ntohs(fe->fe_key.fk_sport),
185 			    input ? 1 : 0);
186 		}
187 	}
188 
189 	/* we have an ACK, see if current tracking session matches it */
190 	if (tcp_flags & TH_ACK) {
191 		if (src->fse_rtt.frtt_timestamp != 0 &&
192 		    src->fse_rtt.frtt_seg_begin <= ack) {
193 			now = mach_absolute_time();
194 			time_diff = now - src->fse_rtt.frtt_timestamp;
195 
196 			absolutetime_to_microtime(time_diff, &tv_sec, &tv_usec);
197 			curval = (uint32_t)(tv_usec + tv_sec * 1000 * 1000);
198 			oldval = src->fse_rtt.frtt_usec;
199 			if (oldval == 0) {
200 				src->fse_rtt.frtt_usec = curval;
201 			} else {
202 				/* same EWMA decay as TCP RTT */
203 				src->fse_rtt.frtt_usec =
204 				    ((oldval << 4) - oldval + curval) >> 4;
205 			}
206 
207 			/* reset RTT tracking session */
208 			src->fse_rtt.frtt_timestamp = 0;
209 			src->fse_rtt.frtt_last = 0;
210 			KDBG((SK_KTRACE_FSW_FLOW_TRACK_RTT | DBG_FUNC_END),
211 			    SK_KVA(fe), fe->fe_pid, ntohs(fe->fe_key.fk_sport),
212 			    input ? 0 : 1);
213 
214 			/* publish rtt stats into flow_stats object */
215 			/* just store both to avoid branch prediction etc. */
216 			fe->fe_stats->fs_lrtt = fe->fe_ltrack.fse_rtt_usec;
217 			fe->fe_stats->fs_rrtt = fe->fe_rtrack.fse_rtt_usec;
218 		}
219 	}
220 }
221 
222 /*
223  * The TCP connection tracking logic is based on Guido van Rooij's paper:
224  * http://www.sane.nl/events/sane2000/papers/rooij.pdf
225  *
226  * In some ways, we act as a middlebox that passively tracks the TCP windows
227  * of each connection on flows marked with FLOWENTF_TRACK.  We never modify
228  * the packet or generate any response (e.g. RST) to the sender; thus we are
229  * simply a silent observer.  The information we gather here is used later
230  * if we need to generate a valid {FIN|RST} segment when the flow is nonviable.
231  *
232  * The implementation is borrowed from Packet Filter, and is further
233  * simplified to cater for our use cases.
234  */
235 #define FTF_HALFCLOSED  0x1     /* want flow to be marked as half closed */
236 #define FTF_WAITCLOSE   0x2     /* want flow to linger after close */
237 #define FTF_CLOSENOTIFY 0x4     /* want to notify NECP upon torn down */
238 #define FTF_WITHDRAWN   0x8     /* want flow to be torn down */
239 #define FTF_SYN_RLIM    0x10    /* want flow to rate limit SYN */
240 #define FTF_RST_RLIM    0x20    /* want flow to rate limit RST */
241 __attribute__((always_inline))
242 static inline int
flow_track_tcp(struct flow_entry * fe,struct flow_track * src,struct flow_track * dst,struct __kern_packet * pkt,bool input)243 flow_track_tcp(struct flow_entry *fe, struct flow_track *src,
244     struct flow_track *dst, struct __kern_packet *pkt, bool input)
245 {
246 	const uint8_t tcp_flags = pkt->pkt_flow_tcp_flags;
247 	uint16_t win = ntohs(pkt->pkt_flow_tcp_win);
248 	uint32_t ack, end, seq, orig_seq;
249 	uint32_t ftflags = 0;
250 	uint8_t sws, dws;
251 	int ackskew, err = 0;
252 
253 	if (__improbable((fe->fe_flags & FLOWENTF_INITED) == 0)) {
254 		flow_track_tcp_init(fe, src, dst, pkt);
255 	}
256 
257 	flow_track_tcp_rtt(fe, input, src, dst, tcp_flags,
258 	    ntohl(pkt->pkt_flow_tcp_seq), ntohl(pkt->pkt_flow_tcp_ack),
259 	    pkt->pkt_flow_ulen);
260 
261 	if (__improbable(dst->fse_state >= TCPS_FIN_WAIT_2 &&
262 	    src->fse_state >= TCPS_FIN_WAIT_2)) {
263 		if ((tcp_flags & (TH_SYN | TH_ACK)) == TH_SYN) {
264 			src->fse_state = dst->fse_state = TCPS_CLOSED;
265 			ftflags |= FTF_SYN_RLIM;
266 		}
267 		if (tcp_flags & TH_RST) {
268 			ftflags |= FTF_RST_RLIM;
269 		}
270 		if (input) {
271 			err = ENETRESET;
272 		}
273 		goto done;
274 	}
275 
276 	if (__probable((tcp_flags & TH_SYN) == 0 &&
277 	    src->fse_wscale != 0 && dst->fse_wscale != 0)) {
278 		sws = src->fse_wscale;
279 		dws = dst->fse_wscale;
280 	} else {
281 		sws = dws = 0;
282 	}
283 
284 	orig_seq = seq = ntohl(pkt->pkt_flow_tcp_seq);
285 	if (__probable(src->fse_seqlo != 0)) {
286 		ack = ntohl(pkt->pkt_flow_tcp_ack);
287 		end = seq + pkt->pkt_flow_ulen;
288 		if (tcp_flags & TH_SYN) {
289 			if ((tcp_flags & (TH_SYN | TH_ACK)) == TH_SYN) {
290 				ftflags |= FTF_SYN_RLIM;
291 			}
292 			end++;
293 		}
294 		if (tcp_flags & TH_FIN) {
295 			end++;
296 		}
297 		if (tcp_flags & TH_RST) {
298 			ftflags |= FTF_RST_RLIM;
299 		}
300 	} else {
301 		/* first packet from this end; set its state */
302 		ack = ntohl(pkt->pkt_flow_tcp_ack);
303 
304 		/* We saw the first SYN, but stack does not reply with a SYN */
305 		if (dst->fse_state == TCPS_SYN_SENT && ((tcp_flags & TH_SYN) == 0)) {
306 			/* Act as if no sequence number is set */
307 			seq = 0;
308 			/* Pretend the outgoing SYN was not ACK'ed */
309 			ack = dst->fse_seqlo;
310 		}
311 
312 		end = seq + pkt->pkt_flow_ulen;
313 		if (tcp_flags & TH_SYN) {
314 			if ((tcp_flags & (TH_SYN | TH_ACK)) == TH_SYN) {
315 				ftflags |= FTF_SYN_RLIM;
316 			}
317 			end++;
318 			if (dst->fse_flags & FLOWSTATEF_WSCALE) {
319 				flow_track_tcp_get_wscale(src, pkt);
320 				if (src->fse_flags & FLOWSTATEF_WSCALE) {
321 					/*
322 					 * Remove scale factor from
323 					 * initial window.
324 					 */
325 					sws = src->fse_wscale;
326 					win = (uint16_t)(((u_int32_t)win + (1 << sws) - 1)
327 					    >> sws);
328 					dws = dst->fse_wscale;
329 				} else {
330 					/* fixup other window */
331 					dst->fse_max_win = (uint16_t)(dst->fse_max_win << dst->fse_wscale);
332 					/* in case of a retrans SYN|ACK */
333 					dst->fse_wscale = 0;
334 				}
335 			}
336 		}
337 		if (tcp_flags & TH_FIN) {
338 			end++;
339 		}
340 		if (tcp_flags & TH_RST) {
341 			ftflags |= FTF_RST_RLIM;
342 		}
343 
344 		src->fse_seqlo = seq;
345 		if (src->fse_state < TCPS_SYN_SENT) {
346 			if (tcp_flags & TH_SYN) {
347 				src->fse_state = TCPS_SYN_SENT;
348 			} else {
349 				/* Picking up the connection in the middle */
350 				src->fse_state = TCPS_ESTABLISHED;
351 			}
352 		}
353 
354 		/*
355 		 * May need to slide the window (seqhi may have been set by
356 		 * the crappy stack check or if we picked up the connection
357 		 * after establishment).
358 		 */
359 		if (src->fse_seqhi == 1 || SEQ_GEQ(end +
360 		    MAX(1, dst->fse_max_win << dws), src->fse_seqhi)) {
361 			src->fse_seqhi = end + MAX(1, dst->fse_max_win << dws);
362 		}
363 		if (win > src->fse_max_win) {
364 			src->fse_max_win = win;
365 		}
366 	}
367 
368 	if (!(tcp_flags & TH_ACK)) {
369 		/* let it pass through the ack skew check */
370 		ack = dst->fse_seqlo;
371 	} else if ((ack == 0 &&
372 	    (tcp_flags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) ||
373 	    /* broken tcp stacks do not set ack */
374 	    (dst->fse_state < TCPS_SYN_SENT)) {
375 		/*
376 		 * Many stacks (ours included) will set the ACK number in an
377 		 * FIN|ACK if the SYN times out -- no sequence to ACK.
378 		 */
379 		ack = dst->fse_seqlo;
380 	}
381 
382 	if (seq == end) {
383 		/* ease sequencing restrictions on no data packets */
384 		seq = src->fse_seqlo;
385 		end = seq;
386 	}
387 
388 	ackskew = dst->fse_seqlo - ack;
389 
390 #define MAXACKWINDOW (0xffff + 1500)    /* 1500 is an arbitrary fudge factor */
391 	if (SEQ_GEQ(src->fse_seqhi, end) &&
392 	    /* last octet inside other's window space */
393 	    SEQ_GEQ(seq, src->fse_seqlo - (dst->fse_max_win << dws)) &&
394 	    /* retrans: not more than one window back */
395 	    (ackskew >= -MAXACKWINDOW) &&
396 	    /* acking not more than one reassembled fragment backwards */
397 	    (ackskew <= (MAXACKWINDOW << sws)) &&
398 	    /* acking not more than one window forward */
399 	    (!(tcp_flags & TH_RST) || orig_seq == src->fse_seqlo ||
400 	    (orig_seq == src->fse_seqlo + 1) ||
401 	    (orig_seq + 1 == src->fse_seqlo))) {
402 		/* require an exact/+1 sequence match on resets when possible */
403 
404 		/* update max window */
405 		if (src->fse_max_win < win) {
406 			src->fse_max_win = win;
407 		}
408 		/* synchronize sequencing */
409 		if (SEQ_GT(end, src->fse_seqlo)) {
410 			src->fse_seqlo = end;
411 		}
412 		/* slide the window of what the other end can send */
413 		if (SEQ_GEQ(ack + (win << sws), dst->fse_seqhi)) {
414 			dst->fse_seqhi = ack + MAX((win << sws), 1);
415 		}
416 
417 		/* update states */
418 		if (tcp_flags & TH_SYN) {
419 			if (src->fse_state < TCPS_SYN_SENT) {
420 				src->fse_state = TCPS_SYN_SENT;
421 			}
422 		}
423 		if (tcp_flags & TH_FIN) {
424 			if (src->fse_state < TCPS_CLOSING) {
425 				src->fse_seqlast = orig_seq + pkt->pkt_flow_ulen;
426 				src->fse_state = TCPS_CLOSING;
427 			}
428 		}
429 		if (tcp_flags & TH_ACK) {
430 			/*
431 			 * Avoid transitioning to ESTABLISHED when our SYN
432 			 * is ACK'd along with a RST.  The sending TCP may
433 			 * still retransmit the SYN (after dropping some
434 			 * options like ECN, etc.)
435 			 */
436 			if (dst->fse_state == TCPS_SYN_SENT &&
437 			    !(tcp_flags & TH_RST)) {
438 				dst->fse_state = TCPS_ESTABLISHED;
439 				ftflags |= (FTF_WAITCLOSE | FTF_CLOSENOTIFY);
440 			} else if (dst->fse_state == TCPS_CLOSING &&
441 			    ack == dst->fse_seqlast + 1) {
442 				dst->fse_state = TCPS_FIN_WAIT_2;
443 				ftflags |= FTF_WAITCLOSE;
444 				if (src->fse_state >= TCPS_FIN_WAIT_2) {
445 					ftflags |= FTF_WITHDRAWN;
446 				} else {
447 					ftflags |= FTF_HALFCLOSED;
448 				}
449 			}
450 		}
451 		if ((tcp_flags & TH_RST) &&
452 		    (src->fse_state == TCPS_ESTABLISHED ||
453 		    dst->fse_state == TCPS_ESTABLISHED)) {
454 			/*
455 			 * If either endpoint is in ESTABLISHED, transition
456 			 * both to TIME_WAIT.  Otherwise, keep the existing
457 			 * state as is, e.g. SYN_SENT.
458 			 */
459 			src->fse_state = dst->fse_state = TCPS_TIME_WAIT;
460 			ftflags |= (FTF_WITHDRAWN | FTF_WAITCLOSE);
461 		}
462 	} else if ((dst->fse_state < TCPS_SYN_SENT ||
463 	    dst->fse_state >= TCPS_FIN_WAIT_2 ||
464 	    src->fse_state >= TCPS_FIN_WAIT_2) &&
465 	    SEQ_GEQ(src->fse_seqhi + MAXACKWINDOW, end) &&
466 	    /* within a window forward of the originating packet */
467 	    SEQ_GEQ(seq, src->fse_seqlo - MAXACKWINDOW)) {
468 		/* within a window backward of the originating packet */
469 
470 		/* BEGIN CSTYLED */
471 		/*
472 		 * This currently handles three situations:
473 		 *  1) Stupid stacks will shotgun SYNs before their peer
474 		 *     replies.
475 		 *  2) When flow tracking catches an already established
476 		 *     stream (the flow states are cleared, etc.)
477 		 *  3) Packets get funky immediately after the connection
478 		 *     closes (this should catch spurious ACK|FINs that
479 		 *     web servers like to spew after a close).
480 		 *
481 		 * This must be a little more careful than the above code
482 		 * since packet floods will also be caught here.
483 		 */
484 		/* END CSTYLED */
485 
486 		/* update max window */
487 		if (src->fse_max_win < win) {
488 			src->fse_max_win = win;
489 		}
490 		/* synchronize sequencing */
491 		if (SEQ_GT(end, src->fse_seqlo)) {
492 			src->fse_seqlo = end;
493 		}
494 		/* slide the window of what the other end can send */
495 		if (SEQ_GEQ(ack + (win << sws), dst->fse_seqhi)) {
496 			dst->fse_seqhi = ack + MAX((win << sws), 1);
497 		}
498 
499 		/*
500 		 * Cannot set dst->fse_seqhi here since this could be a
501 		 * shotgunned SYN and not an already established connection.
502 		 */
503 
504 		if (tcp_flags & TH_FIN) {
505 			if (src->fse_state < TCPS_CLOSING) {
506 				src->fse_seqlast = orig_seq + pkt->pkt_flow_ulen;
507 				src->fse_state = TCPS_CLOSING;
508 			}
509 		}
510 		if (tcp_flags & TH_RST) {
511 			src->fse_state = dst->fse_state = TCPS_TIME_WAIT;
512 			ftflags |= FTF_WAITCLOSE;
513 		}
514 	} else {
515 		if (dst->fse_state == TCPS_SYN_SENT &&
516 		    src->fse_state == TCPS_SYN_SENT) {
517 			src->fse_seqlo = 0;
518 			src->fse_seqhi = 1;
519 			src->fse_max_win = 1;
520 		}
521 	}
522 
523 done:
524 	if (__improbable((ftflags & FTF_HALFCLOSED) != 0)) {
525 		os_atomic_or(&fe->fe_flags, FLOWENTF_HALF_CLOSED, relaxed);
526 		ftflags &= ~FTF_HALFCLOSED;
527 	}
528 
529 	/*
530 	 * Hold on to namespace for a while after the flow is closed.
531 	 */
532 	if (__improbable((ftflags & FTF_WAITCLOSE) != 0 &&
533 	    (fe->fe_flags & FLOWENTF_WAIT_CLOSE) == 0)) {
534 		os_atomic_or(&fe->fe_flags, FLOWENTF_WAIT_CLOSE, relaxed);
535 		ftflags &= ~FTF_WAITCLOSE;
536 	}
537 
538 	/*
539 	 * Notify NECP upon tear down (for established flows).
540 	 */
541 	if (__improbable((ftflags & FTF_CLOSENOTIFY) != 0 &&
542 	    (fe->fe_flags & FLOWENTF_CLOSE_NOTIFY) == 0)) {
543 		os_atomic_or(&fe->fe_flags, FLOWENTF_CLOSE_NOTIFY, relaxed);
544 		ftflags &= ~FTF_CLOSENOTIFY;
545 	}
546 
547 	/*
548 	 * Flow is withdrawn; the port we have should not be included in
549 	 * the list of offloaded ports, as the connection is no longer
550 	 * usable (we're not expecting any more data).
551 	 * Also clear FLOWENTF_HALF_CLOSED flag here. It's fine if reaper
552 	 * thread hadn't pickedup FLOWENTF_HALF_CLOSED, as it will pick up
553 	 * FLOWENTF_WITHDRAWN and notify netns of full withdrawn.
554 	 */
555 	if (__improbable((ftflags & FTF_WITHDRAWN) != 0)) {
556 		ftflags &= ~FTF_WITHDRAWN;
557 		if (fe->fe_flags & FLOWENTF_HALF_CLOSED) {
558 			os_atomic_andnot(&fe->fe_flags, FLOWENTF_HALF_CLOSED, relaxed);
559 		}
560 		fe->fe_want_withdraw = 1;
561 	}
562 
563 	/*
564 	 * If no other work is needed, we're done.
565 	 */
566 	if (ftflags == 0 || input) {
567 		return err;
568 	}
569 
570 	/*
571 	 * If we're over the rate limit for outbound SYNs, drop packet.
572 	 */
573 	if (__improbable((ftflags & FTF_SYN_RLIM) != 0)) {
574 		uint32_t now = (uint32_t)_net_uptime;
575 		if ((now - src->fse_syn_ts) > 1) {
576 			src->fse_syn_ts = now;
577 			src->fse_syn_cnt = 0;
578 		}
579 		if (++src->fse_syn_cnt > FLOWTRACK_SYN_RATE) {
580 			err = EPROTO;
581 		}
582 	}
583 
584 	return err;
585 }
586 #undef FTF_WAITCLOSE
587 #undef FTF_CLOSENOTIFY
588 #undef FTF_WITHDRAWN
589 #undef FTF_SYN_RLIM
590 #undef FTF_RST_RLIM
591 
592 boolean_t
flow_track_tcp_want_abort(struct flow_entry * fe)593 flow_track_tcp_want_abort(struct flow_entry *fe)
594 {
595 	struct flow_track *src = &fe->fe_ltrack;
596 	struct flow_track *dst = &fe->fe_rtrack;
597 
598 	if (fe->fe_key.fk_proto != IPPROTO_TCP ||
599 	    (fe->fe_flags & FLOWENTF_ABORTED)) {
600 		goto done;
601 	}
602 
603 	/* this can be enhanced; for now rely on established state */
604 	if (src->fse_state == TCPS_ESTABLISHED ||
605 	    dst->fse_state == TCPS_ESTABLISHED) {
606 		src->fse_state = dst->fse_state = TCPS_TIME_WAIT;
607 		/* don't process more than once */
608 		os_atomic_or(&fe->fe_flags, FLOWENTF_ABORTED, relaxed);
609 		return TRUE;
610 	}
611 done:
612 	return FALSE;
613 }
614 
615 static void
flow_track_udp_init(struct flow_entry * fe,struct flow_track * src,struct flow_track * dst,struct __kern_packet * pkt)616 flow_track_udp_init(struct flow_entry *fe, struct flow_track *src,
617     struct flow_track *dst, struct __kern_packet *pkt)
618 {
619 #pragma unused(pkt)
620 	/*
621 	 * Source state initialization.
622 	 */
623 	src->fse_state = FT_STATE_NO_TRAFFIC;
624 
625 	/*
626 	 * Destination state initialization.
627 	 */
628 	dst->fse_state = FT_STATE_NO_TRAFFIC;
629 
630 	os_atomic_or(&fe->fe_flags, FLOWENTF_INITED, relaxed);
631 }
632 
633 __attribute__((always_inline))
634 static inline int
flow_track_udp(struct flow_entry * fe,struct flow_track * src,struct flow_track * dst,struct __kern_packet * pkt,bool input)635 flow_track_udp(struct flow_entry *fe, struct flow_track *src,
636     struct flow_track *dst, struct __kern_packet *pkt, bool input)
637 {
638 #pragma unused(input)
639 	if (__improbable((fe->fe_flags & FLOWENTF_INITED) == 0)) {
640 		flow_track_udp_init(fe, src, dst, pkt);
641 	}
642 
643 	if (__improbable(src->fse_state == FT_STATE_NO_TRAFFIC)) {
644 		src->fse_state = FT_STATE_SINGLE;
645 	}
646 	if (__improbable(dst->fse_state == FT_STATE_SINGLE)) {
647 		dst->fse_state = FT_STATE_MULTIPLE;
648 	}
649 
650 	return 0;
651 }
652 
653 void
flow_track_stats(struct flow_entry * fe,uint64_t bytes,uint64_t packets,bool active,bool in)654 flow_track_stats(struct flow_entry *fe, uint64_t bytes, uint64_t packets,
655     bool active, bool in)
656 {
657 	volatile struct sk_stats_flow_track *fst;
658 
659 	if (in) {
660 		fst = &fe->fe_stats->fs_rtrack;
661 	} else {
662 		fst = &fe->fe_stats->fs_ltrack;
663 	}
664 
665 	fst->sft_bytes += bytes;
666 	fst->sft_packets += packets;
667 
668 	if (__probable(active)) {
669 		in_stat_set_activity_bitmap(&fe->fe_stats->fs_activity,
670 		    _net_uptime);
671 	}
672 }
673 
674 int
flow_pkt_track(struct flow_entry * fe,struct __kern_packet * pkt,bool in)675 flow_pkt_track(struct flow_entry *fe, struct __kern_packet *pkt, bool in)
676 {
677 	struct flow_track *src, *dst;
678 	int ret = 0;
679 
680 	_CASSERT(SFT_STATE_CLOSED == FT_STATE_CLOSED);
681 	_CASSERT(SFT_STATE_LISTEN == FT_STATE_LISTEN);
682 	_CASSERT(SFT_STATE_SYN_SENT == FT_STATE_SYN_SENT);
683 	_CASSERT(SFT_STATE_SYN_RECEIVED == FT_STATE_SYN_RECEIVED);
684 	_CASSERT(SFT_STATE_ESTABLISHED == FT_STATE_ESTABLISHED);
685 	_CASSERT(SFT_STATE_CLOSE_WAIT == FT_STATE_CLOSE_WAIT);
686 	_CASSERT(SFT_STATE_FIN_WAIT_1 == FT_STATE_FIN_WAIT_1);
687 	_CASSERT(SFT_STATE_CLOSING == FT_STATE_CLOSING);
688 	_CASSERT(SFT_STATE_LAST_ACK == FT_STATE_LAST_ACK);
689 	_CASSERT(SFT_STATE_FIN_WAIT_2 == FT_STATE_FIN_WAIT_2);
690 	_CASSERT(SFT_STATE_TIME_WAIT == FT_STATE_TIME_WAIT);
691 	_CASSERT(SFT_STATE_NO_TRAFFIC == FT_STATE_NO_TRAFFIC);
692 	_CASSERT(SFT_STATE_SINGLE == FT_STATE_SINGLE);
693 	_CASSERT(SFT_STATE_MULTIPLE == FT_STATE_MULTIPLE);
694 	_CASSERT(SFT_STATE_MAX == FT_STATE_MAX);
695 
696 	_CASSERT(FT_STATE_CLOSED == TCPS_CLOSED);
697 	_CASSERT(FT_STATE_LISTEN == TCPS_LISTEN);
698 	_CASSERT(FT_STATE_SYN_SENT == TCPS_SYN_SENT);
699 	_CASSERT(FT_STATE_SYN_RECEIVED == TCPS_SYN_RECEIVED);
700 	_CASSERT(FT_STATE_ESTABLISHED == TCPS_ESTABLISHED);
701 	_CASSERT(FT_STATE_CLOSE_WAIT == TCPS_CLOSE_WAIT);
702 	_CASSERT(FT_STATE_FIN_WAIT_1 == TCPS_FIN_WAIT_1);
703 	_CASSERT(FT_STATE_CLOSING == TCPS_CLOSING);
704 	_CASSERT(FT_STATE_LAST_ACK == TCPS_LAST_ACK);
705 	_CASSERT(FT_STATE_FIN_WAIT_2 == TCPS_FIN_WAIT_2);
706 	_CASSERT(FT_STATE_TIME_WAIT == TCPS_TIME_WAIT);
707 
708 	ASSERT(pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED);
709 
710 	if (in) {
711 		src = &fe->fe_rtrack;
712 		dst = &fe->fe_ltrack;
713 	} else {
714 		src = &fe->fe_ltrack;
715 		dst = &fe->fe_rtrack;
716 	}
717 
718 	flow_track_stats(fe, (pkt->pkt_length - pkt->pkt_l2_len), 1,
719 	    (pkt->pkt_flow_ulen != 0), in);
720 
721 	/* skip flow state tracking on non-initial fragments */
722 	if (pkt->pkt_flow_ip_is_frag && !pkt->pkt_flow_ip_is_first_frag) {
723 		return 0;
724 	}
725 
726 	switch (pkt->pkt_flow_ip_proto) {
727 	case IPPROTO_TCP:
728 		if (__probable((fe->fe_flags & FLOWENTF_TRACK) != 0)) {
729 			ret = flow_track_tcp(fe, src, dst, pkt, in);
730 		}
731 		break;
732 
733 	case IPPROTO_UDP:
734 		if (__probable((fe->fe_flags & FLOWENTF_TRACK) != 0)) {
735 			ret = flow_track_udp(fe, src, dst, pkt, in);
736 		}
737 		break;
738 	}
739 
740 	return ret;
741 }
742 
743 /*
744  * @function flow_track_abort_tcp
745  * @abstract send RST for a given TCP flow.
746  * @param in_pkt incoming packet that triggers RST.
747  * @param rst_pkt use as RST template for SEQ/ACK information.
748  */
749 void
flow_track_abort_tcp(struct flow_entry * fe,struct __kern_packet * in_pkt,struct __kern_packet * rst_pkt)750 flow_track_abort_tcp(struct flow_entry *fe, struct __kern_packet *in_pkt,
751     struct __kern_packet *rst_pkt)
752 {
753 	struct nx_flowswitch *fsw = fe->fe_fsw;
754 	struct flow_track *src, *dst;
755 	struct ip *ip;
756 	struct ip6_hdr *ip6;
757 	struct tcphdr *th;
758 	uint16_t len, tlen;
759 	struct mbuf *m;
760 
761 	/* guaranteed by caller */
762 	ASSERT(fsw->fsw_ifp != NULL);
763 	ASSERT(in_pkt == NULL || rst_pkt == NULL);
764 
765 	src = &fe->fe_ltrack;
766 	dst = &fe->fe_rtrack;
767 
768 	tlen = sizeof(struct tcphdr);
769 	if (fe->fe_key.fk_ipver == IPVERSION) {
770 		len = sizeof(struct ip) + tlen;
771 	} else {
772 		ASSERT(fe->fe_key.fk_ipver == IPV6_VERSION);
773 		len = sizeof(struct ip6_hdr) + tlen;
774 	}
775 
776 	m = m_gethdr(M_NOWAIT, MT_HEADER);
777 	if (__improbable(m == NULL)) {
778 		return;
779 	}
780 
781 	m->m_pkthdr.pkt_proto = IPPROTO_TCP;
782 	m->m_data += max_linkhdr;               /* 32-bit aligned */
783 	m->m_pkthdr.len = m->m_len = len;
784 
785 	/* zero out for checksum */
786 	bzero(m_mtod_current(m), len);
787 
788 	if (fe->fe_key.fk_ipver == IPVERSION) {
789 		ip = mtod(m, struct ip *);
790 
791 		/* IP header fields included in the TCP checksum */
792 		ip->ip_p = IPPROTO_TCP;
793 		ip->ip_len = htons(tlen);
794 		if (rst_pkt == NULL) {
795 			ip->ip_src = fe->fe_key.fk_src4;
796 			ip->ip_dst = fe->fe_key.fk_dst4;
797 		} else {
798 			ip->ip_src = rst_pkt->pkt_flow_ipv4_src;
799 			ip->ip_dst = rst_pkt->pkt_flow_ipv4_dst;
800 		}
801 
802 		th = (struct tcphdr *)(void *)((char *)ip + sizeof(*ip));
803 	} else {
804 		ip6 = mtod(m, struct ip6_hdr *);
805 
806 		/* IP header fields included in the TCP checksum */
807 		ip6->ip6_nxt = IPPROTO_TCP;
808 		ip6->ip6_plen = htons(tlen);
809 		if (rst_pkt == NULL) {
810 			ip6->ip6_src = fe->fe_key.fk_src6;
811 			ip6->ip6_dst = fe->fe_key.fk_dst6;
812 		} else {
813 			ip6->ip6_src = rst_pkt->pkt_flow_ipv6_src;
814 			ip6->ip6_dst = rst_pkt->pkt_flow_ipv6_dst;
815 		}
816 
817 		th = (struct tcphdr *)(void *)((char *)ip6 + sizeof(*ip6));
818 	}
819 
820 	/*
821 	 * TCP header (fabricate a pure RST).
822 	 */
823 	if (in_pkt != NULL) {
824 		th->th_sport = in_pkt->pkt_flow_tcp_dst;
825 		th->th_dport = in_pkt->pkt_flow_tcp_src;
826 		if (__probable(in_pkt->pkt_flow_tcp_flags | TH_ACK)) {
827 			/* <SEQ=SEG.ACK><CTL=RST> */
828 			th->th_seq = in_pkt->pkt_flow_tcp_ack;
829 			th->th_ack = 0;
830 			th->th_flags = TH_RST;
831 		} else {
832 			/* <SEQ=0><ACK=SEG.SEQ+SEG.LEN><CTL=RST,ACK> */
833 			th->th_seq = 0;
834 			th->th_ack = in_pkt->pkt_flow_tcp_seq +
835 			    in_pkt->pkt_flow_ulen;
836 			th->th_flags = TH_RST | TH_ACK;
837 		}
838 	} else if (rst_pkt != NULL) {
839 		th->th_sport = rst_pkt->pkt_flow_tcp_src;
840 		th->th_dport = rst_pkt->pkt_flow_tcp_dst;
841 		th->th_seq = rst_pkt->pkt_flow_tcp_seq;
842 		th->th_ack = rst_pkt->pkt_flow_tcp_ack;
843 		th->th_flags = rst_pkt->pkt_flow_tcp_flags;
844 	} else {
845 		th->th_sport = fe->fe_key.fk_sport;
846 		th->th_dport = fe->fe_key.fk_dport;
847 		th->th_seq = htonl(src->fse_seqlo);     /* peer's last ACK */
848 		th->th_ack = 0;
849 		th->th_flags = TH_RST;
850 	}
851 	th->th_off = (tlen >> 2);
852 	th->th_win = 0;
853 
854 	FSW_STATS_INC(FSW_STATS_FLOWS_ABORTED);
855 
856 	if (fe->fe_key.fk_ipver == IPVERSION) {
857 		struct ip_out_args ipoa;
858 		struct route ro;
859 
860 		bzero(&ipoa, sizeof(ipoa));
861 		ipoa.ipoa_boundif = fsw->fsw_ifp->if_index;
862 		ipoa.ipoa_flags = (IPOAF_SELECT_SRCIF | IPOAF_BOUND_IF |
863 		    IPOAF_BOUND_SRCADDR);
864 		ipoa.ipoa_sotc = SO_TC_UNSPEC;
865 		ipoa.ipoa_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
866 
867 		/* TCP checksum */
868 		th->th_sum = in_cksum(m, len);
869 
870 		ip->ip_v = IPVERSION;
871 		ip->ip_hl = sizeof(*ip) >> 2;
872 		ip->ip_tos = 0;
873 		/*
874 		 * ip_output() expects ip_len and ip_off to be in host order.
875 		 */
876 		ip->ip_len = len;
877 		ip->ip_off = IP_DF;
878 		ip->ip_ttl = (uint8_t)ip_defttl;
879 		ip->ip_sum = 0;
880 
881 		bzero(&ro, sizeof(ro));
882 		(void) ip_output(m, NULL, &ro, IP_OUTARGS, NULL, &ipoa);
883 		ROUTE_RELEASE(&ro);
884 	} else {
885 		struct ip6_out_args ip6oa;
886 		struct route_in6 ro6;
887 
888 		bzero(&ip6oa, sizeof(ip6oa));
889 		ip6oa.ip6oa_boundif = fsw->fsw_ifp->if_index;
890 		ip6oa.ip6oa_flags = (IP6OAF_SELECT_SRCIF | IP6OAF_BOUND_IF |
891 		    IP6OAF_BOUND_SRCADDR);
892 		ip6oa.ip6oa_sotc = SO_TC_UNSPEC;
893 		ip6oa.ip6oa_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
894 
895 		/* TCP checksum */
896 		th->th_sum = in6_cksum(m, IPPROTO_TCP,
897 		    sizeof(struct ip6_hdr), tlen);
898 
899 		ip6->ip6_vfc |= IPV6_VERSION;
900 		ip6->ip6_hlim = IPV6_DEFHLIM;
901 
902 		bzero(&ro6, sizeof(ro6));
903 		(void) ip6_output(m, NULL, &ro6, IPV6_OUTARGS,
904 		    NULL, NULL, &ip6oa);
905 		ROUTE_RELEASE(&ro6);
906 	}
907 }
908 
909 void
flow_track_abort_quic(struct flow_entry * fe,uint8_t * token)910 flow_track_abort_quic(struct flow_entry *fe, uint8_t *token)
911 {
912 	struct quic_stateless_reset {
913 		uint8_t ssr_header[30];
914 		uint8_t ssr_token[QUIC_STATELESS_RESET_TOKEN_SIZE];
915 	};
916 	struct nx_flowswitch *fsw = fe->fe_fsw;
917 	struct ip *ip;
918 	struct ip6_hdr *ip6;
919 	struct udphdr *uh;
920 	struct quic_stateless_reset *qssr;
921 	uint16_t len, l3hlen, ulen;
922 	struct mbuf *m;
923 	unsigned int one = 1;
924 	int error;
925 
926 	/* guaranteed by caller */
927 	ASSERT(fsw->fsw_ifp != NULL);
928 
929 	/* skip zero token */
930 	bool is_zero_token = true;
931 	for (size_t i = 0; i < QUIC_STATELESS_RESET_TOKEN_SIZE; i++) {
932 		if (token[i] != 0) {
933 			is_zero_token = false;
934 			break;
935 		}
936 	}
937 	if (is_zero_token) {
938 		return;
939 	}
940 
941 	ulen = sizeof(struct udphdr) + sizeof(struct quic_stateless_reset);
942 	if (fe->fe_key.fk_ipver == IPVERSION) {
943 		l3hlen = sizeof(struct ip);
944 	} else {
945 		ASSERT(fe->fe_key.fk_ipver == IPV6_VERSION);
946 		l3hlen = sizeof(struct ip6_hdr);
947 	}
948 
949 	len = l3hlen + ulen;
950 
951 	error = mbuf_allocpacket(MBUF_DONTWAIT, max_linkhdr + len, &one, &m);
952 	if (__improbable(error != 0)) {
953 		return;
954 	}
955 	VERIFY(m != 0);
956 
957 	m->m_pkthdr.pkt_proto = IPPROTO_UDP;
958 	m->m_data += max_linkhdr;               /* 32-bit aligned */
959 	m->m_pkthdr.len = m->m_len = len;
960 
961 	/* zero out for checksum */
962 	bzero(m_mtod_current(m), len);
963 
964 	if (fe->fe_key.fk_ipver == IPVERSION) {
965 		ip = mtod(m, struct ip *);
966 		ip->ip_p = IPPROTO_UDP;
967 		ip->ip_len = htons(ulen);
968 		ip->ip_src = fe->fe_key.fk_src4;
969 		ip->ip_dst = fe->fe_key.fk_dst4;
970 		uh = (struct udphdr *)(void *)((char *)ip + sizeof(*ip));
971 	} else {
972 		ip6 = mtod(m, struct ip6_hdr *);
973 		ip6->ip6_nxt = IPPROTO_UDP;
974 		ip6->ip6_plen = htons(ulen);
975 		ip6->ip6_src = fe->fe_key.fk_src6;
976 		ip6->ip6_dst = fe->fe_key.fk_dst6;
977 		uh = (struct udphdr *)(void *)((char *)ip6 + sizeof(*ip6));
978 	}
979 
980 	/* UDP header */
981 	uh->uh_sport = fe->fe_key.fk_sport;
982 	uh->uh_dport = fe->fe_key.fk_dport;
983 	uh->uh_ulen = htons(ulen);
984 
985 	/* QUIC stateless reset */
986 	qssr = (struct quic_stateless_reset *)(uh + 1);
987 	read_frandom(&qssr->ssr_header, sizeof(qssr->ssr_header));
988 	qssr->ssr_header[0] = (qssr->ssr_header[0] & 0x3f) | 0x40;
989 	memcpy(qssr->ssr_token, token, QUIC_STATELESS_RESET_TOKEN_SIZE);
990 
991 	FSW_STATS_INC(FSW_STATS_FLOWS_ABORTED);
992 
993 	if (fe->fe_key.fk_ipver == IPVERSION) {
994 		struct ip_out_args ipoa;
995 		struct route ro;
996 
997 		bzero(&ipoa, sizeof(ipoa));
998 		ipoa.ipoa_boundif = fsw->fsw_ifp->if_index;
999 		ipoa.ipoa_flags = (IPOAF_SELECT_SRCIF | IPOAF_BOUND_IF |
1000 		    IPOAF_BOUND_SRCADDR);
1001 		ipoa.ipoa_sotc = SO_TC_UNSPEC;
1002 		ipoa.ipoa_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
1003 
1004 		uh->uh_sum = in_cksum(m, len);
1005 		if (uh->uh_sum == 0) {
1006 			uh->uh_sum = 0xffff;
1007 		}
1008 
1009 		ip->ip_v = IPVERSION;
1010 		ip->ip_hl = sizeof(*ip) >> 2;
1011 		ip->ip_tos = 0;
1012 		/*
1013 		 * ip_output() expects ip_len and ip_off to be in host order.
1014 		 */
1015 		ip->ip_len = len;
1016 		ip->ip_off = IP_DF;
1017 		ip->ip_ttl = (uint8_t)ip_defttl;
1018 		ip->ip_sum = 0;
1019 
1020 		bzero(&ro, sizeof(ro));
1021 		(void) ip_output(m, NULL, &ro, IP_OUTARGS, NULL, &ipoa);
1022 		ROUTE_RELEASE(&ro);
1023 	} else {
1024 		struct ip6_out_args ip6oa;
1025 		struct route_in6 ro6;
1026 
1027 		bzero(&ip6oa, sizeof(ip6oa));
1028 		ip6oa.ip6oa_boundif = fsw->fsw_ifp->if_index;
1029 		ip6oa.ip6oa_flags = (IP6OAF_SELECT_SRCIF | IP6OAF_BOUND_IF |
1030 		    IP6OAF_BOUND_SRCADDR);
1031 		ip6oa.ip6oa_sotc = SO_TC_UNSPEC;
1032 		ip6oa.ip6oa_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
1033 
1034 		uh->uh_sum = in6_cksum(m, IPPROTO_UDP, sizeof(struct ip6_hdr),
1035 		    ulen);
1036 		if (uh->uh_sum == 0) {
1037 			uh->uh_sum = 0xffff;
1038 		}
1039 
1040 		ip6->ip6_vfc |= IPV6_VERSION;
1041 		ip6->ip6_hlim = IPV6_DEFHLIM;
1042 
1043 		bzero(&ro6, sizeof(ro6));
1044 		(void) ip6_output(m, NULL, &ro6, IPV6_OUTARGS,
1045 		    NULL, NULL, &ip6oa);
1046 		ROUTE_RELEASE(&ro6);
1047 	}
1048 }
1049