xref: /xnu-8020.121.3/bsd/skywalk/nexus/flowswitch/flow/flow_track.c (revision fdd8201d7b966f0c3ea610489d29bd841d358941)
1 /*
2  * Copyright (c) 2017-2020 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <skywalk/os_skywalk_private.h>
30 #include <skywalk/nexus/flowswitch/fsw_var.h>
31 #include <skywalk/nexus/flowswitch/flow/flow_var.h>
32 #include <netinet/tcp.h>
33 #include <netinet/tcp_fsm.h>
34 #include <netinet/tcp_seq.h>
35 #include <netinet/tcp_timer.h>
36 #include <netinet/tcp_var.h>
37 #include <netinet/in_stat.h>
38 #include <netinet/ip.h>
39 #include <netinet/ip6.h>
40 #include <sys/kdebug.h>
41 
/*
 * Lower and upper bounds (in seconds) that the per-flow linger time is
 * clamped to when a TCP flow is first initialized.
 */
#define FLOWTRACK_LINGER_MIN    1
#define FLOWTRACK_LINGER_MAX    120

/*
 * Maximum number of outbound SYNs tolerated per rate-limiting interval;
 * SYNs beyond this count are rejected by the TCP tracker.
 */
#define FLOWTRACK_SYN_RATE      20
48 
49 static int flow_track_tcp(struct flow_entry *, struct flow_track *,
50     struct flow_track *, struct __kern_packet *, bool);
51 static int flow_track_udp(struct flow_entry *, struct flow_track *,
52     struct flow_track *, struct __kern_packet *, bool);
53 
54 static void
flow_track_tcp_get_wscale(struct flow_track * s,struct __kern_packet * pkt)55 flow_track_tcp_get_wscale(struct flow_track *s, struct __kern_packet *pkt)
56 {
57 	const uint8_t *hdr = (uint8_t *)(void *)pkt->pkt_flow_tcp_hdr;
58 	int hlen = pkt->pkt_flow_tcp_hlen;
59 	uint8_t optlen, wscale = 0;
60 	const uint8_t *opt;
61 
62 	_CASSERT(sizeof(s->fse_flags) == sizeof(uint16_t));
63 	ASSERT(hlen >= (int)sizeof(struct tcphdr));
64 
65 	opt = hdr + sizeof(struct tcphdr);
66 	hlen -= sizeof(struct tcphdr);
67 	while (hlen >= 3) {
68 		switch (*opt) {
69 		case TCPOPT_EOL:
70 		case TCPOPT_NOP:
71 			++opt;
72 			--hlen;
73 			break;
74 		case TCPOPT_WINDOW:
75 			wscale = opt[2];
76 			if (wscale > TCP_MAX_WINSHIFT) {
77 				wscale = TCP_MAX_WINSHIFT;
78 			}
79 			atomic_bitset_16(&s->fse_flags, FLOWSTATEF_WSCALE);
80 			OS_FALLTHROUGH;
81 		default:
82 			optlen = opt[1];
83 			if (optlen < 2) {
84 				optlen = 2;
85 			}
86 			hlen -= optlen;
87 			opt += optlen;
88 			break;
89 		}
90 	}
91 	s->fse_wscale = wscale;
92 }
93 
94 static void
flow_track_tcp_init(struct flow_entry * fe,struct flow_track * src,struct flow_track * dst,struct __kern_packet * pkt)95 flow_track_tcp_init(struct flow_entry *fe, struct flow_track *src,
96     struct flow_track *dst, struct __kern_packet *pkt)
97 {
98 #pragma unused(dst)
99 	const uint8_t tcp_flags = pkt->pkt_flow_tcp_flags;
100 
101 	/*
102 	 * Source state initialization.
103 	 */
104 	src->fse_state = TCPS_SYN_SENT;
105 	src->fse_seqlo = ntohl(pkt->pkt_flow_tcp_seq);
106 	src->fse_seqhi = (src->fse_seqlo + pkt->pkt_flow_ulen + 1);
107 	if (tcp_flags & TH_SYN) {
108 		src->fse_seqhi++;
109 		flow_track_tcp_get_wscale(src, pkt);
110 	}
111 	if (tcp_flags & TH_FIN) {
112 		src->fse_seqhi++;
113 	}
114 
115 	src->fse_max_win = MAX(ntohs(pkt->pkt_flow_tcp_win), 1);
116 	if (src->fse_flags & FLOWSTATEF_WSCALE) {
117 		/* remove scale factor from initial window */
118 		int win = src->fse_max_win;
119 		ASSERT(src->fse_wscale <= TCP_MAX_WINSHIFT);
120 		win += (1 << src->fse_wscale);
121 		src->fse_max_win = (uint16_t)((win - 1) >> src->fse_wscale);
122 	}
123 
124 	/*
125 	 * Destination state initialization.
126 	 */
127 	dst->fse_state = TCPS_CLOSED;
128 	dst->fse_seqhi = 1;
129 	dst->fse_max_win = 1;
130 
131 	/*
132 	 * Linger time (in seconds).
133 	 */
134 	fe->fe_linger_wait = (2 * tcp_msl) / TCP_RETRANSHZ;
135 	if (fe->fe_linger_wait < FLOWTRACK_LINGER_MIN) {
136 		fe->fe_linger_wait = FLOWTRACK_LINGER_MIN;
137 	} else if (fe->fe_linger_wait > FLOWTRACK_LINGER_MAX) {
138 		fe->fe_linger_wait = FLOWTRACK_LINGER_MAX;
139 	}
140 
141 	atomic_bitset_32(&fe->fe_flags, FLOWENTF_INITED);
142 }
143 
144 /*
145  * The TCP ACK RTT tracking is a coarse grain measurement of the time it takes
146  * for a endpoint to process incoming segment and generate ACK, at the point of
147  * observation. For flowswitch, it means that:
148  *
149  *     local end RTT  = local stack processing time
150  *     remote end RTT = driver + network + remote endpoint's processing time
151  *
152  * Since the measurement is lightweight and sampling based, it won't learn and
153  * distinguish lost segment's ACK.  So we could occasionally get large RTT
154  * sample from an ACK to a retransmitted segment.  Thus rtt_max is not any
155  * meaningful to us.
156  */
157 __attribute__((always_inline))
158 static inline void
flow_track_tcp_rtt(struct flow_entry * fe,boolean_t input,struct flow_track * src,struct flow_track * dst,uint8_t tcp_flags,uint32_t seq,uint32_t ack,uint32_t ulen)159 flow_track_tcp_rtt(struct flow_entry *fe, boolean_t input,
160     struct flow_track *src, struct flow_track *dst, uint8_t tcp_flags,
161     uint32_t seq, uint32_t ack, uint32_t ulen)
162 {
163 #pragma unused(fe, input) /* KDBG defined as noop in release build */
164 	uint64_t dst_last, src_last;
165 	uint64_t now, time_diff;
166 	uint32_t curval, oldval;
167 	clock_sec_t tv_sec;
168 	clock_usec_t tv_usec;
169 
170 	src_last = src->fse_rtt.frtt_last;
171 	dst_last = dst->fse_rtt.frtt_last;
172 
173 	/* start a new RTT tracking session under sampling rate limit */
174 	if (dst_last == 0 ||
175 	    _net_uptime - dst_last > FLOWTRACK_RTT_SAMPLE_INTERVAL) {
176 		if (ulen > 0 &&
177 		    dst->fse_rtt.frtt_timestamp == 0) {
178 			dst->fse_rtt.frtt_timestamp = mach_absolute_time();
179 			dst->fse_rtt.frtt_last = _net_uptime;
180 			dst->fse_rtt.frtt_seg_begin = seq;
181 			dst->fse_rtt.frtt_seg_end = seq + ulen;
182 			KDBG((SK_KTRACE_FSW_FLOW_TRACK_RTT | DBG_FUNC_START),
183 			    SK_KVA(fe), fe->fe_pid, ntohs(fe->fe_key.fk_sport),
184 			    input ? 1 : 0);
185 		}
186 	}
187 
188 	/* we have an ACK, see if current tracking session matches it */
189 	if (tcp_flags & TH_ACK) {
190 		if (src->fse_rtt.frtt_timestamp != 0 &&
191 		    src->fse_rtt.frtt_seg_begin <= ack) {
192 			now = mach_absolute_time();
193 			time_diff = now - src->fse_rtt.frtt_timestamp;
194 
195 			absolutetime_to_microtime(time_diff, &tv_sec, &tv_usec);
196 			curval = (uint32_t)(tv_usec + tv_sec * 1000 * 1000);
197 			oldval = src->fse_rtt.frtt_usec;
198 			if (oldval == 0) {
199 				src->fse_rtt.frtt_usec = curval;
200 			} else {
201 				/* same EWMA decay as TCP RTT */
202 				src->fse_rtt.frtt_usec =
203 				    ((oldval << 4) - oldval + curval) >> 4;
204 			}
205 
206 			/* reset RTT tracking session */
207 			src->fse_rtt.frtt_timestamp = 0;
208 			src->fse_rtt.frtt_last = 0;
209 			KDBG((SK_KTRACE_FSW_FLOW_TRACK_RTT | DBG_FUNC_END),
210 			    SK_KVA(fe), fe->fe_pid, ntohs(fe->fe_key.fk_sport),
211 			    input ? 0 : 1);
212 
213 			/* publish rtt stats into flow_stats object */
214 			/* just store both to avoid branch prediction etc. */
215 			fe->fe_stats->fs_lrtt = fe->fe_ltrack.fse_rtt_usec;
216 			fe->fe_stats->fs_rrtt = fe->fe_rtrack.fse_rtt_usec;
217 		}
218 	}
219 }
220 
221 /*
222  * The TCP connection tracking logic is based on Guido van Rooij's paper:
223  * http://www.sane.nl/events/sane2000/papers/rooij.pdf
224  *
225  * In some ways, we act as a middlebox that passively tracks the TCP windows
226  * of each connection on flows marked with FLOWENTF_TRACK.  We never modify
227  * the packet or generate any response (e.g. RST) to the sender; thus we are
228  * simply a silent observer.  The information we gather here is used later
229  * if we need to generate a valid {FIN|RST} segment when the flow is nonviable.
230  *
231  * The implementation is borrowed from Packet Filter, and is further
232  * simplified to cater for our use cases.
233  */
234 #define FTF_NODELAY     0x1     /* want flow to get immediate attention */
235 #define FTF_HALFCLOSED  0x2     /* want flow to be marked as half closed */
236 #define FTF_WAITCLOSE   0x4     /* want flow to linger after close */
237 #define FTF_CLOSENOTIFY 0x8     /* want to notify NECP upon torn down */
238 #define FTF_WITHDRAWN   0x10     /* want flow to be torn down */
239 #define FTF_SYN_RLIM    0x20    /* want flow to rate limit SYN */
240 #define FTF_RST_RLIM    0x40    /* want flow to rate limit RST */
241 __attribute__((always_inline))
242 static inline int
flow_track_tcp(struct flow_entry * fe,struct flow_track * src,struct flow_track * dst,struct __kern_packet * pkt,bool input)243 flow_track_tcp(struct flow_entry *fe, struct flow_track *src,
244     struct flow_track *dst, struct __kern_packet *pkt, bool input)
245 {
246 	const uint8_t tcp_flags = pkt->pkt_flow_tcp_flags;
247 	uint16_t win = ntohs(pkt->pkt_flow_tcp_win);
248 	uint32_t ack, end, seq, orig_seq;
249 	uint32_t ftflags = 0;
250 	uint8_t sws, dws;
251 	int ackskew, err = 0;
252 
253 	if (__improbable((fe->fe_flags & FLOWENTF_INITED) == 0)) {
254 		flow_track_tcp_init(fe, src, dst, pkt);
255 	}
256 
257 	flow_track_tcp_rtt(fe, input, src, dst, tcp_flags,
258 	    ntohl(pkt->pkt_flow_tcp_seq), ntohl(pkt->pkt_flow_tcp_ack),
259 	    pkt->pkt_flow_ulen);
260 
261 	if (__improbable(dst->fse_state >= TCPS_FIN_WAIT_2 &&
262 	    src->fse_state >= TCPS_FIN_WAIT_2)) {
263 		if ((tcp_flags & (TH_SYN | TH_ACK)) == TH_SYN) {
264 			src->fse_state = dst->fse_state = TCPS_CLOSED;
265 			ftflags |= FTF_SYN_RLIM;
266 		}
267 		if (tcp_flags & TH_RST) {
268 			ftflags |= FTF_RST_RLIM;
269 		}
270 		if (input) {
271 			err = ENETRESET;
272 		}
273 		goto done;
274 	}
275 
276 	if (__probable((tcp_flags & TH_SYN) == 0 &&
277 	    src->fse_wscale != 0 && dst->fse_wscale != 0)) {
278 		sws = src->fse_wscale;
279 		dws = dst->fse_wscale;
280 	} else {
281 		sws = dws = 0;
282 	}
283 
284 	orig_seq = seq = ntohl(pkt->pkt_flow_tcp_seq);
285 	if (__probable(src->fse_seqlo != 0)) {
286 		ack = ntohl(pkt->pkt_flow_tcp_ack);
287 		end = seq + pkt->pkt_flow_ulen;
288 		if (tcp_flags & TH_SYN) {
289 			if ((tcp_flags & (TH_SYN | TH_ACK)) == TH_SYN) {
290 				ftflags |= FTF_SYN_RLIM;
291 			}
292 			end++;
293 		}
294 		if (tcp_flags & TH_FIN) {
295 			end++;
296 		}
297 		if (tcp_flags & TH_RST) {
298 			ftflags |= FTF_RST_RLIM;
299 		}
300 	} else {
301 		/* first packet from this end; set its state */
302 		ack = ntohl(pkt->pkt_flow_tcp_ack);
303 		end = seq + pkt->pkt_flow_ulen;
304 		if (tcp_flags & TH_SYN) {
305 			if ((tcp_flags & (TH_SYN | TH_ACK)) == TH_SYN) {
306 				ftflags |= FTF_SYN_RLIM;
307 			}
308 			end++;
309 			if (dst->fse_flags & FLOWSTATEF_WSCALE) {
310 				flow_track_tcp_get_wscale(src, pkt);
311 				if (src->fse_flags & FLOWSTATEF_WSCALE) {
312 					/*
313 					 * Remove scale factor from
314 					 * initial window.
315 					 */
316 					sws = src->fse_wscale;
317 					win = (uint16_t)(((u_int32_t)win + (1 << sws) - 1)
318 					    >> sws);
319 					dws = dst->fse_wscale;
320 				} else {
321 					/* fixup other window */
322 					dst->fse_max_win <<= dst->fse_wscale;
323 					/* in case of a retrans SYN|ACK */
324 					dst->fse_wscale = 0;
325 				}
326 			}
327 		}
328 		if (tcp_flags & TH_FIN) {
329 			end++;
330 		}
331 		if (tcp_flags & TH_RST) {
332 			ftflags |= FTF_RST_RLIM;
333 		}
334 
335 		src->fse_seqlo = seq;
336 		if (src->fse_state < TCPS_SYN_SENT) {
337 			src->fse_state = TCPS_SYN_SENT;
338 		}
339 
340 		/*
341 		 * May need to slide the window (seqhi may have been set by
342 		 * the crappy stack check or if we picked up the connection
343 		 * after establishment).
344 		 */
345 		if (src->fse_seqhi == 1 || SEQ_GEQ(end +
346 		    MAX(1, dst->fse_max_win << dws), src->fse_seqhi)) {
347 			src->fse_seqhi = end + MAX(1, dst->fse_max_win << dws);
348 		}
349 		if (win > src->fse_max_win) {
350 			src->fse_max_win = win;
351 		}
352 	}
353 
354 	if (!(tcp_flags & TH_ACK)) {
355 		/* let it pass through the ack skew check */
356 		ack = dst->fse_seqlo;
357 	} else if ((ack == 0 &&
358 	    (tcp_flags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) ||
359 	    /* broken tcp stacks do not set ack */
360 	    (dst->fse_state < TCPS_SYN_SENT)) {
361 		/*
362 		 * Many stacks (ours included) will set the ACK number in an
363 		 * FIN|ACK if the SYN times out -- no sequence to ACK.
364 		 */
365 		ack = dst->fse_seqlo;
366 	}
367 
368 	if (seq == end) {
369 		/* ease sequencing restrictions on no data packets */
370 		seq = src->fse_seqlo;
371 		end = seq;
372 	}
373 
374 	ackskew = dst->fse_seqlo - ack;
375 
376 #define MAXACKWINDOW (0xffff + 1500)    /* 1500 is an arbitrary fudge factor */
377 	if (SEQ_GEQ(src->fse_seqhi, end) &&
378 	    /* last octet inside other's window space */
379 	    SEQ_GEQ(seq, src->fse_seqlo - (dst->fse_max_win << dws)) &&
380 	    /* retrans: not more than one window back */
381 	    (ackskew >= -MAXACKWINDOW) &&
382 	    /* acking not more than one reassembled fragment backwards */
383 	    (ackskew <= (MAXACKWINDOW << sws)) &&
384 	    /* acking not more than one window forward */
385 	    (!(tcp_flags & TH_RST) || orig_seq == src->fse_seqlo ||
386 	    (orig_seq == src->fse_seqlo + 1) ||
387 	    (orig_seq + 1 == src->fse_seqlo))) {
388 		/* require an exact/+1 sequence match on resets when possible */
389 
390 		/* update max window */
391 		if (src->fse_max_win < win) {
392 			src->fse_max_win = win;
393 		}
394 		/* synchronize sequencing */
395 		if (SEQ_GT(end, src->fse_seqlo)) {
396 			src->fse_seqlo = end;
397 		}
398 		/* slide the window of what the other end can send */
399 		if (SEQ_GEQ(ack + (win << sws), dst->fse_seqhi)) {
400 			dst->fse_seqhi = ack + MAX((win << sws), 1);
401 		}
402 
403 		/* update states */
404 		if (tcp_flags & TH_SYN) {
405 			if (src->fse_state < TCPS_SYN_SENT) {
406 				src->fse_state = TCPS_SYN_SENT;
407 				ftflags |= FTF_NODELAY;
408 			}
409 		}
410 		if (tcp_flags & TH_FIN) {
411 			if (src->fse_state < TCPS_CLOSING) {
412 				src->fse_seqlast = orig_seq;
413 				src->fse_state = TCPS_CLOSING;
414 				ftflags |= FTF_NODELAY;
415 			}
416 		}
417 		if (tcp_flags & TH_ACK) {
418 			/*
419 			 * Avoid transitioning to ESTABLISHED when our SYN
420 			 * is ACK'd along with a RST.  The sending TCP may
421 			 * still retransmit the SYN (after dropping some
422 			 * options like ECN, etc.)
423 			 */
424 			if (dst->fse_state == TCPS_SYN_SENT &&
425 			    !(tcp_flags & TH_RST)) {
426 				dst->fse_state = TCPS_ESTABLISHED;
427 				ftflags |= (FTF_WAITCLOSE | FTF_CLOSENOTIFY |
428 				    FTF_NODELAY);
429 			} else if (dst->fse_state == TCPS_CLOSING &&
430 			    ack == dst->fse_seqlast + 1) {
431 				dst->fse_state = TCPS_FIN_WAIT_2;
432 				ftflags |= (FTF_WAITCLOSE | FTF_NODELAY);
433 				if (src->fse_state >= TCPS_FIN_WAIT_2) {
434 					ftflags |= FTF_WITHDRAWN;
435 				} else {
436 					ftflags |= FTF_HALFCLOSED;
437 				}
438 			}
439 		}
440 		if ((tcp_flags & TH_RST) &&
441 		    (src->fse_state == TCPS_ESTABLISHED ||
442 		    dst->fse_state == TCPS_ESTABLISHED)) {
443 			/*
444 			 * If either endpoint is in ESTABLISHED, transition
445 			 * both to TIME_WAIT.  Otherwise, keep the existing
446 			 * state as is, e.g. SYN_SENT.
447 			 */
448 			src->fse_state = dst->fse_state = TCPS_TIME_WAIT;
449 			ftflags |= (FTF_WITHDRAWN | FTF_WAITCLOSE | FTF_NODELAY);
450 		}
451 		if (tcp_flags & TH_PUSH) {
452 			ftflags |= FTF_NODELAY;
453 		}
454 	} else if ((dst->fse_state < TCPS_SYN_SENT ||
455 	    dst->fse_state >= TCPS_FIN_WAIT_2 ||
456 	    src->fse_state >= TCPS_FIN_WAIT_2) &&
457 	    SEQ_GEQ(src->fse_seqhi + MAXACKWINDOW, end) &&
458 	    /* within a window forward of the originating packet */
459 	    SEQ_GEQ(seq, src->fse_seqlo - MAXACKWINDOW)) {
460 		/* within a window backward of the originating packet */
461 
462 		/* BEGIN CSTYLED */
463 		/*
464 		 * This currently handles three situations:
465 		 *  1) Stupid stacks will shotgun SYNs before their peer
466 		 *     replies.
467 		 *  2) When flow tracking catches an already established
468 		 *     stream (the flow states are cleared, etc.)
469 		 *  3) Packets get funky immediately after the connection
470 		 *     closes (this should catch spurious ACK|FINs that
471 		 *     web servers like to spew after a close).
472 		 *
473 		 * This must be a little more careful than the above code
474 		 * since packet floods will also be caught here.
475 		 */
476 		/* END CSTYLED */
477 
478 		/* update max window */
479 		if (src->fse_max_win < win) {
480 			src->fse_max_win = win;
481 		}
482 		/* synchronize sequencing */
483 		if (SEQ_GT(end, src->fse_seqlo)) {
484 			src->fse_seqlo = end;
485 		}
486 		/* slide the window of what the other end can send */
487 		if (SEQ_GEQ(ack + (win << sws), dst->fse_seqhi)) {
488 			dst->fse_seqhi = ack + MAX((win << sws), 1);
489 		}
490 
491 		/*
492 		 * Cannot set dst->fse_seqhi here since this could be a
493 		 * shotgunned SYN and not an already established connection.
494 		 */
495 
496 		if (tcp_flags & TH_FIN) {
497 			if (src->fse_state < TCPS_CLOSING) {
498 				src->fse_seqlast = orig_seq;
499 				src->fse_state = TCPS_CLOSING;
500 				ftflags |= FTF_NODELAY;
501 			}
502 		}
503 		if (tcp_flags & TH_RST) {
504 			src->fse_state = dst->fse_state = TCPS_TIME_WAIT;
505 			ftflags |= (FTF_WAITCLOSE | FTF_NODELAY);
506 		}
507 		if (tcp_flags & TH_PUSH) {
508 			ftflags |= FTF_NODELAY;
509 		}
510 	} else {
511 		if (dst->fse_state == TCPS_SYN_SENT &&
512 		    src->fse_state == TCPS_SYN_SENT) {
513 			src->fse_seqlo = 0;
514 			src->fse_seqhi = 1;
515 			src->fse_max_win = 1;
516 		}
517 	}
518 
519 done:
520 	/*
521 	 * If this needs immediate attention, indicate so.
522 	 */
523 	if (__improbable((ftflags & FTF_NODELAY) != 0)) {
524 		fe->fe_rx_nodelay = true;
525 		ftflags &= ~FTF_NODELAY;
526 	} else {
527 		fe->fe_rx_nodelay = false;
528 	}
529 
530 	if (__improbable((ftflags & FTF_HALFCLOSED) != 0)) {
531 		atomic_bitset_32(&fe->fe_flags, FLOWENTF_HALF_CLOSED);
532 		ftflags &= ~FTF_HALFCLOSED;
533 	}
534 
535 	/*
536 	 * Hold on to namespace for a while after the flow is closed.
537 	 */
538 	if (__improbable((ftflags & FTF_WAITCLOSE) != 0 &&
539 	    (fe->fe_flags & FLOWENTF_WAIT_CLOSE) == 0)) {
540 		atomic_bitset_32(&fe->fe_flags, FLOWENTF_WAIT_CLOSE);
541 		ftflags &= ~FTF_WAITCLOSE;
542 	}
543 
544 	/*
545 	 * Notify NECP upon tear down (for established flows).
546 	 */
547 	if (__improbable((ftflags & FTF_CLOSENOTIFY) != 0 &&
548 	    (fe->fe_flags & FLOWENTF_CLOSE_NOTIFY) == 0)) {
549 		atomic_bitset_32(&fe->fe_flags, FLOWENTF_CLOSE_NOTIFY);
550 		ftflags &= ~FTF_CLOSENOTIFY;
551 	}
552 
553 	/*
554 	 * Flow is withdrawn; the port we have should not be included in
555 	 * the list of offloaded ports, as the connection is no longer
556 	 * usable (we're not expecting any more data).
557 	 * Also clear FLOWENTF_HALF_CLOSED flag here. It's fine if reaper
558 	 * thread hadn't pickedup FLOWENTF_HALF_CLOSED, as it will pick up
559 	 * FLOWENTF_WITHDRAWN and notify netns of full withdrawn.
560 	 */
561 	if (__improbable((ftflags & FTF_WITHDRAWN) != 0)) {
562 		ftflags &= ~FTF_WITHDRAWN;
563 		if (fe->fe_flags & FLOWENTF_HALF_CLOSED) {
564 			atomic_bitclear_32(&fe->fe_flags, FLOWENTF_HALF_CLOSED);
565 		}
566 		fe->fe_want_withdraw = 1;
567 	}
568 
569 	/*
570 	 * If no other work is needed, we're done.
571 	 */
572 	if (ftflags == 0 || input) {
573 		return err;
574 	}
575 
576 	/*
577 	 * If we're over the rate limit for outbound SYNs, drop packet.
578 	 */
579 	if (__improbable((ftflags & FTF_SYN_RLIM) != 0)) {
580 		uint32_t now = (uint32_t)_net_uptime;
581 		if ((now - src->fse_syn_ts) > 1) {
582 			src->fse_syn_ts = now;
583 			src->fse_syn_cnt = 0;
584 		}
585 		if (++src->fse_syn_cnt > FLOWTRACK_SYN_RATE) {
586 			err = EPROTO;
587 		}
588 	}
589 
590 	return err;
591 }
592 #undef FTF_NODELAY
593 #undef FTF_WAITCLOSE
594 #undef FTF_CLOSENOTIFY
595 #undef FTF_WITHDRAWN
596 #undef FTF_SYN_RLIM
597 #undef FTF_RST_RLIM
598 
599 boolean_t
flow_track_tcp_want_abort(struct flow_entry * fe)600 flow_track_tcp_want_abort(struct flow_entry *fe)
601 {
602 	struct flow_track *src = &fe->fe_ltrack;
603 	struct flow_track *dst = &fe->fe_rtrack;
604 
605 	if (fe->fe_key.fk_proto != IPPROTO_TCP ||
606 	    (fe->fe_flags & FLOWENTF_ABORTED)) {
607 		goto done;
608 	}
609 
610 	/* this can be enhanced; for now rely on established state */
611 	if (src->fse_state == TCPS_ESTABLISHED ||
612 	    dst->fse_state == TCPS_ESTABLISHED) {
613 		src->fse_state = dst->fse_state = TCPS_TIME_WAIT;
614 		/* don't process more than once */
615 		atomic_bitset_32(&fe->fe_flags, FLOWENTF_ABORTED);
616 		return TRUE;
617 	}
618 done:
619 	return FALSE;
620 }
621 
622 static void
flow_track_udp_init(struct flow_entry * fe,struct flow_track * src,struct flow_track * dst,struct __kern_packet * pkt)623 flow_track_udp_init(struct flow_entry *fe, struct flow_track *src,
624     struct flow_track *dst, struct __kern_packet *pkt)
625 {
626 #pragma unused(pkt)
627 	/*
628 	 * Source state initialization.
629 	 */
630 	src->fse_state = FT_STATE_NO_TRAFFIC;
631 
632 	/*
633 	 * Destination state initialization.
634 	 */
635 	dst->fse_state = FT_STATE_NO_TRAFFIC;
636 
637 	atomic_bitset_32(&fe->fe_flags, FLOWENTF_INITED);
638 }
639 
640 __attribute__((always_inline))
641 static inline int
flow_track_udp(struct flow_entry * fe,struct flow_track * src,struct flow_track * dst,struct __kern_packet * pkt,bool input)642 flow_track_udp(struct flow_entry *fe, struct flow_track *src,
643     struct flow_track *dst, struct __kern_packet *pkt, bool input)
644 {
645 #pragma unused(input)
646 	if (__improbable((fe->fe_flags & FLOWENTF_INITED) == 0)) {
647 		flow_track_udp_init(fe, src, dst, pkt);
648 	}
649 
650 	if (__improbable(src->fse_state == FT_STATE_NO_TRAFFIC)) {
651 		src->fse_state = FT_STATE_SINGLE;
652 	}
653 	if (__improbable(dst->fse_state == FT_STATE_SINGLE)) {
654 		dst->fse_state = FT_STATE_MULTIPLE;
655 	}
656 
657 	return 0;
658 }
659 
660 void
flow_track_stats(struct flow_entry * fe,uint64_t bytes,uint64_t packets,bool active,bool in)661 flow_track_stats(struct flow_entry *fe, uint64_t bytes, uint64_t packets,
662     bool active, bool in)
663 {
664 	volatile struct sk_stats_flow_track *fst;
665 
666 	if (in) {
667 		fst = &fe->fe_stats->fs_rtrack;
668 	} else {
669 		fst = &fe->fe_stats->fs_ltrack;
670 	}
671 
672 	fst->sft_bytes += bytes;
673 	fst->sft_packets += packets;
674 
675 	if (__probable(active)) {
676 		in_stat_set_activity_bitmap(&fe->fe_stats->fs_activity,
677 		    _net_uptime);
678 	}
679 }
680 
681 int
flow_pkt_track(struct flow_entry * fe,struct __kern_packet * pkt,bool in)682 flow_pkt_track(struct flow_entry *fe, struct __kern_packet *pkt, bool in)
683 {
684 	struct flow_track *src, *dst;
685 	int ret = 0;
686 
687 	_CASSERT(SFT_STATE_CLOSED == FT_STATE_CLOSED);
688 	_CASSERT(SFT_STATE_LISTEN == FT_STATE_LISTEN);
689 	_CASSERT(SFT_STATE_SYN_SENT == FT_STATE_SYN_SENT);
690 	_CASSERT(SFT_STATE_SYN_RECEIVED == FT_STATE_SYN_RECEIVED);
691 	_CASSERT(SFT_STATE_ESTABLISHED == FT_STATE_ESTABLISHED);
692 	_CASSERT(SFT_STATE_CLOSE_WAIT == FT_STATE_CLOSE_WAIT);
693 	_CASSERT(SFT_STATE_FIN_WAIT_1 == FT_STATE_FIN_WAIT_1);
694 	_CASSERT(SFT_STATE_CLOSING == FT_STATE_CLOSING);
695 	_CASSERT(SFT_STATE_LAST_ACK == FT_STATE_LAST_ACK);
696 	_CASSERT(SFT_STATE_FIN_WAIT_2 == FT_STATE_FIN_WAIT_2);
697 	_CASSERT(SFT_STATE_TIME_WAIT == FT_STATE_TIME_WAIT);
698 	_CASSERT(SFT_STATE_NO_TRAFFIC == FT_STATE_NO_TRAFFIC);
699 	_CASSERT(SFT_STATE_SINGLE == FT_STATE_SINGLE);
700 	_CASSERT(SFT_STATE_MULTIPLE == FT_STATE_MULTIPLE);
701 	_CASSERT(SFT_STATE_MAX == FT_STATE_MAX);
702 
703 	_CASSERT(FT_STATE_CLOSED == TCPS_CLOSED);
704 	_CASSERT(FT_STATE_LISTEN == TCPS_LISTEN);
705 	_CASSERT(FT_STATE_SYN_SENT == TCPS_SYN_SENT);
706 	_CASSERT(FT_STATE_SYN_RECEIVED == TCPS_SYN_RECEIVED);
707 	_CASSERT(FT_STATE_ESTABLISHED == TCPS_ESTABLISHED);
708 	_CASSERT(FT_STATE_CLOSE_WAIT == TCPS_CLOSE_WAIT);
709 	_CASSERT(FT_STATE_FIN_WAIT_1 == TCPS_FIN_WAIT_1);
710 	_CASSERT(FT_STATE_CLOSING == TCPS_CLOSING);
711 	_CASSERT(FT_STATE_LAST_ACK == TCPS_LAST_ACK);
712 	_CASSERT(FT_STATE_FIN_WAIT_2 == TCPS_FIN_WAIT_2);
713 	_CASSERT(FT_STATE_TIME_WAIT == TCPS_TIME_WAIT);
714 
715 	ASSERT(pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED);
716 
717 	if (in) {
718 		src = &fe->fe_rtrack;
719 		dst = &fe->fe_ltrack;
720 	} else {
721 		src = &fe->fe_ltrack;
722 		dst = &fe->fe_rtrack;
723 	}
724 
725 	flow_track_stats(fe, (pkt->pkt_length - pkt->pkt_l2_len), 1,
726 	    (pkt->pkt_flow_ulen != 0), in);
727 
728 	/* skip flow state tracking on non-initial fragments */
729 	if (pkt->pkt_flow_ip_is_frag && !pkt->pkt_flow_ip_is_first_frag) {
730 		return 0;
731 	}
732 
733 	switch (pkt->pkt_flow_ip_proto) {
734 	case IPPROTO_TCP:
735 		if (__probable((fe->fe_flags & FLOWENTF_TRACK) != 0)) {
736 			ret = flow_track_tcp(fe, src, dst, pkt, in);
737 		}
738 		break;
739 
740 	case IPPROTO_UDP:
741 		if (__probable((fe->fe_flags & FLOWENTF_TRACK) != 0)) {
742 			ret = flow_track_udp(fe, src, dst, pkt, in);
743 		}
744 		break;
745 	}
746 
747 	return ret;
748 }
749