xref: /xnu-12377.41.6/bsd/skywalk/nexus/flowswitch/flow/flow_track.c (revision bbb1b6f9e71b8cdde6e5cd6f4841f207dee3d828)
1 /*
2  * Copyright (c) 2017-2020 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <skywalk/os_skywalk_private.h>
30 #include <skywalk/nexus/flowswitch/fsw_var.h>
31 #include <skywalk/nexus/flowswitch/flow/flow_var.h>
32 #include <netinet/tcp.h>
33 #include <netinet/tcp_fsm.h>
34 #include <netinet/tcp_seq.h>
35 #include <netinet/tcp_timer.h>
36 #include <netinet/tcp_var.h>
37 #include <netinet/udp.h>
38 #include <netinet/in_stat.h>
39 #include <netinet/ip.h>
40 #include <netinet/ip6.h>
41 #include <sys/kdebug.h>
42 
43 /* min/max linger time (in seconds) */
44 #define FLOWTRACK_LINGER_MIN    1
45 #define FLOWTRACK_LINGER_MAX    120
46 
47 /* maximum allowed rate of SYNs per second */
48 #define FLOWTRACK_SYN_RATE      20
49 
50 static int flow_track_tcp(struct flow_entry *, struct flow_track *,
51     struct flow_track *, struct __kern_packet *, bool);
52 static int flow_track_udp(struct flow_entry *, struct flow_track *,
53     struct flow_track *, struct __kern_packet *, bool);
54 
55 static void
56 flow_track_tcp_get_wscale(struct flow_track *s, struct __kern_packet *pkt)
57 {
58 	const uint8_t *hdr = __unsafe_forge_bidi_indexable(uint8_t *,
59 	    pkt->pkt_flow_tcp_hdr, pkt->pkt_flow_tcp_hlen);
60 	int hlen = pkt->pkt_flow_tcp_hlen;
61 	uint8_t optlen, wscale = 0;
62 	const uint8_t *opt;
63 
64 	static_assert(sizeof(s->fse_flags) == sizeof(uint16_t));
65 	ASSERT(hlen >= (int)sizeof(struct tcphdr));
66 
67 	opt = hdr + sizeof(struct tcphdr);
68 	hlen -= sizeof(struct tcphdr);
69 	while (hlen >= 3) {
70 		switch (*opt) {
71 		case TCPOPT_EOL:
72 		case TCPOPT_NOP:
73 			++opt;
74 			--hlen;
75 			break;
76 		case TCPOPT_WINDOW:
77 			wscale = opt[2];
78 			if (wscale > TCP_MAX_WINSHIFT) {
79 				wscale = TCP_MAX_WINSHIFT;
80 			}
81 			os_atomic_or(&s->fse_flags, FLOWSTATEF_WSCALE, relaxed);
82 			OS_FALLTHROUGH;
83 		default:
84 			optlen = opt[1];
85 			if (optlen < 2) {
86 				optlen = 2;
87 			}
88 			hlen -= optlen;
89 			opt += optlen;
90 			break;
91 		}
92 	}
93 	s->fse_wscale = wscale;
94 }
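/*
 * Worked example of the option walk above (illustrative only; the byte
 * values are hypothetical, not taken from this file): a SYN carrying the
 * options NOP, NOP, WINDOW is laid out as
 *
 *     01 01 03 03 07
 *     NOP NOP kind=3 len=3 shift=7
 *
 * The two NOPs are skipped one byte at a time; TCPOPT_WINDOW then yields
 * wscale = 7 (clamped to TCP_MAX_WINSHIFT, i.e. 14, if larger), sets
 * FLOWSTATEF_WSCALE, and falls through to advance by optlen = 3.
 */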
95 
96 static void
97 flow_track_tcp_init(struct flow_entry *fe, struct flow_track *src,
98     struct flow_track *dst, struct __kern_packet *pkt)
99 {
100 #pragma unused(dst)
101 	const uint8_t tcp_flags = pkt->pkt_flow_tcp_flags;
102 
103 	/*
104 	 * Source state initialization.
105 	 */
106 	src->fse_state = TCPS_SYN_SENT;
107 	src->fse_seqlo = ntohl(pkt->pkt_flow_tcp_seq);
108 	src->fse_seqhi = (src->fse_seqlo + pkt->pkt_flow_ulen + 1);
109 	if (tcp_flags & TH_SYN) {
110 		src->fse_seqhi++;
111 		flow_track_tcp_get_wscale(src, pkt);
112 	}
113 	if (tcp_flags & TH_FIN) {
114 		src->fse_seqhi++;
115 	}
116 
117 	src->fse_max_win = MAX(ntohs(pkt->pkt_flow_tcp_win), 1);
118 	if (src->fse_flags & FLOWSTATEF_WSCALE) {
119 		/* remove scale factor from initial window */
120 		int win = src->fse_max_win;
121 		ASSERT(src->fse_wscale <= TCP_MAX_WINSHIFT);
122 		win += (1 << src->fse_wscale);
123 		src->fse_max_win = (uint16_t)((win - 1) >> src->fse_wscale);
124 	}
125 
126 	/*
127 	 * Destination state initialization.
128 	 */
129 	dst->fse_state = TCPS_CLOSED;
130 	dst->fse_seqhi = 1;
131 	dst->fse_max_win = 1;
132 
133 	/*
134 	 * Linger time (in seconds).
135 	 */
136 	fe->fe_linger_wait = (2 * tcp_msl) / TCP_RETRANSHZ;
137 	if (fe->fe_linger_wait < FLOWTRACK_LINGER_MIN) {
138 		fe->fe_linger_wait = FLOWTRACK_LINGER_MIN;
139 	} else if (fe->fe_linger_wait > FLOWTRACK_LINGER_MAX) {
140 		fe->fe_linger_wait = FLOWTRACK_LINGER_MAX;
141 	}
142 
143 	os_atomic_or(&fe->fe_flags, FLOWENTF_INITED, relaxed);
144 }
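/*
 * A worked instance of the initial-window de-scaling above (the numbers are
 * hypothetical): with an advertised SYN window of 65535 and fse_wscale = 7,
 * win becomes 65535 + (1 << 7) = 65663 and
 * fse_max_win = (65663 - 1) >> 7 = 512, i.e. roughly ceil(65535 / 128).
 * The window field of a SYN is never scaled on the wire, so this stores the
 * value in scaled units for later comparison against scaled windows.
 */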
145 
146 /*
147  * The TCP ACK RTT tracking is a coarse-grained measurement of the time it
148  * takes an endpoint to process an incoming segment and generate an ACK, as
149  * seen at the point of observation.  For the flowswitch, this means:
150  *
151  *     local end RTT  = local stack processing time
152  *     remote end RTT = driver + network + remote endpoint's processing time
153  *
154  * Since the measurement is lightweight and sampling based, it cannot
155  * distinguish the ACK of a lost (retransmitted) segment, so we may
156  * occasionally get a large RTT sample from an ACK to a retransmission.
157  * Thus rtt_max is not meaningful to us.
158  */
159 __attribute__((always_inline))
160 static inline void
161 flow_track_tcp_rtt(struct flow_entry *fe, boolean_t input,
162     struct flow_track *src, struct flow_track *dst, uint8_t tcp_flags,
163     uint32_t seq, uint32_t ack, uint32_t ulen)
164 {
165 #pragma unused(fe, input) /* KDBG defined as noop in release build */
166 	uint64_t dst_last, src_last;
167 	uint64_t now, time_diff;
168 	uint32_t curval, oldval;
169 	clock_sec_t tv_sec;
170 	clock_usec_t tv_usec;
171 
172 	src_last = src->fse_rtt.frtt_last;
173 	dst_last = dst->fse_rtt.frtt_last;
174 
175 	/* start a new RTT tracking session under sampling rate limit */
176 	if (dst_last == 0 ||
177 	    net_uptime() - dst_last > FLOWTRACK_RTT_SAMPLE_INTERVAL) {
178 		if (ulen > 0 &&
179 		    dst->fse_rtt.frtt_timestamp == 0) {
180 			dst->fse_rtt.frtt_timestamp = mach_absolute_time();
181 			dst->fse_rtt.frtt_last = net_uptime();
182 			dst->fse_rtt.frtt_seg_begin = seq;
183 			dst->fse_rtt.frtt_seg_end = seq + ulen;
184 			KDBG((SK_KTRACE_FSW_FLOW_TRACK_RTT | DBG_FUNC_START),
185 			    SK_KVA(fe), fe->fe_pid, ntohs(fe->fe_key.fk_sport),
186 			    input ? 1 : 0);
187 		}
188 	}
189 
190 	/* we have an ACK, see if current tracking session matches it */
191 	if (tcp_flags & TH_ACK) {
192 		if (src->fse_rtt.frtt_timestamp != 0 &&
193 		    src->fse_rtt.frtt_seg_begin <= ack) {
194 			now = mach_absolute_time();
195 			time_diff = now - src->fse_rtt.frtt_timestamp;
196 
197 			absolutetime_to_microtime(time_diff, &tv_sec, &tv_usec);
198 			curval = (uint32_t)(tv_usec + tv_sec * 1000 * 1000);
199 			oldval = src->fse_rtt.frtt_usec;
200 			if (oldval == 0) {
201 				src->fse_rtt.frtt_usec = curval;
202 			} else {
203 				/* same EWMA decay as TCP RTT */
204 				src->fse_rtt.frtt_usec =
205 				    ((oldval << 4) - oldval + curval) >> 4;
206 			}
207 
208 			/* reset RTT tracking session */
209 			src->fse_rtt.frtt_timestamp = 0;
210 			src->fse_rtt.frtt_last = 0;
211 			KDBG((SK_KTRACE_FSW_FLOW_TRACK_RTT | DBG_FUNC_END),
212 			    SK_KVA(fe), fe->fe_pid, ntohs(fe->fe_key.fk_sport),
213 			    input ? 0 : 1);
214 
215 			/* publish rtt stats into flow_stats object */
216 			/* just store both to avoid branch prediction etc. */
217 			fe->fe_stats->fs_lrtt = fe->fe_ltrack.fse_rtt_usec;
218 			fe->fe_stats->fs_rrtt = fe->fe_rtrack.fse_rtt_usec;
219 		}
220 	}
221 }
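/*
 * The EWMA decay above matches TCP's smoothed-RTT update with a gain of
 * 1/16, i.e. srtt' = srtt + (sample - srtt) / 16.  As a worked example
 * (the values are hypothetical), with oldval = 16000 usec and
 * curval = 32000 usec:
 *
 *     ((16000 << 4) - 16000 + 32000) >> 4 == 17000 usec
 *
 * so the smoothed value moves 1/16 of the way toward the new sample.
 */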
222 
223 /*
224  * The TCP connection tracking logic is based on Guido van Rooij's paper:
225  * http://www.sane.nl/events/sane2000/papers/rooij.pdf
226  *
227  * In some ways, we act as a middlebox that passively tracks the TCP windows
228  * of each connection on flows marked with FLOWENTF_TRACK.  We never modify
229  * the packet or generate any response (e.g. RST) to the sender; thus we are
230  * simply a silent observer.  The information we gather here is used later
231  * if we need to generate a valid {FIN|RST} segment when the flow is nonviable.
232  *
233  * The implementation is borrowed from Packet Filter, and is further
234  * simplified to cater for our use cases.
235  */
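/*
 * In rough pseudocode (a condensed restatement of the checks implemented in
 * flow_track_tcp() below, not additional logic), a segment is accepted for
 * full state tracking when:
 *
 *     end <= src->fse_seqhi                                 (inside peer's window)
 *     seq >= src->fse_seqlo - (dst->fse_max_win << dws)     (bounded retransmit)
 *     -MAXACKWINDOW <= dst->fse_seqlo - ack <= (MAXACKWINDOW << sws)   (sane ACK skew)
 *     RST only with an (almost) exact sequence match
 *
 * Segments failing these checks may still be tolerated by the looser second
 * branch (shotgunned SYNs, mid-stream pickup, post-close noise).
 */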
236 #define FTF_HALFCLOSED  0x1     /* want flow to be marked as half closed */
237 #define FTF_WAITCLOSE   0x2     /* want flow to linger after close */
238 #define FTF_CLOSENOTIFY 0x4     /* want to notify NECP upon torn down */
239 #define FTF_WITHDRAWN   0x8     /* want flow to be torn down */
240 #define FTF_SYN_RLIM    0x10    /* want flow to rate limit SYN */
241 #define FTF_RST_RLIM    0x20    /* want flow to rate limit RST */
242 __attribute__((always_inline))
243 static inline int
244 flow_track_tcp(struct flow_entry *fe, struct flow_track *src,
245     struct flow_track *dst, struct __kern_packet *pkt, bool input)
246 {
247 	const uint8_t tcp_flags = pkt->pkt_flow_tcp_flags;
248 	uint16_t win = ntohs(pkt->pkt_flow_tcp_win);
249 	uint32_t ack, end, seq, orig_seq;
250 	uint32_t ftflags = 0;
251 	uint8_t sws, dws;
252 	int ackskew, err = 0;
253 
254 	if (__improbable((fe->fe_flags & FLOWENTF_INITED) == 0)) {
255 		flow_track_tcp_init(fe, src, dst, pkt);
256 	}
257 
258 	flow_track_tcp_rtt(fe, input, src, dst, tcp_flags,
259 	    ntohl(pkt->pkt_flow_tcp_seq), ntohl(pkt->pkt_flow_tcp_ack),
260 	    pkt->pkt_flow_ulen);
261 
262 	if (__improbable(dst->fse_state >= TCPS_FIN_WAIT_2 &&
263 	    src->fse_state >= TCPS_FIN_WAIT_2)) {
264 		if ((tcp_flags & (TH_SYN | TH_ACK)) == TH_SYN) {
265 			src->fse_state = dst->fse_state = TCPS_CLOSED;
266 			ftflags |= FTF_SYN_RLIM;
267 		}
268 		if (tcp_flags & TH_RST) {
269 			ftflags |= FTF_RST_RLIM;
270 		}
271 		if (input) {
272 			err = ENETRESET;
273 		}
274 		goto done;
275 	}
276 
277 	if (__probable((tcp_flags & TH_SYN) == 0 &&
278 	    src->fse_wscale != 0 && dst->fse_wscale != 0)) {
279 		sws = src->fse_wscale;
280 		dws = dst->fse_wscale;
281 	} else {
282 		sws = dws = 0;
283 	}
284 
285 	orig_seq = seq = ntohl(pkt->pkt_flow_tcp_seq);
286 	if (__probable(src->fse_seqlo != 0)) {
287 		ack = ntohl(pkt->pkt_flow_tcp_ack);
288 		end = seq + pkt->pkt_flow_ulen;
289 		if (tcp_flags & TH_SYN) {
290 			if ((tcp_flags & (TH_SYN | TH_ACK)) == TH_SYN) {
291 				ftflags |= FTF_SYN_RLIM;
292 			}
293 			end++;
294 		}
295 		if (tcp_flags & TH_FIN) {
296 			end++;
297 		}
298 		if (tcp_flags & TH_RST) {
299 			ftflags |= FTF_RST_RLIM;
300 		}
301 	} else {
302 		/* first packet from this end; set its state */
303 		ack = ntohl(pkt->pkt_flow_tcp_ack);
304 
305 		/* We saw the first SYN, but the stack did not reply with a SYN */
306 		if (dst->fse_state == TCPS_SYN_SENT && ((tcp_flags & TH_SYN) == 0)) {
307 			/* Act as if no sequence number is set */
308 			seq = 0;
309 			/* Pretend the outgoing SYN was not ACK'ed */
310 			ack = dst->fse_seqlo;
311 		}
312 
313 		end = seq + pkt->pkt_flow_ulen;
314 		if (tcp_flags & TH_SYN) {
315 			if ((tcp_flags & (TH_SYN | TH_ACK)) == TH_SYN) {
316 				ftflags |= FTF_SYN_RLIM;
317 			}
318 			end++;
319 			if (dst->fse_flags & FLOWSTATEF_WSCALE) {
320 				flow_track_tcp_get_wscale(src, pkt);
321 				if (src->fse_flags & FLOWSTATEF_WSCALE) {
322 					/*
323 					 * Remove scale factor from
324 					 * initial window.
325 					 */
326 					sws = src->fse_wscale;
327 					win = (uint16_t)(((u_int32_t)win + (1 << sws) - 1)
328 					    >> sws);
329 					dws = dst->fse_wscale;
330 				} else {
331 					/* fixup other window */
332 					dst->fse_max_win = (uint16_t)(dst->fse_max_win << dst->fse_wscale);
333 					/* in case of a retrans SYN|ACK */
334 					dst->fse_wscale = 0;
335 				}
336 			}
337 		}
338 		if (tcp_flags & TH_FIN) {
339 			end++;
340 		}
341 		if (tcp_flags & TH_RST) {
342 			ftflags |= FTF_RST_RLIM;
343 		}
344 
345 		src->fse_seqlo = seq;
346 		if (src->fse_state < TCPS_SYN_SENT) {
347 			if (tcp_flags & TH_SYN) {
348 				src->fse_state = TCPS_SYN_SENT;
349 			} else {
350 				/* Picking up the connection in the middle */
351 				src->fse_state = TCPS_ESTABLISHED;
352 			}
353 		}
354 
355 		/*
356 		 * May need to slide the window (seqhi may have been set by
357 		 * the crappy stack check or if we picked up the connection
358 		 * after establishment).
359 		 */
360 		if (src->fse_seqhi == 1 || SEQ_GEQ(end +
361 		    MAX(1, dst->fse_max_win << dws), src->fse_seqhi)) {
362 			src->fse_seqhi = end + MAX(1, dst->fse_max_win << dws);
363 		}
364 		if (win > src->fse_max_win) {
365 			src->fse_max_win = win;
366 		}
367 	}
368 
369 	if (!(tcp_flags & TH_ACK)) {
370 		/* let it pass through the ack skew check */
371 		ack = dst->fse_seqlo;
372 	} else if ((ack == 0 &&
373 	    (tcp_flags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) ||
374 	    /* broken tcp stacks do not set ack */
375 	    (dst->fse_state < TCPS_SYN_SENT)) {
376 		/*
377 		 * Many stacks (ours included) will set the ACK number in an
378 		 * FIN|ACK if the SYN times out -- no sequence to ACK.
379 		 */
380 		ack = dst->fse_seqlo;
381 	}
382 
383 	if (seq == end) {
384 		/* ease sequencing restrictions on no data packets */
385 		seq = src->fse_seqlo;
386 		end = seq;
387 	}
388 
389 	ackskew = dst->fse_seqlo - ack;
390 
391 #define MAXACKWINDOW (0xffff + 1500)    /* 1500 is an arbitrary fudge factor */
392 	if (SEQ_GEQ(src->fse_seqhi, end) &&
393 	    /* last octet inside other's window space */
394 	    SEQ_GEQ(seq, src->fse_seqlo - (dst->fse_max_win << dws)) &&
395 	    /* retrans: not more than one window back */
396 	    (ackskew >= -MAXACKWINDOW) &&
397 	    /* acking not more than one reassembled fragment backwards */
398 	    (ackskew <= (MAXACKWINDOW << sws)) &&
399 	    /* acking not more than one window forward */
400 	    (!(tcp_flags & TH_RST) || orig_seq == src->fse_seqlo ||
401 	    (orig_seq == src->fse_seqlo + 1) ||
402 	    (orig_seq + 1 == src->fse_seqlo))) {
403 		/* require an exact/+1 sequence match on resets when possible */
404 
405 		/* update max window */
406 		if (src->fse_max_win < win) {
407 			src->fse_max_win = win;
408 		}
409 		/* synchronize sequencing */
410 		if (SEQ_GT(end, src->fse_seqlo)) {
411 			src->fse_seqlo = end;
412 		}
413 		/* slide the window of what the other end can send */
414 		if (SEQ_GEQ(ack + (win << sws), dst->fse_seqhi)) {
415 			dst->fse_seqhi = ack + MAX((win << sws), 1);
416 		}
417 
418 		/* update states */
419 		if (tcp_flags & TH_SYN) {
420 			if (src->fse_state < TCPS_SYN_SENT) {
421 				src->fse_state = TCPS_SYN_SENT;
422 			}
423 		}
424 		if (tcp_flags & TH_FIN) {
425 			if (src->fse_state < TCPS_CLOSING) {
426 				src->fse_seqlast = orig_seq + pkt->pkt_flow_ulen;
427 				src->fse_state = TCPS_CLOSING;
428 			}
429 		}
430 		if (tcp_flags & TH_ACK) {
431 			/*
432 			 * Avoid transitioning to ESTABLISHED when our SYN
433 			 * is ACK'd along with a RST.  The sending TCP may
434 			 * still retransmit the SYN (after dropping some
435 			 * options like ECN, etc.)
436 			 */
437 			if (dst->fse_state == TCPS_SYN_SENT &&
438 			    !(tcp_flags & TH_RST)) {
439 				dst->fse_state = TCPS_ESTABLISHED;
440 				ftflags |= (FTF_WAITCLOSE | FTF_CLOSENOTIFY);
441 			} else if (dst->fse_state == TCPS_CLOSING &&
442 			    ack == dst->fse_seqlast + 1) {
443 				dst->fse_state = TCPS_FIN_WAIT_2;
444 				ftflags |= FTF_WAITCLOSE;
445 				if (src->fse_state >= TCPS_FIN_WAIT_2) {
446 					ftflags |= FTF_WITHDRAWN;
447 				} else {
448 					ftflags |= FTF_HALFCLOSED;
449 				}
450 			}
451 		}
452 		if ((tcp_flags & TH_RST) &&
453 		    (src->fse_state == TCPS_ESTABLISHED ||
454 		    dst->fse_state == TCPS_ESTABLISHED)) {
455 			/*
456 			 * If either endpoint is in ESTABLISHED, transition
457 			 * both to TIME_WAIT.  Otherwise, keep the existing
458 			 * state as is, e.g. SYN_SENT.
459 			 */
460 			src->fse_state = dst->fse_state = TCPS_TIME_WAIT;
461 			ftflags |= (FTF_WITHDRAWN | FTF_WAITCLOSE);
462 		}
463 	} else if ((dst->fse_state < TCPS_SYN_SENT ||
464 	    dst->fse_state >= TCPS_FIN_WAIT_2 ||
465 	    src->fse_state >= TCPS_FIN_WAIT_2) &&
466 	    SEQ_GEQ(src->fse_seqhi + MAXACKWINDOW, end) &&
467 	    /* within a window forward of the originating packet */
468 	    SEQ_GEQ(seq, src->fse_seqlo - MAXACKWINDOW)) {
469 		/* within a window backward of the originating packet */
470 
471 		/* BEGIN CSTYLED */
472 		/*
473 		 * This currently handles three situations:
474 		 *  1) Stupid stacks will shotgun SYNs before their peer
475 		 *     replies.
476 		 *  2) When flow tracking catches an already established
477 		 *     stream (the flow states are cleared, etc.)
478 		 *  3) Packets get funky immediately after the connection
479 		 *     closes (this should catch spurious ACK|FINs that
480 		 *     web servers like to spew after a close).
481 		 *
482 		 * This must be a little more careful than the above code
483 		 * since packet floods will also be caught here.
484 		 */
485 		/* END CSTYLED */
486 
487 		/* update max window */
488 		if (src->fse_max_win < win) {
489 			src->fse_max_win = win;
490 		}
491 		/* synchronize sequencing */
492 		if (SEQ_GT(end, src->fse_seqlo)) {
493 			src->fse_seqlo = end;
494 		}
495 		/* slide the window of what the other end can send */
496 		if (SEQ_GEQ(ack + (win << sws), dst->fse_seqhi)) {
497 			dst->fse_seqhi = ack + MAX((win << sws), 1);
498 		}
499 
500 		/*
501 		 * Cannot set dst->fse_seqhi here since this could be a
502 		 * shotgunned SYN and not an already established connection.
503 		 */
504 
505 		if (tcp_flags & TH_FIN) {
506 			if (src->fse_state < TCPS_CLOSING) {
507 				src->fse_seqlast = orig_seq + pkt->pkt_flow_ulen;
508 				src->fse_state = TCPS_CLOSING;
509 			}
510 		}
511 		if (tcp_flags & TH_RST) {
512 			/*
513 			 * Do not act on TCP RST with invalid sequence number per RFC 5961
514 			 */
515 			if (SEQ_GEQ(orig_seq, src->fse_seqlo)) {
516 				src->fse_state = dst->fse_state = TCPS_TIME_WAIT;
517 				ftflags |= FTF_WAITCLOSE;
518 			}
519 		}
520 	} else {
521 		if (dst->fse_state == TCPS_SYN_SENT &&
522 		    src->fse_state == TCPS_SYN_SENT) {
523 			src->fse_seqlo = 0;
524 			src->fse_seqhi = 1;
525 			src->fse_max_win = 1;
526 		}
527 	}
528 
529 done:
530 	if (__improbable((ftflags & FTF_HALFCLOSED) != 0)) {
531 		os_atomic_or(&fe->fe_flags, FLOWENTF_HALF_CLOSED, relaxed);
532 		ftflags &= ~FTF_HALFCLOSED;
533 	}
534 
535 	/*
536 	 * Hold on to namespace for a while after the flow is closed.
537 	 */
538 	if (__improbable((ftflags & FTF_WAITCLOSE) != 0 &&
539 	    (fe->fe_flags & FLOWENTF_WAIT_CLOSE) == 0)) {
540 		os_atomic_or(&fe->fe_flags, FLOWENTF_WAIT_CLOSE, relaxed);
541 		ftflags &= ~FTF_WAITCLOSE;
542 	}
543 
544 	/*
545 	 * Notify NECP upon tear down (for established flows).
546 	 */
547 	if (__improbable((ftflags & FTF_CLOSENOTIFY) != 0 &&
548 	    (fe->fe_flags & FLOWENTF_CLOSE_NOTIFY) == 0)) {
549 		os_atomic_or(&fe->fe_flags, FLOWENTF_CLOSE_NOTIFY, relaxed);
550 		ftflags &= ~FTF_CLOSENOTIFY;
551 	}
552 
553 	/*
554 	 * Flow is withdrawn; the port we have should not be included in
555 	 * the list of offloaded ports, as the connection is no longer
556 	 * usable (we're not expecting any more data).
557 	 * Also clear the FLOWENTF_HALF_CLOSED flag here.  It's fine if the reaper
558 	 * thread hasn't picked up FLOWENTF_HALF_CLOSED, as it will pick up
559 	 * FLOWENTF_WITHDRAWN and notify netns of the full withdrawal.
560 	 */
561 	if (__improbable((ftflags & FTF_WITHDRAWN) != 0)) {
562 		ftflags &= ~FTF_WITHDRAWN;
563 		if (fe->fe_flags & FLOWENTF_HALF_CLOSED) {
564 			os_atomic_andnot(&fe->fe_flags, FLOWENTF_HALF_CLOSED, relaxed);
565 		}
566 		fe->fe_want_withdraw = 1;
567 	}
568 
569 	/*
570 	 * If no other work is needed, we're done.
571 	 */
572 	if (ftflags == 0 || input) {
573 		return err;
574 	}
575 
576 	/*
577 	 * If we're over the rate limit for outbound SYNs, drop packet.
578 	 */
579 	if (__improbable((ftflags & FTF_SYN_RLIM) != 0)) {
580 		uint32_t now = (uint32_t)net_uptime();
581 		if ((now - src->fse_syn_ts) > 1) {
582 			src->fse_syn_ts = now;
583 			src->fse_syn_cnt = 0;
584 		}
585 		if (++src->fse_syn_cnt > FLOWTRACK_SYN_RATE) {
586 			err = EPROTO;
587 		}
588 	}
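	/*
	 * Example of the accounting above (illustrative): fse_syn_ts marks the
	 * start of the current counting window and fse_syn_cnt the SYNs seen in
	 * it; the counter resets once more than one second has elapsed.  With
	 * FLOWTRACK_SYN_RATE == 20, the 21st outbound SYN within the same window
	 * sets err to EPROTO so the packet is dropped.
	 */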
589 
590 	return err;
591 }
592 #undef FTF_WAITCLOSE
593 #undef FTF_CLOSENOTIFY
594 #undef FTF_WITHDRAWN
595 #undef FTF_SYN_RLIM
596 #undef FTF_RST_RLIM
597 
598 boolean_t
599 flow_track_tcp_want_abort(struct flow_entry *fe)
600 {
601 	struct flow_track *src = &fe->fe_ltrack;
602 	struct flow_track *dst = &fe->fe_rtrack;
603 
604 	if (fe->fe_key.fk_proto != IPPROTO_TCP ||
605 	    (fe->fe_flags & (FLOWENTF_ABORTED | FLOWENTF_AOP_OFFLOAD))) {
606 		goto done;
607 	}
608 
609 	/* this can be enhanced; for now rely on established state */
610 	if (src->fse_state == TCPS_ESTABLISHED ||
611 	    dst->fse_state == TCPS_ESTABLISHED) {
612 		src->fse_state = dst->fse_state = TCPS_TIME_WAIT;
613 		/* don't process more than once */
614 		os_atomic_or(&fe->fe_flags, FLOWENTF_ABORTED, relaxed);
615 		return TRUE;
616 	}
617 done:
618 	return FALSE;
619 }
620 
621 static void
622 flow_track_udp_init(struct flow_entry *fe, struct flow_track *src,
623     struct flow_track *dst, struct __kern_packet *pkt)
624 {
625 #pragma unused(pkt)
626 	/*
627 	 * Source state initialization.
628 	 */
629 	src->fse_state = FT_STATE_NO_TRAFFIC;
630 
631 	/*
632 	 * Destination state initialization.
633 	 */
634 	dst->fse_state = FT_STATE_NO_TRAFFIC;
635 
636 	os_atomic_or(&fe->fe_flags, FLOWENTF_INITED, relaxed);
637 }
638 
639 __attribute__((always_inline))
640 static inline int
641 flow_track_udp(struct flow_entry *fe, struct flow_track *src,
642     struct flow_track *dst, struct __kern_packet *pkt, bool input)
643 {
644 #pragma unused(input)
645 	if (__improbable((fe->fe_flags & FLOWENTF_INITED) == 0)) {
646 		flow_track_udp_init(fe, src, dst, pkt);
647 	}
648 
649 	if (__improbable(src->fse_state == FT_STATE_NO_TRAFFIC)) {
650 		src->fse_state = FT_STATE_SINGLE;
651 	}
652 	if (__improbable(dst->fse_state == FT_STATE_SINGLE)) {
653 		dst->fse_state = FT_STATE_MULTIPLE;
654 	}
655 
656 	return 0;
657 }
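/*
 * Example of the UDP state progression above (illustrative): on a fresh flow
 * both directions start in FT_STATE_NO_TRAFFIC.  The first outbound datagram
 * moves the local side to FT_STATE_SINGLE; the peer's reply moves the remote
 * side to SINGLE and, because the local side had already sent, promotes the
 * local side to FT_STATE_MULTIPLE; a second outbound datagram then promotes
 * the remote side to MULTIPLE as well.
 */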
658 
659 void
660 flow_track_stats(struct flow_entry *fe, uint64_t bytes, uint64_t packets,
661     bool active, bool in)
662 {
663 	volatile struct sk_stats_flow_track *fst;
664 
665 	if (in) {
666 		fst = &fe->fe_stats->fs_rtrack;
667 	} else {
668 		fst = &fe->fe_stats->fs_ltrack;
669 	}
670 
671 	fst->sft_bytes += bytes;
672 	fst->sft_packets += packets;
673 
674 	if (__probable(active)) {
675 		in_stat_set_activity_bitmap(&fe->fe_stats->fs_activity,
676 		    net_uptime());
677 	}
678 }
679 
680 int
681 flow_pkt_track(struct flow_entry *fe, struct __kern_packet *pkt, bool in)
682 {
683 	struct flow_track *src, *dst;
684 	int ret = 0;
685 
686 	static_assert(SFT_STATE_CLOSED == FT_STATE_CLOSED);
687 	static_assert(SFT_STATE_LISTEN == FT_STATE_LISTEN);
688 	static_assert(SFT_STATE_SYN_SENT == FT_STATE_SYN_SENT);
689 	static_assert(SFT_STATE_SYN_RECEIVED == FT_STATE_SYN_RECEIVED);
690 	static_assert(SFT_STATE_ESTABLISHED == FT_STATE_ESTABLISHED);
691 	static_assert(SFT_STATE_CLOSE_WAIT == FT_STATE_CLOSE_WAIT);
692 	static_assert(SFT_STATE_FIN_WAIT_1 == FT_STATE_FIN_WAIT_1);
693 	static_assert(SFT_STATE_CLOSING == FT_STATE_CLOSING);
694 	static_assert(SFT_STATE_LAST_ACK == FT_STATE_LAST_ACK);
695 	static_assert(SFT_STATE_FIN_WAIT_2 == FT_STATE_FIN_WAIT_2);
696 	static_assert(SFT_STATE_TIME_WAIT == FT_STATE_TIME_WAIT);
697 	static_assert(SFT_STATE_NO_TRAFFIC == FT_STATE_NO_TRAFFIC);
698 	static_assert(SFT_STATE_SINGLE == FT_STATE_SINGLE);
699 	static_assert(SFT_STATE_MULTIPLE == FT_STATE_MULTIPLE);
700 	static_assert(SFT_STATE_MAX == FT_STATE_MAX);
701 
702 	static_assert(FT_STATE_CLOSED == TCPS_CLOSED);
703 	static_assert(FT_STATE_LISTEN == TCPS_LISTEN);
704 	static_assert(FT_STATE_SYN_SENT == TCPS_SYN_SENT);
705 	static_assert(FT_STATE_SYN_RECEIVED == TCPS_SYN_RECEIVED);
706 	static_assert(FT_STATE_ESTABLISHED == TCPS_ESTABLISHED);
707 	static_assert(FT_STATE_CLOSE_WAIT == TCPS_CLOSE_WAIT);
708 	static_assert(FT_STATE_FIN_WAIT_1 == TCPS_FIN_WAIT_1);
709 	static_assert(FT_STATE_CLOSING == TCPS_CLOSING);
710 	static_assert(FT_STATE_LAST_ACK == TCPS_LAST_ACK);
711 	static_assert(FT_STATE_FIN_WAIT_2 == TCPS_FIN_WAIT_2);
712 	static_assert(FT_STATE_TIME_WAIT == TCPS_TIME_WAIT);
713 
714 	ASSERT(pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED);
715 
716 	if (in) {
717 		src = &fe->fe_rtrack;
718 		dst = &fe->fe_ltrack;
719 	} else {
720 		src = &fe->fe_ltrack;
721 		dst = &fe->fe_rtrack;
722 	}
723 
724 	flow_track_stats(fe, (pkt->pkt_length - pkt->pkt_l2_len), 1,
725 	    (pkt->pkt_flow_ulen != 0), in);
726 
727 	/* skip flow state tracking on non-initial fragments */
728 	if (pkt->pkt_flow_ip_is_frag && !pkt->pkt_flow_ip_is_first_frag) {
729 		return 0;
730 	}
731 
732 	switch (pkt->pkt_flow_ip_proto) {
733 	case IPPROTO_TCP:
734 		if (__probable((fe->fe_flags & FLOWENTF_TRACK) != 0)) {
735 			ret = flow_track_tcp(fe, src, dst, pkt, in);
736 		}
737 		break;
738 
739 	case IPPROTO_UDP:
740 		if (__probable((fe->fe_flags & FLOWENTF_TRACK) != 0)) {
741 			ret = flow_track_udp(fe, src, dst, pkt, in);
742 		}
743 		break;
744 	}
745 
746 	return ret;
747 }
748 
749 /*
750  * @function flow_track_abort_tcp
751  * @abstract Send an RST for a given TCP flow.
752  * @param in_pkt incoming packet that triggers the RST.
753  * @param rst_pkt used as the RST template for SEQ/ACK information.
754  */
755 void
756 flow_track_abort_tcp(struct flow_entry *fe, struct __kern_packet *in_pkt,
757     struct __kern_packet *rst_pkt)
758 {
759 	struct nx_flowswitch *fsw = fe->fe_fsw;
760 	struct flow_track *src, *dst;
761 	struct ip *ip;
762 	struct ip6_hdr *ip6;
763 	struct tcphdr *th;
764 	uint16_t len, tlen;
765 	struct mbuf *m;
766 
767 	/* guaranteed by caller */
768 	ASSERT(fsw->fsw_ifp != NULL);
769 	ASSERT(in_pkt == NULL || rst_pkt == NULL);
770 
771 	src = &fe->fe_ltrack;
772 	dst = &fe->fe_rtrack;
773 
774 	tlen = sizeof(struct tcphdr);
775 	if (fe->fe_key.fk_ipver == IPVERSION) {
776 		len = sizeof(struct ip) + tlen;
777 	} else {
778 		ASSERT(fe->fe_key.fk_ipver == IPV6_VERSION);
779 		len = sizeof(struct ip6_hdr) + tlen;
780 	}
781 
782 	m = m_gethdr(M_NOWAIT, MT_HEADER);
783 	if (__improbable(m == NULL)) {
784 		return;
785 	}
786 
787 	m->m_pkthdr.pkt_proto = IPPROTO_TCP;
788 	m->m_data += max_linkhdr;               /* 32-bit aligned */
789 	m->m_pkthdr.len = m->m_len = len;
790 
791 	/* zero out for checksum */
792 	bzero(m_mtod_current(m), len);
793 
794 	if (fe->fe_key.fk_ipver == IPVERSION) {
795 		ip = mtod(m, struct ip *);
796 
797 		/* IP header fields included in the TCP checksum */
798 		ip->ip_p = IPPROTO_TCP;
799 		ip->ip_len = htons(tlen);
800 		if (rst_pkt == NULL) {
801 			ip->ip_src = fe->fe_key.fk_src4;
802 			ip->ip_dst = fe->fe_key.fk_dst4;
803 		} else {
804 			ip->ip_src = rst_pkt->pkt_flow_ipv4_src;
805 			ip->ip_dst = rst_pkt->pkt_flow_ipv4_dst;
806 		}
807 
808 		th = (struct tcphdr *)(void *)((char *)ip + sizeof(*ip));
809 	} else {
810 		ip6 = mtod(m, struct ip6_hdr *);
811 
812 		/* IP header fields included in the TCP checksum */
813 		ip6->ip6_nxt = IPPROTO_TCP;
814 		ip6->ip6_plen = htons(tlen);
815 		if (rst_pkt == NULL) {
816 			ip6->ip6_src = fe->fe_key.fk_src6;
817 			ip6->ip6_dst = fe->fe_key.fk_dst6;
818 		} else {
819 			ip6->ip6_src = rst_pkt->pkt_flow_ipv6_src;
820 			ip6->ip6_dst = rst_pkt->pkt_flow_ipv6_dst;
821 		}
822 
823 		th = (struct tcphdr *)(void *)((char *)ip6 + sizeof(*ip6));
824 	}
825 
826 	/*
827 	 * TCP header (fabricate a pure RST).
828 	 */
829 	if (in_pkt != NULL) {
830 		th->th_sport = in_pkt->pkt_flow_tcp_dst;
831 		th->th_dport = in_pkt->pkt_flow_tcp_src;
832 		if (__probable(in_pkt->pkt_flow_tcp_flags & TH_ACK)) {
833 			/* <SEQ=SEG.ACK><CTL=RST> */
834 			th->th_seq = in_pkt->pkt_flow_tcp_ack;
835 			th->th_ack = 0;
836 			th->th_flags = TH_RST;
837 		} else {
838 			/* <SEQ=0><ACK=SEG.SEQ+SEG.LEN><CTL=RST,ACK> */
839 			th->th_seq = 0;
840 			th->th_ack = in_pkt->pkt_flow_tcp_seq +
841 			    in_pkt->pkt_flow_ulen;
842 			th->th_flags = TH_RST | TH_ACK;
843 		}
844 	} else if (rst_pkt != NULL) {
845 		th->th_sport = rst_pkt->pkt_flow_tcp_src;
846 		th->th_dport = rst_pkt->pkt_flow_tcp_dst;
847 		th->th_seq = rst_pkt->pkt_flow_tcp_seq;
848 		th->th_ack = rst_pkt->pkt_flow_tcp_ack;
849 		th->th_flags = rst_pkt->pkt_flow_tcp_flags;
850 	} else {
851 		th->th_sport = fe->fe_key.fk_sport;
852 		th->th_dport = fe->fe_key.fk_dport;
853 		th->th_seq = htonl(src->fse_seqlo);     /* peer's last ACK */
854 		th->th_ack = 0;
855 		th->th_flags = TH_RST;
856 	}
857 	th->th_off = (tlen >> 2);
858 	th->th_win = 0;
859 
860 	FSW_STATS_INC(FSW_STATS_FLOWS_ABORTED);
861 
862 	if (fe->fe_key.fk_ipver == IPVERSION) {
863 		struct ip_out_args ipoa;
864 		struct route ro;
865 
866 		bzero(&ipoa, sizeof(ipoa));
867 		ipoa.ipoa_boundif = fsw->fsw_ifp->if_index;
868 		ipoa.ipoa_flags = (IPOAF_SELECT_SRCIF | IPOAF_BOUND_IF |
869 		    IPOAF_BOUND_SRCADDR);
870 		ipoa.ipoa_sotc = SO_TC_UNSPEC;
871 		ipoa.ipoa_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
872 
873 		/* TCP checksum */
874 		th->th_sum = in_cksum(m, len);
875 
876 		ip->ip_v = IPVERSION;
877 		ip->ip_hl = sizeof(*ip) >> 2;
878 		ip->ip_tos = 0;
879 		/*
880 		 * ip_output() expects ip_len and ip_off to be in host order.
881 		 */
882 		ip->ip_len = len;
883 		ip->ip_off = IP_DF;
884 		ip->ip_ttl = (uint8_t)ip_defttl;
885 		ip->ip_sum = 0;
886 
887 		bzero(&ro, sizeof(ro));
888 		(void) ip_output(m, NULL, &ro, IP_OUTARGS, NULL, &ipoa);
889 		ROUTE_RELEASE(&ro);
890 	} else {
891 		struct ip6_out_args ip6oa;
892 		struct route_in6 ro6;
893 
894 		bzero(&ip6oa, sizeof(ip6oa));
895 		ip6oa.ip6oa_boundif = fsw->fsw_ifp->if_index;
896 		ip6oa.ip6oa_flags = (IP6OAF_SELECT_SRCIF | IP6OAF_BOUND_IF |
897 		    IP6OAF_BOUND_SRCADDR);
898 		ip6oa.ip6oa_sotc = SO_TC_UNSPEC;
899 		ip6oa.ip6oa_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
900 
901 		/* TCP checksum */
902 		th->th_sum = in6_cksum(m, IPPROTO_TCP,
903 		    sizeof(struct ip6_hdr), tlen);
904 
905 		ip6->ip6_vfc |= IPV6_VERSION;
906 		ip6->ip6_hlim = IPV6_DEFHLIM;
907 
908 		bzero(&ro6, sizeof(ro6));
909 		(void) ip6_output(m, NULL, &ro6, IPV6_OUTARGS,
910 		    NULL, NULL, &ip6oa);
911 		ROUTE_RELEASE(&ro6);
912 	}
913 }
914 
915 void
916 flow_track_abort_quic(struct flow_entry *fe,
917     uint8_t *__counted_by(QUIC_STATELESS_RESET_TOKEN_SIZE)token)
918 {
919 	struct quic_stateless_reset {
920 		uint8_t ssr_header[30];
921 		uint8_t ssr_token[QUIC_STATELESS_RESET_TOKEN_SIZE];
922 	};
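	/*
	 * Note on the layout above: a QUIC stateless reset (RFC 9000 §10.3)
	 * must be indistinguishable from a short-header packet, so the 30
	 * header bytes are filled with random data below with the two most
	 * significant bits of the first byte forced to 01 (the fixed-bit
	 * pattern of a short header), and the final 16 bytes carry the
	 * peer-supplied stateless reset token.
	 */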
923 	struct nx_flowswitch *fsw = fe->fe_fsw;
924 	struct ip *ip;
925 	struct ip6_hdr *ip6;
926 	struct udphdr *uh;
927 	struct quic_stateless_reset *qssr;
928 	uint16_t len, l3hlen, ulen;
929 	struct mbuf *__single m;
930 	unsigned int one = 1;
931 	int error;
932 
933 	/* guaranteed by caller */
934 	ASSERT(fsw->fsw_ifp != NULL);
935 
936 	/* skip zero token */
937 	bool is_zero_token = true;
938 	for (size_t i = 0; i < QUIC_STATELESS_RESET_TOKEN_SIZE; i++) {
939 		if (token[i] != 0) {
940 			is_zero_token = false;
941 			break;
942 		}
943 	}
944 	if (is_zero_token) {
945 		return;
946 	}
947 
948 	ulen = sizeof(struct udphdr) + sizeof(struct quic_stateless_reset);
949 	if (fe->fe_key.fk_ipver == IPVERSION) {
950 		l3hlen = sizeof(struct ip);
951 	} else {
952 		ASSERT(fe->fe_key.fk_ipver == IPV6_VERSION);
953 		l3hlen = sizeof(struct ip6_hdr);
954 	}
955 
956 	len = l3hlen + ulen;
957 
958 	error = mbuf_allocpacket(MBUF_DONTWAIT, max_linkhdr + len, &one, &m);
959 	if (__improbable(error != 0)) {
960 		return;
961 	}
962 	VERIFY(m != 0);
963 
964 	m->m_pkthdr.pkt_proto = IPPROTO_UDP;
965 	m->m_data += max_linkhdr;               /* 32-bit aligned */
966 	m->m_pkthdr.len = m->m_len = len;
967 
968 	/* zero out for checksum */
969 	bzero(m_mtod_current(m), len);
970 
971 	if (fe->fe_key.fk_ipver == IPVERSION) {
972 		ip = mtod(m, struct ip *);
973 		ip->ip_p = IPPROTO_UDP;
974 		ip->ip_len = htons(ulen);
975 		ip->ip_src = fe->fe_key.fk_src4;
976 		ip->ip_dst = fe->fe_key.fk_dst4;
977 		uh = (struct udphdr *)(void *)((char *)ip + sizeof(*ip));
978 	} else {
979 		ip6 = mtod(m, struct ip6_hdr *);
980 		ip6->ip6_nxt = IPPROTO_UDP;
981 		ip6->ip6_plen = htons(ulen);
982 		ip6->ip6_src = fe->fe_key.fk_src6;
983 		ip6->ip6_dst = fe->fe_key.fk_dst6;
984 		uh = (struct udphdr *)(void *)((char *)ip6 + sizeof(*ip6));
985 	}
986 
987 	/* UDP header */
988 	uh->uh_sport = fe->fe_key.fk_sport;
989 	uh->uh_dport = fe->fe_key.fk_dport;
990 	uh->uh_ulen = htons(ulen);
991 
992 	/* QUIC stateless reset */
993 	qssr = (struct quic_stateless_reset *)(uh + 1);
994 	read_frandom(&qssr->ssr_header, sizeof(qssr->ssr_header));
995 	qssr->ssr_header[0] = (qssr->ssr_header[0] & 0x3f) | 0x40;
996 	memcpy(qssr->ssr_token, token, QUIC_STATELESS_RESET_TOKEN_SIZE);
997 
998 	FSW_STATS_INC(FSW_STATS_FLOWS_ABORTED);
999 
1000 	if (fe->fe_key.fk_ipver == IPVERSION) {
1001 		struct ip_out_args ipoa;
1002 		struct route ro;
1003 
1004 		bzero(&ipoa, sizeof(ipoa));
1005 		ipoa.ipoa_boundif = fsw->fsw_ifp->if_index;
1006 		ipoa.ipoa_flags = (IPOAF_SELECT_SRCIF | IPOAF_BOUND_IF |
1007 		    IPOAF_BOUND_SRCADDR);
1008 		ipoa.ipoa_sotc = SO_TC_UNSPEC;
1009 		ipoa.ipoa_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
1010 
1011 		uh->uh_sum = in_cksum(m, len);
1012 		if (uh->uh_sum == 0) {
1013 			uh->uh_sum = 0xffff;
1014 		}
1015 
1016 		ip->ip_v = IPVERSION;
1017 		ip->ip_hl = sizeof(*ip) >> 2;
1018 		ip->ip_tos = 0;
1019 		/*
1020 		 * ip_output() expects ip_len and ip_off to be in host order.
1021 		 */
1022 		ip->ip_len = len;
1023 		ip->ip_off = IP_DF;
1024 		ip->ip_ttl = (uint8_t)ip_defttl;
1025 		ip->ip_sum = 0;
1026 
1027 		bzero(&ro, sizeof(ro));
1028 		(void) ip_output(m, NULL, &ro, IP_OUTARGS, NULL, &ipoa);
1029 		ROUTE_RELEASE(&ro);
1030 	} else {
1031 		struct ip6_out_args ip6oa;
1032 		struct route_in6 ro6;
1033 
1034 		bzero(&ip6oa, sizeof(ip6oa));
1035 		ip6oa.ip6oa_boundif = fsw->fsw_ifp->if_index;
1036 		ip6oa.ip6oa_flags = (IP6OAF_SELECT_SRCIF | IP6OAF_BOUND_IF |
1037 		    IP6OAF_BOUND_SRCADDR);
1038 		ip6oa.ip6oa_sotc = SO_TC_UNSPEC;
1039 		ip6oa.ip6oa_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
1040 
1041 		uh->uh_sum = in6_cksum(m, IPPROTO_UDP, sizeof(struct ip6_hdr),
1042 		    ulen);
1043 		if (uh->uh_sum == 0) {
1044 			uh->uh_sum = 0xffff;
1045 		}
1046 
1047 		ip6->ip6_vfc |= IPV6_VERSION;
1048 		ip6->ip6_hlim = IPV6_DEFHLIM;
1049 
1050 		bzero(&ro6, sizeof(ro6));
1051 		(void) ip6_output(m, NULL, &ro6, IPV6_OUTARGS,
1052 		    NULL, NULL, &ip6oa);
1053 		ROUTE_RELEASE(&ro6);
1054 	}
1055 }
1056