xref: /xnu-8796.121.2/bsd/skywalk/nexus/flowswitch/fsw_dp.c (revision c54f35ca767986246321eb901baf8f5ff7923f6a)
1 /*
2  * Copyright (c) 2015-2022 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 /*
30  * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
31  *
32  * Redistribution and use in source and binary forms, with or without
33  * modification, are permitted provided that the following conditions
34  * are met:
35  *   1. Redistributions of source code must retain the above copyright
36  *      notice, this list of conditions and the following disclaimer.
37  *   2. Redistributions in binary form must reproduce the above copyright
38  *      notice, this list of conditions and the following disclaimer in the
39  *      documentation and/or other materials provided with the distribution.
40  *
41  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
42  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51  * SUCH DAMAGE.
52  */
53 
54 /*
55  *  BSD LICENSE
56  *
57  * Copyright(c) 2015 NEC Europe Ltd. All rights reserved.
58  *  All rights reserved.
59  *
60  * Redistribution and use in source and binary forms, with or without
61  *  modification, are permitted provided that the following conditions
62  *  are met:
63  *
64  *    * Redistributions of source code must retain the above copyright
65  *      notice, this list of conditions and the following disclaimer.
66  *    * Redistributions in binary form must reproduce the above copyright
67  *      notice, this list of conditions and the following disclaimer in
68  *      the documentation and/or other materials provided with the
69  *      distribution.
70  *    * Neither the name of NEC Europe Ltd. nor the names of
71  *      its contributors may be used to endorse or promote products derived
72  *      from this software without specific prior written permission.
73  *
74  *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
75  *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
76  *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
77  *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
78  *  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
79  *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
80  *  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
81  *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
82  *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
83  *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
84  *  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
85  */
86 
87 #include <skywalk/os_skywalk_private.h>
88 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
89 #include <skywalk/nexus/flowswitch/fsw_var.h>
90 #include <skywalk/nexus/netif/nx_netif.h>
91 #include <skywalk/nexus/netif/nx_netif_compat.h>
92 #include <kern/sched_prim.h>
93 #include <sys/kdebug.h>
94 #include <sys/sdt.h>
95 #include <net/bpf.h>
96 #include <net/if_ports_used.h>
97 #include <net/pktap.h>
98 #include <net/pktsched/pktsched_netem.h>
99 #include <netinet/tcp.h>
100 #include <netinet/udp.h>
101 #include <netinet/ip.h>
102 #include <netinet/ip6.h>
103 
extern kern_return_t thread_terminate(thread_t);

#define FSW_ZONE_MAX                  256
#define FSW_ZONE_NAME                 "skywalk.nx.fsw"

/* timestamps of the last reap / last requested purge (see reaper thread) */
static uint64_t fsw_reap_last __sk_aligned(8);
static uint64_t fsw_want_purge __sk_aligned(8);

/*
 * Datapath tunables.  Each NX_FSW_* macro provides the compile-time
 * default; the companion variable is the runtime value (adjustable
 * via sysctl on DEVELOPMENT/DEBUG kernels, see below).
 */
#define NX_FSW_FE_TABLESZ       256     /* some power of 2 */
static uint32_t fsw_fe_table_size = NX_FSW_FE_TABLESZ;

#define NX_FSW_FOB_HASHSZ       31      /* some mersenne prime */
static uint32_t fsw_flow_owner_buckets = NX_FSW_FOB_HASHSZ;

#define NX_FSW_FRB_HASHSZ       128     /* some power of 2 */
static uint32_t fsw_flow_route_buckets = NX_FSW_FRB_HASHSZ;

#define NX_FSW_FRIB_HASHSZ      13      /* some mersenne prime */
static uint32_t fsw_flow_route_id_buckets = NX_FSW_FRIB_HASHSZ;

#define NX_FSW_FLOW_REAP_INTERVAL 1     /* seconds */
static uint32_t fsw_flow_reap_interval = NX_FSW_FLOW_REAP_INTERVAL;

#define NX_FSW_FLOW_PURGE_THRES 0       /* purge every N reaps (0 = disable) */
static uint32_t fsw_flow_purge_thresh = NX_FSW_FLOW_PURGE_THRES;

/* derived thresholds, all expressed in units of the reap interval */
#define FSW_REAP_IVAL            (MAX(1, fsw_flow_reap_interval))
#define FSW_REAP_SK_THRES        (FSW_REAP_IVAL << 5)
#define FSW_REAP_IF_THRES        (FSW_REAP_IVAL << 5)
#define FSW_DRAIN_CH_THRES       (FSW_REAP_IVAL << 5)
#define FSW_IFSTATS_THRES        1

#define RX_BUFLET_BATCH_COUNT 64 /* max batch size for buflet allocation */

uint32_t fsw_rx_batch = NX_FSW_RXBATCH; /* # of packets per batch (RX) */
uint32_t fsw_tx_batch = NX_FSW_TXBATCH; /* # of packets per batch (TX) */
#if (DEVELOPMENT || DEBUG)
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, rx_batch,
    CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_rx_batch, 0,
    "flowswitch Rx batch size");
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, tx_batch,
    CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_tx_batch, 0,
    "flowswitch Tx batch size");
#endif /* DEVELOPMENT || DEBUG */

SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, rx_agg_tcp,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sk_fsw_rx_agg_tcp, 0,
    "flowswitch RX aggregation for tcp flows (enable/disable)");
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, rx_agg_tcp_host,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sk_fsw_rx_agg_tcp_host, 0,
    "flowswitch RX aggregation for tcp kernel path (0/1/2 (off/on/auto))");
155 
/*
 * IP reassembly
 * The "kern.skywalk.flowswitch.ip_reass" sysctl can be used to force
 * enable/disable the reassembly routine regardless of whether the
 * transport netagent is enabled or not.
 *
 * 'fsw_ip_reass' is a tri-state:
 *    0 means force IP reassembly off
 *    1 means force IP reassembly on
 *    2 means don't force the value, use what's appropriate for this flowswitch
 */
#define FSW_IP_REASS_FORCE_OFF          0
#define FSW_IP_REASS_FORCE_ON           1
#define FSW_IP_REASS_NO_FORCE           2

/* default: defer to per-flowswitch policy (consulted in rx_process_ip_frag()) */
uint32_t fsw_ip_reass = FSW_IP_REASS_NO_FORCE;
172 
173 static int
174 fsw_ip_reass_sysctl SYSCTL_HANDLER_ARGS
175 {
176 #pragma unused(oidp, arg1, arg2)
177 	unsigned int new_value;
178 	int changed;
179 	int error;
180 
181 	error = sysctl_io_number(req, fsw_ip_reass, sizeof(fsw_ip_reass),
182 	    &new_value, &changed);
183 	if (error == 0 && changed != 0) {
184 		if (new_value > FSW_IP_REASS_NO_FORCE) {
185 			return EINVAL;
186 		}
187 		fsw_ip_reass = new_value;
188 	}
189 	return error;
190 }
191 
192 SYSCTL_PROC(_kern_skywalk_flowswitch, OID_AUTO, ip_reass,
193     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
194     0, 0, fsw_ip_reass_sysctl, "IU",
195     "adjust flowswitch IP reassembly");
196 
#if (DEVELOPMENT || DEBUG)
/* bitmask of enabled error-injection points, set via fsw_inject_error sysctl */
static uint64_t _fsw_inject_error = 0;
/* inject error `_en': substitute `_ev' for `_ec', bump stats, run `_f' */
#define _FSW_INJECT_ERROR(_en, _ev, _ec, _f, ...) \
	_SK_INJECT_ERROR(_fsw_inject_error, _en, _ev, _ec, \
	&FSW_STATS_VAL(_FSW_STATS_ERROR_INJECTIONS), _f, __VA_ARGS__)

/* variant that only invokes the callback when injection point `_en' is set */
#define _FSW_INJECT_ERROR_SET(_en, _f, ...) do { \
	if (__improbable(((_fsw_inject_error) & (1ULL << (_en))) != 0)) { \
	        SK_DF(SK_VERB_ERROR_INJECT, "injecting error %d", (_en));\
	        if ((_f) != NULL)                                       \
	                (_f)(__VA_ARGS__);                              \
	}                                                               \
} while (0)

SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, flow_owner_buckets,
    CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_flow_owner_buckets, 0, "");
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, fe_table_size,
    CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_fe_table_size, 0, "");
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, flow_route_buckets,
    CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_flow_route_buckets, 0, "");
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO,
    flow_route_id_buckets, CTLFLAG_RW | CTLFLAG_LOCKED,
    &fsw_flow_route_id_buckets, 0, "");
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, flow_reap_interval,
    CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_flow_reap_interval, 0, "");
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, flow_purge_thresh,
    CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_flow_purge_thresh, 0, "");
SYSCTL_QUAD(_kern_skywalk_flowswitch, OID_AUTO, fsw_inject_error,
    CTLFLAG_RW | CTLFLAG_LOCKED, &_fsw_inject_error, "");
#else
/* error injection compiles out entirely on RELEASE kernels */
#define _FSW_INJECT_ERROR(_en, _ev, _ec, _f, ...) do { } while (0)
#define _FSW_INJECT_ERROR_SET(_en, _f, ...) do { } while (0)
#endif /* DEVELOPMENT || DEBUG */
230 
231 static void fsw_linger_remove_internal(struct flow_entry_linger_head *,
232     struct flow_entry *);
233 static void fsw_reap_thread_func(void *, wait_result_t);
234 static void fsw_reap_thread_cont(void *, wait_result_t);
235 static void fsw_purge_cache(struct nx_flowswitch *, boolean_t);
236 static void fsw_drain_channels(struct nx_flowswitch *, uint64_t, boolean_t);
237 static uint32_t fsw_process_deferred(struct nx_flowswitch *);
238 static uint32_t fsw_process_linger(struct nx_flowswitch *, uint32_t *);
239 
240 static int copy_packet_from_dev(struct nx_flowswitch *, struct __kern_packet *,
241     struct __kern_packet *);
242 
243 static void fsw_ifp_inc_traffic_class_in_pkt(struct ifnet *, kern_packet_t);
244 static void fsw_ifp_inc_traffic_class_out_pkt(struct ifnet *, uint32_t,
245     uint32_t, uint32_t);
246 
/* nonzero once fsw_dp_init() has completed; cleared by fsw_dp_uninit() */
static int __fsw_dp_inited = 0;
248 
249 int
fsw_dp_init(void)250 fsw_dp_init(void)
251 {
252 	_CASSERT(FSW_VP_DEV == 0);
253 	_CASSERT(FSW_VP_HOST == 1);
254 	_CASSERT((FSW_VP_HOST + FSW_VP_DEV) < FSW_VP_USER_MIN);
255 	_CASSERT((FSW_VP_HOST + FSW_VP_DEV) < NEXUS_PORT_FLOW_SWITCH_CLIENT);
256 
257 	ASSERT(!__fsw_dp_inited);
258 
259 	flow_mgr_init();
260 	flow_init();
261 
262 	__fsw_dp_inited = 1;
263 
264 	return 0;
265 }
266 
267 void
fsw_dp_uninit(void)268 fsw_dp_uninit(void)
269 {
270 	if (__fsw_dp_inited) {
271 		flow_fini();
272 		flow_mgr_fini();
273 
274 		__fsw_dp_inited = 0;
275 	}
276 }
277 
/*
 * Return every packet on `pktq' to its packet pool.  The flowswitch
 * argument is unused; it is kept so this helper fits call sites that
 * expect a (fsw, pktq) shape (e.g. the dp_drop_pktq() macro below).
 */
static void
dp_free_pktq(struct nx_flowswitch *fsw __sk_unused, struct pktq *pktq)
{
	pp_free_pktq(pktq);
}
283 
/*
 * Drop and free every packet on `pktq', logging and accounting the
 * drops in the flowswitch stats.
 *
 * NOTE: this is deliberately a macro, not a function -- when the queue
 * is empty it issues `return' from the *calling* function.  Only use
 * it inside void functions where that early return is intended.
 */
#define dp_drop_pktq(fsw, pktq) do { \
	uint32_t _len = KPKTQ_LEN(pktq); \
	if (KPKTQ_EMPTY(pktq)) { \
	        ASSERT(_len == 0); \
	        return; \
	} \
	SK_DF(SK_VERB_FSW_DP | SK_VERB_DROP, "drop %d packets", _len); \
	FSW_STATS_ADD(FSW_STATS_DROP, _len); \
	DTRACE_SKYWALK1(fsw__dp__drop, int, _len); \
	dp_free_pktq(fsw, pktq); \
} while (0)
295 
/*
 * Mirror the packets currently staged on flow entry `fe' to the packet
 * tap (pktap) with the flow's process attribution.  `input' selects the
 * Rx staging queue (fe_rx_pktq) vs the Tx one (fe_tx_pktq).  Hoisted
 * out of line (SK_NO_INLINE_ATTRIBUTE) to keep callers' stack small.
 */
SK_NO_INLINE_ATTRIBUTE
void
fsw_snoop(struct nx_flowswitch *fsw, struct flow_entry *fe, bool input)
{
	pid_t pid;
	char proc_name_buf[FLOW_PROCESS_NAME_LENGTH];
	char *proc_name = NULL;
	pid_t epid;
	char eproc_name_buf[FLOW_PROCESS_NAME_LENGTH];
	char *eproc_name = NULL;
	sa_family_t af;
	bool tap_early = false;
	struct __kern_packet *pkt;

	ASSERT(fe != NULL);
	ASSERT(fsw->fsw_ifp != NULL);

	if (fe->fe_nx_port == FSW_VP_HOST) {
		/* allow packets to be tapped before aggregation happens */
		tap_early = (input && fe->fe_key.fk_proto == IPPROTO_TCP);
		if (!tap_early) {
			/* all other traffic will be tapped in the dlil input path */
			return;
		}
	}
	/* map the flow key's IP version to an address family for pktap */
	if (fe->fe_key.fk_ipver == IPVERSION) {
		af = AF_INET;
	} else if (fe->fe_key.fk_ipver == IPV6_VERSION) {
		af = AF_INET6;
	} else {
		return;
	}

	/* snapshot the process attribution recorded on the flow entry */
	pid = fe->fe_pid;
	if (fe->fe_proc_name[0] != '\0') {
		(void) strlcpy(proc_name_buf, fe->fe_proc_name,
		    sizeof(proc_name_buf));
		proc_name = proc_name_buf;
	}
	epid = fe->fe_epid;
	if (fe->fe_eproc_name[0] != '\0') {
		(void) strlcpy(eproc_name_buf, fe->fe_eproc_name,
		    sizeof(eproc_name_buf));
		eproc_name = eproc_name_buf;
	}
	if (input) {
		KPKTQ_FOREACH(pkt, &fe->fe_rx_pktq) {
			pktap_input_packet(fsw->fsw_ifp, af,
			    fsw->fsw_ifp_dlt, pid, proc_name, epid,
			    eproc_name, SK_PKT2PH(pkt), NULL, 0,
			    IPPROTO_TCP, fe->fe_flowid,
			    tap_early ? PTH_FLAG_SOCKET: PTH_FLAG_NEXUS_CHAN);
		}
	} else {
		KPKTQ_FOREACH(pkt, &fe->fe_tx_pktq) {
			pktap_output_packet(fsw->fsw_ifp, af,
			    fsw->fsw_ifp_dlt, pid, proc_name, epid,
			    eproc_name, SK_PKT2PH(pkt), NULL, 0,
			    0, 0, PTH_FLAG_NEXUS_CHAN);
		}
	}
}
358 
359 #if (DEVELOPMENT || DEBUG)
/*
 * Error-injection helper (case 35, DEVELOPMENT/DEBUG only).
 * Step 1 artificially clears FLOWRTF_RESOLVED on a flow route that is
 * fully resolved with link-layer info; step 2 undoes that, frees any
 * mbuf attached to the packet, and makes the caller return EJUSTRETURN.
 * The function-local static flag couples the two steps across calls,
 * so step 2 is a no-op unless step 1 actually modified the route.
 */
static void
_fsw_error35_handler(int step, struct flow_route *fr, struct __kern_packet *pkt,
    int *ret)
{
	static boolean_t _err35_flag_modified = FALSE;

	switch (step) {
	case 1:
		if ((fr->fr_flags & (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO)) ==
		    (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO)) {
			fr->fr_flags &= ~FLOWRTF_RESOLVED;
			_err35_flag_modified = TRUE;
		}
		break;

	case 2:
		if (!_err35_flag_modified) {
			return;
		}
		/* drop the attached mbuf, if any, before restoring the route */
		if (pkt->pkt_pflags & PKT_F_MBUF_DATA) {
			m_freem(pkt->pkt_mbuf);
			pkt->pkt_pflags &= ~PKT_F_MBUF_DATA;
			pkt->pkt_mbuf = NULL;
		}
		*ret = EJUSTRETURN;
		fr->fr_flags |= FLOWRTF_RESOLVED;
		_err35_flag_modified = FALSE;
		break;

	default:
		VERIFY(0);
		/* not reached */
	}
}
394 
395 static void
_fsw_error36_handler(int step,struct flow_route * fr,int * ret)396 _fsw_error36_handler(int step, struct flow_route *fr, int *ret)
397 {
398 	static boolean_t _err36_flag_modified = FALSE;
399 
400 	switch (step) {
401 	case 1:
402 		if ((fr->fr_flags & (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO)) ==
403 		    (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO)) {
404 			fr->fr_flags &= ~FLOWRTF_RESOLVED;
405 			_err36_flag_modified = TRUE;
406 		}
407 		break;
408 
409 	case 2:
410 		if (!_err36_flag_modified) {
411 			return;
412 		}
413 		*ret = ENETUNREACH;
414 		fr->fr_flags |= FLOWRTF_RESOLVED;
415 		_err36_flag_modified = FALSE;
416 		break;
417 
418 	default:
419 		VERIFY(0);
420 		/* not reached */
421 	}
422 }
423 #else /* !DEVELOPMENT && !DEBUG */
424 #define _fsw_error35_handler(...)
425 #define _fsw_error36_handler(...)
426 #endif /* DEVELOPMENT || DEBUG */
427 
428 /*
429  * Check if the source packet content can fit into the destination
430  * ring's packet. Returns TRUE if the source packet can fit.
431  * Note: Failures could be caused by misconfigured packet pool sizes,
 * missing packet size check against MTU or if the source packet is from
433  * a compat netif and the attached mbuf is larger than MTU due to LRO.
434  */
435 static inline boolean_t
validate_pkt_len(struct __kern_packet * spkt,kern_packet_t dph,uint32_t skip_l2hlen,uint32_t l2hlen,uint16_t headroom,uint32_t * copy_len)436 validate_pkt_len(struct __kern_packet *spkt, kern_packet_t dph,
437     uint32_t skip_l2hlen, uint32_t l2hlen, uint16_t headroom,
438     uint32_t *copy_len)
439 {
440 	uint32_t tlen = 0;
441 	uint32_t splen = spkt->pkt_length - skip_l2hlen;
442 
443 	if (l2hlen != 0) {
444 		VERIFY(skip_l2hlen == 0);
445 		tlen += l2hlen;
446 	} else if ((spkt->pkt_link_flags & PKT_LINKF_ETHFCS) != 0) {
447 		splen -= ETHER_CRC_LEN;
448 	}
449 
450 	tlen += splen;
451 	*copy_len = splen;
452 
453 	return tlen <= ((__packet_get_buflet_count(dph) *
454 	       PP_BUF_SIZE_DEF(SK_PTR_ADDR_KPKT(dph)->pkt_qum.qum_pp)) -
455 	       headroom);
456 }
457 
458 #if SK_LOG
459 /* Hoisted out of line to reduce kernel stack footprint */
/*
 * Debug logging for copy_packet_from_dev(): prints source/destination
 * lengths and hex-dumps the start of the copied payload.  The verbosity
 * bit depends on whether the source data came from an attached mbuf.
 */
SK_LOG_ATTRIBUTE
static void
copy_packet_from_dev_log(struct __kern_packet *spkt,
    struct __kern_packet *dpkt, struct proc *p)
{
	uint64_t logflags = ((SK_VERB_FSW | SK_VERB_RX) |
	    ((spkt->pkt_pflags & PKT_F_MBUF_DATA) ?
	    SK_VERB_COPY_MBUF : SK_VERB_COPY));
	char *daddr;
	MD_BUFLET_ADDR_ABS(dpkt, daddr);
	SK_DF(logflags, "%s(%d) splen %u dplen %u hr %u l2 %u",
	    sk_proc_name_address(p), sk_proc_pid(p), spkt->pkt_length,
	    dpkt->pkt_length, (uint32_t)dpkt->pkt_headroom,
	    (uint32_t)dpkt->pkt_l2_len);
	SK_DF(logflags | SK_VERB_DUMP, "%s",
	    sk_dump("buf", daddr, dpkt->pkt_length, 128, NULL, 0));
}
477 #else
478 #define copy_packet_from_dev_log(...)
479 #endif /* SK_LOG */
480 
481 
/*
 * Copy a packet arriving from the device (netif) nexus into `dpkt',
 * a packet allocated from the flowswitch's own pool, stripping the L2
 * header in the process.  Optionally computes the Rx checksum while
 * copying.  On success the destination packet is finalized and ready
 * to be attached to an Rx ring, any source mbuf is consumed, and 0 is
 * returned; EINVAL is returned if the contents cannot fit.
 */
static inline int
copy_packet_from_dev(struct nx_flowswitch *fsw, struct __kern_packet *spkt,
    struct __kern_packet *dpkt)
{
	/*
	 * source and destination nexus don't share the packet pool
	 * sync operation here is to
	 * - alloc packet for the rx(dst) ring
	 * - copy data/metadata from src packet to dst packet
	 * - attach alloc'd packet to rx(dst) ring
	 */
	kern_packet_t dph = SK_PTR_ENCODE(dpkt,
	    METADATA_TYPE(dpkt), METADATA_SUBTYPE(dpkt));
	kern_packet_t sph = SK_PTR_ENCODE(spkt, METADATA_TYPE(spkt),
	    METADATA_SUBTYPE(spkt));
	boolean_t do_cksum_rx;
	uint16_t skip_l2h_len = spkt->pkt_l2_len;
	uint16_t iphlen;
	uint32_t dlen;
	int err;

	if (__improbable(!validate_pkt_len(spkt, dph, skip_l2h_len, 0, 0,
	    &dlen))) {
		SK_ERR("bufcnt %d, bufsz %d", __packet_get_buflet_count(dph),
		    PP_BUF_SIZE_DEF(dpkt->pkt_qum.qum_pp));
		FSW_STATS_INC(FSW_STATS_RX_COPY_BAD_LEN);
		return EINVAL;
	}

	/* Copy packet metadata */
	_QUM_COPY(&(spkt)->pkt_qum, &(dpkt)->pkt_qum);
	_PKT_COPY(spkt, dpkt);
	ASSERT(!(dpkt->pkt_qum.qum_qflags & QUM_F_KERNEL_ONLY) ||
	    PP_KERNEL_ONLY(dpkt->pkt_qum.qum_pp));
	ASSERT(dpkt->pkt_mbuf == NULL);

	/* destination carries no L2 header and starts at offset 0 */
	dpkt->pkt_headroom = 0;
	dpkt->pkt_l2_len = 0;

	/* don't include IP header from partial sum */
	if (__probable((spkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED) != 0)) {
		iphlen = spkt->pkt_flow_ip_hlen;
		do_cksum_rx = sk_cksum_rx;
	} else {
		iphlen = 0;
		do_cksum_rx = FALSE;
	}

	/* Copy packet payload */
	if ((spkt->pkt_pflags & PKT_F_MBUF_DATA) &&
	    (spkt->pkt_pflags & PKT_F_TRUNCATED)) {
		FSW_STATS_INC(FSW_STATS_RX_COPY_MBUF2PKT);
		/*
		 * Source packet has truncated contents (just enough for
		 * the classifier) of an mbuf from the compat driver; copy
		 * the entire mbuf contents to destination packet.
		 */
		m_adj(spkt->pkt_mbuf, skip_l2h_len);
		ASSERT((uint32_t)m_pktlen(spkt->pkt_mbuf) >= dlen);
		fsw->fsw_pkt_copy_from_mbuf(NR_RX, dph, 0,
		    spkt->pkt_mbuf, 0, dlen, do_cksum_rx, iphlen);
	} else {
		FSW_STATS_INC(FSW_STATS_RX_COPY_PKT2PKT);
		/*
		 * Source packet has full contents, either from an mbuf
		 * that came up from the compat driver, or because it
		 * originated on the native driver; copy to destination.
		 */
		fsw->fsw_pkt_copy_from_pkt(NR_RX, dph, 0, sph,
		    (spkt->pkt_headroom + spkt->pkt_l2_len), dlen, do_cksum_rx,
		    iphlen, 0, FALSE);
	}

#if DEBUG || DEVELOPMENT
	if (__improbable(pkt_trailers > 0)) {
		dlen += pkt_add_trailers(dph, dlen, iphlen);
	}
#endif /* DEBUG || DEVELOPMENT */

	/* Finalize and attach packet to Rx ring */
	METADATA_ADJUST_LEN(dpkt, 0, 0);
	err = __packet_finalize(dph);
	VERIFY(err == 0);

	copy_packet_from_dev_log(spkt, dpkt, kernproc);

	/* source mbuf (if any) has been fully copied; account and free it */
	if (spkt->pkt_pflags & PKT_F_MBUF_DATA) {
		ifp_inc_traffic_class_in(fsw->fsw_ifp, spkt->pkt_mbuf);
		mbuf_free(spkt->pkt_mbuf);
		KPKT_CLEAR_MBUF_DATA(spkt);
	} else {
		fsw_ifp_inc_traffic_class_in_pkt(fsw->fsw_ifp, dph);
	}

	if (__probable(do_cksum_rx != 0)) {
		FSW_STATS_INC(FSW_STATS_RX_COPY_SUM);
	}

	return 0;
}
582 
/*
 * Run an Rx packet through IP reassembly when applicable.  Returns the
 * input packet unchanged when reassembly is skipped, NULL when the
 * fragment was consumed (queued awaiting more fragments, or freed as
 * invalid), or whatever packet(s) the reassembly module produced (see
 * fsw_ip_frag_reass_v4/v6).
 */
SK_NO_INLINE_ATTRIBUTE
static struct __kern_packet *
rx_process_ip_frag(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
{
	char *pkt_buf;
	void *l3_hdr;
	uint16_t nfrags, tlen;
	int err = 0;

	/* decide whether reassembly runs, honoring the force sysctl */
	switch (fsw_ip_reass) {
	case FSW_IP_REASS_FORCE_OFF:
		return pkt;
	case FSW_IP_REASS_FORCE_ON:
		break;
	default:
		/* without netagent/flows there is no consumer for the result */
		if (!FSW_NETAGENT_ENABLED(fsw) ||
		    flow_mgr_get_num_flows(fsw->fsw_flow_mgr) == 0) {
			return pkt;
		}
		break;
	}

	MD_BUFLET_ADDR_ABS(pkt, pkt_buf);
	l3_hdr = pkt_buf + pkt->pkt_headroom + pkt->pkt_l2_len;

	ASSERT(fsw->fsw_ipfm != NULL);
	ASSERT((pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED) != 0);

	if (pkt->pkt_flow_ip_ver == IPVERSION) {
		err = fsw_ip_frag_reass_v4(fsw->fsw_ipfm, &pkt,
		    (struct ip *)l3_hdr, &nfrags, &tlen);
	} else {
		ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);
		/* we only handle frag header immediately after v6 header */
		err = fsw_ip_frag_reass_v6(fsw->fsw_ipfm, &pkt,
		    (struct ip6_hdr *)l3_hdr,
		    (struct ip6_frag *)((uintptr_t)l3_hdr + sizeof(struct ip6_hdr)),
		    &nfrags, &tlen);
	}
	if (__improbable(err != 0)) {
		/* if we get a bad fragment, free it */
		pp_free_packet_single(pkt);
		pkt = NULL;
	} else {
		/* pkt and nfrags must agree: both set, or both empty */
		ASSERT(!((pkt != NULL) ^ (nfrags > 0)));
	}

	return pkt;
}
632 
/*
 * Prepare a compat-netif Rx packet whose payload lives in an attached
 * mbuf: copy the leading protocol headers from the mbuf into the packet
 * buffer so the flowswitch classifier can parse them, then finalize the
 * packet with the mbuf still attached.
 */
SK_NO_INLINE_ATTRIBUTE
static void
rx_prepare_packet_mbuf(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
{
	ASSERT(pkt->pkt_pflags & PKT_F_MBUF_DATA);
	uint32_t mlen = (uint32_t)m_pktlen(pkt->pkt_mbuf);
	kern_packet_t ph =  SK_PTR_ENCODE(pkt,
	    METADATA_TYPE(pkt), METADATA_SUBTYPE(pkt));
	/*
	 * This is the case when the packet is coming in from
	 * compat-netif. This packet only has valid metadata
	 * and an attached mbuf. We need to copy enough data
	 * from the mbuf to the packet buffer for the
	 * classifier. Compat netif packet pool is configured
	 * with buffer size of NETIF_COMPAT_MAX_MBUF_DATA_COPY
	 * which is just enough to hold the protocol headers
	 * for the flowswitch classifier.
	 */

	pkt->pkt_headroom = 0;
	METADATA_ADJUST_LEN(pkt, 0, 0);
	/*
	 * Copy the initial 128 bytes of the packet for
	 * classification.
	 * Ethernet(14) + IPv6 header(40) +
	 * + IPv6 fragment header(8) +
	 * TCP header with options(60).
	 */
	fsw->fsw_pkt_copy_from_mbuf(NR_RX, ph,
	    pkt->pkt_headroom, pkt->pkt_mbuf, 0,
	    MIN(mlen, NETIF_COMPAT_MAX_MBUF_DATA_COPY),
	    FALSE, 0);

	int err = __packet_finalize_with_mbuf(pkt);
	VERIFY(err == 0);
}
669 
670 static struct __kern_packet *
rx_prepare_packet(struct nx_flowswitch * fsw,struct __kern_packet * pkt)671 rx_prepare_packet(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
672 {
673 	pkt->pkt_qum_qflags &= ~QUM_F_FLOW_CLASSIFIED;
674 
675 	if (__improbable(pkt->pkt_pflags & PKT_F_MBUF_DATA)) {
676 		rx_prepare_packet_mbuf(fsw, pkt);
677 	}
678 
679 	return pkt;
680 }
681 
/*
 * Find the flow entry matching `pkt'.  `input' selects Rx (true) vs Tx
 * (false) key construction.  `prev_fe', when non-NULL, is the entry the
 * previous packet of the batch matched; it is tried first with a cheap
 * 5-tuple comparison before falling back to the full table lookup.
 * Parent/child (demux) flows are resolved here as well.  Returns an
 * entry with a reference held (caller must flow_entry_release()), or
 * NULL when no flow matches.
 */
static struct flow_entry *
lookup_flow_with_pkt(struct nx_flowswitch *fsw, struct __kern_packet *pkt,
    bool input, struct flow_entry *prev_fe)
{
	struct flow_key key __sk_aligned(16);
	struct flow_entry *fe = NULL;

	ASSERT(pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED);
	flow_pkt2key(pkt, input, &key);

	/* fast path: previous entry of this batch, compared as 5-tuple */
	if (__probable(prev_fe != NULL &&
	    prev_fe->fe_key.fk_mask == FKMASK_5TUPLE)) {
		uint16_t saved_mask = key.fk_mask;
		key.fk_mask = FKMASK_5TUPLE;
		if (flow_key_cmp_mask(&prev_fe->fe_key, &key, &fk_mask_5tuple) == 0) {
			flow_entry_retain(prev_fe);
			fe = prev_fe;
		} else {
			key.fk_mask = saved_mask;
		}
	}

top:
	if (__improbable(fe == NULL)) {
		fe = flow_mgr_find_fe_by_key(fsw->fsw_flow_mgr, &key);
	}

	/* resolve parent/child (demuxed) flows to the exact entry */
	if (__improbable(fe != NULL &&
	    (fe->fe_flags & (FLOWENT_PARENT | FLOWENT_CHILD)) != 0)) {
		/* Rx */
		if (input) {
			if (fe->fe_flags & FLOWENT_PARENT) {
				struct flow_entry *child_fe = rx_lookup_child_flow(fsw, fe, pkt);
				if (child_fe != NULL) {
					flow_entry_release(&fe);
					fe = child_fe;
				}
			} else {
				/* child did not match; retry full lookup */
				if (!rx_flow_demux_match(fsw, fe, pkt)) {
					flow_entry_release(&fe);
					fe = NULL;
					goto top;
				}
			}
		} else {
			/* Tx */
			if (__improbable(!_UUID_MATCH(pkt->pkt_flow_id, fe->fe_uuid))) {
				if (__probable(fe->fe_flags & FLOWENT_PARENT)) {
					struct flow_entry *parent_fe = fe;
					fe = tx_lookup_child_flow(parent_fe, pkt->pkt_flow_id);
					flow_entry_release(&parent_fe);
				} else {
					flow_entry_release(&fe);
					fe = NULL;
					goto top;
				}
			}
		}
	}

	SK_LOG_VAR(char fkbuf[FLOWKEY_DBGBUF_SIZE]);
	SK_DF(SK_VERB_FSW_DP | SK_VERB_LOOKUP,
	    "%s %s %s \"%s\" fe 0x%llx",
	    input ? "Rx" : "Tx", if_name(fsw->fsw_ifp),
	    sk_proc_name_address(current_proc()),
	    fk_as_string(&key, fkbuf, sizeof(fkbuf)),
	    SK_KVA(fe));

	return fe;
}
752 
753 static struct flow_entry *
rx_lookup_flow(struct nx_flowswitch * fsw,struct __kern_packet * pkt,struct flow_entry * prev_fe)754 rx_lookup_flow(struct nx_flowswitch *fsw, struct __kern_packet *pkt,
755     struct flow_entry *prev_fe)
756 {
757 	struct flow_entry *fe;
758 
759 	fe = lookup_flow_with_pkt(fsw, pkt, true, prev_fe);
760 	_FSW_INJECT_ERROR(2, fe, NULL, flow_entry_release, &fe);
761 	if (fe == NULL) {
762 		FSW_STATS_INC(FSW_STATS_RX_FLOW_NOT_FOUND);
763 		return NULL;
764 	}
765 
766 	if (__improbable(fe->fe_flags & FLOWENTF_TORN_DOWN)) {
767 		FSW_STATS_INC(FSW_STATS_RX_FLOW_TORNDOWN);
768 		SK_DF(SK_VERB_FSW_DP | SK_VERB_RX | SK_VERB_FLOW,
769 		    "Rx flow torn down");
770 		flow_entry_release(&fe);
771 		fe = NULL;
772 	}
773 
774 	return fe;
775 }
776 
777 static inline void
rx_flow_batch_packet(struct flow_entry_list * fes,struct flow_entry * fe,struct __kern_packet * pkt)778 rx_flow_batch_packet(struct flow_entry_list *fes, struct flow_entry *fe,
779     struct __kern_packet *pkt)
780 {
781 	if (__improbable(pkt->pkt_flow_ip_is_frag)) {
782 		fe->fe_rx_frag_count++;
783 	}
784 
785 	/* KPKTQ_ENQUEUE_LIST is needed until frags become chained buflet */
786 	if (KPKTQ_EMPTY(&fe->fe_rx_pktq)) {
787 		ASSERT(KPKTQ_LEN(&fe->fe_rx_pktq) == 0);
788 		TAILQ_INSERT_TAIL(fes, fe, fe_rx_link);
789 		KPKTQ_ENQUEUE_LIST(&fe->fe_rx_pktq, pkt);
790 	} else {
791 		ASSERT(!TAILQ_EMPTY(fes));
792 		KPKTQ_ENQUEUE_LIST(&fe->fe_rx_pktq, pkt);
793 		flow_entry_release(&fe);
794 	}
795 }
796 
/*
 * Stage a classified Tx packet on its flow entry's Tx queue.  The first
 * packet for an entry also links the entry onto `fes' and donates the
 * caller's reference to that list; for subsequent packets the extra
 * reference is dropped here.  Fragment-continuation state is recorded
 * so following non-first fragments can be matched to this flow.
 */
static void
tx_flow_batch_packet(struct flow_entry_list *fes, struct flow_entry *fe,
    struct __kern_packet *pkt)
{
	/* record frag continuation */
	if (__improbable(pkt->pkt_flow_ip_is_first_frag)) {
		ASSERT(pkt->pkt_flow_ip_is_frag);
		fe->fe_tx_is_cont_frag = true;
		fe->fe_tx_frag_id = pkt->pkt_flow_ip_frag_id;
	} else if (__probable(!pkt->pkt_flow_ip_is_frag)) {
		/* non-fragment traffic clears any continuation state */
		fe->fe_tx_is_cont_frag = false;
		fe->fe_tx_frag_id = 0;
	}

	if (KPKTQ_EMPTY(&fe->fe_tx_pktq)) {
		ASSERT(KPKTQ_LEN(&fe->fe_tx_pktq) == 0);
		TAILQ_INSERT_TAIL(fes, fe, fe_tx_link);
		KPKTQ_ENQUEUE(&fe->fe_tx_pktq, pkt);
	} else {
		ASSERT(!TAILQ_EMPTY(fes));
		KPKTQ_ENQUEUE(&fe->fe_tx_pktq, pkt);
		/* entry already staged; drop the caller's extra reference */
		flow_entry_release(&fe);
	}
}
821 
/*
 * Detach up to `n_pkts_max' packets from channel ring `r' (the span
 * between ckr_khead and ckr_rhead) onto `pktq'.  Packets flagged
 * QUM_F_DROPPED or with zero length are freed and counted as drops.
 * On return *n_bytes holds the total byte count of the dequeued
 * packets, and the ring's khead/ktail are advanced past the slots
 * consumed.
 */
static inline void
fsw_ring_dequeue_pktq(struct nx_flowswitch *fsw, struct __kern_channel_ring *r,
    uint32_t n_pkts_max, struct pktq *pktq, uint32_t *n_bytes)
{
	uint32_t n_pkts = 0;

	KPKTQ_INIT(pktq);

	slot_idx_t idx, idx_end;
	idx = r->ckr_khead;
	idx_end = r->ckr_rhead;
	struct nexus_vp_adapter *vpna = VPNA(KRNA(r));

	*n_bytes = 0;
	for (; n_pkts < n_pkts_max && idx != idx_end;
	    idx = SLOT_NEXT(idx, r->ckr_lim)) {
		struct __kern_slot_desc *ksd = KR_KSD(r, idx);
		struct __kern_packet *pkt = ksd->sd_pkt;

		ASSERT(pkt->pkt_nextpkt == NULL);
		KR_SLOT_DETACH_METADATA(r, ksd);

		/* error injection point 20: simulate a dropped packet */
		_FSW_INJECT_ERROR(20, pkt->pkt_qum_qflags,
		    pkt->pkt_qum_qflags | QUM_F_DROPPED, null_func);
		if (__improbable(((pkt->pkt_qum_qflags & QUM_F_DROPPED) != 0))
		    || (pkt->pkt_length == 0)) {
			FSW_STATS_INC(FSW_STATS_DROP);
			pp_free_packet_single(pkt);
			continue;
		}
		/* tag the packet with its source port for channel events */
		if (NA_CHANNEL_EVENT_ATTACHED(&vpna->vpna_up)) {
			__packet_set_tx_nx_port(SK_PKT2PH(pkt),
			    vpna->vpna_nx_port, vpna->vpna_gencnt);
		}

		n_pkts++;
		*n_bytes += pkt->pkt_length;

		KPKTQ_ENQUEUE(pktq, pkt);
	}

	/* advance the kernel view of the ring past the consumed slots */
	r->ckr_khead = idx;
	r->ckr_ktail = SLOT_PREV(idx, r->ckr_lim);
}
866 
/*
 * Attach as many packets from `pktq' as fit into Rx channel ring `r',
 * mark them finalized, update ring statistics and notify the channel.
 * Packets that do not fit are left on `pktq' for the caller to handle.
 */
static void
fsw_ring_enqueue_pktq(struct nx_flowswitch *fsw, struct __kern_channel_ring *r,
    struct pktq *pktq)
{
#pragma unused(fsw)
	struct __kern_packet *pkt;
	struct __kern_quantum *kqum;
	uint32_t kr_space_avail = 0;
	uint32_t n, n_pkts = 0, n_bytes = 0;
	slot_idx_t idx = 0, idx_start = 0, idx_end = 0;

	kr_enter(r, TRUE);

	/* error injection points 40/41: pretend the ring is full */
	idx_start = r->ckr_ktail;
	kr_space_avail = kr_available_slots_rxring(r);
	_FSW_INJECT_ERROR(40, kr_space_avail, 0, null_func);
	n = MIN(kr_space_avail, KPKTQ_LEN(pktq));
	_FSW_INJECT_ERROR(41, n, 0, null_func);
	idx_end = SLOT_INCREMENT(idx_start, n, r->ckr_lim);

	idx = idx_start;
	while (idx != idx_end) {
		KPKTQ_DEQUEUE(pktq, pkt);
		kqum = SK_PTR_ADDR_KQUM(pkt);
		kqum->qum_qflags |= QUM_F_FINALIZED;
		n_pkts++;
		n_bytes += pkt->pkt_length;
		KR_SLOT_ATTACH_METADATA(r, KR_KSD(r, idx), kqum);
		/* traced packets: close the FSW span, open the channel span */
		if (__improbable(pkt->pkt_trace_id != 0)) {
			KDBG(SK_KTRACE_PKT_RX_FSW | DBG_FUNC_END, pkt->pkt_trace_id);
			KDBG(SK_KTRACE_PKT_RX_CHN | DBG_FUNC_START, pkt->pkt_trace_id);
		}
		idx = SLOT_NEXT(idx, r->ckr_lim);
	}

	kr_update_stats(r, n_pkts, n_bytes);

	/*
	 * ensure slot attachments are visible before updating the
	 * tail pointer
	 */
	membar_sync();

	r->ckr_ktail = idx_end;

	kr_exit(r);

	/* wake up the channel consumer */
	r->ckr_na_notify(r, kernproc, NA_NOTEF_PUSH);

	SK_DF(SK_VERB_FSW_DP | SK_VERB_RING, "%s enqueued %d pkts",
	    r->ckr_name, n_pkts);
}
919 
920 static void
pkts_to_pktq(struct __kern_packet * pkts[],uint32_t n_pkts,struct pktq * pktq)921 pkts_to_pktq(struct __kern_packet *pkts[], uint32_t n_pkts, struct pktq *pktq)
922 {
923 	ASSERT(KPKTQ_EMPTY(pktq));
924 
925 	for (uint32_t i = 0; i < n_pkts; i++) {
926 		struct __kern_packet *pkt = pkts[i];
927 		ASSERT(pkt->pkt_nextpkt == NULL);
928 		KPKTQ_ENQUEUE(pktq, pkt);
929 	}
930 }
931 
932 /*
933  * This function is modeled after nx_netif_host_grab_pkts() in nx_netif_host.c.
934  */
SK_NO_INLINE_ATTRIBUTE
static void
/*
 * Convert a queue of native (skywalk) packets into an mbuf chain for
 * delivery to the host stack.  Returns the chain head/tail plus packet
 * and byte counts through the out parameters.  The source packets are
 * always freed here (pp_free_pktq) regardless of per-packet failures.
 *
 * mbufs are opportunistically batch-allocated up-front, sized by the
 * largest packet seen recently (fsw_rx_largest_size); per-packet
 * fallback allocation covers partial batch success.
 */
convert_native_pktq_to_mbufs(struct nx_flowswitch *fsw, struct pktq *pktq,
    struct mbuf **m_headp, struct mbuf **m_tailp, uint32_t *cnt, uint32_t *bytes)
{
	uint32_t tot_cnt;
	unsigned int num_segs = 1;
	struct mbuf *mhead, *head = NULL, *tail = NULL, **tailp = &head;
	uint32_t mhead_cnt, mhead_bufsize;
	uint32_t mhead_waste = 0;
	uint32_t mcnt = 0, mbytes = 0;
	uint32_t largest, max_pkt_len;
	struct __kern_packet *pkt;
	struct kern_pbufpool *pp;

	tot_cnt = KPKTQ_LEN(pktq);
	ASSERT(tot_cnt > 0);
	mhead_cnt = tot_cnt;

	/*
	 * Opportunistically batch-allocate the mbufs based on the largest
	 * packet size we've seen in the recent past.  Note that we reset
	 * fe_rx_largest_size below if we notice that we're under-utilizing the
	 * allocated buffers (thus disabling this batch allocation).
	 */
	largest = *(volatile uint32_t*)&fsw->fsw_rx_largest_size; /* read once */
	if (__probable(largest != 0)) {
		/* pick the smallest cluster size that fits "largest" */
		if (largest <= MCLBYTES) {
			mhead = m_allocpacket_internal(&mhead_cnt, MCLBYTES,
			    &num_segs, M_NOWAIT, 1, 0);
			mhead_bufsize = MCLBYTES;
		} else if (largest <= MBIGCLBYTES) {
			mhead = m_allocpacket_internal(&mhead_cnt, MBIGCLBYTES,
			    &num_segs, M_NOWAIT, 1, 0);
			mhead_bufsize = MBIGCLBYTES;
		} else if (largest <= M16KCLBYTES) {
			mhead = m_allocpacket_internal(&mhead_cnt, M16KCLBYTES,
			    &num_segs, M_NOWAIT, 1, 0);
			mhead_bufsize = M16KCLBYTES;
		} else if (largest <= M16KCLBYTES * 2) {
			/* two 16K clusters per mbuf packet */
			num_segs = 2;
			mhead = m_allocpacket_internal(&mhead_cnt, M16KCLBYTES * 2,
			    &num_segs, M_NOWAIT, 1, 0);
			mhead_bufsize = M16KCLBYTES * 2;
		} else {
			/* too large for batch allocation; per-packet fallback */
			mhead = NULL;
			mhead_bufsize = mhead_cnt = 0;
		}
	} else {
		/* batch allocation disabled (see mhead_waste reset below) */
		mhead = NULL;
		mhead_bufsize = mhead_cnt = 0;
	}
	DTRACE_SKYWALK4(bufstats, uint32_t, largest, uint32_t, mhead_bufsize,
	    uint32_t, mhead_cnt, uint32_t, tot_cnt);

	pp = __DECONST(struct kern_pbufpool *, KPKTQ_FIRST(pktq)->pkt_qum.qum_pp);
	max_pkt_len = PP_BUF_SIZE_DEF(pp) * pp->pp_max_frags;

	KPKTQ_FOREACH(pkt, pktq) {
		uint32_t tot_len, len;
		uint16_t pad, llhlen, iphlen;
		boolean_t do_cksum_rx;
		struct mbuf *m;
		int error;

		llhlen = pkt->pkt_l2_len;
		len = pkt->pkt_length;
		/* sanity-check lengths before copying */
		if (__improbable(len > max_pkt_len || llhlen > len)) {
			DTRACE_SKYWALK2(bad__len, struct nx_flowswitch *, fsw,
			    struct __kern_packet *, pkt);
			FSW_STATS_INC(FSW_STATS_DROP);
			FSW_STATS_INC(FSW_STATS_RX_COPY_BAD_LEN);
			continue;
		}
		/* begin payload on 32-bit boundary; figure out the padding */
		pad = (uint16_t)P2ROUNDUP(llhlen, sizeof(uint32_t)) - llhlen;
		tot_len = pad + len;

		/* remember largest packet size */
		if (__improbable(largest < tot_len)) {
			largest = MAX(tot_len, MCLBYTES);
		}

		/*
		 * If the above batch allocation returned partial
		 * success, we try a blocking allocation here again.
		 */
		m = mhead;
		if (__improbable(m == NULL || tot_len > mhead_bufsize)) {
			ASSERT(mhead != NULL || mhead_cnt == 0);
			num_segs = 1;
			if (tot_len > M16KCLBYTES) {
				/* 0 = let mbuf layer pick the segment count */
				num_segs = 0;
			}
			if ((error = mbuf_allocpacket(MBUF_DONTWAIT, tot_len,
			    &num_segs, &m)) != 0) {
				DTRACE_SKYWALK2(bad__len,
				    struct nx_flowswitch *, fsw,
				    struct __kern_packet *, pkt);
				FSW_STATS_INC(FSW_STATS_DROP_NOMEM_MBUF);
				FSW_STATS_INC(FSW_STATS_DROP);
				continue;
			}
		} else {
			/* take the next mbuf off the batch chain */
			mhead = m->m_nextpkt;
			m->m_nextpkt = NULL;
			ASSERT(mhead_cnt != 0);
			--mhead_cnt;

			/* check if we're underutilizing large buffers */
			if (__improbable(mhead_bufsize > MCLBYTES &&
			    tot_len < (mhead_bufsize >> 1))) {
				++mhead_waste;
			}
			if (__improbable(mhead_bufsize >= tot_len + M16KCLBYTES)) {
				FSW_STATS_INC(FSW_STATS_RX_WASTED_16KMBUF);
			}
		}
		m->m_data += pad;
		m->m_pkthdr.pkt_hdr = mtod(m, uint8_t *);

		/* don't include IP header from partial sum */
		if (__probable((pkt->pkt_qum_qflags &
		    QUM_F_FLOW_CLASSIFIED) != 0)) {
			iphlen = pkt->pkt_flow_ip_hlen;
			do_cksum_rx = sk_cksum_rx;
		} else {
			iphlen = 0;
			do_cksum_rx = FALSE;
		}

		/* copy packet payload into the mbuf (optionally checksumming) */
		fsw->fsw_pkt_copy_to_mbuf(NR_RX, SK_PKT2PH(pkt),
		    pkt->pkt_headroom, m, 0, len, do_cksum_rx,
		    llhlen + iphlen);

		FSW_STATS_INC(FSW_STATS_RX_COPY_PKT2MBUF);
		if (do_cksum_rx) {
			FSW_STATS_INC(FSW_STATS_RX_COPY_SUM);
		}
#if DEBUG || DEVELOPMENT
		if (__improbable(pkt_trailers > 0)) {
			(void) pkt_add_trailers_mbuf(m, llhlen + iphlen);
		}
#endif /* DEBUG || DEVELOPMENT */
		/* strip the link-layer header before handing to the stack */
		m_adj(m, llhlen);

		m->m_pkthdr.rcvif = fsw->fsw_ifp;
		if (__improbable((pkt->pkt_link_flags &
		    PKT_LINKF_ETHFCS) != 0)) {
			m->m_flags |= M_HASFCS;
		}
		if (__improbable(pkt->pkt_pflags & PKT_F_WAKE_PKT)) {
			m->m_pkthdr.pkt_flags |= PKTF_WAKE_PKT;
		}
		ASSERT(m->m_nextpkt == NULL);
		/* append to the output chain */
		tail = m;
		*tailp = m;
		tailp = &m->m_nextpkt;
		mcnt++;
		mbytes += m_pktlen(m);
	}
	/* free any leftovers */
	if (__improbable(mhead != NULL)) {
		DTRACE_SKYWALK1(mhead__leftover, uint32_t, mhead_cnt);
		ASSERT(mhead_cnt != 0);
		(void) m_freem_list(mhead);
		mhead = NULL;
		mhead_cnt = 0;
	}

	/* reset if most packets (>50%) are smaller than our batch buffers */
	if (__improbable(mhead_waste > ((uint32_t)tot_cnt >> 1))) {
		DTRACE_SKYWALK4(mhead__waste, struct nx_flowswitch *, fsw,
		    struct flow_entry *, NULL, uint32_t, mhead_waste,
		    uint32_t, tot_cnt);
		largest = 0;
	}

	/* publish the (possibly updated) size hint for next time */
	if (largest != fsw->fsw_rx_largest_size) {
		atomic_set_32(&fsw->fsw_rx_largest_size, largest);
	}

	pp_free_pktq(pktq);
	*m_headp = head;
	*m_tailp = tail;
	*cnt = mcnt;
	*bytes = mbytes;
}
1123 
1124 /*
1125  * This function only extracts the mbuf from the packet. The caller frees
1126  * the packet.
1127  */
/*
 * This function only extracts the mbuf from the packet. The caller frees
 * the packet.
 *
 * Detaches and returns the mbuf attached to a compat-mode packet,
 * propagating any partial RX checksum into the mbuf header and stripping
 * the link-layer header.  Returns NULL (and counts a drop) if the
 * recorded L2 header length exceeds the packet length.
 */
static inline struct mbuf *
convert_compat_pkt_to_mbuf(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
{
	struct mbuf *m;
	struct pkthdr *mhdr;
	uint16_t llhlen;

	m = pkt->pkt_mbuf;
	ASSERT(m != NULL);

	llhlen = pkt->pkt_l2_len;
	if (llhlen > pkt->pkt_length) {
		/* malformed: L2 header claims to be longer than the packet */
		m_freem(m);
		KPKT_CLEAR_MBUF_DATA(pkt);
		DTRACE_SKYWALK2(bad__len, struct nx_flowswitch *, fsw,
		    struct __kern_packet *, pkt);
		FSW_STATS_INC(FSW_STATS_DROP);
		FSW_STATS_INC(FSW_STATS_RX_COPY_BAD_LEN);
		return NULL;
	}
	mhdr = &m->m_pkthdr;
	/* carry a partial checksum over to the mbuf if one exists */
	if ((mhdr->csum_flags & CSUM_DATA_VALID) == 0 &&
	    PACKET_HAS_PARTIAL_CHECKSUM(pkt)) {
		mhdr->csum_flags &= ~CSUM_RX_FLAGS;
		mhdr->csum_flags |= (CSUM_DATA_VALID | CSUM_PARTIAL);
		mhdr->csum_rx_start = pkt->pkt_csum_rx_start_off;
		mhdr->csum_rx_val = pkt->pkt_csum_rx_value;
	}
#if DEBUG || DEVELOPMENT
	uint32_t extra = 0;
	if (__improbable(pkt_trailers > 0)) {
		extra = pkt_add_trailers_mbuf(m, llhlen);
	}
#endif /* DEBUG || DEVELOPMENT */
	m_adj(m, llhlen);
	/*
	 * NOTE(review): "extra" only exists under DEBUG || DEVELOPMENT;
	 * this relies on ASSERT compiling away in release builds.
	 */
	ASSERT((uint32_t)m_pktlen(m) == ((pkt->pkt_length - llhlen) + extra));
	KPKT_CLEAR_MBUF_DATA(pkt);
	return m;
}
1167 
1168 SK_NO_INLINE_ATTRIBUTE
1169 static void
convert_compat_pktq_to_mbufs(struct nx_flowswitch * fsw,struct pktq * pktq,struct mbuf ** m_head,struct mbuf ** m_tail,uint32_t * cnt,uint32_t * bytes)1170 convert_compat_pktq_to_mbufs(struct nx_flowswitch *fsw, struct pktq *pktq,
1171     struct mbuf **m_head, struct mbuf **m_tail, uint32_t *cnt, uint32_t *bytes)
1172 {
1173 	struct __kern_packet *pkt;
1174 	struct mbuf *m, *head = NULL, *tail = NULL, **tailp = &head;
1175 	uint32_t c = 0, b = 0;
1176 
1177 	KPKTQ_FOREACH(pkt, pktq) {
1178 		m = convert_compat_pkt_to_mbuf(fsw, pkt);
1179 		if (__improbable(m == NULL)) {
1180 			continue;
1181 		}
1182 		tail = m;
1183 		*tailp = m;
1184 		tailp = &m->m_nextpkt;
1185 		c++;
1186 		b += m_pktlen(m);
1187 	}
1188 	pp_free_pktq(pktq);
1189 	*m_head = head;
1190 	*m_tail = tail;
1191 	*cnt = c;
1192 	*bytes = b;
1193 }
1194 
1195 void
fsw_host_sendup(ifnet_t ifp,struct mbuf * m_head,struct mbuf * m_tail,uint32_t cnt,uint32_t bytes)1196 fsw_host_sendup(ifnet_t ifp, struct mbuf *m_head, struct mbuf *m_tail,
1197     uint32_t cnt, uint32_t bytes)
1198 {
1199 	struct ifnet_stat_increment_param s;
1200 
1201 	bzero(&s, sizeof(s));
1202 	s.packets_in = cnt;
1203 	s.bytes_in = bytes;
1204 	dlil_input_handler(ifp, m_head, m_tail, &s, FALSE, NULL);
1205 }
1206 
1207 void
fsw_host_rx(struct nx_flowswitch * fsw,struct pktq * pktq)1208 fsw_host_rx(struct nx_flowswitch *fsw, struct pktq *pktq)
1209 {
1210 	struct mbuf *m_head = NULL, *m_tail = NULL;
1211 	uint32_t cnt = 0, bytes = 0;
1212 	boolean_t compat;
1213 
1214 	ASSERT(!KPKTQ_EMPTY(pktq));
1215 
1216 	/* All packets in the pktq must have the same type */
1217 	compat = ((KPKTQ_FIRST(pktq)->pkt_pflags & PKT_F_MBUF_DATA) != 0);
1218 	if (compat) {
1219 		convert_compat_pktq_to_mbufs(fsw, pktq, &m_head, &m_tail, &cnt,
1220 		    &bytes);
1221 	} else {
1222 		convert_native_pktq_to_mbufs(fsw, pktq, &m_head, &m_tail, &cnt,
1223 		    &bytes);
1224 	}
1225 	if (__improbable(m_head == NULL)) {
1226 		DTRACE_SKYWALK1(empty__head, struct nx_flowswitch *, fsw);
1227 		return;
1228 	}
1229 	fsw_host_sendup(fsw->fsw_ifp, m_head, m_tail, cnt, bytes);
1230 }
1231 
/*
 * Enqueue as many packets as fit into the ring; whatever could not be
 * transferred is left on pktq by fsw_ring_enqueue_pktq(), so the
 * KPKTQ_LEN below counts exactly the ring-full leftovers, which are
 * then dropped.
 */
void
fsw_ring_enqueue_tail_drop(struct nx_flowswitch *fsw,
    struct __kern_channel_ring *r, struct pktq *pktq)
{
	fsw_ring_enqueue_pktq(fsw, r, pktq);
	FSW_STATS_ADD(FSW_STATS_RX_DST_RING_FULL, KPKTQ_LEN(pktq));
	dp_drop_pktq(fsw, pktq);
}
1240 
1241 static struct nexus_adapter *
flow_get_na(struct nx_flowswitch * fsw,struct flow_entry * fe)1242 flow_get_na(struct nx_flowswitch *fsw, struct flow_entry *fe)
1243 {
1244 	struct kern_nexus *nx = fsw->fsw_nx;
1245 	struct nexus_adapter *na = NULL;
1246 	nexus_port_t port = fe->fe_nx_port;
1247 
1248 	if (port == FSW_VP_DEV || port == FSW_VP_HOST) {
1249 		SK_ERR("dev or host ports have no NA");
1250 		return NULL;
1251 	}
1252 
1253 	if (__improbable(!nx_port_is_valid(nx, port))) {
1254 		SK_DF(SK_VERB_FSW_DP, "%s[%d] port no longer valid",
1255 		    if_name(fsw->fsw_ifp), port);
1256 		return NULL;
1257 	}
1258 
1259 	na = nx_port_get_na(nx, port);
1260 	if (__improbable(na == NULL)) {
1261 		FSW_STATS_INC(FSW_STATS_DST_NXPORT_INVALID);
1262 		SK_DF(SK_VERB_FSW_DP, "%s[%d] NA no longer valid",
1263 		    if_name(fsw->fsw_ifp), port);
1264 		return NULL;
1265 	}
1266 
1267 	if (__improbable(!NA_IS_ACTIVE(na))) {
1268 		FSW_STATS_INC(FSW_STATS_DST_NXPORT_INACTIVE);
1269 		SK_DF(SK_VERB_FSW_DP, "%s[%d] NA no longer active",
1270 		    if_name(fsw->fsw_ifp), port);
1271 		return NULL;
1272 	}
1273 
1274 	if (__improbable(nx_port_is_defunct(nx, port))) {
1275 		FSW_STATS_INC(FSW_STATS_DST_NXPORT_DEFUNCT);
1276 		SK_DF(SK_VERB_FSW_DP, "%s[%d] NA defuncted",
1277 		    if_name(fsw->fsw_ifp), port);
1278 		return NULL;
1279 	}
1280 
1281 	return na;
1282 }
1283 
1284 static inline struct __kern_channel_ring *
flow_get_ring(struct nx_flowswitch * fsw,struct flow_entry * fe,enum txrx txrx)1285 flow_get_ring(struct nx_flowswitch *fsw, struct flow_entry *fe, enum txrx txrx)
1286 {
1287 	struct nexus_vp_adapter *na = NULL;
1288 	struct __kern_channel_ring *r = NULL;
1289 
1290 	na = VPNA(flow_get_na(fsw, fe));
1291 	if (__improbable(na == NULL)) {
1292 		return NULL;
1293 	}
1294 
1295 	switch (txrx) {
1296 	case NR_RX:
1297 		r = &na->vpna_up.na_rx_rings[0];
1298 		break;
1299 	case NR_TX:
1300 		r = &na->vpna_up.na_tx_rings[0];
1301 		break;
1302 	default:
1303 		__builtin_unreachable();
1304 		VERIFY(0);
1305 	}
1306 
1307 	if (__improbable(KR_DROP(r))) {
1308 		FSW_STATS_INC(FSW_STATS_DST_RING_DROPMODE);
1309 		SK_DF(SK_VERB_FSW_DP | SK_VERB_RING, "r %0xllx %s drop mode",
1310 		    r->ckr_name, SK_KVA(r));
1311 		return NULL;
1312 	}
1313 
1314 	ASSERT(KRNA(r)->na_md_type == NEXUS_META_TYPE_PACKET);
1315 
1316 #if (DEVELOPMENT || DEBUG)
1317 	if (r != NULL) {
1318 		_FSW_INJECT_ERROR(4, r, NULL, null_func);
1319 	}
1320 #endif /* DEVELOPMENT || DEBUG */
1321 
1322 	return r;
1323 }
1324 
1325 struct __kern_channel_ring *
fsw_flow_get_rx_ring(struct nx_flowswitch * fsw,struct flow_entry * fe)1326 fsw_flow_get_rx_ring(struct nx_flowswitch *fsw, struct flow_entry *fe)
1327 {
1328 	return flow_get_ring(fsw, fe, NR_RX);
1329 }
1330 
1331 static inline struct __kern_channel_ring *
fsw_flow_get_tx_ring(struct nx_flowswitch * fsw,struct flow_entry * fe)1332 fsw_flow_get_tx_ring(struct nx_flowswitch *fsw, struct flow_entry *fe)
1333 {
1334 	return flow_get_ring(fsw, fe, NR_TX);
1335 }
1336 
/*
 * Validate a flow's route/source-address state.  If the flow's local
 * address is gone, flag the flow for nonviability (reaper finalizes it
 * under the writer lock) and request route reconfiguration.  Returns
 * false when the flow is, or is about to become, nonviable — the caller
 * should drop its packets.
 */
static bool
dp_flow_route_process(struct nx_flowswitch *fsw, struct flow_entry *fe)
{
	struct flow_route *fr = fe->fe_route;
	struct ifnet *ifp = fsw->fsw_ifp;

	/*
	 * Only re-validate when the flow isn't already (pending) nonviable,
	 * has a source key, and the interface address generation moved.
	 */
	if (__improbable(!(fe->fe_flags & FLOWENTF_NONVIABLE) &&
	    !fe->fe_want_nonviable && (fe->fe_key.fk_mask & FKMASK_SRC) &&
	    fe->fe_laddr_gencnt != ifp->if_nx_flowswitch.if_fsw_ipaddr_gencnt &&
	    !flow_route_key_validate(&fe->fe_key, ifp, &fe->fe_laddr_gencnt))) {
		/*
		 * The source address is no longer around; we want this
		 * flow to be nonviable, but that requires holding the lock
		 * as writer (which isn't the case now.)  Indicate that
		 * we need to finalize the nonviable later down below.
		 *
		 * We also request that the flow route be re-configured,
		 * if this is a connected mode flow.
		 *
		 */
		if (!(fe->fe_flags & FLOWENTF_NONVIABLE)) {
			/*
			 * fsw_pending_nonviable is a hint for reaper thread;
			 * due to the fact that setting fe_want_nonviable and
			 * incrementing fsw_pending_nonviable counter is not
			 * atomic, let the increment happen first, and the
			 * thread losing the CAS does decrement.
			 */
			atomic_add_32(&fsw->fsw_pending_nonviable, 1);
			if (atomic_test_set_32(&fe->fe_want_nonviable, 0, 1)) {
				fsw_reap_sched(fsw);
			} else {
				atomic_add_32(&fsw->fsw_pending_nonviable, -1);
			}
		}
		if (fr != NULL) {
			atomic_add_32(&fr->fr_want_configure, 1);
		}
	}

	/* if flow was (or is going to be) marked as nonviable, drop it */
	if (__improbable(fe->fe_want_nonviable ||
	    (fe->fe_flags & FLOWENTF_NONVIABLE) != 0)) {
		SK_DF(SK_VERB_FSW_DP | SK_VERB_FLOW, "flow 0x%llx non-viable",
		    SK_KVA(fe));
		return false;
	}
	return true;
}
1386 
1387 bool
dp_flow_rx_route_process(struct nx_flowswitch * fsw,struct flow_entry * fe)1388 dp_flow_rx_route_process(struct nx_flowswitch *fsw, struct flow_entry *fe)
1389 {
1390 	bool okay;
1391 	okay = dp_flow_route_process(fsw, fe);
1392 #if (DEVELOPMENT || DEBUG)
1393 	if (okay) {
1394 		_FSW_INJECT_ERROR(5, okay, false, null_func);
1395 	}
1396 #endif /* DEVELOPMENT || DEBUG */
1397 
1398 	return okay;
1399 }
1400 
/*
 * Default RX processing for a flow entry: validate the route, run flow
 * tracking on each packet in fe_rx_pktq, copy packets from the device
 * pool into the destination (channel) pool when they differ, stamp the
 * flow metadata, and enqueue the result onto the flow's RX ring
 * (tail-dropping what does not fit).  Packets destined to the host port
 * take fsw_host_rx() instead.
 */
void
dp_flow_rx_process(struct nx_flowswitch *fsw, struct flow_entry *fe)
{
	struct pktq dpkts;              /* dst pool alloc'ed packets */
	struct pktq disposed_pkts;         /* done src packets */
	struct pktq dropped_pkts;         /* dropped src packets */
	struct pktq transferred_pkts;         /* dst packet ready for ring */
	struct __kern_packet *pkt, *tpkt;
	struct kern_pbufpool *dpp;
	uint32_t n_pkts = KPKTQ_LEN(&fe->fe_rx_pktq);
	uint64_t buf_array[RX_BUFLET_BATCH_COUNT];
	uint16_t buf_array_iter = 0;
	uint32_t cnt, buf_cnt = 0;
	int err;

	KPKTQ_INIT(&dpkts);
	KPKTQ_INIT(&dropped_pkts);
	KPKTQ_INIT(&disposed_pkts);
	KPKTQ_INIT(&transferred_pkts);

	/* nonviable flow: drop everything (still snooped for pktap) */
	if (__improbable(!dp_flow_rx_route_process(fsw, fe))) {
		SK_ERR("Rx route bad");
		fsw_snoop_and_dequeue(fe, &dropped_pkts, true);
		FSW_STATS_ADD(FSW_STATS_RX_FLOW_NONVIABLE, n_pkts);
		goto done;
	}

	if (fe->fe_nx_port == FSW_VP_HOST) {
		/*
		 * The host ring does not exist anymore so we can't take
		 * the enqueue path below. This path should only be hit
		 * for the rare tcp fragmentation case.
		 */
		/* all queues are still empty here, so returning is safe */
		fsw_host_rx(fsw, &fe->fe_rx_pktq);
		return;
	}

	/* find the ring */
	struct __kern_channel_ring *r;
	r = fsw_flow_get_rx_ring(fsw, fe);
	if (__improbable(r == NULL)) {
		fsw_snoop_and_dequeue(fe, &dropped_pkts, true);
		goto done;
	}

	/* snoop before L2 is stripped */
	if (__improbable(pktap_total_tap_count != 0)) {
		fsw_snoop(fsw, fe, true);
	}

	dpp = r->ckr_pp;
	/* batch allocate enough packets */
	err = pp_alloc_pktq(dpp, 1, &dpkts, n_pkts, NULL, NULL,
	    SKMEM_NOSLEEP);
	if (__improbable(err == ENOMEM)) {
		ASSERT(KPKTQ_EMPTY(&dpkts));
		KPKTQ_CONCAT(&dropped_pkts, &fe->fe_rx_pktq);
		FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT, n_pkts);
		SK_ERR("failed to alloc %u pkts for kr %s, 0x%llu", n_pkts,
		    r->ckr_name, SK_KVA(r));
		goto done;
	}

	/*
	 * estimate total number of buflets for the packet chain.
	 */
	cnt = howmany(fe->fe_rx_pktq_bytes, PP_BUF_SIZE_DEF(dpp));
	if (cnt > n_pkts) {
		/* multi-buflet packets needed; pre-allocate extra buflets */
		ASSERT(dpp->pp_max_frags > 1);
		cnt -= n_pkts;
		buf_cnt = MIN(RX_BUFLET_BATCH_COUNT, cnt);
		err = pp_alloc_buflet_batch(dpp, buf_array, &buf_cnt,
		    SKMEM_NOSLEEP, PP_ALLOC_BFT_ATTACH_BUFFER);
		if (__improbable(buf_cnt == 0)) {
			KPKTQ_CONCAT(&dropped_pkts, &fe->fe_rx_pktq);
			FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT, n_pkts);
			SK_ERR("failed to alloc %d buflets (err %d) for kr %s, "
			    "0x%llu", cnt, err, r->ckr_name, SK_KVA(r));
			goto done;
		}
		err = 0;
	}

	/* extra processing for user flow */
	KPKTQ_FOREACH_SAFE(pkt, &fe->fe_rx_pktq, tpkt) {
		err = 0;
		KPKTQ_REMOVE(&fe->fe_rx_pktq, pkt);
		/* keep the remaining-bytes estimate in sync (saturating) */
		if (fe->fe_rx_pktq_bytes > pkt->pkt_flow_ulen) {
			fe->fe_rx_pktq_bytes -= pkt->pkt_flow_ulen;
		} else {
			fe->fe_rx_pktq_bytes = 0;
		}
		err = flow_pkt_track(fe, pkt, true);
		_FSW_INJECT_ERROR(33, err, EPROTO, null_func);
		if (__improbable(err != 0)) {
			SK_ERR("flow_pkt_track failed (err %d)", err);
			FSW_STATS_INC(FSW_STATS_RX_FLOW_TRACK_ERR);
			/* if need to trigger RST */
			if (err == ENETRESET) {
				flow_track_abort_tcp(fe, pkt, NULL);
			}
			KPKTQ_ENQUEUE(&dropped_pkts, pkt);
			continue;
		}

		/* transfer to dpkt */
		if (pkt->pkt_qum.qum_pp != dpp) {
			struct __kern_buflet *bprev, *bnew;
			struct __kern_packet *dpkt = NULL;
			uint32_t n_bufs, i;

			KPKTQ_DEQUEUE(&dpkts, dpkt);
			if (__improbable(dpkt == NULL)) {
				FSW_STATS_INC(FSW_STATS_DROP_NOMEM_PKT);
				KPKTQ_ENQUEUE(&dropped_pkts, pkt);
				continue;
			}
			/* dpkt already has one buflet; attach the rest */
			n_bufs = howmany(pkt->pkt_length, PP_BUF_SIZE_DEF(dpp));
			n_bufs--;
			for (i = 0; i < n_bufs; i++) {
				if (__improbable(buf_cnt == 0)) {
					/* batch exhausted; re-estimate and refill */
					ASSERT(dpp->pp_max_frags > 1);
					buf_array_iter = 0;
					cnt = howmany(fe->fe_rx_pktq_bytes,
					    PP_BUF_SIZE_DEF(dpp));
					n_pkts = KPKTQ_LEN(&fe->fe_rx_pktq);
					if (cnt >= n_pkts) {
						cnt -= n_pkts;
					} else {
						cnt = 0;
					}
					cnt += (n_bufs - i);
					buf_cnt = MIN(RX_BUFLET_BATCH_COUNT,
					    cnt);
					cnt = buf_cnt;
					err = pp_alloc_buflet_batch(dpp,
					    buf_array, &buf_cnt,
					    SKMEM_NOSLEEP, PP_ALLOC_BFT_ATTACH_BUFFER);
					if (__improbable(buf_cnt == 0)) {
						FSW_STATS_INC(FSW_STATS_DROP_NOMEM_PKT);
						KPKTQ_ENQUEUE(&dropped_pkts,
						    pkt);
						pkt = NULL;
						pp_free_packet_single(dpkt);
						dpkt = NULL;
						SK_ERR("failed to alloc %d "
						    "buflets (err %d) for "
						    "kr %s, 0x%llu", cnt, err,
						    r->ckr_name, SK_KVA(r));
						break;
					}
					err = 0;
				}
				ASSERT(buf_cnt != 0);
				if (i == 0) {
					PKT_GET_FIRST_BUFLET(dpkt, 1, bprev);
				}
				bnew = (kern_buflet_t)buf_array[buf_array_iter];
				buf_array[buf_array_iter] = 0;
				buf_array_iter++;
				buf_cnt--;
				VERIFY(kern_packet_add_buflet(SK_PKT2PH(dpkt),
				    bprev, bnew) == 0);
				bprev = bnew;
			}
			/* err != 0 here only from the refill failure above */
			if (__improbable(err != 0)) {
				continue;
			}
			err = copy_packet_from_dev(fsw, pkt, dpkt);
			_FSW_INJECT_ERROR(43, err, EINVAL, null_func);
			if (__improbable(err != 0)) {
				SK_ERR("copy packet failed (err %d)", err);
				KPKTQ_ENQUEUE(&dropped_pkts, pkt);
				pp_free_packet_single(dpkt);
				dpkt = NULL;
				continue;
			}
			/* source packet done; continue with the dst copy */
			KPKTQ_ENQUEUE(&disposed_pkts, pkt);
			pkt = dpkt;
		}
		/* stamp flow identity/policy onto the outgoing packet */
		_UUID_COPY(pkt->pkt_flow_id, fe->fe_uuid);
		_UUID_COPY(pkt->pkt_policy_euuid, fe->fe_eproc_uuid);
		pkt->pkt_policy_id = fe->fe_policy_id;
		pkt->pkt_transport_protocol = fe->fe_transport_protocol;
		if (pkt->pkt_bufs_cnt > 1) {
			pkt->pkt_aggr_type = PKT_AGGR_SINGLE_IP;
			pkt->pkt_seg_cnt = 1;
		}
		KPKTQ_ENQUEUE(&transferred_pkts, pkt);
	}
	/* fe_rx_pktq now holds only the ring-ready packets */
	KPKTQ_FINI(&fe->fe_rx_pktq);
	KPKTQ_CONCAT(&fe->fe_rx_pktq, &transferred_pkts);
	KPKTQ_FINI(&transferred_pkts);

	fsw_ring_enqueue_tail_drop(fsw, r, &fe->fe_rx_pktq);

done:
	/*
	 * Free unused buflets.  Note: dpp is only assigned after the ring
	 * lookup, but buf_cnt == 0 on every path that jumps here earlier,
	 * so dpp is never read uninitialized.
	 */
	while (buf_cnt > 0) {
		pp_free_buflet(dpp, (kern_buflet_t)(buf_array[buf_array_iter]));
		buf_array[buf_array_iter] = 0;
		buf_array_iter++;
		buf_cnt--;
	}
	dp_free_pktq(fsw, &dpkts);
	dp_free_pktq(fsw, &disposed_pkts);
	dp_drop_pktq(fsw, &dropped_pkts);
}
1609 
/*
 * Run a flow entry's RX handler on its batched packet queue.  The
 * handler (fe_rx_process: default, aggregation, fpd, ...) is expected
 * to consume fe_rx_pktq; this function then finalizes the queue and
 * schedules the reaper if the flow asked to be withdrawn.
 */
static inline void
rx_flow_process(struct nx_flowswitch *fsw, struct flow_entry *fe)
{
	ASSERT(!KPKTQ_EMPTY(&fe->fe_rx_pktq));
	ASSERT(KPKTQ_LEN(&fe->fe_rx_pktq) != 0);

	SK_DF(SK_VERB_FSW_DP | SK_VERB_RX, "Rx %d pkts for fe %p port %d",
	    KPKTQ_LEN(&fe->fe_rx_pktq), fe, fe->fe_nx_port);

	/* flow related processing (default, agg, fpd, etc.) */
	fe->fe_rx_process(fsw, fe);

	if (__improbable(fe->fe_want_withdraw)) {
		fsw_reap_sched(fsw);
	}

	KPKTQ_FINI(&fe->fe_rx_pktq);
}
1628 
/*
 * Record a wake packet with the ports-used subsystem when the packet is
 * flagged as the one that woke the system.  On DEBUG/DEVELOPMENT builds
 * the one-shot IFXF_MARK_WAKE_PKT interface flag forces the next packet
 * to be treated as a wake packet for testing.
 */
static inline void
dp_rx_process_wake_packet(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
{
	/*
	 * We only care about wake packets of flows that belong the flow switch
	 * as wake packets for the host stack are handled by the host input
	 * function
	 */
#if (DEBUG || DEVELOPMENT)
	if (__improbable(fsw->fsw_ifp->if_xflags & IFXF_MARK_WAKE_PKT)) {
		/*
		 * This is a one shot command
		 */
		fsw->fsw_ifp->if_xflags &= ~IFXF_MARK_WAKE_PKT;

		pkt->pkt_pflags |= PKT_F_WAKE_PKT;
	}
#endif /* (DEBUG || DEVELOPMENT) */
	if (__improbable(pkt->pkt_pflags & PKT_F_WAKE_PKT)) {
		if_ports_used_match_pkt(fsw->fsw_ifp, pkt);
	}
}
1651 
/*
 * Core RX path, called with the flowswitch lock held (reader): demux and
 * classify each input packet, batch it onto its flow entry, then process
 * each touched flow.  Packets that cannot be classified or matched to a
 * flow are delivered to the host stack; everything is dropped if the
 * flowswitch is quiesced or has no demux function.
 */
static void
_fsw_receive_locked(struct nx_flowswitch *fsw, struct pktq *pktq)
{
	struct __kern_packet *pkt, *tpkt;
	struct flow_entry_list fes = TAILQ_HEAD_INITIALIZER(fes);
	struct flow_entry *fe, *prev_fe;
	sa_family_t af;
	struct pktq host_pkts, dropped_pkts;
	int err;

	KPKTQ_INIT(&host_pkts);
	KPKTQ_INIT(&dropped_pkts);

	if (__improbable(FSW_QUIESCED(fsw))) {
		DTRACE_SKYWALK1(rx__quiesced, struct nx_flowswitch *, fsw);
		KPKTQ_CONCAT(&dropped_pkts, pktq);
		goto done;
	}
	if (__improbable(fsw->fsw_demux == NULL)) {
		KPKTQ_CONCAT(&dropped_pkts, pktq);
		goto done;
	}

	/* prev_fe lets rx_lookup_flow() hit the same flow cheaply */
	prev_fe = NULL;
	KPKTQ_FOREACH_SAFE(pkt, pktq, tpkt) {
		if (__probable(tpkt)) {
			/* warm the caches for the next packet's data */
			void *baddr;
			MD_BUFLET_ADDR_ABS_PKT(tpkt, baddr);
			SK_PREFETCH(baddr, 0);
			/* prefetch L3 and L4 flow structs */
			SK_PREFETCHW(tpkt->pkt_flow, 0);
			SK_PREFETCHW(tpkt->pkt_flow, 128);
		}

		KPKTQ_REMOVE(pktq, pkt);

		pkt = rx_prepare_packet(fsw, pkt);

		af = fsw->fsw_demux(fsw, pkt);
		if (__improbable(af == AF_UNSPEC)) {
			/* not for the flowswitch; host stack gets it */
			KPKTQ_ENQUEUE(&host_pkts, pkt);
			continue;
		}

		err = flow_pkt_classify(pkt, fsw->fsw_ifp, af, TRUE);
		_FSW_INJECT_ERROR(1, err, ENXIO, null_func);
		if (__improbable(err != 0)) {
			FSW_STATS_INC(FSW_STATS_RX_FLOW_EXTRACT_ERR);
			KPKTQ_ENQUEUE(&host_pkts, pkt);
			continue;
		}

		/* reassembly may swallow the fragment or return a chain */
		if (__improbable(pkt->pkt_flow_ip_is_frag)) {
			pkt = rx_process_ip_frag(fsw, pkt);
			if (pkt == NULL) {
				continue;
			}
		}

#if DEVELOPMENT || DEBUG
		trace_pkt_dump_payload(fsw->fsw_ifp, pkt, true);
#endif /* DEVELOPMENT || DEBUG */

		prev_fe = fe = rx_lookup_flow(fsw, pkt, prev_fe);
		if (__improbable(fe == NULL)) {
			/* _LIST: pkt may be a chain after frag reassembly */
			KPKTQ_ENQUEUE_LIST(&host_pkts, pkt);
			continue;
		}

		fe->fe_rx_pktq_bytes += pkt->pkt_flow_ulen;

		dp_rx_process_wake_packet(fsw, pkt);

		/* park the packet on the flow; fe joins "fes" on first pkt */
		rx_flow_batch_packet(&fes, fe, pkt);
		prev_fe = fe;
	}

	/* process every flow that accumulated packets this round */
	struct flow_entry *tfe = NULL;
	TAILQ_FOREACH_SAFE(fe, &fes, fe_rx_link, tfe) {
		rx_flow_process(fsw, fe);
		TAILQ_REMOVE(&fes, fe, fe_rx_link);
		fe->fe_rx_pktq_bytes = 0;
		fe->fe_rx_frag_count = 0;
		flow_entry_release(&fe);
	}

	if (!KPKTQ_EMPTY(&host_pkts)) {
		fsw_host_rx(fsw, &host_pkts);
	}

done:
	dp_drop_pktq(fsw, &dropped_pkts);
}
1745 
1746 #if (DEVELOPMENT || DEBUG)
1747 static void
fsw_rps_rx(struct nx_flowswitch * fsw,uint32_t id,struct __kern_packet * pkt)1748 fsw_rps_rx(struct nx_flowswitch *fsw, uint32_t id,
1749     struct __kern_packet *pkt)
1750 {
1751 	struct fsw_rps_thread *frt = &fsw->fsw_rps_threads[id];
1752 
1753 	lck_mtx_lock_spin(&frt->frt_lock);
1754 	KPKTQ_ENQUEUE(&frt->frt_pktq, pkt);
1755 	lck_mtx_unlock(&frt->frt_lock);
1756 }
1757 
1758 static void
fsw_rps_thread_schedule(struct nx_flowswitch * fsw,uint32_t id)1759 fsw_rps_thread_schedule(struct nx_flowswitch *fsw, uint32_t id)
1760 {
1761 	struct fsw_rps_thread *frt = &fsw->fsw_rps_threads[id];
1762 
1763 	ASSERT(frt->frt_thread != THREAD_NULL);
1764 	lck_mtx_lock_spin(&frt->frt_lock);
1765 	ASSERT(!(frt->frt_flags & (FRT_TERMINATING | FRT_TERMINATED)));
1766 
1767 	frt->frt_requests++;
1768 	if (!(frt->frt_flags & FRT_RUNNING)) {
1769 		thread_wakeup((caddr_t)frt);
1770 	}
1771 	lck_mtx_unlock(&frt->frt_lock);
1772 }
1773 
/*
 * Continuation body of an RPS worker thread.  Drains the thread's
 * packet queue through _fsw_receive_locked() until no new requests
 * arrive, then blocks again with itself as the continuation; handles
 * termination requests by tearing the thread down.
 */
__attribute__((noreturn))
static void
fsw_rps_thread_cont(void *v, wait_result_t w)
{
	struct fsw_rps_thread *frt = v;
	struct nx_flowswitch *fsw = frt->frt_fsw;

	lck_mtx_lock(&frt->frt_lock);
	if (__improbable(w == THREAD_INTERRUPTIBLE ||
	    (frt->frt_flags & FRT_TERMINATING) != 0)) {
		goto terminate;
	}
	if (KPKTQ_EMPTY(&frt->frt_pktq)) {
		goto done;
	}
	frt->frt_flags |= FRT_RUNNING;

	for (;;) {
		/* snapshot requests; new ones arriving mean another pass */
		uint32_t requests = frt->frt_requests;
		struct pktq pkts;

		/* steal the whole queue, then process it unlocked */
		KPKTQ_INIT(&pkts);
		KPKTQ_CONCAT(&pkts, &frt->frt_pktq);
		lck_mtx_unlock(&frt->frt_lock);

		sk_protect_t protect;
		protect = sk_sync_protect();
		FSW_RLOCK(fsw);
		_fsw_receive_locked(fsw, &pkts);
		FSW_RUNLOCK(fsw);
		sk_sync_unprotect(protect);

		lck_mtx_lock(&frt->frt_lock);
		if ((frt->frt_flags & FRT_TERMINATING) != 0 ||
		    requests == frt->frt_requests) {
			frt->frt_requests = 0;
			break;
		}
	}

done:
	/*
	 * NOTE(review): frt_flags is read/cleared below after dropping
	 * frt_lock — presumably safe because only this thread clears
	 * FRT_RUNNING; confirm against the schedule/join paths.
	 */
	lck_mtx_unlock(&frt->frt_lock);
	if (!(frt->frt_flags & FRT_TERMINATING)) {
		frt->frt_flags &= ~FRT_RUNNING;
		assert_wait(frt, THREAD_UNINT);
		thread_block_parameter(fsw_rps_thread_cont, frt);
		__builtin_unreachable();
	} else {
terminate:
		LCK_MTX_ASSERT(&frt->frt_lock, LCK_MTX_ASSERT_OWNED);
		frt->frt_flags &= ~(FRT_RUNNING | FRT_TERMINATING);
		frt->frt_flags |= FRT_TERMINATED;

		/*
		 * NOTE(review): wakeup channel is &frt, the address of a
		 * local pointer variable, not frt itself — verify the
		 * joiner waits on the matching address.
		 */
		if (frt->frt_flags & FRT_TERMINATEBLOCK) {
			thread_wakeup((caddr_t)&frt);
		}
		lck_mtx_unlock(&frt->frt_lock);

		SK_D("fsw_rx_%s_%d terminated", if_name(fsw->fsw_ifp),
		    frt->frt_idx);

		/* for the extra refcnt from kernel_thread_start() */
		thread_deallocate(current_thread());
		/* this is the end */
		thread_terminate(current_thread());
		/* NOTREACHED */
		__builtin_unreachable();
	}

	/* must never get here */
	VERIFY(0);
	/* NOTREACHED */
	__builtin_unreachable();
}
1848 
__attribute__((noreturn))
static void
fsw_rps_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	/*
	 * Entry point of an RPS worker thread.  Names the thread after the
	 * interface and worker index, then parks on its wait channel; all
	 * subsequent work happens in fsw_rps_thread_cont() via thread
	 * continuation.  Never returns.
	 */
	struct fsw_rps_thread *frt = v;
	struct nx_flowswitch *fsw = frt->frt_fsw;

	char thread_name[MAXTHREADNAMESIZE];
	bzero(thread_name, sizeof(thread_name));
	(void) snprintf(thread_name, sizeof(thread_name), "fsw_rx_%s_%d",
	    if_name(fsw->fsw_ifp), frt->frt_idx);
	thread_set_thread_name(frt->frt_thread, thread_name);
	SK_D("%s spawned", thread_name);

	/* mark this as a synchronous RX networking thread */
	net_thread_marks_push(NET_THREAD_SYNC_RX);
	/* wait channel is frt itself; woken when packets are queued for us */
	assert_wait(frt, THREAD_UNINT);
	(void) thread_block_parameter(fsw_rps_thread_cont, frt);

	__builtin_unreachable();
}
1870 
static void
fsw_rps_thread_join(struct nx_flowswitch *fsw, uint32_t i)
{
	/*
	 * Tear down RPS worker thread @i: flag it FRT_TERMINATING, kick it
	 * if it is parked, then wait (with a deadline) until the worker
	 * marks itself FRT_TERMINATED.  The first wait uses a short 1ms
	 * deadline, subsequent waits a long 1000s deadline.
	 */
	struct fsw_rps_thread *frt = &fsw->fsw_rps_threads[i];
	uint64_t f = (1 * NSEC_PER_MSEC);       /* first-pass deadline: 1ms */
	uint64_t s = (1000 * NSEC_PER_SEC);     /* later passes: 1000s */
	uint32_t c = 0;

	lck_mtx_lock(&frt->frt_lock);
	frt->frt_flags |= FRT_TERMINATING;

	while (!(frt->frt_flags & FRT_TERMINATED)) {
		uint64_t t = 0;
		nanoseconds_to_absolutetime((c++ == 0) ? f : s, &t);
		clock_absolutetime_interval_to_deadline(t, &t);
		ASSERT(t != 0);

		frt->frt_flags |= FRT_TERMINATEBLOCK;
		if (!(frt->frt_flags & FRT_RUNNING)) {
			/* worker is parked; wake it so it sees TERMINATING */
			thread_wakeup_one((caddr_t)frt);
		}
		/* wait on &frt->frt_thread until woken or deadline expires */
		(void) assert_wait_deadline(&frt->frt_thread, THREAD_UNINT, t);
		lck_mtx_unlock(&frt->frt_lock);
		thread_block(THREAD_CONTINUE_NULL);
		lck_mtx_lock(&frt->frt_lock);
		frt->frt_flags &= ~FRT_TERMINATEBLOCK;
	}
	ASSERT(frt->frt_flags & FRT_TERMINATED);
	lck_mtx_unlock(&frt->frt_lock);
	/*
	 * NOTE(review): frt_lock (initialized in fsw_rps_thread_spawn) is
	 * never lck_mtx_destroy()ed here -- confirm teardown elsewhere.
	 */
	frt->frt_thread = THREAD_NULL;
}
1902 
1903 static void
fsw_rps_thread_spawn(struct nx_flowswitch * fsw,uint32_t i)1904 fsw_rps_thread_spawn(struct nx_flowswitch *fsw, uint32_t i)
1905 {
1906 	kern_return_t error;
1907 	struct fsw_rps_thread *frt = &fsw->fsw_rps_threads[i];
1908 	lck_mtx_init(&frt->frt_lock, &nexus_lock_group, &nexus_lock_attr);
1909 	frt->frt_idx = i;
1910 	frt->frt_fsw = fsw;
1911 	error = kernel_thread_start(fsw_rps_thread_func, frt, &frt->frt_thread);
1912 	ASSERT(!error);
1913 	KPKTQ_INIT(&frt->frt_pktq);
1914 }
1915 
int
fsw_rps_set_nthreads(struct nx_flowswitch* fsw, uint32_t n)
{
	/*
	 * Resize the RPS worker pool to @n threads (0..FSW_RPS_MAX_NTHREADS).
	 * Shrinking joins the surplus workers before trimming the array;
	 * growing extends the array (zero-filled) before spawning the new
	 * workers.  Returns 0 on success, EINVAL if @n is out of range.
	 */
	if (n > FSW_RPS_MAX_NTHREADS) {
		SK_ERR("rps nthreads %d, max %d", n, FSW_RPS_MAX_NTHREADS);
		return EINVAL;
	}

	FSW_WLOCK(fsw);
	if (n < fsw->fsw_rps_nthreads) {
		/* shrink: terminate workers [n, old) first */
		for (uint32_t i = n; i < fsw->fsw_rps_nthreads; i++) {
			fsw_rps_thread_join(fsw, i);
		}
		fsw->fsw_rps_threads = krealloc_type(struct fsw_rps_thread,
		    fsw->fsw_rps_nthreads, n, fsw->fsw_rps_threads,
		    Z_WAITOK | Z_ZERO | Z_NOFAIL);
	} else if (n > fsw->fsw_rps_nthreads) {
		/* grow: reallocate first so new slots exist before spawn */
		fsw->fsw_rps_threads = krealloc_type(struct fsw_rps_thread,
		    fsw->fsw_rps_nthreads, n, fsw->fsw_rps_threads,
		    Z_WAITOK | Z_ZERO | Z_NOFAIL);
		for (uint32_t i = fsw->fsw_rps_nthreads; i < n; i++) {
			fsw_rps_thread_spawn(fsw, i);
		}
	}
	fsw->fsw_rps_nthreads = n;
	FSW_WUNLOCK(fsw);
	return 0;
}
1944 
1945 static uint32_t
get_rps_id(struct nx_flowswitch * fsw,struct __kern_packet * pkt)1946 get_rps_id(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
1947 {
1948 	sa_family_t af = fsw->fsw_demux(fsw, pkt);
1949 	if (__improbable(af == AF_UNSPEC)) {
1950 		return 0;
1951 	}
1952 
1953 	flow_pkt_classify(pkt, fsw->fsw_ifp, af, true);
1954 
1955 	if (__improbable((pkt->pkt_qum_qflags &
1956 	    QUM_F_FLOW_CLASSIFIED) == 0)) {
1957 		return 0;
1958 	}
1959 
1960 	struct flow_key key;
1961 	flow_pkt2key(pkt, true, &key);
1962 	key.fk_mask = FKMASK_5TUPLE;
1963 
1964 	uint32_t id = flow_key_hash(&key) % fsw->fsw_rps_nthreads;
1965 
1966 	return id;
1967 }
1968 
#endif /* DEVELOPMENT || DEBUG */
1970 
void
fsw_receive(struct nx_flowswitch *fsw, struct pktq *pktq)
{
	/*
	 * Flowswitch RX entry point.  On DEVELOPMENT/DEBUG kernels with RPS
	 * enabled, fan the packets out to the per-flow RPS worker threads;
	 * otherwise process the whole queue inline under the read lock.
	 */
	FSW_RLOCK(fsw);
#if (DEVELOPMENT || DEBUG)
	if (fsw->fsw_rps_nthreads != 0) {
		struct __kern_packet *pkt, *tpkt;
		bitmap_t map = 0;       /* set of worker ids that got packets */

		_CASSERT(BITMAP_LEN(FSW_RPS_MAX_NTHREADS) == 1);
		KPKTQ_FOREACH_SAFE(pkt, pktq, tpkt) {
			uint32_t id = get_rps_id(fsw, pkt);
			KPKTQ_REMOVE(pktq, pkt);
			fsw_rps_rx(fsw, id, pkt);
			bitmap_set(&map, id);
		}
		/* wake each involved worker once, after all pkts are queued */
		for (int i = bitmap_first(&map, 64); i >= 0;
		    i = bitmap_next(&map, i)) {
			fsw_rps_thread_schedule(fsw, i);
		}
	} else
#endif /* DEVELOPMENT || DEBUG */
	{
		_fsw_receive_locked(fsw, pktq);
	}
	FSW_RUNLOCK(fsw);
}
1998 
1999 int
fsw_dev_input_netem_dequeue(void * handle,pktsched_pkt_t * pkts,uint32_t n_pkts)2000 fsw_dev_input_netem_dequeue(void *handle, pktsched_pkt_t * pkts,
2001     uint32_t n_pkts)
2002 {
2003 #pragma unused(handle)
2004 	struct nx_flowswitch *fsw = handle;
2005 	struct __kern_packet *kpkts[FSW_VP_DEV_BATCH_MAX];
2006 	struct pktq pktq;
2007 	sk_protect_t protect;
2008 	uint32_t i;
2009 
2010 	ASSERT(n_pkts <= FSW_VP_DEV_BATCH_MAX);
2011 
2012 	for (i = 0; i < n_pkts; i++) {
2013 		ASSERT(pkts[i].pktsched_ptype == QP_PACKET);
2014 		ASSERT(pkts[i].pktsched_pkt_kpkt != NULL);
2015 		kpkts[i] = pkts[i].pktsched_pkt_kpkt;
2016 	}
2017 
2018 	protect = sk_sync_protect();
2019 	KPKTQ_INIT(&pktq);
2020 	pkts_to_pktq(kpkts, n_pkts, &pktq);
2021 
2022 	fsw_receive(fsw, &pktq);
2023 	KPKTQ_FINI(&pktq);
2024 	sk_sync_unprotect(protect);
2025 
2026 	return 0;
2027 }
2028 
2029 static void
fsw_dev_input_netem_enqueue(struct nx_flowswitch * fsw,struct pktq * q)2030 fsw_dev_input_netem_enqueue(struct nx_flowswitch *fsw, struct pktq *q)
2031 {
2032 	classq_pkt_t p;
2033 	struct netem *ne;
2034 	struct __kern_packet *pkt, *tpkt;
2035 
2036 	ASSERT(fsw->fsw_ifp != NULL);
2037 	ne = fsw->fsw_ifp->if_input_netem;
2038 	ASSERT(ne != NULL);
2039 	KPKTQ_FOREACH_SAFE(pkt, q, tpkt) {
2040 		bool pdrop;
2041 		KPKTQ_REMOVE(q, pkt);
2042 		CLASSQ_PKT_INIT_PACKET(&p, pkt);
2043 		netem_enqueue(ne, &p, &pdrop);
2044 	}
2045 }
2046 
2047 void
fsw_devna_rx(struct nexus_adapter * devna,struct __kern_packet * pkt_head,struct nexus_pkt_stats * out_stats)2048 fsw_devna_rx(struct nexus_adapter *devna, struct __kern_packet *pkt_head,
2049     struct nexus_pkt_stats *out_stats)
2050 {
2051 	struct __kern_packet *pkt = pkt_head, *next;
2052 	struct nx_flowswitch *fsw;
2053 	uint32_t n_bytes = 0, n_pkts = 0;
2054 	uint64_t total_pkts = 0, total_bytes = 0;
2055 	struct pktq q;
2056 
2057 	KPKTQ_INIT(&q);
2058 	if (__improbable(devna->na_ifp == NULL ||
2059 	    (fsw = fsw_ifp_to_fsw(devna->na_ifp)) == NULL)) {
2060 		SK_ERR("fsw not attached, dropping %d pkts", KPKTQ_LEN(&q));
2061 		pp_free_packet_chain(pkt_head, NULL);
2062 		return;
2063 	}
2064 	while (pkt != NULL) {
2065 		if (__improbable(pkt->pkt_trace_id != 0)) {
2066 			KDBG(SK_KTRACE_PKT_RX_DRV | DBG_FUNC_END, pkt->pkt_trace_id);
2067 			KDBG(SK_KTRACE_PKT_RX_FSW | DBG_FUNC_START, pkt->pkt_trace_id);
2068 		}
2069 		next = pkt->pkt_nextpkt;
2070 		pkt->pkt_nextpkt = NULL;
2071 
2072 		if (__probable((pkt->pkt_qum_qflags & QUM_F_DROPPED) == 0)) {
2073 			KPKTQ_ENQUEUE(&q, pkt);
2074 			n_bytes += pkt->pkt_length;
2075 		} else {
2076 			DTRACE_SKYWALK1(non__finalized__drop,
2077 			    struct __kern_packet *, pkt);
2078 			FSW_STATS_INC(FSW_STATS_RX_PKT_NOT_FINALIZED);
2079 			pp_free_packet_single(pkt);
2080 			pkt = NULL;
2081 		}
2082 		n_pkts = KPKTQ_LEN(&q);
2083 		if (n_pkts == fsw_rx_batch || (next == NULL && n_pkts > 0)) {
2084 			if (__improbable(fsw->fsw_ifp->if_input_netem != NULL)) {
2085 				fsw_dev_input_netem_enqueue(fsw, &q);
2086 			} else {
2087 				fsw_receive(fsw, &q);
2088 			}
2089 			total_pkts += n_pkts;
2090 			total_bytes += n_bytes;
2091 			n_pkts = 0;
2092 			n_bytes = 0;
2093 			KPKTQ_FINI(&q);
2094 		}
2095 		pkt = next;
2096 	}
2097 	ASSERT(KPKTQ_LEN(&q) == 0);
2098 	FSW_STATS_ADD(FSW_STATS_RX_PACKETS, total_pkts);
2099 	if (out_stats != NULL) {
2100 		out_stats->nps_pkts = total_pkts;
2101 		out_stats->nps_bytes = total_bytes;
2102 	}
2103 	KDBG(SK_KTRACE_FSW_DEV_RING_FLUSH, SK_KVA(devna), total_pkts, total_bytes);
2104 }
2105 
2106 static int
dp_copy_to_dev_mbuf(struct nx_flowswitch * fsw,struct __kern_packet * spkt,struct __kern_packet * dpkt)2107 dp_copy_to_dev_mbuf(struct nx_flowswitch *fsw, struct __kern_packet *spkt,
2108     struct __kern_packet *dpkt)
2109 {
2110 	struct mbuf *m = NULL;
2111 	uint16_t bdlen, bdlim, bdoff;
2112 	uint8_t *bdaddr;
2113 	unsigned int one = 1;
2114 	int err = 0;
2115 
2116 	err = mbuf_allocpacket(MBUF_DONTWAIT,
2117 	    (fsw->fsw_frame_headroom + spkt->pkt_length), &one, &m);
2118 #if (DEVELOPMENT || DEBUG)
2119 	if (m != NULL) {
2120 		_FSW_INJECT_ERROR(11, m, NULL, m_freem, m);
2121 	}
2122 #endif /* DEVELOPMENT || DEBUG */
2123 	if (__improbable(m == NULL)) {
2124 		FSW_STATS_INC(FSW_STATS_DROP_NOMEM_MBUF);
2125 		err = ENOBUFS;
2126 		goto done;
2127 	}
2128 
2129 	MD_BUFLET_ADDR_ABS_DLEN(dpkt, bdaddr, bdlen, bdlim, bdoff);
2130 	if (fsw->fsw_frame_headroom > bdlim) {
2131 		SK_ERR("not enough space in buffer for headroom");
2132 		err = EINVAL;
2133 		goto done;
2134 	}
2135 
2136 	dpkt->pkt_headroom = fsw->fsw_frame_headroom;
2137 	dpkt->pkt_mbuf = m;
2138 	dpkt->pkt_pflags |= PKT_F_MBUF_DATA;
2139 
2140 	/* packet copy into mbuf */
2141 	fsw->fsw_pkt_copy_to_mbuf(NR_TX, SK_PTR_ENCODE(spkt,
2142 	    METADATA_TYPE(spkt), METADATA_SUBTYPE(spkt)), 0, m,
2143 	    fsw->fsw_frame_headroom, spkt->pkt_length,
2144 	    PACKET_HAS_PARTIAL_CHECKSUM(spkt),
2145 	    spkt->pkt_csum_tx_start_off);
2146 	FSW_STATS_INC(FSW_STATS_TX_COPY_PKT2MBUF);
2147 
2148 	/* header copy into dpkt buffer for classification */
2149 	kern_packet_t sph = SK_PTR_ENCODE(spkt,
2150 	    METADATA_TYPE(spkt), METADATA_SUBTYPE(spkt));
2151 	kern_packet_t dph = SK_PTR_ENCODE(dpkt,
2152 	    METADATA_TYPE(dpkt), METADATA_SUBTYPE(dpkt));
2153 	uint32_t copy_len = MIN(spkt->pkt_length, bdlim - dpkt->pkt_headroom);
2154 	fsw->fsw_pkt_copy_from_pkt(NR_TX, dph, dpkt->pkt_headroom,
2155 	    sph, spkt->pkt_headroom, copy_len, FALSE, 0, 0, 0);
2156 
2157 	/*
2158 	 * fsw->fsw_frame_headroom is after m_data, thus we treat m_data same as
2159 	 * buflet baddr m_data always points to the beginning of packet and
2160 	 * should represents the same as baddr + headroom
2161 	 */
2162 	ASSERT((uintptr_t)m->m_data ==
2163 	    ((uintptr_t)mbuf_datastart(m) + fsw->fsw_frame_headroom));
2164 
2165 done:
2166 	return err;
2167 }
2168 
static int
dp_copy_to_dev_pkt(struct nx_flowswitch *fsw, struct __kern_packet *spkt,
    struct __kern_packet *dpkt)
{
	/*
	 * TX copy path for native interfaces: copy @spkt into @dpkt with
	 * the combined frame + driver TX headroom, preserving any partial
	 * checksum request (offsets rebased to dpkt's start of data).
	 * Returns 0, or ERANGE if the headroom does not fit in a uint8_t.
	 */
	struct ifnet *ifp = fsw->fsw_ifp;
	uint16_t headroom = fsw->fsw_frame_headroom + ifp->if_tx_headroom;

	if (headroom > UINT8_MAX) {
		SK_ERR("headroom too large %d", headroom);
		return ERANGE;
	}
	dpkt->pkt_headroom = (uint8_t)headroom;
	/* headroom must stay 8-byte aligned */
	ASSERT((dpkt->pkt_headroom & 0x7) == 0);
	dpkt->pkt_l2_len = 0;
	dpkt->pkt_link_flags = spkt->pkt_link_flags;

	kern_packet_t sph = SK_PTR_ENCODE(spkt,
	    METADATA_TYPE(spkt), METADATA_SUBTYPE(spkt));
	kern_packet_t dph = SK_PTR_ENCODE(dpkt,
	    METADATA_TYPE(dpkt), METADATA_SUBTYPE(dpkt));
	fsw->fsw_pkt_copy_from_pkt(NR_TX, dph,
	    dpkt->pkt_headroom, sph, spkt->pkt_headroom,
	    spkt->pkt_length, PACKET_HAS_PARTIAL_CHECKSUM(spkt),
	    (spkt->pkt_csum_tx_start_off - spkt->pkt_headroom),
	    (spkt->pkt_csum_tx_stuff_off - spkt->pkt_headroom),
	    (spkt->pkt_csum_flags & PACKET_CSUM_ZERO_INVERT));

	FSW_STATS_INC(FSW_STATS_TX_COPY_PKT2PKT);

	return 0;
}
2200 
2201 #if SK_LOG
2202 /* Hoisted out of line to reduce kernel stack footprint */
2203 SK_LOG_ATTRIBUTE
2204 static void
dp_copy_to_dev_log(struct nx_flowswitch * fsw,const struct kern_pbufpool * pp,struct __kern_packet * spkt,struct __kern_packet * dpkt,int error)2205 dp_copy_to_dev_log(struct nx_flowswitch *fsw, const struct kern_pbufpool *pp,
2206     struct __kern_packet *spkt, struct __kern_packet *dpkt, int error)
2207 {
2208 	struct proc *p = current_proc();
2209 	struct ifnet *ifp = fsw->fsw_ifp;
2210 	uint64_t logflags = (SK_VERB_FSW_DP | SK_VERB_TX);
2211 
2212 	if (error == ERANGE) {
2213 		SK_ERR("packet too long, hr(fr+tx)+slen (%u+%u)+%u > "
2214 		    "dev_pp_max %u", (uint32_t)fsw->fsw_frame_headroom,
2215 		    (uint32_t)ifp->if_tx_headroom, spkt->pkt_length,
2216 		    (uint32_t)pp->pp_max_frags * PP_BUF_SIZE_DEF(pp));
2217 	} else if (error == ENOBUFS) {
2218 		SK_DF(logflags, "%s(%d) packet allocation failure",
2219 		    sk_proc_name_address(p), sk_proc_pid(p));
2220 	} else if (error == 0) {
2221 		ASSERT(dpkt != NULL);
2222 		char *daddr;
2223 		MD_BUFLET_ADDR_ABS(dpkt, daddr);
2224 		SK_DF(logflags, "%s(%d) splen %u dplen %u hr %u (fr/tx %u/%u)",
2225 		    sk_proc_name_address(p), sk_proc_pid(p), spkt->pkt_length,
2226 		    dpkt->pkt_length, (uint32_t)dpkt->pkt_headroom,
2227 		    (uint32_t)fsw->fsw_frame_headroom,
2228 		    (uint32_t)ifp->if_tx_headroom);
2229 		SK_DF(logflags | SK_VERB_DUMP, "%s",
2230 		    sk_dump("buf", daddr, dpkt->pkt_length, 128, NULL, 0));
2231 	} else {
2232 		SK_DF(logflags, "%s(%d) error %d", error);
2233 	}
2234 }
2235 #else
2236 #define dp_copy_to_dev_log(...)
2237 #endif /* SK_LOG */
2238 
static int
dp_copy_to_dev(struct nx_flowswitch *fsw, struct __kern_packet *spkt,
    struct __kern_packet *dpkt)
{
	/*
	 * Copy a TX packet @spkt into the device-pool packet @dpkt:
	 * metadata first, then the payload via the mbuf path (compat
	 * interface) or the packet path (native interface), depending on
	 * the classq enqueue packet type.  Returns 0 or an errno from the
	 * chosen copy path (ERANGE if the frame exceeds the device pool's
	 * maximum buffer space).
	 */
	const struct kern_pbufpool *pp = dpkt->pkt_qum.qum_pp;
	struct ifnet *ifp = fsw->fsw_ifp;
	uint32_t dev_pkt_len;
	int err = 0;

	/* source must not carry an mbuf or packet attachment */
	ASSERT(!(spkt->pkt_pflags & PKT_F_MBUF_MASK));
	ASSERT(!(spkt->pkt_pflags & PKT_F_PKT_MASK));

	SK_PREFETCHW(dpkt->pkt_qum_buf.buf_addr, 0);
	/* Copy packet metadata */
	_QUM_COPY(&(spkt)->pkt_qum, &(dpkt)->pkt_qum);
	_PKT_COPY(spkt, dpkt);
	_PKT_COPY_TX_PORT_DATA(spkt, dpkt);
	ASSERT((dpkt->pkt_qum.qum_qflags & QUM_F_KERNEL_ONLY) ||
	    !PP_KERNEL_ONLY(dpkt->pkt_qum.qum_pp));
	ASSERT(dpkt->pkt_mbuf == NULL);

	/* Copy AQM metadata */
	dpkt->pkt_flowsrc_type = spkt->pkt_flowsrc_type;
	dpkt->pkt_flowsrc_fidx = spkt->pkt_flowsrc_fidx;
	_CASSERT((offsetof(struct __flow, flow_src_id) % 8) == 0);
	_UUID_COPY(dpkt->pkt_flowsrc_id, spkt->pkt_flowsrc_id);
	_UUID_COPY(dpkt->pkt_policy_euuid, spkt->pkt_policy_euuid);
	dpkt->pkt_policy_id = spkt->pkt_policy_id;

	switch (fsw->fsw_classq_enq_ptype) {
	case QP_MBUF:           /* compat interface: copy into an mbuf */
		err = dp_copy_to_dev_mbuf(fsw, spkt, dpkt);
		break;

	case QP_PACKET:         /* native interface: packet-to-packet copy */
		dev_pkt_len = fsw->fsw_frame_headroom + ifp->if_tx_headroom +
		    spkt->pkt_length;
		if (dev_pkt_len > pp->pp_max_frags * PP_BUF_SIZE_DEF(pp)) {
			FSW_STATS_INC(FSW_STATS_TX_COPY_BAD_LEN);
			err = ERANGE;
			goto done;
		}
		err = dp_copy_to_dev_pkt(fsw, spkt, dpkt);
		break;

	default:
		VERIFY(0);
		__builtin_unreachable();
	}
done:
	dp_copy_to_dev_log(fsw, pp, spkt, dpkt, err);
	return err;
}
2292 
static struct mbuf *
convert_pkt_to_mbuf(struct __kern_packet *pkt)
{
	/*
	 * Detach the mbuf backing @pkt, transfer the packet's flow/AQM
	 * metadata onto the mbuf's pkthdr, free the (now empty) packet,
	 * and return the mbuf.  Caller takes ownership of the mbuf.
	 */
	ASSERT(pkt->pkt_pflags & PKT_F_MBUF_DATA);
	ASSERT(pkt->pkt_mbuf != NULL);
	struct mbuf *m = pkt->pkt_mbuf;

	/* pass additional metadata generated from flow parse/lookup */
	_CASSERT(sizeof(m->m_pkthdr.pkt_flowid) ==
	    sizeof(pkt->pkt_flow_token));
	_CASSERT(sizeof(m->m_pkthdr.pkt_mpriv_srcid) ==
	    sizeof(pkt->pkt_flowsrc_token));
	_CASSERT(sizeof(m->m_pkthdr.pkt_mpriv_fidx) ==
	    sizeof(pkt->pkt_flowsrc_fidx));
	m->m_pkthdr.pkt_svc = pkt->pkt_svc_class;
	m->m_pkthdr.pkt_proto = pkt->pkt_flow->flow_ip_proto;
	m->m_pkthdr.pkt_flowid = pkt->pkt_flow_token;
	m->m_pkthdr.comp_gencnt = pkt->pkt_comp_gencnt;
	m->m_pkthdr.pkt_flowsrc = pkt->pkt_flowsrc_type;
	m->m_pkthdr.pkt_mpriv_srcid = pkt->pkt_flowsrc_token;
	m->m_pkthdr.pkt_mpriv_fidx = pkt->pkt_flowsrc_fidx;

	if (pkt->pkt_transport_protocol == IPPROTO_QUIC) {
		m->m_pkthdr.pkt_ext_flags |= PKTF_EXT_QUIC;
	}

	/* The packet should have a timestamp by the time we get here. */
	m->m_pkthdr.pkt_timestamp = pkt->pkt_timestamp;
	m->m_pkthdr.pkt_flags &= ~PKTF_TS_VALID;

	/* carry over only the flags common to both representations */
	m->m_pkthdr.pkt_flags &= ~PKT_F_COMMON_MASK;
	m->m_pkthdr.pkt_flags |= (pkt->pkt_pflags & PKT_F_COMMON_MASK);
	if ((pkt->pkt_pflags & PKT_F_START_SEQ) != 0) {
		m->m_pkthdr.tx_start_seq = ntohl(pkt->pkt_flow_tcp_seq);
	}
	if ((pkt->pkt_pflags & PKT_F_L4S) != 0) {
		m->m_pkthdr.pkt_ext_flags |= PKTF_EXT_L4S;
	}
	KPKT_CLEAR_MBUF_DATA(pkt);

	/* mbuf has been consumed, release packet as well */
	ASSERT(pkt->pkt_qum.qum_ksd == NULL);
	pp_free_packet_single(pkt);
	return m;
}
2338 
2339 static void
convert_pkt_to_mbuf_list(struct __kern_packet * pkt_list,struct mbuf ** head,struct mbuf ** tail,uint32_t * cnt,uint32_t * bytes)2340 convert_pkt_to_mbuf_list(struct __kern_packet *pkt_list,
2341     struct mbuf **head, struct mbuf **tail,
2342     uint32_t *cnt, uint32_t *bytes)
2343 {
2344 	struct __kern_packet *pkt = pkt_list, *next;
2345 	struct mbuf *m_head = NULL, **m_tailp = &m_head, *m = NULL;
2346 	uint32_t c = 0, b = 0;
2347 
2348 	while (pkt != NULL) {
2349 		next = pkt->pkt_nextpkt;
2350 		pkt->pkt_nextpkt = NULL;
2351 		m = convert_pkt_to_mbuf(pkt);
2352 		ASSERT(m != NULL);
2353 
2354 		*m_tailp = m;
2355 		m_tailp = &m->m_nextpkt;
2356 		c++;
2357 		b += m_pktlen(m);
2358 		pkt = next;
2359 	}
2360 	if (head != NULL) {
2361 		*head = m_head;
2362 	}
2363 	if (tail != NULL) {
2364 		*tail = m;
2365 	}
2366 	if (cnt != NULL) {
2367 		*cnt = c;
2368 	}
2369 	if (bytes != NULL) {
2370 		*bytes = b;
2371 	}
2372 }
2373 
SK_NO_INLINE_ATTRIBUTE
static int
classq_enqueue_flow_single(struct nx_flowswitch *fsw,
    struct __kern_packet *pkt)
{
	/*
	 * Enqueue one TX packet into the interface AQM classq, converting
	 * it to an mbuf first on compat (QP_MBUF) interfaces.  The
	 * packet/mbuf is always consumed.  Returns the ifnet_enqueue_*
	 * result (e.g. EQFULL/EQSUSPENDED used by callers for flow
	 * advisory).
	 */
	struct ifnet *ifp = fsw->fsw_ifp;
	boolean_t pkt_drop = FALSE;
	int err;

	FSW_LOCK_ASSERT_HELD(fsw);
	ASSERT(fsw->fsw_classq_enabled);
	ASSERT(pkt->pkt_flow_token != 0);
	fsw_ifp_inc_traffic_class_out_pkt(ifp, pkt->pkt_svc_class,
	    1, pkt->pkt_length);

	if (__improbable(pkt->pkt_trace_id != 0)) {
		KDBG(SK_KTRACE_PKT_TX_FSW | DBG_FUNC_END, pkt->pkt_trace_id);
		KDBG(SK_KTRACE_PKT_TX_AQM | DBG_FUNC_START, pkt->pkt_trace_id);
	}

	switch (fsw->fsw_classq_enq_ptype) {
	case QP_MBUF: {                         /* compat interface */
		struct mbuf *m;

		m = convert_pkt_to_mbuf(pkt);
		ASSERT(m != NULL);
		pkt = NULL;

		/* ifnet_enqueue consumes mbuf */
		err = ifnet_enqueue_mbuf(ifp, m, false, &pkt_drop);
		m = NULL;
#if (DEVELOPMENT || DEBUG)
		/* error-injection hook: force the drop-accounting path */
		if (__improbable(!pkt_drop)) {
			_FSW_INJECT_ERROR(14, pkt_drop, TRUE, null_func);
		}
#endif /* DEVELOPMENT || DEBUG */
		if (pkt_drop) {
			FSW_STATS_INC(FSW_STATS_DROP);
			FSW_STATS_INC(FSW_STATS_TX_AQM_DROP);
		}
		break;
	}
	case QP_PACKET: {                       /* native interface */
		/* ifnet_enqueue consumes packet */
		err = ifnet_enqueue_pkt(ifp, pkt, false, &pkt_drop);
		pkt = NULL;
#if (DEVELOPMENT || DEBUG)
		/* error-injection hook: force the drop-accounting path */
		if (__improbable(!pkt_drop)) {
			_FSW_INJECT_ERROR(14, pkt_drop, TRUE, null_func);
		}
#endif /* DEVELOPMENT || DEBUG */
		if (pkt_drop) {
			FSW_STATS_INC(FSW_STATS_DROP);
			FSW_STATS_INC(FSW_STATS_TX_AQM_DROP);
		}
		break;
	}
	default:
		err = EINVAL;
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	return err;
}
2440 
static int
classq_enqueue_flow_chain(struct nx_flowswitch *fsw,
    struct __kern_packet *pkt_head, struct __kern_packet *pkt_tail,
    uint32_t cnt, uint32_t bytes)
{
	/*
	 * Enqueue a whole chain of TX packets (@cnt packets, @bytes total)
	 * into the interface AQM classq in one call, converting the chain
	 * to mbufs first on compat interfaces.  The chain is always
	 * consumed.  Returns the ifnet_enqueue_*_chain result.
	 */
	struct ifnet *ifp = fsw->fsw_ifp;
	boolean_t pkt_drop = FALSE;
	uint32_t svc;
	int err;

	FSW_LOCK_ASSERT_HELD(fsw);
	ASSERT(fsw->fsw_classq_enabled);
	ASSERT(pkt_head->pkt_flow_token != 0);

	/*
	 * All packets in the flow should have the same svc.
	 */
	svc = pkt_head->pkt_svc_class;
	fsw_ifp_inc_traffic_class_out_pkt(ifp, svc, cnt, bytes);

	switch (fsw->fsw_classq_enq_ptype) {
	case QP_MBUF: {                         /* compat interface */
		struct mbuf *m_head = NULL, *m_tail = NULL;
		uint32_t c = 0, b = 0;

		convert_pkt_to_mbuf_list(pkt_head, &m_head, &m_tail, &c, &b);
		ASSERT(m_head != NULL && m_tail != NULL);
		ASSERT(c == cnt);
		ASSERT(b == bytes);
		pkt_head = NULL;

		/* ifnet_enqueue consumes mbuf */
		err = ifnet_enqueue_mbuf_chain(ifp, m_head, m_tail, cnt,
		    bytes, FALSE, &pkt_drop);
		m_head = NULL;
		m_tail = NULL;
#if (DEVELOPMENT || DEBUG)
		/* error-injection hook: force the drop-accounting path */
		if (__improbable(!pkt_drop)) {
			_FSW_INJECT_ERROR(14, pkt_drop, TRUE, null_func);
		}
#endif /* DEVELOPMENT || DEBUG */
		if (pkt_drop) {
			STATS_ADD(&fsw->fsw_stats, FSW_STATS_DROP, cnt);
			STATS_ADD(&fsw->fsw_stats, FSW_STATS_TX_AQM_DROP,
			    cnt);
		}
		break;
	}
	case QP_PACKET: {                       /* native interface */
		/* ifnet_enqueue consumes packet */
		err = ifnet_enqueue_pkt_chain(ifp, pkt_head, pkt_tail, cnt,
		    bytes, FALSE, &pkt_drop);
		pkt_head = NULL;
#if (DEVELOPMENT || DEBUG)
		/* error-injection hook: force the drop-accounting path */
		if (__improbable(!pkt_drop)) {
			_FSW_INJECT_ERROR(14, pkt_drop, TRUE, null_func);
		}
#endif /* DEVELOPMENT || DEBUG */
		if (pkt_drop) {
			STATS_ADD(&fsw->fsw_stats, FSW_STATS_DROP, cnt);
			STATS_ADD(&fsw->fsw_stats, FSW_STATS_TX_AQM_DROP,
			    cnt);
		}
		break;
	}
	default:
		err = EINVAL;
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	return err;
}
2515 
2516 /*
2517  * This code path needs to be kept for interfaces without logical link support.
2518  */
static void
classq_enqueue_flow(struct nx_flowswitch *fsw, struct flow_entry *fe,
    bool chain, uint32_t cnt, uint32_t bytes)
{
	/*
	 * Drain fe_tx_pktq into the interface AQM classq, either as one
	 * chain (@chain) or packet-by-packet, raising the flow advisory
	 * when AQM reports EQFULL/EQSUSPENDED and the flow is
	 * advisory-capable, and notifying the channel ring if one was
	 * raised.  @cnt/@bytes must match the queue contents.
	 */
	bool flowadv_is_set = false;
	struct __kern_packet *pkt, *tail, *tpkt;
	flowadv_idx_t flow_adv_idx;
	bool flowadv_cap;
	flowadv_token_t flow_adv_token;
	int err;

	SK_DF(SK_VERB_FSW_DP | SK_VERB_AQM, "%s classq enqueued %d pkts",
	    if_name(fsw->fsw_ifp), KPKTQ_LEN(&fe->fe_tx_pktq));

	if (chain) {
		pkt = KPKTQ_FIRST(&fe->fe_tx_pktq);
		tail = KPKTQ_LAST(&fe->fe_tx_pktq);
		KPKTQ_INIT(&fe->fe_tx_pktq);
		if (pkt == NULL) {
			return;
		}
		/* advisory info taken from head pkt; enqueue consumes chain */
		flow_adv_idx = pkt->pkt_flowsrc_fidx;
		flowadv_cap = ((pkt->pkt_pflags & PKT_F_FLOW_ADV) != 0);
		flow_adv_token = pkt->pkt_flow_token;

		err = classq_enqueue_flow_chain(fsw, pkt, tail, cnt, bytes);

		/* set flow advisory if needed */
		if (__improbable((err == EQFULL || err == EQSUSPENDED) &&
		    flowadv_cap)) {
			flowadv_is_set = na_flowadv_set(flow_get_na(fsw, fe),
			    flow_adv_idx, flow_adv_token);
		}
	} else {
		uint32_t c = 0, b = 0;

		KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) {
			KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt);

			/* capture advisory info before pkt is consumed */
			flow_adv_idx = pkt->pkt_flowsrc_fidx;
			flowadv_cap = ((pkt->pkt_pflags & PKT_F_FLOW_ADV) != 0);
			flow_adv_token = pkt->pkt_flow_token;

			c++;
			b += pkt->pkt_length;
			err = classq_enqueue_flow_single(fsw, pkt);

			/* set flow advisory if needed (at most once) */
			if (__improbable(!flowadv_is_set &&
			    ((err == EQFULL || err == EQSUSPENDED) &&
			    flowadv_cap))) {
				flowadv_is_set = na_flowadv_set(
					flow_get_na(fsw, fe), flow_adv_idx,
					flow_adv_token);
			}
		}
		ASSERT(c == cnt);
		ASSERT(b == bytes);
	}

	/* notify flow advisory event */
	if (__improbable(flowadv_is_set)) {
		struct __kern_channel_ring *r = fsw_flow_get_tx_ring(fsw, fe);
		if (__probable(r)) {
			na_flowadv_event(r);
			SK_DF(SK_VERB_FLOW_ADVISORY | SK_VERB_TX,
			    "%s(%d) notified of flow update",
			    sk_proc_name_address(current_proc()),
			    sk_proc_pid(current_proc()));
		}
	}
}
2591 
2592 /*
2593  * Logical link code path
2594  */
static void
classq_qset_enqueue_flow(struct nx_flowswitch *fsw, struct flow_entry *fe,
    bool chain, uint32_t cnt, uint32_t bytes)
{
	/*
	 * Logical-link variant of classq_enqueue_flow(): drain fe_tx_pktq
	 * into the flow's netif qset.  Chained enqueue is not supported on
	 * this path.  On failure, raise/notify the flow advisory (if the
	 * qset reported flow control) and account any drops.
	 */
	struct __kern_packet *pkt, *tail;
	flowadv_idx_t flow_adv_idx;
	bool flowadv_is_set = false;
	bool flowadv_cap;
	flowadv_token_t flow_adv_token;
	uint32_t flowctl = 0, dropped = 0;
	int err;

	SK_DF(SK_VERB_FSW_DP | SK_VERB_AQM, "%s classq enqueued %d pkts",
	    if_name(fsw->fsw_ifp), KPKTQ_LEN(&fe->fe_tx_pktq));

	/*
	 * Not supporting chains for now
	 */
	VERIFY(!chain);
	pkt = KPKTQ_FIRST(&fe->fe_tx_pktq);
	tail = KPKTQ_LAST(&fe->fe_tx_pktq);
	KPKTQ_INIT(&fe->fe_tx_pktq);
	if (pkt == NULL) {
		return;
	}
	/* advisory info taken from head pkt; enqueue consumes the pkts */
	flow_adv_idx = pkt->pkt_flowsrc_fidx;
	flowadv_cap = ((pkt->pkt_pflags & PKT_F_FLOW_ADV) != 0);
	flow_adv_token = pkt->pkt_flow_token;

	err = netif_qset_enqueue(fe->fe_qset, pkt, tail, cnt, bytes,
	    &flowctl, &dropped);

	if (__improbable(err != 0)) {
		/* set flow advisory if needed */
		if (flowctl > 0 && flowadv_cap) {
			flowadv_is_set = na_flowadv_set(flow_get_na(fsw, fe),
			    flow_adv_idx, flow_adv_token);

			/* notify flow advisory event */
			if (flowadv_is_set) {
				struct __kern_channel_ring *r =
				    fsw_flow_get_tx_ring(fsw, fe);
				if (__probable(r)) {
					na_flowadv_event(r);
					SK_DF(SK_VERB_FLOW_ADVISORY |
					    SK_VERB_TX,
					    "%s(%d) notified of flow update",
					    sk_proc_name_address(current_proc()),
					    sk_proc_pid(current_proc()));
				}
			}
		}
		if (dropped > 0) {
			STATS_ADD(&fsw->fsw_stats, FSW_STATS_DROP, dropped);
			STATS_ADD(&fsw->fsw_stats, FSW_STATS_TX_AQM_DROP,
			    dropped);
		}
	}
}
2654 
2655 static void
tx_finalize_packet(struct nx_flowswitch * fsw,struct __kern_packet * pkt)2656 tx_finalize_packet(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
2657 {
2658 #pragma unused(fsw)
2659 	/* finalize here; no more changes to buflets after classq */
2660 	if (__probable(!(pkt->pkt_pflags & PKT_F_MBUF_DATA))) {
2661 		kern_packet_t ph = SK_PTR_ENCODE(pkt,
2662 		    METADATA_TYPE(pkt), METADATA_SUBTYPE(pkt));
2663 		int err = __packet_finalize(ph);
2664 		VERIFY(err == 0);
2665 	}
2666 }
2667 
static bool
dp_flow_tx_route_process(struct nx_flowswitch *fsw, struct flow_entry *fe)
{
	/*
	 * Resolve and frame the flow's TX packet queue against its flow
	 * route: validate the route, optionally (re)select a dynamic
	 * qset, run the link-layer resolver on each packet if the route
	 * is not yet resolved (dropping packets the resolver rejects or
	 * queues internally), then apply the framing callback.  Returns
	 * false only if route validation fails; surviving packets remain
	 * in fe_tx_pktq.
	 */
	struct flow_route *fr = fe->fe_route;
	int err;

	ASSERT(fr != NULL);

	if (__improbable(!dp_flow_route_process(fsw, fe))) {
		return false;
	}
	if (fe->fe_qset_select == FE_QSET_SELECT_DYNAMIC) {
		flow_qset_select_dynamic(fsw, fe, TRUE);
	}

	_FSW_INJECT_ERROR(35, fr->fr_flags, fr->fr_flags,
	    _fsw_error35_handler, 1, fr, NULL, NULL);
	_FSW_INJECT_ERROR(36, fr->fr_flags, fr->fr_flags,
	    _fsw_error36_handler, 1, fr, NULL);

	/*
	 * See if we need to resolve the flow route; note the test against
	 * fr_flags here is done without any lock for performance.  Thus
	 * it's possible that we race against the thread performing route
	 * event updates for a packet (which is OK).  In any case we should
	 * not have any assertion on fr_flags value(s) due to the lack of
	 * serialization.
	 */
	if (fr->fr_flags & FLOWRTF_RESOLVED) {
		goto frame;
	}

	struct __kern_packet *pkt, *tpkt;
	KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) {
		err = fsw->fsw_resolve(fsw, fr, pkt);
		_FSW_INJECT_ERROR_SET(35, _fsw_error35_handler, 2, fr, pkt, &err);
		_FSW_INJECT_ERROR_SET(36, _fsw_error36_handler, 2, fr, &err);
		/*
		 * If resolver returns EJUSTRETURN then we drop the pkt as the
		 * resolver should have converted the pkt into mbuf (or
		 * detached the attached mbuf from pkt) and added it to the
		 * llinfo queue. If we do have a cached llinfo, then proceed
		 * to using it even though it may be stale (very unlikely)
		 * while the resolution is in progress.
		 * Otherwise, any other error results in dropping pkt.
		 */
		if (err == EJUSTRETURN) {
			KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt);
			pp_free_packet_single(pkt);
			FSW_STATS_INC(FSW_STATS_TX_RESOLV_PENDING);
			continue;
		} else if (err != 0 && (fr->fr_flags & FLOWRTF_HAS_LLINFO)) {
			/* use existing llinfo */
			FSW_STATS_INC(FSW_STATS_TX_RESOLV_STALE);
		} else if (err != 0) {
			KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt);
			pp_free_packet_single(pkt);
			FSW_STATS_INC(FSW_STATS_TX_RESOLV_FAIL);
			continue;
		}
	}

frame:
	/* apply link-layer framing to whatever survived resolution */
	KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) {
		if (fsw->fsw_frame != NULL) {
			fsw->fsw_frame(fsw, fr, pkt);
		}
	}

	return true;
}
2739 
static void
dp_listener_flow_tx_process(struct nx_flowswitch *fsw, struct flow_entry *fe)
{
#pragma unused(fsw)
	/*
	 * TX processing for a listener flow: a listener may only send TCP
	 * RST segments (to abort a tracked connection).  RSTs are fed to
	 * the flow tracker; anything else is logged and dropped.  All
	 * packets are consumed.
	 */
	struct __kern_packet *pkt, *tpkt;
	KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) {
		KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt);
		/* listener is only allowed TCP RST */
		if (pkt->pkt_flow_ip_proto == IPPROTO_TCP &&
		    (pkt->pkt_flow_tcp_flags & TH_RST) != 0) {
			flow_track_abort_tcp(fe, NULL, pkt);
		} else {
			char *addr;
			MD_BUFLET_ADDR_ABS(pkt, addr);
			SK_ERR("listener flow sends non-RST packet %s",
			    sk_dump(sk_proc_name_address(current_proc()),
			    addr, pkt->pkt_length, 128, NULL, 0));
		}
		pp_free_packet_single(pkt);
	}
}
2761 
2762 static void
fsw_update_timestamps(struct __kern_packet * pkt,volatile uint64_t * fg_ts,volatile uint64_t * rt_ts,ifnet_t ifp)2763 fsw_update_timestamps(struct __kern_packet *pkt, volatile uint64_t *fg_ts,
2764     volatile uint64_t *rt_ts, ifnet_t ifp)
2765 {
2766 	struct timespec now;
2767 	uint64_t now_nsec = 0;
2768 
2769 	if (!(pkt->pkt_pflags & PKT_F_TS_VALID) || pkt->pkt_timestamp == 0) {
2770 		nanouptime(&now);
2771 		net_timernsec(&now, &now_nsec);
2772 		pkt->pkt_timestamp = now_nsec;
2773 	}
2774 	pkt->pkt_pflags &= ~PKT_F_TS_VALID;
2775 
2776 	/*
2777 	 * If the packet service class is not background,
2778 	 * update the timestamps on the interface, as well as
2779 	 * the ones in nexus-wide advisory to indicate recent
2780 	 * activity on a foreground flow.
2781 	 */
2782 	if (!(pkt->pkt_pflags & PKT_F_BACKGROUND)) {
2783 		ifp->if_fg_sendts = (uint32_t)_net_uptime;
2784 		if (fg_ts != NULL) {
2785 			*fg_ts = _net_uptime;
2786 		}
2787 	}
2788 	if (pkt->pkt_pflags & PKT_F_REALTIME) {
2789 		ifp->if_rt_sendts = (uint32_t)_net_uptime;
2790 		if (rt_ts != NULL) {
2791 			*rt_ts = _net_uptime;
2792 		}
2793 	}
2794 }
2795 
2796 /*
2797  * TODO:
2798  * We can check the flow entry as well to only allow chain enqueue
2799  * on flows matching a certain criteria.
2800  */
2801 static bool
fsw_chain_enqueue_enabled(struct nx_flowswitch * fsw,struct flow_entry * fe)2802 fsw_chain_enqueue_enabled(struct nx_flowswitch *fsw, struct flow_entry *fe)
2803 {
2804 #pragma unused(fe)
2805 	return fsw_chain_enqueue != 0 &&
2806 	       fsw->fsw_ifp->if_output_netem == NULL &&
2807 	       (fsw->fsw_ifp->if_eflags & IFEF_ENQUEUE_MULTI) == 0 &&
2808 	       fe->fe_qset == NULL;
2809 }
2810 
/*
 * Process a batch of Tx packets queued on "fe": track each packet in
 * the flow, stamp flow/AQM metadata, finalize, optionally snoop, and
 * hand the batch to the interface (queue set or classq).  Packets that
 * fail tracking -- or the entire batch when the route is bad -- are
 * dropped via dp_drop_pktq().
 */
void
dp_flow_tx_process(struct nx_flowswitch *fsw, struct flow_entry *fe)
{
	struct pktq dropped_pkts;
	bool chain;
	uint32_t cnt = 0, bytes = 0;
	volatile struct sk_nexusadv *nxadv = NULL;
	volatile uint64_t *fg_ts = NULL;
	volatile uint64_t *rt_ts = NULL;
	uint8_t qset_idx = (fe->fe_qset != NULL) ? fe->fe_qset->nqs_idx : 0;

	KPKTQ_INIT(&dropped_pkts);
	ASSERT(!KPKTQ_EMPTY(&fe->fe_tx_pktq));
	/* listener flows take a dedicated (RST-only) path and return early */
	if (__improbable(fe->fe_flags & FLOWENTF_LISTENER)) {
		dp_listener_flow_tx_process(fsw, fe);
		return;
	}
	/* without a viable Tx route the whole queued batch is dropped */
	if (__improbable(!dp_flow_tx_route_process(fsw, fe))) {
		SK_RDERR(5, "Tx route bad");
		FSW_STATS_ADD(FSW_STATS_TX_FLOW_NONVIABLE,
		    KPKTQ_LEN(&fe->fe_tx_pktq));
		KPKTQ_CONCAT(&dropped_pkts, &fe->fe_tx_pktq);
		goto done;
	}
	/*
	 * When chain enqueue is possible, grab pointers into the nexus
	 * advisory so the per-packet timestamping below can also update
	 * the foreground/realtime send timestamps.
	 */
	chain = fsw_chain_enqueue_enabled(fsw, fe);
	if (chain) {
		nxadv = fsw->fsw_nx->nx_adv.flowswitch_nxv_adv;
		if (nxadv != NULL) {
			fg_ts = &nxadv->nxadv_fg_sendts;
			rt_ts = &nxadv->nxadv_rt_sendts;
		}
	}
	struct __kern_packet *pkt, *tpkt;
	KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) {
		int err = flow_pkt_track(fe, pkt, false);
		if (__improbable(err != 0)) {
			SK_RDERR(5, "flow_pkt_track failed (err %d)", err);
			FSW_STATS_INC(FSW_STATS_TX_FLOW_TRACK_ERR);
			KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt);
			KPKTQ_ENQUEUE(&dropped_pkts, pkt);
			continue;
		}

		/* stamp effective-process uuid and transport protocol */
		_UUID_COPY(pkt->pkt_policy_euuid, fe->fe_eproc_uuid);
		pkt->pkt_transport_protocol = fe->fe_transport_protocol;

		/* set AQM related values for outgoing packet */
		if (fe->fe_adv_idx != FLOWADV_IDX_NONE) {
			pkt->pkt_pflags |= PKT_F_FLOW_ADV;
			pkt->pkt_flowsrc_type = FLOWSRC_CHANNEL;
			pkt->pkt_flowsrc_fidx = fe->fe_adv_idx;
		} else {
			pkt->pkt_pflags &= ~PKT_F_FLOW_ADV;
		}
		/* replace the flow uuid with the (opaque) flow token */
		_UUID_CLEAR(pkt->pkt_flow_id);
		pkt->pkt_flow_token = fe->fe_flowid;
		pkt->pkt_pflags |= PKT_F_FLOW_ID;
		pkt->pkt_qset_idx = qset_idx;
		/*
		 * The same code is exercised per packet for the non-chain case
		 * (see ifnet_enqueue_ifclassq()). It's replicated here to avoid
		 * re-walking the chain later.
		 */
		if (chain) {
			fsw_update_timestamps(pkt, fg_ts, rt_ts, fsw->fsw_ifp);
		}
		/* mark packet tos/svc_class */
		fsw_qos_mark(fsw, fe, pkt);

		tx_finalize_packet(fsw, pkt);
		bytes += pkt->pkt_length;
		cnt++;
	}

	/* snoop after it's finalized */
	if (__improbable(pktap_total_tap_count != 0)) {
		fsw_snoop(fsw, fe, false);
	}
	if (fe->fe_qset != NULL) {
		classq_qset_enqueue_flow(fsw, fe, chain, cnt, bytes);
	} else {
		classq_enqueue_flow(fsw, fe, chain, cnt, bytes);
	}
done:
	dp_drop_pktq(fsw, &dropped_pkts);
}
2897 
2898 static struct flow_entry *
tx_process_continuous_ip_frag(struct nx_flowswitch * fsw,struct flow_entry * prev_fe,struct __kern_packet * pkt)2899 tx_process_continuous_ip_frag(struct nx_flowswitch *fsw,
2900     struct flow_entry *prev_fe, struct __kern_packet *pkt)
2901 {
2902 	ASSERT(!pkt->pkt_flow_ip_is_first_frag);
2903 
2904 	if (__improbable(pkt->pkt_flow_ip_frag_id == 0)) {
2905 		FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_ID);
2906 		SK_ERR("%s(%d) invalid zero fragment id",
2907 		    sk_proc_name_address(current_proc()),
2908 		    sk_proc_pid(current_proc()));
2909 		return NULL;
2910 	}
2911 
2912 	SK_DF(SK_VERB_FSW_DP | SK_VERB_TX,
2913 	    "%s(%d) continuation frag, id %u",
2914 	    sk_proc_name_address(current_proc()),
2915 	    sk_proc_pid(current_proc()),
2916 	    pkt->pkt_flow_ip_frag_id);
2917 	if (__improbable(prev_fe == NULL ||
2918 	    !prev_fe->fe_tx_is_cont_frag)) {
2919 		SK_ERR("%s(%d) unexpected continuation frag",
2920 		    sk_proc_name_address(current_proc()),
2921 		    sk_proc_pid(current_proc()),
2922 		    pkt->pkt_flow_ip_frag_id);
2923 		FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_CONT);
2924 		return NULL;
2925 	}
2926 	if (__improbable(pkt->pkt_flow_ip_frag_id !=
2927 	    prev_fe->fe_tx_frag_id)) {
2928 		FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_CONT);
2929 		SK_ERR("%s(%d) wrong continuation frag id %u expecting %u",
2930 		    sk_proc_name_address(current_proc()),
2931 		    sk_proc_pid(current_proc()),
2932 		    pkt->pkt_flow_ip_frag_id,
2933 		    prev_fe->fe_tx_frag_id);
2934 		return NULL;
2935 	}
2936 
2937 	return prev_fe;
2938 }
2939 
/*
 * Find the flow entry for an outbound packet.  "prev_fe" is the entry
 * used by the previous packet in the batch and is passed down to
 * lookup_flow_with_pkt() as a hint.
 *
 * Returns the matching entry, or NULL when the lookup fails, the entry
 * is being torn down, or the packet's flow id does not match the
 * entry's uuid.
 */
static struct flow_entry *
tx_lookup_flow(struct nx_flowswitch *fsw, struct __kern_packet *pkt,
    struct flow_entry *prev_fe)
{
	struct flow_entry *fe;

	fe = lookup_flow_with_pkt(fsw, pkt, false, prev_fe);
	if (__improbable(fe == NULL)) {
		goto done;
	}

	/* entry found but no longer usable; drop the lookup's reference */
	if (__improbable(fe->fe_flags & FLOWENTF_TORN_DOWN)) {
		SK_RDERR(5, "Tx flow torn down");
		FSW_STATS_INC(FSW_STATS_TX_FLOW_TORNDOWN);
		flow_entry_release(&fe);
		goto done;
	}

	_FSW_INJECT_ERROR(34, pkt->pkt_flow_id[0], fe->fe_uuid[0] + 1,
	    null_func);

	/*
	 * The packet must carry the uuid of the flow it claims to belong
	 * to.  On mismatch the entry is released; flow_entry_release()
	 * takes &fe, presumably clearing it so NULL is returned below --
	 * NOTE(review): confirm it NULLs the pointer.
	 */
	if (__improbable(!_UUID_MATCH(pkt->pkt_flow_id, fe->fe_uuid))) {
		uuid_string_t flow_id_str, pkt_id_str;
		sk_uuid_unparse(fe->fe_uuid, flow_id_str);
		sk_uuid_unparse(pkt->pkt_flow_id, pkt_id_str);
		SK_ERR("pkt flow id %s != flow id %s", pkt_id_str, flow_id_str);
		flow_entry_release(&fe);
		FSW_STATS_INC(FSW_STATS_TX_FLOW_BAD_ID);
	}

done:
	return fe;
}
2973 
/*
 * Hand a flow entry's queued Tx packets to its per-flow processor
 * (fe_tx_process) and tear down the now-consumed queue.  The queue
 * must be non-empty on entry.
 */
static inline void
tx_flow_process(struct nx_flowswitch *fsw, struct flow_entry *fe)
{
	ASSERT(!KPKTQ_EMPTY(&fe->fe_tx_pktq));
	ASSERT(KPKTQ_LEN(&fe->fe_tx_pktq) != 0);

	SK_DF(SK_VERB_FSW_DP | SK_VERB_TX, "TX %d pkts from fe %p port %d",
	    KPKTQ_LEN(&fe->fe_tx_pktq), fe, fe->fe_nx_port);

	/* flow related processing (default, agg, etc.) */
	fe->fe_tx_process(fsw, fe);

	KPKTQ_FINI(&fe->fe_tx_pktq);
}
2988 
#if SK_LOG
/*
 * Log a Tx packet: process name/pid, caller-supplied description, and
 * a dump (capped at 128 bytes) of the packet's first buflet.  Compiles
 * to nothing when SK_LOG is disabled.
 */
static void
dp_tx_log_pkt(uint64_t verb, char *desc, struct __kern_packet *pkt)
{
	char *pkt_buf;
	MD_BUFLET_ADDR_ABS(pkt, pkt_buf);
	SK_DF(verb, "%s(%d) %s %s", sk_proc_name_address(current_proc()),
	    sk_proc_pid(current_proc()), desc, sk_dump("buf", pkt_buf,
	    pkt->pkt_length, 128, NULL, 0));
}
#else /* !SK_LOG */
#define dp_tx_log_pkt(...)
#endif /* !SK_LOG */
3002 
/*
 * Tx datapath for a batch of source packets from a user channel: copy
 * each packet into the device pool, demux and classify it, find its
 * flow entry (with special handling for IP fragment trains), batch the
 * packets per flow, process each flow, and kick the interface once.
 * Unmatched or failed packets are dropped via dp_drop_pktq().
 */
static void
dp_tx_pktq(struct nx_flowswitch *fsw, struct pktq *spktq)
{
	struct __kern_packet *spkt, *pkt;
	struct flow_entry_list fes = TAILQ_HEAD_INITIALIZER(fes);
	struct flow_entry *fe, *prev_fe;
	struct pktq dropped_pkts, dpktq;
	struct nexus_adapter *dev_na;
	struct kern_pbufpool *dev_pp;
	struct ifnet *ifp;
	sa_family_t af;
	uint32_t n_pkts, n_flows = 0;

	int err;
	KPKTQ_INIT(&dpktq);
	KPKTQ_INIT(&dropped_pkts);
	n_pkts = KPKTQ_LEN(spktq);

	FSW_RLOCK(fsw);
	/* a quiesced (detaching) flowswitch drops the whole batch */
	if (__improbable(FSW_QUIESCED(fsw))) {
		DTRACE_SKYWALK1(tx__quiesced, struct nx_flowswitch *, fsw);
		SK_ERR("flowswitch detached, dropping %d pkts", n_pkts);
		KPKTQ_CONCAT(&dropped_pkts, spktq);
		goto done;
	}
	dev_na = fsw->fsw_dev_ch->ch_na;
	if (__improbable(dev_na == NULL)) {
		SK_ERR("dev port not attached, dropping %d pkts", n_pkts);
		FSW_STATS_ADD(FSW_STATS_DST_NXPORT_INACTIVE, n_pkts);
		KPKTQ_CONCAT(&dropped_pkts, spktq);
		goto done;
	}
	/*
	 * fsw_ifp should still be valid at this point. If fsw is detached
	 * after fsw_lock is released, this ifp will remain valid and
	 * netif_transmit() will behave properly even if the ifp is in
	 * detached state.
	 */
	ifp = fsw->fsw_ifp;

	/* batch allocate enough packets */
	dev_pp = na_kr_get_pp(dev_na, NR_TX);

	err = pp_alloc_pktq(dev_pp, dev_pp->pp_max_frags, &dpktq, n_pkts, NULL,
	    NULL, SKMEM_NOSLEEP);
#if DEVELOPMENT || DEBUG
	if (__probable(err != ENOMEM)) {
		_FSW_INJECT_ERROR(12, err, ENOMEM, pp_free_pktq, &dpktq);
	}
#endif /* DEVELOPMENT || DEBUG */
	/* total allocation failure drops the batch; partial (EAGAIN) proceeds */
	if (__improbable(err == ENOMEM)) {
		ASSERT(KPKTQ_EMPTY(&dpktq));
		KPKTQ_CONCAT(&dropped_pkts, spktq);
		FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT, n_pkts);
		SK_ERR("failed to alloc %u pkts from device pool", n_pkts);
		goto done;
	} else if (__improbable(err == EAGAIN)) {
		FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT,
		    (n_pkts - KPKTQ_LEN(&dpktq)));
		FSW_STATS_ADD(FSW_STATS_DROP,
		    (n_pkts - KPKTQ_LEN(&dpktq)));
	}

	/* walk only as many source packets as we got device packets for */
	n_pkts = KPKTQ_LEN(&dpktq);
	prev_fe = NULL;
	KPKTQ_FOREACH(spkt, spktq) {
		if (n_pkts == 0) {
			break;
		}
		--n_pkts;

		KPKTQ_DEQUEUE(&dpktq, pkt);
		ASSERT(pkt != NULL);
		err = dp_copy_to_dev(fsw, spkt, pkt);
		if (__improbable(err != 0)) {
			KPKTQ_ENQUEUE(&dropped_pkts, pkt);
			continue;
		}

		/* determine address family; AF_UNSPEC means demux failed */
		af = fsw_ip_demux(fsw, pkt);
		if (__improbable(af == AF_UNSPEC)) {
			dp_tx_log_pkt(SK_VERB_ERROR, "demux err", pkt);
			FSW_STATS_INC(FSW_STATS_TX_DEMUX_ERR);
			KPKTQ_ENQUEUE(&dropped_pkts, pkt);
			continue;
		}

		err = flow_pkt_classify(pkt, ifp, af, false);
		if (__improbable(err != 0)) {
			dp_tx_log_pkt(SK_VERB_ERROR, "flow extract err", pkt);
			FSW_STATS_INC(FSW_STATS_TX_FLOW_EXTRACT_ERR);
			KPKTQ_ENQUEUE(&dropped_pkts, pkt);
			continue;
		}

		/*
		 * Non-first IP fragments carry no flow tuple; they must
		 * follow the flow used by the preceding fragment (prev_fe).
		 */
		if (__improbable(pkt->pkt_flow_ip_is_frag &&
		    !pkt->pkt_flow_ip_is_first_frag)) {
			fe = tx_process_continuous_ip_frag(fsw, prev_fe, pkt);
			if (__probable(fe != NULL)) {
				flow_entry_retain(fe);
				goto flow_batch;
			} else {
				FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_CONT);
				KPKTQ_ENQUEUE(&dropped_pkts, pkt);
				continue;
			}
		}

		fe = tx_lookup_flow(fsw, pkt, prev_fe);
		if (__improbable(fe == NULL)) {
			FSW_STATS_INC(FSW_STATS_TX_FLOW_NOT_FOUND);
			KPKTQ_ENQUEUE(&dropped_pkts, pkt);
			prev_fe = NULL;
			continue;
		}
flow_batch:
		/* queue the packet under its flow entry for batched processing */
		tx_flow_batch_packet(&fes, fe, pkt);
		prev_fe = fe;
	}

	/* process each flow that accumulated packets, then release it */
	struct flow_entry *tfe = NULL;
	TAILQ_FOREACH_SAFE(fe, &fes, fe_tx_link, tfe) {
		tx_flow_process(fsw, fe);
		TAILQ_REMOVE(&fes, fe, fe_tx_link);
		fe->fe_tx_is_cont_frag = false;
		fe->fe_tx_frag_id = 0;
		flow_entry_release(&fe);
		n_flows++;
	}

done:
	FSW_RUNLOCK(fsw);
	/* n_flows > 0 implies "ifp" was assigned above */
	if (n_flows > 0) {
		netif_transmit(ifp, NETIF_XMIT_FLAG_CHANNEL);
	}
	dp_drop_pktq(fsw, &dropped_pkts);
	KPKTQ_FINI(&dropped_pkts);
	KPKTQ_FINI(&dpktq);
}
3142 
/*
 * Flush a device (inbound) ring: repeatedly dequeue up to fsw_rx_batch
 * packets and feed each batch into the flowswitch receive path -- or
 * into the netem input stage when one is configured -- until the ring
 * is drained.  Totals are traced and reported to the netif mitigation
 * callback when present.
 */
static inline void
fsw_dev_ring_flush(struct nx_flowswitch *fsw, struct __kern_channel_ring *r,
    struct proc *p)
{
#pragma unused(p)
	uint32_t total_pkts = 0, total_bytes = 0;

	for (;;) {
		struct pktq pktq;
		KPKTQ_INIT(&pktq);
		uint32_t n_bytes;
		fsw_ring_dequeue_pktq(fsw, r, fsw_rx_batch, &pktq, &n_bytes);
		/* zero bytes dequeued means the ring is drained */
		if (n_bytes == 0) {
			break;
		}
		total_pkts += KPKTQ_LEN(&pktq);
		total_bytes += n_bytes;

		if (__probable(fsw->fsw_ifp->if_input_netem == NULL)) {
			fsw_receive(fsw, &pktq);
		} else {
			fsw_dev_input_netem_enqueue(fsw, &pktq);
		}
		KPKTQ_FINI(&pktq);
	}

	KDBG(SK_KTRACE_FSW_DEV_RING_FLUSH, SK_KVA(r), total_pkts, total_bytes);
	DTRACE_SKYWALK2(fsw__dp__dev__ring__flush, uint32_t, total_pkts,
	    uint32_t, total_bytes);

	/* compute mitigation rate for delivered traffic */
	if (__probable(r->ckr_netif_mit_stats != NULL)) {
		r->ckr_netif_mit_stats(r, total_pkts, total_bytes);
	}
}
3178 
/*
 * Flush a user (Tx) ring: repeatedly dequeue up to fsw_tx_batch
 * packets, tag the first packet of each batch with a trace id for
 * ktrace correlation, run the batch through the Tx datapath, and free
 * whatever the datapath left behind.  Ring stats are updated once at
 * the end.
 */
static inline void
fsw_user_ring_flush(struct nx_flowswitch *fsw, struct __kern_channel_ring *r,
    struct proc *p)
{
#pragma unused(p)
	/* NOTE(review): non-atomic static shared across rings; ids may
	 * collide under concurrent flushes -- trace-only, confirm benign. */
	static packet_trace_id_t trace_id = 0;
	uint32_t total_pkts = 0, total_bytes = 0;

	for (;;) {
		struct pktq pktq;
		KPKTQ_INIT(&pktq);
		uint32_t n_bytes;
		fsw_ring_dequeue_pktq(fsw, r, fsw_tx_batch, &pktq, &n_bytes);
		/* zero bytes dequeued means the ring is drained */
		if (n_bytes == 0) {
			break;
		}
		total_pkts += KPKTQ_LEN(&pktq);
		total_bytes += n_bytes;

		KPKTQ_FIRST(&pktq)->pkt_trace_id = ++trace_id;
		KDBG(SK_KTRACE_PKT_TX_FSW | DBG_FUNC_START, KPKTQ_FIRST(&pktq)->pkt_trace_id);

		dp_tx_pktq(fsw, &pktq);
		dp_free_pktq(fsw, &pktq);
		KPKTQ_FINI(&pktq);
	}

	kr_update_stats(r, total_pkts, total_bytes);

	KDBG(SK_KTRACE_FSW_USER_RING_FLUSH, SK_KVA(r), total_pkts, total_bytes);
	DTRACE_SKYWALK2(fsw__dp__user__ring__flush, uint32_t, total_pkts,
	    uint32_t, total_bytes);
}
3212 
3213 void
fsw_ring_flush(struct nx_flowswitch * fsw,struct __kern_channel_ring * r,struct proc * p)3214 fsw_ring_flush(struct nx_flowswitch *fsw, struct __kern_channel_ring *r,
3215     struct proc *p)
3216 {
3217 	struct nexus_vp_adapter *vpna = VPNA(KRNA(r));
3218 
3219 	ASSERT(sk_is_sync_protected());
3220 	ASSERT(vpna->vpna_nx_port != FSW_VP_HOST);
3221 	ASSERT(vpna->vpna_up.na_md_type == NEXUS_META_TYPE_PACKET);
3222 
3223 	if (vpna->vpna_nx_port == FSW_VP_DEV) {
3224 		fsw_dev_ring_flush(fsw, r, p);
3225 	} else {
3226 		fsw_user_ring_flush(fsw, r, p);
3227 	}
3228 }
3229 
3230 int
fsw_dp_ctor(struct nx_flowswitch * fsw)3231 fsw_dp_ctor(struct nx_flowswitch *fsw)
3232 {
3233 	uint32_t fe_cnt = fsw_fe_table_size;
3234 	uint32_t fob_cnt = fsw_flow_owner_buckets;
3235 	uint32_t frb_cnt = fsw_flow_route_buckets;
3236 	uint32_t frib_cnt = fsw_flow_route_id_buckets;
3237 	struct kern_nexus *nx = fsw->fsw_nx;
3238 	char name[64];
3239 	int error = 0;
3240 
3241 	/* just in case */
3242 	if (fe_cnt == 0) {
3243 		fe_cnt = NX_FSW_FE_TABLESZ;
3244 		ASSERT(fe_cnt != 0);
3245 	}
3246 	if (fob_cnt == 0) {
3247 		fob_cnt = NX_FSW_FOB_HASHSZ;
3248 		ASSERT(fob_cnt != 0);
3249 	}
3250 	if (frb_cnt == 0) {
3251 		frb_cnt = NX_FSW_FRB_HASHSZ;
3252 		ASSERT(frb_cnt != 0);
3253 	}
3254 	if (frib_cnt == 0) {
3255 		frib_cnt = NX_FSW_FRIB_HASHSZ;
3256 		ASSERT(frib_cnt != 0);
3257 	}
3258 
3259 	/* make sure fe_cnt is a power of two, else round up */
3260 	if ((fe_cnt & (fe_cnt - 1)) != 0) {
3261 		fe_cnt--;
3262 		fe_cnt |= (fe_cnt >> 1);
3263 		fe_cnt |= (fe_cnt >> 2);
3264 		fe_cnt |= (fe_cnt >> 4);
3265 		fe_cnt |= (fe_cnt >> 8);
3266 		fe_cnt |= (fe_cnt >> 16);
3267 		fe_cnt++;
3268 	}
3269 
3270 	/* make sure frb_cnt is a power of two, else round up */
3271 	if ((frb_cnt & (frb_cnt - 1)) != 0) {
3272 		frb_cnt--;
3273 		frb_cnt |= (frb_cnt >> 1);
3274 		frb_cnt |= (frb_cnt >> 2);
3275 		frb_cnt |= (frb_cnt >> 4);
3276 		frb_cnt |= (frb_cnt >> 8);
3277 		frb_cnt |= (frb_cnt >> 16);
3278 		frb_cnt++;
3279 	}
3280 
3281 	lck_mtx_init(&fsw->fsw_detach_barrier_lock, &nexus_lock_group,
3282 	    &nexus_lock_attr);
3283 	lck_mtx_init(&fsw->fsw_reap_lock, &nexus_lock_group, &nexus_lock_attr);
3284 	lck_mtx_init(&fsw->fsw_linger_lock, &nexus_lock_group, &nexus_lock_attr);
3285 	TAILQ_INIT(&fsw->fsw_linger_head);
3286 
3287 	(void) snprintf(name, sizeof(name), "%s_%llu", NX_FSW_NAME, nx->nx_id);
3288 	error = nx_advisory_alloc(nx, name,
3289 	    &NX_PROV(nx)->nxprov_region_params[SKMEM_REGION_NEXUSADV],
3290 	    NEXUS_ADVISORY_TYPE_FLOWSWITCH);
3291 	if (error != 0) {
3292 		fsw_dp_dtor(fsw);
3293 		return error;
3294 	}
3295 
3296 	fsw->fsw_flow_mgr = flow_mgr_create(fe_cnt, fob_cnt, frb_cnt, frib_cnt);
3297 	if (fsw->fsw_flow_mgr == NULL) {
3298 		fsw_dp_dtor(fsw);
3299 		return error;
3300 	}
3301 
3302 	/* generic name; will be customized upon ifattach */
3303 	(void) snprintf(fsw->fsw_reap_name, sizeof(fsw->fsw_reap_name),
3304 	    FSW_REAP_THREADNAME, name, "");
3305 
3306 	if (kernel_thread_start(fsw_reap_thread_func, fsw,
3307 	    &fsw->fsw_reap_thread) != KERN_SUCCESS) {
3308 		panic_plain("%s: can't create thread", __func__);
3309 		/* NOTREACHED */
3310 		__builtin_unreachable();
3311 	}
3312 	/* this must not fail */
3313 	VERIFY(fsw->fsw_reap_thread != NULL);
3314 
3315 	SK_DF(SK_VERB_MEM, "fsw 0x%llx ALLOC", SK_KVA(fsw));
3316 
3317 
3318 	return error;
3319 }
3320 
3321 void
fsw_dp_dtor(struct nx_flowswitch * fsw)3322 fsw_dp_dtor(struct nx_flowswitch *fsw)
3323 {
3324 	uint64_t f = (1 * NSEC_PER_MSEC);         /* 1 ms */
3325 	uint64_t s = (1000 * NSEC_PER_SEC);         /* 1 sec */
3326 	uint32_t i = 0;
3327 
3328 #if (DEVELOPMENT || DEBUG)
3329 	if (fsw->fsw_rps_threads != NULL) {
3330 		for (i = 0; i < fsw->fsw_rps_nthreads; i++) {
3331 			fsw_rps_thread_join(fsw, i);
3332 		}
3333 		kfree_type(struct fsw_rps_thread, fsw->fsw_rps_threads);
3334 	}
3335 #endif /* !DEVELOPMENT && !DEBUG */
3336 
3337 	nx_advisory_free(fsw->fsw_nx);
3338 
3339 	if (fsw->fsw_reap_thread != THREAD_NULL) {
3340 		/* signal thread to begin self-termination */
3341 		lck_mtx_lock(&fsw->fsw_reap_lock);
3342 		fsw->fsw_reap_flags |= FSW_REAPF_TERMINATING;
3343 
3344 		/*
3345 		 * And wait for thread to terminate; use another
3346 		 * wait channel here other than fsw_reap_flags to
3347 		 * make it more explicit.  In the event the reaper
3348 		 * thread misses a wakeup, we'll try again once
3349 		 * every second (except for the first time).
3350 		 */
3351 		while (!(fsw->fsw_reap_flags & FSW_REAPF_TERMINATED)) {
3352 			uint64_t t = 0;
3353 
3354 			nanoseconds_to_absolutetime((i++ == 0) ? f : s, &t);
3355 			clock_absolutetime_interval_to_deadline(t, &t);
3356 			ASSERT(t != 0);
3357 
3358 			fsw->fsw_reap_flags |= FSW_REAPF_TERMINATEBLOCK;
3359 			if (!(fsw->fsw_reap_flags & FSW_REAPF_RUNNING)) {
3360 				thread_wakeup((caddr_t)&fsw->fsw_reap_flags);
3361 			}
3362 			(void) assert_wait_deadline(&fsw->fsw_reap_thread,
3363 			    THREAD_UNINT, t);
3364 			lck_mtx_unlock(&fsw->fsw_reap_lock);
3365 			thread_block(THREAD_CONTINUE_NULL);
3366 			lck_mtx_lock(&fsw->fsw_reap_lock);
3367 			fsw->fsw_reap_flags &= ~FSW_REAPF_TERMINATEBLOCK;
3368 		}
3369 		ASSERT(fsw->fsw_reap_flags & FSW_REAPF_TERMINATED);
3370 		lck_mtx_unlock(&fsw->fsw_reap_lock);
3371 		fsw->fsw_reap_thread = THREAD_NULL;
3372 	}
3373 
3374 	/* free any remaining flow entries in the linger list */
3375 	fsw_linger_purge(fsw);
3376 
3377 	if (fsw->fsw_flow_mgr != NULL) {
3378 		flow_mgr_destroy(fsw->fsw_flow_mgr);
3379 		fsw->fsw_flow_mgr = NULL;
3380 	}
3381 
3382 
3383 	lck_mtx_destroy(&fsw->fsw_detach_barrier_lock, &nexus_lock_group);
3384 	lck_mtx_destroy(&fsw->fsw_reap_lock, &nexus_lock_group);
3385 	lck_mtx_destroy(&fsw->fsw_linger_lock, &nexus_lock_group);
3386 }
3387 
/*
 * Place a fully torn-down flow entry on the flowswitch's linger list,
 * where it is held (along with the caller's reference) until its
 * linger period expires; the reaper thread is then scheduled to
 * collect it.  The entry must be torn down, destroyed, waiting for
 * close, and not already lingering.
 */
void
fsw_linger_insert(struct flow_entry *fe)
{
	struct nx_flowswitch *fsw = fe->fe_fsw;
	SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]);
	SK_DF(SK_VERB_FLOW, "entry \"%s\" fe 0x%llx flags 0x%b",
	    fe_as_string(fe, dbgbuf, sizeof(dbgbuf)), SK_KVA(fe),
	    fe->fe_flags, FLOWENTF_BITS);

	/* refresh _net_uptime before computing the expiry below */
	net_update_uptime();

	ASSERT(flow_entry_refcnt(fe) >= 1);
	ASSERT(fe->fe_flags & FLOWENTF_TORN_DOWN);
	ASSERT(fe->fe_flags & FLOWENTF_DESTROYED);
	ASSERT(!(fe->fe_flags & FLOWENTF_LINGERING));
	ASSERT(fe->fe_flags & FLOWENTF_WAIT_CLOSE);
	ASSERT(fe->fe_linger_wait != 0);
	fe->fe_linger_expire = (_net_uptime + fe->fe_linger_wait);
	atomic_bitset_32(&fe->fe_flags, FLOWENTF_LINGERING);

	lck_mtx_lock_spin(&fsw->fsw_linger_lock);
	TAILQ_INSERT_TAIL(&fsw->fsw_linger_head, fe, fe_linger_link);
	fsw->fsw_linger_cnt++;
	VERIFY(fsw->fsw_linger_cnt != 0);
	lck_mtx_unlock(&fsw->fsw_linger_lock);

	fsw_reap_sched(fsw);
}
3416 
/*
 * Unlink "fe" from the given linger list, clear its LINGERING flag,
 * and drop the reference the list held.  The caller is responsible
 * for the list's count and locking.
 */
static void
fsw_linger_remove_internal(struct flow_entry_linger_head *linger_head,
    struct flow_entry *fe)
{
	SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]);
	SK_DF(SK_VERB_FLOW, "entry \"%s\" fe 0x%llx flags 0x%b",
	    fe_as_string(fe, dbgbuf, sizeof(dbgbuf)), SK_KVA(fe),
	    fe->fe_flags, FLOWENTF_BITS);

	ASSERT(fe->fe_flags & FLOWENTF_TORN_DOWN);
	ASSERT(fe->fe_flags & FLOWENTF_DESTROYED);
	ASSERT(fe->fe_flags & FLOWENTF_LINGERING);
	atomic_bitclear_32(&fe->fe_flags, FLOWENTF_LINGERING);

	TAILQ_REMOVE(linger_head, fe, fe_linger_link);
	flow_entry_release(&fe);
}
3434 
/*
 * Remove "fe" from its flowswitch's linger list and decrement the
 * count.  The linger lock must be held by the caller.
 */
static void
fsw_linger_remove(struct flow_entry *fe)
{
	struct nx_flowswitch *fsw = fe->fe_fsw;

	LCK_MTX_ASSERT(&fsw->fsw_linger_lock, LCK_MTX_ASSERT_OWNED);

	fsw_linger_remove_internal(&fsw->fsw_linger_head, fe);
	/* underflow check before the decrement */
	VERIFY(fsw->fsw_linger_cnt != 0);
	fsw->fsw_linger_cnt--;
}
3446 
/*
 * Drop every entry on the flowswitch's linger list regardless of
 * expiry; called at teardown (see fsw_dp_dtor).
 */
void
fsw_linger_purge(struct nx_flowswitch *fsw)
{
	struct flow_entry *fe, *tfe;

	lck_mtx_lock(&fsw->fsw_linger_lock);
	TAILQ_FOREACH_SAFE(fe, &fsw->fsw_linger_head, fe_linger_link, tfe) {
		fsw_linger_remove(fe);
	}
	ASSERT(fsw->fsw_linger_cnt == 0);
	ASSERT(TAILQ_EMPTY(&fsw->fsw_linger_head));
	lck_mtx_unlock(&fsw->fsw_linger_lock);
}
3460 
3461 void
fsw_reap_sched(struct nx_flowswitch * fsw)3462 fsw_reap_sched(struct nx_flowswitch *fsw)
3463 {
3464 	ASSERT(fsw->fsw_reap_thread != THREAD_NULL);
3465 	lck_mtx_lock_spin(&fsw->fsw_reap_lock);
3466 	if (!(fsw->fsw_reap_flags & FSW_REAPF_RUNNING) &&
3467 	    !(fsw->fsw_reap_flags & (FSW_REAPF_TERMINATING | FSW_REAPF_TERMINATED))) {
3468 		thread_wakeup((caddr_t)&fsw->fsw_reap_flags);
3469 	}
3470 	lck_mtx_unlock(&fsw->fsw_reap_lock);
3471 }
3472 
/*
 * Bootstrap for the reaper thread: name the thread, then park on the
 * fsw_reap_flags wait channel and continue in fsw_reap_thread_cont()
 * once woken (see fsw_reap_sched).  Never returns.
 */
__attribute__((noreturn))
static void
fsw_reap_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	struct nx_flowswitch *fsw = v;

	ASSERT(fsw->fsw_reap_thread == current_thread());
	thread_set_thread_name(current_thread(), fsw->fsw_reap_name);

	net_update_uptime();

	lck_mtx_lock(&fsw->fsw_reap_lock);
	VERIFY(!(fsw->fsw_reap_flags & FSW_REAPF_RUNNING));
	(void) assert_wait(&fsw->fsw_reap_flags, THREAD_UNINT);
	lck_mtx_unlock(&fsw->fsw_reap_lock);
	thread_block_parameter(fsw_reap_thread_cont, fsw);
	/* NOTREACHED */
	__builtin_unreachable();
}
3493 
/*
 * Body/continuation of the reaper thread.  Each wakeup runs the reap
 * passes (deferred flow requests, expired lingering flows, idle flow
 * routes, flow-table shrink), then opportunistically prunes or purges
 * caches, channels and the netif -- each rate-limited by its own
 * threshold -- before blocking again with itself as the continuation.
 * Terminates (and self-destructs) when FSW_REAPF_TERMINATING is set
 * by fsw_dp_dtor().  Never returns.
 */
__attribute__((noreturn))
static void
fsw_reap_thread_cont(void *v, wait_result_t wres)
{
	struct nx_flowswitch *fsw = v;
	boolean_t low;
	uint64_t t = 0;

	SK_DF(SK_VERB_FLOW, "%s: running", fsw->fsw_reap_name);

	lck_mtx_lock(&fsw->fsw_reap_lock);
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (fsw->fsw_reap_flags & FSW_REAPF_TERMINATING) != 0)) {
		goto terminate;
	}

	ASSERT(!(fsw->fsw_reap_flags & FSW_REAPF_TERMINATED));
	fsw->fsw_reap_flags |= FSW_REAPF_RUNNING;
	lck_mtx_unlock(&fsw->fsw_reap_lock);

	net_update_uptime();

	/* prevent detach from happening while we're here */
	if (!fsw_detach_barrier_add(fsw)) {
		SK_ERR("%s: netagent detached", fsw->fsw_reap_name);
		t = 0;
	} else {
		uint32_t fe_nonviable, fe_freed, fe_aborted;
		uint32_t fr_freed, fr_resid = 0;
		struct ifnet *ifp = fsw->fsw_ifp;
		uint64_t i = FSW_REAP_IVAL;
		uint64_t now = _net_uptime;
		uint64_t last;

		ASSERT(fsw->fsw_ifp != NULL);

		/*
		 * Pass 1: process any deferred {withdrawn,nonviable} requests.
		 */
		fe_nonviable = fsw_process_deferred(fsw);

		/*
		 * Pass 2: remove any expired lingering flows.
		 */
		fe_freed = fsw_process_linger(fsw, &fe_aborted);

		/*
		 * Pass 3: prune idle flow routes.
		 */
		fr_freed = flow_route_prune(fsw->fsw_flow_mgr,
		    ifp, &fr_resid);

		/*
		 * Pass 4: prune flow table
		 *
		 */
		cuckoo_hashtable_try_shrink(fsw->fsw_flow_mgr->fm_flow_table);

		/*
		 * NOTE(review): the last two arguments ("fr_freed %u/%u")
		 * pass fe_freed, not fr_freed -- looks like a copy/paste
		 * slip in this debug log; confirm intended values.
		 */
		SK_DF(SK_VERB_FLOW, "%s: fe_nonviable %u/%u fe_freed %u/%u "
		    "fe_aborted %u fr_freed %u/%u",
		    fsw->fsw_flow_mgr->fm_name, fe_nonviable,
		    (fe_nonviable + fsw->fsw_pending_nonviable),
		    fe_freed, fsw->fsw_linger_cnt, fe_aborted, fe_freed,
		    (fe_freed + fr_resid));

		/* see if VM memory level is critical */
		low = skmem_lowmem_check();

		/*
		 * If things appear to be idle, we can prune away cached
		 * object that have fallen out of the working sets (this
		 * is different than purging).  Every once in a while, we
		 * also purge the caches.  Note that this is done across
		 * all flowswitch instances, and so we limit this to no
		 * more than once every FSW_REAP_SK_THRES seconds.
		 */
		atomic_get_64(last, &fsw_reap_last);
		if ((low || (last != 0 && (now - last) >= FSW_REAP_SK_THRES)) &&
		    atomic_test_set_64(&fsw_reap_last, last, now)) {
			fsw_purge_cache(fsw, low);

			/* increase sleep interval if idle */
			if (kdebug_enable == 0 && fsw->fsw_linger_cnt == 0 &&
			    fsw->fsw_pending_nonviable == 0 && fr_resid == 0) {
				i <<= 3;
			}
		} else if (last == 0) {
			atomic_set_64(&fsw_reap_last, now);
		}

		/*
		 * Additionally, run thru the list of channels and prune
		 * or purge away cached objects on "idle" channels.  This
		 * check is rate limited to no more than once every
		 * FSW_DRAIN_CH_THRES seconds.
		 */
		last = fsw->fsw_drain_channel_chk_last;
		if (low || (last != 0 && (now - last) >= FSW_DRAIN_CH_THRES)) {
			SK_DF(SK_VERB_FLOW, "%s: pruning channels",
			    fsw->fsw_flow_mgr->fm_name);

			fsw->fsw_drain_channel_chk_last = now;
			fsw_drain_channels(fsw, now, low);
		} else if (__improbable(last == 0)) {
			fsw->fsw_drain_channel_chk_last = now;
		}

		/*
		 * Finally, invoke the interface's reap callback to
		 * tell it to prune or purge away cached objects if
		 * it is idle.  This check is rate limited to no more
		 * than once every FSW_REAP_IF_THRES seconds.
		 */
		last = fsw->fsw_drain_netif_chk_last;
		if (low || (last != 0 && (now - last) >= FSW_REAP_IF_THRES)) {
			ASSERT(fsw->fsw_nifna != NULL);

			if (ifp->if_na_ops != NULL &&
			    ifp->if_na_ops->ni_reap != NULL) {
				SK_DF(SK_VERB_FLOW, "%s: pruning netif",
				    fsw->fsw_flow_mgr->fm_name);
				ifp->if_na_ops->ni_reap(ifp->if_na, ifp,
				    FSW_REAP_IF_THRES, low);
			}

			fsw->fsw_drain_netif_chk_last = now;
		} else if (__improbable(last == 0)) {
			fsw->fsw_drain_netif_chk_last = now;
		}

		/* emit periodic interface stats ktrace */
		last = fsw->fsw_reap_last;
		if (last != 0 && (now - last) >= FSW_IFSTATS_THRES) {
			KDBG(SK_KTRACE_AON_IF_STATS, ifp->if_data.ifi_ipackets,
			    ifp->if_data.ifi_ibytes * 8,
			    ifp->if_data.ifi_opackets,
			    ifp->if_data.ifi_obytes * 8);

			fsw->fsw_reap_last = now;
		} else if (__improbable(last == 0)) {
			fsw->fsw_reap_last = now;
		}

		/* compute the deadline for the next wakeup ("i" seconds out) */
		nanoseconds_to_absolutetime(i * NSEC_PER_SEC, &t);
		clock_absolutetime_interval_to_deadline(t, &t);
		ASSERT(t != 0);

		/* allow any pending detach to proceed */
		fsw_detach_barrier_remove(fsw);
	}

	lck_mtx_lock(&fsw->fsw_reap_lock);
	if (!(fsw->fsw_reap_flags & FSW_REAPF_TERMINATING)) {
		fsw->fsw_reap_flags &= ~FSW_REAPF_RUNNING;
		(void) assert_wait_deadline(&fsw->fsw_reap_flags,
		    THREAD_UNINT, t);
		lck_mtx_unlock(&fsw->fsw_reap_lock);
		thread_block_parameter(fsw_reap_thread_cont, fsw);
		/* NOTREACHED */
		__builtin_unreachable();
	} else {
terminate:
		LCK_MTX_ASSERT(&fsw->fsw_reap_lock, LCK_MTX_ASSERT_OWNED);
		fsw->fsw_reap_flags &= ~(FSW_REAPF_RUNNING | FSW_REAPF_TERMINATING);
		fsw->fsw_reap_flags |= FSW_REAPF_TERMINATED;
		/*
		 * And signal any thread waiting for us to terminate;
		 * wait channel here other than fsw_reap_flags to make
		 * it more explicit.
		 */
		if (fsw->fsw_reap_flags & FSW_REAPF_TERMINATEBLOCK) {
			thread_wakeup((caddr_t)&fsw->fsw_reap_thread);
		}
		lck_mtx_unlock(&fsw->fsw_reap_lock);

		SK_DF(SK_VERB_FLOW, "%s: terminating", fsw->fsw_reap_name);

		/* for the extra refcnt from kernel_thread_start() */
		thread_deallocate(current_thread());
		/* this is the end */
		thread_terminate(current_thread());
		/* NOTREACHED */
		__builtin_unreachable();
	}

	/* must never get here */
	VERIFY(0);
	/* NOTREACHED */
	__builtin_unreachable();
}
3684 
/*
 * Walk all nexus ports of the flowswitch and prune/purge per-adapter
 * caches for channels that have been idle for at least FSW_DRAIN_CH_THRES.
 * "now" is the caller-sampled timestamp compared against each adapter's
 * na_work_ts; "low" indicates memory pressure and forces a full purge.
 */
static void
fsw_drain_channels(struct nx_flowswitch *fsw, uint64_t now, boolean_t low)
{
	struct kern_nexus *nx = fsw->fsw_nx;

	/* flowswitch protects NA via fsw_lock, see fsw_port_alloc/free */
	FSW_RLOCK(fsw);

	/* uncrustify doesn't handle C blocks properly */
	/* BEGIN IGNORE CODESTYLE */
	nx_port_foreach(nx, ^(nexus_port_t p) {
		struct nexus_adapter *na = nx_port_get_na(nx, p);
		/* skip empty ports, never-active or recently-active adapters */
		if (na == NULL || na->na_work_ts == 0 ||
		    (now - na->na_work_ts) < FSW_DRAIN_CH_THRES) {
			return;
		}

		/*
		 * If NA has been inactive for some time (twice the drain
		 * threshold), we clear the work timestamp to temporarily skip
		 * this channel until it's active again.  Purging cached objects
		 * can be expensive since we'd need to allocate and construct
		 * them again, so we do it only when necessary.
		 */
		boolean_t purge;
		if (low || ((now - na->na_work_ts) >= (FSW_DRAIN_CH_THRES << 1))) {
			na->na_work_ts = 0;
			purge = TRUE;
		} else {
			purge = FALSE;
		}

		na_drain(na, purge);  /* purge/prune caches */
	});
	/* END IGNORE CODESTYLE */

	FSW_RUNLOCK(fsw);
}
3723 
3724 static void
fsw_purge_cache(struct nx_flowswitch * fsw,boolean_t low)3725 fsw_purge_cache(struct nx_flowswitch *fsw, boolean_t low)
3726 {
3727 #pragma unused(fsw)
3728 	uint64_t o = atomic_add_64_ov(&fsw_want_purge, 1);
3729 	uint32_t p = fsw_flow_purge_thresh;
3730 	boolean_t purge = (low || (o != 0 && p != 0 && (o % p) == 0));
3731 
3732 	SK_DF(SK_VERB_FLOW, "%s: %s caches",
3733 	    fsw->fsw_flow_mgr->fm_name,
3734 	    (purge ? "purge" : "prune"));
3735 
3736 	skmem_cache_reap_now(sk_fo_cache, purge);
3737 	skmem_cache_reap_now(sk_fe_cache, purge);
3738 	skmem_cache_reap_now(sk_fab_cache, purge);
3739 	skmem_cache_reap_now(flow_route_cache, purge);
3740 	skmem_cache_reap_now(flow_stats_cache, purge);
3741 	netns_reap_caches(purge);
3742 	skmem_reap_caches(purge);
3743 
3744 	if (if_is_fsw_transport_netagent_enabled() && purge) {
3745 		mbuf_drain(FALSE);
3746 	}
3747 }
3748 
3749 static void
fsw_flow_handle_low_power(struct nx_flowswitch * fsw,struct flow_entry * fe)3750 fsw_flow_handle_low_power(struct nx_flowswitch *fsw, struct flow_entry *fe)
3751 {
3752 	/* When the interface is in low power mode, the flow is nonviable */
3753 	if (!(fe->fe_flags & FLOWENTF_NONVIABLE) &&
3754 	    atomic_test_set_32(&fe->fe_want_nonviable, 0, 1)) {
3755 		atomic_add_32(&fsw->fsw_pending_nonviable, 1);
3756 	}
3757 }
3758 
/*
 * Commit deferred per-flow state changes: low-power nonviability,
 * half-close of port reservations, and pending withdraw/nonviable
 * requests.  Teardown side effects that cannot run under the bucket
 * locks (NECP early close, nexus unassignment) are batched onto a
 * local list and performed afterwards.  Returns the number of flow
 * entries processed off that list.
 */
static uint32_t
fsw_process_deferred(struct nx_flowswitch *fsw)
{
	/* stack scratch copy used to carry data out of the locked section */
	struct flow_entry_dead sfed __sk_aligned(8);
	struct flow_mgr *fm = fsw->fsw_flow_mgr;
	struct flow_entry_dead *fed, *tfed;
	LIST_HEAD(, flow_entry_dead) fed_head =
	    LIST_HEAD_INITIALIZER(fed_head);
	uint32_t i, nonviable = 0;
	boolean_t lowpowermode = FALSE;

	bzero(&sfed, sizeof(sfed));

	/*
	 * The flows become nonviable when the interface
	 * is in low power mode (edge trigger); the gencnt
	 * comparison ensures we react only once per transition.
	 */
	if ((fsw->fsw_ifp->if_xflags & IFXF_LOW_POWER) &&
	    fsw->fsw_ifp->if_low_power_gencnt != fsw->fsw_low_power_gencnt) {
		lowpowermode = TRUE;
		fsw->fsw_low_power_gencnt = fsw->fsw_ifp->if_low_power_gencnt;
	}

	/*
	 * Scan thru the flow entry tree, and commit any pending withdraw or
	 * nonviable requests.  We may need to push stats and/or unassign the
	 * nexus from NECP, but we cannot do that while holding the locks;
	 * build a temporary list for those entries.
	 */
	for (i = 0; i < fm->fm_owner_buckets_cnt; i++) {
		struct flow_owner_bucket *fob = flow_mgr_get_fob_at_idx(fm, i);
		struct flow_owner *fo;

		/*
		 * Grab the lock at all costs when handling low power mode;
		 * otherwise stay opportunistic and skip contended buckets.
		 */
		if (__probable(!lowpowermode)) {
			if (!FOB_TRY_LOCK(fob)) {
				continue;
			}
		} else {
			FOB_LOCK(fob);
		}

		FOB_LOCK_ASSERT_HELD(fob);
		RB_FOREACH(fo, flow_owner_tree, &fob->fob_owner_head) {
			struct flow_entry *fe;

			RB_FOREACH(fe, flow_entry_id_tree,
			    &fo->fo_flow_entry_id_head) {
				/* try first as reader; skip if we can't */
				if (__improbable(lowpowermode)) {
					fsw_flow_handle_low_power(fsw, fe);
				}
				/* consume a pending half-close request */
				if (__improbable(fe->fe_flags & FLOWENTF_HALF_CLOSED)) {
					atomic_bitclear_32(&fe->fe_flags, FLOWENTF_HALF_CLOSED);
					flow_namespace_half_close(&fe->fe_port_reservation);
				}

				/* if not withdrawn/nonviable, skip */
				if (!fe->fe_want_withdraw &&
				    !fe->fe_want_nonviable) {
					continue;
				}
				/*
				 * Here we're holding the lock as writer;
				 * don't spend too much time as we're
				 * blocking the data path now.
				 */
				ASSERT(!uuid_is_null(fe->fe_uuid));
				/* only need flow UUID and booleans */
				uuid_copy(sfed.fed_uuid, fe->fe_uuid);
				sfed.fed_want_clonotify =
				    (fe->fe_flags & FLOWENTF_CLOSE_NOTIFY);
				sfed.fed_want_nonviable = fe->fe_want_nonviable;
				flow_entry_teardown(fo, fe);

				/* do this outside the flow bucket lock */
				fed = flow_entry_dead_alloc(Z_WAITOK);
				ASSERT(fed != NULL);
				*fed = sfed;
				LIST_INSERT_HEAD(&fed_head, fed, fed_link);
			}
		}
		FOB_UNLOCK(fob);
	}

	/*
	 * These nonviable flows are no longer useful since we've lost
	 * the source IP address; in the event the client monitors the
	 * viability of the flow, explicitly mark it as nonviable so
	 * that a new flow can be created.
	 */
	LIST_FOREACH_SAFE(fed, &fed_head, fed_link, tfed) {
		LIST_REMOVE(fed, fed_link);
		ASSERT(fsw->fsw_agent_session != NULL);

		/* if flow is closed early */
		if (fed->fed_want_clonotify) {
			necp_client_early_close(fed->fed_uuid);
		}

		/* if nonviable, unassign nexus attributes */
		if (fed->fed_want_nonviable) {
			(void) netagent_assign_nexus(fsw->fsw_agent_session,
			    fed->fed_uuid, NULL, 0);
		}

		flow_entry_dead_free(fed);
		++nonviable;
	}
	ASSERT(LIST_EMPTY(&fed_head));

	return nonviable;
}
3874 
/*
 * Service the linger list: flow entries already torn down but kept around
 * briefly (e.g. so that TCP RSTs can be emitted).  Expired entries are
 * freed; the rest are put back at the head of the linger list.  Returns
 * the number of entries freed; *abort is set to the number of TCP aborts
 * (RSTs) generated during this pass.
 */
static uint32_t
fsw_process_linger(struct nx_flowswitch *fsw, uint32_t *abort)
{
	struct flow_entry_linger_head linger_head =
	    TAILQ_HEAD_INITIALIZER(linger_head);
	struct flow_entry *fe, *tfe;
	uint64_t now = _net_uptime;
	uint32_t i = 0, cnt = 0, freed = 0;

	ASSERT(fsw->fsw_ifp != NULL);
	ASSERT(abort != NULL);
	*abort = 0;

	/*
	 * We don't want to contend with the datapath, so move
	 * everything that's in the linger list into a local list.
	 * This allows us to generate RSTs or free the flow entry
	 * outside the lock.  Any remaining flow entry in the local
	 * list will get re-added back to the head of the linger
	 * list, in front of any new ones added since then.
	 */
	lck_mtx_lock(&fsw->fsw_linger_lock);
	TAILQ_CONCAT(&linger_head, &fsw->fsw_linger_head, fe_linger_link);
	ASSERT(TAILQ_EMPTY(&fsw->fsw_linger_head));
	cnt = fsw->fsw_linger_cnt;
	fsw->fsw_linger_cnt = 0;
	lck_mtx_unlock(&fsw->fsw_linger_lock);

	TAILQ_FOREACH_SAFE(fe, &linger_head, fe_linger_link, tfe) {
		/* lingering entries must be fully torn down already */
		ASSERT(fe->fe_flags & FLOWENTF_TORN_DOWN);
		ASSERT(fe->fe_flags & FLOWENTF_DESTROYED);
		ASSERT(fe->fe_flags & FLOWENTF_LINGERING);

		/*
		 * See if this is a TCP flow that needs to generate
		 * a RST to the remote peer (if not already).
		 */
		if (flow_track_tcp_want_abort(fe)) {
			VERIFY(fe->fe_flags & FLOWENTF_ABORTED);
			ASSERT(!uuid_is_null(fe->fe_uuid));
			flow_track_abort_tcp(fe, NULL, NULL);
			(*abort)++;
			SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]);
			SK_DF(SK_VERB_FLOW, "entry \"%s\" fe 0x%llx "
			    "flags 0x%b [RST]", fe_as_string(fe, dbgbuf,
			    sizeof(dbgbuf)), SK_KVA(fe), fe->fe_flags,
			    FLOWENTF_BITS);
		}

		/*
		 * If flow has expired, remove from list and free;
		 * otherwise leave it around in the linger list.
		 */
		if (fe->fe_linger_expire <= now) {
			freed++;
			fsw_linger_remove_internal(&linger_head, fe);
			fe = NULL;	/* freed; don't touch again */
		}
		++i;
	}
	/* we must have visited every entry we dequeued */
	VERIFY(i == cnt && cnt >= freed);

	/*
	 * Add any remaining ones back into the linger list
	 * (ahead of entries added while we were unlocked).
	 */
	lck_mtx_lock(&fsw->fsw_linger_lock);
	if (!TAILQ_EMPTY(&linger_head)) {
		ASSERT(TAILQ_EMPTY(&fsw->fsw_linger_head) || fsw->fsw_linger_cnt);
		TAILQ_CONCAT(&linger_head, &fsw->fsw_linger_head, fe_linger_link);
		ASSERT(TAILQ_EMPTY(&fsw->fsw_linger_head));
		TAILQ_CONCAT(&fsw->fsw_linger_head, &linger_head, fe_linger_link);
		fsw->fsw_linger_cnt += (cnt - freed);
	}
	ASSERT(TAILQ_EMPTY(&linger_head));
	lck_mtx_unlock(&fsw->fsw_linger_lock);

	return freed;
}
3953 
3954 __attribute__((always_inline))
3955 static inline void
fsw_ifp_inc_traffic_class_in_pkt(struct ifnet * ifp,kern_packet_t ph)3956 fsw_ifp_inc_traffic_class_in_pkt(struct ifnet *ifp, kern_packet_t ph)
3957 {
3958 	switch (__packet_get_traffic_class(ph)) {
3959 	case PKT_TC_BE:
3960 		ifp->if_tc.ifi_ibepackets++;
3961 		ifp->if_tc.ifi_ibebytes += SK_PTR_ADDR_KPKT(ph)->pkt_length;
3962 		break;
3963 	case PKT_TC_BK:
3964 		ifp->if_tc.ifi_ibkpackets++;
3965 		ifp->if_tc.ifi_ibkbytes += SK_PTR_ADDR_KPKT(ph)->pkt_length;
3966 		break;
3967 	case PKT_TC_VI:
3968 		ifp->if_tc.ifi_ivipackets++;
3969 		ifp->if_tc.ifi_ivibytes += SK_PTR_ADDR_KPKT(ph)->pkt_length;
3970 		break;
3971 	case PKT_TC_VO:
3972 		ifp->if_tc.ifi_ivopackets++;
3973 		ifp->if_tc.ifi_ivobytes += SK_PTR_ADDR_KPKT(ph)->pkt_length;
3974 		break;
3975 	default:
3976 		break;
3977 	}
3978 }
3979 
3980 __attribute__((always_inline))
3981 static inline void
fsw_ifp_inc_traffic_class_out_pkt(struct ifnet * ifp,uint32_t svc,uint32_t cnt,uint32_t len)3982 fsw_ifp_inc_traffic_class_out_pkt(struct ifnet *ifp, uint32_t svc,
3983     uint32_t cnt, uint32_t len)
3984 {
3985 	switch (svc) {
3986 	case PKT_TC_BE:
3987 		ifp->if_tc.ifi_obepackets += cnt;
3988 		ifp->if_tc.ifi_obebytes += len;
3989 		break;
3990 	case PKT_TC_BK:
3991 		ifp->if_tc.ifi_obkpackets += cnt;
3992 		ifp->if_tc.ifi_obkbytes += len;
3993 		break;
3994 	case PKT_TC_VI:
3995 		ifp->if_tc.ifi_ovipackets += cnt;
3996 		ifp->if_tc.ifi_ovibytes += len;
3997 		break;
3998 	case PKT_TC_VO:
3999 		ifp->if_tc.ifi_ovopackets += cnt;
4000 		ifp->if_tc.ifi_ovobytes += len;
4001 		break;
4002 	default:
4003 		break;
4004 	}
4005 }
4006