xref: /xnu-12377.41.6/bsd/skywalk/nexus/flowswitch/fsw_dp.c (revision bbb1b6f9e71b8cdde6e5cd6f4841f207dee3d828)
1 /*
2  * Copyright (c) 2015-2023 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 /*
30  * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
31  *
32  * Redistribution and use in source and binary forms, with or without
33  * modification, are permitted provided that the following conditions
34  * are met:
35  *   1. Redistributions of source code must retain the above copyright
36  *      notice, this list of conditions and the following disclaimer.
37  *   2. Redistributions in binary form must reproduce the above copyright
38  *      notice, this list of conditions and the following disclaimer in the
39  *      documentation and/or other materials provided with the distribution.
40  *
41  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
42  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51  * SUCH DAMAGE.
52  */
53 
54 /*
55  *  BSD LICENSE
56  *
57  * Copyright(c) 2015 NEC Europe Ltd. All rights reserved.
58  *  All rights reserved.
59  *
60  * Redistribution and use in source and binary forms, with or without
61  *  modification, are permitted provided that the following conditions
62  *  are met:
63  *
64  *    * Redistributions of source code must retain the above copyright
65  *      notice, this list of conditions and the following disclaimer.
66  *    * Redistributions in binary form must reproduce the above copyright
67  *      notice, this list of conditions and the following disclaimer in
68  *      the documentation and/or other materials provided with the
69  *      distribution.
70  *    * Neither the name of NEC Europe Ltd. nor the names of
71  *      its contributors may be used to endorse or promote products derived
72  *      from this software without specific prior written permission.
73  *
74  *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
75  *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
76  *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
77  *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
78  *  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
79  *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
80  *  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
81  *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
82  *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
83  *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
84  *  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
85  */
86 
87 #include <skywalk/os_skywalk_private.h>
88 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
89 #include <skywalk/nexus/flowswitch/fsw_var.h>
90 #include <skywalk/nexus/netif/nx_netif.h>
91 #include <skywalk/nexus/netif/nx_netif_compat.h>
92 #include <kern/sched_prim.h>
93 #include <kern/uipc_domain.h>
94 #include <sys/kdebug.h>
95 #include <sys/sdt.h>
96 #include <net/bpf.h>
97 #include <net/if_ports_used.h>
98 #include <net/pktap.h>
99 #include <net/droptap.h>
100 #include <net/pktsched/pktsched_netem.h>
101 #include <netinet/tcp.h>
102 #include <netinet/udp.h>
103 #include <netinet/ip.h>
104 #include <netinet/ip6.h>
105 #include <netinet/in_var.h>
106 
/* NOTE(review): declared locally rather than via a header — confirm caller */
extern kern_return_t thread_terminate(thread_t);

#define FSW_ZONE_MAX                  256
#define FSW_ZONE_NAME                 "skywalk.nx.fsw"

/* reaper bookkeeping; 8-byte aligned, presumably for atomic 64-bit access */
static uint64_t fsw_reap_last __sk_aligned(8);
static uint64_t fsw_want_purge __sk_aligned(8);

/* flow-entry table size */
#define NX_FSW_FE_TABLESZ       256     /* some power of 2 */
static uint32_t fsw_fe_table_size = NX_FSW_FE_TABLESZ;

/* flow-owner hash bucket count */
#define NX_FSW_FOB_HASHSZ       31      /* some mersenne prime */
static uint32_t fsw_flow_owner_buckets = NX_FSW_FOB_HASHSZ;

/* flow-route hash bucket count */
#define NX_FSW_FRB_HASHSZ       128     /* some power of 2 */
static uint32_t fsw_flow_route_buckets = NX_FSW_FRB_HASHSZ;

/* flow-route-id hash bucket count */
#define NX_FSW_FRIB_HASHSZ      13      /* some mersenne prime */
static uint32_t fsw_flow_route_id_buckets = NX_FSW_FRIB_HASHSZ;

#define NX_FSW_FLOW_REAP_INTERVAL 1     /* seconds */
static uint32_t fsw_flow_reap_interval = NX_FSW_FLOW_REAP_INTERVAL;

#define NX_FSW_RX_STALL_THRES   0       /* seconds (0 = disable) */
static uint32_t fsw_rx_stall_thresh = NX_FSW_RX_STALL_THRES;

#define NX_FSW_RX_STALL_DEFUNCT 1       /* defunct Rx-stalled channel (0 = disable) */
static uint32_t fsw_rx_stall_defunct = NX_FSW_RX_STALL_DEFUNCT;

#define NX_FSW_FLOW_PURGE_THRES 0       /* purge every N reaps (0 = disable) */
static uint32_t fsw_flow_purge_thresh = NX_FSW_FLOW_PURGE_THRES;

/* derived thresholds, in units of the (clamped, nonzero) reap interval */
#define FSW_REAP_IVAL            (MAX(1, fsw_flow_reap_interval))
#define FSW_REAP_SK_THRES        (FSW_REAP_IVAL << 5)
#define FSW_REAP_IF_THRES        (FSW_REAP_IVAL << 5)
#define FSW_DRAIN_CH_THRES       (FSW_REAP_IVAL << 5)
#define FSW_IFSTATS_THRES        1

#define NX_FSW_CHANNEL_REAP_THRES 1000  /* threshold (bytes/sec) for reaping */
uint64_t fsw_channel_reap_thresh = NX_FSW_CHANNEL_REAP_THRES;

#define RX_BUFLET_BATCH_COUNT 64 /* max batch size for buflet allocation */

uint32_t fsw_rx_batch = NX_FSW_RXBATCH; /* # of packets per batch (RX) */
uint32_t fsw_tx_batch = NX_FSW_TXBATCH; /* # of packets per batch (TX) */
uint32_t fsw_gso_batch = 8;             /* # of packets per batch (GSO) */
#if (DEVELOPMENT || DEBUG)
/* development/debug-only knobs for the batch sizes and reap threshold above */
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, rx_batch,
    CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_rx_batch, 0,
    "flowswitch Rx batch size");
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, tx_batch,
    CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_tx_batch, 0,
    "flowswitch Tx batch size");
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, gso_batch,
    CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_gso_batch, 0,
    "flowswitch GSO batch size");
SYSCTL_QUAD(_kern_skywalk_flowswitch, OID_AUTO, reap_throughput,
    CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_channel_reap_thresh,
    "flowswitch channel reap threshold throughput (bytes/sec)");
#endif /* !DEVELOPMENT && !DEBUG */

/* always-available knobs: Rx aggregation and GSO controls */
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, rx_agg_tcp,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sk_fsw_rx_agg_tcp, 0,
    "flowswitch RX aggregation for tcp flows (enable/disable)");
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, rx_agg_tcp_host,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sk_fsw_rx_agg_tcp_host, 0,
    "flowswitch RX aggregation for tcp kernel path (0/1/2 (off/on/auto))");
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, gso_mtu,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sk_fsw_gso_mtu, 0,
    "flowswitch GSO for tcp flows (mtu > 0: enable, mtu == 0: disable)");
178 /*
179  * IP reassembly
180  * The "kern.skywalk.flowswitch.ip_reass" sysctl can be used to force
181  * enable/disable the reassembly routine regardless of whether the
182  * transport netagent is enabled or not.
183  *
184  * 'fsw_ip_reass' is a tri-state:
185  *    0 means force IP reassembly off
186  *    1 means force IP reassembly on
187  *    2 means don't force the value, use what's appropriate for this flowswitch
188  */
#define FSW_IP_REASS_FORCE_OFF          0       /* force reassembly off */
#define FSW_IP_REASS_FORCE_ON           1       /* force reassembly on */
#define FSW_IP_REASS_AUTO               2       /* per-flowswitch policy */

/* tri-state described above; settable via the ip_reass sysctl handler */
uint32_t fsw_ip_reass = FSW_IP_REASS_AUTO;
194 
195 static int
196 fsw_ip_reass_sysctl SYSCTL_HANDLER_ARGS
197 {
198 #pragma unused(oidp, arg1, arg2)
199 	unsigned int new_value;
200 	int changed;
201 	int error;
202 
203 	error = sysctl_io_number(req, fsw_ip_reass, sizeof(fsw_ip_reass),
204 	    &new_value, &changed);
205 	if (error == 0 && changed != 0) {
206 		if (new_value > FSW_IP_REASS_AUTO) {
207 			return EINVAL;
208 		}
209 		fsw_ip_reass = new_value;
210 	}
211 	return error;
212 }
213 
/* register fsw_ip_reass_sysctl as kern.skywalk.flowswitch.ip_reass */
SYSCTL_PROC(_kern_skywalk_flowswitch, OID_AUTO, ip_reass,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    0, 0, fsw_ip_reass_sysctl, "IU",
    "adjust flowswitch IP reassembly");
218 
#if (DEVELOPMENT || DEBUG)
/*
 * Error-injection support: _fsw_inject_error is a bitmask (settable via
 * the fsw_inject_error sysctl below); bit _en arms injection point _en.
 */
static uint64_t _fsw_inject_error = 0;
#define _FSW_INJECT_ERROR(_en, _ev, _ec, _f, ...) \
	_SK_INJECT_ERROR(_fsw_inject_error, _en, _ev, _ec, \
	&FSW_STATS_VAL(_FSW_STATS_ERROR_INJECTIONS), _f, __VA_ARGS__)

/* invoke only the optional handler _f when injection point _en is armed */
#define _FSW_INJECT_ERROR_SET(_en, _f, ...) do { \
	if (__improbable(((_fsw_inject_error) & (1ULL << (_en))) != 0)) { \
	        SK_DF(SK_VERB_ERROR_INJECT, "injecting error %d", (_en));\
	        if ((_f) != NULL)                                       \
	                (_f)(__VA_ARGS__);                              \
	}                                                               \
} while (0)

/* development/debug-only tunables for the table/bucket sizes above */
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, flow_owner_buckets,
    CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_flow_owner_buckets, 0, "");
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, fe_table_size,
    CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_fe_table_size, 0, "");
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, flow_route_buckets,
    CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_flow_route_buckets, 0, "");
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO,
    flow_route_id_buckets, CTLFLAG_RW | CTLFLAG_LOCKED,
    &fsw_flow_route_id_buckets, 0, "");
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, flow_reap_interval,
    CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_flow_reap_interval, 0, "");
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, rx_stall_thresh,
    CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_rx_stall_thresh, 0, "");
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, rx_stall_defunct,
    CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_rx_stall_defunct, 0, "");
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, flow_purge_thresh,
    CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_flow_purge_thresh, 0, "");
SYSCTL_QUAD(_kern_skywalk_flowswitch, OID_AUTO, fsw_inject_error,
    CTLFLAG_RW | CTLFLAG_LOCKED, &_fsw_inject_error, "");
#else
/* error injection compiles out entirely on RELEASE kernels */
#define _FSW_INJECT_ERROR(_en, _ev, _ec, _f, ...) do { } while (0)
#define _FSW_INJECT_ERROR_SET(_en, _f, ...) do { } while (0)
#endif /* !DEVELOPMENT && !DEBUG */
256 
/* forward declarations (definitions appear later in this file) */
static void fsw_linger_remove_internal(struct flow_entry_linger_head *,
    struct flow_entry *);
static void fsw_reap_thread_func(void *, wait_result_t);
static void fsw_reap_thread_cont(void *, wait_result_t);
static void fsw_purge_cache(struct nx_flowswitch *, boolean_t);
static void fsw_drain_channels(struct nx_flowswitch *, uint64_t, boolean_t);
static uint32_t fsw_process_deferred(struct nx_flowswitch *);
static uint32_t fsw_process_linger(struct nx_flowswitch *, uint32_t *);
static void fsw_process_rxstrc(struct nx_flowswitch *);

static int copy_packet_from_dev(struct nx_flowswitch *, struct __kern_packet *,
    struct __kern_packet *);

static void fsw_ifp_inc_traffic_class_in_pkt(struct ifnet *, kern_packet_t);
static void fsw_ifp_inc_traffic_class_out_pkt(struct ifnet *, uint32_t,
    uint32_t, uint32_t);

/* nonzero once fsw_dp_init() has completed */
static int __fsw_dp_inited = 0;
275 
/*
 * One-time initialization of the flowswitch datapath; must be called
 * exactly once (asserts against re-entry).  Always returns 0.
 */
int
fsw_dp_init(void)
{
	/*
	 * The dev and host nexus ports must occupy the two lowest port
	 * numbers, below both the first user port and the flowswitch
	 * client port.
	 */
	static_assert(FSW_VP_DEV == 0);
	static_assert(FSW_VP_HOST == 1);
	static_assert((FSW_VP_HOST + FSW_VP_DEV) < FSW_VP_USER_MIN);
	static_assert((FSW_VP_HOST + FSW_VP_DEV) < NEXUS_PORT_FLOW_SWITCH_CLIENT);

	ASSERT(!__fsw_dp_inited);

	/* flow manager first, then the flow module (torn down in reverse) */
	flow_mgr_init();
	flow_init();

	__fsw_dp_inited = 1;

	return 0;
}
293 
294 void
fsw_dp_uninit(void)295 fsw_dp_uninit(void)
296 {
297 	if (__fsw_dp_inited) {
298 		flow_fini();
299 		flow_mgr_fini();
300 
301 		__fsw_dp_inited = 0;
302 	}
303 }
304 
/*
 * Free all packets on `pktq'.  `fsw' is unused and exists only so the
 * call signature mirrors the dp_drop_pktq() macro below.
 */
static void
dp_free_pktq(struct nx_flowswitch *fsw __sk_unused, struct pktq *pktq)
{
	pp_free_pktq(pktq);
}
310 
/*
 * Drop and free every packet on `pktq', updating drop stats.  When a
 * droptap is attached, each packet is first reported (with `_reason',
 * `_flags' and the call-site `line') to the input or output droptap,
 * selected by `outgoing'; the queue itself is freed afterwards.
 */
#define dp_drop_pktq(fsw, pktq, outgoing, _reason, line, _flags) do {         \
	uint32_t _len = KPKTQ_LEN(pktq);                                      \
	if (KPKTQ_EMPTY(pktq)) {                                              \
	        ASSERT(_len == 0);                                            \
	        break;                                                        \
	}                                                                     \
	SK_DF(SK_VERB_FSW_DP | SK_VERB_DROP, "drop %d packets", _len);        \
	FSW_STATS_ADD(FSW_STATS_DROP, _len);                                  \
	DTRACE_SKYWALK1(fsw__dp__drop, int, _len);                            \
	/* common case: no droptap attached, just free the queue */           \
	if (__probable(droptap_total_tap_count == 0)) {                       \
	        dp_free_pktq(fsw, pktq);                                      \
	        break;                                                        \
	}                                                                     \
	drop_func_t dropfunc;                                                 \
	dropfunc = (outgoing) ? droptap_output_packet : droptap_input_packet; \
	struct __kern_packet *kpkt = KPKTQ_FIRST(pktq);                       \
	struct __kern_packet *next_pkt;                                       \
	for (; kpkt != NULL; kpkt = next_pkt) {                               \
	        next_pkt = kpkt->pkt_nextpkt;                                 \
	        dropfunc(SK_PKT2PH(kpkt), _reason, __func__, line, _flags,    \
	            fsw->fsw_ifp, kpkt->pkt_qum.qum_pid, NULL, -1, NULL,      \
	            0, 0);                                                    \
	}                                                                     \
	dp_free_pktq(fsw, pktq);                                              \
} while (0)
336 
/*
 * Account for and (if a droptap is attached) report a single dropped
 * packet WITHOUT freeing it; the caller retains ownership of `pkt'.
 */
#define dp_drop_pkt_single_nofree(fsw, pkt, outgoing, _reason, _flags) do { \
	SK_DF(SK_VERB_FSW_DP | SK_VERB_DROP, "drop 1 packet");                \
	FSW_STATS_ADD(FSW_STATS_DROP, 1);                                     \
	if (__probable(droptap_total_tap_count == 0)) {                       \
	        break;                                                        \
	}                                                                     \
	drop_func_t dropfunc;                                                 \
	dropfunc = (outgoing) ? droptap_output_packet : droptap_input_packet; \
	dropfunc(SK_PKT2PH(pkt), _reason, __func__, __LINE__, _flags,         \
	    fsw->fsw_ifp, (pkt)->pkt_qum.qum_pid, NULL, -1, NULL, 0, 0);      \
} while (0)
348 
/*
 * Drop a single packet: account for it, report it to the droptap if
 * one is attached, and free it via pp_free_packet_single().
 */
#define dp_drop_pkt_single(fsw, pkt, outgoing, _reason, _flags) do {          \
	SK_DF(SK_VERB_FSW_DP | SK_VERB_DROP, "drop 1 packet");                \
	FSW_STATS_ADD(FSW_STATS_DROP, 1);                                     \
	if (__probable(droptap_total_tap_count == 0)) {                       \
	        pp_free_packet_single(pkt);                                   \
	        break;                                                        \
	}                                                                     \
	drop_func_t dropfunc;                                                 \
	dropfunc = (outgoing) ? droptap_output_packet : droptap_input_packet; \
	dropfunc(SK_PKT2PH(pkt), _reason, __func__, __LINE__, _flags,         \
	    fsw->fsw_ifp, (pkt)->pkt_qum.qum_pid, NULL, -1, NULL, 0, 0);      \
	pp_free_packet_single(pkt);                                           \
} while (0)
362 
/*
 * Drop and free an entire packet chain.  When a droptap is attached,
 * each packet is first reported to the input/output droptap (selected
 * by `outgoing').  The walk advances the caller's `pkt' variable and
 * leaves it NULL on return, so the chain head is saved up front; the
 * previous version freed from `pkt' after the walk, i.e. freed nothing
 * (leaking the whole chain) whenever a droptap was attached.
 */
#define dp_drop_pkt_chain(pkt, outgoing, _reason, _flags) do {                \
	struct __kern_packet *__dp_head = (pkt);                              \
	if (__probable(droptap_total_tap_count == 0)) {                       \
	        pp_free_packet_chain(__dp_head, NULL);                        \
	        break;                                                        \
	}                                                                     \
	drop_func_t dropfunc;                                                 \
	dropfunc = (outgoing) ? droptap_output_packet : droptap_input_packet; \
	struct __kern_packet *next_pkt;                                       \
	for (; pkt != NULL; pkt = next_pkt) {                                 \
	        next_pkt = pkt->pkt_nextpkt;                                  \
	        dropfunc(SK_PKT2PH(pkt), _reason, __func__, __LINE__, _flags, \
	            NULL, pkt->pkt_qum.qum_pid, NULL, -1, NULL,               \
	            0, 0);                                                    \
	}                                                                     \
	/* `pkt' is NULL here; free from the saved head */                    \
	pp_free_packet_chain(__dp_head, NULL);                                \
} while (0)
379 
380 
/*
 * Mirror a queue of classified packets belonging to flow entry `fe'
 * into pktap, tagging each packet with the flow's owning and effective
 * process identity.  `input' selects the Rx vs Tx pktap path.  For
 * host-port flows, only inbound TCP is tapped here (before aggregation
 * occurs); everything else is tapped later in the dlil input path.
 */
SK_NO_INLINE_ATTRIBUTE
void
fsw_snoop(struct nx_flowswitch *fsw, struct flow_entry *fe, struct pktq *pktq,
    bool input)
{
	pid_t pid;
	char proc_name_buf[FLOW_PROCESS_NAME_LENGTH];
	const char *__null_terminated proc_name = NULL;
	pid_t epid;
	char eproc_name_buf[FLOW_PROCESS_NAME_LENGTH];
	const char *__null_terminated eproc_name = NULL;
	sa_family_t af;
	bool tap_early = false;
	struct __kern_packet *pkt;

	ASSERT(fe != NULL);
	ASSERT(fsw->fsw_ifp != NULL);

	if (fe->fe_nx_port == FSW_VP_HOST) {
		/* allow packets to be tapped before aggregation happens */
		tap_early = (input && fe->fe_key.fk_proto == IPPROTO_TCP);
		if (!tap_early) {
			/* all other traffic will be tapped in the dlil input path */
			return;
		}
	}
	/* map the flow key's IP version to an address family for pktap */
	if (fe->fe_key.fk_ipver == IPVERSION) {
		af = AF_INET;
	} else if (fe->fe_key.fk_ipver == IPV6_VERSION) {
		af = AF_INET6;
	} else {
		return;
	}

	/* snapshot process names into local buffers for the pktap calls */
	pid = fe->fe_pid;
	if (fe->fe_proc_name[0] != '\0') {
		proc_name = strbufcpy(proc_name_buf, sizeof(proc_name_buf),
		    fe->fe_proc_name, sizeof(fe->fe_proc_name));
	}
	epid = fe->fe_epid;
	if (fe->fe_eproc_name[0] != '\0') {
		eproc_name = strbufcpy(eproc_name_buf, sizeof(eproc_name_buf),
		    fe->fe_eproc_name, sizeof(fe->fe_eproc_name));
	}
	if (input) {
		KPKTQ_FOREACH(pkt, pktq) {
			pktap_input_packet(fsw->fsw_ifp, af,
			    fsw->fsw_ifp_dlt, pid, proc_name, epid,
			    eproc_name, SK_PKT2PH(pkt), NULL, 0,
			    IPPROTO_TCP, fe->fe_flowid,
			    tap_early ? PTH_FLAG_SOCKET: PTH_FLAG_NEXUS_CHAN);
		}
	} else {
		KPKTQ_FOREACH(pkt, pktq) {
			pktap_output_packet(fsw->fsw_ifp, af,
			    fsw->fsw_ifp_dlt, pid, proc_name, epid,
			    eproc_name, SK_PKT2PH(pkt), NULL, 0,
			    0, 0, PTH_FLAG_NEXUS_CHAN);
		}
	}
}
442 
443 #if (DEVELOPMENT || DEBUG)
444 static void
_fsw_error35_handler(int step,struct flow_route * fr,struct __kern_packet * pkt,int * ret)445 _fsw_error35_handler(int step, struct flow_route *fr, struct __kern_packet *pkt,
446     int *ret)
447 {
448 	static boolean_t _err35_flag_modified = FALSE;
449 
450 	switch (step) {
451 	case 1:
452 		if ((fr->fr_flags & (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO)) ==
453 		    (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO)) {
454 			fr->fr_flags &= ~FLOWRTF_RESOLVED;
455 			_err35_flag_modified = TRUE;
456 		}
457 		break;
458 
459 	case 2:
460 		if (!_err35_flag_modified) {
461 			return;
462 		}
463 		if (pkt->pkt_pflags & PKT_F_MBUF_DATA) {
464 			m_freem(pkt->pkt_mbuf);
465 			pkt->pkt_pflags &= ~PKT_F_MBUF_DATA;
466 			pkt->pkt_mbuf = NULL;
467 		}
468 		*ret = EJUSTRETURN;
469 		fr->fr_flags |= FLOWRTF_RESOLVED;
470 		_err35_flag_modified = FALSE;
471 		break;
472 
473 	default:
474 		VERIFY(0);
475 		/* not reached */
476 	}
477 }
478 
479 static void
_fsw_error36_handler(int step,struct flow_route * fr,int * ret)480 _fsw_error36_handler(int step, struct flow_route *fr, int *ret)
481 {
482 	static boolean_t _err36_flag_modified = FALSE;
483 
484 	switch (step) {
485 	case 1:
486 		if ((fr->fr_flags & (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO)) ==
487 		    (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO)) {
488 			fr->fr_flags &= ~FLOWRTF_RESOLVED;
489 			_err36_flag_modified = TRUE;
490 		}
491 		break;
492 
493 	case 2:
494 		if (!_err36_flag_modified) {
495 			return;
496 		}
497 		*ret = ENETUNREACH;
498 		fr->fr_flags |= FLOWRTF_RESOLVED;
499 		_err36_flag_modified = FALSE;
500 		break;
501 
502 	default:
503 		VERIFY(0);
504 		/* not reached */
505 	}
506 }
507 #else /* !DEVELOPMENT && !DEBUG */
508 #define _fsw_error35_handler(...)
509 #define _fsw_error36_handler(...)
510 #endif /* DEVELOPMENT || DEBUG */
511 
512 /*
513  * Check if the source packet content can fit into the destination
514  * ring's packet. Returns TRUE if the source packet can fit.
515  * Note: Failures could be caused by misconfigured packet pool sizes,
516  * missing packet size check again MTU or if the source packet is from
517  * a compat netif and the attached mbuf is larger than MTU due to LRO.
518  */
519 static inline boolean_t
validate_pkt_len(struct __kern_packet * spkt,kern_packet_t dph,uint32_t skip_l2hlen,uint32_t l2hlen,uint16_t headroom,uint32_t * copy_len)520 validate_pkt_len(struct __kern_packet *spkt, kern_packet_t dph,
521     uint32_t skip_l2hlen, uint32_t l2hlen, uint16_t headroom,
522     uint32_t *copy_len)
523 {
524 	uint32_t tlen = 0;
525 	uint32_t splen = spkt->pkt_length - skip_l2hlen;
526 
527 	if (l2hlen != 0) {
528 		VERIFY(skip_l2hlen == 0);
529 		tlen += l2hlen;
530 	} else if ((spkt->pkt_link_flags & PKT_LINKF_ETHFCS) != 0) {
531 		splen -= ETHER_CRC_LEN;
532 	}
533 
534 	tlen += splen;
535 	*copy_len = splen;
536 
537 	return tlen <= ((__packet_get_buflet_count(dph) *
538 	       PP_BUF_SIZE_DEF(SK_PTR_ADDR_KPKT(dph)->pkt_qum.qum_pp)) -
539 	       headroom);
540 }
541 
#if SK_LOG
/* Hoisted out of line to reduce kernel stack footprint */
SK_LOG_ATTRIBUTE
static void
copy_packet_from_dev_log(struct __kern_packet *spkt,
    struct __kern_packet *dpkt, struct proc *p)
{
	/* log under COPY_MBUF vs COPY depending on the source data origin */
	uint64_t logflags = ((SK_VERB_FSW | SK_VERB_RX) |
	    ((spkt->pkt_pflags & PKT_F_MBUF_DATA) ?
	    SK_VERB_COPY_MBUF : SK_VERB_COPY));
	char *daddr;
	uint32_t pkt_len;

	MD_BUFLET_ADDR_ABS(dpkt, daddr);
	pkt_len = __packet_get_real_data_length(dpkt);
	SK_DF(logflags, "%s(%d) splen %u dplen %u hr %u l2 %u",
	    sk_proc_name(p), sk_proc_pid(p), spkt->pkt_length,
	    dpkt->pkt_length, (uint32_t)dpkt->pkt_headroom,
	    (uint32_t)dpkt->pkt_l2_len);
	/* dump up to the first 128 bytes of the destination buffer */
	SK_DF(logflags | SK_VERB_DUMP, "%s",
	    sk_dump("buf", daddr, pkt_len, 128));
}
#else
#define copy_packet_from_dev_log(...)
#endif /* SK_LOG */
567 
568 
/*
 * Copy a packet arriving from the device ring into `dpkt', a packet
 * allocated for the flowswitch Rx ring.  The two rings do not share a
 * packet pool, so both metadata and payload must be copied; any mbuf
 * attached to the source (compat path) is consumed and freed here.
 * Returns 0 on success, or EINVAL if the source contents cannot fit
 * into the destination packet's buffers.
 */
static inline int
copy_packet_from_dev(struct nx_flowswitch *fsw, struct __kern_packet *spkt,
    struct __kern_packet *dpkt)
{
	/*
	 * source and destination nexus don't share the packet pool
	 * sync operation here is to
	 * - alloc packet for the rx(dst) ring
	 * - copy data/metadata from src packet to dst packet
	 * - attach alloc'd packet to rx(dst) ring
	 */
	kern_packet_t dph = SK_PTR_ENCODE(dpkt,
	    METADATA_TYPE(dpkt), METADATA_SUBTYPE(dpkt));
	kern_packet_t sph = SK_PTR_ENCODE(spkt, METADATA_TYPE(spkt),
	    METADATA_SUBTYPE(spkt));
	boolean_t do_cksum_rx;
	uint16_t skip_l2h_len = spkt->pkt_l2_len;
	uint16_t iphlen;
	uint32_t dlen;
	int err;

	/* bail if the payload can't fit in the destination's buflets */
	if (__improbable(!validate_pkt_len(spkt, dph, skip_l2h_len, 0, 0,
	    &dlen))) {
		SK_ERR("bufcnt %d, bufsz %d", __packet_get_buflet_count(dph),
		    PP_BUF_SIZE_DEF(dpkt->pkt_qum.qum_pp));
		FSW_STATS_INC(FSW_STATS_RX_COPY_BAD_LEN);
		return EINVAL;
	}

	/* Copy packet metadata */
	_QUM_COPY(&(spkt)->pkt_qum, &(dpkt)->pkt_qum);
	_PKT_COPY(spkt, dpkt);
	ASSERT(!(dpkt->pkt_qum.qum_qflags & QUM_F_KERNEL_ONLY) ||
	    PP_KERNEL_ONLY(dpkt->pkt_qum.qum_pp));
	ASSERT(dpkt->pkt_mbuf == NULL);

	/* the L2 header is stripped; destination starts at L3 */
	dpkt->pkt_headroom = 0;
	dpkt->pkt_l2_len = 0;

	/* don't include IP header from partial sum */
	if (__probable((spkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED) != 0)) {
		iphlen = spkt->pkt_flow_ip_hlen;
		do_cksum_rx = sk_cksum_rx;
	} else {
		iphlen = 0;
		do_cksum_rx = FALSE;
	}

	/* Copy packet payload */
	if ((spkt->pkt_pflags & PKT_F_MBUF_DATA) &&
	    (spkt->pkt_pflags & PKT_F_TRUNCATED)) {
		FSW_STATS_INC(FSW_STATS_RX_COPY_MBUF2PKT);
		/*
		 * Source packet has truncated contents (just enough for
		 * the classifier) of an mbuf from the compat driver; copy
		 * the entire mbuf contents to destination packet.
		 */
		m_adj(spkt->pkt_mbuf, skip_l2h_len);
		ASSERT((uint32_t)m_pktlen(spkt->pkt_mbuf) >= dlen);
		fsw->fsw_pkt_copy_from_mbuf(NR_RX, dph, 0,
		    spkt->pkt_mbuf, 0, dlen, do_cksum_rx, iphlen);
	} else {
		FSW_STATS_INC(FSW_STATS_RX_COPY_PKT2PKT);
		/*
		 * Source packet has full contents, either from an mbuf
		 * that came up from the compat driver, or because it
		 * originated on the native driver; copy to destination.
		 */
		fsw->fsw_pkt_copy_from_pkt(NR_RX, dph, 0, sph,
		    (spkt->pkt_headroom + spkt->pkt_l2_len), dlen, do_cksum_rx,
		    iphlen, 0, FALSE);
	}

#if DEBUG || DEVELOPMENT
	if (__improbable(pkt_trailers > 0)) {
		dlen += pkt_add_trailers(dph, dlen, iphlen);
	}
#endif /* DEBUG || DEVELOPMENT */

	/* Finalize and attach packet to Rx ring */
	METADATA_ADJUST_LEN(dpkt, 0, 0);
	err = __packet_finalize(dph);
	VERIFY(err == 0);

	copy_packet_from_dev_log(spkt, dpkt, kernproc);

	/* the source mbuf (if any) has served its purpose; release it */
	if (spkt->pkt_pflags & PKT_F_MBUF_DATA) {
		ifp_inc_traffic_class_in(fsw->fsw_ifp, spkt->pkt_mbuf);
		mbuf_freem(spkt->pkt_mbuf);
		KPKT_CLEAR_MBUF_DATA(spkt);
	} else {
		fsw_ifp_inc_traffic_class_in_pkt(fsw->fsw_ifp, dph);
	}

	if (__probable(do_cksum_rx != 0)) {
		FSW_STATS_INC(FSW_STATS_RX_COPY_SUM);
	}

	return 0;
}
669 
670 SK_NO_INLINE_ATTRIBUTE
671 static struct __kern_packet *
rx_process_ip_frag(struct nx_flowswitch * fsw,struct __kern_packet * pkt)672 rx_process_ip_frag(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
673 {
674 	char *pkt_buf;
675 	void *l3_hdr;
676 	uint16_t nfrags, tlen;
677 	int err = 0;
678 
679 	switch (fsw_ip_reass) {
680 	case FSW_IP_REASS_FORCE_OFF:
681 		return pkt;
682 	case FSW_IP_REASS_FORCE_ON:
683 		break;
684 	default:
685 		if (!FSW_NETAGENT_ENABLED(fsw) ||
686 		    flow_mgr_get_num_flows(fsw->fsw_flow_mgr) == 0) {
687 			return pkt;
688 		}
689 		break;
690 	}
691 
692 	MD_BUFLET_ADDR_ABS(pkt, pkt_buf);
693 	l3_hdr = pkt_buf + pkt->pkt_headroom + pkt->pkt_l2_len;
694 
695 	ASSERT(fsw->fsw_ipfm != NULL);
696 	ASSERT((pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED) != 0);
697 
698 	if (pkt->pkt_flow_ip_ver == IPVERSION) {
699 		struct ip *ip = l3_hdr;
700 		err = fsw_ip_frag_reass_v4(fsw->fsw_ipfm, &pkt, ip, &nfrags, &tlen);
701 	} else {
702 		struct ip6_hdr *ip6_hdr = l3_hdr;
703 		struct ip6_frag *__single ip6_frag =
704 		    (struct ip6_frag *)((uint8_t *)l3_hdr + sizeof(struct ip6_hdr));
705 
706 		ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);
707 		/* we only handle frag header immediately after v6 header */
708 		err = fsw_ip_frag_reass_v6(fsw->fsw_ipfm, &pkt, ip6_hdr, ip6_frag,
709 		    &nfrags, &tlen);
710 	}
711 	if (__improbable(err != 0)) {
712 		/* if we get a bad fragment, free it */
713 		pp_free_packet_single(pkt);
714 		pkt = NULL;
715 	} else {
716 		ASSERT(!((pkt != NULL) ^ (nfrags > 0)));
717 	}
718 
719 	return pkt;
720 }
721 
722 SK_NO_INLINE_ATTRIBUTE
723 static void
rx_prepare_packet_mbuf(struct nx_flowswitch * fsw,struct __kern_packet * pkt)724 rx_prepare_packet_mbuf(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
725 {
726 	ASSERT(pkt->pkt_pflags & PKT_F_MBUF_DATA);
727 	uint32_t mlen = (uint32_t)m_pktlen(pkt->pkt_mbuf);
728 	kern_packet_t ph =  SK_PTR_ENCODE(pkt,
729 	    METADATA_TYPE(pkt), METADATA_SUBTYPE(pkt));
730 	/*
731 	 * This is the case when the packet is coming in from
732 	 * compat-netif. This packet only has valid metadata
733 	 * and an attached mbuf. We need to copy enough data
734 	 * from the mbuf to the packet buffer for the
735 	 * classifier. Compat netif packet pool is configured
736 	 * with buffer size of NETIF_COMPAT_MAX_MBUF_DATA_COPY
737 	 * which is just enough to hold the protocol headers
738 	 * for the flowswitch classifier.
739 	 */
740 
741 	pkt->pkt_headroom = 0;
742 	METADATA_ADJUST_LEN(pkt, 0, 0);
743 	/*
744 	 * Copy the initial 128 bytes of the packet for
745 	 * classification.
746 	 * Ethernet(14) + IPv6 header(40) +
747 	 * + IPv6 fragment header(8) +
748 	 * TCP header with options(60).
749 	 */
750 	fsw->fsw_pkt_copy_from_mbuf(NR_RX, ph,
751 	    pkt->pkt_headroom, pkt->pkt_mbuf, 0,
752 	    MIN(mlen, NETIF_COMPAT_MAX_MBUF_DATA_COPY),
753 	    FALSE, 0);
754 
755 	int err = __packet_finalize_with_mbuf(pkt);
756 	VERIFY(err == 0);
757 }
758 
759 static struct __kern_packet *
rx_prepare_packet(struct nx_flowswitch * fsw,struct __kern_packet * pkt)760 rx_prepare_packet(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
761 {
762 	pkt->pkt_qum_qflags &= ~QUM_F_FLOW_CLASSIFIED;
763 
764 	if (__improbable(pkt->pkt_pflags & PKT_F_MBUF_DATA)) {
765 		rx_prepare_packet_mbuf(fsw, pkt);
766 	}
767 
768 	return pkt;
769 }
770 
/*
 * Look up the flow entry for a classified packet.
 *
 * @param input   true for the Rx direction, false for Tx.
 * @param prev_fe a hint: the entry matched by the previous packet in this
 *                batch (not consumed; an extra reference is taken on a hit).
 * @return a flow entry with a reference held by the caller, or NULL.
 */
static struct flow_entry *
lookup_flow_with_pkt(struct nx_flowswitch *fsw, struct __kern_packet *pkt,
    bool input, struct flow_entry *prev_fe)
{
	struct flow_key key __sk_aligned(16);
	struct flow_entry *__single fe = NULL;

	ASSERT(pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED);
	flow_pkt2key(pkt, input, &key);

	/*
	 * Fast path: if the previous packet's entry is a full 5-tuple flow
	 * and this packet's key matches it under the 5-tuple mask, reuse it
	 * and skip the flow-manager lookup.  The mask is restored on a miss
	 * so the full lookup below sees the original key.
	 */
	if (__probable(prev_fe != NULL &&
	    prev_fe->fe_key.fk_mask == FKMASK_5TUPLE)) {
		uint16_t saved_mask = key.fk_mask;
		key.fk_mask = FKMASK_5TUPLE;
		if (flow_key_cmp_mask(&prev_fe->fe_key, &key, &fk_mask_5tuple) == 0) {
			flow_entry_retain(prev_fe);
			fe = prev_fe;
		} else {
			key.fk_mask = saved_mask;
		}
	}

top:
	if (__improbable(fe == NULL)) {
		fe = flow_mgr_find_fe_by_key(fsw->fsw_flow_mgr, &key);
	}

	/*
	 * Parent/child resolution: a matched entry that participates in flow
	 * demux may need to be swapped for the proper child entry.  Each
	 * swap releases the reference on the entry being discarded.
	 */
	if (__improbable(fe != NULL &&
	    (fe->fe_flags & (FLOWENTF_PARENT | FLOWENTF_CHILD)) != 0)) {
		/* Rx */
		if (input) {
			if (fe->fe_flags & FLOWENTF_PARENT) {
				struct flow_entry *child_fe = rx_lookup_child_flow(fsw, fe, pkt);
				if (child_fe != NULL) {
					flow_entry_release(&fe);
					fe = child_fe;
				}
			} else {
				/*
				 * Child entry (from the prev_fe hint) whose demux
				 * doesn't match this packet: retry the full lookup.
				 * NOTE(review): assumes the flow-manager lookup
				 * resolves to a different (parent) entry, so this
				 * cannot loop — confirm against flow_mgr behavior.
				 */
				if (!rx_flow_demux_match(fsw, fe, pkt)) {
					flow_entry_release(&fe);
					fe = NULL;
					goto top;
				}
			}
		} else {
			/* Tx */
			if (__improbable(!_UUID_MATCH(pkt->pkt_flow_id, fe->fe_uuid))) {
				if (__probable(fe->fe_flags & FLOWENTF_PARENT)) {
					struct flow_entry *__single parent_fe = fe;
					fe = tx_lookup_child_flow(parent_fe, pkt->pkt_flow_id);
					flow_entry_release(&parent_fe);
				} else {
					flow_entry_release(&fe);
					fe = NULL;
					goto top;
				}
			}
		}
	}

	SK_LOG_VAR(char fkbuf[FLOWKEY_DBGBUF_SIZE]);
	SK_DF(SK_VERB_FSW_DP | SK_VERB_LOOKUP,
	    "%s %s %s \"%s\" fe %p",
	    input ? "Rx" : "Tx", if_name(fsw->fsw_ifp),
	    sk_proc_name(current_proc()),
	    fk2str(&key, fkbuf, sizeof(fkbuf)), SK_KVA(fe));

	return fe;
}
840 
/*
 * Decide whether an Rx packet matched against a 2-tuple listener flow is
 * really destined to this host: true if the destination address is a
 * special-purpose address (loopback, link-local, multicast, broadcast, ...)
 * or matches a local interface address; false if it looks like traffic to
 * be forwarded elsewhere.
 */
SK_NO_INLINE_ATTRIBUTE
static bool
pkt_is_for_listener(struct flow_entry *fe, struct __kern_packet *pkt)
{
	struct nx_flowswitch *fsw = fe->fe_fsw;
	struct ifnet *ifp = fsw->fsw_ifp;
	struct in_ifaddr *ia = NULL;
	struct in_ifaddr *best_ia = NULL;
	struct in6_ifaddr *ia6 = NULL;
	struct in6_ifaddr *best_ia6 = NULL;
	struct ifnet *match_ifp = NULL;
	struct __flow *flow = pkt->pkt_flow;
	bool result = false;

	ASSERT(pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED);

	if (flow->flow_ip_ver == IPVERSION) {
		/* special/reserved v4 destinations are always for the host */
		if (IN_ZERONET(ntohl(flow->flow_ipv4_dst.s_addr)) ||
		    IN_LOOPBACK(ntohl(flow->flow_ipv4_dst.s_addr)) ||
		    IN_LINKLOCAL(ntohl(flow->flow_ipv4_dst.s_addr)) ||
		    IN_DS_LITE(ntohl(flow->flow_ipv4_dst.s_addr)) ||
		    IN_6TO4_RELAY_ANYCAST(ntohl(flow->flow_ipv4_dst.s_addr)) ||
		    IN_MULTICAST(ntohl(flow->flow_ipv4_dst.s_addr)) ||
		    INADDR_BROADCAST == flow->flow_ipv4_dst.s_addr) {
			result = true;
			goto done;
		}

		/*
		 * Check for a match in the hash bucket.
		 */
		lck_rw_lock_shared(&in_ifaddr_rwlock);
		TAILQ_FOREACH(ia, INADDR_HASH(flow->flow_ipv4_dst.s_addr), ia_hash) {
			if (IA_SIN(ia)->sin_addr.s_addr == flow->flow_ipv4_dst.s_addr) {
				best_ia = ia;
				match_ifp = ia->ia_ifp;

				if (match_ifp == ifp) {
					break;
				}
				/*
				 * Continue the loop in case there's a exact match with another
				 * interface
				 */
			}
		}

		if (best_ia != NULL) {
			if (match_ifp != ifp && ipforwarding == 0 &&
			    (match_ifp->if_family == IFNET_FAMILY_IPSEC ||
			    match_ifp->if_family == IFNET_FAMILY_UTUN)) {
				/*
				 * Drop when interface address check is strict and forwarding
				 * is disabled
				 */
			} else {
				/* local address match: deliver to listener */
				lck_rw_done(&in_ifaddr_rwlock);
				result = true;
				goto done;
			}
		}
		lck_rw_done(&in_ifaddr_rwlock);

		if (ifp->if_flags & IFF_BROADCAST) {
			/*
			 * Check for broadcast addresses.
			 *
			 * Only accept broadcast packets that arrive via the matching
			 * interface.  Reception of forwarded directed broadcasts would be
			 * handled via ip_forward() and ether_frameout() with the loopback
			 * into the stack for SIMPLEX interfaces handled by ether_frameout().
			 */
			struct ifaddr *ifa;

			ifnet_lock_shared(ifp);
			TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
				if (ifa->ifa_addr->sa_family != AF_INET) {
					continue;
				}
				ia = ifatoia(ifa);
				if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr == flow->flow_ipv4_dst.s_addr ||
				    ia->ia_netbroadcast.s_addr == flow->flow_ipv4_dst.s_addr) {
					ifnet_lock_done(ifp);
					result = true;
					goto done;
				}
			}
			ifnet_lock_done(ifp);
		}
	} else {
		struct in6_ifaddrhashhead *ia6_hash_head;

		/* special v6 destinations are always for the host */
		if (IN6_IS_ADDR_LOOPBACK(&flow->flow_ipv6_dst) ||
		    IN6_IS_ADDR_LINKLOCAL(&flow->flow_ipv6_dst) ||
		    IN6_IS_ADDR_MULTICAST(&flow->flow_ipv6_dst)) {
			result = true;
			goto done;
		}

		/*
		 * Check for exact addresses in the hash bucket.
		 */
		lck_rw_lock_shared(&in6_ifaddr_rwlock);
		/* XXX -fbounds-safety: external dependency on ip6_input.c */
		ia6_hash_head = __unsafe_forge_bidi_indexable(struct in6_ifaddrhashhead *,
		    in6_ifaddrhashtbl, in6addr_nhash * sizeof(*in6_ifaddrhashtbl));
		ia6_hash_head = &ia6_hash_head[in6addr_hashval(&flow->flow_ipv6_dst)];

		TAILQ_FOREACH(ia6, ia6_hash_head, ia6_hash) {
			if (in6_are_addr_equal_scoped(&ia6->ia_addr.sin6_addr, &flow->flow_ipv6_dst,
			    ia6->ia_ifp->if_index, ifp->if_index)) {
				/* skip addresses not usable for delivery */
				if ((ia6->ia6_flags & (IN6_IFF_NOTREADY | IN6_IFF_CLAT46))) {
					continue;
				}
				best_ia6 = ia6;
				if (ia6->ia_ifp == ifp) {
					break;
				}
				/*
				 * Continue the loop in case there's a exact match with another
				 * interface
				 */
			}
		}
		if (best_ia6 != NULL) {
			if (best_ia6->ia_ifp != ifp && ip6_forwarding == 0 &&
			    (best_ia6->ia_ifp->if_family == IFNET_FAMILY_IPSEC ||
			    best_ia6->ia_ifp->if_family == IFNET_FAMILY_UTUN)) {
				/*
				 * Drop when interface address check is strict and forwarding
				 * is disabled
				 */
			} else {
				/* local address match: deliver to listener */
				lck_rw_done(&in6_ifaddr_rwlock);
				result = true;
				goto done;
			}
		}
		lck_rw_done(&in6_ifaddr_rwlock);
	}

	/*
	 * In forwarding mode, if the destination address
	 * of the packet does not match any interface
	 * address, it maybe destined to the client device
	 */
	SK_DF(SK_VERB_FSW_DP | SK_VERB_RX | SK_VERB_FLOW,
	    "Rx flow does not match interface address");
done:
	return result;
}
992 
993 static struct flow_entry *
rx_lookup_flow(struct nx_flowswitch * fsw,struct __kern_packet * pkt,struct flow_entry * prev_fe)994 rx_lookup_flow(struct nx_flowswitch *fsw, struct __kern_packet *pkt,
995     struct flow_entry *prev_fe)
996 {
997 	struct flow_entry *__single fe;
998 
999 	fe = lookup_flow_with_pkt(fsw, pkt, true, prev_fe);
1000 	_FSW_INJECT_ERROR(2, fe, NULL, flow_entry_release, &fe);
1001 	if (fe == NULL) {
1002 		FSW_STATS_INC(FSW_STATS_RX_FLOW_NOT_FOUND);
1003 		return NULL;
1004 	}
1005 
1006 	if (__improbable(fe->fe_key.fk_mask == FKMASK_2TUPLE &&
1007 	    fe->fe_flags & FLOWENTF_LISTENER) &&
1008 	    !pkt_is_for_listener(fe, pkt)) {
1009 		FSW_STATS_INC(FSW_STATS_RX_PKT_NOT_LISTENER);
1010 		flow_entry_release(&fe);
1011 		return NULL;
1012 	}
1013 
1014 	if (__improbable(fe->fe_flags & FLOWENTF_TORN_DOWN)) {
1015 		FSW_STATS_INC(FSW_STATS_RX_FLOW_TORNDOWN);
1016 		SK_DF(SK_VERB_FSW_DP | SK_VERB_RX | SK_VERB_FLOW,
1017 		    "Rx flow torn down");
1018 		flow_entry_release(&fe);
1019 		return NULL;
1020 	}
1021 
1022 	if (__improbable(fe->fe_flags & FLOWENTF_AOP_OFFLOAD)) {
1023 		FSW_STATS_INC(FSW_STATS_RX_DISABLED);
1024 		SK_DF(SK_VERB_FSW_DP | SK_VERB_RX | SK_VERB_FLOW,
1025 		    "Rx not allowed for this flow");
1026 		flow_entry_release(&fe);
1027 	}
1028 	return fe;
1029 }
1030 
/*
 * Enqueue a chain of Rx packets onto a flow entry's Rx queue, electing at
 * most one worker thread (identified by tid) to drain it.  On the path
 * where the entry is handed to the caller's flow-entry list, the flow
 * reference travels with it; otherwise the reference is dropped here.
 */
static inline void
rx_flow_batch_packets(struct flow_entry_list *fes, struct flow_entry *fe,
    struct __kern_packet *pkt, uint64_t tid)
{
	/*
	 * Among threads working on the same fe, the first thread that reaches here
	 * will be responsible for processing all the packets until a point when
	 * it does not see new packets in fe_rx_pktq. Other threads only
	 * enqueue their packets but do not add the flow entry to their flow entry list.
	 */
	lck_mtx_lock(&fe->fe_rx_pktq_lock);

	/* claim (or observe contention on) the worker role */
	if (fe->fe_rx_worker_tid == 0) {
		fe->fe_rx_worker_tid = tid;
	} else if (__improbable(fe->fe_rx_worker_tid != tid)) {
		STATS_INC(&fe->fe_fsw->fsw_stats, FSW_STATS_RX_FLOW_IN_USE);
	}

	if (__improbable(pkt->pkt_flow_ip_is_frag)) {
		fe->fe_rx_frag_count++;
	}

	fe->fe_rx_pktq_bytes += pkt->pkt_flow_ulen;
	/* KPKTQ_ENQUEUE_LIST is needed until frags become chained buflet */
	if (KPKTQ_EMPTY(&fe->fe_rx_pktq) && tid == fe->fe_rx_worker_tid) {
		ASSERT(KPKTQ_LEN(&fe->fe_rx_pktq) == 0);
		/* first batch for this worker: hand fe (and its ref) to the list */
		TAILQ_INSERT_TAIL(fes, fe, fe_rx_link);
		KPKTQ_ENQUEUE_LIST(&fe->fe_rx_pktq, pkt);
		lck_mtx_unlock(&fe->fe_rx_pktq_lock);
	} else {
		/* queue only; the active worker will pick these up */
		KPKTQ_ENQUEUE_LIST(&fe->fe_rx_pktq, pkt);
		lck_mtx_unlock(&fe->fe_rx_pktq_lock);
		flow_entry_release(&fe);
	}
}
1066 
/*
 * Enqueue a Tx packet onto its flow entry's Tx queue, adding the entry to
 * the caller's list (transferring the flow reference) only when the queue
 * transitions from empty; otherwise the reference is dropped here.
 */
static void
tx_flow_batch_packet(struct flow_entry_list *fes, struct flow_entry *fe,
    struct __kern_packet *pkt)
{
	/* record frag continuation */
	if (__improbable(pkt->pkt_flow_ip_is_first_frag)) {
		ASSERT(pkt->pkt_flow_ip_is_frag);
		fe->fe_tx_is_cont_frag = true;
		fe->fe_tx_frag_id = pkt->pkt_flow_ip_frag_id;
	} else if (__probable(!pkt->pkt_flow_ip_is_frag)) {
		/* non-fragment packet terminates any fragment run */
		fe->fe_tx_is_cont_frag = false;
		fe->fe_tx_frag_id = 0;
	}

	if (KPKTQ_EMPTY(&fe->fe_tx_pktq)) {
		ASSERT(KPKTQ_LEN(&fe->fe_tx_pktq) == 0);
		/* first packet: put fe on the batch list, keep the ref */
		TAILQ_INSERT_TAIL(fes, fe, fe_tx_link);
		KPKTQ_ENQUEUE(&fe->fe_tx_pktq, pkt);
	} else {
		/* fe is already on some list; just append and drop our ref */
		ASSERT(!TAILQ_EMPTY(fes));
		KPKTQ_ENQUEUE(&fe->fe_tx_pktq, pkt);
		flow_entry_release(&fe);
	}
}
1091 
/*
 * Dequeue up to n_pkts_max packets from a channel Rx ring into pktq,
 * dropping packets already marked QUM_F_DROPPED or with zero length.
 * Advances ckr_khead/ckr_ktail to reflect the consumed slots; *n_bytes
 * accumulates the byte count of the packets actually enqueued.
 */
static inline void
fsw_rx_ring_dequeue_pktq(struct nx_flowswitch *fsw, struct __kern_channel_ring *r,
    uint32_t n_pkts_max, struct pktq *pktq, uint32_t *n_bytes)
{
	uint32_t n_pkts = 0;
	slot_idx_t idx, idx_end;
	idx = r->ckr_khead;
	idx_end = r->ckr_rhead;

	ASSERT(KPKTQ_EMPTY(pktq));
	*n_bytes = 0;
	for (; n_pkts < n_pkts_max && idx != idx_end;
	    idx = SLOT_NEXT(idx, r->ckr_lim)) {
		struct __kern_slot_desc *ksd = KR_KSD(r, idx);
		struct __kern_packet *pkt = ksd->sd_pkt;

		ASSERT(pkt->pkt_nextpkt == NULL);
		/* take ownership of the packet away from the slot */
		KR_SLOT_DETACH_METADATA(r, ksd);

		_FSW_INJECT_ERROR(20, pkt->pkt_qum_qflags,
		    pkt->pkt_qum_qflags | QUM_F_DROPPED, null_func);
		if (__improbable(((pkt->pkt_qum_qflags & QUM_F_DROPPED) != 0))
		    || (pkt->pkt_length == 0)) {
			FSW_STATS_INC(FSW_STATS_DROP);
			pp_free_packet_single(pkt);
			continue;
		}
		n_pkts++;
		*n_bytes += pkt->pkt_length;

		KPKTQ_ENQUEUE(pktq, pkt);
	}
	/* publish the new kernel head/tail for the consumed region */
	r->ckr_khead = idx;
	r->ckr_ktail = SLOT_PREV(idx, r->ckr_lim);
}
1127 
1128 /*
1129  * This is only for estimating how many packets each GSO packet will need.
1130  * The number does not need to be exact because any leftover packets allocated
1131  * will be freed.
1132  */
1133 static uint32_t
estimate_gso_pkts(struct __kern_packet * pkt)1134 estimate_gso_pkts(struct __kern_packet *pkt)
1135 {
1136 	packet_tso_flags_t tso_flags;
1137 	uint16_t mss;
1138 	uint32_t n_pkts = 0, total_hlen = 0, total_len = 0;
1139 
1140 	tso_flags = pkt->pkt_csum_flags & PACKET_CSUM_TSO_FLAGS;
1141 	mss = pkt->pkt_proto_seg_sz;
1142 
1143 	if (tso_flags == PACKET_TSO_IPV4) {
1144 		total_hlen = sizeof(struct ip) + sizeof(struct tcphdr);
1145 	} else if (tso_flags == PACKET_TSO_IPV6) {
1146 		total_hlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
1147 	}
1148 	if (total_hlen != 0 && mss != 0) {
1149 		total_len = pkt->pkt_length;
1150 		n_pkts = (uint32_t)
1151 		    (SK_ROUNDUP((total_len - total_hlen), mss) / mss);
1152 	}
1153 	DTRACE_SKYWALK5(estimate__gso, packet_tso_flags_t, tso_flags,
1154 	    uint32_t, total_hlen, uint32_t, total_len, uint16_t, mss,
1155 	    uint32_t, n_pkts);
1156 	return n_pkts;
1157 }
1158 
1159 /*
1160  * This function retrieves a chain of packets of the same type only
1161  * (GSO or non-GSO).
1162  */
1163 static inline void
fsw_tx_ring_dequeue_pktq(struct nx_flowswitch * fsw,struct __kern_channel_ring * r,uint32_t n_pkts_max,struct pktq * pktq,uint32_t * n_bytes,uint32_t * gso_pkts_estimate)1164 fsw_tx_ring_dequeue_pktq(struct nx_flowswitch *fsw,
1165     struct __kern_channel_ring *r, uint32_t n_pkts_max,
1166     struct pktq *pktq, uint32_t *n_bytes, uint32_t *gso_pkts_estimate)
1167 {
1168 	uint32_t n_pkts = 0;
1169 	slot_idx_t idx, idx_end;
1170 	idx = r->ckr_khead;
1171 	idx_end = r->ckr_rhead;
1172 	struct nexus_vp_adapter *vpna = VPNA(KRNA(r));
1173 	boolean_t gso_enabled, gso_required;
1174 	uint32_t gso_pkts;
1175 
1176 	gso_enabled = (fsw->fsw_tso_mode == FSW_TSO_MODE_SW);
1177 	ASSERT(KPKTQ_EMPTY(pktq));
1178 	*n_bytes = 0;
1179 	for (; n_pkts < n_pkts_max &&
1180 	    (!gso_enabled || fsw_gso_batch == 0 ||
1181 	    *gso_pkts_estimate < fsw_gso_batch) &&
1182 	    idx != idx_end; idx = SLOT_NEXT(idx, r->ckr_lim)) {
1183 		struct __kern_slot_desc *ksd = KR_KSD(r, idx);
1184 		struct __kern_packet *pkt = ksd->sd_pkt;
1185 
1186 		ASSERT(pkt->pkt_nextpkt == NULL);
1187 
1188 		_FSW_INJECT_ERROR(20, pkt->pkt_qum_qflags,
1189 		    pkt->pkt_qum_qflags | QUM_F_DROPPED, null_func);
1190 		if (__improbable(((pkt->pkt_qum_qflags & QUM_F_DROPPED) != 0))
1191 		    || (pkt->pkt_length == 0)) {
1192 			KR_SLOT_DETACH_METADATA(r, ksd);
1193 			FSW_STATS_INC(FSW_STATS_DROP);
1194 			pp_free_packet_single(pkt);
1195 			continue;
1196 		}
1197 		if (gso_enabled) {
1198 			gso_pkts = estimate_gso_pkts(pkt);
1199 
1200 			/*
1201 			 * We use the first packet to determine what
1202 			 * type the subsequent ones need to be (GSO or
1203 			 * non-GSO).
1204 			 */
1205 			if (n_pkts == 0) {
1206 				gso_required = (gso_pkts != 0);
1207 			} else {
1208 				if (gso_required != (gso_pkts != 0)) {
1209 					break;
1210 				}
1211 			}
1212 			*gso_pkts_estimate += gso_pkts;
1213 		}
1214 		KR_SLOT_DETACH_METADATA(r, ksd);
1215 		if (NA_CHANNEL_EVENT_ATTACHED(&vpna->vpna_up)) {
1216 			__packet_set_tx_nx_port(SK_PKT2PH(pkt),
1217 			    vpna->vpna_nx_port, vpna->vpna_gencnt);
1218 		}
1219 		n_pkts++;
1220 		*n_bytes += pkt->pkt_length;
1221 		KPKTQ_ENQUEUE(pktq, pkt);
1222 	}
1223 	r->ckr_khead = idx;
1224 	r->ckr_ktail = SLOT_PREV(idx, r->ckr_lim);
1225 	DTRACE_SKYWALK5(tx__ring__dequeue, struct nx_flowswitch *, fsw,
1226 	    ifnet_t, fsw->fsw_ifp, uint32_t, n_pkts, uint32_t, *n_bytes,
1227 	    uint32_t, *gso_pkts_estimate);
1228 }
1229 
/*
 * Attach as many packets from pktq as fit into the destination channel
 * Rx ring, then publish the new tail and notify the channel.  Packets
 * that do not fit remain in pktq for the caller to dispose of.
 */
static void
fsw_ring_enqueue_pktq(struct nx_flowswitch *fsw, struct __kern_channel_ring *r,
    struct pktq *pktq)
{
#pragma unused(fsw)
	struct __kern_packet *pkt;
	struct __kern_quantum *kqum;
	uint32_t kr_space_avail = 0;
	uint32_t n, n_pkts = 0, n_bytes = 0;
	slot_idx_t idx = 0, idx_start = 0, idx_end = 0;

	kr_enter(r, TRUE);

	idx_start = r->ckr_ktail;
	kr_space_avail = kr_available_slots_rxring(r);
	_FSW_INJECT_ERROR(40, kr_space_avail, 0, null_func);
	/* fill no more than the free slots */
	n = MIN(kr_space_avail, KPKTQ_LEN(pktq));
	_FSW_INJECT_ERROR(41, n, 0, null_func);
	idx_end = SLOT_INCREMENT(idx_start, n, r->ckr_lim);

	idx = idx_start;
	while (idx != idx_end) {
		KPKTQ_DEQUEUE(pktq, pkt);
		kqum = SK_PTR_ADDR_KQUM(pkt);
		kqum->qum_qflags |= QUM_F_FINALIZED;
		n_pkts++;
		n_bytes += pkt->pkt_length;
		KR_SLOT_ATTACH_METADATA(r, KR_KSD(r, idx), kqum);
		if (__improbable(pkt->pkt_trace_id != 0)) {
			/* trace: leaving the flowswitch, entering the channel */
			KDBG(SK_KTRACE_PKT_RX_FSW | DBG_FUNC_END, pkt->pkt_trace_id);
			KDBG(SK_KTRACE_PKT_RX_CHN | DBG_FUNC_START, pkt->pkt_trace_id);
		}
		idx = SLOT_NEXT(idx, r->ckr_lim);
	}

	kr_update_stats(r, n_pkts, n_bytes);

	/*
	 * ensure slot attachments are visible before updating the
	 * tail pointer
	 */
	os_atomic_thread_fence(seq_cst);

	r->ckr_ktail = idx_end;

	kr_exit(r);

	r->ckr_na_notify(r, kernproc, NA_NOTEF_PUSH);

	SK_DF(SK_VERB_FSW_DP | SK_VERB_RING, "%s enqueued %d pkts",
	    r->ckr_name, n_pkts);
}
1282 
1283 static void
pkts_to_pktq(struct __kern_packet ** __counted_by (n_pkts)pkts,uint32_t n_pkts,struct pktq * pktq)1284 pkts_to_pktq(struct __kern_packet **__counted_by(n_pkts)pkts, uint32_t n_pkts, struct pktq *pktq)
1285 {
1286 	ASSERT(KPKTQ_EMPTY(pktq));
1287 
1288 	for (uint32_t i = 0; i < n_pkts; i++) {
1289 		struct __kern_packet *__single pkt = pkts[i];
1290 		ASSERT(pkt->pkt_nextpkt == NULL);
1291 		KPKTQ_ENQUEUE(pktq, pkt);
1292 	}
1293 }
1294 
/*
 * This function is modeled after nx_netif_host_grab_pkts() in nx_netif_host.c.
 *
 * Convert a queue of native packets into an mbuf chain for the host stack.
 * mbufs are batch-allocated up front based on the largest packet size seen
 * recently (fsw_rx_largest_size); packets that fail length validation or
 * mbuf allocation are dropped (counted in stats).  On return *m_headp /
 * *m_tailp delimit the chain, *cnt / *bytes its totals, and pktq is freed.
 */
SK_NO_INLINE_ATTRIBUTE
static void
convert_native_pktq_to_mbufs(struct nx_flowswitch *fsw, struct pktq *pktq,
    struct mbuf **m_headp, struct mbuf **m_tailp, uint32_t *cnt, uint32_t *bytes)
{
	uint32_t tot_cnt;
	unsigned int num_segs = 1;
	struct mbuf *__single mhead, *__single head = NULL;
	struct mbuf *__single tail = NULL, **__single tailp = &head;
	uint32_t mhead_cnt, mhead_bufsize;
	uint32_t mhead_waste = 0;
	uint32_t mcnt = 0, mbytes = 0;
	uint32_t largest, max_pkt_len;
	struct __kern_packet *__single pkt;
	struct kern_pbufpool *pp;

	tot_cnt = KPKTQ_LEN(pktq);
	ASSERT(tot_cnt > 0);
	mhead_cnt = tot_cnt;

	/*
	 * Opportunistically batch-allocate the mbufs based on the largest
	 * packet size we've seen in the recent past.  Note that we reset
	 * fe_rx_largest_size below if we notice that we're under-utilizing the
	 * allocated buffers (thus disabling this batch allocation).
	 */
	largest = *(volatile uint32_t*)&fsw->fsw_rx_largest_size; /* read once */
	if (__probable(largest != 0)) {
		/* pick the smallest cluster size that covers "largest" */
		if (largest <= MCLBYTES) {
			mhead = m_allocpacket_internal(&mhead_cnt, MCLBYTES,
			    &num_segs, M_NOWAIT, 1, 0);
			mhead_bufsize = MCLBYTES;
		} else if (largest <= MBIGCLBYTES) {
			mhead = m_allocpacket_internal(&mhead_cnt, MBIGCLBYTES,
			    &num_segs, M_NOWAIT, 1, 0);
			mhead_bufsize = MBIGCLBYTES;
		} else if (largest <= M16KCLBYTES) {
			mhead = m_allocpacket_internal(&mhead_cnt, M16KCLBYTES,
			    &num_segs, M_NOWAIT, 1, 0);
			mhead_bufsize = M16KCLBYTES;
		} else if (largest <= M16KCLBYTES * 2) {
			num_segs = 2;
			mhead = m_allocpacket_internal(&mhead_cnt, M16KCLBYTES * 2,
			    &num_segs, M_NOWAIT, 1, 0);
			mhead_bufsize = M16KCLBYTES * 2;
		} else {
			/* too large for batching; allocate per packet below */
			mhead = NULL;
			mhead_bufsize = mhead_cnt = 0;
		}
	} else {
		/* batching disabled (see reset at the end of this function) */
		mhead = NULL;
		mhead_bufsize = mhead_cnt = 0;
	}
	DTRACE_SKYWALK4(bufstats, uint32_t, largest, uint32_t, mhead_bufsize,
	    uint32_t, mhead_cnt, uint32_t, tot_cnt);

	pp = __DECONST(struct kern_pbufpool *, KPKTQ_FIRST(pktq)->pkt_qum.qum_pp);
	max_pkt_len = PP_BUF_SIZE_DEF(pp) * pp->pp_max_frags;

	KPKTQ_FOREACH(pkt, pktq) {
		uint32_t tot_len, len;
		uint16_t pad, llhlen, iphlen;
		boolean_t do_cksum_rx;
		struct mbuf *__single m;
		int error;

		llhlen = pkt->pkt_l2_len;
		len = pkt->pkt_length;
		/* sanity-check lengths; drop rather than copy garbage */
		if (__improbable(len > max_pkt_len || len == 0 || llhlen > len)) {
			DTRACE_SKYWALK2(bad__len, struct nx_flowswitch *, fsw,
			    struct __kern_packet *, pkt);
			FSW_STATS_INC(FSW_STATS_DROP);
			FSW_STATS_INC(FSW_STATS_RX_COPY_BAD_LEN);
			continue;
		}
		/* begin payload on 32-bit boundary; figure out the padding */
		pad = (uint16_t)P2ROUNDUP(llhlen, sizeof(uint32_t)) - llhlen;
		tot_len = pad + len;

		/* remember largest packet size */
		if (__improbable(largest < tot_len)) {
			largest = MAX(tot_len, MCLBYTES);
		}

		/*
		 * If the above batch allocation returned partial
		 * success, we try a blocking allocation here again.
		 */
		m = mhead;
		if (__improbable(m == NULL || tot_len > mhead_bufsize)) {
			ASSERT(mhead != NULL || mhead_cnt == 0);
			num_segs = 1;
			if (tot_len > M16KCLBYTES) {
				num_segs = 0;
			}
			if ((error = mbuf_allocpacket(MBUF_DONTWAIT, tot_len,
			    &num_segs, &m)) != 0) {
				DTRACE_SKYWALK2(bad__len,
				    struct nx_flowswitch *, fsw,
				    struct __kern_packet *, pkt);
				FSW_STATS_INC(FSW_STATS_DROP_NOMEM_MBUF);
				FSW_STATS_INC(FSW_STATS_DROP);
				continue;
			}
		} else {
			/* take one mbuf off the pre-allocated chain */
			mhead = m->m_nextpkt;
			m->m_nextpkt = NULL;
			ASSERT(mhead_cnt != 0);
			--mhead_cnt;

			/* check if we're underutilizing large buffers */
			if (__improbable(mhead_bufsize > MCLBYTES &&
			    tot_len < (mhead_bufsize >> 1))) {
				++mhead_waste;
			}
			/*
			 * Clean up unused mbuf.
			 * Ony need to do this when we pre-alloc 2x16K mbufs
			 */
			if (__improbable(mhead_bufsize >= tot_len + M16KCLBYTES)) {
				ASSERT(mhead_bufsize == 2 * M16KCLBYTES);
				struct mbuf *m_extra = m->m_next;
				ASSERT(m_extra != NULL);
				ASSERT(m_extra->m_len == 0);
				ASSERT(M_SIZE(m_extra) == M16KCLBYTES);
				m->m_next = NULL;
				m_freem(m_extra);
				FSW_STATS_INC(FSW_STATS_RX_WASTED_16KMBUF);
			}
		}
		m->m_data += pad;
		/*
		 * XXX -fbounds-safety: external dependency
		 * mtod does not work because m_len is 0
		 */
		m->m_pkthdr.pkt_hdr = m_mtod_current(m);

		/* don't include IP header from partial sum */
		if (__probable((pkt->pkt_qum_qflags &
		    QUM_F_FLOW_CLASSIFIED) != 0)) {
			iphlen = pkt->pkt_flow_ip_hlen;
			do_cksum_rx = sk_cksum_rx;
		} else {
			iphlen = 0;
			do_cksum_rx = FALSE;
		}

		fsw->fsw_pkt_copy_to_mbuf(NR_RX, SK_PKT2PH(pkt),
		    pkt->pkt_headroom, m, 0, len, do_cksum_rx,
		    llhlen + iphlen);

		FSW_STATS_INC(FSW_STATS_RX_COPY_PKT2MBUF);
		if (do_cksum_rx) {
			FSW_STATS_INC(FSW_STATS_RX_COPY_SUM);
		}
#if DEBUG || DEVELOPMENT
		if (__improbable(pkt_trailers > 0)) {
			(void) pkt_add_trailers_mbuf(m, llhlen + iphlen);
		}
#endif /* DEBUG || DEVELOPMENT */
		/* strip the link-layer header; host stack expects L3 */
		m_adj(m, llhlen);

		m->m_pkthdr.rcvif = fsw->fsw_ifp;
		if (__improbable((pkt->pkt_link_flags &
		    PKT_LINKF_ETHFCS) != 0)) {
			m->m_flags |= M_HASFCS;
		}
		if (__improbable(pkt->pkt_pflags & PKT_F_WAKE_PKT)) {
			m->m_pkthdr.pkt_flags |= PKTF_WAKE_PKT;
		}
		ASSERT(m->m_nextpkt == NULL);
		tail = m;
		*tailp = m;
		tailp = &m->m_nextpkt;
		mcnt++;
		mbytes += m_pktlen(m);
	}
	/* free any leftovers */
	if (__improbable(mhead != NULL)) {
		DTRACE_SKYWALK1(mhead__leftover, uint32_t, mhead_cnt);
		ASSERT(mhead_cnt != 0);
		(void) m_freem_list(mhead);
		mhead = NULL;
		mhead_cnt = 0;
	}

	/* reset if most packets (>50%) are smaller than our batch buffers */
	if (__improbable(mhead_waste > ((uint32_t)tot_cnt >> 1))) {
		DTRACE_SKYWALK4(mhead__waste, struct nx_flowswitch *, fsw,
		    struct flow_entry *, NULL, uint32_t, mhead_waste,
		    uint32_t, tot_cnt);
		largest = 0;
	}

	if (largest != fsw->fsw_rx_largest_size) {
		os_atomic_store(&fsw->fsw_rx_largest_size, largest, release);
	}

	pp_free_pktq(pktq);
	*m_headp = head;
	*m_tailp = tail;
	*cnt = mcnt;
	*bytes = mbytes;
}
1502 
/*
 * This function only extracts the mbuf from the packet. The caller frees
 * the packet.
 *
 * Detaches the attached mbuf from a compat packet, propagates any partial
 * Rx checksum into the mbuf header, strips the link-layer header, and
 * returns the mbuf (or NULL on a bad length, with stats bumped).
 */
static inline struct mbuf *
convert_compat_pkt_to_mbuf(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
{
	struct mbuf *m;
	struct pkthdr *mhdr;
	uint16_t llhlen;

	m = pkt->pkt_mbuf;
	ASSERT(m != NULL);

	llhlen = pkt->pkt_l2_len;
	/* an L2 header longer than the packet itself is bogus: drop */
	if (llhlen > pkt->pkt_length) {
		m_freem(m);
		KPKT_CLEAR_MBUF_DATA(pkt);
		DTRACE_SKYWALK2(bad__len, struct nx_flowswitch *, fsw,
		    struct __kern_packet *, pkt);
		FSW_STATS_INC(FSW_STATS_DROP);
		FSW_STATS_INC(FSW_STATS_RX_COPY_BAD_LEN);
		return NULL;
	}
	mhdr = &m->m_pkthdr;
	/* carry over a partial checksum if the mbuf doesn't already have one */
	if ((mhdr->csum_flags & CSUM_DATA_VALID) == 0 &&
	    PACKET_HAS_PARTIAL_CHECKSUM(pkt)) {
		mhdr->csum_flags &= ~CSUM_RX_FLAGS;
		mhdr->csum_flags |= (CSUM_DATA_VALID | CSUM_PARTIAL);
		mhdr->csum_rx_start = pkt->pkt_csum_rx_start_off;
		mhdr->csum_rx_val = pkt->pkt_csum_rx_value;
	}
#if DEBUG || DEVELOPMENT
	uint32_t extra = 0;
	if (__improbable(pkt_trailers > 0)) {
		extra = pkt_add_trailers_mbuf(m, llhlen);
	}
#endif /* DEBUG || DEVELOPMENT */
	m_adj(m, llhlen);
	/*
	 * NOTE(review): "extra" is only declared under DEBUG || DEVELOPMENT;
	 * this ASSERT presumably compiles away on release builds — confirm
	 * ASSERT is a no-op there, otherwise this would not build.
	 */
	ASSERT((uint32_t)m_pktlen(m) == ((pkt->pkt_length - llhlen) + extra));
	KPKT_CLEAR_MBUF_DATA(pkt);
	return m;
}
1546 
1547 SK_NO_INLINE_ATTRIBUTE
1548 static void
convert_compat_pktq_to_mbufs(struct nx_flowswitch * fsw,struct pktq * pktq,struct mbuf ** m_head,struct mbuf ** m_tail,uint32_t * cnt,uint32_t * bytes)1549 convert_compat_pktq_to_mbufs(struct nx_flowswitch *fsw, struct pktq *pktq,
1550     struct mbuf **m_head, struct mbuf **m_tail, uint32_t *cnt, uint32_t *bytes)
1551 {
1552 	struct __kern_packet *pkt;
1553 	struct mbuf *__single m, *__single head = NULL;
1554 	struct mbuf *__single tail = NULL, **__single tailp = &head;
1555 	uint32_t c = 0, b = 0;
1556 
1557 	KPKTQ_FOREACH(pkt, pktq) {
1558 		m = convert_compat_pkt_to_mbuf(fsw, pkt);
1559 		if (__improbable(m == NULL)) {
1560 			continue;
1561 		}
1562 		tail = m;
1563 		*tailp = m;
1564 		tailp = &m->m_nextpkt;
1565 		c++;
1566 		b += m_pktlen(m);
1567 	}
1568 	pp_free_pktq(pktq);
1569 	*m_head = head;
1570 	*m_tail = tail;
1571 	*cnt = c;
1572 	*bytes = b;
1573 }
1574 
1575 void
fsw_host_sendup(struct ifnet * ifp,struct mbufq * host_mq)1576 fsw_host_sendup(struct ifnet *ifp, struct mbufq *host_mq)
1577 {
1578 	struct ifnet_stat_increment_param s;
1579 
1580 	if (mbufq_empty(host_mq)) {
1581 		return;
1582 	}
1583 
1584 	bzero(&s, sizeof(s));
1585 	s.packets_in = host_mq->count;
1586 	s.bytes_in = host_mq->bytes;
1587 	dlil_input_handler(ifp, mbufq_first(host_mq), mbufq_last(host_mq), &s, FALSE, NULL);
1588 }
1589 
1590 void
fsw_host_rx_cb(struct nx_flowswitch * fsw,struct pktq * pktq)1591 fsw_host_rx_cb(struct nx_flowswitch *fsw, struct pktq *pktq)
1592 {
1593 	ifnet_fsw_rx_cb_t __single cb;
1594 	void *__single cb_arg;
1595 
1596 	ASSERT(!KPKTQ_EMPTY(pktq));
1597 	if (ifnet_get_flowswitch_rx_callback(fsw->fsw_ifp, &cb, &cb_arg) == 0) {
1598 		ASSERT(cb != NULL);
1599 		ASSERT(cb_arg != NULL);
1600 		(*cb)(cb_arg, pktq);
1601 		ifnet_release_flowswitch_rx_callback(fsw->fsw_ifp);
1602 		if (KPKTQ_EMPTY(pktq)) {
1603 			return;
1604 		} else {
1605 			DTRACE_SKYWALK2(leftover__pkts, struct nx_flowswitch *, fsw,
1606 			    struct pktq *, pktq);
1607 		}
1608 	}
1609 }
1610 
1611 void
fsw_host_rx_enqueue_mbq(struct nx_flowswitch * fsw,struct pktq * pktq,struct mbufq * host_mq)1612 fsw_host_rx_enqueue_mbq(struct nx_flowswitch *fsw, struct pktq *pktq,
1613     struct mbufq *host_mq)
1614 {
1615 	struct mbuf *__single m_head = NULL, *__single m_tail = NULL;
1616 	uint32_t cnt = 0, bytes = 0;
1617 	boolean_t compat;
1618 
1619 	if (KPKTQ_EMPTY(pktq)) {
1620 		return;
1621 	}
1622 
1623 	/* All packets in the pktq must have the same type */
1624 	compat = ((KPKTQ_FIRST(pktq)->pkt_pflags & PKT_F_MBUF_DATA) != 0);
1625 	if (compat) {
1626 		convert_compat_pktq_to_mbufs(fsw, pktq, &m_head, &m_tail, &cnt,
1627 		    &bytes);
1628 	} else {
1629 		convert_native_pktq_to_mbufs(fsw, pktq, &m_head, &m_tail, &cnt,
1630 		    &bytes);
1631 	}
1632 	if (__improbable(m_head == NULL)) {
1633 		DTRACE_SKYWALK1(empty__head, struct nx_flowswitch *, fsw);
1634 		return;
1635 	}
1636 
1637 	mbufq_enqueue(host_mq, m_head, m_tail, cnt, bytes);
1638 }
1639 
/*
 * Enqueue as many packets as fit into the destination ring, then drop
 * whatever remains in pktq (ring full).  Note the KPKTQ_LEN/drop below
 * must run after fsw_ring_enqueue_pktq(), which dequeues the packets it
 * managed to place.
 */
void
fsw_ring_enqueue_tail_drop(struct nx_flowswitch *fsw,
    struct __kern_channel_ring *r, struct pktq *pktq)
{
	fsw_ring_enqueue_pktq(fsw, r, pktq);
	/*
	 * Rx stall detection: don't update enqueue ts if dequeue ts < enqueue ts.
	 * This is to ensure we use the timestamp of the earliest enqueue without
	 * a dequeue.
	 */
	if (r->ckr_rx_dequeue_ts >= r->ckr_rx_enqueue_ts) {
		r->ckr_rx_enqueue_ts = net_uptime();
	}
	FSW_STATS_ADD(FSW_STATS_RX_DST_RING_FULL, KPKTQ_LEN(pktq));
	dp_drop_pktq(fsw, pktq, 0, DROP_REASON_RX_DST_RING_FULL, __LINE__,
	    DROPTAP_FLAG_L2_MISSING);
}
1657 
/*
 * Resolve the nexus adapter behind a flow entry's nexus port, returning
 * NULL (with a stat/log per failure mode) if the port is special
 * (dev/host), invalid, detached, inactive, or defunct.
 */
static struct nexus_adapter *
flow_get_na(struct nx_flowswitch *fsw, struct flow_entry *fe)
{
	struct kern_nexus *nx = fsw->fsw_nx;
	struct nexus_adapter *na = NULL;
	nexus_port_t port = fe->fe_nx_port;

	/* the built-in dev/host ports are not backed by a user adapter */
	if (port == FSW_VP_DEV || port == FSW_VP_HOST) {
		SK_ERR("dev or host ports have no NA");
		return NULL;
	}

	if (__improbable(!nx_port_is_valid(nx, port))) {
		SK_DF(SK_VERB_FSW_DP, "%s[%d] port no longer valid",
		    if_name(fsw->fsw_ifp), port);
		return NULL;
	}

	na = nx_port_get_na(nx, port);
	if (__improbable(na == NULL)) {
		FSW_STATS_INC(FSW_STATS_DST_NXPORT_INVALID);
		SK_DF(SK_VERB_FSW_DP, "%s[%d] NA no longer valid",
		    if_name(fsw->fsw_ifp), port);
		return NULL;
	}

	if (__improbable(!NA_IS_ACTIVE(na))) {
		FSW_STATS_INC(FSW_STATS_DST_NXPORT_INACTIVE);
		SK_DF(SK_VERB_FSW_DP, "%s[%d] NA no longer active",
		    if_name(fsw->fsw_ifp), port);
		return NULL;
	}

	if (__improbable(nx_port_is_defunct(nx, port))) {
		FSW_STATS_INC(FSW_STATS_DST_NXPORT_DEFUNCT);
		SK_DF(SK_VERB_FSW_DP, "%s[%d] NA defuncted",
		    if_name(fsw->fsw_ifp), port);
		return NULL;
	}

	return na;
}
1700 
1701 static inline struct __kern_channel_ring *
flow_get_ring(struct nx_flowswitch * fsw,struct flow_entry * fe,enum txrx txrx)1702 flow_get_ring(struct nx_flowswitch *fsw, struct flow_entry *fe, enum txrx txrx)
1703 {
1704 	struct nexus_vp_adapter *na = NULL;
1705 	struct __kern_channel_ring *__single r = NULL;
1706 
1707 	na = VPNA(flow_get_na(fsw, fe));
1708 	if (__improbable(na == NULL)) {
1709 		return NULL;
1710 	}
1711 
1712 	switch (txrx) {
1713 	case NR_RX:
1714 		r = KR_SINGLE(&na->vpna_up.na_rx_rings[0]);
1715 		break;
1716 	case NR_TX:
1717 		r = KR_SINGLE(&na->vpna_up.na_tx_rings[0]);
1718 		break;
1719 	default:
1720 		__builtin_unreachable();
1721 		VERIFY(0);
1722 	}
1723 
1724 	if (__improbable(KR_DROP(r))) {
1725 		FSW_STATS_INC(FSW_STATS_DST_RING_DROPMODE);
1726 		SK_DF(SK_VERB_FSW_DP | SK_VERB_RING, "r %p %s drop mode",
1727 		    SK_KVA(r), r->ckr_name);
1728 		return NULL;
1729 	}
1730 
1731 	ASSERT(KRNA(r)->na_md_type == NEXUS_META_TYPE_PACKET);
1732 
1733 #if (DEVELOPMENT || DEBUG)
1734 	if (r != NULL) {
1735 		_FSW_INJECT_ERROR(4, r, NULL, null_func);
1736 	}
1737 #endif /* DEVELOPMENT || DEBUG */
1738 
1739 	return r;
1740 }
1741 
/*
 * Convenience wrapper: the Rx channel ring for a flow entry's port, or
 * NULL if the backing adapter/ring is unavailable (see flow_get_ring()).
 */
struct __kern_channel_ring *
fsw_flow_get_rx_ring(struct nx_flowswitch *fsw, struct flow_entry *fe)
{
	return flow_get_ring(fsw, fe, NR_RX);
}
1747 
/*
 * Validate a flow's local address against the interface generation count
 * and decide whether the flow may continue to carry traffic.
 *
 * Returns true if the flow is viable; false if it is (or is about to be
 * marked) nonviable, in which case the caller should drop the packets.
 * Marking nonviable requires the writer lock, which is not held here, so
 * this only sets fe_want_nonviable and schedules the reaper.
 */
static bool
dp_flow_route_process(struct nx_flowswitch *fsw, struct flow_entry *fe)
{
	struct flow_route *fr = fe->fe_route;
	struct ifnet *ifp = fsw->fsw_ifp;

	/*
	 * Cheap gencnt compare first; only on mismatch do the full
	 * (expensive) key revalidation against the interface addresses.
	 */
	if (__improbable(!(fe->fe_flags & FLOWENTF_NONVIABLE) &&
	    !fe->fe_want_nonviable && (fe->fe_key.fk_mask & FKMASK_SRC) &&
	    fe->fe_laddr_gencnt != ifp->if_nx_flowswitch.if_fsw_ipaddr_gencnt &&
	    !flow_route_key_validate(&fe->fe_key, ifp, &fe->fe_laddr_gencnt))) {
		/*
		 * The source address is no longer around; we want this
		 * flow to be nonviable, but that requires holding the lock
		 * as writer (which isn't the case now.)  Indicate that
		 * we need to finalize the nonviable later down below.
		 *
		 * We also request that the flow route be re-configured,
		 * if this is a connected mode flow.
		 *
		 */
		if (!(fe->fe_flags & FLOWENTF_NONVIABLE)) {
			/*
			 * fsw_pending_nonviable is a hint for reaper thread;
			 * due to the fact that setting fe_want_nonviable and
			 * incrementing fsw_pending_nonviable counter is not
			 * atomic, let the increment happen first, and the
			 * thread losing the CAS does decrement.
			 */
			os_atomic_inc(&fsw->fsw_pending_nonviable, relaxed);
			if (os_atomic_cmpxchg(&fe->fe_want_nonviable, 0, 1, acq_rel)) {
				fsw_reap_sched(fsw);
			} else {
				os_atomic_dec(&fsw->fsw_pending_nonviable, relaxed);
			}
		}
		if (fr != NULL) {
			os_atomic_inc(&fr->fr_want_configure, relaxed);
		}
	}

	/* if flow was (or is going to be) marked as nonviable, drop it */
	if (__improbable(fe->fe_want_nonviable ||
	    (fe->fe_flags & FLOWENTF_NONVIABLE) != 0)) {
		SK_DF(SK_VERB_FSW_DP | SK_VERB_FLOW, "flow %p non-viable",
		    SK_KVA(fe));
		return false;
	}
	return true;
}
1797 
1798 bool
dp_flow_rx_route_process(struct nx_flowswitch * fsw,struct flow_entry * fe)1799 dp_flow_rx_route_process(struct nx_flowswitch *fsw, struct flow_entry *fe)
1800 {
1801 	bool okay;
1802 	okay = dp_flow_route_process(fsw, fe);
1803 #if (DEVELOPMENT || DEBUG)
1804 	if (okay) {
1805 		_FSW_INJECT_ERROR(5, okay, false, null_func);
1806 	}
1807 #endif /* DEVELOPMENT || DEBUG */
1808 
1809 	return okay;
1810 }
1811 
/*
 * Rx datapath for a user flow: validates the route, snoops for pktap,
 * moves packets from the source (device) pool into the destination
 * channel's packet pool, runs flow tracking, stamps flow metadata, and
 * enqueues the results onto the flow's Rx ring.  Packets that fail any
 * stage are dropped with per-stage statistics and drop reasons.
 */
void
dp_flow_rx_process(struct nx_flowswitch *fsw, struct flow_entry *fe,
    struct pktq *rx_pkts, uint32_t rx_bytes, struct mbufq *host_mq,
    uint32_t flags)
{
#pragma unused(flags)
	struct pktq dpkts;              /* dst pool alloc'ed packets */
	struct pktq disposed_pkts;         /* done src packets */
	struct pktq dropped_pkts;         /* dropped src packets */
	struct pktq transferred_pkts;         /* dst packet ready for ring */
	struct __kern_packet *pkt, *tpkt;
	struct kern_pbufpool *dpp;
	uint32_t n_pkts = KPKTQ_LEN(rx_pkts);
	uint64_t buf_array[RX_BUFLET_BATCH_COUNT];
	uint16_t buf_array_iter = 0;
	uint32_t cnt, buf_cnt = 0;
	int err;
	/* reason/line describe the cause for the final batched drop below */
	drop_reason_t reason = DROP_REASON_UNSPECIFIED;
	uint16_t line = 0;

	KPKTQ_INIT(&dpkts);
	KPKTQ_INIT(&dropped_pkts);
	KPKTQ_INIT(&disposed_pkts);
	KPKTQ_INIT(&transferred_pkts);

	/* nonviable flow: snoop (for pktap) then drop everything */
	if (__improbable(!dp_flow_rx_route_process(fsw, fe))) {
		SK_ERR("Rx route bad");
		fsw_snoop_and_dequeue(fe, &dropped_pkts, rx_pkts, true);
		FSW_STATS_ADD(FSW_STATS_RX_FLOW_NONVIABLE, n_pkts);
		reason = DROP_REASON_FSW_FLOW_NONVIABLE;
		line = __LINE__;
		goto done;
	}

	if (fe->fe_nx_port == FSW_VP_HOST) {
		/*
		 * The host ring does not exist anymore so we can't take
		 * the enqueue path below. This path should only be hit
		 * for the rare tcp fragmentation case.
		 */
		/* all local pktqs are still empty here, so returning is safe */
		fsw_host_rx_enqueue_mbq(fsw, rx_pkts, host_mq);
		return;
	}

	/* find the ring */
	struct __kern_channel_ring *r;
	r = fsw_flow_get_rx_ring(fsw, fe);
	if (__improbable(r == NULL)) {
		fsw_snoop_and_dequeue(fe, &dropped_pkts, rx_pkts, true);
		reason = DROP_REASON_FSW_RX_RING_NOT_FOUND;
		line = __LINE__;
		goto done;
	}

	/* snoop before L2 is stripped */
	if (__improbable(pktap_total_tap_count != 0)) {
		fsw_snoop(fsw, fe, rx_pkts, true);
	}

	dpp = r->ckr_pp;
	/* batch allocate enough packets */
	err = pp_alloc_pktq(dpp, 1, &dpkts, n_pkts, NULL, NULL,
	    SKMEM_NOSLEEP);
	if (__improbable(err == ENOMEM)) {
		ASSERT(KPKTQ_EMPTY(&dpkts));
		KPKTQ_CONCAT(&dropped_pkts, rx_pkts);
		FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT, n_pkts);
		SK_ERR("failed to alloc %u pkts for kr %s, %p", n_pkts,
		    r->ckr_name, SK_KVA(r));
		reason = DROP_REASON_FSW_PP_ALLOC_FAILED;
		line = __LINE__;
		goto done;
	}

	/*
	 * estimate total number of buflets for the packet chain.
	 */
	cnt = howmany(rx_bytes, PP_BUF_SIZE_DEF(dpp));
	if (cnt > n_pkts) {
		/* chain needs more buflets than one-per-packet: prealloc extras */
		ASSERT(dpp->pp_max_frags > 1);
		cnt -= n_pkts;
		buf_cnt = MIN(RX_BUFLET_BATCH_COUNT, cnt);
		err = pp_alloc_buflet_batch(dpp, buf_array, &buf_cnt,
		    SKMEM_NOSLEEP, false);
		if (__improbable(buf_cnt == 0)) {
			KPKTQ_CONCAT(&dropped_pkts, rx_pkts);
			FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT, n_pkts);
			SK_ERR("failed to alloc %d buflets (err %d) for kr %s %p",
			    cnt, err, r->ckr_name, SK_KVA(r));
			reason = DROP_REASON_FSW_PP_ALLOC_FAILED;
			line = __LINE__;
			goto done;
		}
		err = 0;
	}

	/* extra processing for user flow */
	KPKTQ_FOREACH_SAFE(pkt, rx_pkts, tpkt) {
		err = 0;
		KPKTQ_REMOVE(rx_pkts, pkt);
		/* rx_bytes tracks the remaining payload, used to re-estimate
		 * how many buflets still need to be allocated below */
		if (rx_bytes > pkt->pkt_flow_ulen) {
			rx_bytes -= pkt->pkt_flow_ulen;
		} else {
			rx_bytes = 0;
		}
		err = flow_pkt_track(fe, pkt, true);
		_FSW_INJECT_ERROR(33, err, EPROTO, null_func);
		if (__improbable(err != 0)) {
			SK_DF(SK_VERB_FLOW_TRACK, "flow_pkt_track failed (err %d)", err);
			FSW_STATS_INC(FSW_STATS_RX_FLOW_TRACK_ERR);
			/* if need to trigger RST */
			if (err == ENETRESET) {
				flow_track_abort_tcp(fe, pkt, NULL);
			}
			dp_drop_pkt_single(fsw, pkt, 0, DROP_REASON_FSW_FLOW_TRACK_ERR,
			    DROPTAP_FLAG_L2_MISSING);
			continue;
		}

		/* transfer to dpkt */
		if (pkt->pkt_qum.qum_pp != dpp) {
			struct __kern_buflet *bprev, *bnew;
			struct __kern_packet *dpkt = NULL;
			uint32_t n_bufs, i;

			KPKTQ_DEQUEUE(&dpkts, dpkt);
			/* XXX Why would dpkt be NULL at this point? */
			if (__improbable(dpkt == NULL)) {
				FSW_STATS_INC(FSW_STATS_DROP_NOMEM_PKT);
				dp_drop_pkt_single(fsw, pkt, 0,
				    DROP_REASON_FSW_PP_ALLOC_FAILED, DROPTAP_FLAG_L2_MISSING);
				continue;
			}
			/* dpkt already owns one buflet; attach n_bufs extras */
			n_bufs = howmany(pkt->pkt_length, PP_BUF_SIZE_DEF(dpp));
			n_bufs--;
			for (i = 0; i < n_bufs; i++) {
				if (__improbable(buf_cnt == 0)) {
					/* batch exhausted: re-estimate from the
					 * bytes/packets still pending and refill */
					ASSERT(dpp->pp_max_frags > 1);
					buf_array_iter = 0;
					cnt = howmany(rx_bytes, PP_BUF_SIZE_DEF(dpp));
					n_pkts = KPKTQ_LEN(rx_pkts);
					if (cnt >= n_pkts) {
						cnt -= n_pkts;
					} else {
						cnt = 0;
					}
					cnt += (n_bufs - i);
					buf_cnt = MIN(RX_BUFLET_BATCH_COUNT,
					    cnt);
					cnt = buf_cnt;
					err = pp_alloc_buflet_batch(dpp,
					    buf_array, &buf_cnt,
					    SKMEM_NOSLEEP, false);
					if (__improbable(buf_cnt == 0)) {
						FSW_STATS_INC(FSW_STATS_DROP_NOMEM_PKT);
						dp_drop_pkt_single(fsw, pkt, 0,
						    DROP_REASON_FSW_PP_ALLOC_FAILED,
						    DROPTAP_FLAG_L2_MISSING);
						pkt = NULL;
						pp_free_packet_single(dpkt);
						dpkt = NULL;
						SK_ERR("failed to alloc %d "
						    "buflets (err %d) for "
						    "kr %s, %p", cnt, err,
						    r->ckr_name, SK_KVA(r));
						break;
					}
					err = 0;
				}
				ASSERT(buf_cnt != 0);
				if (i == 0) {
					PKT_GET_FIRST_BUFLET(dpkt, 1, bprev);
				}
				/*
				 * XXX -fbounds-safety: can't avoid using forge
				 * unless we change the signature of
				 * pp_alloc_buflet_batch().
				 */
				bnew = __unsafe_forge_single(kern_buflet_t,
				    buf_array[buf_array_iter]);
				buf_array[buf_array_iter] = 0;
				buf_array_iter++;
				buf_cnt--;
				VERIFY(kern_packet_add_buflet(SK_PKT2PH(dpkt),
				    bprev, bnew) == 0);
				bprev = bnew;
			}
			/* err != 0 here means the refill above failed and both
			 * pkt and dpkt were already freed/dropped */
			if (__improbable(err != 0)) {
				continue;
			}
			err = copy_packet_from_dev(fsw, pkt, dpkt);
			_FSW_INJECT_ERROR(43, err, EINVAL, null_func);
			if (__improbable(err != 0)) {
				SK_ERR("copy packet failed (err %d)", err);
				dp_drop_pkt_single(fsw, pkt, 0,
				    DROP_REASON_FSW_PKT_COPY_FAILED,
				    DROPTAP_FLAG_L2_MISSING);
				pp_free_packet_single(dpkt);
				dpkt = NULL;
				continue;
			}
			/* source packet consumed; continue with the dst copy */
			KPKTQ_ENQUEUE(&disposed_pkts, pkt);
			pkt = dpkt;
		}
		/* stamp flow identity/policy metadata on the outgoing packet */
		_UUID_COPY(pkt->pkt_flow_id, fe->fe_uuid);
		_UUID_COPY(pkt->pkt_policy_euuid, fe->fe_eproc_uuid);
		pkt->pkt_policy_id = fe->fe_policy_id;
		pkt->pkt_skip_policy_id = fe->fe_skip_policy_id;
		pkt->pkt_transport_protocol = fe->fe_transport_protocol;
		if (pkt->pkt_bufs_cnt > 1) {
			pkt->pkt_aggr_type = PKT_AGGR_SINGLE_IP;
		}
		KPKTQ_ENQUEUE(&transferred_pkts, pkt);
	}
	KPKTQ_FINI(rx_pkts);

	if (KPKTQ_LEN(&transferred_pkts) > 0) {
		fsw_ring_enqueue_tail_drop(fsw, r, &transferred_pkts);
	}
	KPKTQ_FINI(&transferred_pkts);

done:
	/* Free unused buflets */
	while (buf_cnt > 0) {
		/*
		 * XXX -fbounds-safety: can't avoid using forge unless we change
		 * the signature of pp_alloc_buflet_batch().
		 */
		pp_free_buflet(dpp, __unsafe_forge_single(kern_buflet_t,
		    (kern_buflet_t)(buf_array[buf_array_iter])));
		buf_array[buf_array_iter] = 0;
		buf_array_iter++;
		buf_cnt--;
	}
	dp_free_pktq(fsw, &dpkts);
	dp_free_pktq(fsw, &disposed_pkts);
	dp_drop_pktq(fsw, &dropped_pkts, 0, reason, line, DROPTAP_FLAG_L2_MISSING);
}
2051 
/*
 * Drain and process a flow entry's Rx backlog.
 *
 * Loops until fe_rx_pktq is observed empty under fe_rx_pktq_lock; each
 * iteration moves the whole backlog into a local queue, drops the lock,
 * and runs the flow's fe_rx_process handler.  On the empty observation
 * the worker detaches itself (fe_rx_worker_tid = 0) and unlinks the
 * entry from the batch list -- both must happen under the same lock
 * hold so a concurrent producer either sees the worker or re-links.
 */
static inline void
rx_flow_process(struct nx_flowswitch *fsw, struct flow_entry *fe,
    struct flow_entry_list *fes, struct mbufq *host_mq)
{
	struct pktq rx_pkts;
	uint32_t rx_bytes;
	uint32_t rx_proc_flags;

	ASSERT(!KPKTQ_EMPTY(&fe->fe_rx_pktq));
	ASSERT(KPKTQ_LEN(&fe->fe_rx_pktq) != 0);

	KPKTQ_INIT(&rx_pkts);
	for (;;) {
		lck_mtx_lock(&fe->fe_rx_pktq_lock);
		if (KPKTQ_EMPTY(&fe->fe_rx_pktq)) {
			fe->fe_rx_worker_tid = 0;
			TAILQ_REMOVE(fes, fe, fe_rx_link);
			lck_mtx_unlock(&fe->fe_rx_pktq_lock);
			break;
		}
		/* take the whole backlog and its byte/frag accounting */
		KPKTQ_CONCAT(&rx_pkts, &fe->fe_rx_pktq);
		KPKTQ_DISPOSE(&fe->fe_rx_pktq);
		rx_bytes = fe->fe_rx_pktq_bytes;
		rx_proc_flags = fe->fe_rx_frag_count ? FLOW_PROC_FLAG_FRAGMENTS : 0;
		fe->fe_rx_pktq_bytes = 0;
		fe->fe_rx_frag_count = 0;
		lck_mtx_unlock(&fe->fe_rx_pktq_lock);
		SK_DF(SK_VERB_FSW_DP | SK_VERB_RX, "Rx %d pkts for fe %p port %d",
		    KPKTQ_LEN(&rx_pkts), fe, fe->fe_nx_port);
		/* flow related processing (default, agg, fpd, etc.) */
		fe->fe_rx_process(fsw, fe, &rx_pkts, rx_bytes, host_mq, rx_proc_flags);
	}
	/* fe_rx_process is expected to consume everything it was handed */
	ASSERT(KPKTQ_EMPTY(&rx_pkts));

	if (__improbable(fe->fe_want_withdraw)) {
		fsw_reap_sched(fsw);
	}
}
2090 
2091 static void
dp_rx_process_low_power_wake(struct nx_flowswitch * fsw,struct flow_entry * fe)2092 dp_rx_process_low_power_wake(struct nx_flowswitch *fsw, struct flow_entry *fe)
2093 {
2094 	if (fe->fe_port_reservation == NULL || (fe->fe_flags & FLOWENTF_EXTRL_PORT) != 0) {
2095 		return;
2096 	}
2097 	if (fe->fe_key.fk_proto == IPPROTO_TCP && (fe->fe_flags & FLOWENTF_CONNECTION_IDLE)) {
2098 		os_log(wake_packet_log_handle, "dp_rx_process_low_power_wake LPW TCP connection idle");
2099 
2100 		if (flow_track_tcp_want_abort(fe)) {
2101 			os_atomic_or(&fe->fe_flags, FLOWENTF_CLOSE_NOTIFY | FLOWENTF_WAIT_CLOSE, relaxed);
2102 			fe->fe_want_withdraw = 1;
2103 			flow_track_abort_tcp(fe, NULL, NULL);
2104 		}
2105 	} else {
2106 		if_exit_lpw(fsw->fsw_ifp, "dp_rx_process_low_power_wake LPW connection not idle");
2107 	}
2108 }
2109 
/*
 * Post-classification handling of wake packets for flowswitch-owned
 * flows: records the matching port usage and, in low-power mode,
 * triggers the LPW idle-connection handling above.
 */
static inline void
dp_rx_process_wake_packet(struct nx_flowswitch *fsw, struct flow_entry *fe, struct __kern_packet *pkt)
{
	/*
	 * We only care about wake packets of flows that belong the flow switch
	 * as wake packets for the host stack are handled by the host input
	 * function
	 */

#if (DEBUG || DEVELOPMENT)
	/* For testing only */
	if (__improbable(fsw->fsw_ifp->if_xflags & IFXF_MARK_WAKE_PKT)) {
		if (check_wake_pkt(fsw->fsw_ifp, pkt) == true) {
			/*
			 * This is a one shot command
			 */
			fsw->fsw_ifp->if_xflags &= ~IFXF_MARK_WAKE_PKT;

			pkt->pkt_pflags |= PKT_F_WAKE_PKT;
		}
	}
#endif /* (DEBUG || DEVELOPMENT) */

	if (__improbable(pkt->pkt_pflags & PKT_F_WAKE_PKT)) {
		/* attribute the wake to the owning process/port */
		if_ports_used_match_pkt(fsw->fsw_ifp, pkt);

		/*
		 * When a packet is received in LPW mode for an idle TCP connection, the connection
		 * is aborted immediately with a RST so the peer drops the connection at once
		 */
		if (if_is_lpw_enabled(fsw->fsw_ifp)) {
			pkt->pkt_pflags |= __PKT_F_LPW;
			dp_rx_process_low_power_wake(fsw, fe);
		}
	}
}
2146 
2147 static void
_fsw_receive(struct nx_flowswitch * fsw,struct pktq * pktq)2148 _fsw_receive(struct nx_flowswitch *fsw, struct pktq *pktq)
2149 {
2150 	struct __kern_packet *__single pkt, *__single tpkt;
2151 	struct flow_entry_list fes = TAILQ_HEAD_INITIALIZER(fes);
2152 	struct flow_entry *__single fe, *__single prev_fe;
2153 	sa_family_t af;
2154 	struct pktq host_pkts, dropped_pkts;
2155 	drop_reason_t reason = DROP_REASON_UNSPECIFIED;
2156 	uint16_t line = 0;
2157 	int err;
2158 	uint64_t thread_id;
2159 	struct mbufq host_mq;
2160 	struct ifnet *ifp;
2161 
2162 	mbufq_init(&host_mq);
2163 	KPKTQ_INIT(&host_pkts);
2164 	KPKTQ_INIT(&dropped_pkts);
2165 
2166 	FSW_RLOCK(fsw);
2167 
2168 	if (__improbable(FSW_QUIESCED(fsw))) {
2169 		DTRACE_SKYWALK1(rx__quiesced, struct nx_flowswitch *, fsw);
2170 		KPKTQ_CONCAT(&dropped_pkts, pktq);
2171 		reason = DROP_REASON_FSW_QUIESCED;
2172 		line = __LINE__;
2173 		goto done;
2174 	}
2175 	if (__improbable(fsw->fsw_demux == NULL)) {
2176 		KPKTQ_CONCAT(&dropped_pkts, pktq);
2177 		reason = DROP_REASON_FSW_DEMUX_FAILED;
2178 		line = __LINE__;
2179 		goto done;
2180 	}
2181 
2182 	ifp = fsw->fsw_ifp;
2183 	thread_id = thread_tid(current_thread());
2184 	prev_fe = NULL;
2185 	KPKTQ_FOREACH_SAFE(pkt, pktq, tpkt) {
2186 		if (__probable(tpkt)) {
2187 			void *baddr;
2188 			MD_BUFLET_ADDR_ABS_PKT(tpkt, baddr);
2189 			SK_PREFETCH(baddr, 0);
2190 			/* prefetch L3 and L4 flow structs */
2191 			SK_PREFETCHW(tpkt->pkt_flow, 0);
2192 			SK_PREFETCHW(tpkt->pkt_flow, 128);
2193 		}
2194 
2195 		KPKTQ_REMOVE(pktq, pkt);
2196 
2197 		pkt = rx_prepare_packet(fsw, pkt);
2198 
2199 		af = fsw->fsw_demux(fsw, pkt);
2200 		if (__improbable(af == AF_UNSPEC)) {
2201 			KPKTQ_ENQUEUE(&host_pkts, pkt);
2202 			continue;
2203 		}
2204 
2205 		err = flow_pkt_classify(pkt, fsw->fsw_ifp, af, TRUE);
2206 		_FSW_INJECT_ERROR(1, err, ENXIO, null_func);
2207 		if (__improbable(err != 0)) {
2208 			FSW_STATS_INC(FSW_STATS_RX_FLOW_EXTRACT_ERR);
2209 			KPKTQ_ENQUEUE(&host_pkts, pkt);
2210 			continue;
2211 		}
2212 
2213 		if (__improbable(pkt->pkt_flow_ip_is_frag)) {
2214 			pkt = rx_process_ip_frag(fsw, pkt);
2215 			if (pkt == NULL) {
2216 				continue;
2217 			}
2218 		}
2219 
2220 		prev_fe = fe = rx_lookup_flow(fsw, pkt, prev_fe);
2221 		if (__improbable(fe == NULL)) {
2222 			KPKTQ_ENQUEUE_LIST(&host_pkts, pkt);
2223 			continue;
2224 		}
2225 
2226 		dp_rx_process_wake_packet(fsw, fe, pkt);
2227 
2228 		rx_flow_batch_packets(&fes, fe, pkt, thread_id);
2229 		prev_fe = fe;
2230 	}
2231 
2232 	struct flow_entry *tfe = NULL;
2233 	TAILQ_FOREACH_SAFE(fe, &fes, fe_rx_link, tfe) {
2234 		rx_flow_process(fsw, fe, &fes, &host_mq);
2235 		flow_entry_release(&fe);
2236 	}
2237 
2238 	if (!KPKTQ_EMPTY(&host_pkts)) {
2239 		fsw_host_rx_cb(fsw, &host_pkts);
2240 		fsw_host_rx_enqueue_mbq(fsw, &host_pkts, &host_mq);
2241 	}
2242 
2243 done:
2244 	dp_drop_pktq(fsw, &dropped_pkts, 0, reason, line, 0);
2245 	FSW_RUNLOCK(fsw);
2246 
2247 	fsw_host_sendup(ifp, &host_mq);
2248 }
2249 
2250 #if (DEVELOPMENT || DEBUG)
2251 static void
fsw_rps_rx(struct nx_flowswitch * fsw,uint32_t id,struct __kern_packet * pkt)2252 fsw_rps_rx(struct nx_flowswitch *fsw, uint32_t id,
2253     struct __kern_packet *pkt)
2254 {
2255 	struct fsw_rps_thread *frt = &fsw->fsw_rps_threads[id];
2256 
2257 	lck_mtx_lock_spin(&frt->frt_lock);
2258 	KPKTQ_ENQUEUE(&frt->frt_pktq, pkt);
2259 	lck_mtx_unlock(&frt->frt_lock);
2260 }
2261 
2262 static void
fsw_rps_thread_schedule(struct nx_flowswitch * fsw,uint32_t id)2263 fsw_rps_thread_schedule(struct nx_flowswitch *fsw, uint32_t id)
2264 {
2265 	struct fsw_rps_thread *frt = &fsw->fsw_rps_threads[id];
2266 
2267 	ASSERT(frt->frt_thread != THREAD_NULL);
2268 	lck_mtx_lock_spin(&frt->frt_lock);
2269 	ASSERT(!(frt->frt_flags & (FRT_TERMINATING | FRT_TERMINATED)));
2270 
2271 	frt->frt_requests++;
2272 	if (!(frt->frt_flags & FRT_RUNNING)) {
2273 		thread_wakeup((caddr_t)frt);
2274 	}
2275 	lck_mtx_unlock(&frt->frt_lock);
2276 }
2277 
2278 __attribute__((noreturn))
2279 static void
fsw_rps_thread_cont(void * v,wait_result_t w)2280 fsw_rps_thread_cont(void *v, wait_result_t w)
2281 {
2282 	struct fsw_rps_thread *__single frt = v;
2283 	struct nx_flowswitch *fsw = frt->frt_fsw;
2284 
2285 	lck_mtx_lock(&frt->frt_lock);
2286 	if (__improbable(w == THREAD_INTERRUPTIBLE ||
2287 	    (frt->frt_flags & FRT_TERMINATING) != 0)) {
2288 		goto terminate;
2289 	}
2290 	if (KPKTQ_EMPTY(&frt->frt_pktq)) {
2291 		goto done;
2292 	}
2293 	frt->frt_flags |= FRT_RUNNING;
2294 
2295 	for (;;) {
2296 		uint32_t requests = frt->frt_requests;
2297 		struct pktq pkts;
2298 
2299 		KPKTQ_INIT(&pkts);
2300 		KPKTQ_CONCAT(&pkts, &frt->frt_pktq);
2301 		lck_mtx_unlock(&frt->frt_lock);
2302 
2303 		sk_protect_t protect;
2304 		protect = sk_sync_protect();
2305 		_fsw_receive(fsw, &pkts);
2306 		sk_sync_unprotect(protect);
2307 
2308 		lck_mtx_lock(&frt->frt_lock);
2309 		if ((frt->frt_flags & FRT_TERMINATING) != 0 ||
2310 		    requests == frt->frt_requests) {
2311 			frt->frt_requests = 0;
2312 			break;
2313 		}
2314 	}
2315 
2316 done:
2317 	lck_mtx_unlock(&frt->frt_lock);
2318 	if (!(frt->frt_flags & FRT_TERMINATING)) {
2319 		frt->frt_flags &= ~FRT_RUNNING;
2320 		assert_wait(frt, THREAD_UNINT);
2321 		thread_block_parameter(fsw_rps_thread_cont, frt);
2322 		__builtin_unreachable();
2323 	} else {
2324 terminate:
2325 		LCK_MTX_ASSERT(&frt->frt_lock, LCK_MTX_ASSERT_OWNED);
2326 		frt->frt_flags &= ~(FRT_RUNNING | FRT_TERMINATING);
2327 		frt->frt_flags |= FRT_TERMINATED;
2328 
2329 		if (frt->frt_flags & FRT_TERMINATEBLOCK) {
2330 			thread_wakeup((caddr_t)&frt);
2331 		}
2332 		lck_mtx_unlock(&frt->frt_lock);
2333 
2334 		SK_D("fsw_rx_%s_%d terminated", if_name(fsw->fsw_ifp),
2335 		    frt->frt_idx);
2336 
2337 		/* for the extra refcnt from kernel_thread_start() */
2338 		thread_deallocate(current_thread());
2339 		/* this is the end */
2340 		thread_terminate(current_thread());
2341 		/* NOTREACHED */
2342 		__builtin_unreachable();
2343 	}
2344 
2345 	/* must never get here */
2346 	VERIFY(0);
2347 	/* NOTREACHED */
2348 	__builtin_unreachable();
2349 }
2350 
/*
 * Entry point of an RPS worker thread: names the thread after the
 * interface and worker index, marks it as a sync-Rx net thread, then
 * parks in fsw_rps_thread_cont() which never returns here.
 */
__attribute__((noreturn))
static void
fsw_rps_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	struct fsw_rps_thread *__single frt = v;
	struct nx_flowswitch *fsw = frt->frt_fsw;
	const char *__null_terminated tname = NULL;

	char thread_name[MAXTHREADNAMESIZE];
	bzero(thread_name, sizeof(thread_name));
	tname = tsnprintf(thread_name, sizeof(thread_name), "fsw_rx_%s_%d",
	    if_name(fsw->fsw_ifp), frt->frt_idx);

	thread_set_thread_name(frt->frt_thread, tname);
	SK_D("%s spawned", tname);

	net_thread_marks_push(NET_THREAD_SYNC_RX);
	/* block on frt; all further work happens in the continuation */
	assert_wait(frt, THREAD_UNINT);
	(void) thread_block_parameter(fsw_rps_thread_cont, frt);

	__builtin_unreachable();
}
2374 
/*
 * Request termination of RPS worker i and wait until it reports
 * FRT_TERMINATED.  The wait uses a deadline: 1ms the first time, then a
 * long (1000s) backoff on subsequent iterations, waking the worker each
 * pass in case it is parked.
 */
static void
fsw_rps_thread_join(struct nx_flowswitch *fsw, uint32_t i)
{
	struct fsw_rps_thread *frt = &fsw->fsw_rps_threads[i];
	uint64_t f = (1 * NSEC_PER_MSEC);       /* first-pass deadline */
	uint64_t s = (1000 * NSEC_PER_SEC);     /* subsequent-pass deadline */
	uint32_t c = 0;

	lck_mtx_lock(&frt->frt_lock);
	frt->frt_flags |= FRT_TERMINATING;

	while (!(frt->frt_flags & FRT_TERMINATED)) {
		uint64_t t = 0;
		nanoseconds_to_absolutetime((c++ == 0) ? f : s, &t);
		clock_absolutetime_interval_to_deadline(t, &t);
		ASSERT(t != 0);

		/* tell the worker someone is blocked waiting for it */
		frt->frt_flags |= FRT_TERMINATEBLOCK;
		if (!(frt->frt_flags & FRT_RUNNING)) {
			thread_wakeup_one((caddr_t)frt);
		}
		(void) assert_wait_deadline(&frt->frt_thread, THREAD_UNINT, t);
		lck_mtx_unlock(&frt->frt_lock);
		thread_block(THREAD_CONTINUE_NULL);
		lck_mtx_lock(&frt->frt_lock);
		frt->frt_flags &= ~FRT_TERMINATEBLOCK;
	}
	ASSERT(frt->frt_flags & FRT_TERMINATED);
	lck_mtx_unlock(&frt->frt_lock);
	frt->frt_thread = THREAD_NULL;
}
2406 
2407 static void
fsw_rps_thread_spawn(struct nx_flowswitch * fsw,uint32_t i)2408 fsw_rps_thread_spawn(struct nx_flowswitch *fsw, uint32_t i)
2409 {
2410 	kern_return_t error;
2411 	struct fsw_rps_thread *frt = &fsw->fsw_rps_threads[i];
2412 
2413 	lck_mtx_init(&frt->frt_lock, &nexus_lock_group, &nexus_lock_attr);
2414 	frt->frt_idx = i;
2415 	frt->frt_fsw = fsw;
2416 	error = kernel_thread_start(fsw_rps_thread_func, frt, &frt->frt_thread);
2417 	ASSERT(!error);
2418 	KPKTQ_INIT(&frt->frt_pktq);
2419 }
2420 
/*
 * Resize the RPS worker pool to n threads (DEVELOPMENT/DEBUG only).
 *
 * Shrinking joins the surplus workers before shrinking the array;
 * growing reallocates first, then spawns the new workers.  Runs under
 * the flowswitch write lock so no Rx path observes a partial resize.
 * Returns EINVAL if n exceeds FSW_RPS_MAX_NTHREADS.
 */
int
fsw_rps_set_nthreads(struct nx_flowswitch* fsw, uint32_t n)
{
	if (n > FSW_RPS_MAX_NTHREADS) {
		SK_ERR("rps nthreads %d, max %d", n, FSW_RPS_MAX_NTHREADS);
		return EINVAL;
	}

	FSW_WLOCK(fsw);
	if (n < fsw->fsw_rps_nthreads) {
		/* join doomed workers while their state is still valid */
		for (uint32_t i = n; i < fsw->fsw_rps_nthreads; i++) {
			fsw_rps_thread_join(fsw, i);
		}
		fsw->fsw_rps_threads = krealloc_type(struct fsw_rps_thread,
		    fsw->fsw_rps_nthreads, n, fsw->fsw_rps_threads, Z_WAITOK | Z_ZERO | Z_NOFAIL);
		fsw->fsw_rps_nthreads = n;
	} else if (n > fsw->fsw_rps_nthreads) {
		uint32_t nthreads_old = fsw->fsw_rps_nthreads;

		/* grow the array first so new workers see their slots */
		fsw->fsw_rps_threads = krealloc_type(struct fsw_rps_thread,
		    fsw->fsw_rps_nthreads, n, fsw->fsw_rps_threads, Z_WAITOK | Z_ZERO | Z_NOFAIL);
		fsw->fsw_rps_nthreads = n;
		for (uint32_t i = nthreads_old; i < n; i++) {
			fsw_rps_thread_spawn(fsw, i);
		}
	}
	FSW_WUNLOCK(fsw);
	return 0;
}
2450 
2451 static uint32_t
get_rps_id(struct nx_flowswitch * fsw,struct __kern_packet * pkt)2452 get_rps_id(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
2453 {
2454 	sa_family_t af = fsw->fsw_demux(fsw, pkt);
2455 	if (__improbable(af == AF_UNSPEC)) {
2456 		return 0;
2457 	}
2458 
2459 	flow_pkt_classify(pkt, fsw->fsw_ifp, af, true);
2460 
2461 	if (__improbable((pkt->pkt_qum_qflags &
2462 	    QUM_F_FLOW_CLASSIFIED) == 0)) {
2463 		return 0;
2464 	}
2465 
2466 	struct flow_key key;
2467 	flow_pkt2key(pkt, true, &key);
2468 	key.fk_mask = FKMASK_5TUPLE;
2469 
2470 	uint32_t id = flow_key_hash(&key) % fsw->fsw_rps_nthreads;
2471 
2472 	return id;
2473 }
2474 
2475 #endif /* !DEVELOPMENT && !DEBUG */
2476 
/*
 * Public Rx entry point.  On DEVELOPMENT/DEBUG kernels with RPS enabled
 * (fsw_rps_nthreads != 0), packets are distributed to per-flow worker
 * threads by 5-tuple hash and the workers are woken; otherwise packets
 * are processed inline via _fsw_receive().
 */
void
fsw_receive(struct nx_flowswitch *fsw, struct pktq *pktq)
{
#if (DEVELOPMENT || DEBUG)
	FSW_RLOCK(fsw);
	if (fsw->fsw_rps_nthreads != 0) {
		struct __kern_packet *pkt, *tpkt;
		bitmap_t map = 0;

		/* one bitmap word must cover every possible worker id */
		static_assert(BITMAP_LEN(FSW_RPS_MAX_NTHREADS) == 1);
		KPKTQ_FOREACH_SAFE(pkt, pktq, tpkt) {
			uint32_t id = get_rps_id(fsw, pkt);
			KPKTQ_REMOVE(pktq, pkt);
			fsw_rps_rx(fsw, id, pkt);
			bitmap_set(&map, id);
		}
		/* wake each worker that received at least one packet */
		for (int i = bitmap_first(&map, 64); i >= 0;
		    i = bitmap_next(&map, i)) {
			fsw_rps_thread_schedule(fsw, i);
		}
		FSW_RUNLOCK(fsw);
	} else
#endif /* !DEVELOPMENT && !DEBUG */
	{
#if (DEVELOPMENT || DEBUG)
		FSW_RUNLOCK(fsw);
#endif /* !DEVELOPMENT && !DEBUG */
		_fsw_receive(fsw, pktq);
	}
}
2507 
2508 int
fsw_dev_input_netem_dequeue(void * handle,pktsched_pkt_t * __counted_by (n_pkts)pkts,uint32_t n_pkts)2509 fsw_dev_input_netem_dequeue(void *handle,
2510     pktsched_pkt_t *__counted_by(n_pkts)pkts, uint32_t n_pkts)
2511 {
2512 #pragma unused(handle)
2513 	struct nx_flowswitch *__single fsw = handle;
2514 	struct __kern_packet *kpkts[FSW_VP_DEV_BATCH_MAX];
2515 	struct pktq pktq;
2516 	sk_protect_t protect;
2517 	uint32_t i;
2518 
2519 	ASSERT(n_pkts <= FSW_VP_DEV_BATCH_MAX);
2520 
2521 	for (i = 0; i < n_pkts; i++) {
2522 		ASSERT(pkts[i].pktsched_ptype == QP_PACKET);
2523 		ASSERT(pkts[i].pktsched_pkt_kpkt != NULL);
2524 		kpkts[i] = pkts[i].pktsched_pkt_kpkt;
2525 	}
2526 
2527 	protect = sk_sync_protect();
2528 	KPKTQ_INIT(&pktq);
2529 	pkts_to_pktq(kpkts, n_pkts, &pktq);
2530 
2531 	fsw_receive(fsw, &pktq);
2532 	KPKTQ_FINI(&pktq);
2533 	sk_sync_unprotect(protect);
2534 
2535 	return 0;
2536 }
2537 
2538 static void
fsw_dev_input_netem_enqueue(struct nx_flowswitch * fsw,struct pktq * q)2539 fsw_dev_input_netem_enqueue(struct nx_flowswitch *fsw, struct pktq *q)
2540 {
2541 	classq_pkt_t p;
2542 	struct netem *__single ne;
2543 	struct __kern_packet *pkt, *tpkt;
2544 
2545 	ASSERT(fsw->fsw_ifp != NULL);
2546 	ne = fsw->fsw_ifp->if_input_netem;
2547 	ASSERT(ne != NULL);
2548 	KPKTQ_FOREACH_SAFE(pkt, q, tpkt) {
2549 		bool pdrop;
2550 		KPKTQ_REMOVE(q, pkt);
2551 		CLASSQ_PKT_INIT_PACKET(&p, pkt);
2552 		netem_enqueue(ne, &p, &pdrop);
2553 	}
2554 }
2555 
2556 void
fsw_devna_rx(struct nexus_adapter * devna,struct __kern_packet * pkt_head,struct nexus_pkt_stats * out_stats)2557 fsw_devna_rx(struct nexus_adapter *devna, struct __kern_packet *pkt_head,
2558     struct nexus_pkt_stats *out_stats)
2559 {
2560 	struct __kern_packet *pkt = pkt_head, *next;
2561 	struct nx_flowswitch *fsw;
2562 	uint32_t n_bytes = 0, n_pkts = 0;
2563 	uint64_t total_pkts = 0, total_bytes = 0;
2564 	struct pktq q;
2565 
2566 	KPKTQ_INIT(&q);
2567 	if (__improbable(devna->na_ifp == NULL ||
2568 	    (fsw = fsw_ifp_to_fsw(devna->na_ifp)) == NULL)) {
2569 		SK_ERR("fsw not attached, dropping %d pkts", KPKTQ_LEN(&q));
2570 		dp_drop_pkt_chain(pkt_head, 0, DROP_REASON_FSW_QUIESCED, DROPTAP_FLAG_L2_MISSING);
2571 		return;
2572 	}
2573 	while (pkt != NULL) {
2574 		if (__improbable(pkt->pkt_trace_id != 0)) {
2575 			KDBG(SK_KTRACE_PKT_RX_DRV | DBG_FUNC_END, pkt->pkt_trace_id);
2576 			KDBG(SK_KTRACE_PKT_RX_FSW | DBG_FUNC_START, pkt->pkt_trace_id);
2577 		}
2578 		next = pkt->pkt_nextpkt;
2579 		pkt->pkt_nextpkt = NULL;
2580 
2581 		if (__probable((pkt->pkt_qum_qflags & QUM_F_DROPPED) == 0)) {
2582 			KPKTQ_ENQUEUE(&q, pkt);
2583 			n_bytes += pkt->pkt_length;
2584 		} else {
2585 			DTRACE_SKYWALK1(non__finalized__drop,
2586 			    struct __kern_packet *, pkt);
2587 			FSW_STATS_INC(FSW_STATS_RX_PKT_NOT_FINALIZED);
2588 			dp_drop_pkt_single(fsw, pkt, 0,
2589 			    DROP_REASON_FSW_RX_PKT_NOT_FINALIZED,
2590 			    DROPTAP_FLAG_L2_MISSING);
2591 			pkt = NULL;
2592 		}
2593 		n_pkts = KPKTQ_LEN(&q);
2594 		if (n_pkts == fsw_rx_batch || (next == NULL && n_pkts > 0)) {
2595 			if (__improbable(fsw->fsw_ifp->if_input_netem != NULL)) {
2596 				fsw_dev_input_netem_enqueue(fsw, &q);
2597 			} else {
2598 				fsw_receive(fsw, &q);
2599 			}
2600 			total_pkts += n_pkts;
2601 			total_bytes += n_bytes;
2602 			n_pkts = 0;
2603 			n_bytes = 0;
2604 			KPKTQ_FINI(&q);
2605 		}
2606 		pkt = next;
2607 	}
2608 	ASSERT(KPKTQ_LEN(&q) == 0);
2609 	FSW_STATS_ADD(FSW_STATS_RX_PACKETS, total_pkts);
2610 	if (out_stats != NULL) {
2611 		out_stats->nps_pkts += total_pkts;
2612 		out_stats->nps_bytes += total_bytes;
2613 	}
2614 	KDBG(SK_KTRACE_FSW_DEV_RING_FLUSH, SK_KVA(devna), total_pkts, total_bytes);
2615 }
2616 
2617 static int
dp_copy_to_dev_mbuf(struct nx_flowswitch * fsw,struct __kern_packet * spkt,struct __kern_packet * dpkt)2618 dp_copy_to_dev_mbuf(struct nx_flowswitch *fsw, struct __kern_packet *spkt,
2619     struct __kern_packet *dpkt)
2620 {
2621 	struct mbuf *__single m = NULL;
2622 	uint32_t bdlen, bdlim, bdoff;
2623 	uint8_t *bdaddr;
2624 	unsigned int one = 1;
2625 	int err = 0;
2626 
2627 	err = mbuf_allocpacket(MBUF_DONTWAIT,
2628 	    (fsw->fsw_frame_headroom + spkt->pkt_length), &one, &m);
2629 #if (DEVELOPMENT || DEBUG)
2630 	if (m != NULL) {
2631 		_FSW_INJECT_ERROR(11, m, NULL, m_freem, m);
2632 	}
2633 #endif /* DEVELOPMENT || DEBUG */
2634 	if (__improbable(m == NULL)) {
2635 		FSW_STATS_INC(FSW_STATS_DROP_NOMEM_MBUF);
2636 		err = ENOBUFS;
2637 		goto done;
2638 	}
2639 
2640 	MD_BUFLET_ADDR_ABS_DLEN(dpkt, bdaddr, bdlen, bdlim, bdoff);
2641 	if (fsw->fsw_frame_headroom > bdlim) {
2642 		SK_ERR("not enough space in buffer for headroom");
2643 		err = EINVAL;
2644 		goto done;
2645 	}
2646 
2647 	dpkt->pkt_headroom = fsw->fsw_frame_headroom;
2648 	dpkt->pkt_mbuf = m;
2649 	dpkt->pkt_pflags |= PKT_F_MBUF_DATA;
2650 
2651 	/* packet copy into mbuf */
2652 	fsw->fsw_pkt_copy_to_mbuf(NR_TX, SK_PTR_ENCODE(spkt,
2653 	    METADATA_TYPE(spkt), METADATA_SUBTYPE(spkt)), 0, m,
2654 	    fsw->fsw_frame_headroom, spkt->pkt_length,
2655 	    PACKET_HAS_PARTIAL_CHECKSUM(spkt),
2656 	    spkt->pkt_csum_tx_start_off);
2657 	FSW_STATS_INC(FSW_STATS_TX_COPY_PKT2MBUF);
2658 
2659 	/* header copy into dpkt buffer for classification */
2660 	kern_packet_t sph = SK_PTR_ENCODE(spkt,
2661 	    METADATA_TYPE(spkt), METADATA_SUBTYPE(spkt));
2662 	kern_packet_t dph = SK_PTR_ENCODE(dpkt,
2663 	    METADATA_TYPE(dpkt), METADATA_SUBTYPE(dpkt));
2664 	uint32_t copy_len = MIN(spkt->pkt_length, bdlim - dpkt->pkt_headroom);
2665 	fsw->fsw_pkt_copy_from_pkt(NR_TX, dph, dpkt->pkt_headroom,
2666 	    sph, spkt->pkt_headroom, copy_len, FALSE, 0, 0, 0);
2667 	if (copy_len < spkt->pkt_length) {
2668 		dpkt->pkt_pflags |= PKT_F_TRUNCATED;
2669 	}
2670 
2671 	/*
2672 	 * fsw->fsw_frame_headroom is after m_data, thus we treat m_data same as
2673 	 * buflet baddr m_data always points to the beginning of packet and
2674 	 * should represents the same as baddr + headroom
2675 	 */
2676 	ASSERT((uintptr_t)m->m_data ==
2677 	    ((uintptr_t)mbuf_datastart(m) + fsw->fsw_frame_headroom));
2678 
2679 done:
2680 	return err;
2681 }
2682 
2683 static int
dp_copy_to_dev_pkt(struct nx_flowswitch * fsw,struct __kern_packet * spkt,struct __kern_packet * dpkt)2684 dp_copy_to_dev_pkt(struct nx_flowswitch *fsw, struct __kern_packet *spkt,
2685     struct __kern_packet *dpkt)
2686 {
2687 	struct ifnet *ifp = fsw->fsw_ifp;
2688 	uint16_t headroom = fsw->fsw_frame_headroom + ifp->if_tx_headroom;
2689 
2690 	if (headroom > UINT8_MAX) {
2691 		SK_ERR("headroom too large %d", headroom);
2692 		return ERANGE;
2693 	}
2694 	dpkt->pkt_headroom = (uint8_t)headroom;
2695 	ASSERT((dpkt->pkt_headroom & 0x7) == 0);
2696 	dpkt->pkt_l2_len = 0;
2697 	dpkt->pkt_link_flags = spkt->pkt_link_flags;
2698 
2699 	kern_packet_t sph = SK_PTR_ENCODE(spkt,
2700 	    METADATA_TYPE(spkt), METADATA_SUBTYPE(spkt));
2701 	kern_packet_t dph = SK_PTR_ENCODE(dpkt,
2702 	    METADATA_TYPE(dpkt), METADATA_SUBTYPE(dpkt));
2703 	fsw->fsw_pkt_copy_from_pkt(NR_TX, dph,
2704 	    dpkt->pkt_headroom, sph, spkt->pkt_headroom,
2705 	    spkt->pkt_length, PACKET_HAS_PARTIAL_CHECKSUM(spkt),
2706 	    (spkt->pkt_csum_tx_start_off - spkt->pkt_headroom),
2707 	    (spkt->pkt_csum_tx_stuff_off - spkt->pkt_headroom),
2708 	    (spkt->pkt_csum_flags & PACKET_CSUM_ZERO_INVERT));
2709 
2710 	FSW_STATS_INC(FSW_STATS_TX_COPY_PKT2PKT);
2711 
2712 	return 0;
2713 }
2714 
2715 #if SK_LOG
/*
 * Log the outcome of dp_copy_to_dev() at SK_VERB_FSW_DP|SK_VERB_TX.
 * Hoisted out of line to reduce kernel stack footprint.
 */
SK_LOG_ATTRIBUTE
static void
dp_copy_to_dev_log(struct nx_flowswitch *fsw, const struct kern_pbufpool *pp,
    struct __kern_packet *spkt, struct __kern_packet *dpkt, int error)
{
	struct proc *p = current_proc();
	struct ifnet *ifp = fsw->fsw_ifp;
	uint64_t logflags = (SK_VERB_FSW_DP | SK_VERB_TX);

	if (error == ERANGE) {
		/* packet exceeded the device pool's maximum buffer span */
		SK_ERR("packet too long, hr(fr+tx)+slen (%u+%u)+%u > "
		    "dev_pp_max %u", (uint32_t)fsw->fsw_frame_headroom,
		    (uint32_t)ifp->if_tx_headroom, spkt->pkt_length,
		    (uint32_t)pp->pp_max_frags * PP_BUF_SIZE_DEF(pp));
	} else if (error == ENOBUFS) {
		/* allocation failure in the copy path */
		SK_DF(logflags, "%s(%d) packet allocation failure",
		    sk_proc_name(p), sk_proc_pid(p));
	} else if (error == 0) {
		/* success: log lengths/headrooms and dump the copied bytes */
		ASSERT(dpkt != NULL);
		char *daddr;
		uint32_t pkt_len;

		MD_BUFLET_ADDR_ABS(dpkt, daddr);
		pkt_len = __packet_get_real_data_length(dpkt);
		SK_DF(logflags, "%s(%d) splen %u dplen %u hr %u (fr/tx %u/%u)",
		    sk_proc_name(p), sk_proc_pid(p), spkt->pkt_length,
		    dpkt->pkt_length, (uint32_t)dpkt->pkt_headroom,
		    (uint32_t)fsw->fsw_frame_headroom,
		    (uint32_t)ifp->if_tx_headroom);
		SK_DF(logflags | SK_VERB_DUMP, "%s",
		    sk_dump("buf", daddr, pkt_len, 128));
	} else {
		/* any other errno: just report it */
		SK_DF(logflags, "%s(%d) error %d", sk_proc_name(p),
		    sk_proc_pid(p), error);
	}
}
2753 #else
2754 #define dp_copy_to_dev_log(...)
2755 #endif /* SK_LOG */
2756 
/*
 * Copy packet-level and AQM-related metadata (qum, pkt fields, tx port
 * data, flow source/policy identifiers) from spkt to dpkt.  Buffer
 * contents are not copied here.  spkt must not be carrying an mbuf or
 * acting as a packet shadow (asserted below).
 */
static void
fsw_pkt_copy_metadata(struct __kern_packet *spkt, struct __kern_packet *dpkt)
{
	ASSERT(!(spkt->pkt_pflags & PKT_F_MBUF_MASK));
	ASSERT(!(spkt->pkt_pflags & PKT_F_PKT_MASK));

	/* warm the destination buffer before the copies below */
	SK_PREFETCHW(dpkt->pkt_qum_buf.buf_addr, 0);
	/* Copy packet metadata */
	_QUM_COPY(&(spkt)->pkt_qum, &(dpkt)->pkt_qum);
	_PKT_COPY(spkt, dpkt);
	_PKT_COPY_TX_PORT_DATA(spkt, dpkt);
	ASSERT((dpkt->pkt_qum.qum_qflags & QUM_F_KERNEL_ONLY) ||
	    !PP_KERNEL_ONLY(dpkt->pkt_qum.qum_pp));
	ASSERT(dpkt->pkt_mbuf == NULL);

	/* Copy AQM metadata */
	dpkt->pkt_flowsrc_type = spkt->pkt_flowsrc_type;
	dpkt->pkt_flowsrc_fidx = spkt->pkt_flowsrc_fidx;
	static_assert((offsetof(struct __flow, flow_src_id) % 8) == 0);
	_UUID_COPY(dpkt->pkt_flowsrc_id, spkt->pkt_flowsrc_id);
	_UUID_COPY(dpkt->pkt_policy_euuid, spkt->pkt_policy_euuid);
	dpkt->pkt_policy_id = spkt->pkt_policy_id;
	dpkt->pkt_skip_policy_id = spkt->pkt_skip_policy_id;
}
2781 
2782 static int
dp_copy_to_dev(struct nx_flowswitch * fsw,struct __kern_packet * spkt,struct __kern_packet * dpkt)2783 dp_copy_to_dev(struct nx_flowswitch *fsw, struct __kern_packet *spkt,
2784     struct __kern_packet *dpkt)
2785 {
2786 	const struct kern_pbufpool *pp = dpkt->pkt_qum.qum_pp;
2787 	struct ifnet *ifp = fsw->fsw_ifp;
2788 	uint32_t dev_pkt_len;
2789 	int err = 0;
2790 
2791 	fsw_pkt_copy_metadata(spkt, dpkt);
2792 	switch (fsw->fsw_classq_enq_ptype) {
2793 	case QP_MBUF:
2794 		err = dp_copy_to_dev_mbuf(fsw, spkt, dpkt);
2795 		break;
2796 
2797 	case QP_PACKET:
2798 		dev_pkt_len = fsw->fsw_frame_headroom + ifp->if_tx_headroom +
2799 		    spkt->pkt_length;
2800 		if (dev_pkt_len > pp->pp_max_frags * PP_BUF_SIZE_DEF(pp)) {
2801 			FSW_STATS_INC(FSW_STATS_TX_COPY_BAD_LEN);
2802 			err = ERANGE;
2803 			goto done;
2804 		}
2805 		err = dp_copy_to_dev_pkt(fsw, spkt, dpkt);
2806 		break;
2807 
2808 	default:
2809 		VERIFY(0);
2810 		__builtin_unreachable();
2811 	}
2812 done:
2813 	dp_copy_to_dev_log(fsw, pp, spkt, dpkt, err);
2814 	return err;
2815 }
2816 
2817 static int
dp_copy_headers_to_dev(struct nx_flowswitch * fsw,struct __kern_packet * spkt,struct __kern_packet * dpkt)2818 dp_copy_headers_to_dev(struct nx_flowswitch *fsw, struct __kern_packet *spkt,
2819     struct __kern_packet *dpkt)
2820 {
2821 	uint8_t *sbaddr, *dbaddr;
2822 	uint16_t headroom = fsw->fsw_frame_headroom + fsw->fsw_ifp->if_tx_headroom;
2823 	uint16_t hdrs_len_estimate = (uint16_t)MIN(spkt->pkt_length, 128);
2824 
2825 	fsw_pkt_copy_metadata(spkt, dpkt);
2826 
2827 	MD_BUFLET_ADDR_ABS(spkt, sbaddr);
2828 	ASSERT(sbaddr != NULL);
2829 	sbaddr += spkt->pkt_headroom;
2830 
2831 	MD_BUFLET_ADDR_ABS(dpkt, dbaddr);
2832 	ASSERT(dbaddr != NULL);
2833 	dpkt->pkt_headroom = (uint8_t)headroom;
2834 	dbaddr += headroom;
2835 
2836 	pkt_copy(sbaddr, dbaddr, hdrs_len_estimate);
2837 	METADATA_SET_LEN(dpkt, hdrs_len_estimate, headroom);
2838 
2839 	/* packet length is set to the full length */
2840 	dpkt->pkt_length = spkt->pkt_length;
2841 	dpkt->pkt_pflags |= PKT_F_TRUNCATED;
2842 	return 0;
2843 }
2844 
/*
 * Detach the mbuf carried by pkt (PKT_F_MBUF_DATA must be set),
 * transfer flow/AQM metadata from the packet into the mbuf pkthdr,
 * free the packet, and return the mbuf.  Ownership of the returned
 * mbuf passes to the caller.
 */
static struct mbuf *
convert_pkt_to_mbuf(struct __kern_packet *pkt)
{
	ASSERT(pkt->pkt_pflags & PKT_F_MBUF_DATA);
	ASSERT(pkt->pkt_mbuf != NULL);
	struct mbuf *m = pkt->pkt_mbuf;

	/* pass additional metadata generated from flow parse/lookup */
	static_assert(sizeof(m->m_pkthdr.pkt_flowid) == sizeof(pkt->pkt_flow_token));
	static_assert(sizeof(m->m_pkthdr.pkt_mpriv_srcid) == sizeof(pkt->pkt_flowsrc_token));
	static_assert(sizeof(m->m_pkthdr.pkt_mpriv_fidx) == sizeof(pkt->pkt_flowsrc_fidx));
	m->m_pkthdr.pkt_svc = pkt->pkt_svc_class;
	m->m_pkthdr.pkt_proto = pkt->pkt_flow->flow_ip_proto;
	m->m_pkthdr.pkt_flowid = pkt->pkt_flow_token;
	m->m_pkthdr.comp_gencnt = pkt->pkt_comp_gencnt;
	m->m_pkthdr.pkt_flowsrc = pkt->pkt_flowsrc_type;
	m->m_pkthdr.pkt_mpriv_srcid = pkt->pkt_flowsrc_token;
	m->m_pkthdr.pkt_mpriv_fidx = pkt->pkt_flowsrc_fidx;

	if (pkt->pkt_transport_protocol == IPPROTO_QUIC) {
		m->m_pkthdr.pkt_ext_flags |= PKTF_EXT_QUIC;
	}

	/* The packet should have a timestamp by the time we get here. */
	m->m_pkthdr.pkt_timestamp = pkt->pkt_timestamp;
	m->m_pkthdr.pkt_flags &= ~PKTF_TS_VALID;

	/* carry over only the flag bits shared by both representations */
	m->m_pkthdr.pkt_flags &= ~PKT_F_COMMON_MASK;
	m->m_pkthdr.pkt_flags |= (pkt->pkt_pflags & PKT_F_COMMON_MASK);
	/* set pkt_hdr so that AQM can find IP header and mark ECN bits */
	m->m_pkthdr.pkt_hdr = m_mtod_current(m) + pkt->pkt_l2_len;

	if ((pkt->pkt_pflags & PKT_F_START_SEQ) != 0) {
		m->m_pkthdr.tx_start_seq = ntohl(pkt->pkt_flow_tcp_seq);
	}
	KPKT_CLEAR_MBUF_DATA(pkt);

	/* mbuf has been consumed, release packet as well */
	ASSERT(pkt->pkt_qum.qum_ksd == NULL);
	pp_free_packet_single(pkt);
	return m;
}
2887 
2888 static void
convert_pkt_to_mbuf_list(struct __kern_packet * pkt_list,struct mbuf ** head,struct mbuf ** tail,uint32_t * cnt,uint32_t * bytes)2889 convert_pkt_to_mbuf_list(struct __kern_packet *pkt_list,
2890     struct mbuf **head, struct mbuf **tail,
2891     uint32_t *cnt, uint32_t *bytes)
2892 {
2893 	struct __kern_packet *pkt = pkt_list, *next;
2894 	struct mbuf *__single m_head = NULL, **__single m_tailp = &m_head;
2895 	struct mbuf *__single m = NULL;
2896 	uint32_t c = 0, b = 0;
2897 
2898 	while (pkt != NULL) {
2899 		next = pkt->pkt_nextpkt;
2900 		pkt->pkt_nextpkt = NULL;
2901 		m = convert_pkt_to_mbuf(pkt);
2902 		ASSERT(m != NULL);
2903 
2904 		*m_tailp = m;
2905 		m_tailp = &m->m_nextpkt;
2906 		c++;
2907 		b += m_pktlen(m);
2908 		pkt = next;
2909 	}
2910 	if (head != NULL) {
2911 		*head = m_head;
2912 	}
2913 	if (tail != NULL) {
2914 		*tail = m;
2915 	}
2916 	if (cnt != NULL) {
2917 		*cnt = c;
2918 	}
2919 	if (bytes != NULL) {
2920 		*bytes = b;
2921 	}
2922 }
2923 
/*
 * Enqueue a single finalized packet onto the interface classq (AQM).
 * The packet is always consumed: converted to an mbuf for compat
 * interfaces, or handed off as-is for native ones.  Returns the
 * ifnet_enqueue_* error code.
 */
SK_NO_INLINE_ATTRIBUTE
static int
classq_enqueue_flow_single(struct nx_flowswitch *fsw,
    struct __kern_packet *pkt)
{
	struct ifnet *ifp = fsw->fsw_ifp;
	boolean_t pkt_drop = FALSE;
	int err;

	FSW_LOCK_ASSERT_HELD(fsw);
	ASSERT(fsw->fsw_classq_enabled);
	ASSERT(pkt->pkt_flow_token != 0);
	fsw_ifp_inc_traffic_class_out_pkt(ifp, pkt->pkt_svc_class,
	    1, pkt->pkt_length);

	/* trace: leaving the flowswitch stage, entering AQM */
	if (__improbable(pkt->pkt_trace_id != 0)) {
		KDBG(SK_KTRACE_PKT_TX_FSW | DBG_FUNC_END, pkt->pkt_trace_id);
		KDBG(SK_KTRACE_PKT_TX_AQM | DBG_FUNC_START, pkt->pkt_trace_id);
	}

	switch (fsw->fsw_classq_enq_ptype) {
	case QP_MBUF: {                         /* compat interface */
		struct mbuf *m;

		m = convert_pkt_to_mbuf(pkt);
		ASSERT(m != NULL);
		pkt = NULL;

		/* ifnet_enqueue consumes mbuf */
		err = ifnet_enqueue_mbuf(ifp, m, false, &pkt_drop);
		m = NULL;
#if (DEVELOPMENT || DEBUG)
		if (__improbable(!pkt_drop)) {
			_FSW_INJECT_ERROR(14, pkt_drop, TRUE, null_func);
		}
#endif /* DEVELOPMENT || DEBUG */
		if (pkt_drop) {
			FSW_STATS_INC(FSW_STATS_DROP);
			FSW_STATS_INC(FSW_STATS_TX_AQM_DROP);
		}
		break;
	}
	case QP_PACKET: {                       /* native interface */
		/* ifnet_enqueue consumes packet */
		err = ifnet_enqueue_pkt(ifp, ifp->if_snd, pkt, false, &pkt_drop);
		pkt = NULL;
#if (DEVELOPMENT || DEBUG)
		if (__improbable(!pkt_drop)) {
			_FSW_INJECT_ERROR(14, pkt_drop, TRUE, null_func);
		}
#endif /* DEVELOPMENT || DEBUG */
		if (pkt_drop) {
			FSW_STATS_INC(FSW_STATS_DROP);
			FSW_STATS_INC(FSW_STATS_TX_AQM_DROP);
		}
		break;
	}
	default:
		err = EINVAL;
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	return err;
}
2990 
/*
 * Enqueue a whole chain of packets (pkt_head..pkt_tail, cnt packets,
 * bytes total) onto the interface classq in one call, converting the
 * chain to mbufs first on compat interfaces.  The chain is always
 * consumed.  Returns the ifnet_enqueue_*_chain error code.
 */
static int
classq_enqueue_flow_chain(struct nx_flowswitch *fsw,
    struct __kern_packet *pkt_head, struct __kern_packet *pkt_tail,
    uint32_t cnt, uint32_t bytes)
{
	struct ifnet *ifp = fsw->fsw_ifp;
	boolean_t pkt_drop = FALSE;
	uint32_t svc;
	int err;

	FSW_LOCK_ASSERT_HELD(fsw);
	ASSERT(fsw->fsw_classq_enabled);
	ASSERT(pkt_head->pkt_flow_token != 0);

	/*
	 * All packets in the flow should have the same svc.
	 */
	svc = pkt_head->pkt_svc_class;
	fsw_ifp_inc_traffic_class_out_pkt(ifp, svc, cnt, bytes);

	switch (fsw->fsw_classq_enq_ptype) {
	case QP_MBUF: {                         /* compat interface */
		struct mbuf *__single m_head = NULL, *__single m_tail = NULL;
		uint32_t c = 0, b = 0;

		convert_pkt_to_mbuf_list(pkt_head, &m_head, &m_tail, &c, &b);
		ASSERT(m_head != NULL && m_tail != NULL);
		ASSERT(c == cnt);
		ASSERT(b == bytes);
		pkt_head = NULL;

		/* ifnet_enqueue consumes mbuf */
		err = ifnet_enqueue_mbuf_chain(ifp, m_head, m_tail, cnt,
		    bytes, FALSE, &pkt_drop);
		m_head = NULL;
		m_tail = NULL;
#if (DEVELOPMENT || DEBUG)
		if (__improbable(!pkt_drop)) {
			_FSW_INJECT_ERROR(14, pkt_drop, TRUE, null_func);
		}
#endif /* DEVELOPMENT || DEBUG */
		if (pkt_drop) {
			/* the whole chain was dropped by AQM */
			STATS_ADD(&fsw->fsw_stats, FSW_STATS_DROP, cnt);
			STATS_ADD(&fsw->fsw_stats, FSW_STATS_TX_AQM_DROP,
			    cnt);
		}
		break;
	}
	case QP_PACKET: {                       /* native interface */
		/* ifnet_enqueue consumes packet */
		err = ifnet_enqueue_pkt_chain(ifp, ifp->if_snd, pkt_head, pkt_tail, cnt,
		    bytes, FALSE, &pkt_drop);
		pkt_head = NULL;
#if (DEVELOPMENT || DEBUG)
		if (__improbable(!pkt_drop)) {
			_FSW_INJECT_ERROR(14, pkt_drop, TRUE, null_func);
		}
#endif /* DEVELOPMENT || DEBUG */
		if (pkt_drop) {
			/* the whole chain was dropped by AQM */
			STATS_ADD(&fsw->fsw_stats, FSW_STATS_DROP, cnt);
			STATS_ADD(&fsw->fsw_stats, FSW_STATS_TX_AQM_DROP,
			    cnt);
		}
		break;
	}
	default:
		err = EINVAL;
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	return err;
}
3065 
3066 /*
3067  * This code path needs to be kept for interfaces without logical link support.
3068  */
3069 static void
classq_enqueue_flow(struct nx_flowswitch * fsw,struct flow_entry * fe,bool chain,uint32_t cnt,uint32_t bytes)3070 classq_enqueue_flow(struct nx_flowswitch *fsw, struct flow_entry *fe,
3071     bool chain, uint32_t cnt, uint32_t bytes)
3072 {
3073 	struct __kern_packet *pkt, *tail, *tpkt;
3074 	flowadv_idx_t flow_adv_idx;
3075 	bool flowadv_cap;
3076 	flowadv_token_t flow_adv_token;
3077 	int err;
3078 
3079 	SK_DF(SK_VERB_FSW_DP | SK_VERB_AQM, "%s classq enqueued %d pkts",
3080 	    if_name(fsw->fsw_ifp), KPKTQ_LEN(&fe->fe_tx_pktq));
3081 
3082 	if (chain) {
3083 		pkt = KPKTQ_FIRST(&fe->fe_tx_pktq);
3084 		tail = KPKTQ_LAST(&fe->fe_tx_pktq);
3085 		KPKTQ_INIT(&fe->fe_tx_pktq);
3086 		if (pkt == NULL) {
3087 			return;
3088 		}
3089 		flow_adv_idx = pkt->pkt_flowsrc_fidx;
3090 		flowadv_cap = ((pkt->pkt_pflags & PKT_F_FLOW_ADV) != 0);
3091 		flow_adv_token = pkt->pkt_flow_token;
3092 
3093 		err = classq_enqueue_flow_chain(fsw, pkt, tail, cnt, bytes);
3094 		DTRACE_SKYWALK3(chain__enqueue, uint32_t, cnt, uint32_t, bytes, int, err);
3095 	} else {
3096 		uint32_t c = 0, b = 0;
3097 
3098 		KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) {
3099 			KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt);
3100 
3101 			flow_adv_idx = pkt->pkt_flowsrc_fidx;
3102 			flowadv_cap = ((pkt->pkt_pflags & PKT_F_FLOW_ADV) != 0);
3103 			flow_adv_token = pkt->pkt_flow_token;
3104 
3105 			c++;
3106 			b += pkt->pkt_length;
3107 			err = classq_enqueue_flow_single(fsw, pkt);
3108 		}
3109 		ASSERT(c == cnt);
3110 		ASSERT(b == bytes);
3111 		DTRACE_SKYWALK3(non__chain__enqueue, uint32_t, cnt, uint32_t, bytes,
3112 		    int, err);
3113 	}
3114 }
3115 
3116 /*
3117  * Logical link code path
3118  */
3119 static void
classq_qset_enqueue_flow(struct nx_flowswitch * fsw,struct flow_entry * fe,bool chain,uint32_t cnt,uint32_t bytes)3120 classq_qset_enqueue_flow(struct nx_flowswitch *fsw, struct flow_entry *fe,
3121     bool chain, uint32_t cnt, uint32_t bytes)
3122 {
3123 	struct __kern_packet *pkt, *tail;
3124 	flowadv_idx_t flow_adv_idx;
3125 	bool flowadv_cap;
3126 	flowadv_token_t flow_adv_token;
3127 	uint32_t flowctl = 0, dropped = 0;
3128 	int err;
3129 
3130 	SK_DF(SK_VERB_FSW_DP | SK_VERB_AQM, "%s classq enqueued %d pkts",
3131 	    if_name(fsw->fsw_ifp), KPKTQ_LEN(&fe->fe_tx_pktq));
3132 
3133 	pkt = KPKTQ_FIRST(&fe->fe_tx_pktq);
3134 	tail = KPKTQ_LAST(&fe->fe_tx_pktq);
3135 	KPKTQ_INIT(&fe->fe_tx_pktq);
3136 	if (pkt == NULL) {
3137 		return;
3138 	}
3139 	flow_adv_idx = pkt->pkt_flowsrc_fidx;
3140 	flowadv_cap = ((pkt->pkt_pflags & PKT_F_FLOW_ADV) != 0);
3141 	flow_adv_token = pkt->pkt_flow_token;
3142 
3143 	err = netif_qset_enqueue(fe->fe_qset, chain, pkt, tail, cnt, bytes,
3144 	    &flowctl, &dropped);
3145 
3146 	if (__improbable(err != 0) && dropped > 0) {
3147 		STATS_ADD(&fsw->fsw_stats, FSW_STATS_DROP, dropped);
3148 		STATS_ADD(&fsw->fsw_stats, FSW_STATS_TX_AQM_DROP, dropped);
3149 	}
3150 }
3151 
3152 static void
tx_finalize_packet(struct nx_flowswitch * fsw,struct __kern_packet * pkt)3153 tx_finalize_packet(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
3154 {
3155 #pragma unused(fsw)
3156 	/* finalize here; no more changes to buflets after classq */
3157 	if (__probable(!(pkt->pkt_pflags & PKT_F_MBUF_DATA))) {
3158 		kern_packet_t ph = SK_PTR_ENCODE(pkt,
3159 		    METADATA_TYPE(pkt), METADATA_SUBTYPE(pkt));
3160 		int err = __packet_finalize(ph);
3161 		VERIFY(err == 0);
3162 	}
3163 }
3164 
/*
 * Validate and, if needed, resolve the flow route for fe's Tx queue,
 * then apply the framing callback to each remaining packet.  Packets
 * whose resolution is pending (EJUSTRETURN) or fails outright are
 * removed from the queue and freed/dropped here.  Returns false only
 * when the route itself is not usable (dp_flow_route_process failed).
 */
static bool
dp_flow_tx_route_process(struct nx_flowswitch *fsw, struct flow_entry *fe)
{
	struct flow_route *fr = fe->fe_route;
	int err;

	ASSERT(fr != NULL);

	if (__improbable(!dp_flow_route_process(fsw, fe))) {
		return false;
	}
	if (fe->fe_qset_select == FE_QSET_SELECT_DYNAMIC) {
		flow_qset_select_dynamic(fsw, fe, TRUE);
	}

	_FSW_INJECT_ERROR(35, fr->fr_flags, fr->fr_flags,
	    _fsw_error35_handler, 1, fr, NULL, NULL);
	_FSW_INJECT_ERROR(36, fr->fr_flags, fr->fr_flags,
	    _fsw_error36_handler, 1, fr, NULL);

	/*
	 * See if we need to resolve the flow route; note the test against
	 * fr_flags here is done without any lock for performance.  Thus
	 * it's possible that we race against the thread performing route
	 * event updates for a packet (which is OK).  In any case we should
	 * not have any assertion on fr_flags value(s) due to the lack of
	 * serialization.
	 */
	if (fr->fr_flags & FLOWRTF_RESOLVED) {
		goto frame;
	}

	struct __kern_packet *pkt, *tpkt;
	KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) {
		err = fsw->fsw_resolve(fsw, fr, pkt);
		_FSW_INJECT_ERROR_SET(35, _fsw_error35_handler, 2, fr, pkt, &err);
		_FSW_INJECT_ERROR_SET(36, _fsw_error36_handler, 2, fr, &err);
		/*
		 * If resolver returns EJUSTRETURN then we drop the pkt as the
		 * resolver should have converted the pkt into mbuf (or
		 * detached the attached mbuf from pkt) and added it to the
		 * llinfo queue. If we do have a cached llinfo, then proceed
		 * to using it even though it may be stale (very unlikely)
		 * while the resolution is in progress.
		 * Otherwise, any other error results in dropping pkt.
		 */
		if (err == EJUSTRETURN) {
			KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt);
			pp_free_packet_single(pkt);
			FSW_STATS_INC(FSW_STATS_TX_RESOLV_PENDING);
			continue;
		} else if (err != 0 && (fr->fr_flags & FLOWRTF_HAS_LLINFO)) {
			/* use existing llinfo */
			FSW_STATS_INC(FSW_STATS_TX_RESOLV_STALE);
		} else if (err != 0) {
			KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt);
			dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_TX_RESOLV_FAILED,
			    DROPTAP_FLAG_L2_MISSING);
			FSW_STATS_INC(FSW_STATS_TX_RESOLV_FAIL);
			continue;
		}
	}

frame:
	/* apply L2 framing to the surviving packets, if the ifp needs it */
	KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) {
		if (fsw->fsw_frame != NULL) {
			fsw->fsw_frame(fsw, fr, pkt);
		}
	}

	return true;
}
3237 
3238 static void
dp_listener_flow_tx_process(struct nx_flowswitch * fsw,struct flow_entry * fe)3239 dp_listener_flow_tx_process(struct nx_flowswitch *fsw, struct flow_entry *fe)
3240 {
3241 #pragma unused(fsw)
3242 	struct __kern_packet *pkt, *tpkt;
3243 	KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) {
3244 		KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt);
3245 		/* listener is only allowed TCP RST */
3246 		if (pkt->pkt_flow_ip_proto == IPPROTO_TCP &&
3247 		    (pkt->pkt_flow_tcp_flags & TH_RST) != 0) {
3248 			flow_track_abort_tcp(fe, NULL, pkt);
3249 		} else {
3250 			char *addr;
3251 
3252 			MD_BUFLET_ADDR_ABS(pkt, addr);
3253 			SK_ERR("listener flow sends non-RST packet %s",
3254 			    sk_dump(sk_proc_name(current_proc()),
3255 			    addr, __packet_get_real_data_length(pkt), 128));
3256 		}
3257 		pp_free_packet_single(pkt);
3258 	}
3259 }
3260 
3261 static void
fsw_update_timestamps(struct __kern_packet * pkt,volatile uint64_t * fg_ts,volatile uint64_t * rt_ts,ifnet_t ifp,uint64_t now)3262 fsw_update_timestamps(struct __kern_packet *pkt, volatile uint64_t *fg_ts,
3263     volatile uint64_t *rt_ts, ifnet_t ifp, uint64_t now)
3264 {
3265 	if (!(pkt->pkt_pflags & PKT_F_TS_VALID) || pkt->pkt_timestamp == 0) {
3266 		pkt->pkt_timestamp = now;
3267 	}
3268 	pkt->pkt_pflags &= ~PKT_F_TS_VALID;
3269 
3270 	/*
3271 	 * If the packet service class is not background,
3272 	 * update the timestamps on the interface, as well as
3273 	 * the ones in nexus-wide advisory to indicate recent
3274 	 * activity on a foreground flow.
3275 	 */
3276 	if (!(pkt->pkt_pflags & PKT_F_BACKGROUND)) {
3277 		ifp->if_fg_sendts = (uint32_t)net_uptime();
3278 		if (fg_ts != NULL) {
3279 			*fg_ts = net_uptime();
3280 		}
3281 	}
3282 	if (pkt->pkt_pflags & PKT_F_REALTIME) {
3283 		ifp->if_rt_sendts = (uint32_t)net_uptime();
3284 		if (rt_ts != NULL) {
3285 			*rt_ts = net_uptime();
3286 		}
3287 	}
3288 }
3289 
3290 static bool
fsw_chain_enqueue_enabled(struct nx_flowswitch * fsw)3291 fsw_chain_enqueue_enabled(struct nx_flowswitch *fsw)
3292 {
3293 	return fsw_chain_enqueue != 0 &&
3294 	       fsw->fsw_ifp->if_output_netem == NULL &&
3295 	       (fsw->fsw_ifp->if_eflags & IFEF_ENQUEUE_MULTI) == 0;
3296 }
3297 
3298 void
dp_flow_tx_process(struct nx_flowswitch * fsw,struct flow_entry * fe,uint32_t flags)3299 dp_flow_tx_process(struct nx_flowswitch *fsw, struct flow_entry *fe,
3300     uint32_t flags)
3301 {
3302 	struct pktq dropped_pkts;
3303 	bool chain, same_svc = true;
3304 	bool gso = ((flags & FLOW_PROC_FLAG_GSO) != 0);
3305 	uint32_t cnt = 0, bytes = 0;
3306 	volatile struct sk_nexusadv *nxadv = NULL;
3307 	volatile uint64_t *fg_ts = NULL;
3308 	volatile uint64_t *rt_ts = NULL;
3309 	uint8_t qset_idx = (fe->fe_qset != NULL) ? fe->fe_qset->nqs_idx : 0;
3310 	drop_reason_t reason = DROP_REASON_UNSPECIFIED;
3311 	uint16_t line = 0;
3312 	uint32_t svc = 0;
3313 	struct timespec now;
3314 	uint64_t now_nsec = 0;
3315 
3316 	KPKTQ_INIT(&dropped_pkts);
3317 	ASSERT(!KPKTQ_EMPTY(&fe->fe_tx_pktq));
3318 	if (__improbable(fe->fe_flags & FLOWENTF_LISTENER)) {
3319 		dp_listener_flow_tx_process(fsw, fe);
3320 		return;
3321 	}
3322 	if (__improbable(!dp_flow_tx_route_process(fsw, fe))) {
3323 		SK_RDERR(5, "Tx route bad");
3324 		FSW_STATS_ADD(FSW_STATS_TX_FLOW_NONVIABLE,
3325 		    KPKTQ_LEN(&fe->fe_tx_pktq));
3326 		KPKTQ_CONCAT(&dropped_pkts, &fe->fe_tx_pktq);
3327 		reason = DROP_REASON_FSW_FLOW_NONVIABLE;
3328 		line = __LINE__;
3329 		goto done;
3330 	}
3331 	chain = fsw_chain_enqueue_enabled(fsw) && KPKTQ_LEN(&fe->fe_tx_pktq) > 1;
3332 	if (chain) {
3333 		nanouptime(&now);
3334 		net_timernsec(&now, &now_nsec);
3335 		nxadv = fsw->fsw_nx->nx_adv.flowswitch_nxv_adv;
3336 		if (nxadv != NULL) {
3337 			fg_ts = &nxadv->nxadv_fg_sendts;
3338 			rt_ts = &nxadv->nxadv_rt_sendts;
3339 		}
3340 	}
3341 
3342 	struct __kern_packet *pkt, *tpkt;
3343 	KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) {
3344 		int err = 0;
3345 		if (svc == 0) {
3346 			svc = pkt->pkt_svc_class;
3347 		}
3348 
3349 		err = flow_pkt_track(fe, pkt, false);
3350 		if (__improbable(err != 0)) {
3351 			SK_RDERR(5, "flow_pkt_track failed (err %d)", err);
3352 			FSW_STATS_INC(FSW_STATS_TX_FLOW_TRACK_ERR);
3353 			KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt);
3354 			dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_FLOW_TRACK_ERR,
3355 			    DROPTAP_FLAG_L2_MISSING);
3356 			continue;
3357 		}
3358 		_UUID_COPY(pkt->pkt_policy_euuid, fe->fe_eproc_uuid);
3359 		pkt->pkt_transport_protocol = fe->fe_transport_protocol;
3360 
3361 		/* set AQM related values for outgoing packet */
3362 		if (fe->fe_adv_idx != FLOWADV_IDX_NONE) {
3363 			pkt->pkt_pflags |= PKT_F_FLOW_ADV;
3364 			pkt->pkt_flowsrc_type = FLOWSRC_CHANNEL;
3365 			pkt->pkt_flowsrc_fidx = fe->fe_adv_idx;
3366 		} else {
3367 			pkt->pkt_pflags &= ~PKT_F_FLOW_ADV;
3368 		}
3369 		_UUID_CLEAR(pkt->pkt_flow_id);
3370 		pkt->pkt_flow_token = fe->fe_flowid;
3371 		pkt->pkt_pflags |= PKT_F_FLOW_ID;
3372 		pkt->pkt_qset_idx = qset_idx;
3373 		pkt->pkt_policy_id = fe->fe_policy_id;
3374 		pkt->pkt_skip_policy_id = fe->fe_skip_policy_id;
3375 
3376 		/*
3377 		 * The same code is exercised per packet for the non-chain case
3378 		 * (see ifnet_enqueue_ifclassq()). It's replicated here to avoid
3379 		 * re-walking the chain later.
3380 		 */
3381 		if (chain && (gso || same_svc)) {
3382 			fsw_update_timestamps(pkt, fg_ts, rt_ts, fsw->fsw_ifp, now_nsec);
3383 		}
3384 		/* mark packet tos/svc_class */
3385 		fsw_qos_mark(fsw, fe, pkt);
3386 
3387 		tx_finalize_packet(fsw, pkt);
3388 		bytes += pkt->pkt_length;
3389 		cnt++;
3390 
3391 		same_svc = (same_svc && (svc == pkt->pkt_svc_class));
3392 		/*
3393 		 * we are using the first 4 bytes of flow_id as the AQM flow
3394 		 * identifier.
3395 		 */
3396 		ASSERT(!uuid_is_null(pkt->pkt_flow_id));
3397 
3398 		if (__improbable(pkt->pkt_trace_id != 0)) {
3399 			KDBG(SK_KTRACE_PKT_TX_FSW | DBG_FUNC_END, pkt->pkt_trace_id);
3400 			KDBG(SK_KTRACE_PKT_TX_AQM | DBG_FUNC_START, pkt->pkt_trace_id);
3401 		}
3402 	}
3403 
3404 	/* snoop after it's finalized */
3405 	if (__improbable(pktap_total_tap_count != 0)) {
3406 		fsw_snoop(fsw, fe, &fe->fe_tx_pktq, false);
3407 	}
3408 
3409 	chain = chain && (gso || same_svc);
3410 	if (fe->fe_qset != NULL) {
3411 		classq_qset_enqueue_flow(fsw, fe, chain, cnt, bytes);
3412 	} else {
3413 		classq_enqueue_flow(fsw, fe, chain, cnt, bytes);
3414 	}
3415 done:
3416 	dp_drop_pktq(fsw, &dropped_pkts, 1, reason, line, 0);
3417 }
3418 
3419 static struct flow_entry *
tx_process_continuous_ip_frag(struct nx_flowswitch * fsw,struct flow_entry * prev_fe,struct __kern_packet * pkt)3420 tx_process_continuous_ip_frag(struct nx_flowswitch *fsw,
3421     struct flow_entry *prev_fe, struct __kern_packet *pkt)
3422 {
3423 	ASSERT(!pkt->pkt_flow_ip_is_first_frag);
3424 
3425 	if (__improbable(pkt->pkt_flow_ip_frag_id == 0)) {
3426 		FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_ID);
3427 		SK_PERR(current_proc(), "invalid zero fragment id");
3428 		return NULL;
3429 	}
3430 
3431 	SK_PDF(SK_VERB_FSW_DP | SK_VERB_TX, current_proc(),
3432 	    "continuation frag, id %u", pkt->pkt_flow_ip_frag_id);
3433 	if (__improbable(prev_fe == NULL ||
3434 	    !prev_fe->fe_tx_is_cont_frag)) {
3435 		SK_PERR(current_proc(), "unexpected continuation frag %u",
3436 		    pkt->pkt_flow_ip_frag_id);
3437 		FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_CONT);
3438 		return NULL;
3439 	}
3440 	if (__improbable(pkt->pkt_flow_ip_frag_id !=
3441 	    prev_fe->fe_tx_frag_id)) {
3442 		FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_CONT);
3443 		SK_PERR(current_proc(), "wrong continuation frag id %u expecting %u",
3444 		    pkt->pkt_flow_ip_frag_id, prev_fe->fe_tx_frag_id);
3445 		return NULL;
3446 	}
3447 
3448 	return prev_fe;
3449 }
3450 
3451 static struct flow_entry *
tx_lookup_flow(struct nx_flowswitch * fsw,struct __kern_packet * pkt,struct flow_entry * prev_fe)3452 tx_lookup_flow(struct nx_flowswitch *fsw, struct __kern_packet *pkt,
3453     struct flow_entry *prev_fe)
3454 {
3455 	SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]);
3456 	struct flow_entry *__single fe;
3457 
3458 	fe = lookup_flow_with_pkt(fsw, pkt, false, prev_fe);
3459 	if (__improbable(fe == NULL)) {
3460 		goto done;
3461 	}
3462 
3463 	if (__improbable(fe->fe_flags & FLOWENTF_TORN_DOWN)) {
3464 		SK_RDERR(5, "Tx flow torn down %s",
3465 		    fe2str(fe, dbgbuf, sizeof(dbgbuf)));
3466 		FSW_STATS_INC(FSW_STATS_TX_FLOW_TORNDOWN);
3467 		flow_entry_release(&fe);
3468 		goto done;
3469 	}
3470 
3471 	if (__improbable(fe->fe_flags & FLOWENTF_AOP_OFFLOAD)) {
3472 		SK_RDERR(5, "Tx not allowed for this flow");
3473 		SK_RDERR(5, "Tx not allowed for this flow %s",
3474 		    fe2str(fe, dbgbuf, sizeof(dbgbuf)));
3475 		FSW_STATS_INC(FSW_STATS_TX_DISABLED);
3476 		flow_entry_release(&fe);
3477 		goto done;
3478 	}
3479 
3480 	_FSW_INJECT_ERROR(34, pkt->pkt_flow_id[0], fe->fe_uuid[0] + 1,
3481 	    null_func);
3482 
3483 	if (__improbable(!_UUID_MATCH(pkt->pkt_flow_id, fe->fe_uuid))) {
3484 		uuid_string_t flow_id_str, pkt_id_str;
3485 		sk_uuid_unparse(fe->fe_uuid, flow_id_str);
3486 		sk_uuid_unparse(pkt->pkt_flow_id, pkt_id_str);
3487 		SK_ERR("pkt flow id %s != flow id %s, %s", pkt_id_str,
3488 		    flow_id_str, fe2str(fe, dbgbuf, sizeof(dbgbuf)));
3489 		flow_entry_release(&fe);
3490 		FSW_STATS_INC(FSW_STATS_TX_FLOW_BAD_ID);
3491 	}
3492 
3493 done:
3494 	return fe;
3495 }
3496 
3497 static inline void
tx_flow_process(struct nx_flowswitch * fsw,struct flow_entry * fe,uint32_t flags)3498 tx_flow_process(struct nx_flowswitch *fsw, struct flow_entry *fe,
3499     uint32_t flags)
3500 {
3501 	ASSERT(!KPKTQ_EMPTY(&fe->fe_tx_pktq));
3502 	ASSERT(KPKTQ_LEN(&fe->fe_tx_pktq) != 0);
3503 
3504 	SK_DF(SK_VERB_FSW_DP | SK_VERB_TX, "TX %d pkts from fe %p port %d",
3505 	    KPKTQ_LEN(&fe->fe_tx_pktq), fe, fe->fe_nx_port);
3506 
3507 	/* flow related processing (default, agg, etc.) */
3508 	fe->fe_tx_process(fsw, fe, flags);
3509 
3510 	KPKTQ_FINI(&fe->fe_tx_pktq);
3511 }
3512 
#if SK_LOG
/*
 * Debug-only helper: dump up to 128 bytes of a packet's buffer to the
 * Skywalk log, tagged with the calling process and a caller-supplied
 * description.  Compiles to a no-op macro when SK_LOG is disabled.
 */
static void
dp_tx_log_pkt(uint64_t verb, char *desc, struct __kern_packet *pkt)
{
	char *pkt_buf;
	uint32_t pkt_len;

	/* absolute address of the packet's first buflet */
	MD_BUFLET_ADDR_ABS(pkt, pkt_buf);
	pkt_len = __packet_get_real_data_length(pkt);
	SK_DF(verb, "%s(%d) %s %s", sk_proc_name(current_proc()),
	    sk_proc_pid(current_proc()), desc, sk_dump("buf", pkt_buf, pkt_len,
	    128));
}
#else /* !SK_LOG */
#define dp_tx_log_pkt(...)
#endif /* !SK_LOG */
3529 
3530 static inline struct ifnet *
fsw_datamov_begin(struct nx_flowswitch * fsw)3531 fsw_datamov_begin(struct nx_flowswitch *fsw)
3532 {
3533 	struct ifnet *ifp;
3534 
3535 	ifp = fsw->fsw_ifp;
3536 	if (!ifnet_datamov_begin(ifp)) {
3537 		DTRACE_SKYWALK1(ifnet__detached, struct ifnet *, ifp);
3538 		return NULL;
3539 	}
3540 	return ifp;
3541 }
3542 
3543 static inline void
fsw_datamov_end(struct nx_flowswitch * fsw)3544 fsw_datamov_end(struct nx_flowswitch *fsw)
3545 {
3546 	ifnet_datamov_end(fsw->fsw_ifp);
3547 }
3548 
/*
 * Transmit datapath for a batch of source packets (spktq) coming from a
 * user channel ring.  Each source packet is copied into a packet from
 * the device pool, demuxed, classified, matched to a flow entry, batched
 * per flow, then handed to the per-flow Tx processor; finally the device
 * is kicked via netif_transmit().  Source packets are NOT freed here —
 * the caller (fsw_user_ring_flush) frees spktq afterwards.
 */
static void
dp_tx_pktq(struct nx_flowswitch *fsw, struct pktq *spktq)
{
	struct __kern_packet *spkt, *pkt;
	struct flow_entry_list fes = TAILQ_HEAD_INITIALIZER(fes);
	struct flow_entry *__single fe, *__single prev_fe;
	struct pktq dropped_pkts, dpktq;
	struct nexus_adapter *dev_na;
	struct kern_pbufpool *dev_pp;
	struct ifnet *ifp = NULL;
	sa_family_t af;
	uint32_t n_pkts, n_flows = 0;
	boolean_t do_pacing = FALSE;
	/* drop accounting for the early-bailout paths below */
	drop_reason_t reason = DROP_REASON_UNSPECIFIED;
	uint16_t line = 0;

	int err;
	KPKTQ_INIT(&dpktq);
	KPKTQ_INIT(&dropped_pkts);
	n_pkts = KPKTQ_LEN(spktq);

	FSW_RLOCK(fsw);
	/* bail out early if the flowswitch is being torn down */
	if (__improbable(FSW_QUIESCED(fsw))) {
		DTRACE_SKYWALK1(tx__quiesced, struct nx_flowswitch *, fsw);
		SK_ERR("flowswitch detached, dropping %d pkts", n_pkts);
		KPKTQ_CONCAT(&dropped_pkts, spktq);
		reason = DROP_REASON_FSW_QUIESCED;
		line = __LINE__;
		goto done;
	}
	dev_na = fsw->fsw_dev_ch->ch_na;
	if (__improbable(dev_na == NULL)) {
		SK_ERR("dev port not attached, dropping %d pkts", n_pkts);
		FSW_STATS_ADD(FSW_STATS_DST_NXPORT_INACTIVE, n_pkts);
		KPKTQ_CONCAT(&dropped_pkts, spktq);
		reason = DROP_REASON_FSW_TX_DEVPORT_NOT_ATTACHED;
		line = __LINE__;
		goto done;
	}
	/* hold a data-movement reference on the ifnet for the duration */
	ifp = fsw_datamov_begin(fsw);
	if (ifp == NULL) {
		SK_ERR("ifnet not attached, dropping %d pkts", n_pkts);
		KPKTQ_CONCAT(&dropped_pkts, spktq);
		reason = DROP_REASON_FSW_IFNET_NOT_ATTACHED;
		line = __LINE__;
		goto done;
	}

	/* batch allocate enough packets */
	dev_pp = na_kr_get_pp(dev_na, NR_TX);

	err = pp_alloc_pktq(dev_pp, dev_pp->pp_max_frags, &dpktq, n_pkts, NULL,
	    NULL, SKMEM_NOSLEEP);
#if DEVELOPMENT || DEBUG
	if (__probable(err != ENOMEM)) {
		_FSW_INJECT_ERROR(12, err, ENOMEM, pp_free_pktq, &dpktq);
	}
#endif /* DEVELOPMENT || DEBUG */
	if (__improbable(err == ENOMEM)) {
		ASSERT(KPKTQ_EMPTY(&dpktq));
		KPKTQ_CONCAT(&dropped_pkts, spktq);
		FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT, n_pkts);
		SK_ERR("failed to alloc %u pkts from device pool", n_pkts);
		reason = DROP_REASON_FSW_PP_ALLOC_FAILED;
		line = __LINE__;
		goto done;
	} else if (__improbable(err == EAGAIN)) {
		/* partial allocation: account for the shortfall as drops */
		FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT,
		    (n_pkts - KPKTQ_LEN(&dpktq)));
		FSW_STATS_ADD(FSW_STATS_DROP,
		    (n_pkts - KPKTQ_LEN(&dpktq)));
	}

	/* process only as many source packets as we got dev packets for */
	n_pkts = KPKTQ_LEN(&dpktq);
	prev_fe = NULL;
	KPKTQ_FOREACH(spkt, spktq) {
		if (n_pkts == 0) {
			break;
		}
		--n_pkts;

		KPKTQ_DEQUEUE(&dpktq, pkt);
		ASSERT(pkt != NULL);
		err = dp_copy_to_dev(fsw, spkt, pkt);
		if (__improbable(err != 0)) {
			/*
			 * Copy to dev pool failed, so droptap should capture
			 * the source pkt because dev pkt might not have metadata
			 * or buffer filled out yet. Source pkt is freed by
			 * fsw_user_ring_flush, so defer the free to that.
			 */
			dp_drop_pkt_single_nofree(fsw, spkt, 1,
			    DROP_REASON_FSW_PKT_COPY_FAILED, DROPTAP_FLAG_L2_MISSING);
			/* Free the dev pool packet */
			pp_free_packet_single(pkt);
			continue;
		}

		/* any packet carrying a Tx timestamp requests pacing */
		do_pacing |= __packet_get_tx_timestamp(SK_PKT2PH(pkt)) != 0;
		af = fsw_ip_demux(fsw, pkt);
		if (__improbable(af == AF_UNSPEC)) {
			dp_tx_log_pkt(SK_VERB_ERROR, "demux err", pkt);
			FSW_STATS_INC(FSW_STATS_TX_DEMUX_ERR);
			dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_DEMUX_FAILED,
			    DROPTAP_FLAG_L2_MISSING);
			continue;
		}

		err = flow_pkt_classify(pkt, ifp, af, false);
		if (__improbable(err != 0)) {
			dp_tx_log_pkt(SK_VERB_ERROR, "flow extract err", pkt);
			FSW_STATS_INC(FSW_STATS_TX_FLOW_EXTRACT_ERR);
			dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_TX_FLOW_EXTRACT_FAILED,
			    DROPTAP_FLAG_L2_MISSING);
			continue;
		}

		/*
		 * Non-first IP fragments carry no L4 header; match them to
		 * the flow of the preceding first fragment (prev_fe).
		 */
		if (__improbable(pkt->pkt_flow_ip_is_frag &&
		    !pkt->pkt_flow_ip_is_first_frag)) {
			fe = tx_process_continuous_ip_frag(fsw, prev_fe, pkt);
			if (__probable(fe != NULL)) {
				flow_entry_retain(fe);
				goto flow_batch;
			} else {
				FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_CONT);
				dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_TX_FRAG_BAD_CONT,
				    DROPTAP_FLAG_L2_MISSING);
				continue;
			}
		}

		/* prev_fe is a hint for consecutive same-flow packets */
		fe = tx_lookup_flow(fsw, pkt, prev_fe);
		if (__improbable(fe == NULL)) {
			FSW_STATS_INC(FSW_STATS_TX_FLOW_NOT_FOUND);
			dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_TX_FLOW_NOT_FOUND,
			    DROPTAP_FLAG_L2_MISSING);
			prev_fe = NULL;
			continue;
		}
flow_batch:
		/* queue onto the flow entry; fes collects distinct flows */
		tx_flow_batch_packet(&fes, fe, pkt);
		prev_fe = fe;
	}

	/* run the per-flow Tx processing for every flow that got packets */
	struct flow_entry *tfe = NULL;
	TAILQ_FOREACH_SAFE(fe, &fes, fe_tx_link, tfe) {
		tx_flow_process(fsw, fe, 0);
		TAILQ_REMOVE(&fes, fe, fe_tx_link);
		fe->fe_tx_is_cont_frag = false;
		fe->fe_tx_frag_id = 0;
		flow_entry_release(&fe);
		n_flows++;
	}

done:
	FSW_RUNLOCK(fsw);
	/* n_flows > 0 implies datamov began, hence ifp != NULL here */
	if (n_flows > 0) {
		netif_transmit(ifp, NETIF_XMIT_FLAG_CHANNEL | (do_pacing ? NETIF_XMIT_FLAG_PACING : 0));
	}
	if (ifp != NULL) {
		fsw_datamov_end(fsw);
	}
	dp_drop_pktq(fsw, &dropped_pkts, 1, reason, line, DROPTAP_FLAG_L2_MISSING);
	KPKTQ_FINI(&dropped_pkts);
	KPKTQ_FINI(&dpktq);
}
3715 
3716 static sa_family_t
get_tso_af(struct __kern_packet * pkt)3717 get_tso_af(struct __kern_packet *pkt)
3718 {
3719 	packet_tso_flags_t tso_flags;
3720 
3721 	tso_flags = pkt->pkt_csum_flags & PACKET_CSUM_TSO_FLAGS;
3722 	if (tso_flags == PACKET_TSO_IPV4) {
3723 		return AF_INET;
3724 	} else if (tso_flags == PACKET_TSO_IPV6) {
3725 		return AF_INET6;
3726 	} else {
3727 		panic("invalid tso flags: 0x%x\n", tso_flags);
3728 		/* NOTREACHED */
3729 		__builtin_unreachable();
3730 	}
3731 }
3732 
3733 static inline void
update_flow_info(struct __kern_packet * pkt,void * iphdr,void * tcphdr,uint16_t payload_sz)3734 update_flow_info(struct __kern_packet *pkt, void *iphdr, void *tcphdr, uint16_t payload_sz)
3735 {
3736 	struct tcphdr *__single tcp = tcphdr;
3737 
3738 	DTRACE_SKYWALK4(update__flow__info, struct __kern_packet *, pkt,
3739 	    void *, iphdr, void *, tcphdr, uint16_t, payload_sz);
3740 	pkt->pkt_flow_ip_hdr = (mach_vm_address_t)iphdr;
3741 	pkt->pkt_flow_tcp_hdr = (mach_vm_address_t)tcphdr;
3742 	pkt->pkt_flow_tcp_flags = tcp->th_flags;
3743 	pkt->pkt_flow_tcp_seq = tcp->th_seq;
3744 	pkt->pkt_flow_ulen = payload_sz;
3745 }
3746 
/*
 * Software GSO: split the oversized TCP packet orig_pkt into MSS-sized
 * segments.  first_pkt (already holding a copy of the headers) becomes
 * segment 1; additional segments are drawn from dev_pktq.  Each segment
 * gets copied headers, adjusted IP/TCP fields, and a freshly computed
 * checksum; completed segments are appended to gso_pktq.
 *
 * Returns 0 on success or EINVAL on bad arguments (non-TCP, bad MSS,
 * headers + MSS exceeding the device buffer size).
 */
static int
do_gso(struct nx_flowswitch *fsw, int af, struct __kern_packet *orig_pkt,
    struct __kern_packet *first_pkt, struct pktq *dev_pktq,
    struct pktq *gso_pktq)
{
	ifnet_t ifp = fsw->fsw_ifp;
	struct __kern_packet *pkt = first_pkt;
	uint8_t proto = pkt->pkt_flow_ip_proto;
	uint16_t ip_hlen = pkt->pkt_flow_ip_hlen;
	uint16_t tcp_hlen = pkt->pkt_flow_tcp_hlen;
	uint16_t total_hlen = ip_hlen + tcp_hlen;
	uint16_t mtu = (uint16_t)ifp->if_mtu;
	uint16_t mss = pkt->pkt_proto_seg_sz, payload_sz;
	uint32_t n, n_pkts, off = 0, total_len = orig_pkt->pkt_length;
	uint16_t headroom = fsw->fsw_frame_headroom + ifp->if_tx_headroom;
	kern_packet_t orig_ph = SK_PKT2PH(orig_pkt);
	uint8_t *orig_pkt_baddr;
	struct tcphdr *tcp;
	struct ip *ip;
	struct ip6_hdr *ip6;
	uint32_t tcp_seq;
	uint16_t ipid;
	uint32_t pseudo_hdr_csum, bufsz;
	uint64_t pkt_tx_timestamp = 0;

	/* --- argument validation --- */
	ASSERT(headroom <= UINT8_MAX);
	if (proto != IPPROTO_TCP) {
		SK_ERR("invalid proto: %d", proto);
		DTRACE_SKYWALK3(invalid__proto, struct nx_flowswitch *,
		    fsw, ifnet_t, ifp, uint8_t, proto);
		return EINVAL;
	}
	if (mss == 0 || mss > (mtu - total_hlen)) {
		SK_ERR("invalid args: mss %d, mtu %d, total_hlen %d",
		    mss, mtu, total_hlen);
		DTRACE_SKYWALK5(invalid__args1, struct nx_flowswitch *,
		    fsw, ifnet_t, ifp, uint16_t, mss, uint16_t, mtu,
		    uint32_t, total_hlen);
		return EINVAL;
	}
	bufsz = PP_BUF_SIZE_DEF(pkt->pkt_qum.qum_pp);
	/* each segment (headroom + headers + MSS) must fit in one buflet */
	if ((headroom + total_hlen + mss) > bufsz) {
		SK_ERR("invalid args: headroom %d, total_hlen %d, "
		    "mss %d, bufsz %d", headroom, total_hlen, mss, bufsz);
		DTRACE_SKYWALK6(invalid__args2, struct nx_flowswitch *,
		    fsw, ifnet_t, ifp, uint16_t, headroom, uint16_t,
		    total_hlen, uint16_t, mss, uint32_t, bufsz);
		return EINVAL;
	}
	/* number of segments = ceil(payload / mss) */
	n_pkts = (uint32_t)(SK_ROUNDUP((total_len - total_hlen), mss) / mss);

	ASSERT(pkt->pkt_headroom == headroom);
	ASSERT(pkt->pkt_length == total_len);
	ASSERT(pkt->pkt_l2_len == 0);
	ASSERT((pkt->pkt_qum.qum_qflags & QUM_F_FINALIZED) == 0);
	ASSERT((pkt->pkt_pflags & PKT_F_TRUNCATED) != 0);
	/* segments are complete packets: clear truncation/TSO state */
	pkt->pkt_pflags &= ~PKT_F_TRUNCATED;
	pkt->pkt_proto_seg_sz = 0;
	pkt->pkt_csum_flags = 0;
	MD_BUFLET_ADDR_ABS(orig_pkt, orig_pkt_baddr);
	orig_pkt_baddr += orig_pkt->pkt_headroom;

	/* pre-compute the pseudo-header checksum (zero length; added later) */
	if (af == AF_INET) {
		/*
		 * XXX -fbounds-safety: can't avoid using forge unless we change
		 * the flow metadata definition.
		 */
		ip = __unsafe_forge_bidi_indexable(struct ip *,
		    pkt->pkt_flow_ip_hdr, pkt->pkt_length);
		tcp = __unsafe_forge_bidi_indexable(struct tcphdr *,
		    pkt->pkt_flow_tcp_hdr, pkt->pkt_length - ip_hlen);
		ipid = ip->ip_id;
		pseudo_hdr_csum = in_pseudo(pkt->pkt_flow_ipv4_src.s_addr,
		    pkt->pkt_flow_ipv4_dst.s_addr, 0);
	} else {
		ASSERT(af == AF_INET6);
		tcp = __unsafe_forge_bidi_indexable(struct tcphdr *,
		    pkt->pkt_flow_tcp_hdr, pkt->pkt_length - ip_hlen);
		pseudo_hdr_csum = in6_pseudo(&pkt->pkt_flow_ipv6_src,
		    &pkt->pkt_flow_ipv6_dst, 0);
	}
	tcp_seq = ntohl(tcp->th_seq);

	pkt_tx_timestamp = __packet_get_tx_timestamp(orig_ph);

	/* --- per-segment loop: n counts segments, off walks the payload --- */
	for (n = 1, payload_sz = mss, off = total_hlen; off < total_len;
	    off += payload_sz) {
		uint8_t *baddr, *baddr0;
		uint32_t partial;

		/* first iteration reuses first_pkt; later ones dequeue */
		if (pkt == NULL) {
			n++;
			KPKTQ_DEQUEUE(dev_pktq, pkt);
			ASSERT(pkt != NULL);
		}
		MD_BUFLET_ADDR_ABS(pkt, baddr0);
		baddr = baddr0;
		baddr += headroom;

		/* Copy headers from the original packet */
		if (n != 1) {
			ASSERT(pkt != first_pkt);
			pkt_copy(orig_pkt_baddr, baddr, total_hlen);
			fsw_pkt_copy_metadata(first_pkt, pkt);

			ASSERT((pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED) != 0);
			/* flow info still needs to be updated below */
			bcopy(first_pkt->pkt_flow, pkt->pkt_flow,
			    sizeof(*pkt->pkt_flow));
			pkt->pkt_trace_id = 0;
			ASSERT(pkt->pkt_headroom == headroom);
		} else {
			METADATA_SET_LEN(pkt, 0, 0);
		}
		baddr += total_hlen;

		/* copy tx timestamp from the orignal packet */
		__packet_set_tx_timestamp(SK_PKT2PH(pkt), pkt_tx_timestamp);

		/* Copy/checksum the payload from the original packet */
		if (off + payload_sz > total_len) {
			/* last segment may carry less than a full MSS */
			payload_sz = (uint16_t)(total_len - off);
		}
		pkt_copypkt_sum(orig_ph,
		    (uint16_t)(orig_pkt->pkt_headroom + off),
		    SK_PKT2PH(pkt), headroom + total_hlen, payload_sz,
		    &partial, TRUE);

		DTRACE_SKYWALK6(copy__csum, struct nx_flowswitch *, fsw,
		    ifnet_t, ifp, uint8_t *, baddr, uint16_t, payload_sz,
		    uint16_t, mss, uint32_t, partial);
		FSW_STATS_INC(FSW_STATS_TX_COPY_PKT2PKT);

		/*
		 * Adjust header information and fill in the missing fields.
		 */
		if (af == AF_INET) {
			ip = (struct ip *)(void *)(baddr0 + pkt->pkt_headroom);
			tcp = (struct tcphdr *)(void *)((caddr_t)ip + ip_hlen);

			/* FIN/PSH belong only on the last segment */
			if (n != n_pkts) {
				tcp->th_flags &= ~(TH_FIN | TH_PUSH);
			}
			/* CWR belongs only on the first segment */
			if (n != 1) {
				tcp->th_flags &= ~TH_CWR;
				tcp->th_seq = htonl(tcp_seq);
			}
			update_flow_info(pkt, ip, tcp, payload_sz);

			/*
			 * NOTE: ipid holds the id in network byte order; the
			 * htons of an incremented network-order value yields
			 * unique (though not host-order-sequential) IDs.
			 */
			ip->ip_id = htons((ipid)++);
			ip->ip_len = htons(ip_hlen + tcp_hlen + payload_sz);
			ip->ip_sum = 0;
			ip->ip_sum = inet_cksum_buffer(ip, 0, 0, ip_hlen);
			tcp->th_sum = 0;

			/* fold TCP header + pseudo-header into payload sum */
			partial = __packet_cksum(tcp, tcp_hlen, partial);
			partial += htons(tcp_hlen + IPPROTO_TCP + payload_sz);
			partial += pseudo_hdr_csum;
			ADDCARRY(partial);
			tcp->th_sum = ~(uint16_t)partial;
		} else {
			ASSERT(af == AF_INET6);
			ip6 = (struct ip6_hdr *)(baddr0 + pkt->pkt_headroom);
			tcp = (struct tcphdr *)(void *)((caddr_t)ip6 + ip_hlen);

			if (n != n_pkts) {
				tcp->th_flags &= ~(TH_FIN | TH_PUSH);
			}
			if (n != 1) {
				tcp->th_flags &= ~TH_CWR;
				tcp->th_seq = htonl(tcp_seq);
			}
			update_flow_info(pkt, ip6, tcp, payload_sz);

			ip6->ip6_plen = htons(tcp_hlen + payload_sz);
			tcp->th_sum = 0;
			partial = __packet_cksum(tcp, tcp_hlen, partial);
			partial += htonl(tcp_hlen + IPPROTO_TCP + payload_sz);
			partial += pseudo_hdr_csum;
			ADDCARRY(partial);
			tcp->th_sum = ~(uint16_t)partial;
		}
		tcp_seq += payload_sz;
		METADATA_ADJUST_LEN(pkt, total_hlen, headroom);
#if (DEVELOPMENT || DEBUG)
		struct __kern_buflet *bft;
		uint32_t blen;
		PKT_GET_FIRST_BUFLET(pkt, 1, bft);
		blen = __buflet_get_data_length(bft);
		if (blen != total_hlen + payload_sz) {
			panic("blen (%d) != total_len + payload_sz (%d)\n",
			    blen, total_hlen + payload_sz);
		}
#endif /* DEVELOPMENT || DEBUG */

		pkt->pkt_length = total_hlen + payload_sz;
		KPKTQ_ENQUEUE(gso_pktq, pkt);
		pkt = NULL;

		/*
		 * Note that at this point the packet is not yet finalized.
		 * The finalization happens in dp_flow_tx_process() after
		 * the framing is done.
		 */
	}
	ASSERT(n == n_pkts);
	ASSERT(off == total_len);
	DTRACE_SKYWALK7(gso__done, struct nx_flowswitch *, fsw, ifnet_t, ifp,
	    uint32_t, n_pkts, uint32_t, total_len, uint16_t, ip_hlen,
	    uint16_t, tcp_hlen, uint8_t *, orig_pkt_baddr);
	return 0;
}
3959 
3960 static void
tx_flow_enqueue_gso_pktq(struct flow_entry_list * fes,struct flow_entry * fe,struct pktq * gso_pktq)3961 tx_flow_enqueue_gso_pktq(struct flow_entry_list *fes, struct flow_entry *fe,
3962     struct pktq *gso_pktq)
3963 {
3964 	if (KPKTQ_EMPTY(&fe->fe_tx_pktq)) {
3965 		ASSERT(KPKTQ_LEN(&fe->fe_tx_pktq) == 0);
3966 		TAILQ_INSERT_TAIL(fes, fe, fe_tx_link);
3967 		KPKTQ_ENQUEUE_MULTI(&fe->fe_tx_pktq, KPKTQ_FIRST(gso_pktq),
3968 		    KPKTQ_LAST(gso_pktq), KPKTQ_LEN(gso_pktq));
3969 		KPKTQ_INIT(gso_pktq);
3970 	} else {
3971 		ASSERT(!TAILQ_EMPTY(fes));
3972 		KPKTQ_ENQUEUE_MULTI(&fe->fe_tx_pktq, KPKTQ_FIRST(gso_pktq),
3973 		    KPKTQ_LAST(gso_pktq), KPKTQ_LEN(gso_pktq));
3974 		KPKTQ_INIT(gso_pktq);
3975 		flow_entry_release(&fe);
3976 	}
3977 }
3978 
/*
 * Transmit datapath for batches that require software GSO.  Each source
 * packet in spktq is an oversized TCP packet; its headers are copied to a
 * device-pool packet for classification, the owning flow is looked up,
 * and do_gso() splits it into MSS-sized segments which are batched per
 * flow and processed, then the device is kicked.  Source packets are
 * freed by the caller (fsw_user_ring_flush), not here.
 */
static void
dp_gso_pktq(struct nx_flowswitch *fsw, struct pktq *spktq,
    uint32_t gso_pkts_estimate)
{
	struct __kern_packet *spkt, *pkt;
	struct flow_entry_list fes = TAILQ_HEAD_INITIALIZER(fes);
	struct flow_entry *__single fe, *__single prev_fe;
	struct pktq dpktq;
	struct nexus_adapter *dev_na;
	struct kern_pbufpool *dev_pp;
	struct ifnet *ifp = NULL;
	sa_family_t af;
	uint32_t n_pkts, n_flows = 0;
	int err;

	KPKTQ_INIT(&dpktq);
	n_pkts = KPKTQ_LEN(spktq);

	FSW_RLOCK(fsw);
	/* early-bailout checks mirror dp_tx_pktq() */
	if (__improbable(FSW_QUIESCED(fsw))) {
		DTRACE_SKYWALK1(tx__quiesced, struct nx_flowswitch *, fsw);
		SK_ERR("flowswitch detached, dropping %d pkts", n_pkts);
		dp_drop_pktq(fsw, spktq, 1, DROP_REASON_FSW_QUIESCED, __LINE__,
		    DROPTAP_FLAG_L2_MISSING);
		goto done;
	}
	dev_na = fsw->fsw_dev_ch->ch_na;
	if (__improbable(dev_na == NULL)) {
		SK_ERR("dev port not attached, dropping %d pkts", n_pkts);
		FSW_STATS_ADD(FSW_STATS_DST_NXPORT_INACTIVE, n_pkts);
		dp_drop_pktq(fsw, spktq, 1, DROP_REASON_FSW_TX_DEVPORT_NOT_ATTACHED,
		    __LINE__, DROPTAP_FLAG_L2_MISSING);
		goto done;
	}
	ifp = fsw_datamov_begin(fsw);
	if (ifp == NULL) {
		SK_ERR("ifnet not attached, dropping %d pkts", n_pkts);
		dp_drop_pktq(fsw, spktq, 1, DROP_REASON_FSW_IFNET_NOT_ATTACHED,
		    __LINE__, DROPTAP_FLAG_L2_MISSING);
		goto done;
	}

	dev_pp = na_kr_get_pp(dev_na, NR_TX);

	/*
	 * Batch allocate enough packets to perform GSO on all
	 * packets in spktq.
	 */
	err = pp_alloc_pktq(dev_pp, dev_pp->pp_max_frags, &dpktq,
	    gso_pkts_estimate, NULL, NULL, SKMEM_NOSLEEP);
#if DEVELOPMENT || DEBUG
	if (__probable(err != ENOMEM)) {
		_FSW_INJECT_ERROR(12, err, ENOMEM, pp_free_pktq, &dpktq);
	}
#endif /* DEVELOPMENT || DEBUG */
	/*
	 * We either get all packets or none. No partial allocations.
	 */
	if (__improbable(err != 0)) {
		if (err == ENOMEM) {
			ASSERT(KPKTQ_EMPTY(&dpktq));
		} else {
			dp_free_pktq(fsw, &dpktq);
		}
		DTRACE_SKYWALK1(gso__no__mem, int, err);
		dp_drop_pktq(fsw, spktq, 1, DROP_REASON_FSW_PP_ALLOC_FAILED,
		    __LINE__, DROPTAP_FLAG_L2_MISSING);
		FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT, n_pkts);
		SK_ERR("failed to alloc %u pkts from device pool",
		    gso_pkts_estimate);
		goto done;
	}
	prev_fe = NULL;
	KPKTQ_FOREACH(spkt, spktq) {
		KPKTQ_DEQUEUE(&dpktq, pkt);
		ASSERT(pkt != NULL);
		/*
		 * Copy only headers to the first packet of the GSO chain.
		 * The headers will be used for classification below.
		 */
		err = dp_copy_headers_to_dev(fsw, spkt, pkt);
		if (__improbable(err != 0)) {
			dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_PKT_COPY_FAILED,
			    DROPTAP_FLAG_L2_MISSING);
			DTRACE_SKYWALK2(copy__headers__failed,
			    struct nx_flowswitch *, fsw,
			    struct __kern_packet *, spkt);
			continue;
		}
		af = get_tso_af(pkt);
		ASSERT(af == AF_INET || af == AF_INET6);

		err = flow_pkt_classify(pkt, ifp, af, false);
		if (__improbable(err != 0)) {
			dp_tx_log_pkt(SK_VERB_ERROR, "flow extract err", pkt);
			FSW_STATS_INC(FSW_STATS_TX_FLOW_EXTRACT_ERR);
			dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_TX_FLOW_EXTRACT_FAILED,
			    DROPTAP_FLAG_L2_MISSING);
			DTRACE_SKYWALK4(classify__failed,
			    struct nx_flowswitch *, fsw,
			    struct __kern_packet *, spkt,
			    struct __kern_packet *, pkt,
			    int, err);
			continue;
		}
		/*
		 * GSO cannot be done on a fragment and it's a bug in user
		 * space to mark a fragment as needing GSO.
		 */
		if (__improbable(pkt->pkt_flow_ip_is_frag)) {
			FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_CONT);
			dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_TX_FRAG_BAD_CONT,
			    DROPTAP_FLAG_L2_MISSING);
			DTRACE_SKYWALK3(is__frag,
			    struct nx_flowswitch *, fsw,
			    struct __kern_packet *, spkt,
			    struct __kern_packet *, pkt);
			continue;
		}
		/* prev_fe is a hint for consecutive same-flow packets */
		fe = tx_lookup_flow(fsw, pkt, prev_fe);
		if (__improbable(fe == NULL)) {
			FSW_STATS_INC(FSW_STATS_TX_FLOW_NOT_FOUND);
			dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_TX_FLOW_NOT_FOUND,
			    DROPTAP_FLAG_L2_MISSING);
			DTRACE_SKYWALK3(lookup__failed,
			    struct nx_flowswitch *, fsw,
			    struct __kern_packet *, spkt,
			    struct __kern_packet *, pkt);
			prev_fe = NULL;
			continue;
		}
		/*
		 * Perform GSO on spkt using the flow information
		 * obtained above.
		 */
		struct pktq gso_pktq;
		KPKTQ_INIT(&gso_pktq);
		err = do_gso(fsw, af, spkt, pkt, &dpktq, &gso_pktq);
		if (__probable(err == 0)) {
			tx_flow_enqueue_gso_pktq(&fes, fe, &gso_pktq);
			prev_fe = fe;
		} else {
			DTRACE_SKYWALK1(gso__error, int, err);
			/* TODO: increment error stat */
			dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_GSO_FAILED,
			    DROPTAP_FLAG_L2_MISSING);
			flow_entry_release(&fe);
			prev_fe = NULL;
		}
		KPKTQ_FINI(&gso_pktq);
	}
	/* run the per-flow Tx processing for every flow that got segments */
	struct flow_entry *tfe = NULL;
	TAILQ_FOREACH_SAFE(fe, &fes, fe_tx_link, tfe) {
		/* Chain-enqueue can be used for GSO chains */
		tx_flow_process(fsw, fe, FLOW_PROC_FLAG_GSO);
		TAILQ_REMOVE(&fes, fe, fe_tx_link);
		flow_entry_release(&fe);
		n_flows++;
	}
done:
	FSW_RUNLOCK(fsw);
	/* n_flows > 0 implies datamov began, hence ifp != NULL here */
	if (n_flows > 0) {
		netif_transmit(ifp, NETIF_XMIT_FLAG_CHANNEL);
	}
	if (ifp != NULL) {
		fsw_datamov_end(fsw);
	}

	/*
	 * It's possible for packets to be left in dpktq because
	 * gso_pkts_estimate is only an estimate. The actual number
	 * of packets needed could be less.
	 */
	uint32_t dpktq_len;
	if ((dpktq_len = KPKTQ_LEN(&dpktq)) > 0) {
		DTRACE_SKYWALK2(leftover__dev__pkts,
		    struct nx_flowswitch *, fsw, uint32_t, dpktq_len);
		dp_free_pktq(fsw, &dpktq);
	}
	KPKTQ_FINI(&dpktq);
}
4160 
/*
 * Drain the device (Rx) ring: dequeue batches of up to fsw_rx_batch
 * packets and feed them into the flowswitch receive path (or the netem
 * input shaper, if configured) until the ring is empty.
 */
static inline void
fsw_dev_ring_flush(struct nx_flowswitch *fsw, struct __kern_channel_ring *r,
    struct proc *p)
{
#pragma unused(p)
	uint32_t total_pkts = 0, total_bytes = 0;

	for (;;) {
		struct pktq pktq;
		KPKTQ_INIT(&pktq);
		uint32_t n_bytes;
		fsw_rx_ring_dequeue_pktq(fsw, r, fsw_rx_batch, &pktq, &n_bytes);
		/* an empty dequeue means the ring is drained */
		if (n_bytes == 0) {
			break;
		}
		total_pkts += KPKTQ_LEN(&pktq);
		total_bytes += n_bytes;

		/* divert through netem when input shaping is configured */
		if (__probable(fsw->fsw_ifp->if_input_netem == NULL)) {
			fsw_receive(fsw, &pktq);
		} else {
			fsw_dev_input_netem_enqueue(fsw, &pktq);
		}
		KPKTQ_FINI(&pktq);
	}

	KDBG(SK_KTRACE_FSW_DEV_RING_FLUSH, SK_KVA(r), total_pkts, total_bytes);
	DTRACE_SKYWALK2(fsw__dp__dev__ring__flush, uint32_t, total_pkts,
	    uint32_t, total_bytes);

	/* compute mitigation rate for delivered traffic */
	if (__probable(r->ckr_netif_mit_stats != NULL)) {
		r->ckr_netif_mit_stats(r, total_pkts, total_bytes);
	}
}
4196 
/*
 * Drain a user (Tx) ring: dequeue batches of up to fsw_tx_batch packets
 * and push them through the GSO or regular Tx datapath until the ring is
 * empty.  Source packets are freed here after each batch is processed.
 */
static inline void
fsw_user_ring_flush(struct nx_flowswitch *fsw, struct __kern_channel_ring *r,
    struct proc *p)
{
#pragma unused(p)
	/* monotonically increasing id used to trace batches end-to-end */
	static packet_trace_id_t trace_id = 0;
	uint32_t total_pkts = 0, total_bytes = 0;

	for (;;) {
		struct pktq pktq;
		KPKTQ_INIT(&pktq);
		uint32_t n_bytes;
		uint32_t gso_pkts_estimate = 0;

		fsw_tx_ring_dequeue_pktq(fsw, r, fsw_tx_batch, &pktq, &n_bytes,
		    &gso_pkts_estimate);
		/* an empty dequeue means the ring is drained */
		if (n_bytes == 0) {
			break;
		}
		total_pkts += KPKTQ_LEN(&pktq);
		total_bytes += n_bytes;

		/* tag the first packet of the batch for tracing */
		KPKTQ_FIRST(&pktq)->pkt_trace_id = ++trace_id;
		KDBG(SK_KTRACE_PKT_TX_FSW | DBG_FUNC_START,
		    KPKTQ_FIRST(&pktq)->pkt_trace_id);

		/* a non-zero estimate means the batch needs software GSO */
		if (gso_pkts_estimate > 0) {
			dp_gso_pktq(fsw, &pktq, gso_pkts_estimate);
		} else {
			dp_tx_pktq(fsw, &pktq);
		}
		/* source packets were copied to dev packets; free them now */
		dp_free_pktq(fsw, &pktq);
		KPKTQ_FINI(&pktq);
	}
	kr_update_stats(r, total_pkts, total_bytes);

	KDBG(SK_KTRACE_FSW_USER_RING_FLUSH, SK_KVA(r), total_pkts, total_bytes);
	DTRACE_SKYWALK2(fsw__dp__user__ring__flush, uint32_t, total_pkts,
	    uint32_t, total_bytes);
}
4237 
4238 void
fsw_ring_flush(struct nx_flowswitch * fsw,struct __kern_channel_ring * r,struct proc * p)4239 fsw_ring_flush(struct nx_flowswitch *fsw, struct __kern_channel_ring *r,
4240     struct proc *p)
4241 {
4242 	struct nexus_vp_adapter *vpna = VPNA(KRNA(r));
4243 
4244 	ASSERT(sk_is_sync_protected());
4245 	ASSERT(vpna->vpna_nx_port != FSW_VP_HOST);
4246 	ASSERT(vpna->vpna_up.na_md_type == NEXUS_META_TYPE_PACKET);
4247 
4248 	if (vpna->vpna_nx_port == FSW_VP_DEV) {
4249 		fsw_dev_ring_flush(fsw, r, p);
4250 	} else {
4251 		fsw_user_ring_flush(fsw, r, p);
4252 	}
4253 }
4254 
4255 int
fsw_dp_ctor(struct nx_flowswitch * fsw)4256 fsw_dp_ctor(struct nx_flowswitch *fsw)
4257 {
4258 	uint32_t fe_cnt = fsw_fe_table_size;
4259 	uint32_t fob_cnt = fsw_flow_owner_buckets;
4260 	uint32_t frb_cnt = fsw_flow_route_buckets;
4261 	uint32_t frib_cnt = fsw_flow_route_id_buckets;
4262 	struct kern_nexus *nx = fsw->fsw_nx;
4263 	char name[64];
4264 	const char *__null_terminated fsw_name = NULL;
4265 	int error = 0;
4266 
4267 	/* just in case */
4268 	if (fe_cnt == 0) {
4269 		fe_cnt = NX_FSW_FE_TABLESZ;
4270 		ASSERT(fe_cnt != 0);
4271 	}
4272 	if (fob_cnt == 0) {
4273 		fob_cnt = NX_FSW_FOB_HASHSZ;
4274 		ASSERT(fob_cnt != 0);
4275 	}
4276 	if (frb_cnt == 0) {
4277 		frb_cnt = NX_FSW_FRB_HASHSZ;
4278 		ASSERT(frb_cnt != 0);
4279 	}
4280 	if (frib_cnt == 0) {
4281 		frib_cnt = NX_FSW_FRIB_HASHSZ;
4282 		ASSERT(frib_cnt != 0);
4283 	}
4284 
4285 	/* make sure fe_cnt is a power of two, else round up */
4286 	if ((fe_cnt & (fe_cnt - 1)) != 0) {
4287 		fe_cnt--;
4288 		fe_cnt |= (fe_cnt >> 1);
4289 		fe_cnt |= (fe_cnt >> 2);
4290 		fe_cnt |= (fe_cnt >> 4);
4291 		fe_cnt |= (fe_cnt >> 8);
4292 		fe_cnt |= (fe_cnt >> 16);
4293 		fe_cnt++;
4294 	}
4295 
4296 	/* make sure frb_cnt is a power of two, else round up */
4297 	if ((frb_cnt & (frb_cnt - 1)) != 0) {
4298 		frb_cnt--;
4299 		frb_cnt |= (frb_cnt >> 1);
4300 		frb_cnt |= (frb_cnt >> 2);
4301 		frb_cnt |= (frb_cnt >> 4);
4302 		frb_cnt |= (frb_cnt >> 8);
4303 		frb_cnt |= (frb_cnt >> 16);
4304 		frb_cnt++;
4305 	}
4306 
4307 	lck_mtx_init(&fsw->fsw_detach_barrier_lock, &nexus_lock_group,
4308 	    &nexus_lock_attr);
4309 	lck_mtx_init(&fsw->fsw_reap_lock, &nexus_lock_group, &nexus_lock_attr);
4310 	lck_mtx_init(&fsw->fsw_linger_lock, &nexus_lock_group, &nexus_lock_attr);
4311 	TAILQ_INIT(&fsw->fsw_linger_head);
4312 	lck_mtx_init(&fsw->fsw_rxstrc_lock, &nexus_lock_group, &nexus_lock_attr);
4313 	TAILQ_INIT(&fsw->fsw_rxstrc_head);
4314 
4315 	fsw_name = tsnprintf(name, sizeof(name), "%s_%llu", NX_FSW_NAME, nx->nx_id);
4316 	error = nx_advisory_alloc(nx, fsw_name,
4317 	    &NX_PROV(nx)->nxprov_region_params[SKMEM_REGION_NEXUSADV],
4318 	    NEXUS_ADVISORY_TYPE_FLOWSWITCH);
4319 	if (error != 0) {
4320 		fsw_dp_dtor(fsw);
4321 		return error;
4322 	}
4323 
4324 	fsw->fsw_flow_mgr = flow_mgr_create(fe_cnt, fob_cnt, frb_cnt, frib_cnt);
4325 	if (fsw->fsw_flow_mgr == NULL) {
4326 		fsw_dp_dtor(fsw);
4327 		return error;
4328 	}
4329 
4330 	/* generic name; will be customized upon ifattach */
4331 	(void) snprintf(fsw->fsw_reap_name, sizeof(fsw->fsw_reap_name),
4332 	    FSW_REAP_THREADNAME, name, "");
4333 
4334 	if (kernel_thread_start(fsw_reap_thread_func, fsw,
4335 	    &fsw->fsw_reap_thread) != KERN_SUCCESS) {
4336 		panic_plain("%s: can't create thread", __func__);
4337 		/* NOTREACHED */
4338 		__builtin_unreachable();
4339 	}
4340 	/* this must not fail */
4341 	VERIFY(fsw->fsw_reap_thread != NULL);
4342 
4343 	SK_DF(SK_VERB_MEM, "fsw %p ALLOC", SK_KVA(fsw));
4344 
4345 
4346 	return error;
4347 }
4348 
4349 void
fsw_dp_dtor(struct nx_flowswitch * fsw)4350 fsw_dp_dtor(struct nx_flowswitch *fsw)
4351 {
4352 	uint64_t f = (1 * NSEC_PER_MSEC);         /* 1 ms */
4353 	uint64_t s = (1000 * NSEC_PER_SEC);         /* 1 sec */
4354 	uint32_t i = 0;
4355 
4356 #if (DEVELOPMENT || DEBUG)
4357 	if (fsw->fsw_rps_threads != NULL) {
4358 		for (i = 0; i < fsw->fsw_rps_nthreads; i++) {
4359 			fsw_rps_thread_join(fsw, i);
4360 		}
4361 		kfree_type_counted_by(struct fsw_rps_thread, fsw->fsw_rps_nthreads,
4362 		    fsw->fsw_rps_threads);
4363 	}
4364 #endif /* !DEVELOPMENT && !DEBUG */
4365 
4366 	nx_advisory_free(fsw->fsw_nx);
4367 
4368 	if (fsw->fsw_reap_thread != THREAD_NULL) {
4369 		/* signal thread to begin self-termination */
4370 		lck_mtx_lock(&fsw->fsw_reap_lock);
4371 		fsw->fsw_reap_flags |= FSW_REAPF_TERMINATING;
4372 
4373 		/*
4374 		 * And wait for thread to terminate; use another
4375 		 * wait channel here other than fsw_reap_flags to
4376 		 * make it more explicit.  In the event the reaper
4377 		 * thread misses a wakeup, we'll try again once
4378 		 * every second (except for the first time).
4379 		 */
4380 		while (!(fsw->fsw_reap_flags & FSW_REAPF_TERMINATED)) {
4381 			uint64_t t = 0;
4382 
4383 			nanoseconds_to_absolutetime((i++ == 0) ? f : s, &t);
4384 			clock_absolutetime_interval_to_deadline(t, &t);
4385 			ASSERT(t != 0);
4386 
4387 			fsw->fsw_reap_flags |= FSW_REAPF_TERMINATEBLOCK;
4388 			if (!(fsw->fsw_reap_flags & FSW_REAPF_RUNNING)) {
4389 				thread_wakeup((caddr_t)&fsw->fsw_reap_flags);
4390 			}
4391 			(void) assert_wait_deadline(&fsw->fsw_reap_thread,
4392 			    THREAD_UNINT, t);
4393 			lck_mtx_unlock(&fsw->fsw_reap_lock);
4394 			thread_block(THREAD_CONTINUE_NULL);
4395 			lck_mtx_lock(&fsw->fsw_reap_lock);
4396 			fsw->fsw_reap_flags &= ~FSW_REAPF_TERMINATEBLOCK;
4397 		}
4398 		ASSERT(fsw->fsw_reap_flags & FSW_REAPF_TERMINATED);
4399 		lck_mtx_unlock(&fsw->fsw_reap_lock);
4400 		fsw->fsw_reap_thread = THREAD_NULL;
4401 	}
4402 
4403 	/* free any remaining flow entries in the linger list */
4404 	fsw_linger_purge(fsw);
4405 	fsw_rxstrc_purge(fsw);
4406 
4407 	if (fsw->fsw_flow_mgr != NULL) {
4408 		flow_mgr_destroy(fsw->fsw_flow_mgr);
4409 		fsw->fsw_flow_mgr = NULL;
4410 	}
4411 
4412 
4413 	lck_mtx_destroy(&fsw->fsw_detach_barrier_lock, &nexus_lock_group);
4414 	lck_mtx_destroy(&fsw->fsw_reap_lock, &nexus_lock_group);
4415 	lck_mtx_destroy(&fsw->fsw_linger_lock, &nexus_lock_group);
4416 }
4417 
/*
 * Place a torn-down, destroyed flow entry on the flowswitch linger list
 * so the reaper can free it once its linger wait expires.  The caller's
 * reference is taken over by the list (it is released when the entry is
 * removed in fsw_linger_remove_internal()).
 */
void
fsw_linger_insert(struct flow_entry *fe)
{
	struct nx_flowswitch *fsw = fe->fe_fsw;
	SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]);
	SK_DF(SK_VERB_FLOW, "fe \"%s\"", fe2str(fe, dbgbuf, sizeof(dbgbuf)));

	/* refresh cached uptime before computing the expiry below */
	net_update_uptime();

	/* entry must be torn down and destroyed, but not yet lingering */
	ASSERT(flow_entry_refcnt(fe) >= 1);
	ASSERT(fe->fe_flags & FLOWENTF_TORN_DOWN);
	ASSERT(fe->fe_flags & FLOWENTF_DESTROYED);
	ASSERT(!(fe->fe_flags & FLOWENTF_LINGERING));
	ASSERT(fe->fe_flags & FLOWENTF_WAIT_CLOSE);
	ASSERT(fe->fe_linger_wait != 0);
	fe->fe_linger_expire = (net_uptime() + fe->fe_linger_wait);
	os_atomic_or(&fe->fe_flags, FLOWENTF_LINGERING, relaxed);

	lck_mtx_lock_spin(&fsw->fsw_linger_lock);
	TAILQ_INSERT_TAIL(&fsw->fsw_linger_head, fe, fe_linger_link);
	fsw->fsw_linger_cnt++;
	VERIFY(fsw->fsw_linger_cnt != 0);
	lck_mtx_unlock(&fsw->fsw_linger_lock);

	/* kick the reaper thread to process the linger list */
	fsw_reap_sched(fsw);
}
4444 
4445 static void
fsw_linger_remove_internal(struct flow_entry_linger_head * linger_head,struct flow_entry * fe)4446 fsw_linger_remove_internal(struct flow_entry_linger_head *linger_head,
4447     struct flow_entry *fe)
4448 {
4449 	SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]);
4450 	SK_DF(SK_VERB_FLOW, "fe \"%s\"", fe2str(fe, dbgbuf, sizeof(dbgbuf)));
4451 
4452 	ASSERT(fe->fe_flags & FLOWENTF_TORN_DOWN);
4453 	ASSERT(fe->fe_flags & FLOWENTF_DESTROYED);
4454 	ASSERT(fe->fe_flags & FLOWENTF_LINGERING);
4455 	os_atomic_andnot(&fe->fe_flags, FLOWENTF_LINGERING, relaxed);
4456 
4457 	TAILQ_REMOVE(linger_head, fe, fe_linger_link);
4458 	flow_entry_release(&fe);
4459 }
4460 
4461 static void
fsw_linger_remove(struct flow_entry * fe)4462 fsw_linger_remove(struct flow_entry *fe)
4463 {
4464 	struct nx_flowswitch *fsw = fe->fe_fsw;
4465 
4466 	LCK_MTX_ASSERT(&fsw->fsw_linger_lock, LCK_MTX_ASSERT_OWNED);
4467 
4468 	fsw_linger_remove_internal(&fsw->fsw_linger_head, fe);
4469 	VERIFY(fsw->fsw_linger_cnt != 0);
4470 	fsw->fsw_linger_cnt--;
4471 }
4472 
4473 void
fsw_linger_purge(struct nx_flowswitch * fsw)4474 fsw_linger_purge(struct nx_flowswitch *fsw)
4475 {
4476 	struct flow_entry *fe, *tfe;
4477 
4478 	lck_mtx_lock(&fsw->fsw_linger_lock);
4479 	TAILQ_FOREACH_SAFE(fe, &fsw->fsw_linger_head, fe_linger_link, tfe) {
4480 		fsw_linger_remove(fe);
4481 	}
4482 	ASSERT(fsw->fsw_linger_cnt == 0);
4483 	ASSERT(TAILQ_EMPTY(&fsw->fsw_linger_head));
4484 	lck_mtx_unlock(&fsw->fsw_linger_lock);
4485 }
4486 
/*
 * Queue a torn-down, AOP-offloaded flow entry so the reaper thread can
 * clean up its Rx steering rule (see fsw_process_rxstrc()).  The list
 * takes its own reference on the entry via flow_entry_retain(); it is
 * dropped in fsw_rxstrc_remove_internal().
 */
void
fsw_rxstrc_insert(struct flow_entry *fe)
{
	struct nx_flowswitch *fsw = fe->fe_fsw;
	SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]);
	SK_DF(SK_VERB_FLOW, "fe \"%s\"", fe2str(fe, dbgbuf, sizeof(dbgbuf)));

	/* entry must be dead, offloaded, and not already queued */
	ASSERT(flow_entry_refcnt(fe) >= 1);
	ASSERT(fe->fe_flags & FLOWENTF_TORN_DOWN);
	ASSERT(fe->fe_flags & FLOWENTF_DESTROYED);
	ASSERT(fe->fe_flags & FLOWENTF_AOP_OFFLOAD);
	ASSERT(!(fe->fe_flags & FLOWENTF_RXSTRC_PENDING));
	/* mark as pending before the entry becomes visible on the list */
	os_atomic_or(&fe->fe_flags, FLOWENTF_RXSTRC_PENDING, relaxed);

	/* the list holds its own reference, released on removal */
	flow_entry_retain(fe);

	lck_mtx_lock_spin(&fsw->fsw_rxstrc_lock);
	TAILQ_INSERT_TAIL(&fsw->fsw_rxstrc_head, fe, fe_rxstrc_link);
	fsw->fsw_rxstrc_cnt++;
	VERIFY(fsw->fsw_rxstrc_cnt != 0);	/* counter must not wrap */
	lck_mtx_unlock(&fsw->fsw_rxstrc_lock);

	/* kick the reaper thread so the cleanup gets performed */
	fsw_reap_sched(fsw);
}
4511 
4512 static void
fsw_rxstrc_remove_internal(struct flow_entry_rxstrc_head * rxstrc_head,struct flow_entry * fe)4513 fsw_rxstrc_remove_internal(struct flow_entry_rxstrc_head *rxstrc_head,
4514     struct flow_entry *fe)
4515 {
4516 	SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]);
4517 	SK_DF(SK_VERB_FLOW, "fe \"%s\"", fe2str(fe, dbgbuf, sizeof(dbgbuf)));
4518 
4519 	ASSERT(fe->fe_flags & FLOWENTF_TORN_DOWN);
4520 	ASSERT(fe->fe_flags & FLOWENTF_DESTROYED);
4521 	ASSERT(fe->fe_flags & FLOWENTF_AOP_OFFLOAD);
4522 	ASSERT(fe->fe_flags & FLOWENTF_RXSTRC_PENDING);
4523 	os_atomic_andnot(&fe->fe_flags, FLOWENTF_RXSTRC_PENDING, relaxed);
4524 
4525 	TAILQ_REMOVE(rxstrc_head, fe, fe_rxstrc_link);
4526 	flow_entry_release(&fe);
4527 }
4528 
4529 static void
fsw_rxstrc_remove(struct flow_entry * fe)4530 fsw_rxstrc_remove(struct flow_entry *fe)
4531 {
4532 	struct nx_flowswitch *fsw = fe->fe_fsw;
4533 
4534 	LCK_MTX_ASSERT(&fsw->fsw_rxstrc_lock, LCK_MTX_ASSERT_OWNED);
4535 
4536 	fsw_rxstrc_remove_internal(&fsw->fsw_rxstrc_head, fe);
4537 	VERIFY(fsw->fsw_rxstrc_cnt != 0);
4538 	fsw->fsw_rxstrc_cnt--;
4539 }
4540 
4541 void
fsw_rxstrc_purge(struct nx_flowswitch * fsw)4542 fsw_rxstrc_purge(struct nx_flowswitch *fsw)
4543 {
4544 	struct flow_entry *fe, *tfe;
4545 
4546 	lck_mtx_lock(&fsw->fsw_rxstrc_lock);
4547 	TAILQ_FOREACH_SAFE(fe, &fsw->fsw_rxstrc_head, fe_rxstrc_link, tfe) {
4548 		fsw_rxstrc_remove(fe);
4549 	}
4550 	ASSERT(fsw->fsw_rxstrc_cnt == 0);
4551 	ASSERT(TAILQ_EMPTY(&fsw->fsw_rxstrc_head));
4552 	lck_mtx_unlock(&fsw->fsw_rxstrc_lock);
4553 }
4554 
/*
 * Scan every port on the flowswitch nexus for channels whose Rx ring
 * shows data enqueued but not dequeued for longer than
 * fsw_rx_stall_thresh seconds.  Each stall is counted and logged and,
 * when fsw_rx_stall_defunct is set, the owning channel is defuncted.
 */
static void
fsw_defunct_rx_stall_channel(struct nx_flowswitch *fsw)
{
	struct kern_nexus *nx;
	uint64_t now = net_uptime();	/* uptime, in seconds */

	nx = fsw->fsw_nx;

	/* Walk through all channels and check for Rx stall condition */
	/* uncrustify doesn't handle C blocks properly */
	/* BEGIN IGNORE CODESTYLE */
	nx_port_foreach(nx, ^(nexus_port_t nxport) {
		struct nexus_adapter *na = nx_port_get_na(nx, nxport);
		uint64_t elapsed, enqueue_ts, dequeue_ts;
		struct __kern_channel_ring *ring;
		struct kern_channel *ch;
		struct proc *p;

		/* skip ports with no adapter, no work yet, or no Rx rings */
		if (na == NULL || na->na_work_ts == 0 || na->na_rx_rings == NULL) {
			return;
		}
		ch = (struct kern_channel *)na->na_private;
		if (ch == NULL) {
			return;
		}
		ring = KR_SINGLE(na->na_rx_rings);
		enqueue_ts = ring->ckr_rx_enqueue_ts;
		dequeue_ts = ring->ckr_rx_dequeue_ts;
		/* Elapsed time since last Rx enqueue */
		elapsed = now - enqueue_ts;
		/*
		 * Stall: nothing has been dequeued since the last enqueue
		 * (dequeue_ts < enqueue_ts) and that enqueue is older than
		 * the configured threshold.
		 */
		if ((dequeue_ts < enqueue_ts) && (elapsed > fsw_rx_stall_thresh)) {
			p = proc_find(ch->ch_pid);
			if (p == NULL) {
				/* owning process is already gone */
				return;
			}
			if (fsw_rx_stall_defunct) {
				kern_channel_defunct(p, ch);
			}
			proc_rele(p);
			DTRACE_SKYWALK3(rx__stall, struct nx_flowswitch *, fsw,
			    struct nexus_adapter *, na, struct __kern_channel_ring *, ring);
			FSW_STATS_INC(FSW_STATS_RX_STALL);
			SK_ERR("Rx stall detected in proc %s(%d) (%s): "
			    "elapsed %llu (s), now: %llu, enqueue: %llu, dequeue: %llu, "
			    "defunct: %s",
			    ch->ch_name, ch->ch_pid, fsw->fsw_ifp->if_xname,
			    elapsed, now, enqueue_ts, dequeue_ts,
			    fsw_rx_stall_defunct ? "yes" : "no");
		}
	});
	/* END IGNORE CODESTYLE */
}
4607 
4608 void
fsw_reap_sched(struct nx_flowswitch * fsw)4609 fsw_reap_sched(struct nx_flowswitch *fsw)
4610 {
4611 	ASSERT(fsw->fsw_reap_thread != THREAD_NULL);
4612 	lck_mtx_lock_spin(&fsw->fsw_reap_lock);
4613 	if (!(fsw->fsw_reap_flags & FSW_REAPF_RUNNING) &&
4614 	    !(fsw->fsw_reap_flags & (FSW_REAPF_TERMINATING | FSW_REAPF_TERMINATED))) {
4615 		thread_wakeup((caddr_t)&fsw->fsw_reap_flags);
4616 	}
4617 	lck_mtx_unlock(&fsw->fsw_reap_lock);
4618 }
4619 
/*
 * Entry point of the per-flowswitch reaper thread.  Names the thread,
 * then parks it on fsw_reap_flags until the first fsw_reap_sched()
 * wakeup; every subsequent cycle runs in the continuation,
 * fsw_reap_thread_cont().  Never returns.
 */
__attribute__((noreturn))
static void
fsw_reap_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	struct nx_flowswitch *__single fsw = v;

	ASSERT(fsw->fsw_reap_thread == current_thread());
	/*
	 * -fbounds-safety: __unsafe_null_terminated_from_indexable provides
	 * checks to ensure source contains the null terminator, by doing a
	 * linear scan of the string.
	 */
	thread_set_thread_name(current_thread(),
	    __unsafe_null_terminated_from_indexable(fsw->fsw_reap_name));

	net_update_uptime();

	lck_mtx_lock(&fsw->fsw_reap_lock);
	VERIFY(!(fsw->fsw_reap_flags & FSW_REAPF_RUNNING));
	/* wait (no deadline) for the first scheduling wakeup */
	(void) assert_wait(&fsw->fsw_reap_flags, THREAD_UNINT);
	lck_mtx_unlock(&fsw->fsw_reap_lock);
	thread_block_parameter(fsw_reap_thread_cont, fsw);
	/* NOTREACHED */
	__builtin_unreachable();
}
4646 
4647 __attribute__((noreturn))
4648 static void
fsw_reap_thread_cont(void * v,wait_result_t wres)4649 fsw_reap_thread_cont(void *v, wait_result_t wres)
4650 {
4651 	struct nx_flowswitch *__single fsw = v;
4652 	boolean_t low;
4653 	uint64_t t = 0;
4654 
4655 	SK_DF(SK_VERB_FLOW, "%s: running", fsw->fsw_reap_name);
4656 
4657 	lck_mtx_lock(&fsw->fsw_reap_lock);
4658 	if (__improbable(wres == THREAD_INTERRUPTED ||
4659 	    (fsw->fsw_reap_flags & FSW_REAPF_TERMINATING) != 0)) {
4660 		goto terminate;
4661 	}
4662 
4663 	ASSERT(!(fsw->fsw_reap_flags & FSW_REAPF_TERMINATED));
4664 	fsw->fsw_reap_flags |= FSW_REAPF_RUNNING;
4665 	lck_mtx_unlock(&fsw->fsw_reap_lock);
4666 
4667 	net_update_uptime();
4668 
4669 	/* prevent detach from happening while we're here */
4670 	if (!fsw_detach_barrier_add(fsw)) {
4671 		SK_ERR("%s: netagent detached", fsw->fsw_reap_name);
4672 		t = 0;
4673 	} else {
4674 		uint32_t fe_nonviable, fe_freed, fe_aborted;
4675 		uint32_t fr_freed, fr_resid = 0;
4676 		struct ifnet *ifp = fsw->fsw_ifp;
4677 		uint64_t i = FSW_REAP_IVAL;
4678 		uint64_t now = net_uptime();
4679 		uint64_t last;
4680 
4681 		ASSERT(fsw->fsw_ifp != NULL);
4682 
4683 		/*
4684 		 * Pass 1: process any deferred {withdrawn,nonviable} requests.
4685 		 */
4686 		fe_nonviable = fsw_process_deferred(fsw);
4687 
4688 		/*
4689 		 * Pass 2: remove any expired lingering flows.
4690 		 */
4691 		fe_freed = fsw_process_linger(fsw, &fe_aborted);
4692 
4693 		/*
4694 		 * Pass 3: process any pending Rx steering rule cleanup flows
4695 		 */
4696 		fsw_process_rxstrc(fsw);
4697 
4698 		/*
4699 		 * Pass 4: prune idle flow routes.
4700 		 */
4701 		fr_freed = flow_route_prune(fsw->fsw_flow_mgr,
4702 		    ifp, &fr_resid);
4703 
4704 		/*
4705 		 * Pass 5: prune flow table
4706 		 *
4707 		 */
4708 		cuckoo_hashtable_try_shrink(fsw->fsw_flow_mgr->fm_flow_table);
4709 
4710 		SK_DF(SK_VERB_FLOW, "%s: fe_nonviable %u/%u fe_freed %u/%u "
4711 		    "fe_aborted %u fr_freed %u/%u",
4712 		    fsw->fsw_flow_mgr->fm_name, fe_nonviable,
4713 		    (fe_nonviable + fsw->fsw_pending_nonviable),
4714 		    fe_freed, fsw->fsw_linger_cnt, fe_aborted, fe_freed,
4715 		    (fe_freed + fr_resid));
4716 
4717 		/* see if VM memory level is critical */
4718 		low = skmem_lowmem_check();
4719 
4720 		/*
4721 		 * If things appear to be idle, we can prune away cached
4722 		 * object that have fallen out of the working sets (this
4723 		 * is different than purging).  Every once in a while, we
4724 		 * also purge the caches.  Note that this is done across
4725 		 * all flowswitch instances, and so we limit this to no
4726 		 * more than once every FSW_REAP_SK_THRES seconds.
4727 		 */
4728 		last = os_atomic_load(&fsw_reap_last, relaxed);
4729 		if ((low || (last != 0 && (now - last) >= FSW_REAP_SK_THRES)) &&
4730 		    os_atomic_cmpxchg(&fsw_reap_last, last, now, acq_rel)) {
4731 			fsw_purge_cache(fsw, low);
4732 
4733 			/* increase sleep interval if idle */
4734 			if (kdebug_enable == 0 && fsw->fsw_linger_cnt == 0 &&
4735 			    fsw->fsw_pending_nonviable == 0 && fr_resid == 0) {
4736 				i <<= 3;
4737 			}
4738 		} else if (last == 0) {
4739 			os_atomic_store(&fsw_reap_last, now, release);
4740 		}
4741 
4742 		/*
4743 		 * Additionally, run thru the list of channels and prune
4744 		 * or purge away cached objects on "idle" channels.  This
4745 		 * check is rate limited to no more than once every
4746 		 * FSW_DRAIN_CH_THRES seconds.
4747 		 */
4748 		last = fsw->fsw_drain_channel_chk_last;
4749 		if (low || (last != 0 && (now - last) >= FSW_DRAIN_CH_THRES)) {
4750 			SK_DF(SK_VERB_FLOW, "%s: pruning channels",
4751 			    fsw->fsw_flow_mgr->fm_name);
4752 
4753 			fsw->fsw_drain_channel_chk_last = now;
4754 			fsw_drain_channels(fsw, now, low);
4755 		} else if (__improbable(last == 0)) {
4756 			fsw->fsw_drain_channel_chk_last = now;
4757 		}
4758 
4759 		/*
4760 		 * Finally, invoke the interface's reap callback to
4761 		 * tell it to prune or purge away cached objects if
4762 		 * it is idle.  This check is rate limited to no more
4763 		 * than once every FSW_REAP_IF_THRES seconds.
4764 		 */
4765 		last = fsw->fsw_drain_netif_chk_last;
4766 		if (low || (last != 0 && (now - last) >= FSW_REAP_IF_THRES)) {
4767 			ASSERT(fsw->fsw_nifna != NULL);
4768 
4769 			if (ifp->if_na_ops != NULL &&
4770 			    ifp->if_na_ops->ni_reap != NULL) {
4771 				SK_DF(SK_VERB_FLOW, "%s: pruning netif",
4772 				    fsw->fsw_flow_mgr->fm_name);
4773 				ifp->if_na_ops->ni_reap(ifp->if_na, ifp,
4774 				    FSW_REAP_IF_THRES, low);
4775 			}
4776 
4777 			fsw->fsw_drain_netif_chk_last = now;
4778 		} else if (__improbable(last == 0)) {
4779 			fsw->fsw_drain_netif_chk_last = now;
4780 		}
4781 
4782 		/* emit periodic interface stats ktrace */
4783 		last = fsw->fsw_reap_last;
4784 		if (last != 0 && (now - last) >= FSW_IFSTATS_THRES) {
4785 			KDBG(SK_KTRACE_AON_IF_STATS, ifp->if_data.ifi_ipackets,
4786 			    ifp->if_data.ifi_ibytes * 8,
4787 			    ifp->if_data.ifi_opackets,
4788 			    ifp->if_data.ifi_obytes * 8);
4789 
4790 			fsw->fsw_reap_last = now;
4791 		} else if (__improbable(last == 0)) {
4792 			fsw->fsw_reap_last = now;
4793 		}
4794 
4795 		/* Check for Rx stall condition every fsw_rx_stall_thresh seconds */
4796 		last = fsw->fsw_rx_stall_chk_last;
4797 		if (fsw_rx_stall_thresh != 0) {
4798 			if (last != 0 && (now - last) >= fsw_rx_stall_thresh) {
4799 				fsw_defunct_rx_stall_channel(fsw);
4800 				fsw->fsw_rx_stall_chk_last = now;
4801 			} else if (__improbable(last == 0)) {
4802 				fsw->fsw_rx_stall_chk_last = now;
4803 			}
4804 		}
4805 
4806 		nanoseconds_to_absolutetime(i * NSEC_PER_SEC, &t);
4807 		clock_absolutetime_interval_to_deadline(t, &t);
4808 		ASSERT(t != 0);
4809 
4810 		/* allow any pending detach to proceed */
4811 		fsw_detach_barrier_remove(fsw);
4812 	}
4813 
4814 	lck_mtx_lock(&fsw->fsw_reap_lock);
4815 	if (!(fsw->fsw_reap_flags & FSW_REAPF_TERMINATING)) {
4816 		fsw->fsw_reap_flags &= ~FSW_REAPF_RUNNING;
4817 		(void) assert_wait_deadline(&fsw->fsw_reap_flags,
4818 		    THREAD_UNINT, t);
4819 		lck_mtx_unlock(&fsw->fsw_reap_lock);
4820 		thread_block_parameter(fsw_reap_thread_cont, fsw);
4821 		/* NOTREACHED */
4822 		__builtin_unreachable();
4823 	} else {
4824 terminate:
4825 		LCK_MTX_ASSERT(&fsw->fsw_reap_lock, LCK_MTX_ASSERT_OWNED);
4826 		fsw->fsw_reap_flags &= ~(FSW_REAPF_RUNNING | FSW_REAPF_TERMINATING);
4827 		fsw->fsw_reap_flags |= FSW_REAPF_TERMINATED;
4828 		/*
4829 		 * And signal any thread waiting for us to terminate;
4830 		 * wait channel here other than fsw_reap_flags to make
4831 		 * it more explicit.
4832 		 */
4833 		if (fsw->fsw_reap_flags & FSW_REAPF_TERMINATEBLOCK) {
4834 			thread_wakeup((caddr_t)&fsw->fsw_reap_thread);
4835 		}
4836 		lck_mtx_unlock(&fsw->fsw_reap_lock);
4837 
4838 		SK_DF(SK_VERB_FLOW, "%s: terminating", fsw->fsw_reap_name);
4839 
4840 		/* for the extra refcnt from kernel_thread_start() */
4841 		thread_deallocate(current_thread());
4842 		/* this is the end */
4843 		thread_terminate(current_thread());
4844 		/* NOTREACHED */
4845 		__builtin_unreachable();
4846 	}
4847 
4848 	/* must never get here */
4849 	VERIFY(0);
4850 	/* NOTREACHED */
4851 	__builtin_unreachable();
4852 }
4853 
/*
 * Walk every port on the flowswitch nexus and prune — or, when a
 * channel has been idle long enough or memory is low, purge — its
 * cached objects via na_drain().  `now' is the current uptime in
 * seconds; `low' indicates VM memory pressure.
 */
static void
fsw_drain_channels(struct nx_flowswitch *fsw, uint64_t now, boolean_t low)
{
	struct kern_nexus *nx = fsw->fsw_nx;

	/* flowswitch protects NA via fsw_lock, see fsw_port_alloc/free */
	FSW_RLOCK(fsw);

	/* uncrustify doesn't handle C blocks properly */
	/* BEGIN IGNORE CODESTYLE */
	nx_port_foreach(nx, ^(nexus_port_t p) {
		boolean_t purge;
		struct nexus_adapter *na = nx_port_get_na(nx, p);

		if (na == NULL) {
			DTRACE_SKYWALK1(ch__drain__na__null, struct nexus_adapter *, na);
			return;
		}

		/*
		 * If NA is deactivated, no need to proceed further with channel drain.
		 * Note: fsw_vp_na_activate takes FSW_WLOCK before clearing the
		 * NAF_ACTIVE flag.
		 */
		if ((na->na_flags & NAF_ACTIVE) == 0) {
			DTRACE_SKYWALK1(ch__drain__na__inactive, struct nexus_adapter *, na);
			return;
		}

		if (na->na_work_ts == 0 || na->na_rx_rings == NULL) {
			DTRACE_SKYWALK1(ch__drain__na__invalid, struct nexus_adapter *, na);
			return;
		}

		/*
		 * If some activity happened in the last FSW_DRAIN_CH_THRES
		 * seconds on this channel, we reclaim memory if the channel
		 * throughput is less than the reap threshold value.
		 */
		if ((now - na->na_work_ts) < FSW_DRAIN_CH_THRES) {
			struct __kern_channel_ring *__single ring;
			channel_ring_stats *stats;
			uint64_t bps;

			ring = KR_SINGLE(na->na_rx_rings);
			stats = &ring->ckr_stats;
			bps = stats->crs_bytes_per_second;

			/* busy channels are left alone; prune only slow ones */
			if (bps < fsw_channel_reap_thresh) {
				purge = FALSE;
				na_drain(na, purge);
			}
			return;
		}

		/*
		 * If NA has been inactive for some time (twice the drain
		 * threshold), we clear the work timestamp to temporarily skip
		 * this channel until it's active again.  Purging cached objects
		 * can be expensive since we'd need to allocate and construct
		 * them again, so we do it only when necessary.
		 */
		if (low || ((now - na->na_work_ts) >= (FSW_DRAIN_CH_THRES << 1))) {
			na->na_work_ts = 0;
			purge = TRUE;
		} else {
			purge = FALSE;
		}

		na_drain(na, purge);  /* purge/prune caches */
	});
	/* END IGNORE CODESTYLE */

	FSW_RUNLOCK(fsw);
}
4929 
4930 static void
fsw_purge_cache(struct nx_flowswitch * fsw,boolean_t low)4931 fsw_purge_cache(struct nx_flowswitch *fsw, boolean_t low)
4932 {
4933 #pragma unused(fsw)
4934 	uint64_t o = os_atomic_inc_orig(&fsw_want_purge, relaxed);
4935 	uint32_t p = fsw_flow_purge_thresh;
4936 	boolean_t purge = (low || (o != 0 && p != 0 && (o % p) == 0));
4937 
4938 	SK_DF(SK_VERB_FLOW, "%s: %s caches",
4939 	    fsw->fsw_flow_mgr->fm_name,
4940 	    (purge ? "purge" : "prune"));
4941 
4942 	skmem_cache_reap_now(sk_fo_cache, purge);
4943 	skmem_cache_reap_now(sk_fe_cache, purge);
4944 	skmem_cache_reap_now(sk_fab_cache, purge);
4945 	skmem_cache_reap_now(flow_route_cache, purge);
4946 	skmem_cache_reap_now(flow_stats_cache, purge);
4947 	netns_reap_caches(purge);
4948 	skmem_reap_caches(purge);
4949 
4950 #if CONFIG_MBUF_MCACHE
4951 	if (if_is_fsw_transport_netagent_enabled() && purge) {
4952 		mbuf_drain(FALSE);
4953 	}
4954 #endif /* CONFIG_MBUF_MCACHE */
4955 }
4956 
/*
 * Flag a flow for nonviable treatment because the interface entered
 * low power mode; the actual teardown is committed later by
 * fsw_process_deferred().  The 0->1 cmpxchg on fe_want_nonviable
 * guarantees fsw_pending_nonviable is incremented at most once per
 * outstanding request.
 */
static void
fsw_flow_handle_low_power(struct nx_flowswitch *fsw, struct flow_entry *fe)
{
	/* When the interface is in low power mode, the flow is nonviable */
	if (!(fe->fe_flags & FLOWENTF_NONVIABLE) &&
	    os_atomic_cmpxchg(&fe->fe_want_nonviable, 0, 1, acq_rel)) {
		os_atomic_inc(&fsw->fsw_pending_nonviable, relaxed);
	}
}
4966 
/*
 * Commit deferred per-flow requests: the low-power-mode edge trigger,
 * deferred half-close of port namespace reservations, and pending
 * withdraw/nonviable teardowns.  Work that must call out to NECP or
 * the netagent is staged on a local list and performed only after the
 * flow-owner bucket locks are dropped.  Returns the number of flows
 * torn down in this pass.
 */
static uint32_t
fsw_process_deferred(struct nx_flowswitch *fsw)
{
	struct flow_entry_dead sfed __sk_aligned(8);
	struct flow_mgr *fm = fsw->fsw_flow_mgr;
	struct flow_entry_dead *fed, *tfed;
	LIST_HEAD(, flow_entry_dead) fed_head =
	    LIST_HEAD_INITIALIZER(fed_head);
	uint32_t i, nonviable = 0;
	boolean_t lowpowermode = FALSE;

	bzero(&sfed, sizeof(sfed));

	/*
	 * The flows become nonviable when the interface
	 * is in low power mode (edge trigger)
	 */
	if ((fsw->fsw_ifp->if_xflags & IFXF_LOW_POWER) &&
	    fsw->fsw_ifp->if_low_power_gencnt != fsw->fsw_low_power_gencnt) {
		lowpowermode = TRUE;
		/* remember the generation so we trigger once per transition */
		fsw->fsw_low_power_gencnt = fsw->fsw_ifp->if_low_power_gencnt;
	}

	/*
	 * Scan thru the flow entry tree, and commit any pending withdraw or
	 * nonviable requests.  We may need to push stats and/or unassign the
	 * nexus from NECP, but we cannot do that while holding the locks;
	 * build a temporary list for those entries.
	 */
	for (i = 0; i < fm->fm_owner_buckets_cnt; i++) {
		struct flow_owner_bucket *fob = flow_mgr_get_fob_at_idx(fm, i);
		struct flow_owner *fo;

		/*
		 * Grab the lock at all costs when handling low power mode
		 */
		if (__probable(!lowpowermode)) {
			/* best effort: skip contended buckets this cycle */
			if (!FOB_TRY_LOCK(fob)) {
				continue;
			}
		} else {
			FOB_LOCK(fob);
		}

		FOB_LOCK_ASSERT_HELD(fob);
		RB_FOREACH(fo, flow_owner_tree, &fob->fob_owner_head) {
			struct flow_entry *fe;

			RB_FOREACH(fe, flow_entry_id_tree,
			    &fo->fo_flow_entry_id_head) {
				/* try first as reader; skip if we can't */
				if (__improbable(lowpowermode)) {
					fsw_flow_handle_low_power(fsw, fe);
				}
				if (__improbable(fe->fe_flags & FLOWENTF_HALF_CLOSED)) {
					os_atomic_andnot(&fe->fe_flags, FLOWENTF_HALF_CLOSED, relaxed);
					flow_namespace_half_close(&fe->fe_port_reservation);
				}

				/* if not withdrawn/nonviable, skip */
				if (!fe->fe_want_withdraw &&
				    !fe->fe_want_nonviable) {
					continue;
				}
				/*
				 * Here we're holding the lock as writer;
				 * don't spend too much time as we're
				 * blocking the data path now.
				 */
				ASSERT(!uuid_is_null(fe->fe_uuid));
				/* only need flow UUID and booleans */
				uuid_copy(sfed.fed_uuid, fe->fe_uuid);
				sfed.fed_want_clonotify =
				    (fe->fe_flags & FLOWENTF_CLOSE_NOTIFY);
				sfed.fed_want_nonviable = fe->fe_want_nonviable;
				flow_entry_teardown(fo, fe);

				/* do this outside the flow bucket lock */
				fed = flow_entry_dead_alloc(Z_WAITOK);
				ASSERT(fed != NULL);
				*fed = sfed;
				LIST_INSERT_HEAD(&fed_head, fed, fed_link);
			}
		}
		FOB_UNLOCK(fob);
	}

	/*
	 * These nonviable flows are no longer useful since we've lost
	 * the source IP address; in the event the client monitors the
	 * viability of the flow, explicitly mark it as nonviable so
	 * that a new flow can be created.
	 */
	LIST_FOREACH_SAFE(fed, &fed_head, fed_link, tfed) {
		LIST_REMOVE(fed, fed_link);
		ASSERT(fsw->fsw_agent_session != NULL);

		/* if flow is closed early */
		if (fed->fed_want_clonotify) {
			necp_client_early_close(fed->fed_uuid);
		}

		/* if nonviable, unassign nexus attributes */
		if (fed->fed_want_nonviable) {
			(void) netagent_assign_nexus(fsw->fsw_agent_session,
			    fed->fed_uuid, NULL, 0);
		}

		flow_entry_dead_free(fed);
		++nonviable;
	}
	ASSERT(LIST_EMPTY(&fed_head));

	return nonviable;
}
5082 
/*
 * Reaper pass over the linger list.  The whole list is spliced onto a
 * local list so RST generation and entry teardown happen without the
 * linger lock held; expired entries are freed, and any survivors are
 * re-spliced to the front of the flowswitch linger list, ahead of
 * entries added in the meantime.  Returns the number of entries freed
 * and stores the number of TCP RSTs generated in *abort.
 */
static uint32_t
fsw_process_linger(struct nx_flowswitch *fsw, uint32_t *abort)
{
	struct flow_entry_linger_head linger_head =
	    TAILQ_HEAD_INITIALIZER(linger_head);
	struct flow_entry *fe, *tfe;
	uint64_t now = net_uptime();
	uint32_t i = 0, cnt = 0, freed = 0;

	ASSERT(fsw->fsw_ifp != NULL);
	ASSERT(abort != NULL);
	*abort = 0;

	/*
	 * We don't want to contend with the datapath, so move
	 * everything that's in the linger list into a local list.
	 * This allows us to generate RSTs or free the flow entry
	 * outside the lock.  Any remaining flow entry in the local
	 * list will get re-added back to the head of the linger
	 * list, in front of any new ones added since then.
	 */
	lck_mtx_lock(&fsw->fsw_linger_lock);
	TAILQ_CONCAT(&linger_head, &fsw->fsw_linger_head, fe_linger_link);
	ASSERT(TAILQ_EMPTY(&fsw->fsw_linger_head));
	cnt = fsw->fsw_linger_cnt;
	fsw->fsw_linger_cnt = 0;
	lck_mtx_unlock(&fsw->fsw_linger_lock);

	TAILQ_FOREACH_SAFE(fe, &linger_head, fe_linger_link, tfe) {
		ASSERT(fe->fe_flags & FLOWENTF_TORN_DOWN);
		ASSERT(fe->fe_flags & FLOWENTF_DESTROYED);
		ASSERT(fe->fe_flags & FLOWENTF_LINGERING);

		/*
		 * See if this is a TCP flow that needs to generate
		 * a RST to the remote peer (if not already).
		 */
		if (flow_track_tcp_want_abort(fe)) {
			VERIFY(fe->fe_flags & FLOWENTF_ABORTED);
			ASSERT(!uuid_is_null(fe->fe_uuid));
			flow_track_abort_tcp(fe, NULL, NULL);
			(*abort)++;
			SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]);
			SK_DF(SK_VERB_FLOW, "fe \"%s\" [RST]",
			    fe2str(fe, dbgbuf, sizeof(dbgbuf)));
		}

		/*
		 * If flow has expired, remove from list and free;
		 * otherwise leave it around in the linger list.
		 */
		if (fe->fe_linger_expire <= now) {
			freed++;
			fsw_linger_remove_internal(&linger_head, fe);
			fe = NULL;	/* released; do not touch */
		}
		++i;
	}
	/* every entry we took off the list must have been visited */
	VERIFY(i == cnt && cnt >= freed);

	/*
	 * Add any remaining ones back into the linger list.
	 */
	lck_mtx_lock(&fsw->fsw_linger_lock);
	if (!TAILQ_EMPTY(&linger_head)) {
		ASSERT(TAILQ_EMPTY(&fsw->fsw_linger_head) || fsw->fsw_linger_cnt);
		TAILQ_CONCAT(&linger_head, &fsw->fsw_linger_head, fe_linger_link);
		ASSERT(TAILQ_EMPTY(&fsw->fsw_linger_head));
		TAILQ_CONCAT(&fsw->fsw_linger_head, &linger_head, fe_linger_link);
		fsw->fsw_linger_cnt += (cnt - freed);
	}
	ASSERT(TAILQ_EMPTY(&linger_head));
	lck_mtx_unlock(&fsw->fsw_linger_lock);

	return freed;
}
5159 
/*
 * Reaper pass over the Rx steering rule cleanup list: splice the whole
 * list onto a local one, then run the steering rule cleanup for each
 * entry and release it, all without the rxstrc lock held.
 */
static void
fsw_process_rxstrc(struct nx_flowswitch *fsw)
{
	struct flow_entry_rxstrc_head rxstrc_head =
	    TAILQ_HEAD_INITIALIZER(rxstrc_head);
	struct flow_entry *fe, *tfe;

	/*
	 * We don't want to contend with the datapath, so move
	 * everything that's in the rxstrc list into a local list.
	 * This allows us to cleanup Rx steering rules or free the flow entry
	 * outside the lock.
	 */
	lck_mtx_lock(&fsw->fsw_rxstrc_lock);
	TAILQ_CONCAT(&rxstrc_head, &fsw->fsw_rxstrc_head, fe_rxstrc_link);
	ASSERT(TAILQ_EMPTY(&fsw->fsw_rxstrc_head));
	fsw->fsw_rxstrc_cnt = 0;
	lck_mtx_unlock(&fsw->fsw_rxstrc_lock);

	TAILQ_FOREACH_SAFE(fe, &rxstrc_head, fe_rxstrc_link, tfe) {
		ASSERT(fe->fe_flags & FLOWENTF_TORN_DOWN);
		ASSERT(fe->fe_flags & FLOWENTF_DESTROYED);
		ASSERT(fe->fe_flags & FLOWENTF_RXSTRC_PENDING);
		ASSERT(fe->fe_flags & FLOWENTF_AOP_OFFLOAD);

		flow_entry_rx_steering_rule_cleanup(fsw, fe);
		fsw_rxstrc_remove_internal(&rxstrc_head, fe);
		fe = NULL;	/* released; do not touch */
	}
}
5190 
5191 __attribute__((always_inline))
5192 static inline void
fsw_ifp_inc_traffic_class_in_pkt(struct ifnet * ifp,kern_packet_t ph)5193 fsw_ifp_inc_traffic_class_in_pkt(struct ifnet *ifp, kern_packet_t ph)
5194 {
5195 	switch (__packet_get_traffic_class(ph)) {
5196 	case PKT_TC_BE:
5197 		ifp->if_tc.ifi_ibepackets++;
5198 		ifp->if_tc.ifi_ibebytes += SK_PTR_ADDR_KPKT(ph)->pkt_length;
5199 		break;
5200 	case PKT_TC_BK:
5201 		ifp->if_tc.ifi_ibkpackets++;
5202 		ifp->if_tc.ifi_ibkbytes += SK_PTR_ADDR_KPKT(ph)->pkt_length;
5203 		break;
5204 	case PKT_TC_VI:
5205 		ifp->if_tc.ifi_ivipackets++;
5206 		ifp->if_tc.ifi_ivibytes += SK_PTR_ADDR_KPKT(ph)->pkt_length;
5207 		break;
5208 	case PKT_TC_VO:
5209 		ifp->if_tc.ifi_ivopackets++;
5210 		ifp->if_tc.ifi_ivobytes += SK_PTR_ADDR_KPKT(ph)->pkt_length;
5211 		break;
5212 	default:
5213 		break;
5214 	}
5215 }
5216 
5217 __attribute__((always_inline))
5218 static inline void
fsw_ifp_inc_traffic_class_out_pkt(struct ifnet * ifp,uint32_t svc,uint32_t cnt,uint32_t len)5219 fsw_ifp_inc_traffic_class_out_pkt(struct ifnet *ifp, uint32_t svc,
5220     uint32_t cnt, uint32_t len)
5221 {
5222 	switch (svc) {
5223 	case PKT_TC_BE:
5224 		ifp->if_tc.ifi_obepackets += cnt;
5225 		ifp->if_tc.ifi_obebytes += len;
5226 		break;
5227 	case PKT_TC_BK:
5228 		ifp->if_tc.ifi_obkpackets += cnt;
5229 		ifp->if_tc.ifi_obkbytes += len;
5230 		break;
5231 	case PKT_TC_VI:
5232 		ifp->if_tc.ifi_ovipackets += cnt;
5233 		ifp->if_tc.ifi_ovibytes += len;
5234 		break;
5235 	case PKT_TC_VO:
5236 		ifp->if_tc.ifi_ovopackets += cnt;
5237 		ifp->if_tc.ifi_ovobytes += len;
5238 		break;
5239 	default:
5240 		break;
5241 	}
5242 }
5243