xref: /xnu-12377.81.4/bsd/skywalk/nexus/flowswitch/fsw_dp.c (revision 043036a2b3718f7f0be807e2870f8f47d3fa0796)
/*
 * Copyright (c) 2015-2023 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/*
 * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   1. Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *   2. Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in the
 *      documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 *  BSD LICENSE
 *
 * Copyright(c) 2015 NEC Europe Ltd. All rights reserved.
 *  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 *  modification, are permitted provided that the following conditions
 *  are met:
 *
 *    * Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *    * Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in
 *      the documentation and/or other materials provided with the
 *      distribution.
 *    * Neither the name of NEC Europe Ltd. nor the names of
 *      its contributors may be used to endorse or promote products derived
 *      from this software without specific prior written permission.
 *
 *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <skywalk/os_skywalk_private.h>
#include <skywalk/nexus/flowswitch/nx_flowswitch.h>
#include <skywalk/nexus/flowswitch/fsw_var.h>
#include <skywalk/nexus/netif/nx_netif.h>
#include <skywalk/nexus/netif/nx_netif_compat.h>
#include <kern/sched_prim.h>
#include <kern/uipc_domain.h>
#include <sys/kdebug.h>
#include <sys/sdt.h>
#include <net/bpf.h>
#include <net/if_ports_used.h>
#include <net/pktap.h>
#include <net/droptap.h>
#include <net/pktsched/pktsched_netem.h>
#include <netinet/tcp.h>
#include <netinet/udp.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/in_var.h>

extern kern_return_t thread_terminate(thread_t);

#define FSW_ZONE_MAX                  256
#define FSW_ZONE_NAME                 "skywalk.nx.fsw"

static uint64_t fsw_reap_last __sk_aligned(8);
static uint64_t fsw_want_purge __sk_aligned(8);

#define NX_FSW_FE_TABLESZ       256     /* some power of 2 */
static uint32_t fsw_fe_table_size = NX_FSW_FE_TABLESZ;

#define NX_FSW_FOB_HASHSZ       31      /* some Mersenne prime */
static uint32_t fsw_flow_owner_buckets = NX_FSW_FOB_HASHSZ;

#define NX_FSW_FRB_HASHSZ       128     /* some power of 2 */
static uint32_t fsw_flow_route_buckets = NX_FSW_FRB_HASHSZ;

#define NX_FSW_FRIB_HASHSZ      13      /* some small prime */
static uint32_t fsw_flow_route_id_buckets = NX_FSW_FRIB_HASHSZ;

#define NX_FSW_FLOW_REAP_INTERVAL 1     /* seconds */
static uint32_t fsw_flow_reap_interval = NX_FSW_FLOW_REAP_INTERVAL;

#define NX_FSW_RX_STALL_THRES   0       /* seconds (0 = disable) */
static uint32_t fsw_rx_stall_thresh = NX_FSW_RX_STALL_THRES;

#define NX_FSW_RX_STALL_DEFUNCT 1       /* defunct Rx-stalled channel (0 = disable) */
static uint32_t fsw_rx_stall_defunct = NX_FSW_RX_STALL_DEFUNCT;

#define NX_FSW_FLOW_PURGE_THRES 0       /* purge every N reaps (0 = disable) */
static uint32_t fsw_flow_purge_thresh = NX_FSW_FLOW_PURGE_THRES;

#define FSW_REAP_IVAL            (MAX(1, fsw_flow_reap_interval))
#define FSW_REAP_SK_THRES        (FSW_REAP_IVAL << 5)
#define FSW_REAP_IF_THRES        (FSW_REAP_IVAL << 5)
#define FSW_DRAIN_CH_THRES       (FSW_REAP_IVAL << 5)
#define FSW_IFSTATS_THRES        1

#define NX_FSW_CHANNEL_REAP_THRES 1000  /* threshold (bytes/sec) for reaping */
uint64_t fsw_channel_reap_thresh = NX_FSW_CHANNEL_REAP_THRES;

#define RX_BUFLET_BATCH_COUNT 64 /* max batch size for buflet allocation */

uint32_t fsw_rx_batch = NX_FSW_RXBATCH; /* # of packets per batch (RX) */
uint32_t fsw_tx_batch = NX_FSW_TXBATCH; /* # of packets per batch (TX) */
uint32_t fsw_gso_batch = 8;
#if (DEVELOPMENT || DEBUG)
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, rx_batch,
    CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_rx_batch, 0,
    "flowswitch Rx batch size");
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, tx_batch,
    CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_tx_batch, 0,
    "flowswitch Tx batch size");
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, gso_batch,
    CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_gso_batch, 0,
    "flowswitch GSO batch size");
SYSCTL_QUAD(_kern_skywalk_flowswitch, OID_AUTO, reap_throughput,
    CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_channel_reap_thresh,
    "flowswitch channel reap threshold throughput (bytes/sec)");
#endif /* DEVELOPMENT || DEBUG */

SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, rx_agg_tcp,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sk_fsw_rx_agg_tcp, 0,
    "flowswitch RX aggregation for tcp flows (enable/disable)");
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, rx_agg_tcp_host,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sk_fsw_rx_agg_tcp_host, 0,
    "flowswitch RX aggregation for tcp kernel path (0/1/2 (off/on/auto))");
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, gso_mtu,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sk_fsw_gso_mtu, 0,
    "flowswitch GSO for tcp flows (mtu > 0: enable, mtu == 0: disable)");

/*
 * IP reassembly
 * The "kern.skywalk.flowswitch.ip_reass" sysctl can be used to force
 * enable/disable the reassembly routine regardless of whether the
 * transport netagent is enabled or not.
 *
 * 'fsw_ip_reass' is a tri-state:
 *    0 means force IP reassembly off
 *    1 means force IP reassembly on
 *    2 means don't force the value, use what's appropriate for this flowswitch
 */
#define FSW_IP_REASS_FORCE_OFF          0
#define FSW_IP_REASS_FORCE_ON           1
#define FSW_IP_REASS_AUTO               2

uint32_t fsw_ip_reass = FSW_IP_REASS_AUTO;
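
/*
 * Illustrative usage of the knob above (a sketch, assuming sufficient
 * privilege; the handler below rejects values > FSW_IP_REASS_AUTO with
 * EINVAL):
 *
 *   sysctl -w kern.skywalk.flowswitch.ip_reass=1   # force reassembly on
 *   sysctl -w kern.skywalk.flowswitch.ip_reass=2   # back to auto
 */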

static int
fsw_ip_reass_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	unsigned int new_value;
	int changed;
	int error;

	error = sysctl_io_number(req, fsw_ip_reass, sizeof(fsw_ip_reass),
	    &new_value, &changed);
	if (error == 0 && changed != 0) {
		if (new_value > FSW_IP_REASS_AUTO) {
			return EINVAL;
		}
		fsw_ip_reass = new_value;
	}
	return error;
}

SYSCTL_PROC(_kern_skywalk_flowswitch, OID_AUTO, ip_reass,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    0, 0, fsw_ip_reass_sysctl, "IU",
    "adjust flowswitch IP reassembly");

#if (DEVELOPMENT || DEBUG)
static uint64_t _fsw_inject_error = 0;
#define _FSW_INJECT_ERROR(_en, _ev, _ec, _f, ...) \
	_SK_INJECT_ERROR(_fsw_inject_error, _en, _ev, _ec, \
	&FSW_STATS_VAL(_FSW_STATS_ERROR_INJECTIONS), _f, __VA_ARGS__)

#define _FSW_INJECT_ERROR_SET(_en, _f, ...) do { \
	if (__improbable(((_fsw_inject_error) & (1ULL << (_en))) != 0)) { \
	        SK_DF(SK_VERB_ERROR_INJECT, "injecting error %d", (_en));\
	        if ((_f) != NULL)                                       \
	                (_f)(__VA_ARGS__);                              \
	}                                                               \
} while (0)

SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, flow_owner_buckets,
    CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_flow_owner_buckets, 0, "");
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, fe_table_size,
    CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_fe_table_size, 0, "");
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, flow_route_buckets,
    CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_flow_route_buckets, 0, "");
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO,
    flow_route_id_buckets, CTLFLAG_RW | CTLFLAG_LOCKED,
    &fsw_flow_route_id_buckets, 0, "");
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, flow_reap_interval,
    CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_flow_reap_interval, 0, "");
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, rx_stall_thresh,
    CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_rx_stall_thresh, 0, "");
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, rx_stall_defunct,
    CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_rx_stall_defunct, 0, "");
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, flow_purge_thresh,
    CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_flow_purge_thresh, 0, "");
SYSCTL_QUAD(_kern_skywalk_flowswitch, OID_AUTO, fsw_inject_error,
    CTLFLAG_RW | CTLFLAG_LOCKED, &_fsw_inject_error, "");
#else
#define _FSW_INJECT_ERROR(_en, _ev, _ec, _f, ...) do { } while (0)
#define _FSW_INJECT_ERROR_SET(_en, _f, ...) do { } while (0)
#endif /* !DEVELOPMENT && !DEBUG */
256 
257 static void fsw_linger_remove_internal(struct flow_entry_linger_head *,
258     struct flow_entry *);
259 static void fsw_reap_thread_func(void *, wait_result_t);
260 static void fsw_reap_thread_cont(void *, wait_result_t);
261 static void fsw_purge_cache(struct nx_flowswitch *, boolean_t);
262 static void fsw_drain_channels(struct nx_flowswitch *, uint64_t, boolean_t);
263 static uint32_t fsw_process_deferred(struct nx_flowswitch *);
264 static uint32_t fsw_process_linger(struct nx_flowswitch *, uint32_t *);
265 static void fsw_process_rxstrc(struct nx_flowswitch *);
266 
267 static int copy_packet_from_dev(struct nx_flowswitch *, struct __kern_packet *,
268     struct __kern_packet *);
269 
270 static void fsw_ifp_inc_traffic_class_in_pkt(struct ifnet *, kern_packet_t);
271 static void fsw_ifp_inc_traffic_class_out_pkt(struct ifnet *, uint32_t,
272     uint32_t, uint32_t);
273 
274 static int __fsw_dp_inited = 0;
275 
276 int
fsw_dp_init(void)277 fsw_dp_init(void)
278 {
279 	static_assert(FSW_VP_DEV == 0);
280 	static_assert(FSW_VP_HOST == 1);
281 	static_assert((FSW_VP_HOST + FSW_VP_DEV) < FSW_VP_USER_MIN);
282 	static_assert((FSW_VP_HOST + FSW_VP_DEV) < NEXUS_PORT_FLOW_SWITCH_CLIENT);
283 
284 	ASSERT(!__fsw_dp_inited);
285 
286 	flow_mgr_init();
287 	flow_init();
288 
289 	__fsw_dp_inited = 1;
290 
291 	return 0;
292 }
293 
294 void
fsw_dp_uninit(void)295 fsw_dp_uninit(void)
296 {
297 	if (__fsw_dp_inited) {
298 		flow_fini();
299 		flow_mgr_fini();
300 
301 		__fsw_dp_inited = 0;
302 	}
303 }
304 
305 static void
dp_free_pktq(struct nx_flowswitch * fsw __sk_unused,struct pktq * pktq)306 dp_free_pktq(struct nx_flowswitch *fsw __sk_unused, struct pktq *pktq)
307 {
308 	pp_free_pktq(pktq);
309 }
310 
311 #define dp_drop_pktq(fsw, pktq, outgoing, _reason, line, _flags) do {         \
312 	uint32_t _len = KPKTQ_LEN(pktq);                                      \
313 	if (KPKTQ_EMPTY(pktq)) {                                              \
314 	        ASSERT(_len == 0);                                            \
315 	        break;                                                        \
316 	}                                                                     \
317 	SK_DF(SK_VERB_FSW_DP | SK_VERB_DROP, "drop %d packets", _len);        \
318 	FSW_STATS_ADD(FSW_STATS_DROP, _len);                                  \
319 	DTRACE_SKYWALK1(fsw__dp__drop, int, _len);                            \
320 	if (__probable(droptap_total_tap_count == 0)) {                       \
321 	        dp_free_pktq(fsw, pktq);                                      \
322 	        break;                                                        \
323 	}                                                                     \
324 	drop_func_t dropfunc;                                                 \
325 	dropfunc = (outgoing) ? droptap_output_packet : droptap_input_packet; \
326 	struct __kern_packet *kpkt = KPKTQ_FIRST(pktq);                       \
327 	struct __kern_packet *next_pkt;                                       \
328 	for (; kpkt != NULL; kpkt = next_pkt) {                               \
329 	        next_pkt = kpkt->pkt_nextpkt;                                 \
330 	        dropfunc(SK_PKT2PH(kpkt), _reason, __func__, line, _flags,    \
331 	            fsw->fsw_ifp, kpkt->pkt_qum.qum_pid, NULL, -1, NULL,      \
332 	            0, 0);                                                    \
333 	}                                                                     \
334 	dp_free_pktq(fsw, pktq);                                              \
335 } while (0)
336 
337 #define dp_drop_pkt_single_nofree(fsw, pkt, outgoing, _reason, _flags) do { \
338 	SK_DF(SK_VERB_FSW_DP | SK_VERB_DROP, "drop 1 packet");                \
339 	FSW_STATS_ADD(FSW_STATS_DROP, 1);                                     \
340 	if (__probable(droptap_total_tap_count == 0)) {                       \
341 	        break;                                                        \
342 	}                                                                     \
343 	drop_func_t dropfunc;                                                 \
344 	dropfunc = (outgoing) ? droptap_output_packet : droptap_input_packet; \
345 	dropfunc(SK_PKT2PH(pkt), _reason, __func__, __LINE__, _flags,         \
346 	    fsw->fsw_ifp, (pkt)->pkt_qum.qum_pid, NULL, -1, NULL, 0, 0);      \
347 } while (0)
348 
349 #define dp_drop_pkt_single(fsw, pkt, outgoing, _reason, _flags) do {          \
350 	SK_DF(SK_VERB_FSW_DP | SK_VERB_DROP, "drop 1 packet");                \
351 	FSW_STATS_ADD(FSW_STATS_DROP, 1);                                     \
352 	if (__probable(droptap_total_tap_count == 0)) {                       \
353 	        pp_free_packet_single(pkt);                                   \
354 	        break;                                                        \
355 	}                                                                     \
356 	drop_func_t dropfunc;                                                 \
357 	dropfunc = (outgoing) ? droptap_output_packet : droptap_input_packet; \
358 	dropfunc(SK_PKT2PH(pkt), _reason, __func__, __LINE__, _flags,         \
359 	    fsw->fsw_ifp, (pkt)->pkt_qum.qum_pid, NULL, -1, NULL, 0, 0);      \
360 	pp_free_packet_single(pkt);                                           \
361 } while (0)
362 
363 #define dp_drop_pkt_chain(pkt, outgoing, _reason, _flags) do {                \
364 	if (__probable(droptap_total_tap_count == 0)) {                       \
365 	        pp_free_packet_chain(pkt, NULL);                              \
366 	        break;                                                        \
367 	}                                                                     \
368 	drop_func_t dropfunc;                                                 \
369 	dropfunc = (outgoing) ? droptap_output_packet : droptap_input_packet; \
370 	struct __kern_packet *next_pkt;                                       \
371 	for (; pkt != NULL; pkt = next_pkt) {                                 \
372 	        next_pkt = pkt->pkt_nextpkt;                                  \
373 	        dropfunc(SK_PKT2PH(pkt), _reason, __func__, __LINE__, _flags, \
374 	            NULL, pkt->pkt_qum.qum_pid, NULL, -1, NULL,               \
375 	            0, 0);                                                    \
376 	}                                                                     \
377 	pp_free_packet_chain(pkt, NULL);                                      \
378 } while (0)
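
/*
 * Minimal usage sketch for the drop helpers above (illustrative only;
 * DROP_REASON_EXAMPLE is a hypothetical reason code, not one defined
 * here). On the Rx path a caller might write:
 *
 *	if (__improbable(err != 0)) {
 *		// frees the packet, bumps FSW_STATS_DROP, and reports the
 *		// drop to any attached droptap instance
 *		dp_drop_pkt_single(fsw, pkt, false, DROP_REASON_EXAMPLE, 0);
 *	}
 */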


SK_NO_INLINE_ATTRIBUTE
void
fsw_snoop(struct nx_flowswitch *fsw, struct flow_entry *fe, struct pktq *pktq,
    bool input)
{
	pid_t pid;
	char proc_name_buf[FLOW_PROCESS_NAME_LENGTH];
	const char *__null_terminated proc_name = NULL;
	pid_t epid;
	char eproc_name_buf[FLOW_PROCESS_NAME_LENGTH];
	const char *__null_terminated eproc_name = NULL;
	sa_family_t af;
	bool tap_early = false;
	struct __kern_packet *pkt;

	ASSERT(fe != NULL);
	ASSERT(fsw->fsw_ifp != NULL);

	if (fe->fe_nx_port == FSW_VP_HOST) {
		/* allow packets to be tapped before aggregation happens */
		tap_early = (input && fe->fe_key.fk_proto == IPPROTO_TCP);
		if (!tap_early) {
			/* all other traffic will be tapped in the dlil input path */
			return;
		}
	}
	if (fe->fe_key.fk_ipver == IPVERSION) {
		af = AF_INET;
	} else if (fe->fe_key.fk_ipver == IPV6_VERSION) {
		af = AF_INET6;
	} else {
		return;
	}

	pid = fe->fe_pid;
	if (fe->fe_proc_name[0] != '\0') {
		proc_name = strbufcpy(proc_name_buf, sizeof(proc_name_buf),
		    fe->fe_proc_name, sizeof(fe->fe_proc_name));
	}
	epid = fe->fe_epid;
	if (fe->fe_eproc_name[0] != '\0') {
		eproc_name = strbufcpy(eproc_name_buf, sizeof(eproc_name_buf),
		    fe->fe_eproc_name, sizeof(fe->fe_eproc_name));
	}
	if (input) {
		KPKTQ_FOREACH(pkt, pktq) {
			pktap_input_packet(fsw->fsw_ifp, af,
			    fsw->fsw_ifp_dlt, pid, proc_name, epid,
			    eproc_name, SK_PKT2PH(pkt), NULL, 0,
			    IPPROTO_TCP, fe->fe_flowid,
			    tap_early ? PTH_FLAG_SOCKET : PTH_FLAG_NEXUS_CHAN);
		}
	} else {
		KPKTQ_FOREACH(pkt, pktq) {
			pktap_output_packet(fsw->fsw_ifp, af,
			    fsw->fsw_ifp_dlt, pid, proc_name, epid,
			    eproc_name, SK_PKT2PH(pkt), NULL, 0,
			    0, 0, PTH_FLAG_NEXUS_CHAN);
		}
	}
}

#if (DEVELOPMENT || DEBUG)
static void
_fsw_error35_handler(int step, struct flow_route *fr, struct __kern_packet *pkt,
    int *ret)
{
	static boolean_t _err35_flag_modified = FALSE;

	switch (step) {
	case 1:
		if ((fr->fr_flags & (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO)) ==
		    (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO)) {
			fr->fr_flags &= ~FLOWRTF_RESOLVED;
			_err35_flag_modified = TRUE;
		}
		break;

	case 2:
		if (!_err35_flag_modified) {
			return;
		}
		if (pkt->pkt_pflags & PKT_F_MBUF_DATA) {
			m_freem(pkt->pkt_mbuf);
			pkt->pkt_pflags &= ~PKT_F_MBUF_DATA;
			pkt->pkt_mbuf = NULL;
		}
		*ret = EJUSTRETURN;
		fr->fr_flags |= FLOWRTF_RESOLVED;
		_err35_flag_modified = FALSE;
		break;

	default:
		VERIFY(0);
		/* not reached */
	}
}

static void
_fsw_error36_handler(int step, struct flow_route *fr, int *ret)
{
	static boolean_t _err36_flag_modified = FALSE;

	switch (step) {
	case 1:
		if ((fr->fr_flags & (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO)) ==
		    (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO)) {
			fr->fr_flags &= ~FLOWRTF_RESOLVED;
			_err36_flag_modified = TRUE;
		}
		break;

	case 2:
		if (!_err36_flag_modified) {
			return;
		}
		*ret = ENETUNREACH;
		fr->fr_flags |= FLOWRTF_RESOLVED;
		_err36_flag_modified = FALSE;
		break;

	default:
		VERIFY(0);
		/* not reached */
	}
}
#else /* !DEVELOPMENT && !DEBUG */
#define _fsw_error35_handler(...)
#define _fsw_error36_handler(...)
#endif /* DEVELOPMENT || DEBUG */

/*
 * Check if the source packet content can fit into the destination
 * ring's packet. Returns TRUE if the source packet can fit.
 * Note: failures could be caused by misconfigured packet pool sizes,
 * a missing packet size check against the MTU, or a source packet from
 * a compat netif whose attached mbuf is larger than the MTU due to LRO.
 */
static inline boolean_t
validate_pkt_len(struct __kern_packet *spkt, kern_packet_t dph,
    uint32_t skip_l2hlen, uint32_t l2hlen, uint16_t headroom,
    uint32_t *copy_len)
{
	uint32_t tlen = 0;
	uint32_t splen = spkt->pkt_length - skip_l2hlen;

	if (l2hlen != 0) {
		VERIFY(skip_l2hlen == 0);
		tlen += l2hlen;
	} else if ((spkt->pkt_link_flags & PKT_LINKF_ETHFCS) != 0) {
		splen -= ETHER_CRC_LEN;
	}

	tlen += splen;
	*copy_len = splen;

	return tlen <= ((__packet_get_buflet_count(dph) *
	       PP_BUF_SIZE_DEF(SK_PTR_ADDR_KPKT(dph)->pkt_qum.qum_pp)) -
	       headroom);
}
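
/*
 * Worked example (illustrative numbers): with 2 buflets of a 2048-byte
 * default buffer size and zero headroom, the destination can hold
 * 2 * 2048 = 4096 bytes. A 4100-byte source packet carrying an Ethernet
 * FCS (PKT_LINKF_ETHFCS) sheds ETHER_CRC_LEN (4) bytes first, so
 * tlen == 4096 and the check passes; without the FCS flag it would fail.
 */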

#if SK_LOG
/* Hoisted out of line to reduce kernel stack footprint */
SK_LOG_ATTRIBUTE
static void
copy_packet_from_dev_log(struct __kern_packet *spkt,
    struct __kern_packet *dpkt, struct proc *p)
{
	uint64_t logflags = ((SK_VERB_FSW | SK_VERB_RX) |
	    ((spkt->pkt_pflags & PKT_F_MBUF_DATA) ?
	    SK_VERB_COPY_MBUF : SK_VERB_COPY));
	char *daddr;
	uint32_t pkt_len;

	MD_BUFLET_ADDR_ABS(dpkt, daddr);
	pkt_len = __packet_get_real_data_length(dpkt);
	SK_DF(logflags, "%s(%d) splen %u dplen %u hr %u l2 %u",
	    sk_proc_name(p), sk_proc_pid(p), spkt->pkt_length,
	    dpkt->pkt_length, (uint32_t)dpkt->pkt_headroom,
	    (uint32_t)dpkt->pkt_l2_len);
	SK_DF(logflags | SK_VERB_DUMP, "%s",
	    sk_dump("buf", daddr, pkt_len, 128));
}
#else
#define copy_packet_from_dev_log(...)
#endif /* SK_LOG */


static inline int
copy_packet_from_dev(struct nx_flowswitch *fsw, struct __kern_packet *spkt,
    struct __kern_packet *dpkt)
{
	/*
	 * source and destination nexus don't share the packet pool
	 * sync operation here is to
	 * - alloc packet for the rx(dst) ring
	 * - copy data/metadata from src packet to dst packet
	 * - attach alloc'd packet to rx(dst) ring
	 */
	kern_packet_t dph = SK_PTR_ENCODE(dpkt,
	    METADATA_TYPE(dpkt), METADATA_SUBTYPE(dpkt));
	kern_packet_t sph = SK_PTR_ENCODE(spkt, METADATA_TYPE(spkt),
	    METADATA_SUBTYPE(spkt));
	boolean_t do_cksum_rx;
	uint16_t skip_l2h_len = spkt->pkt_l2_len;
	uint16_t iphlen;
	uint32_t dlen;
	int err;

	if (__improbable(!validate_pkt_len(spkt, dph, skip_l2h_len, 0, 0,
	    &dlen))) {
		SK_ERR("bufcnt %d, bufsz %d", __packet_get_buflet_count(dph),
		    PP_BUF_SIZE_DEF(dpkt->pkt_qum.qum_pp));
		FSW_STATS_INC(FSW_STATS_RX_COPY_BAD_LEN);
		return EINVAL;
	}

	/* Copy packet metadata */
	_QUM_COPY(&(spkt)->pkt_qum, &(dpkt)->pkt_qum);
	_PKT_COPY(spkt, dpkt);
	ASSERT(!(dpkt->pkt_qum.qum_qflags & QUM_F_KERNEL_ONLY) ||
	    PP_KERNEL_ONLY(dpkt->pkt_qum.qum_pp));
	ASSERT(dpkt->pkt_mbuf == NULL);

	dpkt->pkt_headroom = 0;
	dpkt->pkt_l2_len = 0;

	/* don't include IP header from partial sum */
	if (__probable((spkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED) != 0)) {
		iphlen = spkt->pkt_flow_ip_hlen;
		do_cksum_rx = sk_cksum_rx;
	} else {
		iphlen = 0;
		do_cksum_rx = FALSE;
	}

	/* Copy packet payload */
	if ((spkt->pkt_pflags & PKT_F_MBUF_DATA) &&
	    (spkt->pkt_pflags & PKT_F_TRUNCATED)) {
		FSW_STATS_INC(FSW_STATS_RX_COPY_MBUF2PKT);
		/*
		 * Source packet has truncated contents (just enough for
		 * the classifier) of an mbuf from the compat driver; copy
		 * the entire mbuf contents to the destination packet.
		 */
		m_adj(spkt->pkt_mbuf, skip_l2h_len);
		ASSERT((uint32_t)m_pktlen(spkt->pkt_mbuf) >= dlen);
		fsw->fsw_pkt_copy_from_mbuf(NR_RX, dph, 0,
		    spkt->pkt_mbuf, 0, dlen, do_cksum_rx, iphlen);
	} else {
		FSW_STATS_INC(FSW_STATS_RX_COPY_PKT2PKT);
		/*
		 * Source packet has full contents, either from an mbuf
		 * that came up from the compat driver, or because it
		 * originated on the native driver; copy to destination.
		 */
		fsw->fsw_pkt_copy_from_pkt(NR_RX, dph, 0, sph,
		    (spkt->pkt_headroom + spkt->pkt_l2_len), dlen, do_cksum_rx,
		    iphlen, 0, FALSE);
	}

#if DEBUG || DEVELOPMENT
	if (__improbable(pkt_trailers > 0)) {
		dlen += pkt_add_trailers(dph, dlen, iphlen);
	}
#endif /* DEBUG || DEVELOPMENT */

	/* Finalize and attach packet to Rx ring */
	METADATA_ADJUST_LEN(dpkt, 0, 0);
	err = __packet_finalize(dph);
	VERIFY(err == 0);

	copy_packet_from_dev_log(spkt, dpkt, kernproc);

	if (spkt->pkt_pflags & PKT_F_MBUF_DATA) {
		ifp_inc_traffic_class_in(fsw->fsw_ifp, spkt->pkt_mbuf);
		mbuf_freem(spkt->pkt_mbuf);
		KPKT_CLEAR_MBUF_DATA(spkt);
	} else {
		fsw_ifp_inc_traffic_class_in_pkt(fsw->fsw_ifp, dph);
	}

	if (__probable(do_cksum_rx != 0)) {
		FSW_STATS_INC(FSW_STATS_RX_COPY_SUM);
	}

	return 0;
}

SK_NO_INLINE_ATTRIBUTE
static struct __kern_packet *
rx_process_ip_frag(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
{
	char *pkt_buf;
	void *l3_hdr;
	uint16_t nfrags, tlen;
	int err = 0;

	switch (fsw_ip_reass) {
	case FSW_IP_REASS_FORCE_OFF:
		return pkt;
	case FSW_IP_REASS_FORCE_ON:
		break;
	default:
		if (!FSW_NETAGENT_ENABLED(fsw) ||
		    flow_mgr_get_num_flows(fsw->fsw_flow_mgr) == 0) {
			return pkt;
		}
		break;
	}

	MD_BUFLET_ADDR_ABS(pkt, pkt_buf);
	l3_hdr = pkt_buf + pkt->pkt_headroom + pkt->pkt_l2_len;

	ASSERT(fsw->fsw_ipfm != NULL);
	ASSERT((pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED) != 0);

	if (pkt->pkt_flow_ip_ver == IPVERSION) {
		struct ip *ip = l3_hdr;
		err = fsw_ip_frag_reass_v4(fsw->fsw_ipfm, &pkt, ip, &nfrags, &tlen);
	} else {
		struct ip6_hdr *ip6_hdr = l3_hdr;
		struct ip6_frag *__single ip6_frag =
		    (struct ip6_frag *)((uint8_t *)l3_hdr + sizeof(struct ip6_hdr));

		ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);
		/* we only handle frag header immediately after v6 header */
		err = fsw_ip_frag_reass_v6(fsw->fsw_ipfm, &pkt, ip6_hdr, ip6_frag,
		    &nfrags, &tlen);
	}
	if (__improbable(err != 0)) {
		/* if we get a bad fragment, free it */
		pp_free_packet_single(pkt);
		pkt = NULL;
	} else {
		ASSERT(!((pkt != NULL) ^ (nfrags > 0)));
	}

	return pkt;
}

SK_NO_INLINE_ATTRIBUTE
static void
rx_prepare_packet_mbuf(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
{
	ASSERT(pkt->pkt_pflags & PKT_F_MBUF_DATA);
	uint32_t mlen = (uint32_t)m_pktlen(pkt->pkt_mbuf);
	kern_packet_t ph = SK_PTR_ENCODE(pkt,
	    METADATA_TYPE(pkt), METADATA_SUBTYPE(pkt));
	/*
	 * This is the case when the packet is coming in from
	 * compat-netif. This packet only has valid metadata
	 * and an attached mbuf. We need to copy enough data
	 * from the mbuf to the packet buffer for the
	 * classifier. Compat netif packet pool is configured
	 * with buffer size of NETIF_COMPAT_MAX_MBUF_DATA_COPY
	 * which is just enough to hold the protocol headers
	 * for the flowswitch classifier.
	 */

	pkt->pkt_headroom = 0;
	METADATA_ADJUST_LEN(pkt, 0, 0);
	/*
	 * Copy the initial 128 bytes of the packet for
	 * classification:
	 * Ethernet(14) + IPv6 header(40) +
	 * IPv6 fragment header(8) +
	 * TCP header with options(60) = 122 bytes.
	 */
	fsw->fsw_pkt_copy_from_mbuf(NR_RX, ph,
	    pkt->pkt_headroom, pkt->pkt_mbuf, 0,
	    MIN(mlen, NETIF_COMPAT_MAX_MBUF_DATA_COPY),
	    FALSE, 0);

	int err = __packet_finalize_with_mbuf(pkt);
	VERIFY(err == 0);
}

static struct __kern_packet *
rx_prepare_packet(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
{
	pkt->pkt_qum_qflags &= ~QUM_F_FLOW_CLASSIFIED;

	if (__improbable(pkt->pkt_pflags & PKT_F_MBUF_DATA)) {
		rx_prepare_packet_mbuf(fsw, pkt);
	}

	return pkt;
}

static struct flow_entry *
lookup_flow_with_pkt(struct nx_flowswitch *fsw, struct __kern_packet *pkt,
    bool input, struct flow_entry *prev_fe)
{
	struct flow_key key __sk_aligned(16);
	struct flow_entry *__single fe = NULL;

	ASSERT(pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED);
	flow_pkt2key(pkt, input, &key);

	if (__probable(prev_fe != NULL &&
	    prev_fe->fe_key.fk_mask == FKMASK_5TUPLE)) {
		uint16_t saved_mask = key.fk_mask;
		key.fk_mask = FKMASK_5TUPLE;
		if (flow_key_cmp_mask(&prev_fe->fe_key, &key, &fk_mask_5tuple) == 0) {
			flow_entry_retain(prev_fe);
			fe = prev_fe;
		} else {
			key.fk_mask = saved_mask;
		}
	}

top:
	if (__improbable(fe == NULL)) {
		fe = flow_mgr_find_fe_by_key(fsw->fsw_flow_mgr, &key);
	}

	if (__improbable(fe != NULL &&
	    (fe->fe_flags & (FLOWENTF_PARENT | FLOWENTF_CHILD)) != 0)) {
		/* Rx */
		if (input) {
			if (fe->fe_flags & FLOWENTF_PARENT) {
				struct flow_entry *child_fe = rx_lookup_child_flow(fsw, fe, pkt);
				if (child_fe != NULL) {
					flow_entry_release(&fe);
					fe = child_fe;
				}
			} else {
				if (!rx_flow_demux_match(fsw, fe, pkt)) {
					flow_entry_release(&fe);
					fe = NULL;
					goto top;
				}
			}
		} else {
			/* Tx */
			if (__improbable(!_UUID_MATCH(pkt->pkt_flow_id, fe->fe_uuid))) {
				if (__probable(fe->fe_flags & FLOWENTF_PARENT)) {
					struct flow_entry *__single parent_fe = fe;
					fe = tx_lookup_child_flow(parent_fe, pkt->pkt_flow_id);
					flow_entry_release(&parent_fe);
				} else {
					flow_entry_release(&fe);
					fe = NULL;
					goto top;
				}
			}
		}
	}

	SK_LOG_VAR(char fkbuf[FLOWKEY_DBGBUF_SIZE]);
	SK_DF(SK_VERB_FSW_DP | SK_VERB_LOOKUP,
	    "%s %s %s \"%s\" fe %p",
	    input ? "Rx" : "Tx", if_name(fsw->fsw_ifp),
	    sk_proc_name(current_proc()),
	    fk2str(&key, fkbuf, sizeof(fkbuf)), SK_KVA(fe));

	return fe;
}

SK_NO_INLINE_ATTRIBUTE
static bool
pkt_is_for_listener(struct flow_entry *fe, struct __kern_packet *pkt)
{
	struct nx_flowswitch *fsw = fe->fe_fsw;
	struct ifnet *ifp = fsw->fsw_ifp;
	struct in_ifaddr *ia = NULL;
	struct in_ifaddr *best_ia = NULL;
	struct in6_ifaddr *ia6 = NULL;
	struct in6_ifaddr *best_ia6 = NULL;
	struct ifnet *match_ifp = NULL;
	struct __flow *flow = pkt->pkt_flow;
	bool result = false;

	ASSERT(pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED);

	if (flow->flow_ip_ver == IPVERSION) {
		if (IN_ZERONET(ntohl(flow->flow_ipv4_dst.s_addr)) ||
		    IN_LOOPBACK(ntohl(flow->flow_ipv4_dst.s_addr)) ||
		    IN_LINKLOCAL(ntohl(flow->flow_ipv4_dst.s_addr)) ||
		    IN_DS_LITE(ntohl(flow->flow_ipv4_dst.s_addr)) ||
		    IN_6TO4_RELAY_ANYCAST(ntohl(flow->flow_ipv4_dst.s_addr)) ||
		    IN_MULTICAST(ntohl(flow->flow_ipv4_dst.s_addr)) ||
		    INADDR_BROADCAST == flow->flow_ipv4_dst.s_addr) {
			result = true;
			goto done;
		}

		/*
		 * Check for a match in the hash bucket.
		 */
		lck_rw_lock_shared(&in_ifaddr_rwlock);
		TAILQ_FOREACH(ia, INADDR_HASH(flow->flow_ipv4_dst.s_addr), ia_hash) {
			if (IA_SIN(ia)->sin_addr.s_addr == flow->flow_ipv4_dst.s_addr) {
				best_ia = ia;
				match_ifp = ia->ia_ifp;

				if (match_ifp == ifp) {
					break;
				}
				/*
				 * Continue the loop in case there's an exact
				 * match with another interface.
				 */
			}
		}

		if (best_ia != NULL) {
			if (match_ifp != ifp && ipforwarding == 0 &&
			    (match_ifp->if_family == IFNET_FAMILY_IPSEC ||
			    match_ifp->if_family == IFNET_FAMILY_UTUN)) {
				/*
				 * Drop when interface address check is strict and forwarding
				 * is disabled
				 */
			} else {
				lck_rw_done(&in_ifaddr_rwlock);
				result = true;
				goto done;
			}
		}
		lck_rw_done(&in_ifaddr_rwlock);

		if (ifp->if_flags & IFF_BROADCAST) {
			/*
			 * Check for broadcast addresses.
			 *
			 * Only accept broadcast packets that arrive via the matching
			 * interface.  Reception of forwarded directed broadcasts would be
			 * handled via ip_forward() and ether_frameout() with the loopback
			 * into the stack for SIMPLEX interfaces handled by ether_frameout().
			 */
			struct ifaddr *ifa;

			ifnet_lock_shared(ifp);
			TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
				if (ifa->ifa_addr->sa_family != AF_INET) {
					continue;
				}
				ia = ifatoia(ifa);
				if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr == flow->flow_ipv4_dst.s_addr ||
				    ia->ia_netbroadcast.s_addr == flow->flow_ipv4_dst.s_addr) {
					ifnet_lock_done(ifp);
					result = true;
					goto done;
				}
			}
			ifnet_lock_done(ifp);
		}
	} else {
		struct in6_ifaddrhashhead *ia6_hash_head;

		if (IN6_IS_ADDR_LOOPBACK(&flow->flow_ipv6_dst) ||
		    IN6_IS_ADDR_LINKLOCAL(&flow->flow_ipv6_dst) ||
		    IN6_IS_ADDR_MULTICAST(&flow->flow_ipv6_dst)) {
			result = true;
			goto done;
		}

		/*
		 * Check for exact addresses in the hash bucket.
		 */
		lck_rw_lock_shared(&in6_ifaddr_rwlock);
		/* XXX -fbounds-safety: external dependency on ip6_input.c */
		ia6_hash_head = __unsafe_forge_bidi_indexable(struct in6_ifaddrhashhead *,
		    in6_ifaddrhashtbl, in6addr_nhash * sizeof(*in6_ifaddrhashtbl));
		ia6_hash_head = &ia6_hash_head[in6addr_hashval(&flow->flow_ipv6_dst)];

		TAILQ_FOREACH(ia6, ia6_hash_head, ia6_hash) {
			if (in6_are_addr_equal_scoped(&ia6->ia_addr.sin6_addr, &flow->flow_ipv6_dst,
			    ia6->ia_ifp->if_index, ifp->if_index)) {
				if ((ia6->ia6_flags & (IN6_IFF_NOTREADY | IN6_IFF_CLAT46))) {
					continue;
				}
				best_ia6 = ia6;
				if (ia6->ia_ifp == ifp) {
					break;
				}
				/*
				 * Continue the loop in case there's an exact
				 * match with another interface.
				 */
			}
		}
		if (best_ia6 != NULL) {
			if (best_ia6->ia_ifp != ifp && ip6_forwarding == 0 &&
			    (best_ia6->ia_ifp->if_family == IFNET_FAMILY_IPSEC ||
			    best_ia6->ia_ifp->if_family == IFNET_FAMILY_UTUN)) {
				/*
				 * Drop when interface address check is strict and forwarding
				 * is disabled
				 */
			} else {
				lck_rw_done(&in6_ifaddr_rwlock);
				result = true;
				goto done;
			}
		}
		lck_rw_done(&in6_ifaddr_rwlock);
	}

	/*
	 * In forwarding mode, if the destination address
	 * of the packet does not match any interface
	 * address, it may be destined for the client device.
	 */
	SK_DF(SK_VERB_FSW_DP | SK_VERB_RX | SK_VERB_FLOW,
	    "Rx flow does not match interface address");
done:
	return result;
}

static struct flow_entry *
rx_lookup_flow(struct nx_flowswitch *fsw, struct __kern_packet *pkt,
    struct flow_entry *prev_fe)
{
	struct flow_entry *__single fe;

	fe = lookup_flow_with_pkt(fsw, pkt, true, prev_fe);
	_FSW_INJECT_ERROR(2, fe, NULL, flow_entry_release, &fe);
	if (fe == NULL) {
		FSW_STATS_INC(FSW_STATS_RX_FLOW_NOT_FOUND);
		return NULL;
	}

	if (__improbable(fe->fe_key.fk_mask == FKMASK_2TUPLE &&
	    fe->fe_flags & FLOWENTF_LISTENER) &&
	    !pkt_is_for_listener(fe, pkt)) {
		FSW_STATS_INC(FSW_STATS_RX_PKT_NOT_LISTENER);
		flow_entry_release(&fe);
		return NULL;
	}

	if (__improbable(fe->fe_flags & FLOWENTF_TORN_DOWN)) {
		FSW_STATS_INC(FSW_STATS_RX_FLOW_TORNDOWN);
		SK_DF(SK_VERB_FSW_DP | SK_VERB_RX | SK_VERB_FLOW,
		    "Rx flow torn down");
		flow_entry_release(&fe);
		return NULL;
	}

	if (__improbable(fe->fe_flags & FLOWENTF_AOP_OFFLOAD)) {
		FSW_STATS_INC(FSW_STATS_RX_DISABLED);
		SK_DF(SK_VERB_FSW_DP | SK_VERB_RX | SK_VERB_FLOW,
		    "Rx not allowed for this flow");
		flow_entry_release(&fe);
	}
	return fe;
}

static inline void
rx_flow_batch_packets(struct flow_entry_list *fes, struct flow_entry *fe,
    struct __kern_packet *pkt, uint64_t tid)
{
	/*
	 * Among threads working on the same fe, the first thread that reaches here
	 * will be responsible for processing all the packets until a point when
	 * it does not see new packets in fe_rx_pktq. Other threads only
	 * enqueue their packets but do not add the flow entry to their flow entry list.
	 */
	lck_mtx_lock(&fe->fe_rx_pktq_lock);

	if (fe->fe_rx_worker_tid == 0) {
		fe->fe_rx_worker_tid = tid;
	} else if (__improbable(fe->fe_rx_worker_tid != tid)) {
		STATS_INC(&fe->fe_fsw->fsw_stats, FSW_STATS_RX_FLOW_IN_USE);
	}

	if (__improbable(pkt->pkt_flow_ip_is_frag)) {
		fe->fe_rx_frag_count++;
	}

	fe->fe_rx_pktq_bytes += pkt->pkt_flow_ulen;
	/* KPKTQ_ENQUEUE_LIST is needed until frags become chained buflets */
	if (KPKTQ_EMPTY(&fe->fe_rx_pktq) && tid == fe->fe_rx_worker_tid) {
		ASSERT(KPKTQ_LEN(&fe->fe_rx_pktq) == 0);
		TAILQ_INSERT_TAIL(fes, fe, fe_rx_link);
		KPKTQ_ENQUEUE_LIST(&fe->fe_rx_pktq, pkt);
		lck_mtx_unlock(&fe->fe_rx_pktq_lock);
	} else {
		KPKTQ_ENQUEUE_LIST(&fe->fe_rx_pktq, pkt);
		lck_mtx_unlock(&fe->fe_rx_pktq_lock);
		flow_entry_release(&fe);
	}
}

static void
tx_flow_batch_packet(struct flow_entry_list *fes, struct flow_entry *fe,
    struct __kern_packet *pkt)
{
	/* record frag continuation */
	if (__improbable(pkt->pkt_flow_ip_is_first_frag)) {
		ASSERT(pkt->pkt_flow_ip_is_frag);
		fe->fe_tx_is_cont_frag = true;
		fe->fe_tx_frag_id = pkt->pkt_flow_ip_frag_id;
	} else if (__probable(!pkt->pkt_flow_ip_is_frag)) {
		fe->fe_tx_is_cont_frag = false;
		fe->fe_tx_frag_id = 0;
	}

	if (KPKTQ_EMPTY(&fe->fe_tx_pktq)) {
		ASSERT(KPKTQ_LEN(&fe->fe_tx_pktq) == 0);
		TAILQ_INSERT_TAIL(fes, fe, fe_tx_link);
		KPKTQ_ENQUEUE(&fe->fe_tx_pktq, pkt);
	} else {
		ASSERT(!TAILQ_EMPTY(fes));
		KPKTQ_ENQUEUE(&fe->fe_tx_pktq, pkt);
		flow_entry_release(&fe);
	}
}

static inline void
fsw_rx_ring_dequeue_pktq(struct nx_flowswitch *fsw, struct __kern_channel_ring *r,
    uint32_t n_pkts_max, struct pktq *pktq, uint32_t *n_bytes)
{
	uint32_t n_pkts = 0;
	slot_idx_t idx, idx_end;
	idx = r->ckr_khead;
	idx_end = r->ckr_rhead;

	ASSERT(KPKTQ_EMPTY(pktq));
	*n_bytes = 0;
	for (; n_pkts < n_pkts_max && idx != idx_end;
	    idx = SLOT_NEXT(idx, r->ckr_lim)) {
		struct __kern_slot_desc *ksd = KR_KSD(r, idx);
		struct __kern_packet *pkt = ksd->sd_pkt;

		ASSERT(pkt->pkt_nextpkt == NULL);
		KR_SLOT_DETACH_METADATA(r, ksd);

		_FSW_INJECT_ERROR(20, pkt->pkt_qum_qflags,
		    pkt->pkt_qum_qflags | QUM_F_DROPPED, null_func);
		if (__improbable(((pkt->pkt_qum_qflags & QUM_F_DROPPED) != 0))
		    || (pkt->pkt_length == 0)) {
			FSW_STATS_INC(FSW_STATS_DROP);
			pp_free_packet_single(pkt);
			continue;
		}
		n_pkts++;
		*n_bytes += pkt->pkt_length;

		KPKTQ_ENQUEUE(pktq, pkt);
	}
	r->ckr_khead = idx;
	r->ckr_ktail = SLOT_PREV(idx, r->ckr_lim);
}

/*
 * This is only for estimating how many packets each GSO packet will need.
 * The number does not need to be exact because any leftover packets allocated
 * will be freed.
 */
static uint32_t
estimate_gso_pkts(struct __kern_packet *pkt)
{
	packet_tso_flags_t tso_flags;
	uint16_t mss;
	uint32_t n_pkts = 0, total_hlen = 0, total_len = 0;

	tso_flags = pkt->pkt_csum_flags & PACKET_CSUM_TSO_FLAGS;
	mss = pkt->pkt_proto_seg_sz;

	if (tso_flags == PACKET_TSO_IPV4) {
		total_hlen = sizeof(struct ip) + sizeof(struct tcphdr);
	} else if (tso_flags == PACKET_TSO_IPV6) {
		total_hlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
	}
	if (total_hlen != 0 && mss != 0) {
		total_len = pkt->pkt_length;
		n_pkts = (uint32_t)
		    (SK_ROUNDUP((total_len - total_hlen), mss) / mss);
	}
	DTRACE_SKYWALK5(estimate__gso, packet_tso_flags_t, tso_flags,
	    uint32_t, total_hlen, uint32_t, total_len, uint16_t, mss,
	    uint32_t, n_pkts);
	return n_pkts;
}
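
/*
 * Worked example (illustrative numbers): a PACKET_TSO_IPV4 packet with
 * pkt_length 4394 has total_hlen 20 + 20 = 40, so with mss 1448:
 * SK_ROUNDUP(4354, 1448) / 1448 = 5792 / 1448 = 4 packets. The estimate
 * only sizes the Tx batch; leftover allocations are freed later.
 */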

/*
 * This function retrieves a chain of packets of the same type only
 * (GSO or non-GSO).
 */
static inline void
fsw_tx_ring_dequeue_pktq(struct nx_flowswitch *fsw,
    struct __kern_channel_ring *r, uint32_t n_pkts_max,
    struct pktq *pktq, uint32_t *n_bytes, uint32_t *gso_pkts_estimate)
{
	uint32_t n_pkts = 0;
	slot_idx_t idx, idx_end;
	idx = r->ckr_khead;
	idx_end = r->ckr_rhead;
	struct nexus_vp_adapter *vpna = VPNA(KRNA(r));
	boolean_t gso_enabled, gso_required;
	uint32_t gso_pkts;

	gso_enabled = (fsw->fsw_tso_mode == FSW_TSO_MODE_SW);
	ASSERT(KPKTQ_EMPTY(pktq));
	*n_bytes = 0;
	for (; n_pkts < n_pkts_max &&
	    (!gso_enabled || fsw_gso_batch == 0 ||
	    *gso_pkts_estimate < fsw_gso_batch) &&
	    idx != idx_end; idx = SLOT_NEXT(idx, r->ckr_lim)) {
		struct __kern_slot_desc *ksd = KR_KSD(r, idx);
		struct __kern_packet *pkt = ksd->sd_pkt;

		ASSERT(pkt->pkt_nextpkt == NULL);

		_FSW_INJECT_ERROR(20, pkt->pkt_qum_qflags,
		    pkt->pkt_qum_qflags | QUM_F_DROPPED, null_func);
		if (__improbable(((pkt->pkt_qum_qflags & QUM_F_DROPPED) != 0))
		    || (pkt->pkt_length == 0)) {
			KR_SLOT_DETACH_METADATA(r, ksd);
			FSW_STATS_INC(FSW_STATS_DROP);
			pp_free_packet_single(pkt);
			continue;
		}
		if (gso_enabled) {
			gso_pkts = estimate_gso_pkts(pkt);

			/*
			 * We use the first packet to determine what
			 * type the subsequent ones need to be (GSO or
			 * non-GSO).
			 */
			if (n_pkts == 0) {
				gso_required = (gso_pkts != 0);
			} else {
				if (gso_required != (gso_pkts != 0)) {
					break;
				}
			}
			*gso_pkts_estimate += gso_pkts;
		}
		KR_SLOT_DETACH_METADATA(r, ksd);
		if (NA_CHANNEL_EVENT_ATTACHED(&vpna->vpna_up)) {
			__packet_set_tx_nx_port(SK_PKT2PH(pkt),
			    vpna->vpna_nx_port, vpna->vpna_gencnt);
		}
		n_pkts++;
		*n_bytes += pkt->pkt_length;
		KPKTQ_ENQUEUE(pktq, pkt);
	}
	r->ckr_khead = idx;
	r->ckr_ktail = SLOT_PREV(idx, r->ckr_lim);
	DTRACE_SKYWALK5(tx__ring__dequeue, struct nx_flowswitch *, fsw,
	    ifnet_t, fsw->fsw_ifp, uint32_t, n_pkts, uint32_t, *n_bytes,
	    uint32_t, *gso_pkts_estimate);
}

static void
fsw_ring_enqueue_pktq(struct nx_flowswitch *fsw, struct __kern_channel_ring *r,
    struct pktq *pktq)
{
#pragma unused(fsw)
	struct __kern_packet *pkt;
	struct __kern_quantum *kqum;
	uint32_t kr_space_avail = 0;
	uint32_t n, n_pkts = 0, n_bytes = 0;
	slot_idx_t idx = 0, idx_start = 0, idx_end = 0;

	kr_enter(r, TRUE);

	idx_start = r->ckr_ktail;
	kr_space_avail = kr_available_slots_rxring(r);
	_FSW_INJECT_ERROR(40, kr_space_avail, 0, null_func);
	n = MIN(kr_space_avail, KPKTQ_LEN(pktq));
	_FSW_INJECT_ERROR(41, n, 0, null_func);
	idx_end = SLOT_INCREMENT(idx_start, n, r->ckr_lim);

	idx = idx_start;
	while (idx != idx_end) {
		KPKTQ_DEQUEUE(pktq, pkt);
		kqum = SK_PTR_ADDR_KQUM(pkt);
		kqum->qum_qflags |= QUM_F_FINALIZED;
		n_pkts++;
		n_bytes += pkt->pkt_length;
		KR_SLOT_ATTACH_METADATA(r, KR_KSD(r, idx), kqum);
		if (__improbable(pkt->pkt_trace_id != 0)) {
			KDBG(SK_KTRACE_PKT_RX_FSW | DBG_FUNC_END, pkt->pkt_trace_id);
			KDBG(SK_KTRACE_PKT_RX_CHN | DBG_FUNC_START, pkt->pkt_trace_id);
		}
		idx = SLOT_NEXT(idx, r->ckr_lim);
	}

	kr_update_stats(r, n_pkts, n_bytes);

	/*
	 * ensure slot attachments are visible before updating the
	 * tail pointer
	 */
	os_atomic_thread_fence(seq_cst);

	r->ckr_ktail = idx_end;

	kr_exit(r);

	r->ckr_na_notify(r, kernproc, NA_NOTEF_PUSH);

	SK_DF(SK_VERB_FSW_DP | SK_VERB_RING, "%s enqueued %d pkts",
	    r->ckr_name, n_pkts);
}

static void
pkts_to_pktq(struct __kern_packet **__counted_by(n_pkts)pkts, uint32_t n_pkts, struct pktq *pktq)
{
	ASSERT(KPKTQ_EMPTY(pktq));

	for (uint32_t i = 0; i < n_pkts; i++) {
		struct __kern_packet *__single pkt = pkts[i];
		ASSERT(pkt->pkt_nextpkt == NULL);
		KPKTQ_ENQUEUE(pktq, pkt);
	}
}

/*
 * This function is modeled after nx_netif_host_grab_pkts() in nx_netif_host.c.
 */
SK_NO_INLINE_ATTRIBUTE
static void
convert_native_pktq_to_mbufs(struct nx_flowswitch *fsw, struct pktq *pktq,
    struct mbuf **m_headp, struct mbuf **m_tailp, uint32_t *cnt, uint32_t *bytes)
{
	uint32_t tot_cnt;
	unsigned int num_segs = 1;
	struct mbuf *__single mhead, *__single head = NULL;
	struct mbuf *__single tail = NULL, **__single tailp = &head;
	uint32_t mhead_cnt, mhead_bufsize;
	uint32_t mhead_waste = 0;
	uint32_t mcnt = 0, mbytes = 0;
	uint32_t largest, max_pkt_len;
	struct __kern_packet *__single pkt;
	struct kern_pbufpool *pp;

	tot_cnt = KPKTQ_LEN(pktq);
	ASSERT(tot_cnt > 0);
	mhead_cnt = tot_cnt;

	/*
	 * Opportunistically batch-allocate the mbufs based on the largest
	 * packet size we've seen in the recent past.  Note that we reset
	 * fsw_rx_largest_size below if we notice that we're under-utilizing
	 * the allocated buffers (thus disabling this batch allocation).
	 */
	largest = *(volatile uint32_t*)&fsw->fsw_rx_largest_size; /* read once */
	if (__probable(largest != 0)) {
		if (largest <= MCLBYTES) {
			mhead = m_allocpacket_internal(&mhead_cnt, MCLBYTES,
			    &num_segs, M_NOWAIT, 1, 0);
			mhead_bufsize = MCLBYTES;
		} else if (largest <= MBIGCLBYTES) {
			mhead = m_allocpacket_internal(&mhead_cnt, MBIGCLBYTES,
			    &num_segs, M_NOWAIT, 1, 0);
			mhead_bufsize = MBIGCLBYTES;
		} else if (largest <= M16KCLBYTES) {
			mhead = m_allocpacket_internal(&mhead_cnt, M16KCLBYTES,
			    &num_segs, M_NOWAIT, 1, 0);
			mhead_bufsize = M16KCLBYTES;
		} else if (largest <= M16KCLBYTES * 2) {
			num_segs = 2;
			mhead = m_allocpacket_internal(&mhead_cnt, M16KCLBYTES * 2,
			    &num_segs, M_NOWAIT, 1, 0);
			mhead_bufsize = M16KCLBYTES * 2;
		} else {
			mhead = NULL;
			mhead_bufsize = mhead_cnt = 0;
		}
	} else {
		mhead = NULL;
		mhead_bufsize = mhead_cnt = 0;
	}
	DTRACE_SKYWALK4(bufstats, uint32_t, largest, uint32_t, mhead_bufsize,
	    uint32_t, mhead_cnt, uint32_t, tot_cnt);

	pp = __DECONST(struct kern_pbufpool *, KPKTQ_FIRST(pktq)->pkt_qum.qum_pp);
	max_pkt_len = PP_BUF_SIZE_DEF(pp) * pp->pp_max_frags;

	KPKTQ_FOREACH(pkt, pktq) {
		uint32_t tot_len, len;
		uint16_t pad, llhlen, iphlen;
		boolean_t do_cksum_rx;
		struct mbuf *__single m;
		int error;

		llhlen = pkt->pkt_l2_len;
		len = pkt->pkt_length;
		if (__improbable(len > max_pkt_len || len == 0 || llhlen > len)) {
			DTRACE_SKYWALK2(bad__len, struct nx_flowswitch *, fsw,
			    struct __kern_packet *, pkt);
			FSW_STATS_INC(FSW_STATS_DROP);
			FSW_STATS_INC(FSW_STATS_RX_COPY_BAD_LEN);
			continue;
		}
		/* begin payload on 32-bit boundary; figure out the padding */
		pad = (uint16_t)P2ROUNDUP(llhlen, sizeof(uint32_t)) - llhlen;
		tot_len = pad + len;

		/* remember largest packet size */
		if (__improbable(largest < tot_len)) {
			largest = MAX(tot_len, MCLBYTES);
		}

		/*
		 * If the above batch allocation returned partial success,
		 * retry the allocation here (still non-blocking, since we
		 * pass MBUF_DONTWAIT).
		 */
		m = mhead;
		if (__improbable(m == NULL || tot_len > mhead_bufsize)) {
			ASSERT(mhead != NULL || mhead_cnt == 0);
			num_segs = 1;
			if (tot_len > M16KCLBYTES) {
				num_segs = 0;
			}
			if ((error = mbuf_allocpacket(MBUF_DONTWAIT, tot_len,
			    &num_segs, &m)) != 0) {
				DTRACE_SKYWALK2(bad__len,
				    struct nx_flowswitch *, fsw,
				    struct __kern_packet *, pkt);
				FSW_STATS_INC(FSW_STATS_DROP_NOMEM_MBUF);
				FSW_STATS_INC(FSW_STATS_DROP);
				continue;
			}
		} else {
			mhead = m->m_nextpkt;
			m->m_nextpkt = NULL;
			ASSERT(mhead_cnt != 0);
			--mhead_cnt;

			/* check if we're underutilizing large buffers */
			if (__improbable(mhead_bufsize > MCLBYTES &&
			    tot_len < (mhead_bufsize >> 1))) {
				++mhead_waste;
			}
			/*
			 * Clean up the unused mbuf. This is only needed
			 * when we pre-allocated 2x16K mbufs.
			 */
1417 			if (__improbable(mhead_bufsize >= tot_len + M16KCLBYTES)) {
1418 				ASSERT(mhead_bufsize == 2 * M16KCLBYTES);
1419 				struct mbuf *m_extra = m->m_next;
1420 				ASSERT(m_extra != NULL);
1421 				ASSERT(m_extra->m_len == 0);
1422 				ASSERT(M_SIZE(m_extra) == M16KCLBYTES);
1423 				m->m_next = NULL;
1424 				m_freem(m_extra);
1425 				FSW_STATS_INC(FSW_STATS_RX_WASTED_16KMBUF);
1426 			}
1427 		}
1428 		m->m_data += pad;
1429 		/*
1430 		 * XXX -fbounds-safety: external dependency
1431 		 * mtod does not work because m_len is 0
1432 		 */
1433 		m->m_pkthdr.pkt_hdr = m_mtod_current(m);
1434 
1435 		/* don't include IP header from partial sum */
1436 		if (__probable((pkt->pkt_qum_qflags &
1437 		    QUM_F_FLOW_CLASSIFIED) != 0)) {
1438 			iphlen = pkt->pkt_flow_ip_hlen;
1439 			do_cksum_rx = sk_cksum_rx;
1440 		} else {
1441 			iphlen = 0;
1442 			do_cksum_rx = FALSE;
1443 		}
1444 
1445 		fsw->fsw_pkt_copy_to_mbuf(NR_RX, SK_PKT2PH(pkt),
1446 		    pkt->pkt_headroom, m, 0, len, do_cksum_rx,
1447 		    llhlen + iphlen);
1448 
1449 		FSW_STATS_INC(FSW_STATS_RX_COPY_PKT2MBUF);
1450 		if (do_cksum_rx) {
1451 			FSW_STATS_INC(FSW_STATS_RX_COPY_SUM);
1452 		}
1453 #if DEBUG || DEVELOPMENT
1454 		if (__improbable(pkt_trailers > 0)) {
1455 			(void) pkt_add_trailers_mbuf(m, llhlen + iphlen);
1456 		}
1457 #endif /* DEBUG || DEVELOPMENT */
1458 		m_adj(m, llhlen);
1459 
1460 		m->m_pkthdr.rcvif = fsw->fsw_ifp;
1461 		if (__improbable((pkt->pkt_link_flags &
1462 		    PKT_LINKF_ETHFCS) != 0)) {
1463 			m->m_flags |= M_HASFCS;
1464 		}
1465 		if (__improbable((pkt->pkt_link_flags &
1466 		    PKT_LINKF_BCAST) != 0)) {
1467 			m->m_flags |= M_BCAST;
1468 		}
1469 		if (__improbable((pkt->pkt_link_flags &
1470 		    PKT_LINKF_MCAST) != 0)) {
1471 			m->m_flags |= M_MCAST;
1472 		}
1473 		if (__improbable(pkt->pkt_pflags & PKT_F_WAKE_PKT)) {
1474 			m->m_pkthdr.pkt_flags |= PKTF_WAKE_PKT;
1475 		}
1476 		ASSERT(m->m_nextpkt == NULL);
1477 		tail = m;
1478 		*tailp = m;
1479 		tailp = &m->m_nextpkt;
1480 		mcnt++;
1481 		mbytes += m_pktlen(m);
1482 	}
1483 	/* free any leftovers */
1484 	if (__improbable(mhead != NULL)) {
1485 		DTRACE_SKYWALK1(mhead__leftover, uint32_t, mhead_cnt);
1486 		ASSERT(mhead_cnt != 0);
1487 		(void) m_freem_list(mhead);
1488 		mhead = NULL;
1489 		mhead_cnt = 0;
1490 	}
1491 
1492 	/* reset if most packets (>50%) used less than half of each batch buffer */
1493 	if (__improbable(mhead_waste > ((uint32_t)tot_cnt >> 1))) {
1494 		DTRACE_SKYWALK4(mhead__waste, struct nx_flowswitch *, fsw,
1495 		    struct flow_entry *, NULL, uint32_t, mhead_waste,
1496 		    uint32_t, tot_cnt);
1497 		largest = 0;
1498 	}
1499 
1500 	if (largest != fsw->fsw_rx_largest_size) {
1501 		os_atomic_store(&fsw->fsw_rx_largest_size, largest, release);
1502 	}
1503 
1504 	pp_free_pktq(pktq);
1505 	*m_headp = head;
1506 	*m_tailp = tail;
1507 	*cnt = mcnt;
1508 	*bytes = mbytes;
1509 }
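
/*
 * Illustrative note (added for exposition; not in the original source):
 * the loop above batch-allocates mbufs up front and falls back to a
 * one-off allocation only for packets the batch cannot fit.  A minimal
 * sketch of that take-from-batch-or-allocate shape, using only calls
 * that appear above:
 */
#if 0	/* hypothetical helper; for illustration only */
static struct mbuf *
take_or_alloc_mbuf_sketch(struct mbuf **mheadp, uint32_t *mhead_cntp,
    uint32_t mhead_bufsize, uint32_t tot_len)
{
	struct mbuf *m = *mheadp;
	unsigned int num_segs = 1;

	if (m == NULL || tot_len > mhead_bufsize) {
		/* batch miss: one-off non-blocking allocation */
		if (mbuf_allocpacket(MBUF_DONTWAIT, tot_len,
		    &num_segs, &m) != 0) {
			return NULL;	/* caller accounts the drop */
		}
	} else {
		/* batch hit: unlink the head of the preallocated chain */
		*mheadp = m->m_nextpkt;
		m->m_nextpkt = NULL;
		(*mhead_cntp)--;
	}
	return m;
}
#endif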
1510 
1511 /*
1512  * This function only extracts the mbuf from the packet. The caller frees
1513  * the packet.
1514  */
1515 static inline struct mbuf *
1516 convert_compat_pkt_to_mbuf(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
1517 {
1518 	struct mbuf *m;
1519 	struct pkthdr *mhdr;
1520 	uint16_t llhlen;
1521 
1522 	m = pkt->pkt_mbuf;
1523 	ASSERT(m != NULL);
1524 
1525 	llhlen = pkt->pkt_l2_len;
1526 	if (llhlen > pkt->pkt_length) {
1527 		m_freem(m);
1528 		KPKT_CLEAR_MBUF_DATA(pkt);
1529 		DTRACE_SKYWALK2(bad__len, struct nx_flowswitch *, fsw,
1530 		    struct __kern_packet *, pkt);
1531 		FSW_STATS_INC(FSW_STATS_DROP);
1532 		FSW_STATS_INC(FSW_STATS_RX_COPY_BAD_LEN);
1533 		return NULL;
1534 	}
1535 	mhdr = &m->m_pkthdr;
1536 	if ((mhdr->csum_flags & CSUM_DATA_VALID) == 0 &&
1537 	    PACKET_HAS_PARTIAL_CHECKSUM(pkt)) {
1538 		mhdr->csum_flags &= ~CSUM_RX_FLAGS;
1539 		mhdr->csum_flags |= (CSUM_DATA_VALID | CSUM_PARTIAL);
1540 		mhdr->csum_rx_start = pkt->pkt_csum_rx_start_off;
1541 		mhdr->csum_rx_val = pkt->pkt_csum_rx_value;
1542 	}
1543 #if DEBUG || DEVELOPMENT
1544 	uint32_t extra = 0;
1545 	if (__improbable(pkt_trailers > 0)) {
1546 		extra = pkt_add_trailers_mbuf(m, llhlen);
1547 	}
1548 #endif /* DEBUG || DEVELOPMENT */
1549 	m_adj(m, llhlen);
1550 	ASSERT((uint32_t)m_pktlen(m) == ((pkt->pkt_length - llhlen) + extra));
1551 	KPKT_CLEAR_MBUF_DATA(pkt);
1552 	return m;
1553 }
1554 
1555 SK_NO_INLINE_ATTRIBUTE
1556 static void
1557 convert_compat_pktq_to_mbufs(struct nx_flowswitch *fsw, struct pktq *pktq,
1558     struct mbuf **m_head, struct mbuf **m_tail, uint32_t *cnt, uint32_t *bytes)
1559 {
1560 	struct __kern_packet *pkt;
1561 	struct mbuf *__single m, *__single head = NULL;
1562 	struct mbuf *__single tail = NULL, **__single tailp = &head;
1563 	uint32_t c = 0, b = 0;
1564 
1565 	KPKTQ_FOREACH(pkt, pktq) {
1566 		m = convert_compat_pkt_to_mbuf(fsw, pkt);
1567 		if (__improbable(m == NULL)) {
1568 			continue;
1569 		}
1570 		tail = m;
1571 		*tailp = m;
1572 		tailp = &m->m_nextpkt;
1573 		c++;
1574 		b += m_pktlen(m);
1575 	}
1576 	pp_free_pktq(pktq);
1577 	*m_head = head;
1578 	*m_tail = tail;
1579 	*cnt = c;
1580 	*bytes = b;
1581 }
1582 
1583 void
1584 fsw_host_sendup(struct ifnet *ifp, struct mbufq *host_mq)
1585 {
1586 	struct ifnet_stat_increment_param s;
1587 
1588 	if (mbufq_empty(host_mq)) {
1589 		return;
1590 	}
1591 
1592 	bzero(&s, sizeof(s));
1593 	s.packets_in = host_mq->count;
1594 	s.bytes_in = host_mq->bytes;
1595 	dlil_input_handler(ifp, mbufq_first(host_mq), mbufq_last(host_mq), &s, FALSE, NULL);
1596 }
1597 
1598 void
1599 fsw_host_rx_cb(struct nx_flowswitch *fsw, struct pktq *pktq)
1600 {
1601 	ifnet_fsw_rx_cb_t __single cb;
1602 	void *__single cb_arg;
1603 
1604 	ASSERT(!KPKTQ_EMPTY(pktq));
1605 	if (ifnet_get_flowswitch_rx_callback(fsw->fsw_ifp, &cb, &cb_arg) == 0) {
1606 		ASSERT(cb != NULL);
1607 		ASSERT(cb_arg != NULL);
1608 		(*cb)(cb_arg, pktq);
1609 		ifnet_release_flowswitch_rx_callback(fsw->fsw_ifp);
1610 		if (KPKTQ_EMPTY(pktq)) {
1611 			return;
1612 		} else {
1613 			DTRACE_SKYWALK2(leftover__pkts, struct nx_flowswitch *, fsw,
1614 			    struct pktq *, pktq);
1615 		}
1616 	}
1617 }
1618 
1619 void
1620 fsw_host_rx_enqueue_mbq(struct nx_flowswitch *fsw, struct pktq *pktq,
1621     struct mbufq *host_mq)
1622 {
1623 	struct mbuf *__single m_head = NULL, *__single m_tail = NULL;
1624 	uint32_t cnt = 0, bytes = 0;
1625 	boolean_t compat;
1626 
1627 	if (KPKTQ_EMPTY(pktq)) {
1628 		return;
1629 	}
1630 
1631 	/* All packets in the pktq must have the same type */
1632 	compat = ((KPKTQ_FIRST(pktq)->pkt_pflags & PKT_F_MBUF_DATA) != 0);
1633 	if (compat) {
1634 		convert_compat_pktq_to_mbufs(fsw, pktq, &m_head, &m_tail, &cnt,
1635 		    &bytes);
1636 	} else {
1637 		convert_native_pktq_to_mbufs(fsw, pktq, &m_head, &m_tail, &cnt,
1638 		    &bytes);
1639 	}
1640 	if (__improbable(m_head == NULL)) {
1641 		DTRACE_SKYWALK1(empty__head, struct nx_flowswitch *, fsw);
1642 		return;
1643 	}
1644 
1645 	mbufq_enqueue(host_mq, m_head, m_tail, cnt, bytes);
1646 }
1647 
1648 void
1649 fsw_ring_enqueue_tail_drop(struct nx_flowswitch *fsw,
1650     struct __kern_channel_ring *r, struct pktq *pktq)
1651 {
1652 	fsw_ring_enqueue_pktq(fsw, r, pktq);
1653 	/*
1654 	 * Rx stall detection: don't update enqueue ts if dequeue ts < enqueue ts.
1655 	 * This is to ensure we use the timestamp of the earliest enqueue without
1656 	 * a dequeue.
1657 	 */
1658 	if (r->ckr_rx_dequeue_ts >= r->ckr_rx_enqueue_ts) {
1659 		r->ckr_rx_enqueue_ts = net_uptime();
1660 	}
1661 	FSW_STATS_ADD(FSW_STATS_RX_DST_RING_FULL, KPKTQ_LEN(pktq));
1662 	dp_drop_pktq(fsw, pktq, 0, DROP_REASON_RX_DST_RING_FULL, __LINE__,
1663 	    DROPTAP_FLAG_L2_MISSING);
1664 }
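
/*
 * Illustrative note: with net_uptime() values t0 < t1, an enqueue at t0
 * that is never dequeued leaves ckr_rx_dequeue_ts < ckr_rx_enqueue_ts, so
 * a later enqueue at t1 skips the update above and the stall age keeps
 * being measured from t0, the earliest enqueue without a matching dequeue.
 */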
1665 
1666 static struct nexus_adapter *
1667 flow_get_na(struct nx_flowswitch *fsw, struct flow_entry *fe)
1668 {
1669 	struct kern_nexus *nx = fsw->fsw_nx;
1670 	struct nexus_adapter *na = NULL;
1671 	nexus_port_t port = fe->fe_nx_port;
1672 
1673 	if (port == FSW_VP_DEV || port == FSW_VP_HOST) {
1674 		SK_ERR("dev or host ports have no NA");
1675 		return NULL;
1676 	}
1677 
1678 	if (__improbable(!nx_port_is_valid(nx, port))) {
1679 		SK_DF(SK_VERB_FSW_DP, "%s[%d] port no longer valid",
1680 		    if_name(fsw->fsw_ifp), port);
1681 		return NULL;
1682 	}
1683 
1684 	na = nx_port_get_na(nx, port);
1685 	if (__improbable(na == NULL)) {
1686 		FSW_STATS_INC(FSW_STATS_DST_NXPORT_INVALID);
1687 		SK_DF(SK_VERB_FSW_DP, "%s[%d] NA no longer valid",
1688 		    if_name(fsw->fsw_ifp), port);
1689 		return NULL;
1690 	}
1691 
1692 	if (__improbable(!NA_IS_ACTIVE(na))) {
1693 		FSW_STATS_INC(FSW_STATS_DST_NXPORT_INACTIVE);
1694 		SK_DF(SK_VERB_FSW_DP, "%s[%d] NA no longer active",
1695 		    if_name(fsw->fsw_ifp), port);
1696 		return NULL;
1697 	}
1698 
1699 	if (__improbable(nx_port_is_defunct(nx, port))) {
1700 		FSW_STATS_INC(FSW_STATS_DST_NXPORT_DEFUNCT);
1701 		SK_DF(SK_VERB_FSW_DP, "%s[%d] NA defuncted",
1702 		    if_name(fsw->fsw_ifp), port);
1703 		return NULL;
1704 	}
1705 
1706 	return na;
1707 }
1708 
1709 static inline struct __kern_channel_ring *
1710 flow_get_ring(struct nx_flowswitch *fsw, struct flow_entry *fe, enum txrx txrx)
1711 {
1712 	struct nexus_vp_adapter *na = NULL;
1713 	struct __kern_channel_ring *__single r = NULL;
1714 
1715 	na = VPNA(flow_get_na(fsw, fe));
1716 	if (__improbable(na == NULL)) {
1717 		return NULL;
1718 	}
1719 
1720 	switch (txrx) {
1721 	case NR_RX:
1722 		r = KR_SINGLE(&na->vpna_up.na_rx_rings[0]);
1723 		break;
1724 	case NR_TX:
1725 		r = KR_SINGLE(&na->vpna_up.na_tx_rings[0]);
1726 		break;
1727 	default:
1728 		VERIFY(0);
1729 		__builtin_unreachable();
1730 	}
1731 
1732 	if (__improbable(KR_DROP(r))) {
1733 		FSW_STATS_INC(FSW_STATS_DST_RING_DROPMODE);
1734 		SK_DF(SK_VERB_FSW_DP | SK_VERB_RING, "r %p %s drop mode",
1735 		    SK_KVA(r), r->ckr_name);
1736 		return NULL;
1737 	}
1738 
1739 	ASSERT(KRNA(r)->na_md_type == NEXUS_META_TYPE_PACKET);
1740 
1741 #if (DEVELOPMENT || DEBUG)
1742 	if (r != NULL) {
1743 		_FSW_INJECT_ERROR(4, r, NULL, null_func);
1744 	}
1745 #endif /* DEVELOPMENT || DEBUG */
1746 
1747 	return r;
1748 }
1749 
1750 struct __kern_channel_ring *
1751 fsw_flow_get_rx_ring(struct nx_flowswitch *fsw, struct flow_entry *fe)
1752 {
1753 	return flow_get_ring(fsw, fe, NR_RX);
1754 }
1755 
1756 static bool
1757 dp_flow_route_process(struct nx_flowswitch *fsw, struct flow_entry *fe)
1758 {
1759 	struct flow_route *fr = fe->fe_route;
1760 	struct ifnet *ifp = fsw->fsw_ifp;
1761 
1762 	if (__improbable(!(fe->fe_flags & FLOWENTF_NONVIABLE) &&
1763 	    !fe->fe_want_nonviable && (fe->fe_key.fk_mask & FKMASK_SRC) &&
1764 	    fe->fe_laddr_gencnt != ifp->if_nx_flowswitch.if_fsw_ipaddr_gencnt &&
1765 	    !flow_route_key_validate(&fe->fe_key, ifp, &fe->fe_laddr_gencnt))) {
1766 		/*
1767 		 * The source address is no longer around; we want this
1768 		 * flow to be nonviable, but that requires holding the lock
1769 		 * as writer (which isn't the case now.)  Indicate that
1770 		 * we need to finalize the nonviable later down below.
1771 		 *
1772 		 * We also request that the flow route be re-configured,
1773 		 * if this is a connected mode flow.
1774 		 *
1775 		 */
1776 		if (!(fe->fe_flags & FLOWENTF_NONVIABLE)) {
1777 			/*
1778 			 * fsw_pending_nonviable is a hint for reaper thread;
1779 			 * due to the fact that setting fe_want_nonviable and
1780 			 * incrementing fsw_pending_nonviable counter is not
1781 			 * atomic, let the increment happen first, and the
1782 			 * thread losing the CAS does decrement.
1783 			 */
1784 			os_atomic_inc(&fsw->fsw_pending_nonviable, relaxed);
1785 			if (os_atomic_cmpxchg(&fe->fe_want_nonviable, 0, 1, acq_rel)) {
1786 				fsw_reap_sched(fsw);
1787 			} else {
1788 				os_atomic_dec(&fsw->fsw_pending_nonviable, relaxed);
1789 			}
1790 		}
1791 		if (fr != NULL) {
1792 			os_atomic_inc(&fr->fr_want_configure, relaxed);
1793 		}
1794 	}
1795 
1796 	/* if flow was (or is going to be) marked as nonviable, drop it */
1797 	if (__improbable(fe->fe_want_nonviable ||
1798 	    (fe->fe_flags & FLOWENTF_NONVIABLE) != 0)) {
1799 		SK_DF(SK_VERB_FSW_DP | SK_VERB_FLOW, "flow %p non-viable",
1800 		    SK_KVA(fe));
1801 		return false;
1802 	}
1803 	return true;
1804 }
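
/*
 * Illustrative note (added for exposition; not in the original source):
 * the nonviable marking above increments the reaper hint counter before
 * attempting the CAS, so the counter never under-counts while a request
 * is pending; the CAS loser simply undoes its increment.  The pattern in
 * isolation, mirroring the calls above:
 */
#if 0	/* hypothetical helper; for illustration only */
static void
mark_nonviable_sketch(struct nx_flowswitch *fsw, struct flow_entry *fe)
{
	/* publish the hint first ... */
	os_atomic_inc(&fsw->fsw_pending_nonviable, relaxed);
	if (os_atomic_cmpxchg(&fe->fe_want_nonviable, 0, 1, acq_rel)) {
		fsw_reap_sched(fsw);	/* we won the race; wake the reaper */
	} else {
		/* ... and the loser of the CAS decrements */
		os_atomic_dec(&fsw->fsw_pending_nonviable, relaxed);
	}
}
#endif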
1805 
1806 bool
1807 dp_flow_rx_route_process(struct nx_flowswitch *fsw, struct flow_entry *fe)
1808 {
1809 	bool okay;
1810 	okay = dp_flow_route_process(fsw, fe);
1811 #if (DEVELOPMENT || DEBUG)
1812 	if (okay) {
1813 		_FSW_INJECT_ERROR(5, okay, false, null_func);
1814 	}
1815 #endif /* DEVELOPMENT || DEBUG */
1816 
1817 	return okay;
1818 }
1819 
1820 void
1821 dp_flow_rx_process(struct nx_flowswitch *fsw, struct flow_entry *fe,
1822     struct pktq *rx_pkts, uint32_t rx_bytes, struct mbufq *host_mq,
1823     uint32_t flags)
1824 {
1825 #pragma unused(flags)
1826 	struct pktq dpkts;              /* dst pool alloc'ed packets */
1827 	struct pktq disposed_pkts;         /* done src packets */
1828 	struct pktq dropped_pkts;         /* dropped src packets */
1829 	struct pktq transferred_pkts;         /* dst packet ready for ring */
1830 	struct __kern_packet *pkt, *tpkt;
1831 	struct kern_pbufpool *dpp;
1832 	uint32_t n_pkts = KPKTQ_LEN(rx_pkts);
1833 	uint64_t buf_array[RX_BUFLET_BATCH_COUNT];
1834 	uint16_t buf_array_iter = 0;
1835 	uint32_t cnt, buf_cnt = 0;
1836 	int err;
1837 	drop_reason_t reason = DROP_REASON_UNSPECIFIED;
1838 	uint16_t line = 0;
1839 
1840 	KPKTQ_INIT(&dpkts);
1841 	KPKTQ_INIT(&dropped_pkts);
1842 	KPKTQ_INIT(&disposed_pkts);
1843 	KPKTQ_INIT(&transferred_pkts);
1844 
1845 	if (__improbable(!dp_flow_rx_route_process(fsw, fe))) {
1846 		SK_ERR("Rx route bad");
1847 		fsw_snoop_and_dequeue(fe, &dropped_pkts, rx_pkts, true);
1848 		FSW_STATS_ADD(FSW_STATS_RX_FLOW_NONVIABLE, n_pkts);
1849 		reason = DROP_REASON_FSW_FLOW_NONVIABLE;
1850 		line = __LINE__;
1851 		goto done;
1852 	}
1853 
1854 	if (fe->fe_nx_port == FSW_VP_HOST) {
1855 		/*
1856 		 * The host ring does not exist anymore so we can't take
1857 		 * the enqueue path below. This path should only be hit
1858 		 * for the rare tcp fragmentation case.
1859 		 */
1860 
1861 		fsw_host_rx_enqueue_mbq(fsw, rx_pkts, host_mq);
1862 		return;
1863 	}
1864 
1865 	/* find the ring */
1866 	struct __kern_channel_ring *r;
1867 	r = fsw_flow_get_rx_ring(fsw, fe);
1868 	if (__improbable(r == NULL)) {
1869 		fsw_snoop_and_dequeue(fe, &dropped_pkts, rx_pkts, true);
1870 		reason = DROP_REASON_FSW_RX_RING_NOT_FOUND;
1871 		line = __LINE__;
1872 		goto done;
1873 	}
1874 
1875 	/* snoop before L2 is stripped */
1876 	if (__improbable(pktap_total_tap_count != 0)) {
1877 		fsw_snoop(fsw, fe, rx_pkts, true);
1878 	}
1879 
1880 	dpp = r->ckr_pp;
1881 	/* batch allocate enough packets */
1882 	err = pp_alloc_pktq(dpp, 1, &dpkts, n_pkts, NULL, NULL,
1883 	    SKMEM_NOSLEEP);
1884 	if (__improbable(err == ENOMEM)) {
1885 		ASSERT(KPKTQ_EMPTY(&dpkts));
1886 		KPKTQ_CONCAT(&dropped_pkts, rx_pkts);
1887 		FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT, n_pkts);
1888 		SK_ERR("failed to alloc %u pkts for kr %s, %p", n_pkts,
1889 		    r->ckr_name, SK_KVA(r));
1890 		reason = DROP_REASON_FSW_PP_ALLOC_FAILED;
1891 		line = __LINE__;
1892 		goto done;
1893 	}
1894 
1895 	/*
1896 	 * estimate total number of buflets for the packet chain.
1897 	 */
1898 	cnt = howmany(rx_bytes, PP_BUF_SIZE_DEF(dpp));
1899 	if (cnt > n_pkts) {
1900 		ASSERT(dpp->pp_max_frags > 1);
1901 		cnt -= n_pkts;
1902 		buf_cnt = MIN(RX_BUFLET_BATCH_COUNT, cnt);
1903 		err = pp_alloc_buflet_batch(dpp, buf_array, &buf_cnt,
1904 		    SKMEM_NOSLEEP, false);
1905 		if (__improbable(buf_cnt == 0)) {
1906 			KPKTQ_CONCAT(&dropped_pkts, rx_pkts);
1907 			FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT, n_pkts);
1908 			SK_ERR("failed to alloc %d buflets (err %d) for kr %s %p",
1909 			    cnt, err, r->ckr_name, SK_KVA(r));
1910 			reason = DROP_REASON_FSW_PP_ALLOC_FAILED;
1911 			line = __LINE__;
1912 			goto done;
1913 		}
1914 		err = 0;
1915 	}
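	/*
	 * Illustrative note (worked example, assuming a 2KB
	 * PP_BUF_SIZE_DEF): rx_bytes = 46080 gives howmany() = 23 buflets;
	 * with n_pkts = 10 packets each already carrying one buflet, only
	 * 23 - 10 = 13 extras are batch-allocated, capped at
	 * RX_BUFLET_BATCH_COUNT.
	 */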
1916 
1917 	/* extra processing for user flow */
1918 	KPKTQ_FOREACH_SAFE(pkt, rx_pkts, tpkt) {
1919 		err = 0;
1920 		KPKTQ_REMOVE(rx_pkts, pkt);
1921 		if (rx_bytes > pkt->pkt_flow_ulen) {
1922 			rx_bytes -= pkt->pkt_flow_ulen;
1923 		} else {
1924 			rx_bytes = 0;
1925 		}
1926 		err = flow_pkt_track(fe, pkt, true);
1927 		_FSW_INJECT_ERROR(33, err, EPROTO, null_func);
1928 		if (__improbable(err != 0)) {
1929 			SK_DF(SK_VERB_FLOW_TRACK, "flow_pkt_track failed (err %d)", err);
1930 			FSW_STATS_INC(FSW_STATS_RX_FLOW_TRACK_ERR);
1931 			/* check whether we need to trigger an RST */
1932 			if (err == ENETRESET) {
1933 				flow_track_abort_tcp(fe, pkt, NULL);
1934 			}
1935 			dp_drop_pkt_single(fsw, pkt, 0, DROP_REASON_FSW_FLOW_TRACK_ERR,
1936 			    DROPTAP_FLAG_L2_MISSING);
1937 			continue;
1938 		}
1939 
1940 		/* transfer to dpkt */
1941 		if (pkt->pkt_qum.qum_pp != dpp) {
1942 			struct __kern_buflet *bprev, *bnew;
1943 			struct __kern_packet *dpkt = NULL;
1944 			uint32_t n_bufs, i;
1945 
1946 			KPKTQ_DEQUEUE(&dpkts, dpkt);
1947 			/* XXX Why would dpkt be NULL at this point? */
1948 			if (__improbable(dpkt == NULL)) {
1949 				FSW_STATS_INC(FSW_STATS_DROP_NOMEM_PKT);
1950 				dp_drop_pkt_single(fsw, pkt, 0,
1951 				    DROP_REASON_FSW_PP_ALLOC_FAILED, DROPTAP_FLAG_L2_MISSING);
1952 				continue;
1953 			}
1954 			n_bufs = howmany(pkt->pkt_length, PP_BUF_SIZE_DEF(dpp));
1955 			n_bufs--;
1956 			for (i = 0; i < n_bufs; i++) {
1957 				if (__improbable(buf_cnt == 0)) {
1958 					ASSERT(dpp->pp_max_frags > 1);
1959 					buf_array_iter = 0;
1960 					cnt = howmany(rx_bytes, PP_BUF_SIZE_DEF(dpp));
1961 					n_pkts = KPKTQ_LEN(rx_pkts);
1962 					if (cnt >= n_pkts) {
1963 						cnt -= n_pkts;
1964 					} else {
1965 						cnt = 0;
1966 					}
1967 					cnt += (n_bufs - i);
1968 					buf_cnt = MIN(RX_BUFLET_BATCH_COUNT,
1969 					    cnt);
1970 					cnt = buf_cnt;
1971 					err = pp_alloc_buflet_batch(dpp,
1972 					    buf_array, &buf_cnt,
1973 					    SKMEM_NOSLEEP, false);
1974 					if (__improbable(buf_cnt == 0)) {
1975 						FSW_STATS_INC(FSW_STATS_DROP_NOMEM_PKT);
1976 						dp_drop_pkt_single(fsw, pkt, 0,
1977 						    DROP_REASON_FSW_PP_ALLOC_FAILED,
1978 						    DROPTAP_FLAG_L2_MISSING);
1979 						pkt = NULL;
1980 						pp_free_packet_single(dpkt);
1981 						dpkt = NULL;
1982 						SK_ERR("failed to alloc %d "
1983 						    "buflets (err %d) for "
1984 						    "kr %s, %p", cnt, err,
1985 						    r->ckr_name, SK_KVA(r));
1986 						break;
1987 					}
1988 					err = 0;
1989 				}
1990 				ASSERT(buf_cnt != 0);
1991 				if (i == 0) {
1992 					PKT_GET_FIRST_BUFLET(dpkt, 1, bprev);
1993 				}
1994 				/*
1995 				 * XXX -fbounds-safety: can't avoid using forge
1996 				 * unless we change the signature of
1997 				 * pp_alloc_buflet_batch().
1998 				 */
1999 				bnew = __unsafe_forge_single(kern_buflet_t,
2000 				    buf_array[buf_array_iter]);
2001 				buf_array[buf_array_iter] = 0;
2002 				buf_array_iter++;
2003 				buf_cnt--;
2004 				VERIFY(kern_packet_add_buflet(SK_PKT2PH(dpkt),
2005 				    bprev, bnew) == 0);
2006 				bprev = bnew;
2007 			}
2008 			if (__improbable(err != 0)) {
2009 				continue;
2010 			}
2011 			err = copy_packet_from_dev(fsw, pkt, dpkt);
2012 			_FSW_INJECT_ERROR(43, err, EINVAL, null_func);
2013 			if (__improbable(err != 0)) {
2014 				SK_ERR("copy packet failed (err %d)", err);
2015 				dp_drop_pkt_single(fsw, pkt, 0,
2016 				    DROP_REASON_FSW_PKT_COPY_FAILED,
2017 				    DROPTAP_FLAG_L2_MISSING);
2018 				pp_free_packet_single(dpkt);
2019 				dpkt = NULL;
2020 				continue;
2021 			}
2022 			KPKTQ_ENQUEUE(&disposed_pkts, pkt);
2023 			pkt = dpkt;
2024 		}
2025 		_UUID_COPY(pkt->pkt_flow_id, fe->fe_uuid);
2026 		_UUID_COPY(pkt->pkt_policy_euuid, fe->fe_eproc_uuid);
2027 		pkt->pkt_policy_id = fe->fe_policy_id;
2028 		pkt->pkt_skip_policy_id = fe->fe_skip_policy_id;
2029 		pkt->pkt_transport_protocol = fe->fe_transport_protocol;
2030 		if (pkt->pkt_bufs_cnt > 1) {
2031 			pkt->pkt_aggr_type = PKT_AGGR_SINGLE_IP;
2032 		}
2033 		KPKTQ_ENQUEUE(&transferred_pkts, pkt);
2034 	}
2035 	KPKTQ_FINI(rx_pkts);
2036 
2037 	if (KPKTQ_LEN(&transferred_pkts) > 0) {
2038 		fsw_ring_enqueue_tail_drop(fsw, r, &transferred_pkts);
2039 	}
2040 	KPKTQ_FINI(&transferred_pkts);
2041 
2042 done:
2043 	/* Free unused buflets */
2044 	while (buf_cnt > 0) {
2045 		/*
2046 		 * XXX -fbounds-safety: can't avoid using forge unless we change
2047 		 * the signature of pp_alloc_buflet_batch().
2048 		 */
2049 		pp_free_buflet(dpp, __unsafe_forge_single(kern_buflet_t,
2050 		    (kern_buflet_t)(buf_array[buf_array_iter])));
2051 		buf_array[buf_array_iter] = 0;
2052 		buf_array_iter++;
2053 		buf_cnt--;
2054 	}
2055 	dp_free_pktq(fsw, &dpkts);
2056 	dp_free_pktq(fsw, &disposed_pkts);
2057 	dp_drop_pktq(fsw, &dropped_pkts, 0, reason, line, DROPTAP_FLAG_L2_MISSING);
2058 }
2059 
2060 static inline void
2061 rx_flow_process(struct nx_flowswitch *fsw, struct flow_entry *fe,
2062     struct flow_entry_list *fes, struct mbufq *host_mq)
2063 {
2064 	struct pktq rx_pkts;
2065 	uint32_t rx_bytes;
2066 	uint32_t rx_proc_flags;
2067 
2068 	ASSERT(!KPKTQ_EMPTY(&fe->fe_rx_pktq));
2069 	ASSERT(KPKTQ_LEN(&fe->fe_rx_pktq) != 0);
2070 
2071 	KPKTQ_INIT(&rx_pkts);
2072 	for (;;) {
2073 		lck_mtx_lock(&fe->fe_rx_pktq_lock);
2074 		if (KPKTQ_EMPTY(&fe->fe_rx_pktq)) {
2075 			fe->fe_rx_worker_tid = 0;
2076 			TAILQ_REMOVE(fes, fe, fe_rx_link);
2077 			lck_mtx_unlock(&fe->fe_rx_pktq_lock);
2078 			break;
2079 		}
2080 		KPKTQ_CONCAT(&rx_pkts, &fe->fe_rx_pktq);
2081 		KPKTQ_DISPOSE(&fe->fe_rx_pktq);
2082 		rx_bytes = fe->fe_rx_pktq_bytes;
2083 		rx_proc_flags = fe->fe_rx_frag_count ? FLOW_PROC_FLAG_FRAGMENTS : 0;
2084 		fe->fe_rx_pktq_bytes = 0;
2085 		fe->fe_rx_frag_count = 0;
2086 		lck_mtx_unlock(&fe->fe_rx_pktq_lock);
2087 		SK_DF(SK_VERB_FSW_DP | SK_VERB_RX, "Rx %d pkts for fe %p port %d",
2088 		    KPKTQ_LEN(&rx_pkts), fe, fe->fe_nx_port);
2089 		/* flow related processing (default, agg, fpd, etc.) */
2090 		fe->fe_rx_process(fsw, fe, &rx_pkts, rx_bytes, host_mq, rx_proc_flags);
2091 	}
2092 	ASSERT(KPKTQ_EMPTY(&rx_pkts));
2093 
2094 	if (__improbable(fe->fe_want_withdraw)) {
2095 		fsw_reap_sched(fsw);
2096 	}
2097 }
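
/*
 * Illustrative note: the loop above holds fe_rx_pktq_lock only long
 * enough to steal the shared queue into a local pktq; processing then
 * runs unlocked, and packets that arrive meanwhile are picked up on the
 * next pass.  The worker detaches (fe_rx_worker_tid = 0) only once the
 * queue is observed empty under the lock, so no enqueued packet is
 * stranded.
 */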
2098 
2099 static void
2100 dp_rx_process_low_power_wake(struct nx_flowswitch *fsw, struct flow_entry *fe)
2101 {
2102 	if (fe->fe_port_reservation == NULL || (fe->fe_flags & FLOWENTF_EXTRL_PORT) != 0) {
2103 		return;
2104 	}
2105 	if (fe->fe_key.fk_proto == IPPROTO_TCP && (fe->fe_flags & FLOWENTF_CONNECTION_IDLE)) {
2106 		os_log(wake_packet_log_handle, "dp_rx_process_low_power_wake LPW TCP connection idle");
2107 
2108 		if (flow_track_tcp_want_abort(fe)) {
2109 			os_atomic_or(&fe->fe_flags, FLOWENTF_CLOSE_NOTIFY | FLOWENTF_WAIT_CLOSE, relaxed);
2110 			fe->fe_want_withdraw = 1;
2111 			flow_track_abort_tcp(fe, NULL, NULL);
2112 		}
2113 	} else {
2114 		if_exit_lpw(fsw->fsw_ifp, "dp_rx_process_low_power_wake LPW connection not idle");
2115 	}
2116 }
2117 
2118 static inline void
2119 dp_rx_process_wake_packet(struct nx_flowswitch *fsw, struct flow_entry *fe, struct __kern_packet *pkt)
2120 {
2121 	/*
2122 	 * We only care about wake packets of flows that belong to the flow
2123 	 * switch, as wake packets for the host stack are handled by the host
2124 	 * input function.
2125 	 */
2126 
2127 #if (DEBUG || DEVELOPMENT)
2128 	/* For testing only */
2129 	if (__improbable(fsw->fsw_ifp->if_xflags & IFXF_MARK_WAKE_PKT)) {
2130 		if (check_wake_pkt(fsw->fsw_ifp, pkt) == true) {
2131 			/*
2132 			 * This is a one shot command
2133 			 */
2134 			fsw->fsw_ifp->if_xflags &= ~IFXF_MARK_WAKE_PKT;
2135 
2136 			pkt->pkt_pflags |= PKT_F_WAKE_PKT;
2137 		}
2138 	}
2139 #endif /* (DEBUG || DEVELOPMENT) */
2140 
2141 	if (__improbable(pkt->pkt_pflags & PKT_F_WAKE_PKT)) {
2142 		if_ports_used_match_pkt(fsw->fsw_ifp, pkt);
2143 
2144 		/*
2145 		 * When a packet is received in LPW mode for an idle TCP connection, the connection
2146 		 * is aborted immediately with a RST so the peer drops the connection at once
2147 		 */
2148 		if (if_is_lpw_enabled(fsw->fsw_ifp)) {
2149 			pkt->pkt_pflags |= __PKT_F_LPW;
2150 			dp_rx_process_low_power_wake(fsw, fe);
2151 		}
2152 	}
2153 }
2154 
2155 static void
2156 _fsw_receive(struct nx_flowswitch *fsw, struct pktq *pktq)
2157 {
2158 	struct __kern_packet *__single pkt, *__single tpkt;
2159 	struct flow_entry_list fes = TAILQ_HEAD_INITIALIZER(fes);
2160 	struct flow_entry *__single fe, *__single prev_fe;
2161 	sa_family_t af;
2162 	struct pktq host_pkts, dropped_pkts;
2163 	drop_reason_t reason = DROP_REASON_UNSPECIFIED;
2164 	uint16_t line = 0;
2165 	int err;
2166 	uint64_t thread_id;
2167 	struct mbufq host_mq;
2168 	struct ifnet *ifp;
2169 
2170 	mbufq_init(&host_mq);
2171 	KPKTQ_INIT(&host_pkts);
2172 	KPKTQ_INIT(&dropped_pkts);
2173 
2174 	FSW_RLOCK(fsw);
2175 
2176 	if (__improbable(FSW_QUIESCED(fsw))) {
2177 		DTRACE_SKYWALK1(rx__quiesced, struct nx_flowswitch *, fsw);
2178 		KPKTQ_CONCAT(&dropped_pkts, pktq);
2179 		reason = DROP_REASON_FSW_QUIESCED;
2180 		line = __LINE__;
2181 		goto done;
2182 	}
2183 	if (__improbable(fsw->fsw_demux == NULL)) {
2184 		KPKTQ_CONCAT(&dropped_pkts, pktq);
2185 		reason = DROP_REASON_FSW_DEMUX_FAILED;
2186 		line = __LINE__;
2187 		goto done;
2188 	}
2189 
2190 	ifp = fsw->fsw_ifp;
2191 	thread_id = thread_tid(current_thread());
2192 	prev_fe = NULL;
2193 	KPKTQ_FOREACH_SAFE(pkt, pktq, tpkt) {
2194 		if (__probable(tpkt)) {
2195 			void *baddr;
2196 			MD_BUFLET_ADDR_ABS_PKT(tpkt, baddr);
2197 			SK_PREFETCH(baddr, 0);
2198 			/* prefetch L3 and L4 flow structs */
2199 			SK_PREFETCHW(tpkt->pkt_flow, 0);
2200 			SK_PREFETCHW(tpkt->pkt_flow, 128);
2201 		}
2202 
2203 		KPKTQ_REMOVE(pktq, pkt);
2204 
2205 		pkt = rx_prepare_packet(fsw, pkt);
2206 
2207 		af = fsw->fsw_demux(fsw, pkt);
2208 		if (__improbable(af == AF_UNSPEC)) {
2209 			KPKTQ_ENQUEUE(&host_pkts, pkt);
2210 			continue;
2211 		}
2212 
2213 		err = flow_pkt_classify(pkt, fsw->fsw_ifp, af, TRUE);
2214 		_FSW_INJECT_ERROR(1, err, ENXIO, null_func);
2215 		if (__improbable(err != 0)) {
2216 			FSW_STATS_INC(FSW_STATS_RX_FLOW_EXTRACT_ERR);
2217 			KPKTQ_ENQUEUE(&host_pkts, pkt);
2218 			continue;
2219 		}
2220 
2221 		if (__improbable(pkt->pkt_flow_ip_is_frag)) {
2222 			pkt = rx_process_ip_frag(fsw, pkt);
2223 			if (pkt == NULL) {
2224 				continue;
2225 			}
2226 		}
2227 
2228 		prev_fe = fe = rx_lookup_flow(fsw, pkt, prev_fe);
2229 		if (__improbable(fe == NULL)) {
2230 			KPKTQ_ENQUEUE_LIST(&host_pkts, pkt);
2231 			continue;
2232 		}
2233 
2234 		dp_rx_process_wake_packet(fsw, fe, pkt);
2235 
2236 		rx_flow_batch_packets(&fes, fe, pkt, thread_id);
2237 		prev_fe = fe;
2238 	}
2239 
2240 	struct flow_entry *tfe = NULL;
2241 	TAILQ_FOREACH_SAFE(fe, &fes, fe_rx_link, tfe) {
2242 		rx_flow_process(fsw, fe, &fes, &host_mq);
2243 		flow_entry_release(&fe);
2244 	}
2245 
2246 	if (!KPKTQ_EMPTY(&host_pkts)) {
2247 		fsw_host_rx_cb(fsw, &host_pkts);
2248 		fsw_host_rx_enqueue_mbq(fsw, &host_pkts, &host_mq);
2249 	}
2250 
2251 done:
2252 	dp_drop_pktq(fsw, &dropped_pkts, 0, reason, line, 0);
2253 	FSW_RUNLOCK(fsw);
2254 
2255 	fsw_host_sendup(ifp, &host_mq);
2256 }
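
/*
 * Illustrative note: per packet, the path above runs demux (address
 * family) -> flow classify -> IP fragment processing -> flow lookup
 * (with the prev_fe fast path for back-to-back packets of one flow) ->
 * wake-packet handling -> per-flow-entry batching; packets that fail
 * demux, classification, or lookup fall through to the host-stack path.
 */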
2257 
2258 #if (DEVELOPMENT || DEBUG)
2259 static void
2260 fsw_rps_rx(struct nx_flowswitch *fsw, uint32_t id,
2261     struct __kern_packet *pkt)
2262 {
2263 	struct fsw_rps_thread *frt = &fsw->fsw_rps_threads[id];
2264 
2265 	lck_mtx_lock_spin(&frt->frt_lock);
2266 	KPKTQ_ENQUEUE(&frt->frt_pktq, pkt);
2267 	lck_mtx_unlock(&frt->frt_lock);
2268 }
2269 
2270 static void
2271 fsw_rps_thread_schedule(struct nx_flowswitch *fsw, uint32_t id)
2272 {
2273 	struct fsw_rps_thread *frt = &fsw->fsw_rps_threads[id];
2274 
2275 	ASSERT(frt->frt_thread != THREAD_NULL);
2276 	lck_mtx_lock_spin(&frt->frt_lock);
2277 	ASSERT(!(frt->frt_flags & (FRT_TERMINATING | FRT_TERMINATED)));
2278 
2279 	frt->frt_requests++;
2280 	if (!(frt->frt_flags & FRT_RUNNING)) {
2281 		thread_wakeup((caddr_t)frt);
2282 	}
2283 	lck_mtx_unlock(&frt->frt_lock);
2284 }
2285 
2286 __attribute__((noreturn))
2287 static void
2288 fsw_rps_thread_cont(void *v, wait_result_t w)
2289 {
2290 	struct fsw_rps_thread *__single frt = v;
2291 	struct nx_flowswitch *fsw = frt->frt_fsw;
2292 
2293 	lck_mtx_lock(&frt->frt_lock);
2294 	if (__improbable(w == THREAD_INTERRUPTIBLE ||
2295 	    (frt->frt_flags & FRT_TERMINATING) != 0)) {
2296 		goto terminate;
2297 	}
2298 	if (KPKTQ_EMPTY(&frt->frt_pktq)) {
2299 		goto done;
2300 	}
2301 	frt->frt_flags |= FRT_RUNNING;
2302 
2303 	for (;;) {
2304 		uint32_t requests = frt->frt_requests;
2305 		struct pktq pkts;
2306 
2307 		KPKTQ_INIT(&pkts);
2308 		KPKTQ_CONCAT(&pkts, &frt->frt_pktq);
2309 		lck_mtx_unlock(&frt->frt_lock);
2310 
2311 		sk_protect_t protect;
2312 		protect = sk_sync_protect();
2313 		_fsw_receive(fsw, &pkts);
2314 		sk_sync_unprotect(protect);
2315 
2316 		lck_mtx_lock(&frt->frt_lock);
2317 		if ((frt->frt_flags & FRT_TERMINATING) != 0 ||
2318 		    requests == frt->frt_requests) {
2319 			frt->frt_requests = 0;
2320 			break;
2321 		}
2322 	}
2323 
2324 done:
2325 	lck_mtx_unlock(&frt->frt_lock);
2326 	if (!(frt->frt_flags & FRT_TERMINATING)) {
2327 		frt->frt_flags &= ~FRT_RUNNING;
2328 		assert_wait(frt, THREAD_UNINT);
2329 		thread_block_parameter(fsw_rps_thread_cont, frt);
2330 		__builtin_unreachable();
2331 	} else {
2332 terminate:
2333 		LCK_MTX_ASSERT(&frt->frt_lock, LCK_MTX_ASSERT_OWNED);
2334 		frt->frt_flags &= ~(FRT_RUNNING | FRT_TERMINATING);
2335 		frt->frt_flags |= FRT_TERMINATED;
2336 
2337 		if (frt->frt_flags & FRT_TERMINATEBLOCK) {
2338 			thread_wakeup((caddr_t)&frt->frt_thread);
2339 		}
2340 		lck_mtx_unlock(&frt->frt_lock);
2341 
2342 		SK_D("fsw_rx_%s_%d terminated", if_name(fsw->fsw_ifp),
2343 		    frt->frt_idx);
2344 
2345 		/* for the extra refcnt from kernel_thread_start() */
2346 		thread_deallocate(current_thread());
2347 		/* this is the end */
2348 		thread_terminate(current_thread());
2349 		/* NOTREACHED */
2350 		__builtin_unreachable();
2351 	}
2352 
2353 	/* must never get here */
2354 	VERIFY(0);
2355 	/* NOTREACHED */
2356 	__builtin_unreachable();
2357 }
2358 
2359 __attribute__((noreturn))
2360 static void
2361 fsw_rps_thread_func(void *v, wait_result_t w)
2362 {
2363 #pragma unused(w)
2364 	struct fsw_rps_thread *__single frt = v;
2365 	struct nx_flowswitch *fsw = frt->frt_fsw;
2366 	const char *__null_terminated tname = NULL;
2367 
2368 	char thread_name[MAXTHREADNAMESIZE];
2369 	bzero(thread_name, sizeof(thread_name));
2370 	tname = tsnprintf(thread_name, sizeof(thread_name), "fsw_rx_%s_%d",
2371 	    if_name(fsw->fsw_ifp), frt->frt_idx);
2372 
2373 	thread_set_thread_name(frt->frt_thread, tname);
2374 	SK_D("%s spawned", tname);
2375 
2376 	net_thread_marks_push(NET_THREAD_SYNC_RX);
2377 	assert_wait(frt, THREAD_UNINT);
2378 	(void) thread_block_parameter(fsw_rps_thread_cont, frt);
2379 
2380 	__builtin_unreachable();
2381 }
2382 
2383 static void
2384 fsw_rps_thread_join(struct nx_flowswitch *fsw, uint32_t i)
2385 {
2386 	struct fsw_rps_thread *frt = &fsw->fsw_rps_threads[i];
2387 	uint64_t f = (1 * NSEC_PER_MSEC);
2388 	uint64_t s = (1000 * NSEC_PER_SEC);
2389 	uint32_t c = 0;
2390 
2391 	lck_mtx_lock(&frt->frt_lock);
2392 	frt->frt_flags |= FRT_TERMINATING;
2393 
2394 	while (!(frt->frt_flags & FRT_TERMINATED)) {
2395 		uint64_t t = 0;
2396 		nanoseconds_to_absolutetime((c++ == 0) ? f : s, &t);
2397 		clock_absolutetime_interval_to_deadline(t, &t);
2398 		ASSERT(t != 0);
2399 
2400 		frt->frt_flags |= FRT_TERMINATEBLOCK;
2401 		if (!(frt->frt_flags & FRT_RUNNING)) {
2402 			thread_wakeup_one((caddr_t)frt);
2403 		}
2404 		(void) assert_wait_deadline(&frt->frt_thread, THREAD_UNINT, t);
2405 		lck_mtx_unlock(&frt->frt_lock);
2406 		thread_block(THREAD_CONTINUE_NULL);
2407 		lck_mtx_lock(&frt->frt_lock);
2408 		frt->frt_flags &= ~FRT_TERMINATEBLOCK;
2409 	}
2410 	ASSERT(frt->frt_flags & FRT_TERMINATED);
2411 	lck_mtx_unlock(&frt->frt_lock);
2412 	frt->frt_thread = THREAD_NULL;
2413 }
2414 
2415 static void
2416 fsw_rps_thread_spawn(struct nx_flowswitch *fsw, uint32_t i)
2417 {
2418 	kern_return_t error;
2419 	struct fsw_rps_thread *frt = &fsw->fsw_rps_threads[i];
2420 
2421 	lck_mtx_init(&frt->frt_lock, &nexus_lock_group, &nexus_lock_attr);
2422 	frt->frt_idx = i;
2423 	frt->frt_fsw = fsw;
2424 	error = kernel_thread_start(fsw_rps_thread_func, frt, &frt->frt_thread);
2425 	ASSERT(!error);
2426 	KPKTQ_INIT(&frt->frt_pktq);
2427 }
2428 
2429 int
2430 fsw_rps_set_nthreads(struct nx_flowswitch *fsw, uint32_t n)
2431 {
2432 	if (n > FSW_RPS_MAX_NTHREADS) {
2433 		SK_ERR("rps nthreads %d, max %d", n, FSW_RPS_MAX_NTHREADS);
2434 		return EINVAL;
2435 	}
2436 
2437 	FSW_WLOCK(fsw);
2438 	if (n < fsw->fsw_rps_nthreads) {
2439 		for (uint32_t i = n; i < fsw->fsw_rps_nthreads; i++) {
2440 			fsw_rps_thread_join(fsw, i);
2441 		}
2442 		fsw->fsw_rps_threads = krealloc_type(struct fsw_rps_thread,
2443 		    fsw->fsw_rps_nthreads, n, fsw->fsw_rps_threads, Z_WAITOK | Z_ZERO | Z_NOFAIL);
2444 		fsw->fsw_rps_nthreads = n;
2445 	} else if (n > fsw->fsw_rps_nthreads) {
2446 		uint32_t nthreads_old = fsw->fsw_rps_nthreads;
2447 
2448 		fsw->fsw_rps_threads = krealloc_type(struct fsw_rps_thread,
2449 		    fsw->fsw_rps_nthreads, n, fsw->fsw_rps_threads, Z_WAITOK | Z_ZERO | Z_NOFAIL);
2450 		fsw->fsw_rps_nthreads = n;
2451 		for (uint32_t i = nthreads_old; i < n; i++) {
2452 			fsw_rps_thread_spawn(fsw, i);
2453 		}
2454 	}
2455 	FSW_WUNLOCK(fsw);
2456 	return 0;
2457 }
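
/*
 * Illustrative note (added for exposition; not in the original source):
 * a hypothetical caller, e.g. a sysctl handler, would grow and then tear
 * down the RPS worker pool like this:
 */
#if 0	/* hypothetical usage; for illustration only */
static int
fsw_rps_resize_sketch(struct nx_flowswitch *fsw)
{
	int err;

	err = fsw_rps_set_nthreads(fsw, 4);	/* spawns workers [old, 4) */
	if (err != 0) {
		return err;
	}
	return fsw_rps_set_nthreads(fsw, 0);	/* joins all workers */
}
#endif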
2458 
2459 static uint32_t
2460 get_rps_id(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
2461 {
2462 	sa_family_t af = fsw->fsw_demux(fsw, pkt);
2463 	if (__improbable(af == AF_UNSPEC)) {
2464 		return 0;
2465 	}
2466 
2467 	flow_pkt_classify(pkt, fsw->fsw_ifp, af, true);
2468 
2469 	if (__improbable((pkt->pkt_qum_qflags &
2470 	    QUM_F_FLOW_CLASSIFIED) == 0)) {
2471 		return 0;
2472 	}
2473 
2474 	struct flow_key key;
2475 	flow_pkt2key(pkt, true, &key);
2476 	key.fk_mask = FKMASK_5TUPLE;
2477 
2478 	uint32_t id = flow_key_hash(&key) % fsw->fsw_rps_nthreads;
2479 
2480 	return id;
2481 }
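
/*
 * Illustrative note: steering on the 5-tuple hash pins every packet of a
 * given flow to one RPS thread, e.g. with fsw_rps_nthreads = 4, a flow
 * hashing to 29 always lands on thread 29 % 4 = 1, preserving per-flow
 * ordering across the worker pool.
 */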
2482 
2483 #endif /* DEVELOPMENT || DEBUG */
2484 
2485 void
2486 fsw_receive(struct nx_flowswitch *fsw, struct pktq *pktq)
2487 {
2488 #if (DEVELOPMENT || DEBUG)
2489 	FSW_RLOCK(fsw);
2490 	if (fsw->fsw_rps_nthreads != 0) {
2491 		struct __kern_packet *pkt, *tpkt;
2492 		bitmap_t map = 0;
2493 
2494 		static_assert(BITMAP_LEN(FSW_RPS_MAX_NTHREADS) == 1);
2495 		KPKTQ_FOREACH_SAFE(pkt, pktq, tpkt) {
2496 			uint32_t id = get_rps_id(fsw, pkt);
2497 			KPKTQ_REMOVE(pktq, pkt);
2498 			fsw_rps_rx(fsw, id, pkt);
2499 			bitmap_set(&map, id);
2500 		}
2501 		for (int i = bitmap_first(&map, 64); i >= 0;
2502 		    i = bitmap_next(&map, i)) {
2503 			fsw_rps_thread_schedule(fsw, i);
2504 		}
2505 		FSW_RUNLOCK(fsw);
2506 	} else
2507 #endif /* !DEVELOPMENT && !DEBUG */
2508 	{
2509 #if (DEVELOPMENT || DEBUG)
2510 		FSW_RUNLOCK(fsw);
2511 #endif /* !DEVELOPMENT && !DEBUG */
2512 		_fsw_receive(fsw, pktq);
2513 	}
2514 }
2515 
2516 int
2517 fsw_dev_input_netem_dequeue(void *handle,
2518     pktsched_pkt_t *__counted_by(n_pkts)pkts, uint32_t n_pkts)
2519 {
2521 	struct nx_flowswitch *__single fsw = handle;
2522 	struct __kern_packet *kpkts[FSW_VP_DEV_BATCH_MAX];
2523 	struct pktq pktq;
2524 	sk_protect_t protect;
2525 	uint32_t i;
2526 
2527 	ASSERT(n_pkts <= FSW_VP_DEV_BATCH_MAX);
2528 
2529 	for (i = 0; i < n_pkts; i++) {
2530 		ASSERT(pkts[i].pktsched_ptype == QP_PACKET);
2531 		ASSERT(pkts[i].pktsched_pkt_kpkt != NULL);
2532 		kpkts[i] = pkts[i].pktsched_pkt_kpkt;
2533 	}
2534 
2535 	protect = sk_sync_protect();
2536 	KPKTQ_INIT(&pktq);
2537 	pkts_to_pktq(kpkts, n_pkts, &pktq);
2538 
2539 	fsw_receive(fsw, &pktq);
2540 	KPKTQ_FINI(&pktq);
2541 	sk_sync_unprotect(protect);
2542 
2543 	return 0;
2544 }
2545 
2546 static void
2547 fsw_dev_input_netem_enqueue(struct nx_flowswitch *fsw, struct pktq *q)
2548 {
2549 	classq_pkt_t p;
2550 	struct netem *__single ne;
2551 	struct __kern_packet *pkt, *tpkt;
2552 
2553 	ASSERT(fsw->fsw_ifp != NULL);
2554 	ne = fsw->fsw_ifp->if_input_netem;
2555 	ASSERT(ne != NULL);
2556 	KPKTQ_FOREACH_SAFE(pkt, q, tpkt) {
2557 		bool pdrop;
2558 		KPKTQ_REMOVE(q, pkt);
2559 		CLASSQ_PKT_INIT_PACKET(&p, pkt);
2560 		netem_enqueue(ne, &p, &pdrop);
2561 	}
2562 }
2563 
2564 void
2565 fsw_devna_rx(struct nexus_adapter *devna, struct __kern_packet *pkt_head,
2566     struct nexus_pkt_stats *out_stats)
2567 {
2568 	struct __kern_packet *pkt = pkt_head, *next;
2569 	struct nx_flowswitch *fsw;
2570 	uint32_t n_bytes = 0, n_pkts = 0;
2571 	uint64_t total_pkts = 0, total_bytes = 0;
2572 	struct pktq q;
2573 
2574 	KPKTQ_INIT(&q);
2575 	if (__improbable(devna->na_ifp == NULL ||
2576 	    (fsw = fsw_ifp_to_fsw(devna->na_ifp)) == NULL)) {
2577 		SK_ERR("fsw not attached, dropping pkt chain");
2578 		dp_drop_pkt_chain(pkt_head, 0, DROP_REASON_FSW_QUIESCED, DROPTAP_FLAG_L2_MISSING);
2579 		return;
2580 	}
2581 	while (pkt != NULL) {
2582 		if (__improbable(pkt->pkt_trace_id != 0)) {
2583 			KDBG(SK_KTRACE_PKT_RX_DRV | DBG_FUNC_END, pkt->pkt_trace_id);
2584 			KDBG(SK_KTRACE_PKT_RX_FSW | DBG_FUNC_START, pkt->pkt_trace_id);
2585 		}
2586 		next = pkt->pkt_nextpkt;
2587 		pkt->pkt_nextpkt = NULL;
2588 
2589 		if (__probable((pkt->pkt_qum_qflags & QUM_F_DROPPED) == 0)) {
2590 			KPKTQ_ENQUEUE(&q, pkt);
2591 			n_bytes += pkt->pkt_length;
2592 		} else {
2593 			DTRACE_SKYWALK1(non__finalized__drop,
2594 			    struct __kern_packet *, pkt);
2595 			FSW_STATS_INC(FSW_STATS_RX_PKT_NOT_FINALIZED);
2596 			dp_drop_pkt_single(fsw, pkt, 0,
2597 			    DROP_REASON_FSW_RX_PKT_NOT_FINALIZED,
2598 			    DROPTAP_FLAG_L2_MISSING);
2599 			pkt = NULL;
2600 		}
2601 		n_pkts = KPKTQ_LEN(&q);
2602 		if (n_pkts == fsw_rx_batch || (next == NULL && n_pkts > 0)) {
2603 			if (__improbable(fsw->fsw_ifp->if_input_netem != NULL)) {
2604 				fsw_dev_input_netem_enqueue(fsw, &q);
2605 			} else {
2606 				fsw_receive(fsw, &q);
2607 			}
2608 			total_pkts += n_pkts;
2609 			total_bytes += n_bytes;
2610 			n_pkts = 0;
2611 			n_bytes = 0;
2612 			KPKTQ_FINI(&q);
2613 		}
2614 		pkt = next;
2615 	}
2616 	ASSERT(KPKTQ_LEN(&q) == 0);
2617 	FSW_STATS_ADD(FSW_STATS_RX_PACKETS, total_pkts);
2618 	if (out_stats != NULL) {
2619 		out_stats->nps_pkts += total_pkts;
2620 		out_stats->nps_bytes += total_bytes;
2621 	}
2622 	KDBG(SK_KTRACE_FSW_DEV_RING_FLUSH, SK_KVA(devna), total_pkts, total_bytes);
2623 }
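
/*
 * Illustrative note: with fsw_rx_batch = 32, a 100-packet chain is
 * flushed to fsw_receive() (or the netem path) as batches of 32, 32, 32
 * and a final 4 via the `next == NULL && n_pkts > 0` case, bounding
 * per-call queue length while amortizing per-batch costs.
 */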
2624 
2625 static int
2626 dp_copy_to_dev_mbuf(struct nx_flowswitch *fsw, struct __kern_packet *spkt,
2627     struct __kern_packet *dpkt)
2628 {
2629 	struct mbuf *__single m = NULL;
2630 	uint32_t bdlen, bdlim, bdoff;
2631 	uint8_t *bdaddr;
2632 	unsigned int one = 1;
2633 	int err = 0;
2634 
2635 	err = mbuf_allocpacket(MBUF_DONTWAIT,
2636 	    (fsw->fsw_frame_headroom + spkt->pkt_length), &one, &m);
2637 #if (DEVELOPMENT || DEBUG)
2638 	if (m != NULL) {
2639 		_FSW_INJECT_ERROR(11, m, NULL, m_freem, m);
2640 	}
2641 #endif /* DEVELOPMENT || DEBUG */
2642 	if (__improbable(m == NULL)) {
2643 		FSW_STATS_INC(FSW_STATS_DROP_NOMEM_MBUF);
2644 		err = ENOBUFS;
2645 		goto done;
2646 	}
2647 
2648 	MD_BUFLET_ADDR_ABS_DLEN(dpkt, bdaddr, bdlen, bdlim, bdoff);
2649 	if (fsw->fsw_frame_headroom > bdlim) {
2650 		SK_ERR("not enough space in buffer for headroom");
		m_freem(m);	/* free the freshly allocated mbuf; nothing owns it yet */
2651 		err = EINVAL;
2652 		goto done;
2653 	}
2654 
2655 	dpkt->pkt_headroom = fsw->fsw_frame_headroom;
2656 	dpkt->pkt_mbuf = m;
2657 	dpkt->pkt_pflags |= PKT_F_MBUF_DATA;
2658 
2659 	/* packet copy into mbuf */
2660 	fsw->fsw_pkt_copy_to_mbuf(NR_TX, SK_PTR_ENCODE(spkt,
2661 	    METADATA_TYPE(spkt), METADATA_SUBTYPE(spkt)), 0, m,
2662 	    fsw->fsw_frame_headroom, spkt->pkt_length,
2663 	    PACKET_HAS_PARTIAL_CHECKSUM(spkt),
2664 	    spkt->pkt_csum_tx_start_off);
2665 	FSW_STATS_INC(FSW_STATS_TX_COPY_PKT2MBUF);
2666 
2667 	/* header copy into dpkt buffer for classification */
2668 	kern_packet_t sph = SK_PTR_ENCODE(spkt,
2669 	    METADATA_TYPE(spkt), METADATA_SUBTYPE(spkt));
2670 	kern_packet_t dph = SK_PTR_ENCODE(dpkt,
2671 	    METADATA_TYPE(dpkt), METADATA_SUBTYPE(dpkt));
2672 	uint32_t copy_len = MIN(spkt->pkt_length, bdlim - dpkt->pkt_headroom);
2673 	fsw->fsw_pkt_copy_from_pkt(NR_TX, dph, dpkt->pkt_headroom,
2674 	    sph, spkt->pkt_headroom, copy_len, FALSE, 0, 0, 0);
2675 	if (copy_len < spkt->pkt_length) {
2676 		dpkt->pkt_pflags |= PKT_F_TRUNCATED;
2677 	}
2678 
2679 	/*
2680 	 * fsw->fsw_frame_headroom is after m_data, so we treat m_data the same
2681 	 * as the buflet baddr; m_data always points to the beginning of the
2682 	 * packet and should represent the same location as baddr + headroom.
2683 	 */
2684 	ASSERT((uintptr_t)m->m_data ==
2685 	    ((uintptr_t)mbuf_datastart(m) + fsw->fsw_frame_headroom));
2686 
2687 done:
2688 	return err;
2689 }
2690 
2691 static int
2692 dp_copy_to_dev_pkt(struct nx_flowswitch *fsw, struct __kern_packet *spkt,
2693     struct __kern_packet *dpkt)
2694 {
2695 	struct ifnet *ifp = fsw->fsw_ifp;
2696 	uint16_t headroom = fsw->fsw_frame_headroom + ifp->if_tx_headroom;
2697 
2698 	if (headroom > UINT8_MAX) {
2699 		SK_ERR("headroom too large %d", headroom);
2700 		return ERANGE;
2701 	}
2702 	dpkt->pkt_headroom = (uint8_t)headroom;
2703 	ASSERT((dpkt->pkt_headroom & 0x7) == 0);
2704 	dpkt->pkt_l2_len = 0;
2705 	dpkt->pkt_link_flags = spkt->pkt_link_flags;
2706 
2707 	kern_packet_t sph = SK_PTR_ENCODE(spkt,
2708 	    METADATA_TYPE(spkt), METADATA_SUBTYPE(spkt));
2709 	kern_packet_t dph = SK_PTR_ENCODE(dpkt,
2710 	    METADATA_TYPE(dpkt), METADATA_SUBTYPE(dpkt));
2711 	fsw->fsw_pkt_copy_from_pkt(NR_TX, dph,
2712 	    dpkt->pkt_headroom, sph, spkt->pkt_headroom,
2713 	    spkt->pkt_length, PACKET_HAS_PARTIAL_CHECKSUM(spkt),
2714 	    (spkt->pkt_csum_tx_start_off - spkt->pkt_headroom),
2715 	    (spkt->pkt_csum_tx_stuff_off - spkt->pkt_headroom),
2716 	    (spkt->pkt_csum_flags & PACKET_CSUM_ZERO_INVERT));
2717 
2718 	FSW_STATS_INC(FSW_STATS_TX_COPY_PKT2PKT);
2719 
2720 	return 0;
2721 }
2722 
2723 #if SK_LOG
2724 /* Hoisted out of line to reduce kernel stack footprint */
2725 SK_LOG_ATTRIBUTE
2726 static void
2727 dp_copy_to_dev_log(struct nx_flowswitch *fsw, const struct kern_pbufpool *pp,
2728     struct __kern_packet *spkt, struct __kern_packet *dpkt, int error)
2729 {
2730 	struct proc *p = current_proc();
2731 	struct ifnet *ifp = fsw->fsw_ifp;
2732 	uint64_t logflags = (SK_VERB_FSW_DP | SK_VERB_TX);
2733 
2734 	if (error == ERANGE) {
2735 		SK_ERR("packet too long, hr(fr+tx)+slen (%u+%u)+%u > "
2736 		    "dev_pp_max %u", (uint32_t)fsw->fsw_frame_headroom,
2737 		    (uint32_t)ifp->if_tx_headroom, spkt->pkt_length,
2738 		    (uint32_t)pp->pp_max_frags * PP_BUF_SIZE_DEF(pp));
2739 	} else if (error == ENOBUFS) {
2740 		SK_DF(logflags, "%s(%d) packet allocation failure",
2741 		    sk_proc_name(p), sk_proc_pid(p));
2742 	} else if (error == 0) {
2743 		ASSERT(dpkt != NULL);
2744 		char *daddr;
2745 		uint32_t pkt_len;
2746 
2747 		MD_BUFLET_ADDR_ABS(dpkt, daddr);
2748 		pkt_len = __packet_get_real_data_length(dpkt);
2749 		SK_DF(logflags, "%s(%d) splen %u dplen %u hr %u (fr/tx %u/%u)",
2750 		    sk_proc_name(p), sk_proc_pid(p), spkt->pkt_length,
2751 		    dpkt->pkt_length, (uint32_t)dpkt->pkt_headroom,
2752 		    (uint32_t)fsw->fsw_frame_headroom,
2753 		    (uint32_t)ifp->if_tx_headroom);
2754 		SK_DF(logflags | SK_VERB_DUMP, "%s",
2755 		    sk_dump("buf", daddr, pkt_len, 128));
2756 	} else {
2757 		SK_DF(logflags, "%s(%d) error %d", sk_proc_name(p),
2758 		    sk_proc_pid(p), error);
2759 	}
2760 }
2761 #else
2762 #define dp_copy_to_dev_log(...)
2763 #endif /* SK_LOG */
2764 
2765 static void
2766 fsw_pkt_copy_metadata(struct __kern_packet *spkt, struct __kern_packet *dpkt)
2767 {
2768 	ASSERT(!(spkt->pkt_pflags & PKT_F_MBUF_MASK));
2769 	ASSERT(!(spkt->pkt_pflags & PKT_F_PKT_MASK));
2770 
2771 	SK_PREFETCHW(dpkt->pkt_qum_buf.buf_addr, 0);
2772 	/* Copy packet metadata */
2773 	_QUM_COPY(&(spkt)->pkt_qum, &(dpkt)->pkt_qum);
2774 	_PKT_COPY(spkt, dpkt);
2775 	_PKT_COPY_TX_PORT_DATA(spkt, dpkt);
2776 	ASSERT((dpkt->pkt_qum.qum_qflags & QUM_F_KERNEL_ONLY) ||
2777 	    !PP_KERNEL_ONLY(dpkt->pkt_qum.qum_pp));
2778 	ASSERT(dpkt->pkt_mbuf == NULL);
2779 
2780 	/* Copy AQM metadata */
2781 	dpkt->pkt_flowsrc_type = spkt->pkt_flowsrc_type;
2782 	dpkt->pkt_flowsrc_fidx = spkt->pkt_flowsrc_fidx;
2783 	static_assert((offsetof(struct __flow, flow_src_id) % 8) == 0);
2784 	_UUID_COPY(dpkt->pkt_flowsrc_id, spkt->pkt_flowsrc_id);
2785 	_UUID_COPY(dpkt->pkt_policy_euuid, spkt->pkt_policy_euuid);
2786 	dpkt->pkt_policy_id = spkt->pkt_policy_id;
2787 	dpkt->pkt_skip_policy_id = spkt->pkt_skip_policy_id;
2788 }
2789 
2790 static int
2791 dp_copy_to_dev(struct nx_flowswitch *fsw, struct __kern_packet *spkt,
2792     struct __kern_packet *dpkt)
2793 {
2794 	const struct kern_pbufpool *pp = dpkt->pkt_qum.qum_pp;
2795 	struct ifnet *ifp = fsw->fsw_ifp;
2796 	uint32_t dev_pkt_len;
2797 	int err = 0;
2798 
2799 	fsw_pkt_copy_metadata(spkt, dpkt);
2800 	switch (fsw->fsw_classq_enq_ptype) {
2801 	case QP_MBUF:
2802 		err = dp_copy_to_dev_mbuf(fsw, spkt, dpkt);
2803 		break;
2804 
2805 	case QP_PACKET:
2806 		dev_pkt_len = fsw->fsw_frame_headroom + ifp->if_tx_headroom +
2807 		    spkt->pkt_length;
2808 		if (dev_pkt_len > pp->pp_max_frags * PP_BUF_SIZE_DEF(pp)) {
2809 			FSW_STATS_INC(FSW_STATS_TX_COPY_BAD_LEN);
2810 			err = ERANGE;
2811 			goto done;
2812 		}
2813 		err = dp_copy_to_dev_pkt(fsw, spkt, dpkt);
2814 		break;
2815 
2816 	default:
2817 		VERIFY(0);
2818 		__builtin_unreachable();
2819 	}
2820 done:
2821 	dp_copy_to_dev_log(fsw, pp, spkt, dpkt, err);
2822 	return err;
2823 }
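
/*
 * Illustrative note: the switch above picks the copy strategy from the
 * device's enqueue type: compat drivers (QP_MBUF) receive the payload in
 * an mbuf plus a truncated header copy in dpkt for classification, while
 * native drivers (QP_PACKET) take a packet-to-packet copy bounded by
 * pp_max_frags * PP_BUF_SIZE_DEF.
 */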
2824 
2825 static int
2826 dp_copy_headers_to_dev(struct nx_flowswitch *fsw, struct __kern_packet *spkt,
2827     struct __kern_packet *dpkt)
2828 {
2829 	uint8_t *sbaddr, *dbaddr;
2830 	uint16_t headroom = fsw->fsw_frame_headroom + fsw->fsw_ifp->if_tx_headroom;
2831 	uint16_t hdrs_len_estimate = (uint16_t)MIN(spkt->pkt_length, 128);
2832 
2833 	fsw_pkt_copy_metadata(spkt, dpkt);
2834 
2835 	MD_BUFLET_ADDR_ABS(spkt, sbaddr);
2836 	ASSERT(sbaddr != NULL);
2837 	sbaddr += spkt->pkt_headroom;
2838 
2839 	MD_BUFLET_ADDR_ABS(dpkt, dbaddr);
2840 	ASSERT(dbaddr != NULL);
2841 	dpkt->pkt_headroom = (uint8_t)headroom;
2842 	dbaddr += headroom;
2843 
2844 	pkt_copy(sbaddr, dbaddr, hdrs_len_estimate);
2845 	METADATA_SET_LEN(dpkt, hdrs_len_estimate, headroom);
2846 
2847 	/* packet length is set to the full length */
2848 	dpkt->pkt_length = spkt->pkt_length;
2849 	dpkt->pkt_pflags |= PKT_F_TRUNCATED;
2850 	return 0;
2851 }
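
/*
 * Illustrative note: only MIN(spkt->pkt_length, 128) header bytes are
 * copied above, enough for L2/L3/L4 classification; pkt_length still
 * advertises the full size, and PKT_F_TRUNCATED warns downstream
 * consumers that the payload bytes are absent.
 */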
2852 
2853 static struct mbuf *
2854 convert_pkt_to_mbuf(struct __kern_packet *pkt)
2855 {
2856 	ASSERT(pkt->pkt_pflags & PKT_F_MBUF_DATA);
2857 	ASSERT(pkt->pkt_mbuf != NULL);
2858 	struct mbuf *m = pkt->pkt_mbuf;
2859 
2860 	/* pass additional metadata generated from flow parse/lookup */
2861 	static_assert(sizeof(m->m_pkthdr.pkt_flowid) == sizeof(pkt->pkt_flow_token));
2862 	static_assert(sizeof(m->m_pkthdr.pkt_mpriv_srcid) == sizeof(pkt->pkt_flowsrc_token));
2863 	static_assert(sizeof(m->m_pkthdr.pkt_mpriv_fidx) == sizeof(pkt->pkt_flowsrc_fidx));
2864 	m->m_pkthdr.pkt_svc = pkt->pkt_svc_class;
2865 	m->m_pkthdr.pkt_proto = pkt->pkt_flow->flow_ip_proto;
2866 	m->m_pkthdr.pkt_flowid = pkt->pkt_flow_token;
2867 	m->m_pkthdr.comp_gencnt = pkt->pkt_comp_gencnt;
2868 	m->m_pkthdr.pkt_flowsrc = pkt->pkt_flowsrc_type;
2869 	m->m_pkthdr.pkt_mpriv_srcid = pkt->pkt_flowsrc_token;
2870 	m->m_pkthdr.pkt_mpriv_fidx = pkt->pkt_flowsrc_fidx;
2871 
2872 	if (pkt->pkt_transport_protocol == IPPROTO_QUIC) {
2873 		m->m_pkthdr.pkt_ext_flags |= PKTF_EXT_QUIC;
2874 	}
2875 
2876 	/* The packet should have a timestamp by the time we get here. */
2877 	m->m_pkthdr.pkt_timestamp = pkt->pkt_timestamp;
2878 	m->m_pkthdr.pkt_flags &= ~PKTF_TS_VALID;
2879 
2880 	m->m_pkthdr.pkt_flags &= ~PKT_F_COMMON_MASK;
2881 	m->m_pkthdr.pkt_flags |= (pkt->pkt_pflags & PKT_F_COMMON_MASK);
2882 	/* set pkt_hdr so that AQM can find IP header and mark ECN bits */
2883 	m->m_pkthdr.pkt_hdr = m_mtod_current(m) + pkt->pkt_l2_len;
2884 
2885 	if ((pkt->pkt_pflags & PKT_F_START_SEQ) != 0) {
2886 		m->m_pkthdr.tx_start_seq = ntohl(pkt->pkt_flow_tcp_seq);
2887 	}
2888 	KPKT_CLEAR_MBUF_DATA(pkt);
2889 
2890 	/* mbuf has been consumed, release packet as well */
2891 	ASSERT(pkt->pkt_qum.qum_ksd == NULL);
2892 	pp_free_packet_single(pkt);
2893 	return m;
2894 }
2895 
2896 static void
2897 convert_pkt_to_mbuf_list(struct __kern_packet *pkt_list,
2898     struct mbuf **head, struct mbuf **tail,
2899     uint32_t *cnt, uint32_t *bytes)
2900 {
2901 	struct __kern_packet *pkt = pkt_list, *next;
2902 	struct mbuf *__single m_head = NULL, **__single m_tailp = &m_head;
2903 	struct mbuf *__single m = NULL;
2904 	uint32_t c = 0, b = 0;
2905 
2906 	while (pkt != NULL) {
2907 		next = pkt->pkt_nextpkt;
2908 		pkt->pkt_nextpkt = NULL;
2909 		m = convert_pkt_to_mbuf(pkt);
2910 		ASSERT(m != NULL);
2911 
2912 		*m_tailp = m;
2913 		m_tailp = &m->m_nextpkt;
2914 		c++;
2915 		b += m_pktlen(m);
2916 		pkt = next;
2917 	}
2918 	if (head != NULL) {
2919 		*head = m_head;
2920 	}
2921 	if (tail != NULL) {
2922 		*tail = m;
2923 	}
2924 	if (cnt != NULL) {
2925 		*cnt = c;
2926 	}
2927 	if (bytes != NULL) {
2928 		*bytes = b;
2929 	}
2930 }
2931 
2932 SK_NO_INLINE_ATTRIBUTE
2933 static int
2934 classq_enqueue_flow_single(struct nx_flowswitch *fsw,
2935     struct __kern_packet *pkt)
2936 {
2937 	struct ifnet *ifp = fsw->fsw_ifp;
2938 	boolean_t pkt_drop = FALSE;
2939 	int err;
2940 
2941 	FSW_LOCK_ASSERT_HELD(fsw);
2942 	ASSERT(fsw->fsw_classq_enabled);
2943 	ASSERT(pkt->pkt_flow_token != 0);
2944 	fsw_ifp_inc_traffic_class_out_pkt(ifp, pkt->pkt_svc_class,
2945 	    1, pkt->pkt_length);
2946 
2947 	if (__improbable(pkt->pkt_trace_id != 0)) {
2948 		KDBG(SK_KTRACE_PKT_TX_FSW | DBG_FUNC_END, pkt->pkt_trace_id);
2949 		KDBG(SK_KTRACE_PKT_TX_AQM | DBG_FUNC_START, pkt->pkt_trace_id);
2950 	}
2951 
2952 	switch (fsw->fsw_classq_enq_ptype) {
2953 	case QP_MBUF: {                         /* compat interface */
2954 		struct mbuf *m;
2955 
2956 		m = convert_pkt_to_mbuf(pkt);
2957 		ASSERT(m != NULL);
2958 		pkt = NULL;
2959 
2960 		/* ifnet_enqueue consumes mbuf */
2961 		err = ifnet_enqueue_mbuf(ifp, m, false, &pkt_drop);
2962 		m = NULL;
2963 #if (DEVELOPMENT || DEBUG)
2964 		if (__improbable(!pkt_drop)) {
2965 			_FSW_INJECT_ERROR(14, pkt_drop, TRUE, null_func);
2966 		}
2967 #endif /* DEVELOPMENT || DEBUG */
2968 		if (pkt_drop) {
2969 			FSW_STATS_INC(FSW_STATS_DROP);
2970 			FSW_STATS_INC(FSW_STATS_TX_AQM_DROP);
2971 		}
2972 		break;
2973 	}
2974 	case QP_PACKET: {                       /* native interface */
2975 		/* ifnet_enqueue consumes packet */
2976 		err = ifnet_enqueue_pkt(ifp, ifp->if_snd, pkt, false, &pkt_drop);
2977 		pkt = NULL;
2978 #if (DEVELOPMENT || DEBUG)
2979 		if (__improbable(!pkt_drop)) {
2980 			_FSW_INJECT_ERROR(14, pkt_drop, TRUE, null_func);
2981 		}
2982 #endif /* DEVELOPMENT || DEBUG */
2983 		if (pkt_drop) {
2984 			FSW_STATS_INC(FSW_STATS_DROP);
2985 			FSW_STATS_INC(FSW_STATS_TX_AQM_DROP);
2986 		}
2987 		break;
2988 	}
2989 	default:
2990 		err = EINVAL;
2991 		VERIFY(0);
2992 		/* NOTREACHED */
2993 		__builtin_unreachable();
2994 	}
2995 
2996 	return err;
2997 }
2998 
2999 static int
3000 classq_enqueue_flow_chain(struct nx_flowswitch *fsw,
3001     struct __kern_packet *pkt_head, struct __kern_packet *pkt_tail,
3002     uint32_t cnt, uint32_t bytes)
3003 {
3004 	struct ifnet *ifp = fsw->fsw_ifp;
3005 	boolean_t pkt_drop = FALSE;
3006 	uint32_t svc;
3007 	int err;
3008 
3009 	FSW_LOCK_ASSERT_HELD(fsw);
3010 	ASSERT(fsw->fsw_classq_enabled);
3011 	ASSERT(pkt_head->pkt_flow_token != 0);
3012 
3013 	/*
3014 	 * All packets in the flow should have the same svc.
3015 	 */
3016 	svc = pkt_head->pkt_svc_class;
3017 	fsw_ifp_inc_traffic_class_out_pkt(ifp, svc, cnt, bytes);
3018 
3019 	switch (fsw->fsw_classq_enq_ptype) {
3020 	case QP_MBUF: {                         /* compat interface */
3021 		struct mbuf *__single m_head = NULL, *__single m_tail = NULL;
3022 		uint32_t c = 0, b = 0;
3023 
3024 		convert_pkt_to_mbuf_list(pkt_head, &m_head, &m_tail, &c, &b);
3025 		ASSERT(m_head != NULL && m_tail != NULL);
3026 		ASSERT(c == cnt);
3027 		ASSERT(b == bytes);
3028 		pkt_head = NULL;
3029 
3030 		/* ifnet_enqueue consumes mbuf */
3031 		err = ifnet_enqueue_mbuf_chain(ifp, m_head, m_tail, cnt,
3032 		    bytes, FALSE, &pkt_drop);
3033 		m_head = NULL;
3034 		m_tail = NULL;
3035 #if (DEVELOPMENT || DEBUG)
3036 		if (__improbable(!pkt_drop)) {
3037 			_FSW_INJECT_ERROR(14, pkt_drop, TRUE, null_func);
3038 		}
3039 #endif /* DEVELOPMENT || DEBUG */
3040 		if (pkt_drop) {
3041 			STATS_ADD(&fsw->fsw_stats, FSW_STATS_DROP, cnt);
3042 			STATS_ADD(&fsw->fsw_stats, FSW_STATS_TX_AQM_DROP,
3043 			    cnt);
3044 		}
3045 		break;
3046 	}
3047 	case QP_PACKET: {                       /* native interface */
3048 		/* ifnet_enqueue consumes packet */
3049 		err = ifnet_enqueue_pkt_chain(ifp, ifp->if_snd, pkt_head, pkt_tail, cnt,
3050 		    bytes, FALSE, &pkt_drop);
3051 		pkt_head = NULL;
3052 #if (DEVELOPMENT || DEBUG)
3053 		if (__improbable(!pkt_drop)) {
3054 			_FSW_INJECT_ERROR(14, pkt_drop, TRUE, null_func);
3055 		}
3056 #endif /* DEVELOPMENT || DEBUG */
3057 		if (pkt_drop) {
3058 			STATS_ADD(&fsw->fsw_stats, FSW_STATS_DROP, cnt);
3059 			STATS_ADD(&fsw->fsw_stats, FSW_STATS_TX_AQM_DROP,
3060 			    cnt);
3061 		}
3062 		break;
3063 	}
3064 	default:
3065 		err = EINVAL;
3066 		VERIFY(0);
3067 		/* NOTREACHED */
3068 		__builtin_unreachable();
3069 	}
3070 
3071 	return err;
3072 }
3073 
3074 /*
3075  * This code path needs to be kept for interfaces without logical link support.
3076  */
3077 static void
3078 classq_enqueue_flow(struct nx_flowswitch *fsw, struct flow_entry *fe,
3079     bool chain, uint32_t cnt, uint32_t bytes)
3080 {
3081 	struct __kern_packet *pkt, *tail, *tpkt;
3082 	flowadv_idx_t flow_adv_idx;
3083 	bool flowadv_cap;
3084 	flowadv_token_t flow_adv_token;
3085 	int err;
3086 
3087 	SK_DF(SK_VERB_FSW_DP | SK_VERB_AQM, "%s classq enqueued %d pkts",
3088 	    if_name(fsw->fsw_ifp), KPKTQ_LEN(&fe->fe_tx_pktq));
3089 
3090 	if (chain) {
3091 		pkt = KPKTQ_FIRST(&fe->fe_tx_pktq);
3092 		tail = KPKTQ_LAST(&fe->fe_tx_pktq);
3093 		KPKTQ_INIT(&fe->fe_tx_pktq);
3094 		if (pkt == NULL) {
3095 			return;
3096 		}
3097 		flow_adv_idx = pkt->pkt_flowsrc_fidx;
3098 		flowadv_cap = ((pkt->pkt_pflags & PKT_F_FLOW_ADV) != 0);
3099 		flow_adv_token = pkt->pkt_flow_token;
3100 
3101 		err = classq_enqueue_flow_chain(fsw, pkt, tail, cnt, bytes);
3102 		DTRACE_SKYWALK3(chain__enqueue, uint32_t, cnt, uint32_t, bytes, int, err);
3103 	} else {
3104 		uint32_t c = 0, b = 0;
3105 
3106 		KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) {
3107 			KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt);
3108 
3109 			flow_adv_idx = pkt->pkt_flowsrc_fidx;
3110 			flowadv_cap = ((pkt->pkt_pflags & PKT_F_FLOW_ADV) != 0);
3111 			flow_adv_token = pkt->pkt_flow_token;
3112 
3113 			c++;
3114 			b += pkt->pkt_length;
3115 			err = classq_enqueue_flow_single(fsw, pkt);
3116 		}
3117 		ASSERT(c == cnt);
3118 		ASSERT(b == bytes);
3119 		DTRACE_SKYWALK3(non__chain__enqueue, uint32_t, cnt, uint32_t, bytes,
3120 		    int, err);
3121 	}
3122 }
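
/*
 * Illustrative note: the chain path above hands the whole flow queue to
 * AQM in one ifnet_enqueue_*_chain() call, paying the enqueue cost once
 * for cnt packets, while the fallback walks the queue and pays it per
 * packet; both paths account the same cnt/bytes totals, as the ASSERTs
 * verify.
 */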
3123 
3124 /*
3125  * Logical link code path
3126  */
3127 static void
3128 classq_qset_enqueue_flow(struct nx_flowswitch *fsw, struct flow_entry *fe,
3129     bool chain, uint32_t cnt, uint32_t bytes)
3130 {
3131 	struct __kern_packet *pkt, *tail;
3132 	flowadv_idx_t flow_adv_idx;
3133 	bool flowadv_cap;
3134 	flowadv_token_t flow_adv_token;
3135 	uint32_t flowctl = 0, dropped = 0;
3136 	int err;
3137 
3138 	SK_DF(SK_VERB_FSW_DP | SK_VERB_AQM, "%s classq enqueued %d pkts",
3139 	    if_name(fsw->fsw_ifp), KPKTQ_LEN(&fe->fe_tx_pktq));
3140 
3141 	pkt = KPKTQ_FIRST(&fe->fe_tx_pktq);
3142 	tail = KPKTQ_LAST(&fe->fe_tx_pktq);
3143 	KPKTQ_INIT(&fe->fe_tx_pktq);
3144 	if (pkt == NULL) {
3145 		return;
3146 	}
3147 	flow_adv_idx = pkt->pkt_flowsrc_fidx;
3148 	flowadv_cap = ((pkt->pkt_pflags & PKT_F_FLOW_ADV) != 0);
3149 	flow_adv_token = pkt->pkt_flow_token;
3150 
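	/*
	 * Hand the whole chain to the logical link's queue set; it reports
	 * back how many packets were flow-controlled or dropped.
	 */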
3151 	err = netif_qset_enqueue(fe->fe_qset, chain, pkt, tail, cnt, bytes,
3152 	    &flowctl, &dropped);
3153 
3154 	if (__improbable(err != 0) && dropped > 0) {
3155 		STATS_ADD(&fsw->fsw_stats, FSW_STATS_DROP, dropped);
3156 		STATS_ADD(&fsw->fsw_stats, FSW_STATS_TX_AQM_DROP, dropped);
3157 	}
3158 }
3159 
3160 static void
3161 tx_finalize_packet(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
3162 {
3163 #pragma unused(fsw)
3164 	/* finalize here; no more changes to buflets after classq */
3165 	if (__probable(!(pkt->pkt_pflags & PKT_F_MBUF_DATA))) {
3166 		kern_packet_t ph = SK_PTR_ENCODE(pkt,
3167 		    METADATA_TYPE(pkt), METADATA_SUBTYPE(pkt));
3168 		int err = __packet_finalize(ph);
3169 		VERIFY(err == 0);
3170 	}
3171 }
3172 
3173 static bool
3174 dp_flow_tx_route_process(struct nx_flowswitch *fsw, struct flow_entry *fe)
3175 {
3176 	struct flow_route *fr = fe->fe_route;
3177 	int err;
3178 
3179 	ASSERT(fr != NULL);
3180 
3181 	if (__improbable(!dp_flow_route_process(fsw, fe))) {
3182 		return false;
3183 	}
3184 	if (fe->fe_qset_select == FE_QSET_SELECT_DYNAMIC) {
3185 		flow_qset_select_dynamic(fsw, fe, TRUE);
3186 	}
3187 
3188 	_FSW_INJECT_ERROR(35, fr->fr_flags, fr->fr_flags,
3189 	    _fsw_error35_handler, 1, fr, NULL, NULL);
3190 	_FSW_INJECT_ERROR(36, fr->fr_flags, fr->fr_flags,
3191 	    _fsw_error36_handler, 1, fr, NULL);
3192 
3193 	/*
3194 	 * See if we need to resolve the flow route; note the test against
3195 	 * fr_flags here is done without any lock for performance.  Thus
3196 	 * it's possible that we race against the thread performing route
3197 	 * event updates for a packet (which is OK).  In any case we should
3198 	 * not have any assertion on fr_flags value(s) due to the lack of
3199 	 * serialization.
3200 	 */
3201 	if (fr->fr_flags & FLOWRTF_RESOLVED) {
3202 		goto frame;
3203 	}
3204 
3205 	struct __kern_packet *pkt, *tpkt;
3206 	KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) {
3207 		err = fsw->fsw_resolve(fsw, fr, pkt);
3208 		_FSW_INJECT_ERROR_SET(35, _fsw_error35_handler, 2, fr, pkt, &err);
3209 		_FSW_INJECT_ERROR_SET(36, _fsw_error36_handler, 2, fr, &err);
3210 		/*
3211 		 * If resolver returns EJUSTRETURN then we drop the pkt as the
3212 		 * resolver should have converted the pkt into mbuf (or
3213 		 * detached the attached mbuf from pkt) and added it to the
3214 		 * llinfo queue. If we do have a cached llinfo, then proceed
3215 		 * to using it even though it may be stale (very unlikely)
3216 		 * while the resolution is in progress.
3217 		 * Otherwise, any other error results in dropping pkt.
3218 		 */
3219 		if (err == EJUSTRETURN) {
3220 			KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt);
3221 			pp_free_packet_single(pkt);
3222 			FSW_STATS_INC(FSW_STATS_TX_RESOLV_PENDING);
3223 			continue;
3224 		} else if (err != 0 && (fr->fr_flags & FLOWRTF_HAS_LLINFO)) {
3225 			/* use existing llinfo */
3226 			FSW_STATS_INC(FSW_STATS_TX_RESOLV_STALE);
3227 		} else if (err != 0) {
3228 			KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt);
3229 			dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_TX_RESOLV_FAILED,
3230 			    DROPTAP_FLAG_L2_MISSING);
3231 			FSW_STATS_INC(FSW_STATS_TX_RESOLV_FAIL);
3232 			continue;
3233 		}
3234 	}
3235 
3236 frame:
3237 	KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) {
3238 		if (fsw->fsw_frame != NULL) {
3239 			fsw->fsw_frame(fsw, fr, pkt);
3240 		}
3241 	}
3242 
3243 	return true;
3244 }
3245 
3246 static void
3247 dp_listener_flow_tx_process(struct nx_flowswitch *fsw, struct flow_entry *fe)
3248 {
3249 #pragma unused(fsw)
3250 	struct __kern_packet *pkt, *tpkt;
3251 	KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) {
3252 		KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt);
3253 		/* listener is only allowed TCP RST */
3254 		if (pkt->pkt_flow_ip_proto == IPPROTO_TCP &&
3255 		    (pkt->pkt_flow_tcp_flags & TH_RST) != 0) {
3256 			flow_track_abort_tcp(fe, NULL, pkt);
3257 		} else {
3258 			char *addr;
3259 
3260 			MD_BUFLET_ADDR_ABS(pkt, addr);
3261 			SK_ERR("listener flow sends non-RST packet %s",
3262 			    sk_dump(sk_proc_name(current_proc()),
3263 			    addr, __packet_get_real_data_length(pkt), 128));
3264 		}
3265 		pp_free_packet_single(pkt);
3266 	}
3267 }
3268 
3269 static void
3270 fsw_update_timestamps(struct __kern_packet *pkt, volatile uint64_t *fg_ts,
3271     volatile uint64_t *rt_ts, ifnet_t ifp, uint64_t now)
3272 {
3273 	if (!(pkt->pkt_pflags & PKT_F_TS_VALID) || pkt->pkt_timestamp == 0) {
3274 		pkt->pkt_timestamp = now;
3275 	}
3276 	pkt->pkt_pflags &= ~PKT_F_TS_VALID;
3277 
3278 	/*
3279 	 * If the packet service class is not background,
3280 	 * update the timestamps on the interface, as well as
3281 	 * the ones in nexus-wide advisory to indicate recent
3282 	 * activity on a foreground flow.
3283 	 */
3284 	if (!(pkt->pkt_pflags & PKT_F_BACKGROUND)) {
3285 		ifp->if_fg_sendts = (uint32_t)net_uptime();
3286 		if (fg_ts != NULL) {
3287 			*fg_ts = net_uptime();
3288 		}
3289 	}
3290 	if (pkt->pkt_pflags & PKT_F_REALTIME) {
3291 		ifp->if_rt_sendts = (uint32_t)net_uptime();
3292 		if (rt_ts != NULL) {
3293 			*rt_ts = net_uptime();
3294 		}
3295 	}
3296 }
3297 
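/*
 * Chain enqueue requires the fsw_chain_enqueue tunable to be set and an
 * interface that neither runs netem nor opts into per-packet enqueue via
 * IFEF_ENQUEUE_MULTI.
 */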
3298 static bool
3299 fsw_chain_enqueue_enabled(struct nx_flowswitch *fsw)
3300 {
3301 	return fsw_chain_enqueue != 0 &&
3302 	       fsw->fsw_ifp->if_output_netem == NULL &&
3303 	       (fsw->fsw_ifp->if_eflags & IFEF_ENQUEUE_MULTI) == 0;
3304 }
3305 
3306 void
3307 dp_flow_tx_process(struct nx_flowswitch *fsw, struct flow_entry *fe,
3308     uint32_t flags)
3309 {
3310 	struct pktq dropped_pkts;
3311 	bool chain, same_svc = true;
3312 	bool gso = ((flags & FLOW_PROC_FLAG_GSO) != 0);
3313 	uint32_t cnt = 0, bytes = 0;
3314 	volatile struct sk_nexusadv *nxadv = NULL;
3315 	volatile uint64_t *fg_ts = NULL;
3316 	volatile uint64_t *rt_ts = NULL;
3317 	uint8_t qset_idx = (fe->fe_qset != NULL) ? fe->fe_qset->nqs_idx : 0;
3318 	drop_reason_t reason = DROP_REASON_UNSPECIFIED;
3319 	uint16_t line = 0;
3320 	uint32_t svc = 0;
3321 	struct timespec now;
3322 	uint64_t now_nsec = 0;
3323 
3324 	KPKTQ_INIT(&dropped_pkts);
3325 	ASSERT(!KPKTQ_EMPTY(&fe->fe_tx_pktq));
3326 	if (__improbable(fe->fe_flags & FLOWENTF_LISTENER)) {
3327 		dp_listener_flow_tx_process(fsw, fe);
3328 		return;
3329 	}
3330 	if (__improbable(!dp_flow_tx_route_process(fsw, fe))) {
3331 		SK_RDERR(5, "Tx route bad");
3332 		FSW_STATS_ADD(FSW_STATS_TX_FLOW_NONVIABLE,
3333 		    KPKTQ_LEN(&fe->fe_tx_pktq));
3334 		KPKTQ_CONCAT(&dropped_pkts, &fe->fe_tx_pktq);
3335 		reason = DROP_REASON_FSW_FLOW_NONVIABLE;
3336 		line = __LINE__;
3337 		goto done;
3338 	}
3339 	chain = fsw_chain_enqueue_enabled(fsw) && KPKTQ_LEN(&fe->fe_tx_pktq) > 1;
3340 	if (chain) {
3341 		nanouptime(&now);
3342 		net_timernsec(&now, &now_nsec);
3343 		nxadv = fsw->fsw_nx->nx_adv.flowswitch_nxv_adv;
3344 		if (nxadv != NULL) {
3345 			fg_ts = &nxadv->nxadv_fg_sendts;
3346 			rt_ts = &nxadv->nxadv_rt_sendts;
3347 		}
3348 	}
3349 
3350 	struct __kern_packet *pkt, *tpkt;
3351 	KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) {
3352 		int err = 0;
3353 		if (svc == 0) {
3354 			svc = pkt->pkt_svc_class;
3355 		}
3356 
3357 		err = flow_pkt_track(fe, pkt, false);
3358 		if (__improbable(err != 0)) {
3359 			SK_RDERR(5, "flow_pkt_track failed (err %d)", err);
3360 			FSW_STATS_INC(FSW_STATS_TX_FLOW_TRACK_ERR);
3361 			KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt);
3362 			dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_FLOW_TRACK_ERR,
3363 			    DROPTAP_FLAG_L2_MISSING);
3364 			continue;
3365 		}
3366 		_UUID_COPY(pkt->pkt_policy_euuid, fe->fe_eproc_uuid);
3367 		pkt->pkt_transport_protocol = fe->fe_transport_protocol;
3368 
3369 		/* set AQM related values for outgoing packet */
3370 		if (fe->fe_adv_idx != FLOWADV_IDX_NONE) {
3371 			pkt->pkt_pflags |= PKT_F_FLOW_ADV;
3372 			pkt->pkt_flowsrc_type = FLOWSRC_CHANNEL;
3373 			pkt->pkt_flowsrc_fidx = fe->fe_adv_idx;
3374 		} else {
3375 			pkt->pkt_pflags &= ~PKT_F_FLOW_ADV;
3376 		}
3377 		_UUID_COPY(pkt->pkt_flow_id, fe->fe_uuid);
3378 		pkt->pkt_flow_token = fe->fe_flowid;
3379 		pkt->pkt_pflags |= PKT_F_FLOW_ID;
3380 		pkt->pkt_qset_idx = qset_idx;
3381 		pkt->pkt_policy_id = fe->fe_policy_id;
3382 		pkt->pkt_skip_policy_id = fe->fe_skip_policy_id;
3383 
3384 		/*
3385 		 * The same code is exercised per packet for the non-chain case
3386 		 * (see ifnet_enqueue_ifclassq()). It's replicated here to avoid
3387 		 * re-walking the chain later.
3388 		 */
3389 		if (chain && (gso || same_svc)) {
3390 			fsw_update_timestamps(pkt, fg_ts, rt_ts, fsw->fsw_ifp, now_nsec);
3391 		}
3392 		/* mark packet tos/svc_class */
3393 		fsw_qos_mark(fsw, fe, pkt);
3394 
3395 		tx_finalize_packet(fsw, pkt);
3396 		bytes += pkt->pkt_length;
3397 		cnt++;
3398 
3399 		same_svc = (same_svc && (svc == pkt->pkt_svc_class));
3400 		/*
3401 		 * we are using the first 4 bytes of flow_id as the AQM flow
3402 		 * identifier.
3403 		 */
3404 		ASSERT(!uuid_is_null(pkt->pkt_flow_id));
3405 
3406 		if (__improbable(pkt->pkt_trace_id != 0)) {
3407 			KDBG(SK_KTRACE_PKT_TX_FSW | DBG_FUNC_END, pkt->pkt_trace_id);
3408 			KDBG(SK_KTRACE_PKT_TX_AQM | DBG_FUNC_START, pkt->pkt_trace_id);
3409 		}
3410 	}
3411 
3412 	/* snoop after it's finalized */
3413 	if (__improbable(pktap_total_tap_count != 0)) {
3414 		fsw_snoop(fsw, fe, &fe->fe_tx_pktq, false);
3415 	}
3416 
3417 	chain = chain && (gso || same_svc);
3418 	if (fe->fe_qset != NULL) {
3419 		classq_qset_enqueue_flow(fsw, fe, chain, cnt, bytes);
3420 	} else {
3421 		classq_enqueue_flow(fsw, fe, chain, cnt, bytes);
3422 	}
3423 done:
3424 	dp_drop_pktq(fsw, &dropped_pkts, 1, reason, line, 0);
3425 }
3426 
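/*
 * A non-first IP fragment carries no transport header, so it can only be
 * matched against the flow of its first fragment: prev_fe must be expecting
 * continuations and its recorded fragment id must match.
 */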
3427 static struct flow_entry *
3428 tx_process_continuous_ip_frag(struct nx_flowswitch *fsw,
3429     struct flow_entry *prev_fe, struct __kern_packet *pkt)
3430 {
3431 	ASSERT(!pkt->pkt_flow_ip_is_first_frag);
3432 
3433 	if (__improbable(pkt->pkt_flow_ip_frag_id == 0)) {
3434 		FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_ID);
3435 		SK_PERR(current_proc(), "invalid zero fragment id");
3436 		return NULL;
3437 	}
3438 
3439 	SK_PDF(SK_VERB_FSW_DP | SK_VERB_TX, current_proc(),
3440 	    "continuation frag, id %u", pkt->pkt_flow_ip_frag_id);
3441 	if (__improbable(prev_fe == NULL ||
3442 	    !prev_fe->fe_tx_is_cont_frag)) {
3443 		SK_PERR(current_proc(), "unexpected continuation frag %u",
3444 		    pkt->pkt_flow_ip_frag_id);
3445 		FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_CONT);
3446 		return NULL;
3447 	}
3448 	if (__improbable(pkt->pkt_flow_ip_frag_id !=
3449 	    prev_fe->fe_tx_frag_id)) {
3450 		FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_CONT);
3451 		SK_PERR(current_proc(), "wrong continuation frag id %u expecting %u",
3452 		    pkt->pkt_flow_ip_frag_id, prev_fe->fe_tx_frag_id);
3453 		return NULL;
3454 	}
3455 
3456 	return prev_fe;
3457 }
3458 
3459 static struct flow_entry *
3460 tx_lookup_flow(struct nx_flowswitch *fsw, struct __kern_packet *pkt,
3461     struct flow_entry *prev_fe)
3462 {
3463 	SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]);
3464 	struct flow_entry *__single fe;
3465 
3466 	fe = lookup_flow_with_pkt(fsw, pkt, false, prev_fe);
3467 	if (__improbable(fe == NULL)) {
3468 		goto done;
3469 	}
3470 
3471 	if (__improbable(fe->fe_flags & FLOWENTF_TORN_DOWN)) {
3472 		SK_RDERR(5, "Tx flow torn down %s",
3473 		    fe2str(fe, dbgbuf, sizeof(dbgbuf)));
3474 		FSW_STATS_INC(FSW_STATS_TX_FLOW_TORNDOWN);
3475 		flow_entry_release(&fe);
3476 		goto done;
3477 	}
3478 
3479 	if (__improbable(fe->fe_flags & FLOWENTF_AOP_OFFLOAD)) {
3481 		SK_RDERR(5, "Tx not allowed for this flow %s",
3482 		    fe2str(fe, dbgbuf, sizeof(dbgbuf)));
3483 		FSW_STATS_INC(FSW_STATS_TX_DISABLED);
3484 		flow_entry_release(&fe);
3485 		goto done;
3486 	}
3487 
3488 	_FSW_INJECT_ERROR(34, pkt->pkt_flow_id[0], fe->fe_uuid[0] + 1,
3489 	    null_func);
3490 
3491 	if (__improbable(!_UUID_MATCH(pkt->pkt_flow_id, fe->fe_uuid))) {
3492 		uuid_string_t flow_id_str, pkt_id_str;
3493 		sk_uuid_unparse(fe->fe_uuid, flow_id_str);
3494 		sk_uuid_unparse(pkt->pkt_flow_id, pkt_id_str);
3495 		SK_ERR("pkt flow id %s != flow id %s, %s", pkt_id_str,
3496 		    flow_id_str, fe2str(fe, dbgbuf, sizeof(dbgbuf)));
3497 		flow_entry_release(&fe);
3498 		FSW_STATS_INC(FSW_STATS_TX_FLOW_BAD_ID);
3499 	}
3500 
3501 done:
3502 	return fe;
3503 }
3504 
3505 static inline void
3506 tx_flow_process(struct nx_flowswitch *fsw, struct flow_entry *fe,
3507     uint32_t flags)
3508 {
3509 	ASSERT(!KPKTQ_EMPTY(&fe->fe_tx_pktq));
3510 	ASSERT(KPKTQ_LEN(&fe->fe_tx_pktq) != 0);
3511 
3512 	SK_DF(SK_VERB_FSW_DP | SK_VERB_TX, "TX %d pkts from fe %p port %d",
3513 	    KPKTQ_LEN(&fe->fe_tx_pktq), fe, fe->fe_nx_port);
3514 
3515 	/* flow related processing (default, agg, etc.) */
3516 	fe->fe_tx_process(fsw, fe, flags);
3517 
3518 	KPKTQ_FINI(&fe->fe_tx_pktq);
3519 }
3520 
3521 #if SK_LOG
3522 static void
3523 dp_tx_log_pkt(uint64_t verb, char *desc, struct __kern_packet *pkt)
3524 {
3525 	char *pkt_buf;
3526 	uint32_t pkt_len;
3527 
3528 	MD_BUFLET_ADDR_ABS(pkt, pkt_buf);
3529 	pkt_len = __packet_get_real_data_length(pkt);
3530 	SK_DF(verb, "%s(%d) %s %s", sk_proc_name(current_proc()),
3531 	    sk_proc_pid(current_proc()), desc, sk_dump("buf", pkt_buf, pkt_len,
3532 	    128));
3533 }
3534 #else /* !SK_LOG */
3535 #define dp_tx_log_pkt(...)
3536 #endif /* !SK_LOG */
3537 
3538 static inline struct ifnet *
3539 fsw_datamov_begin(struct nx_flowswitch *fsw)
3540 {
3541 	struct ifnet *ifp;
3542 
3543 	ifp = fsw->fsw_ifp;
3544 	if (!ifnet_datamov_begin(ifp)) {
3545 		DTRACE_SKYWALK1(ifnet__detached, struct ifnet *, ifp);
3546 		return NULL;
3547 	}
3548 	return ifp;
3549 }
3550 
3551 static inline void
3552 fsw_datamov_end(struct nx_flowswitch *fsw)
3553 {
3554 	ifnet_datamov_end(fsw->fsw_ifp);
3555 }
3556 
3557 static void
3558 dp_tx_pktq(struct nx_flowswitch *fsw, struct pktq *spktq)
3559 {
3560 	struct __kern_packet *spkt, *pkt;
3561 	struct flow_entry_list fes = TAILQ_HEAD_INITIALIZER(fes);
3562 	struct flow_entry *__single fe, *__single prev_fe;
3563 	struct pktq dropped_pkts, dpktq;
3564 	struct nexus_adapter *dev_na;
3565 	struct kern_pbufpool *dev_pp;
3566 	struct ifnet *ifp = NULL;
3567 	sa_family_t af;
3568 	uint32_t n_pkts, n_flows = 0;
3569 	boolean_t do_pacing = FALSE;
3570 	drop_reason_t reason = DROP_REASON_UNSPECIFIED;
3571 	uint16_t line = 0;
3572 
3573 	int err;
3574 	KPKTQ_INIT(&dpktq);
3575 	KPKTQ_INIT(&dropped_pkts);
3576 	n_pkts = KPKTQ_LEN(spktq);
3577 
3578 	FSW_RLOCK(fsw);
3579 	if (__improbable(FSW_QUIESCED(fsw))) {
3580 		DTRACE_SKYWALK1(tx__quiesced, struct nx_flowswitch *, fsw);
3581 		SK_ERR("flowswitch detached, dropping %d pkts", n_pkts);
3582 		KPKTQ_CONCAT(&dropped_pkts, spktq);
3583 		reason = DROP_REASON_FSW_QUIESCED;
3584 		line = __LINE__;
3585 		goto done;
3586 	}
3587 	dev_na = fsw->fsw_dev_ch->ch_na;
3588 	if (__improbable(dev_na == NULL)) {
3589 		SK_ERR("dev port not attached, dropping %d pkts", n_pkts);
3590 		FSW_STATS_ADD(FSW_STATS_DST_NXPORT_INACTIVE, n_pkts);
3591 		KPKTQ_CONCAT(&dropped_pkts, spktq);
3592 		reason = DROP_REASON_FSW_TX_DEVPORT_NOT_ATTACHED;
3593 		line = __LINE__;
3594 		goto done;
3595 	}
3596 	ifp = fsw_datamov_begin(fsw);
3597 	if (ifp == NULL) {
3598 		SK_ERR("ifnet not attached, dropping %d pkts", n_pkts);
3599 		KPKTQ_CONCAT(&dropped_pkts, spktq);
3600 		reason = DROP_REASON_FSW_IFNET_NOT_ATTACHED;
3601 		line = __LINE__;
3602 		goto done;
3603 	}
3604 
3605 	/* batch allocate enough packets */
3606 	dev_pp = na_kr_get_pp(dev_na, NR_TX);
3607 
3608 	err = pp_alloc_pktq(dev_pp, dev_pp->pp_max_frags, &dpktq, n_pkts, NULL,
3609 	    NULL, SKMEM_NOSLEEP);
3610 #if DEVELOPMENT || DEBUG
3611 	if (__probable(err != ENOMEM)) {
3612 		_FSW_INJECT_ERROR(12, err, ENOMEM, pp_free_pktq, &dpktq);
3613 	}
3614 #endif /* DEVELOPMENT || DEBUG */
3615 	if (__improbable(err == ENOMEM)) {
3616 		ASSERT(KPKTQ_EMPTY(&dpktq));
3617 		KPKTQ_CONCAT(&dropped_pkts, spktq);
3618 		FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT, n_pkts);
3619 		SK_ERR("failed to alloc %u pkts from device pool", n_pkts);
3620 		reason = DROP_REASON_FSW_PP_ALLOC_FAILED;
3621 		line = __LINE__;
3622 		goto done;
3623 	} else if (__improbable(err == EAGAIN)) {
3624 		FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT,
3625 		    (n_pkts - KPKTQ_LEN(&dpktq)));
3626 		FSW_STATS_ADD(FSW_STATS_DROP,
3627 		    (n_pkts - KPKTQ_LEN(&dpktq)));
3628 	}
3629 
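	/*
	 * Process at most as many source packets as dev packets were
	 * allocated; on a partial (EAGAIN) allocation the shortfall has
	 * already been counted as dropped above.
	 */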
3630 	n_pkts = KPKTQ_LEN(&dpktq);
3631 	prev_fe = NULL;
3632 	KPKTQ_FOREACH(spkt, spktq) {
3633 		if (n_pkts == 0) {
3634 			break;
3635 		}
3636 		--n_pkts;
3637 
3638 		KPKTQ_DEQUEUE(&dpktq, pkt);
3639 		ASSERT(pkt != NULL);
3640 		err = dp_copy_to_dev(fsw, spkt, pkt);
3641 		if (__improbable(err != 0)) {
3642 			/*
3643 			 * Copy to dev pool failed, so droptap should capture
3644 			 * the source pkt, because the dev pkt might not have its
3645 			 * metadata or buffer filled out yet. The source pkt is
3646 			 * freed by fsw_user_ring_flush, so defer the free to it.
3647 			 */
3648 			dp_drop_pkt_single_nofree(fsw, spkt, 1,
3649 			    DROP_REASON_FSW_PKT_COPY_FAILED, DROPTAP_FLAG_L2_MISSING);
3650 			/* Free the dev pool packet */
3651 			pp_free_packet_single(pkt);
3652 			continue;
3653 		}
3654 
3655 		do_pacing |= __packet_get_tx_timestamp(SK_PKT2PH(pkt)) != 0;
3656 		af = fsw_ip_demux(fsw, pkt);
3657 		if (__improbable(af == AF_UNSPEC)) {
3658 			dp_tx_log_pkt(SK_VERB_ERROR, "demux err", pkt);
3659 			FSW_STATS_INC(FSW_STATS_TX_DEMUX_ERR);
3660 			dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_DEMUX_FAILED,
3661 			    DROPTAP_FLAG_L2_MISSING);
3662 			continue;
3663 		}
3664 
3665 		err = flow_pkt_classify(pkt, ifp, af, false);
3666 		if (__improbable(err != 0)) {
3667 			dp_tx_log_pkt(SK_VERB_ERROR, "flow extract err", pkt);
3668 			FSW_STATS_INC(FSW_STATS_TX_FLOW_EXTRACT_ERR);
3669 			dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_TX_FLOW_EXTRACT_FAILED,
3670 			    DROPTAP_FLAG_L2_MISSING);
3671 			continue;
3672 		}
3673 
3674 		if (__improbable(pkt->pkt_flow_ip_is_frag &&
3675 		    !pkt->pkt_flow_ip_is_first_frag)) {
3676 			fe = tx_process_continuous_ip_frag(fsw, prev_fe, pkt);
3677 			if (__probable(fe != NULL)) {
3678 				flow_entry_retain(fe);
3679 				goto flow_batch;
3680 			} else {
3681 				FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_CONT);
3682 				dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_TX_FRAG_BAD_CONT,
3683 				    DROPTAP_FLAG_L2_MISSING);
3684 				continue;
3685 			}
3686 		}
3687 
3688 		fe = tx_lookup_flow(fsw, pkt, prev_fe);
3689 		if (__improbable(fe == NULL)) {
3690 			FSW_STATS_INC(FSW_STATS_TX_FLOW_NOT_FOUND);
3691 			dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_TX_FLOW_NOT_FOUND,
3692 			    DROPTAP_FLAG_L2_MISSING);
3693 			prev_fe = NULL;
3694 			continue;
3695 		}
3696 flow_batch:
3697 		tx_flow_batch_packet(&fes, fe, pkt);
3698 		prev_fe = fe;
3699 	}
3700 
3701 	struct flow_entry *tfe = NULL;
3702 	TAILQ_FOREACH_SAFE(fe, &fes, fe_tx_link, tfe) {
3703 		tx_flow_process(fsw, fe, 0);
3704 		TAILQ_REMOVE(&fes, fe, fe_tx_link);
3705 		fe->fe_tx_is_cont_frag = false;
3706 		fe->fe_tx_frag_id = 0;
3707 		flow_entry_release(&fe);
3708 		n_flows++;
3709 	}
3710 
3711 done:
3712 	FSW_RUNLOCK(fsw);
3713 	if (n_flows > 0) {
3714 		netif_transmit(ifp, NETIF_XMIT_FLAG_CHANNEL | (do_pacing ? NETIF_XMIT_FLAG_PACING : 0));
3715 	}
3716 	if (ifp != NULL) {
3717 		fsw_datamov_end(fsw);
3718 	}
3719 	dp_drop_pktq(fsw, &dropped_pkts, 1, reason, line, DROPTAP_FLAG_L2_MISSING);
3720 	KPKTQ_FINI(&dropped_pkts);
3721 	KPKTQ_FINI(&dpktq);
3722 }
3723 
3724 static sa_family_t
3725 get_tso_af(struct __kern_packet *pkt)
3726 {
3727 	packet_tso_flags_t tso_flags;
3728 
3729 	tso_flags = pkt->pkt_csum_flags & PACKET_CSUM_TSO_FLAGS;
3730 	if (tso_flags == PACKET_TSO_IPV4) {
3731 		return AF_INET;
3732 	} else if (tso_flags == PACKET_TSO_IPV6) {
3733 		return AF_INET6;
3734 	} else {
3735 		panic("invalid tso flags: 0x%x\n", tso_flags);
3736 		/* NOTREACHED */
3737 		__builtin_unreachable();
3738 	}
3739 }
3740 
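/*
 * Point a GSO segment's cached flow metadata at its freshly copied headers,
 * so later stages see the per-segment TCP sequence, flags and payload length.
 */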
3741 static inline void
3742 update_flow_info(struct __kern_packet *pkt, void *iphdr, void *tcphdr, uint16_t payload_sz)
3743 {
3744 	struct tcphdr *__single tcp = tcphdr;
3745 
3746 	DTRACE_SKYWALK4(update__flow__info, struct __kern_packet *, pkt,
3747 	    void *, iphdr, void *, tcphdr, uint16_t, payload_sz);
3748 	pkt->pkt_flow_ip_hdr = (mach_vm_address_t)iphdr;
3749 	pkt->pkt_flow_tcp_hdr = (mach_vm_address_t)tcphdr;
3750 	pkt->pkt_flow_tcp_flags = tcp->th_flags;
3751 	pkt->pkt_flow_tcp_seq = tcp->th_seq;
3752 	pkt->pkt_flow_ulen = payload_sz;
3753 }
3754 
3755 static int
3756 do_gso(struct nx_flowswitch *fsw, int af, struct __kern_packet *orig_pkt,
3757     struct __kern_packet *first_pkt, struct pktq *dev_pktq,
3758     struct pktq *gso_pktq)
3759 {
3760 	ifnet_t ifp = fsw->fsw_ifp;
3761 	struct __kern_packet *pkt = first_pkt;
3762 	uint8_t proto = pkt->pkt_flow_ip_proto;
3763 	uint16_t ip_hlen = pkt->pkt_flow_ip_hlen;
3764 	uint16_t tcp_hlen = pkt->pkt_flow_tcp_hlen;
3765 	uint16_t total_hlen = ip_hlen + tcp_hlen;
3766 	uint16_t mtu = (uint16_t)ifp->if_mtu;
3767 	uint16_t mss = pkt->pkt_proto_seg_sz, payload_sz;
3768 	uint32_t n, n_pkts, off = 0, total_len = orig_pkt->pkt_length;
3769 	uint16_t headroom = fsw->fsw_frame_headroom + ifp->if_tx_headroom;
3770 	kern_packet_t orig_ph = SK_PKT2PH(orig_pkt);
3771 	uint8_t *orig_pkt_baddr;
3772 	struct tcphdr *tcp;
3773 	struct ip *ip;
3774 	struct ip6_hdr *ip6;
3775 	uint32_t tcp_seq;
3776 	uint16_t ipid;
3777 	uint32_t pseudo_hdr_csum, bufsz;
3778 	uint64_t pkt_tx_timestamp = 0;
3779 
3780 	ASSERT(headroom <= UINT8_MAX);
3781 	if (proto != IPPROTO_TCP) {
3782 		SK_ERR("invalid proto: %d", proto);
3783 		DTRACE_SKYWALK3(invalid__proto, struct nx_flowswitch *,
3784 		    fsw, ifnet_t, ifp, uint8_t, proto);
3785 		return EINVAL;
3786 	}
3787 	if (mss == 0 || mss > (mtu - total_hlen)) {
3788 		SK_ERR("invalid args: mss %d, mtu %d, total_hlen %d",
3789 		    mss, mtu, total_hlen);
3790 		DTRACE_SKYWALK5(invalid__args1, struct nx_flowswitch *,
3791 		    fsw, ifnet_t, ifp, uint16_t, mss, uint16_t, mtu,
3792 		    uint32_t, total_hlen);
3793 		return EINVAL;
3794 	}
3795 	bufsz = PP_BUF_SIZE_DEF(pkt->pkt_qum.qum_pp);
3796 	if ((headroom + total_hlen + mss) > bufsz) {
3797 		SK_ERR("invalid args: headroom %d, total_hlen %d, "
3798 		    "mss %d, bufsz %d", headroom, total_hlen, mss, bufsz);
3799 		DTRACE_SKYWALK6(invalid__args2, struct nx_flowswitch *,
3800 		    fsw, ifnet_t, ifp, uint16_t, headroom, uint16_t,
3801 		    total_hlen, uint16_t, mss, uint32_t, bufsz);
3802 		return EINVAL;
3803 	}
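	/*
	 * Segment count is the payload length divided by mss, rounded up;
	 * e.g. a 4000-byte payload at mss 1448 yields 3 segments.
	 */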
3804 	n_pkts = (uint32_t)(SK_ROUNDUP((total_len - total_hlen), mss) / mss);
3805 
3806 	ASSERT(pkt->pkt_headroom == headroom);
3807 	ASSERT(pkt->pkt_length == total_len);
3808 	ASSERT(pkt->pkt_l2_len == 0);
3809 	ASSERT((pkt->pkt_qum.qum_qflags & QUM_F_FINALIZED) == 0);
3810 	ASSERT((pkt->pkt_pflags & PKT_F_TRUNCATED) != 0);
3811 	pkt->pkt_pflags &= ~PKT_F_TRUNCATED;
3812 	pkt->pkt_proto_seg_sz = 0;
3813 	pkt->pkt_csum_flags = 0;
3814 	MD_BUFLET_ADDR_ABS(orig_pkt, orig_pkt_baddr);
3815 	orig_pkt_baddr += orig_pkt->pkt_headroom;
3816 
3817 	if (af == AF_INET) {
3818 		/*
3819 		 * XXX -fbounds-safety: can't avoid using forge unless we change
3820 		 * the flow metadata definition.
3821 		 */
3822 		ip = __unsafe_forge_bidi_indexable(struct ip *,
3823 		    pkt->pkt_flow_ip_hdr, pkt->pkt_length);
3824 		tcp = __unsafe_forge_bidi_indexable(struct tcphdr *,
3825 		    pkt->pkt_flow_tcp_hdr, pkt->pkt_length - ip_hlen);
3826 		ipid = ip->ip_id;
3827 		pseudo_hdr_csum = in_pseudo(pkt->pkt_flow_ipv4_src.s_addr,
3828 		    pkt->pkt_flow_ipv4_dst.s_addr, 0);
3829 	} else {
3830 		ASSERT(af == AF_INET6);
3831 		tcp = __unsafe_forge_bidi_indexable(struct tcphdr *,
3832 		    pkt->pkt_flow_tcp_hdr, pkt->pkt_length - ip_hlen);
3833 		pseudo_hdr_csum = in6_pseudo(&pkt->pkt_flow_ipv6_src,
3834 		    &pkt->pkt_flow_ipv6_dst, 0);
3835 	}
3836 	tcp_seq = ntohl(tcp->th_seq);
3837 
3838 	pkt_tx_timestamp = __packet_get_tx_timestamp(orig_ph);
3839 
3840 	for (n = 1, payload_sz = mss, off = total_hlen; off < total_len;
3841 	    off += payload_sz) {
3842 		uint8_t *baddr, *baddr0;
3843 		uint32_t partial;
3844 
3845 		if (pkt == NULL) {
3846 			n++;
3847 			KPKTQ_DEQUEUE(dev_pktq, pkt);
3848 			ASSERT(pkt != NULL);
3849 		}
3850 		MD_BUFLET_ADDR_ABS(pkt, baddr0);
3851 		baddr = baddr0;
3852 		baddr += headroom;
3853 
3854 		/* Copy headers from the original packet */
3855 		if (n != 1) {
3856 			ASSERT(pkt != first_pkt);
3857 			pkt_copy(orig_pkt_baddr, baddr, total_hlen);
3858 			fsw_pkt_copy_metadata(first_pkt, pkt);
3859 
3860 			ASSERT((pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED) != 0);
3861 			/* flow info still needs to be updated below */
3862 			bcopy(first_pkt->pkt_flow, pkt->pkt_flow,
3863 			    sizeof(*pkt->pkt_flow));
3864 			pkt->pkt_trace_id = 0;
3865 			ASSERT(pkt->pkt_headroom == headroom);
3866 		} else {
3867 			METADATA_SET_LEN(pkt, 0, 0);
3868 		}
3869 		baddr += total_hlen;
3870 
3871 		/* copy tx timestamp from the original packet */
3872 		__packet_set_tx_timestamp(SK_PKT2PH(pkt), pkt_tx_timestamp);
3873 
3874 		/* Copy/checksum the payload from the original packet */
3875 		if (off + payload_sz > total_len) {
3876 			payload_sz = (uint16_t)(total_len - off);
3877 		}
3878 		pkt_copypkt_sum(orig_ph,
3879 		    (uint16_t)(orig_pkt->pkt_headroom + off),
3880 		    SK_PKT2PH(pkt), headroom + total_hlen, payload_sz,
3881 		    &partial, TRUE);
3882 
3883 		DTRACE_SKYWALK6(copy__csum, struct nx_flowswitch *, fsw,
3884 		    ifnet_t, ifp, uint8_t *, baddr, uint16_t, payload_sz,
3885 		    uint16_t, mss, uint32_t, partial);
3886 		FSW_STATS_INC(FSW_STATS_TX_COPY_PKT2PKT);
3887 
3888 		/*
3889 		 * Adjust header information and fill in the missing fields.
3890 		 */
3891 		if (af == AF_INET) {
3892 			ip = (struct ip *)(void *)(baddr0 + pkt->pkt_headroom);
3893 			tcp = (struct tcphdr *)(void *)((caddr_t)ip + ip_hlen);
3894 
3895 			if (n != n_pkts) {
3896 				tcp->th_flags &= ~(TH_FIN | TH_PUSH);
3897 			}
3898 			if (n != 1) {
3899 				tcp->th_flags &= ~TH_CWR;
3900 				tcp->th_seq = htonl(tcp_seq);
3901 			}
3902 			update_flow_info(pkt, ip, tcp, payload_sz);
3903 
3904 			ip->ip_id = htons((ipid)++);
3905 			ip->ip_len = htons(ip_hlen + tcp_hlen + payload_sz);
3906 			ip->ip_sum = 0;
3907 			ip->ip_sum = inet_cksum_buffer(ip, 0, 0, ip_hlen);
3908 			tcp->th_sum = 0;
3909 
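			/*
			 * Fold the TCP header into the payload sum from
			 * pkt_copypkt_sum(), add the pseudo-header length,
			 * protocol and address sum (in_pseudo() above), then
			 * carry-fold and complement for the final checksum.
			 */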
3910 			partial = __packet_cksum(tcp, tcp_hlen, partial);
3911 			partial += htons(tcp_hlen + IPPROTO_TCP + payload_sz);
3912 			partial += pseudo_hdr_csum;
3913 			ADDCARRY(partial);
3914 			tcp->th_sum = ~(uint16_t)partial;
3915 		} else {
3916 			ASSERT(af == AF_INET6);
3917 			ip6 = (struct ip6_hdr *)(void *)(baddr0 + pkt->pkt_headroom);
3918 			tcp = (struct tcphdr *)(void *)((caddr_t)ip6 + ip_hlen);
3919 
3920 			if (n != n_pkts) {
3921 				tcp->th_flags &= ~(TH_FIN | TH_PUSH);
3922 			}
3923 			if (n != 1) {
3924 				tcp->th_flags &= ~TH_CWR;
3925 				tcp->th_seq = htonl(tcp_seq);
3926 			}
3927 			update_flow_info(pkt, ip6, tcp, payload_sz);
3928 
3929 			ip6->ip6_plen = htons(tcp_hlen + payload_sz);
3930 			tcp->th_sum = 0;
3931 			partial = __packet_cksum(tcp, tcp_hlen, partial);
3932 			partial += htonl(tcp_hlen + IPPROTO_TCP + payload_sz);
3933 			partial += pseudo_hdr_csum;
3934 			ADDCARRY(partial);
3935 			tcp->th_sum = ~(uint16_t)partial;
3936 		}
3937 		tcp_seq += payload_sz;
3938 		METADATA_ADJUST_LEN(pkt, total_hlen, headroom);
3939 #if (DEVELOPMENT || DEBUG)
3940 		struct __kern_buflet *bft;
3941 		uint32_t blen;
3942 		PKT_GET_FIRST_BUFLET(pkt, 1, bft);
3943 		blen = __buflet_get_data_length(bft);
3944 		if (blen != total_hlen + payload_sz) {
3945 			panic("blen (%d) != total_len + payload_sz (%d)\n",
3946 			    blen, total_hlen + payload_sz);
3947 		}
3948 #endif /* DEVELOPMENT || DEBUG */
3949 
3950 		pkt->pkt_length = total_hlen + payload_sz;
3951 		KPKTQ_ENQUEUE(gso_pktq, pkt);
3952 		pkt = NULL;
3953 
3954 		/*
3955 		 * Note that at this point the packet is not yet finalized.
3956 		 * The finalization happens in dp_flow_tx_process() after
3957 		 * the framing is done.
3958 		 */
3959 	}
3960 	ASSERT(n == n_pkts);
3961 	ASSERT(off == total_len);
3962 	DTRACE_SKYWALK7(gso__done, struct nx_flowswitch *, fsw, ifnet_t, ifp,
3963 	    uint32_t, n_pkts, uint32_t, total_len, uint16_t, ip_hlen,
3964 	    uint16_t, tcp_hlen, uint8_t *, orig_pkt_baddr);
3965 	return 0;
3966 }
3967 
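/*
 * Append a GSO segment chain to the flow's Tx queue.  The first chain for a
 * flow also links the entry onto the pending list and keeps the lookup
 * reference; later chains drop their extra reference from tx_lookup_flow().
 */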
3968 static void
3969 tx_flow_enqueue_gso_pktq(struct flow_entry_list *fes, struct flow_entry *fe,
3970     struct pktq *gso_pktq)
3971 {
3972 	if (KPKTQ_EMPTY(&fe->fe_tx_pktq)) {
3973 		ASSERT(KPKTQ_LEN(&fe->fe_tx_pktq) == 0);
3974 		TAILQ_INSERT_TAIL(fes, fe, fe_tx_link);
3975 		KPKTQ_ENQUEUE_MULTI(&fe->fe_tx_pktq, KPKTQ_FIRST(gso_pktq),
3976 		    KPKTQ_LAST(gso_pktq), KPKTQ_LEN(gso_pktq));
3977 		KPKTQ_INIT(gso_pktq);
3978 	} else {
3979 		ASSERT(!TAILQ_EMPTY(fes));
3980 		KPKTQ_ENQUEUE_MULTI(&fe->fe_tx_pktq, KPKTQ_FIRST(gso_pktq),
3981 		    KPKTQ_LAST(gso_pktq), KPKTQ_LEN(gso_pktq));
3982 		KPKTQ_INIT(gso_pktq);
3983 		flow_entry_release(&fe);
3984 	}
3985 }
3986 
3987 static void
3988 dp_gso_pktq(struct nx_flowswitch *fsw, struct pktq *spktq,
3989     uint32_t gso_pkts_estimate)
3990 {
3991 	struct __kern_packet *spkt, *pkt;
3992 	struct flow_entry_list fes = TAILQ_HEAD_INITIALIZER(fes);
3993 	struct flow_entry *__single fe, *__single prev_fe;
3994 	struct pktq dpktq;
3995 	struct nexus_adapter *dev_na;
3996 	struct kern_pbufpool *dev_pp;
3997 	struct ifnet *ifp = NULL;
3998 	sa_family_t af;
3999 	uint32_t n_pkts, n_flows = 0;
4000 	int err;
4001 
4002 	KPKTQ_INIT(&dpktq);
4003 	n_pkts = KPKTQ_LEN(spktq);
4004 
4005 	FSW_RLOCK(fsw);
4006 	if (__improbable(FSW_QUIESCED(fsw))) {
4007 		DTRACE_SKYWALK1(tx__quiesced, struct nx_flowswitch *, fsw);
4008 		SK_ERR("flowswitch detached, dropping %d pkts", n_pkts);
4009 		dp_drop_pktq(fsw, spktq, 1, DROP_REASON_FSW_QUIESCED, __LINE__,
4010 		    DROPTAP_FLAG_L2_MISSING);
4011 		goto done;
4012 	}
4013 	dev_na = fsw->fsw_dev_ch->ch_na;
4014 	if (__improbable(dev_na == NULL)) {
4015 		SK_ERR("dev port not attached, dropping %d pkts", n_pkts);
4016 		FSW_STATS_ADD(FSW_STATS_DST_NXPORT_INACTIVE, n_pkts);
4017 		dp_drop_pktq(fsw, spktq, 1, DROP_REASON_FSW_TX_DEVPORT_NOT_ATTACHED,
4018 		    __LINE__, DROPTAP_FLAG_L2_MISSING);
4019 		goto done;
4020 	}
4021 	ifp = fsw_datamov_begin(fsw);
4022 	if (ifp == NULL) {
4023 		SK_ERR("ifnet not attached, dropping %d pkts", n_pkts);
4024 		dp_drop_pktq(fsw, spktq, 1, DROP_REASON_FSW_IFNET_NOT_ATTACHED,
4025 		    __LINE__, DROPTAP_FLAG_L2_MISSING);
4026 		goto done;
4027 	}
4028 
4029 	dev_pp = na_kr_get_pp(dev_na, NR_TX);
4030 
4031 	/*
4032 	 * Batch allocate enough packets to perform GSO on all
4033 	 * packets in spktq.
4034 	 */
4035 	err = pp_alloc_pktq(dev_pp, dev_pp->pp_max_frags, &dpktq,
4036 	    gso_pkts_estimate, NULL, NULL, SKMEM_NOSLEEP);
4037 #if DEVELOPMENT || DEBUG
4038 	if (__probable(err != ENOMEM)) {
4039 		_FSW_INJECT_ERROR(12, err, ENOMEM, pp_free_pktq, &dpktq);
4040 	}
4041 #endif /* DEVELOPMENT || DEBUG */
4042 	/*
4043 	 * We either get all packets or none. No partial allocations.
4044 	 */
4045 	if (__improbable(err != 0)) {
4046 		if (err == ENOMEM) {
4047 			ASSERT(KPKTQ_EMPTY(&dpktq));
4048 		} else {
4049 			dp_free_pktq(fsw, &dpktq);
4050 		}
4051 		DTRACE_SKYWALK1(gso__no__mem, int, err);
4052 		dp_drop_pktq(fsw, spktq, 1, DROP_REASON_FSW_PP_ALLOC_FAILED,
4053 		    __LINE__, DROPTAP_FLAG_L2_MISSING);
4054 		FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT, n_pkts);
4055 		SK_ERR("failed to alloc %u pkts from device pool",
4056 		    gso_pkts_estimate);
4057 		goto done;
4058 	}
4059 	prev_fe = NULL;
4060 	KPKTQ_FOREACH(spkt, spktq) {
4061 		KPKTQ_DEQUEUE(&dpktq, pkt);
4062 		ASSERT(pkt != NULL);
4063 		/*
4064 		 * Copy only headers to the first packet of the GSO chain.
4065 		 * The headers will be used for classification below.
4066 		 */
4067 		err = dp_copy_headers_to_dev(fsw, spkt, pkt);
4068 		if (__improbable(err != 0)) {
4069 			dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_PKT_COPY_FAILED,
4070 			    DROPTAP_FLAG_L2_MISSING);
4071 			DTRACE_SKYWALK2(copy__headers__failed,
4072 			    struct nx_flowswitch *, fsw,
4073 			    struct __kern_packet *, spkt);
4074 			continue;
4075 		}
4076 		af = get_tso_af(pkt);
4077 		ASSERT(af == AF_INET || af == AF_INET6);
4078 
4079 		err = flow_pkt_classify(pkt, ifp, af, false);
4080 		if (__improbable(err != 0)) {
4081 			dp_tx_log_pkt(SK_VERB_ERROR, "flow extract err", pkt);
4082 			FSW_STATS_INC(FSW_STATS_TX_FLOW_EXTRACT_ERR);
4083 			dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_TX_FLOW_EXTRACT_FAILED,
4084 			    DROPTAP_FLAG_L2_MISSING);
4085 			DTRACE_SKYWALK4(classify__failed,
4086 			    struct nx_flowswitch *, fsw,
4087 			    struct __kern_packet *, spkt,
4088 			    struct __kern_packet *, pkt,
4089 			    int, err);
4090 			continue;
4091 		}
4092 		/*
4093 		 * GSO cannot be done on a fragment and it's a bug in user
4094 		 * space to mark a fragment as needing GSO.
4095 		 */
4096 		if (__improbable(pkt->pkt_flow_ip_is_frag)) {
4097 			FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_CONT);
4098 			dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_TX_FRAG_BAD_CONT,
4099 			    DROPTAP_FLAG_L2_MISSING);
4100 			DTRACE_SKYWALK3(is__frag,
4101 			    struct nx_flowswitch *, fsw,
4102 			    struct __kern_packet *, spkt,
4103 			    struct __kern_packet *, pkt);
4104 			continue;
4105 		}
4106 		fe = tx_lookup_flow(fsw, pkt, prev_fe);
4107 		if (__improbable(fe == NULL)) {
4108 			FSW_STATS_INC(FSW_STATS_TX_FLOW_NOT_FOUND);
4109 			dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_TX_FLOW_NOT_FOUND,
4110 			    DROPTAP_FLAG_L2_MISSING);
4111 			DTRACE_SKYWALK3(lookup__failed,
4112 			    struct nx_flowswitch *, fsw,
4113 			    struct __kern_packet *, spkt,
4114 			    struct __kern_packet *, pkt);
4115 			prev_fe = NULL;
4116 			continue;
4117 		}
4118 		/*
4119 		 * Perform GSO on spkt using the flow information
4120 		 * obtained above.
4121 		 */
4122 		struct pktq gso_pktq;
4123 		KPKTQ_INIT(&gso_pktq);
4124 		err = do_gso(fsw, af, spkt, pkt, &dpktq, &gso_pktq);
4125 		if (__probable(err == 0)) {
4126 			tx_flow_enqueue_gso_pktq(&fes, fe, &gso_pktq);
4127 			prev_fe = fe;
4128 		} else {
4129 			DTRACE_SKYWALK1(gso__error, int, err);
4130 			/* TODO: increment error stat */
4131 			dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_GSO_FAILED,
4132 			    DROPTAP_FLAG_L2_MISSING);
4133 			flow_entry_release(&fe);
4134 			prev_fe = NULL;
4135 		}
4136 		KPKTQ_FINI(&gso_pktq);
4137 	}
4138 	struct flow_entry *tfe = NULL;
4139 	TAILQ_FOREACH_SAFE(fe, &fes, fe_tx_link, tfe) {
4140 		/* Chain-enqueue can be used for GSO chains */
4141 		tx_flow_process(fsw, fe, FLOW_PROC_FLAG_GSO);
4142 		TAILQ_REMOVE(&fes, fe, fe_tx_link);
4143 		flow_entry_release(&fe);
4144 		n_flows++;
4145 	}
4146 done:
4147 	FSW_RUNLOCK(fsw);
4148 	if (n_flows > 0) {
4149 		netif_transmit(ifp, NETIF_XMIT_FLAG_CHANNEL);
4150 	}
4151 	if (ifp != NULL) {
4152 		fsw_datamov_end(fsw);
4153 	}
4154 
4155 	/*
4156 	 * It's possible for packets to be left in dpktq because
4157 	 * gso_pkts_estimate is only an estimate. The actual number
4158 	 * of packets needed could be less.
4159 	 */
4160 	uint32_t dpktq_len;
4161 	if ((dpktq_len = KPKTQ_LEN(&dpktq)) > 0) {
4162 		DTRACE_SKYWALK2(leftover__dev__pkts,
4163 		    struct nx_flowswitch *, fsw, uint32_t, dpktq_len);
4164 		dp_free_pktq(fsw, &dpktq);
4165 	}
4166 	KPKTQ_FINI(&dpktq);
4167 }
4168 
4169 static inline void
4170 fsw_dev_ring_flush(struct nx_flowswitch *fsw, struct __kern_channel_ring *r,
4171     struct proc *p)
4172 {
4173 #pragma unused(p)
4174 	uint32_t total_pkts = 0, total_bytes = 0;
4175 
4176 	for (;;) {
4177 		struct pktq pktq;
4178 		KPKTQ_INIT(&pktq);
4179 		uint32_t n_bytes;
4180 		fsw_rx_ring_dequeue_pktq(fsw, r, fsw_rx_batch, &pktq, &n_bytes);
4181 		if (n_bytes == 0) {
4182 			break;
4183 		}
4184 		total_pkts += KPKTQ_LEN(&pktq);
4185 		total_bytes += n_bytes;
4186 
4187 		if (__probable(fsw->fsw_ifp->if_input_netem == NULL)) {
4188 			fsw_receive(fsw, &pktq);
4189 		} else {
4190 			fsw_dev_input_netem_enqueue(fsw, &pktq);
4191 		}
4192 		KPKTQ_FINI(&pktq);
4193 	}
4194 
4195 	KDBG(SK_KTRACE_FSW_DEV_RING_FLUSH, SK_KVA(r), total_pkts, total_bytes);
4196 	DTRACE_SKYWALK2(fsw__dp__dev__ring__flush, uint32_t, total_pkts,
4197 	    uint32_t, total_bytes);
4198 
4199 	/* compute mitigation rate for delivered traffic */
4200 	if (__probable(r->ckr_netif_mit_stats != NULL)) {
4201 		r->ckr_netif_mit_stats(r, total_pkts, total_bytes);
4202 	}
4203 }
4204 
4205 static inline void
4206 fsw_user_ring_flush(struct nx_flowswitch *fsw, struct __kern_channel_ring *r,
4207     struct proc *p)
4208 {
4209 #pragma unused(p)
4210 	static packet_trace_id_t trace_id = 0;
4211 	uint32_t total_pkts = 0, total_bytes = 0;
4212 
4213 	for (;;) {
4214 		struct pktq pktq;
4215 		KPKTQ_INIT(&pktq);
4216 		uint32_t n_bytes;
4217 		uint32_t gso_pkts_estimate = 0;
4218 
4219 		fsw_tx_ring_dequeue_pktq(fsw, r, fsw_tx_batch, &pktq, &n_bytes,
4220 		    &gso_pkts_estimate);
4221 		if (n_bytes == 0) {
4222 			break;
4223 		}
4224 		total_pkts += KPKTQ_LEN(&pktq);
4225 		total_bytes += n_bytes;
4226 
4227 		KPKTQ_FIRST(&pktq)->pkt_trace_id = ++trace_id;
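		/*
		 * Tag the lead packet of each batch with a fresh trace id so
		 * Tx can be traced end-to-end: the fsw stage starts here, the
		 * AQM stage in dp_flow_tx_process().
		 */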
4228 		KDBG(SK_KTRACE_PKT_TX_FSW | DBG_FUNC_START,
4229 		    KPKTQ_FIRST(&pktq)->pkt_trace_id);
4230 
4231 		if (gso_pkts_estimate > 0) {
4232 			dp_gso_pktq(fsw, &pktq, gso_pkts_estimate);
4233 		} else {
4234 			dp_tx_pktq(fsw, &pktq);
4235 		}
4236 		dp_free_pktq(fsw, &pktq);
4237 		KPKTQ_FINI(&pktq);
4238 	}
4239 	kr_update_stats(r, total_pkts, total_bytes);
4240 
4241 	KDBG(SK_KTRACE_FSW_USER_RING_FLUSH, SK_KVA(r), total_pkts, total_bytes);
4242 	DTRACE_SKYWALK2(fsw__dp__user__ring__flush, uint32_t, total_pkts,
4243 	    uint32_t, total_bytes);
4244 }
4245 
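/*
 * Ring-sync entry point: the dev port ring feeds inbound traffic from the
 * driver into the flowswitch, while user channel rings carry outbound
 * traffic from applications.
 */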
4246 void
4247 fsw_ring_flush(struct nx_flowswitch *fsw, struct __kern_channel_ring *r,
4248     struct proc *p)
4249 {
4250 	struct nexus_vp_adapter *vpna = VPNA(KRNA(r));
4251 
4252 	ASSERT(sk_is_sync_protected());
4253 	ASSERT(vpna->vpna_nx_port != FSW_VP_HOST);
4254 	ASSERT(vpna->vpna_up.na_md_type == NEXUS_META_TYPE_PACKET);
4255 
4256 	if (vpna->vpna_nx_port == FSW_VP_DEV) {
4257 		fsw_dev_ring_flush(fsw, r, p);
4258 	} else {
4259 		fsw_user_ring_flush(fsw, r, p);
4260 	}
4261 }
4262 
4263 int
4264 fsw_dp_ctor(struct nx_flowswitch *fsw)
4265 {
4266 	uint32_t fe_cnt = fsw_fe_table_size;
4267 	uint32_t fob_cnt = fsw_flow_owner_buckets;
4268 	uint32_t frb_cnt = fsw_flow_route_buckets;
4269 	uint32_t frib_cnt = fsw_flow_route_id_buckets;
4270 	struct kern_nexus *nx = fsw->fsw_nx;
4271 	char name[64];
4272 	const char *__null_terminated fsw_name = NULL;
4273 	int error = 0;
4274 
4275 	/* just in case */
4276 	if (fe_cnt == 0) {
4277 		fe_cnt = NX_FSW_FE_TABLESZ;
4278 		ASSERT(fe_cnt != 0);
4279 	}
4280 	if (fob_cnt == 0) {
4281 		fob_cnt = NX_FSW_FOB_HASHSZ;
4282 		ASSERT(fob_cnt != 0);
4283 	}
4284 	if (frb_cnt == 0) {
4285 		frb_cnt = NX_FSW_FRB_HASHSZ;
4286 		ASSERT(frb_cnt != 0);
4287 	}
4288 	if (frib_cnt == 0) {
4289 		frib_cnt = NX_FSW_FRIB_HASHSZ;
4290 		ASSERT(frib_cnt != 0);
4291 	}
4292 
4293 	/* make sure fe_cnt is a power of two, else round up */
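	/*
	 * Decrement, smear the highest set bit into every lower bit, then
	 * increment; e.g. fe_cnt 5000 -> 4999 -> 8191 -> 8192.
	 */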
4294 	if ((fe_cnt & (fe_cnt - 1)) != 0) {
4295 		fe_cnt--;
4296 		fe_cnt |= (fe_cnt >> 1);
4297 		fe_cnt |= (fe_cnt >> 2);
4298 		fe_cnt |= (fe_cnt >> 4);
4299 		fe_cnt |= (fe_cnt >> 8);
4300 		fe_cnt |= (fe_cnt >> 16);
4301 		fe_cnt++;
4302 	}
4303 
4304 	/* make sure frb_cnt is a power of two, else round up */
4305 	if ((frb_cnt & (frb_cnt - 1)) != 0) {
4306 		frb_cnt--;
4307 		frb_cnt |= (frb_cnt >> 1);
4308 		frb_cnt |= (frb_cnt >> 2);
4309 		frb_cnt |= (frb_cnt >> 4);
4310 		frb_cnt |= (frb_cnt >> 8);
4311 		frb_cnt |= (frb_cnt >> 16);
4312 		frb_cnt++;
4313 	}
4314 
4315 	lck_mtx_init(&fsw->fsw_detach_barrier_lock, &nexus_lock_group,
4316 	    &nexus_lock_attr);
4317 	lck_mtx_init(&fsw->fsw_reap_lock, &nexus_lock_group, &nexus_lock_attr);
4318 	lck_mtx_init(&fsw->fsw_linger_lock, &nexus_lock_group, &nexus_lock_attr);
4319 	TAILQ_INIT(&fsw->fsw_linger_head);
4320 	lck_mtx_init(&fsw->fsw_rxstrc_lock, &nexus_lock_group, &nexus_lock_attr);
4321 	TAILQ_INIT(&fsw->fsw_rxstrc_head);
4322 
4323 	fsw_name = tsnprintf(name, sizeof(name), "%s_%llu", NX_FSW_NAME, nx->nx_id);
4324 	error = nx_advisory_alloc(nx, fsw_name,
4325 	    &NX_PROV(nx)->nxprov_region_params[SKMEM_REGION_NEXUSADV],
4326 	    NEXUS_ADVISORY_TYPE_FLOWSWITCH);
4327 	if (error != 0) {
4328 		fsw_dp_dtor(fsw);
4329 		return error;
4330 	}
4331 
4332 	fsw->fsw_flow_mgr = flow_mgr_create(fe_cnt, fob_cnt, frb_cnt, frib_cnt);
4333 	if (fsw->fsw_flow_mgr == NULL) {
4334 		fsw_dp_dtor(fsw);
4335 		return error;
4336 	}
4337 
4338 	/* generic name; will be customized upon ifattach */
4339 	(void) snprintf(fsw->fsw_reap_name, sizeof(fsw->fsw_reap_name),
4340 	    FSW_REAP_THREADNAME, name, "");
4341 
4342 	if (kernel_thread_start(fsw_reap_thread_func, fsw,
4343 	    &fsw->fsw_reap_thread) != KERN_SUCCESS) {
4344 		panic_plain("%s: can't create thread", __func__);
4345 		/* NOTREACHED */
4346 		__builtin_unreachable();
4347 	}
4348 	/* this must not fail */
4349 	VERIFY(fsw->fsw_reap_thread != NULL);
4350 
4351 	SK_DF(SK_VERB_MEM, "fsw %p ALLOC", SK_KVA(fsw));
4352 
4353 
4354 	return error;
4355 }
4356 
4357 void
4358 fsw_dp_dtor(struct nx_flowswitch *fsw)
4359 {
4360 	uint64_t f = (1 * NSEC_PER_MSEC);         /* 1 ms */
4361 	uint64_t s = (1000 * NSEC_PER_MSEC);         /* 1 sec */
4362 	uint32_t i = 0;
4363 
4364 #if (DEVELOPMENT || DEBUG)
4365 	if (fsw->fsw_rps_threads != NULL) {
4366 		for (i = 0; i < fsw->fsw_rps_nthreads; i++) {
4367 			fsw_rps_thread_join(fsw, i);
4368 		}
4369 		kfree_type_counted_by(struct fsw_rps_thread, fsw->fsw_rps_nthreads,
4370 		    fsw->fsw_rps_threads);
4371 	}
4372 #endif /* DEVELOPMENT || DEBUG */
4373 
4374 	nx_advisory_free(fsw->fsw_nx);
4375 
4376 	if (fsw->fsw_reap_thread != THREAD_NULL) {
4377 		/* signal thread to begin self-termination */
4378 		lck_mtx_lock(&fsw->fsw_reap_lock);
4379 		fsw->fsw_reap_flags |= FSW_REAPF_TERMINATING;
4380 
4381 		/*
4382 		 * And wait for thread to terminate; use another
4383 		 * wait channel here other than fsw_reap_flags to
4384 		 * make it more explicit.  In the event the reaper
4385 		 * thread misses a wakeup, we'll try again once
4386 		 * every second (except for the first time).
4387 		 */
4388 		while (!(fsw->fsw_reap_flags & FSW_REAPF_TERMINATED)) {
4389 			uint64_t t = 0;
4390 
4391 			nanoseconds_to_absolutetime((i++ == 0) ? f : s, &t);
4392 			clock_absolutetime_interval_to_deadline(t, &t);
4393 			ASSERT(t != 0);
4394 
4395 			fsw->fsw_reap_flags |= FSW_REAPF_TERMINATEBLOCK;
4396 			if (!(fsw->fsw_reap_flags & FSW_REAPF_RUNNING)) {
4397 				thread_wakeup((caddr_t)&fsw->fsw_reap_flags);
4398 			}
4399 			(void) assert_wait_deadline(&fsw->fsw_reap_thread,
4400 			    THREAD_UNINT, t);
4401 			lck_mtx_unlock(&fsw->fsw_reap_lock);
4402 			thread_block(THREAD_CONTINUE_NULL);
4403 			lck_mtx_lock(&fsw->fsw_reap_lock);
4404 			fsw->fsw_reap_flags &= ~FSW_REAPF_TERMINATEBLOCK;
4405 		}
4406 		ASSERT(fsw->fsw_reap_flags & FSW_REAPF_TERMINATED);
4407 		lck_mtx_unlock(&fsw->fsw_reap_lock);
4408 		fsw->fsw_reap_thread = THREAD_NULL;
4409 	}
4410 
4411 	/* free any remaining flow entries in the linger list */
4412 	fsw_linger_purge(fsw);
4413 	fsw_rxstrc_purge(fsw);
4414 
4415 	if (fsw->fsw_flow_mgr != NULL) {
4416 		flow_mgr_destroy(fsw->fsw_flow_mgr);
4417 		fsw->fsw_flow_mgr = NULL;
4418 	}
4419 
4420 
4421 	lck_mtx_destroy(&fsw->fsw_detach_barrier_lock, &nexus_lock_group);
4422 	lck_mtx_destroy(&fsw->fsw_reap_lock, &nexus_lock_group);
4423 	lck_mtx_destroy(&fsw->fsw_linger_lock, &nexus_lock_group);
4424 }
4425 
4426 void
4427 fsw_linger_insert(struct flow_entry *fe)
4428 {
4429 	struct nx_flowswitch *fsw = fe->fe_fsw;
4430 	SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]);
4431 	SK_DF(SK_VERB_FLOW, "fe \"%s\"", fe2str(fe, dbgbuf, sizeof(dbgbuf)));
4432 
4433 	net_update_uptime();
4434 
4435 	ASSERT(flow_entry_refcnt(fe) >= 1);
4436 	ASSERT(fe->fe_flags & FLOWENTF_TORN_DOWN);
4437 	ASSERT(fe->fe_flags & FLOWENTF_DESTROYED);
4438 	ASSERT(!(fe->fe_flags & FLOWENTF_LINGERING));
4439 	ASSERT(fe->fe_flags & FLOWENTF_WAIT_CLOSE);
4440 	ASSERT(fe->fe_linger_wait != 0);
4441 	fe->fe_linger_expire = (net_uptime() + fe->fe_linger_wait);
4442 	os_atomic_or(&fe->fe_flags, FLOWENTF_LINGERING, relaxed);
4443 
4444 	lck_mtx_lock_spin(&fsw->fsw_linger_lock);
4445 	TAILQ_INSERT_TAIL(&fsw->fsw_linger_head, fe, fe_linger_link);
4446 	fsw->fsw_linger_cnt++;
4447 	VERIFY(fsw->fsw_linger_cnt != 0);
4448 	lck_mtx_unlock(&fsw->fsw_linger_lock);
4449 
4450 	fsw_reap_sched(fsw);
4451 }
4452 
4453 static void
4454 fsw_linger_remove_internal(struct flow_entry_linger_head *linger_head,
4455     struct flow_entry *fe)
4456 {
4457 	SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]);
4458 	SK_DF(SK_VERB_FLOW, "fe \"%s\"", fe2str(fe, dbgbuf, sizeof(dbgbuf)));
4459 
4460 	ASSERT(fe->fe_flags & FLOWENTF_TORN_DOWN);
4461 	ASSERT(fe->fe_flags & FLOWENTF_DESTROYED);
4462 	ASSERT(fe->fe_flags & FLOWENTF_LINGERING);
4463 	os_atomic_andnot(&fe->fe_flags, FLOWENTF_LINGERING, relaxed);
4464 
4465 	TAILQ_REMOVE(linger_head, fe, fe_linger_link);
4466 	flow_entry_release(&fe);
4467 }
4468 
4469 static void
4470 fsw_linger_remove(struct flow_entry *fe)
4471 {
4472 	struct nx_flowswitch *fsw = fe->fe_fsw;
4473 
4474 	LCK_MTX_ASSERT(&fsw->fsw_linger_lock, LCK_MTX_ASSERT_OWNED);
4475 
4476 	fsw_linger_remove_internal(&fsw->fsw_linger_head, fe);
4477 	VERIFY(fsw->fsw_linger_cnt != 0);
4478 	fsw->fsw_linger_cnt--;
4479 }
4480 
4481 void
4482 fsw_linger_purge(struct nx_flowswitch *fsw)
4483 {
4484 	struct flow_entry *fe, *tfe;
4485 
4486 	lck_mtx_lock(&fsw->fsw_linger_lock);
4487 	TAILQ_FOREACH_SAFE(fe, &fsw->fsw_linger_head, fe_linger_link, tfe) {
4488 		fsw_linger_remove(fe);
4489 	}
4490 	ASSERT(fsw->fsw_linger_cnt == 0);
4491 	ASSERT(TAILQ_EMPTY(&fsw->fsw_linger_head));
4492 	lck_mtx_unlock(&fsw->fsw_linger_lock);
4493 }
4494 
4495 void
4496 fsw_rxstrc_insert(struct flow_entry *fe)
4497 {
4498 	struct nx_flowswitch *fsw = fe->fe_fsw;
4499 	SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]);
4500 	SK_DF(SK_VERB_FLOW, "fe \"%s\"", fe2str(fe, dbgbuf, sizeof(dbgbuf)));
4501 
4502 	ASSERT(flow_entry_refcnt(fe) >= 1);
4503 	ASSERT(fe->fe_flags & FLOWENTF_TORN_DOWN);
4504 	ASSERT(fe->fe_flags & FLOWENTF_DESTROYED);
4505 	ASSERT(fe->fe_flags & FLOWENTF_AOP_OFFLOAD);
4506 	ASSERT(!(fe->fe_flags & FLOWENTF_RXSTRC_PENDING));
4507 	os_atomic_or(&fe->fe_flags, FLOWENTF_RXSTRC_PENDING, relaxed);
4508 
4509 	flow_entry_retain(fe);
4510 
4511 	lck_mtx_lock_spin(&fsw->fsw_rxstrc_lock);
4512 	TAILQ_INSERT_TAIL(&fsw->fsw_rxstrc_head, fe, fe_rxstrc_link);
4513 	fsw->fsw_rxstrc_cnt++;
4514 	VERIFY(fsw->fsw_rxstrc_cnt != 0);
4515 	lck_mtx_unlock(&fsw->fsw_rxstrc_lock);
4516 
4517 	fsw_reap_sched(fsw);
4518 }
4519 
4520 static void
4521 fsw_rxstrc_remove_internal(struct flow_entry_rxstrc_head *rxstrc_head,
4522     struct flow_entry *fe)
4523 {
4524 	SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]);
4525 	SK_DF(SK_VERB_FLOW, "fe \"%s\"", fe2str(fe, dbgbuf, sizeof(dbgbuf)));
4526 
4527 	ASSERT(fe->fe_flags & FLOWENTF_TORN_DOWN);
4528 	ASSERT(fe->fe_flags & FLOWENTF_DESTROYED);
4529 	ASSERT(fe->fe_flags & FLOWENTF_AOP_OFFLOAD);
4530 	ASSERT(fe->fe_flags & FLOWENTF_RXSTRC_PENDING);
4531 	os_atomic_andnot(&fe->fe_flags, FLOWENTF_RXSTRC_PENDING, relaxed);
4532 
4533 	TAILQ_REMOVE(rxstrc_head, fe, fe_rxstrc_link);
4534 	flow_entry_release(&fe);
4535 }
4536 
4537 static void
4538 fsw_rxstrc_remove(struct flow_entry *fe)
4539 {
4540 	struct nx_flowswitch *fsw = fe->fe_fsw;
4541 
4542 	LCK_MTX_ASSERT(&fsw->fsw_rxstrc_lock, LCK_MTX_ASSERT_OWNED);
4543 
4544 	fsw_rxstrc_remove_internal(&fsw->fsw_rxstrc_head, fe);
4545 	VERIFY(fsw->fsw_rxstrc_cnt != 0);
4546 	fsw->fsw_rxstrc_cnt--;
4547 }
4548 
4549 void
4550 fsw_rxstrc_purge(struct nx_flowswitch *fsw)
4551 {
4552 	struct flow_entry *fe, *tfe;
4553 
4554 	lck_mtx_lock(&fsw->fsw_rxstrc_lock);
4555 	TAILQ_FOREACH_SAFE(fe, &fsw->fsw_rxstrc_head, fe_rxstrc_link, tfe) {
4556 		fsw_rxstrc_remove(fe);
4557 	}
4558 	ASSERT(fsw->fsw_rxstrc_cnt == 0);
4559 	ASSERT(TAILQ_EMPTY(&fsw->fsw_rxstrc_head));
4560 	lck_mtx_unlock(&fsw->fsw_rxstrc_lock);
4561 }
4562 
4563 static void
4564 fsw_defunct_rx_stall_channel(struct nx_flowswitch *fsw)
4565 {
4566 	struct kern_nexus *nx;
4567 	uint64_t now = net_uptime();
4568 
4569 	nx = fsw->fsw_nx;
4570 
4571 	/* Walk through all channels and check for Rx stall condition */
4572 	/* uncrustify doesn't handle C blocks properly */
4573 	/* BEGIN IGNORE CODESTYLE */
4574 	nx_port_foreach(nx, ^(nexus_port_t nxport) {
4575 		struct nexus_adapter *na = nx_port_get_na(nx, nxport);
4576 		uint64_t elapsed, enqueue_ts, dequeue_ts;
4577 		struct __kern_channel_ring *ring;
4578 		struct kern_channel *ch;
4579 		struct proc *p;
4580 
4581 		if (na == NULL || na->na_work_ts == 0 || na->na_rx_rings == NULL) {
4582 			return;
4583 		}
4584 		ch = (struct kern_channel *)na->na_private;
4585 		if (ch == NULL) {
4586 			return;
4587 		}
4588 		ring = KR_SINGLE(na->na_rx_rings);
4589 		enqueue_ts = ring->ckr_rx_enqueue_ts;
4590 		dequeue_ts = ring->ckr_rx_dequeue_ts;
4591 		/* Elapsed time since last Rx enqueue */
4592 		elapsed = now - enqueue_ts;
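		/*
		 * Stalled: packets were enqueued after the last dequeue and
		 * the consumer has not caught up within fsw_rx_stall_thresh
		 * seconds.
		 */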
4593 		if ((dequeue_ts < enqueue_ts) && (elapsed > fsw_rx_stall_thresh)) {
4594 			p = proc_find(ch->ch_pid);
4595 			if (p == NULL) {
4596 				return;
4597 			}
4598 			if (fsw_rx_stall_defunct) {
4599 				kern_channel_defunct(p, ch);
4600 			}
4601 			proc_rele(p);
4602 			DTRACE_SKYWALK3(rx__stall, struct nx_flowswitch *, fsw,
4603 			    struct nexus_adapter *, na, struct __kern_channel_ring *, ring);
4604 			FSW_STATS_INC(FSW_STATS_RX_STALL);
4605 			SK_ERR("Rx stall detected in proc %s(%d) (%s): "
4606 			    "elapsed %llu (s), now: %llu, enqueue: %llu, dequeue: %llu, "
4607 			    "defunct: %s",
4608 			    ch->ch_name, ch->ch_pid, fsw->fsw_ifp->if_xname,
4609 			    elapsed, now, enqueue_ts, dequeue_ts,
4610 			    fsw_rx_stall_defunct ? "yes" : "no");
4611 		}
4612 	});
4613 	/* END IGNORE CODESTYLE */
4614 }
4615 
4616 void
4617 fsw_reap_sched(struct nx_flowswitch *fsw)
4618 {
4619 	ASSERT(fsw->fsw_reap_thread != THREAD_NULL);
4620 	lck_mtx_lock_spin(&fsw->fsw_reap_lock);
4621 	if (!(fsw->fsw_reap_flags & FSW_REAPF_RUNNING) &&
4622 	    !(fsw->fsw_reap_flags & (FSW_REAPF_TERMINATING | FSW_REAPF_TERMINATED))) {
4623 		thread_wakeup((caddr_t)&fsw->fsw_reap_flags);
4624 	}
4625 	lck_mtx_unlock(&fsw->fsw_reap_lock);
4626 }
4627 
4628 __attribute__((noreturn))
4629 static void
4630 fsw_reap_thread_func(void *v, wait_result_t w)
4631 {
4632 #pragma unused(w)
4633 	struct nx_flowswitch *__single fsw = v;
4634 
4635 	ASSERT(fsw->fsw_reap_thread == current_thread());
4636 	/*
4637 	 * -fbounds-safety: __unsafe_null_terminated_from_indexable provides
4638 	 * checks to ensure source contains the null terminator, by doing a
4639 	 * linear scan of the string.
4640 	 */
4641 	thread_set_thread_name(current_thread(),
4642 	    __unsafe_null_terminated_from_indexable(fsw->fsw_reap_name));
4643 
4644 	net_update_uptime();
4645 
4646 	lck_mtx_lock(&fsw->fsw_reap_lock);
4647 	VERIFY(!(fsw->fsw_reap_flags & FSW_REAPF_RUNNING));
4648 	(void) assert_wait(&fsw->fsw_reap_flags, THREAD_UNINT);
4649 	lck_mtx_unlock(&fsw->fsw_reap_lock);
4650 	thread_block_parameter(fsw_reap_thread_cont, fsw);
4651 	/* NOTREACHED */
4652 	__builtin_unreachable();
4653 }
4654 
4655 __attribute__((noreturn))
4656 static void
4657 fsw_reap_thread_cont(void *v, wait_result_t wres)
4658 {
4659 	struct nx_flowswitch *__single fsw = v;
4660 	boolean_t low;
4661 	uint64_t t = 0;
4662 
4663 	SK_DF(SK_VERB_FLOW, "%s: running", fsw->fsw_reap_name);
4664 
4665 	lck_mtx_lock(&fsw->fsw_reap_lock);
4666 	if (__improbable(wres == THREAD_INTERRUPTED ||
4667 	    (fsw->fsw_reap_flags & FSW_REAPF_TERMINATING) != 0)) {
4668 		goto terminate;
4669 	}
4670 
4671 	ASSERT(!(fsw->fsw_reap_flags & FSW_REAPF_TERMINATED));
4672 	fsw->fsw_reap_flags |= FSW_REAPF_RUNNING;
4673 	lck_mtx_unlock(&fsw->fsw_reap_lock);
4674 
4675 	net_update_uptime();
4676 
4677 	/* prevent detach from happening while we're here */
4678 	if (!fsw_detach_barrier_add(fsw)) {
4679 		SK_ERR("%s: netagent detached", fsw->fsw_reap_name);
4680 		t = 0;
4681 	} else {
4682 		uint32_t fe_nonviable, fe_freed, fe_aborted;
4683 		uint32_t fr_freed, fr_resid = 0;
4684 		struct ifnet *ifp = fsw->fsw_ifp;
4685 		uint64_t i = FSW_REAP_IVAL;
4686 		uint64_t now = net_uptime();
4687 		uint64_t last;
4688 
4689 		ASSERT(fsw->fsw_ifp != NULL);
4690 
4691 		/*
4692 		 * Pass 1: process any deferred {withdrawn,nonviable} requests.
4693 		 */
4694 		fe_nonviable = fsw_process_deferred(fsw);
4695 
4696 		/*
4697 		 * Pass 2: remove any expired lingering flows.
4698 		 */
4699 		fe_freed = fsw_process_linger(fsw, &fe_aborted);
4700 
4701 		/*
4702 		 * Pass 3: process any pending Rx steering rule cleanup flows.
4703 		 */
4704 		fsw_process_rxstrc(fsw);
4705 
4706 		/*
4707 		 * Pass 4: prune idle flow routes.
4708 		 */
4709 		fr_freed = flow_route_prune(fsw->fsw_flow_mgr,
4710 		    ifp, &fr_resid);
4711 
4712 		/*
4713 		 * Pass 5: prune the flow table.
4715 		 */
4716 		cuckoo_hashtable_try_shrink(fsw->fsw_flow_mgr->fm_flow_table);
4717 
4718 		SK_DF(SK_VERB_FLOW, "%s: fe_nonviable %u/%u fe_freed %u/%u "
4719 		    "fe_aborted %u fr_freed %u/%u",
4720 		    fsw->fsw_flow_mgr->fm_name, fe_nonviable,
4721 		    (fe_nonviable + fsw->fsw_pending_nonviable),
4722 		    fe_freed, fsw->fsw_linger_cnt, fe_aborted, fr_freed,
4723 		    (fr_freed + fr_resid));
4724 
4725 		/* see if VM memory level is critical */
4726 		low = skmem_lowmem_check();
4727 
4728 		/*
4729 		 * If things appear to be idle, we can prune away cached
4730 		 * objects that have fallen out of the working sets (this
4731 		 * is different from purging).  Every once in a while, we
4732 		 * also purge the caches.  Note that this is done across
4733 		 * all flowswitch instances, and so we limit this to no
4734 		 * more than once every FSW_REAP_SK_THRES seconds.
4735 		 */
4736 		last = os_atomic_load(&fsw_reap_last, relaxed);
4737 		if ((low || (last != 0 && (now - last) >= FSW_REAP_SK_THRES)) &&
4738 		    os_atomic_cmpxchg(&fsw_reap_last, last, now, acq_rel)) {
4739 			fsw_purge_cache(fsw, low);
4740 
4741 			/* increase sleep interval if idle */
4742 			if (kdebug_enable == 0 && fsw->fsw_linger_cnt == 0 &&
4743 			    fsw->fsw_pending_nonviable == 0 && fr_resid == 0) {
4744 				i <<= 3;
4745 			}
4746 		} else if (last == 0) {
4747 			os_atomic_store(&fsw_reap_last, now, release);
4748 		}
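		/*
		 * Editorial note: the exchange above is a lock-free rate
		 * limiter shared by every flowswitch instance; the thread
		 * that wins the compare-and-swap performs the purge for
		 * this interval, losers skip it.  (The per-fsw checks
		 * that follow need no atomics, since only this reaper
		 * thread touches their timestamps.)  In minimal standalone
		 * C11 form, with assumed names (XNU spells these
		 * primitives os_atomic_*):
		 *
		 *	static _Atomic(uint64_t) purge_last;
		 *
		 *	static bool
		 *	rate_limited_once(uint64_t now, uint64_t thres)
		 *	{
		 *		uint64_t last = atomic_load_explicit(
		 *		    &purge_last, memory_order_relaxed);
		 *
		 *		if (last == 0) {
		 *			// seed only; no purge on first pass
		 *			atomic_compare_exchange_strong(
		 *			    &purge_last, &last, now);
		 *			return false;
		 *		}
		 *		if ((now - last) < thres)
		 *			return false;
		 *		return atomic_compare_exchange_strong(
		 *		    &purge_last, &last, now);
		 *	}
		 */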
4749 
4750 		/*
4751 		 * Additionally, run thru the list of channels and prune
4752 		 * or purge away cached objects on "idle" channels.  This
4753 		 * check is rate limited to no more than once every
4754 		 * FSW_DRAIN_CH_THRES seconds.
4755 		 */
4756 		last = fsw->fsw_drain_channel_chk_last;
4757 		if (low || (last != 0 && (now - last) >= FSW_DRAIN_CH_THRES)) {
4758 			SK_DF(SK_VERB_FLOW, "%s: pruning channels",
4759 			    fsw->fsw_flow_mgr->fm_name);
4760 
4761 			fsw->fsw_drain_channel_chk_last = now;
4762 			fsw_drain_channels(fsw, now, low);
4763 		} else if (__improbable(last == 0)) {
4764 			fsw->fsw_drain_channel_chk_last = now;
4765 		}
4766 
4767 		/*
4768 		 * Finally, invoke the interface's reap callback to
4769 		 * tell it to prune or purge away cached objects if
4770 		 * it is idle.  This check is rate limited to no more
4771 		 * than once every FSW_REAP_IF_THRES seconds.
4772 		 */
4773 		last = fsw->fsw_drain_netif_chk_last;
4774 		if (low || (last != 0 && (now - last) >= FSW_REAP_IF_THRES)) {
4775 			ASSERT(fsw->fsw_nifna != NULL);
4776 
4777 			if (ifp->if_na_ops != NULL &&
4778 			    ifp->if_na_ops->ni_reap != NULL) {
4779 				SK_DF(SK_VERB_FLOW, "%s: pruning netif",
4780 				    fsw->fsw_flow_mgr->fm_name);
4781 				ifp->if_na_ops->ni_reap(ifp->if_na, ifp,
4782 				    FSW_REAP_IF_THRES, low);
4783 			}
4784 
4785 			fsw->fsw_drain_netif_chk_last = now;
4786 		} else if (__improbable(last == 0)) {
4787 			fsw->fsw_drain_netif_chk_last = now;
4788 		}
4789 
4790 		/* emit periodic interface stats ktrace */
4791 		last = fsw->fsw_reap_last;
4792 		if (last != 0 && (now - last) >= FSW_IFSTATS_THRES) {
4793 			KDBG(SK_KTRACE_AON_IF_STATS, ifp->if_data.ifi_ipackets,
4794 			    ifp->if_data.ifi_ibytes * 8,
4795 			    ifp->if_data.ifi_opackets,
4796 			    ifp->if_data.ifi_obytes * 8);
4797 
4798 			fsw->fsw_reap_last = now;
4799 		} else if (__improbable(last == 0)) {
4800 			fsw->fsw_reap_last = now;
4801 		}
4802 
4803 		/* Check for Rx stall condition every fsw_rx_stall_thresh seconds */
4804 		last = fsw->fsw_rx_stall_chk_last;
4805 		if (fsw_rx_stall_thresh != 0) {
4806 			if (last != 0 && (now - last) >= fsw_rx_stall_thresh) {
4807 				fsw_defunct_rx_stall_channel(fsw);
4808 				fsw->fsw_rx_stall_chk_last = now;
4809 			} else if (__improbable(last == 0)) {
4810 				fsw->fsw_rx_stall_chk_last = now;
4811 			}
4812 		}
4813 
4814 		nanoseconds_to_absolutetime(i * NSEC_PER_SEC, &t);
4815 		clock_absolutetime_interval_to_deadline(t, &t);
4816 		ASSERT(t != 0);
4817 
4818 		/* allow any pending detach to proceed */
4819 		fsw_detach_barrier_remove(fsw);
4820 	}
4821 
4822 	lck_mtx_lock(&fsw->fsw_reap_lock);
4823 	if (!(fsw->fsw_reap_flags & FSW_REAPF_TERMINATING)) {
4824 		fsw->fsw_reap_flags &= ~FSW_REAPF_RUNNING;
4825 		(void) assert_wait_deadline(&fsw->fsw_reap_flags,
4826 		    THREAD_UNINT, t);
4827 		lck_mtx_unlock(&fsw->fsw_reap_lock);
4828 		thread_block_parameter(fsw_reap_thread_cont, fsw);
4829 		/* NOTREACHED */
4830 		__builtin_unreachable();
4831 	} else {
4832 terminate:
4833 		LCK_MTX_ASSERT(&fsw->fsw_reap_lock, LCK_MTX_ASSERT_OWNED);
4834 		fsw->fsw_reap_flags &= ~(FSW_REAPF_RUNNING | FSW_REAPF_TERMINATING);
4835 		fsw->fsw_reap_flags |= FSW_REAPF_TERMINATED;
4836 		/*
4837 		 * Signal any thread waiting for us to terminate; we use
4838 		 * a wait channel other than fsw_reap_flags here to make
4839 		 * the handshake explicit.
4840 		 */
4841 		if (fsw->fsw_reap_flags & FSW_REAPF_TERMINATEBLOCK) {
4842 			thread_wakeup((caddr_t)&fsw->fsw_reap_thread);
4843 		}
4844 		lck_mtx_unlock(&fsw->fsw_reap_lock);
4845 
4846 		SK_DF(SK_VERB_FLOW, "%s: terminating", fsw->fsw_reap_name);
4847 
4848 		/* for the extra refcnt from kernel_thread_start() */
4849 		thread_deallocate(current_thread());
4850 		/* this is the end */
4851 		thread_terminate(current_thread());
4852 		/* NOTREACHED */
4853 		__builtin_unreachable();
4854 	}
4855 
4856 	/* must never get here */
4857 	VERIFY(0);
4858 	/* NOTREACHED */
4859 	__builtin_unreachable();
4860 }
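/*
 * Editorial sketch: the tail of fsw_reap_thread_cont() converts the
 * sleep interval i (seconds, shifted left by 3 when idle) into an
 * absolute deadline and re-parks via assert_wait_deadline(), so the
 * thread resumes on the next wakeup or at the deadline, whichever comes
 * first.  A minimal userspace model of that timed re-arm, assuming
 * POSIX clocks and reusing struct reap_state from the sketch above:
 */
#if 0 /* illustrative sketch, not part of the build */
#include <time.h>

static void
reap_park_until(struct reap_state *rs, uint64_t interval_sec)
{
	struct timespec deadline;

	/* pthread_cond_timedwait() takes an absolute CLOCK_REALTIME time */
	clock_gettime(CLOCK_REALTIME, &deadline);
	deadline.tv_sec += (time_t)interval_sec;

	pthread_mutex_lock(&rs->lock);
	if (!(rs->flags & REAPF_TERMINATING)) {
		rs->flags &= ~REAPF_RUNNING;
		/* wakes on reap_sched()'s signal or at the deadline */
		(void) pthread_cond_timedwait(&rs->cv, &rs->lock, &deadline);
	}
	pthread_mutex_unlock(&rs->lock);
}
#endif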
4861 
4862 static void
4863 fsw_drain_channels(struct nx_flowswitch *fsw, uint64_t now, boolean_t low)
4864 {
4865 	struct kern_nexus *nx = fsw->fsw_nx;
4866 
4867 	/* flowswitch protects NA via fsw_lock, see fsw_port_alloc/free */
4868 	FSW_RLOCK(fsw);
4869 
4870 	/* uncrustify doesn't handle C blocks properly */
4871 	/* BEGIN IGNORE CODESTYLE */
4872 	nx_port_foreach(nx, ^(nexus_port_t p) {
4873 		boolean_t purge;
4874 		struct nexus_adapter *na = nx_port_get_na(nx, p);
4875 
4876 		if (na == NULL) {
4877 			DTRACE_SKYWALK1(ch__drain__na__null, struct nexus_adapter *, na);
4878 			return;
4879 		}
4880 
4881 		/*
4882 		 * If NA is deactivated, no need to proceed further with channel drain.
4883 		 * Note: fsw_vp_na_activate takes FSW_WLOCK before clearing the
4884 		 * NAF_ACTIVE flag.
4885 		 */
4886 		if ((na->na_flags & NAF_ACTIVE) == 0) {
4887 			DTRACE_SKYWALK1(ch__drain__na__inactive, struct nexus_adapter *, na);
4888 			return;
4889 		}
4890 
4891 		if (na->na_work_ts == 0 || na->na_rx_rings == NULL) {
4892 			DTRACE_SKYWALK1(ch__drain__na__invalid, struct nexus_adapter *, na);
4893 			return;
4894 		}
4895 
4896 		/*
4897 		 * If some activity happened in the last FSW_DRAIN_CH_THRES
4898 		 * seconds on this channel, we reclaim memory if the channel
4899 		 * throughput is less than the reap threshold value.
4900 		 */
4901 		if ((now - na->na_work_ts) < FSW_DRAIN_CH_THRES) {
4902 			struct __kern_channel_ring *__single ring;
4903 			channel_ring_stats *stats;
4904 			uint64_t bps;
4905 
4906 			ring = KR_SINGLE(na->na_rx_rings);
4907 			stats = &ring->ckr_stats;
4908 			bps = stats->crs_bytes_per_second;
4909 
4910 			if (bps < fsw_channel_reap_thresh) {
4911 				purge = FALSE;
4912 				na_drain(na, purge);
4913 			}
4914 			return;
4915 		}
4916 
4917 		/*
4918 		 * If NA has been inactive for some time (twice the drain
4919 		 * threshold), we clear the work timestamp to temporarily skip
4920 		 * this channel until it's active again.  Purging cached objects
4921 		 * can be expensive since we'd need to allocate and construct
4922 		 * them again, so we do it only when necessary.
4923 		 */
4924 		if (low || ((now - na->na_work_ts) >= (FSW_DRAIN_CH_THRES << 1))) {
4925 			na->na_work_ts = 0;
4926 			purge = TRUE;
4927 		} else {
4928 			purge = FALSE;
4929 		}
4930 
4931 		na_drain(na, purge);  /* purge/prune caches */
4932 	});
4933 	/* END IGNORE CODESTYLE */
4934 
4935 	FSW_RUNLOCK(fsw);
4936 }
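/*
 * Editorial sketch: the per-port block above reduces to a small policy
 * function: skip ports with no NA, an inactive NA, or no work history;
 * prune recently-active channels only when measured Rx throughput is
 * below the reap threshold; purge long-idle channels (or everything
 * under memory pressure); prune the rest.  A standalone restatement
 * with assumed names (not the kernel's):
 */
#if 0 /* illustrative sketch, not part of the build */
#include <stdbool.h>
#include <stdint.h>

enum drain_action { DRAIN_SKIP, DRAIN_PRUNE, DRAIN_PURGE };

static enum drain_action
channel_drain_policy(uint64_t now, uint64_t work_ts, uint64_t rx_bps,
    uint64_t drain_thres, uint64_t reap_bps_thres, bool low_memory)
{
	if (work_ts == 0) {
		return DRAIN_SKIP;      /* no recorded activity; parked */
	}
	if ((now - work_ts) < drain_thres) {
		/* recently active: reclaim only if throughput is low */
		return (rx_bps < reap_bps_thres) ? DRAIN_PRUNE : DRAIN_SKIP;
	}
	if (low_memory || (now - work_ts) >= (drain_thres << 1)) {
		return DRAIN_PURGE;     /* idle long enough to purge */
	}
	return DRAIN_PRUNE;
}
#endif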
4937 
4938 static void
4939 fsw_purge_cache(struct nx_flowswitch *fsw, boolean_t low)
4940 {
4941 #pragma unused(fsw)
4942 	uint64_t o = os_atomic_inc_orig(&fsw_want_purge, relaxed);
4943 	uint32_t p = fsw_flow_purge_thresh;
4944 	boolean_t purge = (low || (o != 0 && p != 0 && (o % p) == 0));
4945 
4946 	SK_DF(SK_VERB_FLOW, "%s: %s caches",
4947 	    fsw->fsw_flow_mgr->fm_name,
4948 	    (purge ? "purge" : "prune"));
4949 
4950 	skmem_cache_reap_now(sk_fo_cache, purge);
4951 	skmem_cache_reap_now(sk_fe_cache, purge);
4952 	skmem_cache_reap_now(sk_fab_cache, purge);
4953 	skmem_cache_reap_now(flow_route_cache, purge);
4954 	skmem_cache_reap_now(flow_stats_cache, purge);
4955 	netns_reap_caches(purge);
4956 	skmem_reap_caches(purge);
4957 
4958 #if CONFIG_MBUF_MCACHE
4959 	if (if_is_fsw_transport_netagent_enabled() && purge) {
4960 		mbuf_drain(FALSE);
4961 	}
4962 #endif /* CONFIG_MBUF_MCACHE */
4963 }
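/*
 * Editorial sketch: fsw_purge_cache() escalates from prune to purge
 * either under memory pressure or on every Nth invocation, N being the
 * fsw_flow_purge_thresh tunable.  The counter logic in isolation, as a
 * C11 sketch with assumed names (os_atomic_inc_orig() returns the
 * pre-increment value, as atomic_fetch_add does):
 */
#if 0 /* illustrative sketch, not part of the build */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

static _Atomic(uint64_t) want_purge;

static bool
should_purge(bool low_memory, uint32_t purge_thres)
{
	uint64_t o = atomic_fetch_add_explicit(&want_purge, 1,
	    memory_order_relaxed);

	return low_memory ||
	       (o != 0 && purge_thres != 0 && (o % purge_thres) == 0);
}
#endif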
4964 
4965 static void
4966 fsw_flow_handle_low_power(struct nx_flowswitch *fsw, struct flow_entry *fe)
4967 {
4968 	/* When the interface is in low power mode, the flow is nonviable */
4969 	if (!(fe->fe_flags & FLOWENTF_NONVIABLE) &&
4970 	    os_atomic_cmpxchg(&fe->fe_want_nonviable, 0, 1, acq_rel)) {
4971 		os_atomic_inc(&fsw->fsw_pending_nonviable, relaxed);
4972 	}
4973 }
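/*
 * Editorial note: the cmpxchg above is a "claim once" transition on
 * fe_want_nonviable; only the thread that flips it 0 -> 1 increments
 * the pending-nonviable counter, so repeated reap passes cannot
 * double-count a flow.  In standalone C11 form (assumed analog):
 */
#if 0 /* illustrative sketch, not part of the build */
#include <stdatomic.h>
#include <stdint.h>

static _Atomic(uint32_t) want_nonviable;
static _Atomic(uint32_t) pending_nonviable;

static void
mark_nonviable_once(void)
{
	uint32_t expected = 0;

	/* succeeds for exactly one caller per 0 -> 1 transition */
	if (atomic_compare_exchange_strong(&want_nonviable, &expected, 1)) {
		atomic_fetch_add_explicit(&pending_nonviable, 1,
		    memory_order_relaxed);
	}
}
#endif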
4974 
4975 static uint32_t
4976 fsw_process_deferred(struct nx_flowswitch *fsw)
4977 {
4978 	struct flow_entry_dead sfed __sk_aligned(8);
4979 	struct flow_mgr *fm = fsw->fsw_flow_mgr;
4980 	struct flow_entry_dead *fed, *tfed;
4981 	LIST_HEAD(, flow_entry_dead) fed_head =
4982 	    LIST_HEAD_INITIALIZER(fed_head);
4983 	uint32_t i, nonviable = 0;
4984 	boolean_t lowpowermode = FALSE;
4985 
4986 	bzero(&sfed, sizeof(sfed));
4987 
4988 	/*
4989 	 * Flows become nonviable when the interface enters
4990 	 * low power mode (edge-triggered).
4991 	 */
4992 	if ((fsw->fsw_ifp->if_xflags & IFXF_LOW_POWER) &&
4993 	    fsw->fsw_ifp->if_low_power_gencnt != fsw->fsw_low_power_gencnt) {
4994 		lowpowermode = TRUE;
4995 		fsw->fsw_low_power_gencnt = fsw->fsw_ifp->if_low_power_gencnt;
4996 	}
4997 
4998 	/*
4999 	 * Scan thru the flow entry tree, and commit any pending withdraw or
5000 	 * nonviable requests.  We may need to push stats and/or unassign the
5001 	 * nexus from NECP, but we cannot do that while holding the locks;
5002 	 * build a temporary list for those entries.
5003 	 */
5004 	for (i = 0; i < fm->fm_owner_buckets_cnt; i++) {
5005 		struct flow_owner_bucket *fob = flow_mgr_get_fob_at_idx(fm, i);
5006 		struct flow_owner *fo;
5007 
5008 		/*
5009 		 * Grab the lock at all costs when handling low power mode
5010 		 */
5011 		if (__probable(!lowpowermode)) {
5012 			if (!FOB_TRY_LOCK(fob)) {
5013 				continue;
5014 			}
5015 		} else {
5016 			FOB_LOCK(fob);
5017 		}
5018 
5019 		FOB_LOCK_ASSERT_HELD(fob);
5020 		RB_FOREACH(fo, flow_owner_tree, &fob->fob_owner_head) {
5021 			struct flow_entry *fe;
5022 
5023 			RB_FOREACH(fe, flow_entry_id_tree,
5024 			    &fo->fo_flow_entry_id_head) {
5025 				/* try first as reader; skip if we can't */
5026 				if (__improbable(lowpowermode)) {
5027 					fsw_flow_handle_low_power(fsw, fe);
5028 				}
5029 				if (__improbable(fe->fe_flags & FLOWENTF_HALF_CLOSED)) {
5030 					os_atomic_andnot(&fe->fe_flags, FLOWENTF_HALF_CLOSED, relaxed);
5031 					flow_namespace_half_close(&fe->fe_port_reservation);
5032 				}
5033 
5034 				/* if not withdrawn/nonviable, skip */
5035 				if (!fe->fe_want_withdraw &&
5036 				    !fe->fe_want_nonviable) {
5037 					continue;
5038 				}
5039 				/*
5040 				 * Here we're holding the lock as writer;
5041 				 * don't spend too much time as we're
5042 				 * blocking the data path now.
5043 				 */
5044 				ASSERT(!uuid_is_null(fe->fe_uuid));
5045 				/* only need flow UUID and booleans */
5046 				uuid_copy(sfed.fed_uuid, fe->fe_uuid);
5047 				sfed.fed_want_clonotify =
5048 				    (fe->fe_flags & FLOWENTF_CLOSE_NOTIFY);
5049 				sfed.fed_want_nonviable = fe->fe_want_nonviable;
5050 				flow_entry_teardown(fo, fe);
5051 
5052 				/* do this outside the flow bucket lock */
5053 				fed = flow_entry_dead_alloc(Z_WAITOK);
5054 				ASSERT(fed != NULL);
5055 				*fed = sfed;
5056 				LIST_INSERT_HEAD(&fed_head, fed, fed_link);
5057 			}
5058 		}
5059 		FOB_UNLOCK(fob);
5060 	}
5061 
5062 	/*
5063 	 * These nonviable flows are no longer useful since we've lost
5064 	 * the source IP address; in the event the client monitors the
5065 	 * viability of the flow, explicitly mark it as nonviable so
5066 	 * that a new flow can be created.
5067 	 */
5068 	LIST_FOREACH_SAFE(fed, &fed_head, fed_link, tfed) {
5069 		LIST_REMOVE(fed, fed_link);
5070 		ASSERT(fsw->fsw_agent_session != NULL);
5071 
5072 		/* if flow is closed early */
5073 		if (fed->fed_want_clonotify) {
5074 			necp_client_early_close(fed->fed_uuid);
5075 		}
5076 
5077 		/* if nonviable, unassign nexus attributes */
5078 		if (fed->fed_want_nonviable) {
5079 			(void) netagent_assign_nexus(fsw->fsw_agent_session,
5080 			    fed->fed_uuid, NULL, 0);
5081 		}
5082 
5083 		flow_entry_dead_free(fed);
5084 		++nonviable;
5085 	}
5086 	ASSERT(LIST_EMPTY(&fed_head));
5087 
5088 	return nonviable;
5089 }
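/*
 * Editorial sketch: fsw_process_deferred() is an instance of the
 * collect-then-process pattern: while a bucket lock is held, each
 * victim entry is snapshotted onto a local list, and the slow work
 * (NECP notification/unassignment here) runs only after the lock is
 * dropped.  Skeleton of the pattern with assumed names; the mutex
 * stands in for FOB_LOCK/FOB_UNLOCK:
 */
#if 0 /* illustrative sketch, not part of the build */
#include <pthread.h>
#include <stdlib.h>
#include <sys/queue.h>

struct dead_entry {
	LIST_ENTRY(dead_entry) link;
	int snapshot;           /* fields needed for the slow-path work */
};

static pthread_mutex_t bucket_lock = PTHREAD_MUTEX_INITIALIZER;

static void
collect_then_process(void)
{
	LIST_HEAD(, dead_entry) head = LIST_HEAD_INITIALIZER(head);
	struct dead_entry *de, *tde;

	/* Phase 1: under the lock, snapshot victims onto a local list */
	pthread_mutex_lock(&bucket_lock);
	de = malloc(sizeof(*de));       /* stands in for one torn-down flow */
	if (de != NULL) {
		de->snapshot = 0;
		LIST_INSERT_HEAD(&head, de, link);
	}
	pthread_mutex_unlock(&bucket_lock);

	/* Phase 2: outside the lock, do the expensive per-entry work */
	LIST_FOREACH_SAFE(de, &head, link, tde) {
		LIST_REMOVE(de, link);
		/* e.g. notify a client, then release the snapshot */
		free(de);
	}
}
#endif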
5090 
5091 static uint32_t
5092 fsw_process_linger(struct nx_flowswitch *fsw, uint32_t *abort)
5093 {
5094 	struct flow_entry_linger_head linger_head =
5095 	    TAILQ_HEAD_INITIALIZER(linger_head);
5096 	struct flow_entry *fe, *tfe;
5097 	uint64_t now = net_uptime();
5098 	uint32_t i = 0, cnt = 0, freed = 0;
5099 
5100 	ASSERT(fsw->fsw_ifp != NULL);
5101 	ASSERT(abort != NULL);
5102 	*abort = 0;
5103 
5104 	/*
5105 	 * We don't want to contend with the datapath, so move
5106 	 * everything that's in the linger list into a local list.
5107 	 * This allows us to generate RSTs or free the flow entry
5108 	 * outside the lock.  Any remaining flow entry in the local
5109 	 * list will get re-added back to the head of the linger
5110 	 * list, in front of any new ones added since then.
5111 	 */
5112 	lck_mtx_lock(&fsw->fsw_linger_lock);
5113 	TAILQ_CONCAT(&linger_head, &fsw->fsw_linger_head, fe_linger_link);
5114 	ASSERT(TAILQ_EMPTY(&fsw->fsw_linger_head));
5115 	cnt = fsw->fsw_linger_cnt;
5116 	fsw->fsw_linger_cnt = 0;
5117 	lck_mtx_unlock(&fsw->fsw_linger_lock);
5118 
5119 	TAILQ_FOREACH_SAFE(fe, &linger_head, fe_linger_link, tfe) {
5120 		ASSERT(fe->fe_flags & FLOWENTF_TORN_DOWN);
5121 		ASSERT(fe->fe_flags & FLOWENTF_DESTROYED);
5122 		ASSERT(fe->fe_flags & FLOWENTF_LINGERING);
5123 
5124 		/*
5125 		 * See if this is a TCP flow that needs to generate
5126 		 * a RST to the remote peer (if not already).
5127 		 */
5128 		if (flow_track_tcp_want_abort(fe)) {
5129 			VERIFY(fe->fe_flags & FLOWENTF_ABORTED);
5130 			ASSERT(!uuid_is_null(fe->fe_uuid));
5131 			flow_track_abort_tcp(fe, NULL, NULL);
5132 			(*abort)++;
5133 			SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]);
5134 			SK_DF(SK_VERB_FLOW, "fe \"%s\" [RST]",
5135 			    fe2str(fe, dbgbuf, sizeof(dbgbuf)));
5136 		}
5137 
5138 		/*
5139 		 * If flow has expired, remove from list and free;
5140 		 * otherwise leave it around in the linger list.
5141 		 */
5142 		if (fe->fe_linger_expire <= now) {
5143 			freed++;
5144 			fsw_linger_remove_internal(&linger_head, fe);
5145 			fe = NULL;
5146 		}
5147 		++i;
5148 	}
5149 	VERIFY(i == cnt && cnt >= freed);
5150 
5151 	/*
5152 	 * Add any remaining ones back into the linger list.
5153 	 */
5154 	lck_mtx_lock(&fsw->fsw_linger_lock);
5155 	if (!TAILQ_EMPTY(&linger_head)) {
5156 		ASSERT(TAILQ_EMPTY(&fsw->fsw_linger_head) || fsw->fsw_linger_cnt);
5157 		TAILQ_CONCAT(&linger_head, &fsw->fsw_linger_head, fe_linger_link);
5158 		ASSERT(TAILQ_EMPTY(&fsw->fsw_linger_head));
5159 		TAILQ_CONCAT(&fsw->fsw_linger_head, &linger_head, fe_linger_link);
5160 		fsw->fsw_linger_cnt += (cnt - freed);
5161 	}
5162 	ASSERT(TAILQ_EMPTY(&linger_head));
5163 	lck_mtx_unlock(&fsw->fsw_linger_lock);
5164 
5165 	return freed;
5166 }
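/*
 * Editorial note: the double TAILQ_CONCAT above preserves ordering.
 * TAILQ_CONCAT(dst, src) appends src to the tail of dst and leaves src
 * empty, so survivors end up in front of entries that arrived on the
 * linger list while the lock was dropped:
 *
 *	TAILQ_CONCAT(&local, &global, link);	// local = survivors + new
 *	TAILQ_CONCAT(&global, &local, link);	// global takes the result
 */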
5167 
5168 static void
5169 fsw_process_rxstrc(struct nx_flowswitch *fsw)
5170 {
5171 	struct flow_entry_rxstrc_head rxstrc_head =
5172 	    TAILQ_HEAD_INITIALIZER(rxstrc_head);
5173 	struct flow_entry *fe, *tfe;
5174 
5175 	/*
5176 	 * We don't want to contend with the datapath, so move
5177 	 * everything that's in the rxstrc list into a local list.
5178 	 * This allows us to clean up Rx steering rules or free the flow entry
5179 	 * outside the lock.
5180 	 */
5181 	lck_mtx_lock(&fsw->fsw_rxstrc_lock);
5182 	TAILQ_CONCAT(&rxstrc_head, &fsw->fsw_rxstrc_head, fe_rxstrc_link);
5183 	ASSERT(TAILQ_EMPTY(&fsw->fsw_rxstrc_head));
5184 	fsw->fsw_rxstrc_cnt = 0;
5185 	lck_mtx_unlock(&fsw->fsw_rxstrc_lock);
5186 
5187 	TAILQ_FOREACH_SAFE(fe, &rxstrc_head, fe_rxstrc_link, tfe) {
5188 		ASSERT(fe->fe_flags & FLOWENTF_TORN_DOWN);
5189 		ASSERT(fe->fe_flags & FLOWENTF_DESTROYED);
5190 		ASSERT(fe->fe_flags & FLOWENTF_RXSTRC_PENDING);
5191 		ASSERT(fe->fe_flags & FLOWENTF_AOP_OFFLOAD);
5192 
5193 		flow_entry_rx_steering_rule_cleanup(fsw, fe);
5194 		fsw_rxstrc_remove_internal(&rxstrc_head, fe);
5195 		fe = NULL;
5196 	}
5197 }
5198 
5199 __attribute__((always_inline))
5200 static inline void
5201 fsw_ifp_inc_traffic_class_in_pkt(struct ifnet *ifp, kern_packet_t ph)
5202 {
5203 	switch (__packet_get_traffic_class(ph)) {
5204 	case PKT_TC_BE:
5205 		ifp->if_tc.ifi_ibepackets++;
5206 		ifp->if_tc.ifi_ibebytes += SK_PTR_ADDR_KPKT(ph)->pkt_length;
5207 		break;
5208 	case PKT_TC_BK:
5209 		ifp->if_tc.ifi_ibkpackets++;
5210 		ifp->if_tc.ifi_ibkbytes += SK_PTR_ADDR_KPKT(ph)->pkt_length;
5211 		break;
5212 	case PKT_TC_VI:
5213 		ifp->if_tc.ifi_ivipackets++;
5214 		ifp->if_tc.ifi_ivibytes += SK_PTR_ADDR_KPKT(ph)->pkt_length;
5215 		break;
5216 	case PKT_TC_VO:
5217 		ifp->if_tc.ifi_ivopackets++;
5218 		ifp->if_tc.ifi_ivobytes += SK_PTR_ADDR_KPKT(ph)->pkt_length;
5219 		break;
5220 	default:
5221 		break;
5222 	}
5223 }
5224 
5225 __attribute__((always_inline))
5226 static inline void
5227 fsw_ifp_inc_traffic_class_out_pkt(struct ifnet *ifp, uint32_t svc,
5228     uint32_t cnt, uint32_t len)
5229 {
5230 	switch (svc) {
5231 	case PKT_TC_BE:
5232 		ifp->if_tc.ifi_obepackets += cnt;
5233 		ifp->if_tc.ifi_obebytes += len;
5234 		break;
5235 	case PKT_TC_BK:
5236 		ifp->if_tc.ifi_obkpackets += cnt;
5237 		ifp->if_tc.ifi_obkbytes += len;
5238 		break;
5239 	case PKT_TC_VI:
5240 		ifp->if_tc.ifi_ovipackets += cnt;
5241 		ifp->if_tc.ifi_ovibytes += len;
5242 		break;
5243 	case PKT_TC_VO:
5244 		ifp->if_tc.ifi_ovopackets += cnt;
5245 		ifp->if_tc.ifi_ovobytes += len;
5246 		break;
5247 	default:
5248 		break;
5249 	}
5250 }
5251