xref: /xnu-8020.101.4/bsd/skywalk/nexus/netif/nx_netif.c (revision e7776783b89a353188416a9a346c6cdb4928faad)
1 /*
2  * Copyright (c) 2015-2022 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 /*
30  * The netif nexus domain has two domain providers: native and compat, with
31  * the latter being the default provider of this domain. The compat provider
32  * has special handlers for NXCFG_CMD_ATTACH and NXCFG_CMD_DETACH, etc.
33  *
34  * A netif nexus instance can be in a native or compat mode; in either case,
35  * it is associated with two instances of a nexus_adapter structure, and allows
36  * at most two channels opened to the nexus.  The two adapters correspond to
37  * host and device ports, respectively.
38  *
39  * By itself, a netif nexus isn't associated with a network interface. The
40  * association happens by attaching a network interface to the nexus instance.
41  * A channel can only be successfully opened to a netif nexus after it has an
42  * interface attached to it.
43  *
44  * During an attach, the interface is marked as Skywalk-capable, and its ifnet
45  * structure refers to the attached netif nexus adapter via its if_na field.
46  * The nexus also holds a reference to the interface on its na_ifp field. Note
47  * that attaching to a netif_compat nexus does not alter the input/output data
48  * path, nor does it remove any of the interface's hardware offload flags. It
49  * merely associates the interface and netif nexus together.
50  *
51  * During a detach, the above references are dropped and the fields are cleared;
52  * the interface is also marked as non-Skywalk-capable. This detach can happen
53  * explicitly via a command down the nexus, or implicitly when the nexus goes
54  * away (assuming there's no channel opened to it.)
55  *
56  * A userland channel can be opened to a netif nexus via the usual ch_open()
57  * way, assuming the nexus provider is setup to allow access for the userland
58  * process (either by binding the nexus port to PID, etc. or by creating the
59  * nexus in the anonymous mode.)
60  *
61  * Alternatively, a kernel channel can also be opened to it by some kernel
62  * subsystem, via ch_open_special(), e.g. by the flowswitch. Kernel channels
63  * don't have any task mapping created, and the flag CHANF_KERNEL is used to
64  * indicate that.
65  *
66  * Opening a channel to the host port of a native or compat netif causes the
67  * ifnet output path to be redirected to nx_netif_host_transmit().  We also,
68  * at present, disable any hardware offload features.
69  *
70  * Opening a channel to the device port of a compat netif causes the ifnet
71  * input path to be redirected to nx_netif_compat_receive().  This is specific
72  * to the compat variant, as the native variant's RX path already goes to
73  * the native netif.
74  *
75  * During channel close, we restore the original I/O callbacks, as well as the
76  * interface's offload flags.
77  */
78 
79 #include <skywalk/os_skywalk_private.h>
80 #include <skywalk/nexus/netif/nx_netif.h>
81 #include <skywalk/nexus/upipe/nx_user_pipe.h>
82 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
83 #include <sys/kdebug.h>
84 #include <sys/sdt.h>
85 #include <os/refcnt.h>
86 #include <libkern/OSDebug.h>
87 
/* Ring-count, slot-count and buffer-size bounds for the netif domain. */
#define NX_NETIF_MAXRINGS       NX_MAX_NUM_RING_PAIR
#define NX_NETIF_MINSLOTS       2       /* XXX same as above */
#define NX_NETIF_MAXSLOTS       NX_MAX_NUM_SLOT_PER_RING /* max # of slots */
#define NX_NETIF_TXRINGSIZE     512     /* default TX ring size */
#define NX_NETIF_RXRINGSIZE     1024    /* default RX ring size */
#define NX_NETIF_BUFSIZE        (2 * 1024)  /* default buffer size */
#define NX_NETIF_MINBUFSIZE     (128)  /* min buffer size */
#define NX_NETIF_MAXBUFSIZE     (16 * 1024) /* max buffer size */

/*
 * TODO: [email protected] -- minimum buflets for now; we will need to
 * have a way to adjust this based on the underlying interface's
 * parameters, e.g. jumbo MTU, large segment offload, etc.
 */
#define NX_NETIF_UMD_SIZE       _USER_PACKET_SIZE(BUFLETS_MIN)
#define NX_NETIF_KMD_SIZE       _KERN_PACKET_SIZE(BUFLETS_MIN)

/*
 * minimum stack space required for IOSkywalkFamily and Driver execution.
 */
#if XNU_TARGET_OS_OSX
#define NX_NETIF_MIN_DRIVER_STACK_SIZE    (kernel_stack_size >> 1)
#else /* !XNU_TARGET_OS_OSX */
#define NX_NETIF_MIN_DRIVER_STACK_SIZE    (kernel_stack_size >> 2)
#endif /* XNU_TARGET_OS_OSX */
113 
/*
 * Forward declarations: domain lifecycle hooks, per-adapter sync/notify
 * callbacks, and netif control (attach/detach) handlers defined below.
 */
static void nx_netif_dom_init(struct nxdom *);
static void nx_netif_dom_terminate(struct nxdom *);
static void nx_netif_dom_fini(struct nxdom *);
static int nx_netif_prov_params_adjust(
	const struct kern_nexus_domain_provider *, const struct nxprov_params *,
	struct nxprov_adjusted_params *);

static int nx_netif_dom_bind_port(struct kern_nexus *, nexus_port_t *,
    struct nxbind *, void *);
static int nx_netif_dom_unbind_port(struct kern_nexus *, nexus_port_t);
static int nx_netif_dom_connect(struct kern_nexus_domain_provider *,
    struct kern_nexus *, struct kern_channel *, struct chreq *,
    struct kern_channel *, struct nxbind *, struct proc *);
static void nx_netif_dom_disconnect(struct kern_nexus_domain_provider *,
    struct kern_nexus *, struct kern_channel *);
static void nx_netif_dom_defunct(struct kern_nexus_domain_provider *,
    struct kern_nexus *, struct kern_channel *, struct proc *);
static void nx_netif_dom_defunct_finalize(struct kern_nexus_domain_provider *,
    struct kern_nexus *, struct kern_channel *, boolean_t);

static void nx_netif_doorbell(struct ifnet *);
static int nx_netif_na_txsync(struct __kern_channel_ring *, struct proc *,
    uint32_t);
static int nx_netif_na_rxsync(struct __kern_channel_ring *, struct proc *,
    uint32_t);
static void nx_netif_na_dtor(struct nexus_adapter *na);
static int nx_netif_na_notify_tx(struct __kern_channel_ring *, struct proc *,
    uint32_t);
static int nx_netif_na_notify_rx(struct __kern_channel_ring *, struct proc *,
    uint32_t);
static int nx_netif_na_activate(struct nexus_adapter *, na_activate_mode_t);

static int nx_netif_ctl(struct kern_nexus *, nxcfg_cmd_t, void *,
    struct proc *);
static int nx_netif_ctl_attach(struct kern_nexus *, struct nx_spec_req *,
    struct proc *);
static int nx_netif_ctl_detach(struct kern_nexus *, struct nx_spec_req *);
static int nx_netif_attach(struct kern_nexus *, struct ifnet *);
static void nx_netif_flags_init(struct nx_netif *);
static void nx_netif_flags_fini(struct nx_netif *);
static int nx_netif_na_channel_event_notify(struct nexus_adapter *,
    struct __kern_packet *, struct __kern_channel_event *, uint16_t);
static void nx_netif_capabilities_fini(struct nx_netif *);
static errno_t nx_netif_interface_advisory_notify(void *,
    const struct ifnet_interface_advisory *);
159 
/*
 * Domain descriptor for the netif nexus type: default/min/max bounds for
 * ports, rings, slots, buffer and metadata sizes, plus the domain-level
 * callback vector invoked by the nexus framework.
 */
struct nxdom nx_netif_dom_s = {
	.nxdom_prov_head =
    STAILQ_HEAD_INITIALIZER(nx_netif_dom_s.nxdom_prov_head),
	.nxdom_type =           NEXUS_TYPE_NET_IF,
	.nxdom_md_type =        NEXUS_META_TYPE_PACKET,
	.nxdom_md_subtype =     NEXUS_META_SUBTYPE_RAW,
	.nxdom_name =           "netif",
	/* at least two ports: dev (port 0) and host (port 1) */
	.nxdom_ports = {
		.nb_def = 2,
		.nb_min = 2,
		.nb_max = NX_NETIF_MAXPORTS,
	},
	.nxdom_tx_rings = {
		.nb_def = 1,
		.nb_min = 1,
		.nb_max = NX_NETIF_MAXRINGS,
	},
	.nxdom_rx_rings = {
		.nb_def = 1,
		.nb_min = 1,
		.nb_max = NX_NETIF_MAXRINGS,
	},
	.nxdom_tx_slots = {
		.nb_def = NX_NETIF_TXRINGSIZE,
		.nb_min = NX_NETIF_MINSLOTS,
		.nb_max = NX_NETIF_MAXSLOTS,
	},
	.nxdom_rx_slots = {
		.nb_def = NX_NETIF_RXRINGSIZE,
		.nb_min = NX_NETIF_MINSLOTS,
		.nb_max = NX_NETIF_MAXSLOTS,
	},
	.nxdom_buf_size = {
		.nb_def = NX_NETIF_BUFSIZE,
		.nb_min = NX_NETIF_MINBUFSIZE,
		.nb_max = NX_NETIF_MAXBUFSIZE,
	},
	.nxdom_meta_size = {
		.nb_def = NX_NETIF_UMD_SIZE,
		.nb_min = NX_NETIF_UMD_SIZE,
		.nb_max = NX_METADATA_USR_MAX_SZ,
	},
	/* no per-nexus stats/pipes/flow-advisory regions by default */
	.nxdom_stats_size = {
		.nb_def = 0,
		.nb_min = 0,
		.nb_max = NX_STATS_MAX_SZ,
	},
	.nxdom_pipes = {
		.nb_def = 0,
		.nb_min = 0,
		.nb_max = NX_UPIPE_MAXPIPES,
	},
	.nxdom_flowadv_max = {
		.nb_def = 0,
		.nb_min = 0,
		.nb_max = NX_FLOWADV_MAX,
	},
	.nxdom_nexusadv_size = {
		.nb_def = 0,
		.nb_min = 0,
		.nb_max = NX_NEXUSADV_MAX_SZ,
	},
	.nxdom_capabilities = {
		.nb_def = NXPCAP_USER_CHANNEL,
		.nb_min = 0,
		.nb_max = NXPCAP_USER_CHANNEL,
	},
	.nxdom_qmap = {
		.nb_def = NEXUS_QMAP_TYPE_DEFAULT,
		.nb_min = NEXUS_QMAP_TYPE_DEFAULT,
		.nb_max = NEXUS_QMAP_TYPE_WMM,
	},
	.nxdom_max_frags = {
		.nb_def = NX_PBUF_FRAGS_DEFAULT,
		.nb_min = NX_PBUF_FRAGS_MIN,
		.nb_max = NX_PBUF_FRAGS_MAX,
	},
	.nxdom_init =           nx_netif_dom_init,
	.nxdom_terminate =      nx_netif_dom_terminate,
	.nxdom_fini =           nx_netif_dom_fini,
	.nxdom_find_port =      NULL,
	.nxdom_port_is_reserved = NULL,
	.nxdom_bind_port =      nx_netif_dom_bind_port,
	.nxdom_unbind_port =    nx_netif_dom_unbind_port,
	.nxdom_connect =        nx_netif_dom_connect,
	.nxdom_disconnect =     nx_netif_dom_disconnect,
	.nxdom_defunct =        nx_netif_dom_defunct,
	.nxdom_defunct_finalize = nx_netif_dom_defunct_finalize,
};
249 
/*
 * Native netif domain provider, registered with the domain by
 * nx_netif_dom_init().
 */
struct kern_nexus_domain_provider nx_netif_prov_s = {
	.nxdom_prov_name =              NEXUS_PROVIDER_NET_IF,
	/*
	 * Don't install this as the default domain provider, i.e.
	 * NXDOMPROVF_DEFAULT flag not set; we want netif_compat
	 * provider to be the one handling userland-issued requests
	 * coming down thru nxprov_create() instead.
	 */
	.nxdom_prov_flags =             0,
	.nxdom_prov_cb = {
		.dp_cb_init =           nx_netif_prov_init,
		.dp_cb_fini =           nx_netif_prov_fini,
		.dp_cb_params =         nx_netif_prov_params,
		.dp_cb_mem_new =        nx_netif_prov_mem_new,
		.dp_cb_config =         nx_netif_prov_config,
		.dp_cb_nx_ctor =        nx_netif_prov_nx_ctor,
		.dp_cb_nx_dtor =        nx_netif_prov_nx_dtor,
		.dp_cb_nx_mem_info =    nx_netif_prov_nx_mem_info,
		.dp_cb_nx_mib_get =     nx_netif_prov_nx_mib_get,
		.dp_cb_nx_stop =        nx_netif_prov_nx_stop,
	},
};
272 
/* ifnet-facing operations vector for native netif adapters. */
struct nexus_ifnet_ops na_netif_ops = {
	.ni_finalize = na_netif_finalize,
	.ni_reap = nx_netif_reap,
	.ni_dequeue = nx_netif_native_tx_dequeue,
	.ni_get_len = nx_netif_native_tx_get_len
};
279 
/* Upper bound on packets dequeued directly in doorbell context. */
#define NX_NETIF_DOORBELL_MAX_DEQUEUE    64
uint32_t nx_netif_doorbell_max_dequeue = NX_NETIF_DOORBELL_MAX_DEQUEUE;

SYSCTL_EXTENSIBLE_NODE(_kern_skywalk, OID_AUTO, netif,
    CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Skywalk network interface");
#if (DEVELOPMENT || DEBUG)
/* The tunables below exist only on DEVELOPMENT/DEBUG kernels. */
SYSCTL_STRING(_kern_skywalk_netif, OID_AUTO, sk_ll_prefix,
    CTLFLAG_RW | CTLFLAG_LOCKED, sk_ll_prefix, sizeof(sk_ll_prefix),
    "ifname prefix for enabling low latency support");
static uint32_t nx_netif_force_ifnet_start = 0;
SYSCTL_UINT(_kern_skywalk_netif, OID_AUTO, force_ifnet_start,
    CTLFLAG_RW | CTLFLAG_LOCKED, &nx_netif_force_ifnet_start, 0,
    "always use ifnet starter thread");
SYSCTL_UINT(_kern_skywalk_netif, OID_AUTO, doorbell_max_dequeue,
    CTLFLAG_RW | CTLFLAG_LOCKED, &nx_netif_doorbell_max_dequeue,
    NX_NETIF_DOORBELL_MAX_DEQUEUE,
    "max packets to dequeue in doorbell context");
#endif /* !DEVELOPMENT && !DEBUG */

/* Zones backing the netif adapter and netif private structures. */
static ZONE_DEFINE(na_netif_zone, SKMEM_ZONE_PREFIX ".na.netif",
    sizeof(struct nexus_netif_adapter), ZC_ZFREE_CLEARMEM);

static ZONE_DEFINE(nx_netif_zone, SKMEM_ZONE_PREFIX ".nx.netif",
    sizeof(struct nx_netif), ZC_ZFREE_CLEARMEM);

/*
 * skmem allocation tags; allocated in nx_netif_dom_init() and released
 * in nx_netif_dom_terminate().
 */
#define SKMEM_TAG_NETIF_MIT          "com.apple.skywalk.netif.mit"
static kern_allocation_name_t skmem_tag_netif_mit;

#define SKMEM_TAG_NETIF_FILTER       "com.apple.skywalk.netif.filter"
kern_allocation_name_t skmem_tag_netif_filter;

#define SKMEM_TAG_NETIF_FLOW         "com.apple.skywalk.netif.flow"
kern_allocation_name_t skmem_tag_netif_flow;

#define SKMEM_TAG_NETIF_AGENT_FLOW   "com.apple.skywalk.netif.agent_flow"
kern_allocation_name_t skmem_tag_netif_agent_flow;

#define SKMEM_TAG_NETIF_LLINK        "com.apple.skywalk.netif.llink"
kern_allocation_name_t skmem_tag_netif_llink;

#define SKMEM_TAG_NETIF_QSET         "com.apple.skywalk.netif.qset"
kern_allocation_name_t skmem_tag_netif_qset;

#define SKMEM_TAG_NETIF_LLINK_INFO   "com.apple.skywalk.netif.llink_info"
kern_allocation_name_t skmem_tag_netif_llink_info;
325 
326 static void
nx_netif_dom_init(struct nxdom * nxdom)327 nx_netif_dom_init(struct nxdom *nxdom)
328 {
329 	SK_LOCK_ASSERT_HELD();
330 	ASSERT(!(nxdom->nxdom_flags & NEXUSDOMF_INITIALIZED));
331 
332 	_CASSERT(NEXUS_PORT_NET_IF_DEV == 0);
333 	_CASSERT(NEXUS_PORT_NET_IF_HOST == 1);
334 	_CASSERT(NEXUS_PORT_NET_IF_CLIENT == 2);
335 	_CASSERT(SK_NETIF_MIT_FORCE_OFF < SK_NETIF_MIT_FORCE_SIMPLE);
336 	_CASSERT(SK_NETIF_MIT_FORCE_SIMPLE < SK_NETIF_MIT_FORCE_ADVANCED);
337 	_CASSERT(SK_NETIF_MIT_FORCE_ADVANCED < SK_NETIF_MIT_AUTO);
338 	_CASSERT(SK_NETIF_MIT_AUTO == SK_NETIF_MIT_MAX);
339 
340 	(void) nxdom_prov_add(nxdom, &nx_netif_prov_s);
341 
342 	ASSERT(skmem_tag_netif_mit == NULL);
343 	skmem_tag_netif_mit =
344 	    kern_allocation_name_allocate(SKMEM_TAG_NETIF_MIT, 0);
345 	ASSERT(skmem_tag_netif_mit != NULL);
346 
347 	ASSERT(skmem_tag_netif_filter == NULL);
348 	skmem_tag_netif_filter =
349 	    kern_allocation_name_allocate(SKMEM_TAG_NETIF_FILTER, 0);
350 	ASSERT(skmem_tag_netif_filter != NULL);
351 
352 	ASSERT(skmem_tag_netif_flow == NULL);
353 	skmem_tag_netif_flow =
354 	    kern_allocation_name_allocate(SKMEM_TAG_NETIF_FLOW, 0);
355 	ASSERT(skmem_tag_netif_flow != NULL);
356 
357 	ASSERT(skmem_tag_netif_agent_flow == NULL);
358 	skmem_tag_netif_agent_flow =
359 	    kern_allocation_name_allocate(SKMEM_TAG_NETIF_AGENT_FLOW, 0);
360 	ASSERT(skmem_tag_netif_agent_flow != NULL);
361 
362 	ASSERT(skmem_tag_netif_llink == NULL);
363 	skmem_tag_netif_llink =
364 	    kern_allocation_name_allocate(SKMEM_TAG_NETIF_LLINK, 0);
365 	ASSERT(skmem_tag_netif_llink != NULL);
366 
367 	ASSERT(skmem_tag_netif_qset == NULL);
368 	skmem_tag_netif_qset =
369 	    kern_allocation_name_allocate(SKMEM_TAG_NETIF_QSET, 0);
370 	ASSERT(skmem_tag_netif_qset != NULL);
371 
372 	ASSERT(skmem_tag_netif_llink_info == NULL);
373 	skmem_tag_netif_llink_info =
374 	    kern_allocation_name_allocate(SKMEM_TAG_NETIF_LLINK_INFO, 0);
375 	ASSERT(skmem_tag_netif_llink_info != NULL);
376 
377 	nx_netif_compat_init(nxdom);
378 
379 	ASSERT(nxdom_prov_default[nxdom->nxdom_type] != NULL &&
380 	    strcmp(nxdom_prov_default[nxdom->nxdom_type]->nxdom_prov_name,
381 	    NEXUS_PROVIDER_NET_IF_COMPAT) == 0);
382 
383 	netif_gso_init();
384 	nx_netif_llink_module_init();
385 }
386 
387 static void
nx_netif_dom_terminate(struct nxdom * nxdom)388 nx_netif_dom_terminate(struct nxdom *nxdom)
389 {
390 	struct kern_nexus_domain_provider *nxdom_prov, *tnxdp;
391 
392 	SK_LOCK_ASSERT_HELD();
393 
394 	nx_netif_llink_module_fini();
395 	netif_gso_fini();
396 	nx_netif_compat_fini();
397 
398 	if (skmem_tag_netif_llink_info != NULL) {
399 		kern_allocation_name_release(skmem_tag_netif_llink_info);
400 		skmem_tag_netif_llink_info = NULL;
401 	}
402 	if (skmem_tag_netif_qset != NULL) {
403 		kern_allocation_name_release(skmem_tag_netif_qset);
404 		skmem_tag_netif_qset = NULL;
405 	}
406 	if (skmem_tag_netif_llink != NULL) {
407 		kern_allocation_name_release(skmem_tag_netif_llink);
408 		skmem_tag_netif_llink = NULL;
409 	}
410 	if (skmem_tag_netif_agent_flow != NULL) {
411 		kern_allocation_name_release(skmem_tag_netif_agent_flow);
412 		skmem_tag_netif_agent_flow = NULL;
413 	}
414 	if (skmem_tag_netif_flow != NULL) {
415 		kern_allocation_name_release(skmem_tag_netif_flow);
416 		skmem_tag_netif_flow = NULL;
417 	}
418 	if (skmem_tag_netif_filter != NULL) {
419 		kern_allocation_name_release(skmem_tag_netif_filter);
420 		skmem_tag_netif_filter = NULL;
421 	}
422 	if (skmem_tag_netif_mit != NULL) {
423 		kern_allocation_name_release(skmem_tag_netif_mit);
424 		skmem_tag_netif_mit = NULL;
425 	}
426 
427 	STAILQ_FOREACH_SAFE(nxdom_prov, &nxdom->nxdom_prov_head,
428 	    nxdom_prov_link, tnxdp) {
429 		(void) nxdom_prov_del(nxdom_prov);
430 	}
431 }
432 
/*
 * Domain finalize hook; netif keeps no per-domain state beyond what
 * nx_netif_dom_terminate() already releases, so there is nothing to do.
 */
static void
nx_netif_dom_fini(struct nxdom *nxdom)
{
#pragma unused(nxdom)
}
438 
/*
 * Provider init callback; netif has no per-provider state, so this
 * only logs.  The pragma is needed because SK_D() compiles to nothing
 * on non-debug builds, leaving nxdom_prov otherwise unreferenced.
 */
int
nx_netif_prov_init(struct kern_nexus_domain_provider *nxdom_prov)
{
#pragma unused(nxdom_prov)
	SK_D("initializing %s", nxdom_prov->nxdom_prov_name);
	return 0;
}
446 
/*
 * Notify callback installed on rings while the nexus is being stopped
 * (see nx_netif_prov_nx_stop()); rejects any late wakeup with ENXIO.
 */
static int
nx_netif_na_notify_drop(struct __kern_channel_ring *kring, struct proc *p,
    uint32_t flags)
{
#pragma unused(kring, p, flags)
	return ENXIO;
}
454 
/*
 * Quiesce the netif nexus prior to teardown: place every ring into
 * drop mode, atomically swap each ring's notify callback with
 * nx_netif_na_notify_drop() so late doorbells fail with ENXIO, and
 * tear down the per-ring mitigation state and arrays.
 */
int
nx_netif_prov_nx_stop(struct kern_nexus *nx)
{
	uint32_t r;
	struct nexus_adapter *na = nx_port_get_na(nx, NEXUS_PORT_NET_IF_DEV);
	struct nexus_netif_adapter *nifna = (struct nexus_netif_adapter *)na;

	SK_LOCK_ASSERT_HELD();
	ASSERT(nx != NULL);

	/* place all rings in drop mode */
	na_kr_drop(na, TRUE);

	/* ensure global visibility */
	membar_sync();

	/* reset all TX notify callbacks */
	for (r = 0; r < na_get_nrings(na, NR_TX); r++) {
		/*
		 * Compare-and-swap loop: retry until the current callback
		 * pointer is successfully replaced by the drop handler.
		 */
		while (!atomic_test_set_ptr(&na->na_tx_rings[r].ckr_na_notify,
		    ptrauth_nop_cast(void *, na->na_tx_rings[r].ckr_na_notify),
		    ptrauth_nop_cast(void *, &nx_netif_na_notify_drop))) {
			;
		}
		membar_sync();
		if (nifna->nifna_tx_mit != NULL) {
			nx_netif_mit_cleanup(&nifna->nifna_tx_mit[r]);
		}
	}
	/* free the TX mitigation array once all entries are cleaned up */
	if (nifna->nifna_tx_mit != NULL) {
		skn_free_type_array(tx, struct nx_netif_mit,
		    na_get_nrings(na, NR_TX), nifna->nifna_tx_mit);
		nifna->nifna_tx_mit = NULL;
	}

	/* reset all RX notify callbacks */
	for (r = 0; r < na_get_nrings(na, NR_RX); r++) {
		/* same CAS pattern as the TX loop above */
		while (!atomic_test_set_ptr(&na->na_rx_rings[r].ckr_na_notify,
		    ptrauth_nop_cast(void *, na->na_rx_rings[r].ckr_na_notify),
		    ptrauth_nop_cast(void *, &nx_netif_na_notify_drop))) {
			;
		}
		membar_sync();
		if (nifna->nifna_rx_mit != NULL) {
			nx_netif_mit_cleanup(&nifna->nifna_rx_mit[r]);
		}
	}
	/* free the RX mitigation array once all entries are cleaned up */
	if (nifna->nifna_rx_mit != NULL) {
		skn_free_type_array(rx, struct nx_netif_mit,
		    na_get_nrings(na, NR_RX), nifna->nifna_rx_mit);
		nifna->nifna_rx_mit = NULL;
	}
	return 0;
}
508 
/*
 * Tune compat-netif ring sizes down for interface types that don't
 * need the default (larger) rings, to reduce memory footprint.
 */
static inline void
nx_netif_compat_adjust_ring_size(struct nxprov_adjusted_params *adj,
    ifnet_t ifp)
{
	/* auxiliary (non-unit-0) cellular interfaces get smaller rings */
	if (IFNET_IS_CELLULAR(ifp) && (ifp->if_unit != 0)) {
		*(adj->adj_rx_slots) = sk_netif_compat_aux_cell_rx_ring_sz;
		*(adj->adj_tx_slots) = sk_netif_compat_aux_cell_tx_ring_sz;
	} else if (IFNET_IS_WIFI(ifp)) {
		/* an interface named exactly "ap" is the Wi-Fi Access Point */
		if (ifp->if_name[0] == 'a' && ifp->if_name[1] == 'p' &&
		    ifp->if_name[2] == '\0') {
			/* Wi-Fi Access Point */
			*(adj->adj_rx_slots) = sk_netif_compat_wap_rx_ring_sz;
			*(adj->adj_tx_slots) = sk_netif_compat_wap_tx_ring_sz;
		} else if (ifp->if_eflags & IFEF_AWDL) {
			/* AWDL */
			*(adj->adj_rx_slots) = sk_netif_compat_awdl_rx_ring_sz;
			*(adj->adj_tx_slots) = sk_netif_compat_awdl_tx_ring_sz;
		} else {
			/* Wi-Fi infrastructure */
			*(adj->adj_rx_slots) = sk_netif_compat_wif_rx_ring_sz;
			*(adj->adj_tx_slots) = sk_netif_compat_wif_tx_ring_sz;
		}
	} else if (IFNET_IS_ETHERNET(ifp)) {
#if !XNU_TARGET_OS_OSX
		/*
		 * On non-macOS platforms, treat all compat Ethernet
		 * interfaces as USB Ethernet with reduced ring sizes.
		 */
		*(adj->adj_rx_slots) = sk_netif_compat_usb_eth_rx_ring_sz;
		*(adj->adj_tx_slots) = sk_netif_compat_usb_eth_tx_ring_sz;
#else /* XNU_TARGET_OS_OSX */
		/* on macOS, only USB Ethernet gets the reduced rings */
		if (ifp->if_subfamily == IFNET_SUBFAMILY_USB) {
			*(adj->adj_rx_slots) =
			    sk_netif_compat_usb_eth_rx_ring_sz;
			*(adj->adj_tx_slots) =
			    sk_netif_compat_usb_eth_tx_ring_sz;
		}
#endif /* XNU_TARGET_OS_OSX */
	}
}
549 
/*
 * Provider-specific parameter adjustment, invoked via
 * nxprov_params_adjust().  Compat providers get memory-footprint
 * reductions; native providers get an extra host ring pair and a
 * netif nexus advisory region.  Returns EINVAL if the native buffer
 * size cannot hold the maximum protocol headers.
 */
static int
nx_netif_prov_params_adjust(const struct kern_nexus_domain_provider *nxdom_prov,
    const struct nxprov_params *nxp, struct nxprov_adjusted_params *adj)
{
	/*
	 * for netif compat adjust the following parameters for memory
	 * optimization:
	 * - change the size of buffer object to 128 bytes.
	 * - don't allocate rx ring for host port and tx ring for dev port.
	 * - for cellular interfaces other than pdp_ip0 reduce the ring size.
	 *   Assumption here is that pdp_ip0 is always used as the data
	 *   interface.
	 * - reduce the ring size for AWDL interface.
	 * - reduce the ring size for USB ethernet interface.
	 */
	if (strcmp(nxdom_prov->nxdom_prov_name,
	    NEXUS_PROVIDER_NET_IF_COMPAT) == 0) {
		/*
		 * Leave the parameters default if userspace access may be
		 * needed. We can't use skywalk_direct_allowed() here because
		 * the drivers have not attached yet.
		 */
		if (skywalk_netif_direct_enabled()) {
			goto done;
		}

		*(adj->adj_buf_size) = NETIF_COMPAT_BUF_SIZE;
		*(adj->adj_tx_rings) = 1;
		/* look up the ifnet to apply per-interface ring sizing */
		if (IF_INDEX_IN_RANGE(nxp->nxp_ifindex)) {
			ifnet_t ifp;
			ifnet_head_lock_shared();
			ifp = ifindex2ifnet[nxp->nxp_ifindex];
			ifnet_head_done();
			VERIFY(ifp != NULL);
			nx_netif_compat_adjust_ring_size(adj, ifp);
		}
		if (adj->adj_buf_srp->srp_r_seg_size == 0) {
			adj->adj_buf_srp->srp_r_seg_size =
			    skmem_usr_buf_seg_size;
		}
	} else { /* netif native */
		if (nxp->nxp_flags & NXPF_NETIF_LLINK) {
			*(adj->adj_tx_slots) = NX_NETIF_MINSLOTS;
			*(adj->adj_rx_slots) = NX_NETIF_MINSLOTS;
		}
		/*
		 * Add another extra ring for host port. Note that if the
		 * nexus isn't configured to use the same pbufpool for all of
		 * its ports, we'd end up allocating extra here.
		 * Not a big deal since that case isn't the default.
		 */
		*(adj->adj_tx_rings) += 1;
		*(adj->adj_rx_rings) += 1;

		if ((*(adj->adj_buf_size) < PKT_MAX_PROTO_HEADER_SIZE)) {
			SK_ERR("buf size too small, min (%d)",
			    PKT_MAX_PROTO_HEADER_SIZE);
			return EINVAL;
		}
		_CASSERT(sizeof(struct __kern_netif_intf_advisory) ==
		    NX_INTF_ADV_SIZE);
		*(adj->adj_nexusadv_size) = sizeof(struct netif_nexus_advisory);
	}
done:
	/* enable magazines layer for metadata */
	*(adj->adj_md_magazines) = TRUE;
	return 0;
}
618 
619 int
nx_netif_prov_params(struct kern_nexus_domain_provider * nxdom_prov,const uint32_t req,const struct nxprov_params * nxp0,struct nxprov_params * nxp,struct skmem_region_params srp[SKMEM_REGIONS])620 nx_netif_prov_params(struct kern_nexus_domain_provider *nxdom_prov,
621     const uint32_t req, const struct nxprov_params *nxp0,
622     struct nxprov_params *nxp, struct skmem_region_params srp[SKMEM_REGIONS])
623 {
624 	struct nxdom *nxdom = nxdom_prov->nxdom_prov_dom;
625 
626 	return nxprov_params_adjust(nxdom_prov, req, nxp0, nxp, srp,
627 	           nxdom, nxdom, nxdom, nx_netif_prov_params_adjust);
628 }
629 
/*
 * Create the skmem arena backing this adapter's rings and buffers.
 * Returns 0 on success with na->na_arena set, or a nonzero errno
 * from the arena constructor.
 */
int
nx_netif_prov_mem_new(struct kern_nexus_domain_provider *nxdom_prov,
    struct kern_nexus *nx, struct nexus_adapter *na)
{
#pragma unused(nxdom_prov)
	int err = 0;
	boolean_t pp_truncated_buf = FALSE;
	boolean_t allow_direct;
	boolean_t kernel_only;

	SK_DF(SK_VERB_NETIF,
	    "nx 0x%llx (\"%s\":\"%s\") na \"%s\" (0x%llx)", SK_KVA(nx),
	    NX_DOM(nx)->nxdom_name, nxdom_prov->nxdom_prov_name, na->na_name,
	    SK_KVA(na));

	ASSERT(na->na_arena == NULL);
	/* compat adapters use truncated buffer objects (see params_adjust) */
	if ((na->na_type == NA_NETIF_COMPAT_DEV) ||
	    (na->na_type == NA_NETIF_COMPAT_HOST)) {
		pp_truncated_buf = TRUE;
	}
	/*
	 * We do this check to determine whether to create the extra
	 * regions needed for userspace access. This is per interface.
	 * NX_USER_CHANNEL_PROV() is systemwide so it can't be used.
	 */
	allow_direct = skywalk_netif_direct_allowed(na->na_name);

	/*
	 * Both ports (host and dev) share the same packet buffer pool;
	 * the first time a port gets opened will allocate the pp that
	 * gets stored in the nexus, which will then be used by any
	 * subsequent opens.
	 */
	kernel_only = !allow_direct || !NX_USER_CHANNEL_PROV(nx);
	na->na_arena = skmem_arena_create_for_nexus(na,
	    NX_PROV(nx)->nxprov_region_params, &nx->nx_tx_pp,
	    &nx->nx_rx_pp, pp_truncated_buf, kernel_only, &nx->nx_adv, &err);
	ASSERT(na->na_arena != NULL || err != 0);
	/* any pool created must match the domain's metadata type/subtype */
	ASSERT(nx->nx_tx_pp == NULL || (nx->nx_tx_pp->pp_md_type ==
	    NX_DOM(nx)->nxdom_md_type && nx->nx_tx_pp->pp_md_subtype ==
	    NX_DOM(nx)->nxdom_md_subtype));

	return err;
}
674 
/*
 * NXCFG_CMD_GET_LLINK_INFO handler: copy a snapshot of this netif's
 * logical-link and queue-set topology out to the caller's buffer.
 * Returns ENOTSUP if llink mode is off or on version mismatch, ENXIO
 * with no llinks, ENOBUFS when the user buffer is too small.
 */
SK_NO_INLINE_ATTRIBUTE
static int
nx_netif_get_llink_info(struct sockopt *sopt, struct kern_nexus *nx)
{
	struct nx_llink_info_req *nlir = NULL;
	struct nx_netif *nif;
	struct netif_llink *llink;
	uint16_t llink_cnt;
	size_t len, user_len;
	int err, i;

	nif = NX_NETIF_PRIVATE(nx);
	if (!NETIF_LLINK_ENABLED(nif)) {
		SK_ERR("llink mode not enabled");
		return ENOTSUP;
	}
	/* hold the llink lock shared across the snapshot */
	lck_rw_lock_shared(&nif->nif_llink_lock);
	llink_cnt = nif->nif_llink_cnt;
	if (llink_cnt == 0) {
		SK_ERR("zero llink cnt");
		err = ENXIO;
		goto done;
	}
	/* variable-length reply: header plus one entry per llink */
	len = sizeof(*nlir) + (sizeof(struct nx_llink_info) * llink_cnt);
	/* preserve sopt_valsize because it gets overwritten by copyin */
	user_len = sopt->sopt_valsize;
	if (user_len < len) {
		SK_ERR("buffer too small");
		err = ENOBUFS;
		goto done;
	}
	nlir = sk_alloc_data(len, Z_WAITOK, skmem_tag_netif_llink_info);
	if (nlir == NULL) {
		SK_ERR("failed to allocate nlir");
		err = ENOMEM;
		goto done;
	}
	/* only the fixed-size header is copied in (for the version check) */
	err = sooptcopyin(sopt, nlir, sizeof(*nlir), sizeof(*nlir));
	if (err != 0) {
		SK_ERR("copyin failed: %d", err);
		goto done;
	}
	if (nlir->nlir_version != NETIF_LLINK_INFO_VERSION) {
		SK_ERR("nlir version mismatch: %d != %d",
		    nlir->nlir_version, NETIF_LLINK_INFO_VERSION);
		err = ENOTSUP;
		goto done;
	}
	nlir->nlir_llink_cnt = llink_cnt;
	i = 0;
	/* snapshot each logical link and its queue sets */
	STAILQ_FOREACH(llink, &nif->nif_llink_list, nll_link) {
		struct nx_llink_info *nli;
		struct netif_qset *qset;
		uint16_t qset_cnt;
		int j;

		nli = &nlir->nlir_llink[i];
		nli->nli_link_id = llink->nll_link_id;
		nli->nli_link_id_internal = llink->nll_link_id_internal;
		nli->nli_state = llink->nll_state;
		nli->nli_flags = llink->nll_flags;

		qset_cnt = llink->nll_qset_cnt;
		ASSERT(qset_cnt <= NETIF_LLINK_MAX_QSETS);
		nli->nli_qset_cnt = qset_cnt;

		j = 0;
		SLIST_FOREACH(qset, &llink->nll_qset_list, nqs_list) {
			struct nx_qset_info *nqi;

			nqi = &nli->nli_qset[j];
			nqi->nqi_id = qset->nqs_id;
			nqi->nqi_flags = qset->nqs_flags;
			nqi->nqi_num_rx_queues = qset->nqs_num_rx_queues;
			nqi->nqi_num_tx_queues = qset->nqs_num_tx_queues;
			j++;
		}
		ASSERT(j == qset_cnt);
		i++;
	}
	ASSERT(i == llink_cnt);
	/* restore the original length before copying the full reply out */
	sopt->sopt_valsize = user_len;
	err = sooptcopyout(sopt, nlir, len);
	if (err != 0) {
		SK_ERR("sooptcopyout failed: %d", err);
	}
done:
	lck_rw_unlock_shared(&nif->nif_llink_lock);
	if (nlir != NULL) {
		sk_free_data(nlir, len);
	}
	return err;
}
768 
/*
 * Handle a configuration request issued against a netif nexus
 * (attach/detach an interface, add/delete a flow, query llink info).
 * Requires the Skywalk lock and the netif registration entitlement;
 * request payloads are copied in/out via the sockopt helpers.
 */
int
nx_netif_prov_config(struct kern_nexus_domain_provider *nxdom_prov,
    struct kern_nexus *nx, struct nx_cfg_req *ncr, int sopt_dir,
    struct proc *p, kauth_cred_t cred)
{
#pragma unused(nxdom_prov)
	struct sockopt sopt;
	int err = 0;

	SK_LOCK_ASSERT_HELD();

	/* proceed only if the client possesses netif entitlement */
	if ((err = skywalk_priv_check_cred(p, cred,
	    PRIV_SKYWALK_REGISTER_NET_IF)) != 0) {
		goto done;
	}

	if (ncr->nc_req == USER_ADDR_NULL) {
		err = EINVAL;
		goto done;
	}

	/* to make life easier for handling copies */
	bzero(&sopt, sizeof(sopt));
	sopt.sopt_dir = sopt_dir;
	sopt.sopt_val = ncr->nc_req;
	sopt.sopt_valsize = ncr->nc_req_len;
	sopt.sopt_p = p;

	switch (ncr->nc_cmd) {
	case NXCFG_CMD_ATTACH:
	case NXCFG_CMD_DETACH: {
		struct nx_spec_req nsr;

		bzero(&nsr, sizeof(nsr));
		err = sooptcopyin(&sopt, &nsr, sizeof(nsr), sizeof(nsr));
		if (err != 0) {
			goto done;
		}

		/*
		 * Null-terminate in case this has an interface name;
		 * the union is already large enough for uuid_t.
		 */
		nsr.nsr_name[sizeof(nsr.nsr_name) - 1] = '\0';
		/* non-kernel requesters only get the user-settable flags */
		if (p != kernproc) {
			nsr.nsr_flags &= NXSPECREQ_MASK;
		}

		err = nx_netif_ctl(nx, ncr->nc_cmd, &nsr, p);
		if (err != 0) {
			goto done;
		}

		/* XXX: [email protected] -- can this copyout fail? */
		(void) sooptcopyout(&sopt, &nsr, sizeof(nsr));
		break;
	}
	case NXCFG_CMD_FLOW_ADD:
	case NXCFG_CMD_FLOW_DEL: {
		/* kernel-only and common field layouts must end together */
		_CASSERT(offsetof(struct nx_flow_req, _nfr_kernel_field_end) ==
		    offsetof(struct nx_flow_req, _nfr_common_field_end));
		struct nx_flow_req nfr;

		bzero(&nfr, sizeof(nfr));
		err = sooptcopyin(&sopt, &nfr, sizeof(nfr), sizeof(nfr));
		if (err != 0) {
			goto done;
		}

		err = nx_netif_ctl(nx, ncr->nc_cmd, &nfr, p);
		if (err != 0) {
			goto done;
		}

		/* XXX: [email protected] -- can this copyout fail? */
		(void) sooptcopyout(&sopt, &nfr, sizeof(nfr));
		break;
	}
	case NXCFG_CMD_GET_LLINK_INFO: {
		err = nx_netif_get_llink_info(&sopt, nx);
		break;
	}
	default:
		err = EINVAL;
		goto done;
	}
done:
	SK_DF(err ? SK_VERB_ERROR : SK_VERB_NETIF,
	    "nexus 0x%llx (%s) cmd %d err %d", SK_KVA(nx),
	    NX_DOM_PROV(nx)->nxdom_prov_name, ncr->nc_cmd, err);
	return err;
}
862 
/*
 * Domain provider finalizer for netif.  There is no per-provider state
 * to release here; only a debug trace of the provider being destroyed
 * is emitted (the pragma covers non-SK_LOG builds where SK_D expands
 * to nothing).
 */
void
nx_netif_prov_fini(struct kern_nexus_domain_provider *nxdom_prov)
{
#pragma unused(nxdom_prov)
	SK_D("destroying %s", nxdom_prov->nxdom_prov_name);
}
869 
870 int
nx_netif_prov_nx_ctor(struct kern_nexus * nx)871 nx_netif_prov_nx_ctor(struct kern_nexus *nx)
872 {
873 	struct nx_netif *n;
874 	char name[64];
875 	int error;
876 
877 	SK_LOCK_ASSERT_HELD();
878 	ASSERT(nx->nx_arg == NULL);
879 
880 	SK_D("nexus 0x%llx (%s)", SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name);
881 
882 	nx->nx_arg = nx_netif_alloc(Z_WAITOK);
883 	n = NX_NETIF_PRIVATE(nx);
884 	if (NX_USER_CHANNEL_PROV(nx) &&
885 	    NX_PROV(nx)->nxprov_params->nxp_nexusadv_size != 0) {
886 		(void) snprintf(name, sizeof(name), "netif_%llu", nx->nx_id);
887 		error = nx_advisory_alloc(nx, name,
888 		    &NX_PROV(nx)->nxprov_region_params[SKMEM_REGION_NEXUSADV],
889 		    NEXUS_ADVISORY_TYPE_NETIF);
890 		if (error != 0) {
891 			nx_netif_free(n);
892 			return error;
893 		}
894 	}
895 	n->nif_nx = nx;
896 	SK_D("create new netif 0x%llx for nexus 0x%llx",
897 	    SK_KVA(NX_NETIF_PRIVATE(nx)), SK_KVA(nx));
898 	return 0;
899 }
900 
/*
 * Nexus destructor for the netif domain.  Releases the advisory
 * region, detaches the interface if one is still attached, frees any
 * leftover dev/host port bindings, and finally releases the nx_netif
 * state itself, clearing nx->nx_arg.
 */
void
nx_netif_prov_nx_dtor(struct kern_nexus *nx)
{
	struct nx_netif *n = NX_NETIF_PRIVATE(nx);

	SK_LOCK_ASSERT_HELD();

	SK_D("nexus 0x%llx (%s) netif 0x%llx", SK_KVA(nx),
	    NX_DOM_PROV(nx)->nxdom_prov_name, SK_KVA(n));

	/*
	 * XXX
	 * detach should be done separately to be symmetrical with attach.
	 */
	nx_advisory_free(nx);
	/* a non-NULL dev adapter means the interface is still attached */
	if (nx_port_get_na(nx, NEXUS_PORT_NET_IF_DEV) != NULL) {
		/* we're called by nx_detach(), so this cannot fail */
		int err = nx_netif_ctl_detach(nx, NULL);
		VERIFY(err == 0);
	}
	/* drop any leftover port binding state */
	if (n->nif_dev_nxb != NULL) {
		nxb_free(n->nif_dev_nxb);
		n->nif_dev_nxb = NULL;
	}
	if (n->nif_host_nxb != NULL) {
		nxb_free(n->nif_host_nxb);
		n->nif_host_nxb = NULL;
	}
	SK_DF(SK_VERB_NETIF, "marking netif 0x%llx as free", SK_KVA(n));
	nx_netif_free(n);
	nx->nx_arg = NULL;
}
933 
934 int
nx_netif_prov_nx_mem_info(struct kern_nexus * nx,struct kern_pbufpool ** tpp,struct kern_pbufpool ** rpp)935 nx_netif_prov_nx_mem_info(struct kern_nexus *nx, struct kern_pbufpool **tpp,
936     struct kern_pbufpool **rpp)
937 {
938 	ASSERT(nx->nx_tx_pp != NULL);
939 	ASSERT(nx->nx_rx_pp != NULL);
940 
941 	if (tpp != NULL) {
942 		*tpp = nx->nx_tx_pp;
943 	}
944 	if (rpp != NULL) {
945 		*rpp = nx->nx_rx_pp;
946 	}
947 
948 	return 0;
949 }
950 
951 static size_t
__netif_mib_get_stats(struct kern_nexus * nx,void * out,size_t len)952 __netif_mib_get_stats(struct kern_nexus *nx, void *out, size_t len)
953 {
954 	struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
955 	struct ifnet *ifp = nif->nif_ifp;
956 	struct sk_stats_net_if *sns = out;
957 	size_t actual_space = sizeof(struct sk_stats_net_if);
958 
959 	if (out != NULL && actual_space <= len) {
960 		uuid_copy(sns->sns_nx_uuid, nx->nx_uuid);
961 		if (ifp != NULL) {
962 			(void) strlcpy(sns->sns_if_name, if_name(ifp), IFNAMSIZ);
963 		}
964 		sns->sns_nifs = nif->nif_stats;
965 	}
966 
967 	return actual_space;
968 }
969 
/*
 * MIB helper: report per-logical-link info for this nexus.  Always
 * returns the number of bytes required; the snapshot is copied into
 * 'out' only when a buffer of sufficient size (len) was supplied.
 * Returns 0 when logical links are not enabled on this netif.
 */
static size_t
__netif_mib_get_llinks(struct kern_nexus *nx, void *out, size_t len)
{
	struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
	struct nx_llink_info *nli_list = out;
	size_t actual_space = 0;
	if (NETIF_LLINK_ENABLED(nif)) {
		/* hold the llink lock so the count and list stay consistent */
		lck_rw_lock_shared(&nif->nif_llink_lock);
		actual_space += nif->nif_llink_cnt * sizeof(struct nx_llink_info);

		if (out != NULL && actual_space <= len) {
			struct netif_llink *llink;
			int i = 0;
			STAILQ_FOREACH(llink, &nif->nif_llink_list, nll_link) {
				struct nx_llink_info *nli;
				struct netif_qset *qset;
				uint16_t qset_cnt;
				int j;

				/* snapshot this logical link's identity/state */
				nli = &nli_list[i];
				uuid_copy(nli->nli_netif_uuid, nx->nx_uuid);
				nli->nli_link_id = llink->nll_link_id;
				nli->nli_link_id_internal = llink->nll_link_id_internal;
				nli->nli_state = llink->nll_state;
				nli->nli_flags = llink->nll_flags;

				qset_cnt = llink->nll_qset_cnt;
				ASSERT(qset_cnt <= NETIF_LLINK_MAX_QSETS);
				nli->nli_qset_cnt = qset_cnt;

				/* ... and each of its queue sets */
				j = 0;
				SLIST_FOREACH(qset, &llink->nll_qset_list, nqs_list) {
					struct nx_qset_info *nqi;

					nqi = &nli->nli_qset[j];
					nqi->nqi_id = qset->nqs_id;
					nqi->nqi_flags = qset->nqs_flags;
					nqi->nqi_num_rx_queues = qset->nqs_num_rx_queues;
					nqi->nqi_num_tx_queues = qset->nqs_num_tx_queues;
					j++;
				}
				ASSERT(j == qset_cnt);
				i++;
			}
			ASSERT(i == nif->nif_llink_cnt);
		}
		lck_rw_unlock_shared(&nif->nif_llink_lock);
	}

	return actual_space;
}
1021 
1022 size_t
nx_netif_prov_nx_mib_get(struct kern_nexus * nx,struct nexus_mib_filter * filter,void * out,size_t len,struct proc * p)1023 nx_netif_prov_nx_mib_get(struct kern_nexus *nx, struct nexus_mib_filter *filter,
1024     void *out, size_t len, struct proc *p)
1025 {
1026 #pragma unused(p)
1027 	size_t ret;
1028 
1029 	if ((filter->nmf_bitmap & NXMIB_FILTER_NX_UUID) &&
1030 	    (uuid_compare(filter->nmf_nx_uuid, nx->nx_uuid)) != 0) {
1031 		return 0;
1032 	}
1033 
1034 	switch (filter->nmf_type) {
1035 	case NXMIB_NETIF_STATS:
1036 		ret = __netif_mib_get_stats(nx, out, len);
1037 		break;
1038 	case NXMIB_LLINK_LIST:
1039 		ret = __netif_mib_get_llinks(nx, out, len);
1040 		break;
1041 	default:
1042 		ret = 0;
1043 		break;
1044 	}
1045 	return ret;
1046 }
1047 
/*
 * Bind a client to a nexus port.  With *nx_port != NEXUS_PORT_ANY the
 * bind is attempted on that specific port; otherwise a free port in
 * [NEXUS_PORT_NET_IF_CLIENT, NXDOM_MAX(ports)) is searched for and,
 * on success, returned through *nx_port.  The netif write lock guards
 * the port table for the find+bind sequence.
 */
static int
nx_netif_dom_bind_port(struct kern_nexus *nx, nexus_port_t *nx_port,
    struct nxbind *nxb, void *info)
{
	struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
	nexus_port_t first, last, port;
	int error;

	ASSERT(nx_port != NULL);
	ASSERT(nxb != NULL);

	port = *nx_port;

	/*
	 * If port is:
	 * != NEXUS_PORT_ANY: attempt to bind to the specified port
	 * == NEXUS_PORT_ANY: find an available port, bind to it, and
	 *                    return back the assigned port.
	 */
	first = NEXUS_PORT_NET_IF_CLIENT;
	last = NXDOM_MAX(NX_DOM(nx), ports);
	ASSERT(first <= last);

	NETIF_WLOCK(nif);

	if (__improbable(first == last)) {
		/* no client ports configured for this domain */
		error = ENOMEM;
	} else if (port != NEXUS_PORT_ANY) {
		error = nx_port_bind_info(nx, port, nxb, info);
		SK_DF(SK_VERB_NETIF, "port %d, bind err %d", port, error);
	} else {
		error = nx_port_find(nx, first, last - 1, &port);
		ASSERT(error != 0 || (port >= first && port < last));
		if (error == 0) {
			error = nx_port_bind_info(nx, port, nxb, info);
			SK_DF(SK_VERB_NETIF, "found port %d, bind err %d",
			    port, error);
		}
	}
	NETIF_WUNLOCK(nif);

	/* report the chosen port back to the caller only on success */
	ASSERT(*nx_port == NEXUS_PORT_ANY || *nx_port == port);
	if (error == 0) {
		*nx_port = port;
	}

	SK_DF(error ? SK_VERB_ERROR : SK_VERB_NETIF,
	    "+++ netif 0x%llx nx_port %d, total %u active %u (err %d)",
	    SK_KVA(nif), (int)*nx_port, NX_NETIF_MAXPORTS,
	    nx->nx_active_ports, error);

	return error;
}
1101 
1102 static int
nx_netif_dom_unbind_port(struct kern_nexus * nx,nexus_port_t nx_port)1103 nx_netif_dom_unbind_port(struct kern_nexus *nx, nexus_port_t nx_port)
1104 {
1105 	struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
1106 	int error = 0;
1107 
1108 	ASSERT(nx_port != NEXUS_PORT_ANY);
1109 
1110 	NETIF_WLOCK(nif);
1111 	error = nx_port_unbind(nx, nx_port);
1112 	NETIF_WUNLOCK(nif);
1113 
1114 	return error;
1115 }
1116 
/*
 * Open a channel to this netif nexus.  Validates the requested port
 * against the channel mode, tags the channel flags appropriately
 * (host/ext-skip), then hands off to the kernel-special or regular
 * adapter connect path.  On success, a busy assertion is taken on the
 * kernel slot descriptor region; it is dropped again in
 * nx_netif_dom_disconnect().
 */
static int
nx_netif_dom_connect(struct kern_nexus_domain_provider *nxdom_prov,
    struct kern_nexus *nx, struct kern_channel *ch, struct chreq *chr,
    struct kern_channel *ch0, struct nxbind *nxb, struct proc *p)
{
#pragma unused(nxdom_prov)
	int err = 0;

	SK_LOCK_ASSERT_HELD();

	ASSERT(NX_DOM_PROV(nx) == nxdom_prov);
	ASSERT(nx->nx_prov->nxprov_params->nxp_type ==
	    nxdom_prov->nxdom_prov_dom->nxdom_type &&
	    nx->nx_prov->nxprov_params->nxp_type == NEXUS_TYPE_NET_IF);
	ASSERT(!(ch->ch_flags & CHANF_HOST));

	switch (chr->cr_port) {
	case NEXUS_PORT_NET_IF_DEV:
		/* host mode makes no sense on the device port */
		if (chr->cr_mode & CHMODE_HOST) {
			err = EINVAL;
			goto done;
		}
		break;

	case NEXUS_PORT_NET_IF_HOST:
		if (!(chr->cr_mode & CHMODE_HOST)) {
			/* kernel channels must ask for host mode explicitly */
			if (ch->ch_flags & CHANF_KERNEL) {
				err = EINVAL;
				goto done;
			}
			chr->cr_mode |= CHMODE_HOST;
		}
		/*
		 * This channel is exclusively opened to the host
		 * rings; don't notify the external provider.
		 */
		atomic_bitset_32(&ch->ch_flags, CHANF_HOST | CHANF_EXT_SKIP);
		break;

	default:
		/*
		 * This channel is shared between netif and user process;
		 * don't notify the external provider.
		 */
		atomic_bitset_32(&ch->ch_flags, CHANF_EXT_SKIP);
		break;
	}

	chr->cr_ring_set = RING_SET_DEFAULT;
	chr->cr_real_endpoint = chr->cr_endpoint = CH_ENDPOINT_NET_IF;
	(void) snprintf(chr->cr_name, sizeof(chr->cr_name), "netif:%llu:%.*s",
	    nx->nx_id, (int)nx->nx_prov->nxprov_params->nxp_namelen,
	    nx->nx_prov->nxprov_params->nxp_name);

	/* kernel channels take the special connect path */
	if (ch->ch_flags & CHANF_KERNEL) {
		err = na_connect_spec(nx, ch, chr, p);
	} else {
		err = na_connect(nx, ch, chr, ch0, nxb, p);
	}

	if (err == 0) {
		/*
		 * Mark the kernel slot descriptor region as busy; this
		 * prevents it from being torn-down at channel defunct
		 * time, as the (external) nexus owner may be calling
		 * KPIs that require accessing the slots.
		 */
		skmem_arena_nexus_sd_set_noidle(
			skmem_arena_nexus(ch->ch_na->na_arena), 1);
	}

done:
	return err;
}
1191 
/*
 * Close a channel previously opened via nx_netif_dom_connect().
 * Drops the slot-descriptor busy assertion taken at connect time,
 * then disconnects through the kernel-special or regular path.
 */
static void
nx_netif_dom_disconnect(struct kern_nexus_domain_provider *nxdom_prov,
    struct kern_nexus *nx, struct kern_channel *ch)
{
#pragma unused(nxdom_prov)
	SK_LOCK_ASSERT_HELD();

	SK_D("channel 0x%llx -!- nexus 0x%llx (%s:\"%s\":%u:%d)", SK_KVA(ch),
	    SK_KVA(nx), nxdom_prov->nxdom_prov_name, ch->ch_na->na_name,
	    ch->ch_info->cinfo_nx_port, (int)ch->ch_info->cinfo_ch_ring_id);

	/*
	 * Release busy assertion held earlier in nx_netif_dom_connect();
	 * this allows for the final arena teardown to succeed.
	 */
	skmem_arena_nexus_sd_set_noidle(
		skmem_arena_nexus(ch->ch_na->na_arena), -1);

	/* mirror the connect-time path selection */
	if (ch->ch_flags & CHANF_KERNEL) {
		na_disconnect_spec(nx, ch);
	} else {
		na_disconnect(nx, ch);
	}
}
1216 
/*
 * Defunct a user channel's rings.  Called with the channel lock held;
 * kernel channels never go through this path.
 */
static void
nx_netif_dom_defunct(struct kern_nexus_domain_provider *nxdom_prov,
    struct kern_nexus *nx, struct kern_channel *ch, struct proc *p)
{
#pragma unused(nxdom_prov, nx)
	LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
	ASSERT(!(ch->ch_flags & CHANF_KERNEL));
	/* only netif dev/host adapters (native or compat) are expected */
	ASSERT(ch->ch_na->na_type == NA_NETIF_DEV ||
	    ch->ch_na->na_type == NA_NETIF_HOST ||
	    ch->ch_na->na_type == NA_NETIF_COMPAT_DEV ||
	    ch->ch_na->na_type == NA_NETIF_COMPAT_HOST);

	na_ch_rings_defunct(ch, p);
}
1231 
/*
 * Second phase of channel defunct.  May be entered with or without
 * the SK lock; the 'locked' flag tells us which, and the lock state
 * is asserted on entry and restored symmetrically on exit.  The
 * actual teardown is delegated to na_defunct().
 */
static void
nx_netif_dom_defunct_finalize(struct kern_nexus_domain_provider *nxdom_prov,
    struct kern_nexus *nx, struct kern_channel *ch, boolean_t locked)
{
#pragma unused(nxdom_prov)
	if (!locked) {
		SK_LOCK_ASSERT_NOTHELD();
		SK_LOCK();
		LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_NOTOWNED);
	} else {
		SK_LOCK_ASSERT_HELD();
		LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
	}

	/* only netif dev/host adapters (native or compat) are expected */
	ASSERT(ch->ch_na->na_type == NA_NETIF_DEV ||
	    ch->ch_na->na_type == NA_NETIF_HOST ||
	    ch->ch_na->na_type == NA_NETIF_COMPAT_DEV ||
	    ch->ch_na->na_type == NA_NETIF_COMPAT_HOST);

	na_defunct(nx, ch, ch->ch_na, locked);

	SK_D("%s(%d): ch 0x%llx -/- nx 0x%llx (%s:\"%s\":%u:%d)",
	    ch->ch_name, ch->ch_pid, SK_KVA(ch), SK_KVA(nx),
	    nxdom_prov->nxdom_prov_name, ch->ch_na->na_name,
	    ch->ch_info->cinfo_nx_port, (int)ch->ch_info->cinfo_ch_ring_id);

	/* restore the caller's lock state */
	if (!locked) {
		LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_NOTOWNED);
		SK_UNLOCK();
	} else {
		LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
		SK_LOCK_ASSERT_HELD();
	}
}
1266 
/*
 * Allocate a zero-filled nexus_netif_adapter from its dedicated zone.
 * The embedded nexus_adapter (nifna_up) must be the first member so
 * that pointers to the two types are interchangeable.
 */
struct nexus_netif_adapter *
na_netif_alloc(zalloc_flags_t how)
{
	_CASSERT(offsetof(struct nexus_netif_adapter, nifna_up) == 0);

	return zalloc_flags(na_netif_zone, how | Z_ZERO);
}
1274 
/*
 * Return a fully-quiesced netif adapter to its zone.  The adapter
 * must have no references left and its TX/RX mitigation state must
 * already be gone; the memory is scrubbed before being freed to help
 * catch use-after-free via stale pointers.
 */
void
na_netif_free(struct nexus_adapter *na)
{
	struct nexus_netif_adapter *nifna = (struct nexus_netif_adapter *)na;

	SK_LOCK_ASSERT_HELD();
	SK_DF(SK_VERB_MEM, "nifna 0x%llx FREE", SK_KVA(nifna));

	ASSERT(na->na_refcount == 0);
	ASSERT(nifna->nifna_tx_mit == NULL);
	ASSERT(nifna->nifna_rx_mit == NULL);
	bzero(nifna, sizeof(*nifna));

	zfree(na_netif_zone, nifna);
}
1290 
/*
 * Process NXCFG_CMD_ATTACH: bind an ifnet to this netif nexus.
 *
 * The request identifies the interface either by name (resolved via
 * ifunit_ref(), which takes an I/O refcnt we drop on the way out) or,
 * for kernel callers only, by a direct ifnet pointer; a UUID is never
 * accepted.  Native drivers must attach to the native provider and
 * non-native drivers to the compat provider, and the interface must
 * not already be attached anywhere.  On success the generated adapter
 * UUID is returned through nsr->nsr_if_uuid.
 */
SK_NO_INLINE_ATTRIBUTE
static int
nx_netif_ctl_attach(struct kern_nexus *nx, struct nx_spec_req *nsr,
    struct proc *p)
{
	struct nx_netif *n = NX_NETIF_PRIVATE(nx);
	struct ifnet *ifp = NULL;
	boolean_t compat;
	int err = 0;

	SK_LOCK_ASSERT_HELD();

	ASSERT(NX_DOM(nx)->nxdom_type == NEXUS_TYPE_NET_IF);
	/* which of the two domain providers are we? */
	compat = (strcmp(NX_DOM_PROV(nx)->nxdom_prov_name,
	    NEXUS_PROVIDER_NET_IF_COMPAT) == 0);

	uuid_clear(nsr->nsr_if_uuid);
	/*
	 * The netif accepts either an interface name or a pointer to
	 * an ifnet, but never a UUID.
	 */
	if (nsr->nsr_flags & NXSPECREQ_UUID) {
		err = EINVAL;
		goto done;
	}
	if (nsr->nsr_flags & NXSPECREQ_IFP) {
		/* raw ifnet pointers are only honored for kernel callers */
		if (p != kernproc || (ifp = nsr->nsr_ifp) == NULL) {
			err = EINVAL;
			goto done;
		}
	} else if ((ifp = ifunit_ref(nsr->nsr_name)) == NULL) {
		err = ENXIO;
		goto done;
	}

	if ((compat && SKYWALK_NATIVE(ifp)) ||
	    (!compat && !SKYWALK_NATIVE(ifp))) {
		/* native driver for netif; non-native for netif_compat  */
		err = ENODEV;
	} else if (ifp->if_na != NULL || !uuid_is_null(n->nif_uuid)) {
		/* interface or nexus is already attached */
		err = EBUSY;
	} else {
		ASSERT(uuid_is_null(n->nif_uuid));
		/*
		 * Upon success, callee will hold its own ifnet iorefcnt
		 * as well as a retain count on the nexus adapter.
		 */
		if (compat) {
			err = nx_netif_compat_attach(nx, ifp);
		} else {
			err = nx_netif_attach(nx, ifp);
		}

		if (err == 0) {
			/* return the adapter UUID */
			uuid_generate_random(n->nif_uuid);
			uuid_copy(nsr->nsr_if_uuid, n->nif_uuid);
#if (DEVELOPMENT || DEBUG)
			skoid_create(&n->nif_skoid,
			    SKOID_SNODE(_kern_skywalk_netif), if_name(ifp),
			    CTLFLAG_RW);
#endif /* !DEVELOPMENT && !DEBUG */
		}
	}
done:
	/* drop I/O refcnt from ifunit_ref() */
	if (ifp != NULL && !(nsr->nsr_flags & NXSPECREQ_IFP)) {
		ifnet_decr_iorefcnt(ifp);
	}

#if SK_LOG
	uuid_string_t uuidstr, ifuuidstr;
	const char *nustr;
	if (nsr->nsr_flags & NXSPECREQ_UUID) {
		nustr = sk_uuid_unparse(nsr->nsr_uuid, uuidstr);
	} else if (nsr->nsr_flags & NXSPECREQ_IFP) {
		(void) snprintf((char *)uuidstr, sizeof(uuidstr), "0x%llx",
		    SK_KVA(nsr->nsr_ifp));
		nustr = uuidstr;
	} else {
		nustr = nsr->nsr_name;
	}
	SK_DF(err ? SK_VERB_ERROR : SK_VERB_NETIF,
	    "nexus 0x%llx (%s) name/uuid \"%s\" if_uuid %s flags 0x%x err %d",
	    SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name, nustr,
	    sk_uuid_unparse(nsr->nsr_if_uuid, ifuuidstr), nsr->nsr_flags, err);
#endif /* SK_LOG */

	return err;
}
1382 
/*
 * Process NXCFG_CMD_DETACH: unbind the ifnet from this netif nexus.
 *
 * A NULL nsr means we're called from the nexus destructor and must
 * detach whatever is attached; otherwise the request's interface UUID
 * must match.  Fails with EBUSY while channels remain open, since the
 * dlil input/output paths may still be running.  On the success path
 * all netif sub-state (agent, capabilities, flows, filters, llinks,
 * flags) is torn down, both adapter references from attach are
 * dropped, and the ifnet linkage is severed — with datamov traffic
 * suspended/drained around the teardown when needed (SK lock is
 * dropped and re-taken for the drain).
 */
SK_NO_INLINE_ATTRIBUTE
static int
nx_netif_ctl_detach(struct kern_nexus *nx, struct nx_spec_req *nsr)
{
	struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
	int err = 0;

	SK_LOCK_ASSERT_HELD();

	/*
	 * nsr is NULL when we're called from the destructor, and it
	 * implies that we'll detach whatever that is attached.
	 */
	if (nsr != NULL && uuid_is_null(nsr->nsr_if_uuid)) {
		err = EINVAL;
	} else if (nsr != NULL && uuid_compare(nsr->nsr_if_uuid,
	    nif->nif_uuid) != 0) {
		err = ESRCH;
	} else if (nx_port_get_na(nx, NEXUS_PORT_NET_IF_DEV) == NULL) {
		/* nx_netif_ctl_attach() not yet done or already detached */
		err = ENXIO;
	} else if (nx->nx_ch_count != 0) {
		/*
		 * There's at least a channel opened; we can't
		 * yank the interface from underneath the nexus
		 * since our dlil input/output handler may be
		 * running now.  Bail out and come back here
		 * again when the nexus detaches.
		 */
		err = EBUSY;
	} else {
		struct ifnet *ifp;
		boolean_t suspended = FALSE;

		ifp = nif->nif_ifp;
		if (ifp == NULL) {
			err = EALREADY;
			goto done;
		}
		/*
		 * For regular kernel-attached interfaces, quiescing is handled by
		 * the ifnet detach thread, which calls dlil_quiesce_and_detach_nexuses().
		 * For interfaces created by skywalk test cases, flowswitch/netif nexuses
		 * are constructed on the fly and can also be torn down on the fly.
		 * dlil_quiesce_and_detach_nexuses() won't help here because any nexus
		 * can be detached while the interface is still attached.
		 */
		if (ifnet_datamov_suspend_if_needed(ifp)) {
			/* drop SK lock while waiting for traffic to drain */
			SK_UNLOCK();
			suspended = TRUE;
			ifnet_datamov_drain(ifp);
			SK_LOCK();
		}
		/* tear down netif sub-state */
		nx_netif_agent_fini(nif);
		nx_netif_capabilities_fini(nif);
		nx_netif_flow_fini(nif);
		nx_netif_filter_fini(nif);
		nx_netif_llink_fini(nif);
		nx_netif_flags_fini(nif);

		uuid_clear(nif->nif_uuid);
		/* nx_netif_{compat_}attach() held both references */
		na_release_locked(nx_port_get_na(nx, NEXUS_PORT_NET_IF_DEV));
		na_release_locked(nx_port_get_na(nx, NEXUS_PORT_NET_IF_HOST));
		nx_port_free(nx, NEXUS_PORT_NET_IF_DEV);
		nx_port_free(nx, NEXUS_PORT_NET_IF_HOST);

		/* sever the ifnet <-> netif linkage */
		ifp->if_na_ops = NULL;
		ifp->if_na = NULL;
		nif->nif_ifp = NULL;
		nif->nif_netif_nxadv = NULL;
		SKYWALK_CLEAR_CAPABLE(ifp);
		if (suspended) {
			ifnet_datamov_resume(ifp);
		}

#if (DEVELOPMENT || DEBUG)
		skoid_destroy(&nif->nif_skoid);
#endif /* !DEVELOPMENT && !DEBUG */
	}
done:
#if SK_LOG
	if (nsr != NULL) {
		uuid_string_t ifuuidstr;
		SK_DF(err ? SK_VERB_ERROR : SK_VERB_NETIF,
		    "nexus 0x%llx (%s) if_uuid %s flags 0x%x err %d",
		    SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name,
		    sk_uuid_unparse(nsr->nsr_if_uuid, ifuuidstr),
		    nsr->nsr_flags, err);
	} else {
		SK_DF(err ? SK_VERB_ERROR : SK_VERB_NETIF,
		    "nexus 0x%llx (%s) err %d", SK_KVA(nx),
		    NX_DOM_PROV(nx)->nxdom_prov_name, err);
	}
#endif /* SK_LOG */

	return err;
}
1482 
1483 /*
1484  * XXX
1485  * These checks are copied from fsw.c
1486  * There are no tests exercising this code. Do we still need this?
1487  */
1488 SK_NO_INLINE_ATTRIBUTE
1489 static int
nx_netif_ctl_flow_check(struct nx_netif * nif,nxcfg_cmd_t cmd,struct proc * p,struct nx_flow_req * req)1490 nx_netif_ctl_flow_check(struct nx_netif *nif, nxcfg_cmd_t cmd,
1491     struct proc *p, struct nx_flow_req *req)
1492 {
1493 #pragma unused(nif)
1494 	boolean_t need_check;
1495 	int error;
1496 
1497 	if (uuid_is_null(req->nfr_flow_uuid)) {
1498 		return EINVAL;
1499 	}
1500 	req->nfr_flags &= NXFLOWREQF_MASK;
1501 	req->nfr_flowadv_idx = FLOWADV_IDX_NONE;
1502 
1503 	if (cmd == NXCFG_CMD_FLOW_DEL) {
1504 		return 0;
1505 	}
1506 	need_check = FALSE;
1507 	if (req->nfr_epid != -1 && proc_pid(p) != req->nfr_epid) {
1508 		need_check = TRUE;
1509 	} else if (!uuid_is_null(req->nfr_euuid)) {
1510 		uuid_t uuid;
1511 
1512 		/* get the UUID of the issuing process */
1513 		proc_getexecutableuuid(p, uuid, sizeof(uuid));
1514 
1515 		/*
1516 		 * If this is not issued by a process for its own
1517 		 * executable UUID and if the process does not have
1518 		 * the necessary privilege, reject the request.
1519 		 * The logic is similar to so_set_effective_uuid().
1520 		 */
1521 		if (uuid_compare(req->nfr_euuid, uuid) != 0) {
1522 			need_check = TRUE;
1523 		}
1524 	}
1525 	if (need_check) {
1526 		kauth_cred_t cred = kauth_cred_proc_ref(p);
1527 		error = priv_check_cred(cred,
1528 		    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0);
1529 		kauth_cred_unref(&cred);
1530 		if (error != 0) {
1531 			return error;
1532 		}
1533 	}
1534 	return 0;
1535 }
1536 
1537 SK_NO_INLINE_ATTRIBUTE
1538 static int
nx_netif_ctl_flow_add(struct nx_netif * nif,struct proc * p,struct nx_flow_req * req)1539 nx_netif_ctl_flow_add(struct nx_netif *nif, struct proc *p,
1540     struct nx_flow_req *req)
1541 {
1542 	int err;
1543 
1544 	ASSERT(p != PROC_NULL);
1545 	err = nx_netif_ctl_flow_check(nif, NXCFG_CMD_FLOW_ADD, p, req);
1546 	if (err != 0) {
1547 		return err;
1548 	}
1549 
1550 	/* init kernel only fields */
1551 	nx_flow_req_internalize(req);
1552 	req->nfr_context = NULL;
1553 	req->nfr_flow_stats = NULL;
1554 	req->nfr_port_reservation = NULL;
1555 	req->nfr_pid = proc_pid(p);
1556 
1557 	err = nx_netif_netagent_flow_add(nif, req);
1558 	nx_flow_req_externalize(req);
1559 	return err;
1560 }
1561 
1562 SK_NO_INLINE_ATTRIBUTE
1563 static int
nx_netif_ctl_flow_del(struct nx_netif * nif,struct proc * p,struct nx_flow_req * req)1564 nx_netif_ctl_flow_del(struct nx_netif *nif, struct proc *p,
1565     struct nx_flow_req *req)
1566 {
1567 	int err;
1568 
1569 	err = nx_netif_ctl_flow_check(nif, NXCFG_CMD_FLOW_DEL, p, req);
1570 	if (err != 0) {
1571 		return err;
1572 	}
1573 
1574 	nx_flow_req_internalize(req);
1575 	req->nfr_pid = proc_pid(p);
1576 
1577 	err = nx_netif_netagent_flow_del(nif, req);
1578 	nx_flow_req_externalize(req);
1579 	return err;
1580 }
1581 
1582 SK_NO_INLINE_ATTRIBUTE
1583 static int
nx_netif_ctl(struct kern_nexus * nx,nxcfg_cmd_t nc_cmd,void * data,struct proc * p)1584 nx_netif_ctl(struct kern_nexus *nx, nxcfg_cmd_t nc_cmd, void *data,
1585     struct proc *p)
1586 {
1587 	struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
1588 	struct nx_spec_req *nsr = data;
1589 	struct nx_flow_req *nfr = data;
1590 	int error = 0;
1591 
1592 	SK_LOCK_ASSERT_HELD();
1593 
1594 	switch (nc_cmd) {
1595 	case NXCFG_CMD_ATTACH:
1596 		error = nx_netif_ctl_attach(nx, nsr, p);
1597 		break;
1598 
1599 	case NXCFG_CMD_DETACH:
1600 		error = nx_netif_ctl_detach(nx, nsr);
1601 		break;
1602 
1603 	case NXCFG_CMD_FLOW_ADD:
1604 		error = nx_netif_ctl_flow_add(nif, p, nfr);
1605 		break;
1606 
1607 	case NXCFG_CMD_FLOW_DEL:
1608 		error = nx_netif_ctl_flow_del(nif, p, nfr);
1609 		break;
1610 
1611 	default:
1612 		SK_ERR("invalid cmd %u", nc_cmd);
1613 		error = EINVAL;
1614 		break;
1615 	}
1616 	return error;
1617 }
1618 
1619 static void
nx_netif_llink_notify(struct kern_nexus * nx,struct netif_llink * llink,uint32_t flags)1620 nx_netif_llink_notify(struct kern_nexus *nx, struct netif_llink *llink,
1621     uint32_t flags)
1622 {
1623 #pragma unused(flags)
1624 	struct netif_qset *qset;
1625 
1626 	SLIST_FOREACH(qset, &llink->nll_qset_list, nqs_list) {
1627 		(void) nx_tx_qset_notify(nx, qset->nqs_ctx);
1628 	}
1629 }
1630 
1631 static void
nx_netif_llink_notify_all(struct kern_nexus * nx,uint32_t flags)1632 nx_netif_llink_notify_all(struct kern_nexus *nx, uint32_t flags)
1633 {
1634 	struct nx_netif *nif;
1635 	struct netif_llink *llink;
1636 
1637 	nif = NX_NETIF_PRIVATE(nx);
1638 
1639 	lck_rw_lock_shared(&nif->nif_llink_lock);
1640 	STAILQ_FOREACH(llink, &nif->nif_llink_list, nll_link) {
1641 		nx_netif_llink_notify(nx, llink, flags);
1642 	}
1643 	lck_rw_unlock_shared(&nif->nif_llink_lock);
1644 }
1645 
/*
 * if_start() callback for native Skywalk interfaces, registered
 * at ifnet_allocate_extended() time, and invoked by the ifnet
 * starter thread.
 */
static void
nx_netif_doorbell_internal(struct ifnet *ifp, uint32_t flags)
{
	/* nothing to do if no netif adapter is attached */
	if (__improbable(ifp->if_na == NULL)) {
		return;
	}

	/*
	 * Do this only if the nexus adapter is active, i.e. a channel
	 * has been opened to it by the module above (flowswitch, etc.)
	 */
	struct nexus_adapter *hwna = &NA(ifp)->nifna_up;
	if (__probable(NA_IS_ACTIVE(hwna))) {
		struct kern_nexus *nx = hwna->na_nx;

		/* update our work timestamp */
		hwna->na_work_ts = _net_uptime;

		if (NX_LLINK_PROV(nx)) {
			/* llink providers get a per-qset notify instead */
			nx_netif_llink_notify_all(nx, flags);
		} else {
			struct __kern_channel_ring *kring;

			/* for doorbell purposes, use TX ring 0 */
			kring = &hwna->na_tx_rings[0];

			/* Issue a synchronous TX doorbell on the netif device ring */
			kring->ckr_na_sync(kring, PROC_NULL,
			    (NA_SYNCF_NETIF_DOORBELL | NA_SYNCF_NETIF_IFSTART));
		}
	} else {
		/* inactive adapter: just account the dropped kick */
		struct netif_stats *nifs =
		    &NX_NETIF_PRIVATE(hwna->na_nx)->nif_stats;
		STATS_INC(nifs, NETIF_STATS_DROP_NA_INACTIVE);
	}
}
1687 
/* ifnet-facing doorbell entry point: a host-initiated transmit kick */
static void
nx_netif_doorbell(struct ifnet *ifp)
{
	nx_netif_doorbell_internal(ifp, NETIF_XMIT_FLAG_HOST);
}
1693 
/*
 * TX sync callback, called from nx_netif_doorbell() where we'd expect to
 * perform synchronous TX doorbell to the driver, by invoking the driver's
 * doorbell callback directly in the same thread context.  It is also called
 * when the layer above performs a TX sync operation, where we might need
 * to do an asynchronous doorbell instead, by simply calling ifnet_start().
 */
static int
nx_netif_na_txsync(struct __kern_channel_ring *kring, struct proc *p,
    uint32_t flags)
{
#pragma unused(p)
	struct ifnet *ifp = KRNA(kring)->na_ifp;
	boolean_t sync_only;
	int ret = 0;

	ASSERT(ifp != NULL);

	SK_DF(SK_VERB_NETIF | SK_VERB_SYNC | SK_VERB_TX,
	    "%s(%d) kr \"%s\" (0x%llx) krflags 0x%b ring %u flags 0%x",
	    sk_proc_name_address(p), sk_proc_pid(p), kring->ckr_name,
	    SK_KVA(kring), kring->ckr_flags, CKRF_BITS, kring->ckr_ring_id,
	    flags);

	/* bail if the interface is going (or has gone) away */
	if (__improbable(!IF_FULLY_ATTACHED(ifp))) {
		SK_ERR("kr 0x%llx ifp %s (0x%llx), interface not attached",
		    SK_KVA(kring), if_name(ifp), SK_KVA(ifp));
		return ENXIO;
	}

	/* bail while flow control is asserted on the interface */
	if (__improbable((ifp->if_start_flags & IFSF_FLOW_CONTROLLED) != 0)) {
		SK_DF(SK_VERB_SYNC | SK_VERB_TX, "kr 0x%llx ifp %s (0x%llx), "
		    "flow control ON", SK_KVA(kring), if_name(ifp),
		    SK_KVA(ifp));
		return ENXIO;
	}

	/* update our work timestamp */
	KRNA(kring)->na_work_ts = _net_uptime;

	/* direct user channels always take the plain-sync path */
	sync_only = ((flags & NA_SYNCF_SYNC_ONLY) != 0) ||
	    !KR_KERNEL_ONLY(kring);
	/* regular sync (reclaim) */
	if ((flags & NA_SYNCF_NETIF) != 0 || __improbable(sync_only)) {
		ret = nx_sync_tx(kring, (flags & NA_SYNCF_FORCE_RECLAIM) ||
		    kring->ckr_pending_intr != 0);
		kring->ckr_pending_intr = 0;

		/* direct user channels do not need to use the doorbell */
		if (__improbable(sync_only)) {
			return ret;
		}
	}

	/*
	 * Doorbell call.  Here we do doorbell explicitly if the flag is
	 * set or implicitly if we're opened directly by a user channel.
	 * Synchronous vs. asynchronous depending on the context.
	 */
	if (__probable((flags & NA_SYNCF_NETIF_DOORBELL) != 0)) {
		if ((flags & NA_SYNCF_NETIF_IFSTART) != 0) {
			/* if_start context: ring the driver directly */
			ASSERT(!(flags & NA_SYNCF_NETIF_IFSTART) ||
			    !(flags & NA_SYNCF_NETIF_ASYNC));
			nx_tx_doorbell(kring, (flags & NA_SYNCF_NETIF_ASYNC));
		} else {
			/* otherwise defer to the ifnet starter thread */
			ifnet_start(ifp);
		}
	}

	return ret;
}
1765 
1766 static int
nx_netif_na_rxsync(struct __kern_channel_ring * kring,struct proc * p,uint32_t flags)1767 nx_netif_na_rxsync(struct __kern_channel_ring *kring, struct proc *p,
1768     uint32_t flags)
1769 {
1770 #pragma unused(p)
1771 	int ret;
1772 
1773 	SK_DF(SK_VERB_NETIF | SK_VERB_SYNC | SK_VERB_RX,
1774 	    "%s(%d) kr \"%s\" (0x%llx) krflags 0x%b ring %u flags 0%x",
1775 	    sk_proc_name_address(p), sk_proc_pid(p), kring->ckr_name,
1776 	    SK_KVA(kring), kring->ckr_flags, CKRF_BITS, kring->ckr_ring_id,
1777 	    flags);
1778 
1779 	ASSERT(kring->ckr_rhead <= kring->ckr_lim);
1780 
1781 	/* update our work timestamp */
1782 	KRNA(kring)->na_work_ts = _net_uptime;
1783 
1784 	ret = nx_sync_rx(kring, (flags & NA_SYNCF_FORCE_READ) ||
1785 	    kring->ckr_pending_intr != 0);
1786 	kring->ckr_pending_intr = 0;
1787 
1788 	return ret;
1789 }
1790 
/*
 * Destructor for a netif dev/host adapter.  Disposes of the ifnet
 * linkage (either the embryonic ifnet left in na_private when the
 * finalizer never ran, or the I/O refcnt held via na_ifp) and drops
 * the adapter's reference on the nx_netif state.
 */
static void
nx_netif_na_dtor(struct nexus_adapter *na)
{
	struct ifnet *ifp;
	struct nexus_netif_adapter *nifna = NIFNA(na);

	SK_LOCK_ASSERT_HELD();
	ASSERT(na->na_type == NA_NETIF_DEV || na->na_type == NA_NETIF_HOST);

	SK_DF(SK_VERB_NETIF, "na \"%s\" (0x%llx)", na->na_name, SK_KVA(na));

	/*
	 * If the finalizer callback hasn't been called for whatever
	 * reasons, pick up the embryonic ifnet stored in na_private.
	 * Otherwise, release the I/O refcnt of a non-NULL na_ifp.
	 */
	if ((ifp = na->na_ifp) == NULL) {
		ifp = na->na_private;
		na->na_private = NULL;
	} else {
		ifnet_decr_iorefcnt(ifp);
		na->na_ifp = NULL;
	}

	/* drop this adapter's hold on the netif state */
	if (nifna->nifna_netif != NULL) {
		nx_netif_release(nifna->nifna_netif);
		nifna->nifna_netif = NULL;
	}
	ASSERT(SKYWALK_NATIVE(ifp));
}
1821 
/*
 * Dispatch rx/tx interrupts to the channel rings.
 *
 * The 'notify' routine depends on what the ring is attached to.
 * - for a channel file descriptor, do an event wakeup on the individual
 *   waitqueue, plus one on the global one if needed (see na_notify)
 * - for a device port connected to a FlowSwitch, call the proper
 *   forwarding routine; see nx_fsw_tx_hwna_notify()
 *   or nx_fsw_rx_hwna_notify().
 *
 * Returns 0 on success; EBUSY when the ring is in drop mode (or the
 * underlying notify reports busy), EAGAIN/other errors as propagated
 * from the saved notify callback.  Non-zero returns are tallied into
 * the per-nexus netif stats by direction.
 */
int
nx_netif_common_intr(struct __kern_channel_ring *kring, struct proc *p,
    uint32_t flags, uint32_t *work_done)
{
	struct netif_stats *nifs =
	    &NX_NETIF_PRIVATE(KRNA(kring)->na_nx)->nif_stats;
	int (*notify)(struct __kern_channel_ring *kring,
	    struct proc *, uint32_t flags);
	int ret;

	KDBG((SK_KTRACE_NETIF_COMMON_INTR | DBG_FUNC_START), SK_KVA(kring));

	SK_DF(SK_VERB_NETIF | SK_VERB_INTR |
	    ((kring->ckr_tx == NR_RX) ? SK_VERB_RX : SK_VERB_TX),
	    "na \"%s\" (0x%llx) kr \"%s\" (0x%llx) krflags 0x%b",
	    KRNA(kring)->na_name, SK_KVA(KRNA(kring)), kring->ckr_name,
	    SK_KVA(kring), kring->ckr_flags, CKRF_BITS);

	/* update our work timestamp */
	KRNA(kring)->na_work_ts = _net_uptime;

	/* record the interrupt so the next sync forces a reclaim/read */
	kring->ckr_pending_intr++;
	if (work_done != NULL) {
		*work_done = 1; /* do not fire again */
	}
	/*
	 * We can't be calling ckr_na_notify here since we could already be
	 * intercepting it, else we'd end up recursively calling ourselves.
	 * Use the original na_notify callback saved during na_activate, or in
	 * the case when the module above us is the flowswitch, the notify
	 * routine that it has installed in place of our original one.
	 */
	if (__probable(!KR_DROP(kring) &&
	    (notify = kring->ckr_netif_notify) != NULL)) {
		ret = notify(kring, p, flags);
	} else {
		/*
		 * If the ring is in drop mode, pretend as if it's busy.
		 * This allows the mitigation thread to pause for a while
		 * before attempting again.
		 */
		ret = EBUSY;
	}
	/* account the failure in the direction-specific stat counters */
	if (__improbable(ret != 0)) {
		switch (kring->ckr_tx) {
		case NR_RX:
			if (ret == EBUSY) {
				STATS_INC(nifs, NETIF_STATS_RX_IRQ_BUSY);
			} else if (ret == EAGAIN) {
				STATS_INC(nifs, NETIF_STATS_RX_IRQ_AGAIN);
			} else {
				STATS_INC(nifs, NETIF_STATS_RX_IRQ_ERR);
			}
			break;

		case NR_TX:
			if (ret == EBUSY) {
				STATS_INC(nifs, NETIF_STATS_TX_IRQ_BUSY);
			} else if (ret == EAGAIN) {
				STATS_INC(nifs, NETIF_STATS_TX_IRQ_AGAIN);
			} else {
				STATS_INC(nifs, NETIF_STATS_TX_IRQ_ERR);
			}
			break;

		default:
			break;
		}
	}

	KDBG((SK_KTRACE_NETIF_COMMON_INTR | DBG_FUNC_END), SK_KVA(kring), ret);

	return ret;
}
1906 
/*
 * na_notify callback installed on the dev TX rings while the adapter is
 * active (see nx_netif_na_activate); forwards the TX interrupt to the
 * mitigation layer.  No work_done reporting is needed for TX.
 */
static int
nx_netif_na_notify_tx(struct __kern_channel_ring *kring, struct proc *p,
    uint32_t flags)
{
	return nx_netif_mit_tx_intr(kring, p, flags, NULL);
}
1913 
1914 static int
nx_netif_na_notify_rx(struct __kern_channel_ring * kring,struct proc * p,uint32_t flags)1915 nx_netif_na_notify_rx(struct __kern_channel_ring *kring, struct proc *p,
1916     uint32_t flags)
1917 {
1918 	int ret;
1919 
1920 	/*
1921 	 * In the event the mitigation thread is disabled, protect
1922 	 * against recursion by detecting if we're already in the
1923 	 * context of an RX notify.  IOSkywalkFamily may invoke the
1924 	 * notify callback as part of its RX sync callback.
1925 	 */
1926 	if (__probable(!sk_is_rx_notify_protected())) {
1927 		sk_protect_t protect;
1928 		uint32_t work_done;
1929 
1930 		protect = sk_rx_notify_protect();
1931 		ret = nx_netif_mit_rx_intr(kring, p, flags, &work_done);
1932 		sk_sync_unprotect(protect);
1933 	} else {
1934 		ret = EAGAIN;
1935 	}
1936 
1937 	return ret;
1938 }
1939 
/*
 * Compute the TX/RX interrupt-mitigation configuration for an adapter.
 *
 * Outputs:
 *   tx_mit/rx_mit            whether a mitigation thread is used at all
 *   tx_mit_simple/rx_mit_simple  whether it runs in "simple" mode
 *
 * Both directions honor the sk_netif_tx_mit / sk_netif_rx_mit boot-arg
 * overrides (SK_NETIF_MIT_FORCE_* values); SK_NETIF_MIT_AUTO applies the
 * platform defaults described inline below.
 */
void
nx_netif_mit_config(struct nexus_netif_adapter *nifna,
    boolean_t *tx_mit, boolean_t *tx_mit_simple,
    boolean_t *rx_mit, boolean_t *rx_mit_simple)
{
	struct nx_netif *nif = nifna->nifna_netif;

	/*
	 * TX mitigation is disabled by default, but can be
	 * overridden via "sk_netif_tx_mit=N" boot-arg, where
	 * N is one of SK_NETIF_MIT_FORCE_* values.
	 */
	*tx_mit = *tx_mit_simple = FALSE;
	switch (sk_netif_tx_mit) {
	case SK_NETIF_MIT_FORCE_SIMPLE:
		*tx_mit_simple = TRUE;
		OS_FALLTHROUGH;
	case SK_NETIF_MIT_FORCE_ADVANCED:
		*tx_mit = TRUE;
		break;
	case SK_NETIF_MIT_FORCE_OFF:
	case SK_NETIF_MIT_AUTO:
		ASSERT(*tx_mit == FALSE);
		break;
	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	/*
	 * RX mitigation is enabled by default only for BSD-style
	 * virtual network interfaces, but can be overridden
	 * via "sk_netif_rx_mit=N" boot-arg, where N is one of
	 * SK_NETIF_MIT_FORCE_* values.
	 */
	*rx_mit = *rx_mit_simple = FALSE;
	switch (sk_netif_rx_mit) {
	case SK_NETIF_MIT_FORCE_OFF:
		ASSERT(*rx_mit == FALSE);
		break;
	case SK_NETIF_MIT_FORCE_SIMPLE:
		*rx_mit_simple = TRUE;
		OS_FALLTHROUGH;
	case SK_NETIF_MIT_FORCE_ADVANCED:
		*rx_mit = TRUE;
		break;
	case SK_NETIF_MIT_AUTO:
		*rx_mit_simple = TRUE;
#if !XNU_TARGET_OS_OSX
		/*
		 * On non-macOS platforms, enable RX mitigation
		 * thread only for BSD-style virtual (and regular)
		 * interfaces, since otherwise we may run out of
		 * stack when subjected to IPsec processing, etc.
		 */
		*rx_mit = (NX_PROV(nifna->nifna_up.na_nx)->nxprov_flags &
		    NXPROVF_VIRTUAL_DEVICE) && !NETIF_IS_LOW_LATENCY(nif);
#else /* XNU_TARGET_OS_OSX */
		/*
		 * On macOS platform, enable RX mitigation on all but
		 * low-latency interfaces, since we could potentially
		 * have filter providers, etc.  Ideally this should
		 * be detected and dealt with dynamically.
		 */
		*rx_mit = !NETIF_IS_LOW_LATENCY(nif);
#endif /* XNU_TARGET_OS_OSX */
		break;
	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}
}
2014 
/*
 * Activation callback for the netif device adapter (NA_NETIF_DEV).
 *
 * NA_ACTIVATE_MODE_ON: allocates per-ring TX/RX mitigation state (as
 * dictated by nx_netif_mit_config), intercepts every ring's na_notify
 * callback (saving the original in ckr_netif_notify), enables filters
 * and flows, sets NAF_ACTIVE, and steers the interface start handler to
 * nx_netif_doorbell.
 *
 * NA_ACTIVATE_MODE_OFF: exact inverse — clears NAF_ACTIVE, disables
 * flows/filters, waits for any in-flight if_start to drain, restores the
 * default start handler and the original notify callbacks, and frees the
 * mitigation arrays.
 *
 * NA_ACTIVATE_MODE_DEFUNCT: no state change here.
 *
 * Returns 0 on success or ENOMEM if mitigation allocation fails.
 */
static int
nx_netif_na_activate(struct nexus_adapter *na, na_activate_mode_t mode)
{
	struct nexus_netif_adapter *nifna = (struct nexus_netif_adapter *)na;
	boolean_t tx_mit, rx_mit, tx_mit_simple, rx_mit_simple;
	struct nx_netif *nif = nifna->nifna_netif;
	struct ifnet *ifp = na->na_ifp;
	int error = 0;
	uint32_t r;

	ASSERT(na->na_type == NA_NETIF_DEV);
	ASSERT(!(na->na_flags & NAF_HOST_ONLY));

	SK_DF(SK_VERB_NETIF, "na \"%s\" (0x%llx) %s [%s]", na->na_name,
	    SK_KVA(na), ifp->if_xname, na_activate_mode2str(mode));

	switch (mode) {
	case NA_ACTIVATE_MODE_ON:
		ASSERT(SKYWALK_CAPABLE(ifp));

		nx_netif_mit_config(nifna, &tx_mit, &tx_mit_simple,
		    &rx_mit, &rx_mit_simple);

		/*
		 * Init the mitigation support on all the dev TX rings.
		 */
		if (tx_mit) {
			nifna->nifna_tx_mit =
			    skn_alloc_type_array(tx_on, struct nx_netif_mit,
			    na_get_nrings(na, NR_TX), Z_WAITOK,
			    skmem_tag_netif_mit);
			if (nifna->nifna_tx_mit == NULL) {
				SK_ERR("TX mitigation allocation failed");
				error = ENOMEM;
				goto out;
			}
		} else {
			ASSERT(nifna->nifna_tx_mit == NULL);
		}

		/*
		 * Init the mitigation support on all the dev RX rings.
		 */
		if (rx_mit) {
			nifna->nifna_rx_mit =
			    skn_alloc_type_array(rx_on, struct nx_netif_mit,
			    na_get_nrings(na, NR_RX), Z_WAITOK,
			    skmem_tag_netif_mit);
			if (nifna->nifna_rx_mit == NULL) {
				SK_ERR("RX mitigation allocation failed");
				/* undo the TX allocation made above */
				if (nifna->nifna_tx_mit != NULL) {
					skn_free_type_array(rx_fail,
					    struct nx_netif_mit,
					    na_get_nrings(na, NR_TX),
					    nifna->nifna_tx_mit);
					nifna->nifna_tx_mit = NULL;
				}
				error = ENOMEM;
				goto out;
			}
		} else {
			ASSERT(nifna->nifna_rx_mit == NULL);
		}

		/* intercept na_notify callback on the TX rings */
		for (r = 0; r < na_get_nrings(na, NR_TX); r++) {
			na->na_tx_rings[r].ckr_netif_notify =
			    na->na_tx_rings[r].ckr_na_notify;
			na->na_tx_rings[r].ckr_na_notify =
			    nx_netif_na_notify_tx;
			if (nifna->nifna_tx_mit != NULL) {
				nx_netif_mit_init(nif, ifp,
				    &nifna->nifna_tx_mit[r],
				    &na->na_tx_rings[r], tx_mit_simple);
			}
		}

		/* intercept na_notify callback on the RX rings */
		for (r = 0; r < na_get_nrings(na, NR_RX); r++) {
			na->na_rx_rings[r].ckr_netif_notify =
			    na->na_rx_rings[r].ckr_na_notify;
			na->na_rx_rings[r].ckr_na_notify =
			    nx_netif_na_notify_rx;
			if (nifna->nifna_rx_mit != NULL) {
				nx_netif_mit_init(nif, ifp,
				    &nifna->nifna_rx_mit[r],
				    &na->na_rx_rings[r], rx_mit_simple);
			}
		}
		nx_netif_filter_enable(nif);
		nx_netif_flow_enable(nif);
		atomic_bitset_32(&na->na_flags, NAF_ACTIVE);

		/* steer all start requests to netif; this must not fail */
		lck_mtx_lock(&ifp->if_start_lock);
		error = ifnet_set_start_handler(ifp, nx_netif_doorbell);
		VERIFY(error == 0);
		lck_mtx_unlock(&ifp->if_start_lock);
		break;

	case NA_ACTIVATE_MODE_DEFUNCT:
		ASSERT(SKYWALK_CAPABLE(ifp));
		break;

	case NA_ACTIVATE_MODE_OFF:
		/*
		 * Note that here we cannot assert SKYWALK_CAPABLE()
		 * as we're called in the destructor path.
		 */
		atomic_bitclear_32(&na->na_flags, NAF_ACTIVE);
		nx_netif_flow_disable(nif);
		nx_netif_filter_disable(nif);

		/*
		 * Here we may block while holding sk_lock, but because
		 * we've cleared NAF_ACTIVE above, kern_channel_tx_refill()
		 * should immediately return.  A better approach would be
		 * to drop sk_lock and add a monitor for this routine.
		 */
		lck_mtx_lock(&ifp->if_start_lock);
		while (ifp->if_start_active != 0) {
			++ifp->if_start_waiters;
			(void) msleep(&ifp->if_start_waiters,
			    &ifp->if_start_lock, (PZERO - 1),
			    na->na_name, NULL);
		}
		/* steer all start requests to default handler */
		ifnet_reset_start_handler(ifp);
		lck_mtx_unlock(&ifp->if_start_lock);

		/* reset all TX notify callbacks */
		for (r = 0; r < na_get_nrings(na, NR_TX); r++) {
			na->na_tx_rings[r].ckr_na_notify =
			    na->na_tx_rings[r].ckr_netif_notify;
			na->na_tx_rings[r].ckr_netif_notify = NULL;
			if (nifna->nifna_tx_mit != NULL) {
				na->na_tx_rings[r].ckr_netif_mit_stats = NULL;
				nx_netif_mit_cleanup(&nifna->nifna_tx_mit[r]);
			}
		}

		if (nifna->nifna_tx_mit != NULL) {
			skn_free_type_array(tx_off, struct nx_netif_mit,
			    na_get_nrings(na, NR_TX), nifna->nifna_tx_mit);
			nifna->nifna_tx_mit = NULL;
		}

		/* reset all RX notify callbacks */
		for (r = 0; r < na_get_nrings(na, NR_RX); r++) {
			na->na_rx_rings[r].ckr_na_notify =
			    na->na_rx_rings[r].ckr_netif_notify;
			na->na_rx_rings[r].ckr_netif_notify = NULL;
			if (nifna->nifna_rx_mit != NULL) {
				na->na_rx_rings[r].ckr_netif_mit_stats = NULL;
				nx_netif_mit_cleanup(&nifna->nifna_rx_mit[r]);
			}
		}
		if (nifna->nifna_rx_mit != NULL) {
			skn_free_type_array(rx_off, struct nx_netif_mit,
			    na_get_nrings(na, NR_RX), nifna->nifna_rx_mit);
			nifna->nifna_rx_mit = NULL;
		}
		break;

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}
out:
	return error;
}
2187 
/*
 * Create and wire up the device and host netif adapters for 'ifp' on
 * nexus 'nx'.  Called with sk_lock held on a Skywalk-native interface
 * that is either fully attached (we hold an I/O refcnt on it) or still
 * embryonic (the bare ifnet is parked in na_private until
 * na_netif_finalize() moves it to na_ifp).  On success the adapters are
 * bound to the NET_IF dev/host nexus ports and the interface is marked
 * Skywalk-capable; on failure all partially constructed state is torn
 * down and an errno is returned.
 */
SK_NO_INLINE_ATTRIBUTE
static int
nx_netif_attach(struct kern_nexus *nx, struct ifnet *ifp)
/* NOTE(review): optnone disables all optimization for this function —
 * looks like debugging residue; confirm whether it is still needed. */
__attribute__((optnone))
{
	struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
	struct nxprov_params *nxp = NX_PROV(nx)->nxprov_params;
	struct nexus_netif_adapter *devnifna = NULL;
	struct nexus_netif_adapter *hostnifna = NULL;
	struct nexus_adapter *devna = NULL;
	struct nexus_adapter *hostna = NULL;
	boolean_t embryonic = FALSE;
	int retval = 0;
	uint32_t na_flags;

	SK_LOCK_ASSERT_HELD();
	ASSERT(SKYWALK_NATIVE(ifp));
	ASSERT(!SKYWALK_CAPABLE(ifp));
	ASSERT(ifp->if_na == NULL);
	ASSERT(ifp->if_na_ops == NULL);

	devnifna = na_netif_alloc(Z_WAITOK);
	hostnifna = na_netif_alloc(Z_WAITOK);

	/*
	 * We can be called for two different interface states:
	 *
	 * Fully attached: get an io ref count; upon success, this
	 * holds a reference to the ifnet for the ifp pointer stored
	 * in 'na_ifp' down below for both adapters.
	 *
	 * Embryonic: temporary hold the ifnet in na_private, which
	 * upon a successful ifnet_attach(), will be moved over to
	 * the 'na_ifp' with an io ref count held.
	 *
	 * The ifnet in 'na_ifp' will be released by na_release_locked().
	 */
	if (!ifnet_is_attached(ifp, 1)) {
		if (!(ifp->if_refflags & IFRF_EMBRYONIC)) {
			/* neither attached nor embryonic: bail out */
			ifp = NULL;
			retval = ENXIO;
			goto err;
		}
		embryonic = TRUE;
	}

	/* initialize the device netif adapter */
	devnifna->nifna_netif = nif;
	nx_netif_retain(nif);
	devna = &devnifna->nifna_up;
	devna->na_type = NA_NETIF_DEV;
	devna->na_free = na_netif_free;
	/* copy is explicitly NUL-terminated on the next line */
	(void) strncpy(devna->na_name, ifp->if_xname, sizeof(devna->na_name) - 1);
	devna->na_name[sizeof(devna->na_name) - 1] = '\0';
	uuid_generate_random(devna->na_uuid);
	if (embryonic) {
		/*
		 * We will move this over to na_ifp once
		 * the interface is fully attached.
		 */
		devna->na_private = ifp;
		ASSERT(devna->na_ifp == NULL);
	} else {
		ASSERT(devna->na_private == NULL);
		/* use I/O refcnt from ifnet_is_attached() */
		devna->na_ifp = ifp;
	}
	devna->na_activate = nx_netif_na_activate;
	devna->na_channel_event_notify = nx_netif_na_channel_event_notify;
	devna->na_txsync = nx_netif_na_txsync;
	devna->na_rxsync = nx_netif_na_rxsync;
	devna->na_dtor = nx_netif_na_dtor;
	devna->na_krings_create = nx_netif_dev_krings_create;
	devna->na_krings_delete = nx_netif_dev_krings_delete;
	devna->na_special = nx_netif_na_special;

	na_flags = NAF_NATIVE;
	if (NX_PROV(nx)->nxprov_flags & NXPROVF_VIRTUAL_DEVICE) {
		na_flags |= NAF_VIRTUAL_DEVICE;
	}
	if (NX_LLINK_PROV(nx)) {
		/*
		 * while operating in logical link mode, we don't need to
		 * create backing memory regions for the rings as they are
		 * not used.
		 */
		na_flags |= NAF_MEM_NO_INIT;
	}
	atomic_bitset_32(&devna->na_flags, na_flags);
	/* na_stats_type is declared const; write through a cast */
	*(nexus_stats_type_t *)(uintptr_t)&devna->na_stats_type =
	    NEXUS_STATS_TYPE_INVALID;

	na_set_nrings(devna, NR_TX, nxp->nxp_tx_rings);
	na_set_nrings(devna, NR_RX, nxp->nxp_rx_rings);
	na_set_nslots(devna, NR_TX, nxp->nxp_tx_slots);
	na_set_nslots(devna, NR_RX, nxp->nxp_rx_slots);
	/*
	 * Verify upper bounds; the parameters must have already been
	 * validated by nxdom_prov_params() by the time we get here.
	 */
	ASSERT(na_get_nrings(devna, NR_TX) <= NX_DOM(nx)->nxdom_tx_rings.nb_max);
	ASSERT(na_get_nrings(devna, NR_RX) <= NX_DOM(nx)->nxdom_rx_rings.nb_max);
	ASSERT(na_get_nslots(devna, NR_TX) <= NX_DOM(nx)->nxdom_tx_slots.nb_max);
	ASSERT(na_get_nslots(devna, NR_RX) <= NX_DOM(nx)->nxdom_rx_slots.nb_max);

	na_attach_common(devna, nx, &nx_netif_prov_s);

	if ((retval = NX_DOM_PROV(nx)->nxdom_prov_mem_new(NX_DOM_PROV(nx),
	    nx, devna)) != 0) {
		ASSERT(devna->na_arena == NULL);
		goto err;
	}
	ASSERT(devna->na_arena != NULL);

	*(uint32_t *)(uintptr_t)&devna->na_flowadv_max = nxp->nxp_flowadv_max;
	ASSERT(devna->na_flowadv_max == 0 ||
	    skmem_arena_nexus(devna->na_arena)->arn_flowadv_obj != NULL);

	/* setup packet copy routines */
	if (skmem_arena_nexus(devna->na_arena)->arn_rx_pp->pp_max_frags > 1) {
		nif->nif_pkt_copy_from_mbuf = pkt_copy_multi_buflet_from_mbuf;
		nif->nif_pkt_copy_to_mbuf = pkt_copy_multi_buflet_to_mbuf;
		nif->nif_pkt_copy_from_pkt = pkt_copy_multi_buflet_from_pkt;
	} else {
		nif->nif_pkt_copy_from_mbuf = pkt_copy_from_mbuf;
		nif->nif_pkt_copy_to_mbuf = pkt_copy_to_mbuf;
		nif->nif_pkt_copy_from_pkt = pkt_copy_from_pkt;
	}

	/* initialize the host netif adapter */
	hostnifna->nifna_netif = nif;
	nx_netif_retain(nif);
	hostna = &hostnifna->nifna_up;
	/* host adapter is named after the device one with a '^' suffix */
	(void) snprintf(hostna->na_name, sizeof(hostna->na_name),
	    "%s^", devna->na_name);
	uuid_generate_random(hostna->na_uuid);
	if (embryonic) {
		/*
		 * We will move this over to na_ifp once
		 * the interface is fully attached.
		 */
		hostna->na_private = ifp;
		ASSERT(hostna->na_ifp == NULL);
	} else {
		ASSERT(hostna->na_private == NULL);
		/* host adapter takes its own I/O refcnt on the ifnet */
		hostna->na_ifp = devna->na_ifp;
		ifnet_incr_iorefcnt(hostna->na_ifp);
	}
	hostna->na_type = NA_NETIF_HOST;
	hostna->na_free = na_netif_free;
	hostna->na_activate = nx_netif_host_na_activate;
	hostna->na_txsync = nx_netif_host_na_txsync;
	hostna->na_rxsync = nx_netif_host_na_rxsync;
	hostna->na_dtor = nx_netif_na_dtor;
	hostna->na_krings_create = nx_netif_host_krings_create;
	hostna->na_krings_delete = nx_netif_host_krings_delete;
	hostna->na_special = nx_netif_host_na_special;

	na_flags = NAF_HOST_ONLY | NAF_NATIVE;
	if (NX_LLINK_PROV(nx)) {
		/*
		 * while operating in logical link mode, we don't need to
		 * create backing memory regions for the rings as they are
		 * not used.
		 */
		na_flags |= NAF_MEM_NO_INIT;
	}
	atomic_bitset_32(&hostna->na_flags, na_flags);
	*(nexus_stats_type_t *)(uintptr_t)&hostna->na_stats_type =
	    NEXUS_STATS_TYPE_INVALID;

	/* host side always uses a single TX and a single RX ring */
	na_set_nrings(hostna, NR_TX, 1);
	na_set_nrings(hostna, NR_RX, 1);
	na_set_nslots(hostna, NR_TX, nxp->nxp_tx_slots);
	na_set_nslots(hostna, NR_RX, nxp->nxp_rx_slots);

	na_attach_common(hostna, nx, &nx_netif_prov_s);

	if ((retval = NX_DOM_PROV(nx)->nxdom_prov_mem_new(NX_DOM_PROV(nx),
	    nx, hostna)) != 0) {
		ASSERT(hostna->na_arena == NULL);
		goto err;
	}
	ASSERT(hostna->na_arena != NULL);

	*(uint32_t *)(uintptr_t)&hostna->na_flowadv_max = nxp->nxp_flowadv_max;
	ASSERT(hostna->na_flowadv_max == 0 ||
	    skmem_arena_nexus(hostna->na_arena)->arn_flowadv_obj != NULL);

	/* adjust the classq packet drop limit */
	if (embryonic) {
		uint32_t drop_lim;
		struct kern_pbufpool_memory_info pp_info;

		retval = kern_pbufpool_get_memory_info(nx->nx_tx_pp, &pp_info);
		VERIFY(retval == 0);

		/* set the drop limit as 80% of size of packet pool */
		drop_lim = (pp_info.kpm_packets * 4) / 5;
		VERIFY(drop_lim != 0);
		IFCQ_PKT_DROP_LIMIT(ifp->if_snd) = drop_lim;
	}

	/* these will be undone by destructor  */
	ifp->if_na_ops = &na_netif_ops;
	ifp->if_na = devnifna;
	na_retain_locked(devna);
	na_retain_locked(hostna);

	SKYWALK_SET_CAPABLE(ifp);

	/* bind both adapters to their well-known nexus ports */
	NETIF_WLOCK(nif);
	nif->nif_ifp = ifp;
	nif->nif_netif_nxadv = nx->nx_adv.netif_nxv_adv;
	retval = nx_port_alloc(nx, NEXUS_PORT_NET_IF_DEV, NULL, &devna,
	    kernproc);
	ASSERT(retval == 0);
	retval = nx_port_alloc(nx, NEXUS_PORT_NET_IF_HOST, NULL, &hostna,
	    kernproc);
	ASSERT(retval == 0);
	NETIF_WUNLOCK(nif);

#if SK_LOG
	uuid_string_t uuidstr;
	SK_DF(SK_VERB_NETIF, "devna: \"%s\"", devna->na_name);
	SK_DF(SK_VERB_NETIF, "  UUID:        %s",
	    sk_uuid_unparse(devna->na_uuid, uuidstr));
	SK_DF(SK_VERB_NETIF, "  nx:          0x%llx (\"%s\":\"%s\")",
	    SK_KVA(devna->na_nx), NX_DOM(devna->na_nx)->nxdom_name,
	    NX_DOM_PROV(devna->na_nx)->nxdom_prov_name);
	SK_DF(SK_VERB_NETIF, "  flags:       0x%b", devna->na_flags, NAF_BITS);
	SK_DF(SK_VERB_NETIF, "  flowadv_max: %u", devna->na_flowadv_max);
	SK_DF(SK_VERB_NETIF, "  rings:       tx %u rx %u",
	    na_get_nrings(devna, NR_TX), na_get_nrings(devna, NR_RX));
	SK_DF(SK_VERB_NETIF, "  slots:       tx %u rx %u",
	    na_get_nslots(devna, NR_TX), na_get_nslots(devna, NR_RX));
#if CONFIG_NEXUS_USER_PIPE
	SK_DF(SK_VERB_NETIF, "  next_pipe:   %u", devna->na_next_pipe);
	SK_DF(SK_VERB_NETIF, "  max_pipes:   %u", devna->na_max_pipes);
#endif /* CONFIG_NEXUS_USER_PIPE */
	SK_DF(SK_VERB_NETIF, "  ifp:         0x%llx %s [ioref %u]",
	    SK_KVA(ifp), ifp->if_xname, ifp->if_refio);
	SK_DF(SK_VERB_NETIF, "hostna: \"%s\"", hostna->na_name);
	SK_DF(SK_VERB_NETIF, "  UUID:        %s",
	    sk_uuid_unparse(hostna->na_uuid, uuidstr));
	SK_DF(SK_VERB_NETIF, "  nx:          0x%llx (\"%s\":\"%s\")",
	    SK_KVA(hostna->na_nx), NX_DOM(hostna->na_nx)->nxdom_name,
	    NX_DOM_PROV(hostna->na_nx)->nxdom_prov_name);
	SK_DF(SK_VERB_NETIF, "  flags:       0x%b",
	    hostna->na_flags, NAF_BITS);
	SK_DF(SK_VERB_NETIF, "  flowadv_max: %u", hostna->na_flowadv_max);
	SK_DF(SK_VERB_NETIF, "  rings:       tx %u rx %u",
	    na_get_nrings(hostna, NR_TX), na_get_nrings(hostna, NR_RX));
	SK_DF(SK_VERB_NETIF, "  slots:       tx %u rx %u",
	    na_get_nslots(hostna, NR_TX), na_get_nslots(hostna, NR_RX));
#if CONFIG_NEXUS_USER_PIPE
	SK_DF(SK_VERB_NETIF, "  next_pipe:   %u", hostna->na_next_pipe);
	SK_DF(SK_VERB_NETIF, "  max_pipes:   %u", hostna->na_max_pipes);
#endif /* CONFIG_NEXUS_USER_PIPE */
	SK_DF(SK_VERB_NETIF, "  ifp:         0x%llx %s [ioref %u]",
	    SK_KVA(ifp), ifp->if_xname, ifp->if_refio);
#endif /* SK_LOG */

err:
	/* error unwind: release refcnts, arenas and adapters in reverse */
	if (retval != 0) {
		if (ifp != NULL) {
			if (!embryonic) {
				ifnet_decr_iorefcnt(ifp);
			}
			ifp = NULL;
		}
		if (devna != NULL) {
			if (devna->na_arena != NULL) {
				skmem_arena_release(devna->na_arena);
				devna->na_arena = NULL;
			}
			if (devna->na_ifp != NULL) {
				ifnet_decr_iorefcnt(devna->na_ifp);
				devna->na_ifp = NULL;
			}
			devna->na_private = NULL;
		}
		if (hostna != NULL) {
			if (hostna->na_arena != NULL) {
				skmem_arena_release(hostna->na_arena);
				hostna->na_arena = NULL;
			}
			if (hostna->na_ifp != NULL) {
				ifnet_decr_iorefcnt(hostna->na_ifp);
				hostna->na_ifp = NULL;
			}
			hostna->na_private = NULL;
		}
		if (devnifna != NULL) {
			if (devnifna->nifna_netif != NULL) {
				nx_netif_release(devnifna->nifna_netif);
				devnifna->nifna_netif = NULL;
			}
			na_netif_free((struct nexus_adapter *)devnifna);
		}
		if (hostnifna != NULL) {
			if (hostnifna->nifna_netif != NULL) {
				nx_netif_release(hostnifna->nifna_netif);
				hostnifna->nifna_netif = NULL;
			}
			na_netif_free((struct nexus_adapter *)hostnifna);
		}
	}
	return retval;
}
2498 
2499 /*
2500  * Any per-netif state that can be discovered at attach time should be
2501  * initialized here.
2502  */
2503 static void
nx_netif_flags_init(struct nx_netif * nif)2504 nx_netif_flags_init(struct nx_netif *nif)
2505 {
2506 	ifnet_t ifp = nif->nif_ifp;
2507 	struct kern_nexus *nx = nif->nif_nx;
2508 	struct nexus_adapter *devna = nx_port_get_na(nx, NEXUS_PORT_NET_IF_DEV);
2509 
2510 	switch (devna->na_type) {
2511 	case NA_NETIF_DEV:
2512 		if (strcmp(ifp->if_name, sk_ll_prefix) == 0) {
2513 			nif->nif_flags |= NETIF_FLAG_LOW_LATENCY;
2514 			if_set_xflags(ifp, IFXF_LOW_LATENCY);
2515 		}
2516 		break;
2517 	case NA_NETIF_COMPAT_DEV:
2518 		nif->nif_flags |= NETIF_FLAG_COMPAT;
2519 		break;
2520 	default:
2521 		break;
2522 	}
2523 }
2524 
/*
 * This is also supposed to check for any inconsistent state at detach time.
 * Clears the low-latency xflag on the interface (if still present) and
 * resets all per-netif flags set by nx_netif_flags_init().
 */
static void
nx_netif_flags_fini(struct nx_netif *nif)
{
	ifnet_t ifp = nif->nif_ifp;

	if (ifp != NULL) {
		if_clear_xflags(ifp, IFXF_LOW_LATENCY);
	}
	nif->nif_flags = 0;
}
2538 
/*
 * Negotiate optional nexus capabilities with the provider at attach time.
 * Currently only KERN_NEXUS_CAPAB_INTERFACE_ADVISORY is probed, and only
 * when the interface has IFEF_ADV_REPORT set; on success the provider's
 * config callback and context are cached on the netif for later use.
 * Silently does nothing if the provider exposes no capability callback.
 */
static void
nx_netif_capabilities_init(struct nx_netif *nif)
{
	struct kern_nexus_capab_interface_advisory kncia;
	struct kern_nexus *nx = nif->nif_nx;
	nxprov_capab_config_fn_t capab_fn;
	uint32_t capab_len;
	int error;

	/* prefer the netif-specific extension callback when versions match */
	if ((NX_PROV(nx)->nxprov_netif_ext.nxnpi_version) ==
	    KERN_NEXUS_PROVIDER_VERSION_NETIF) {
		capab_fn = NX_PROV(nx)->nxprov_netif_ext.nxnpi_config_capab;
		ASSERT(capab_fn != NULL);
	} else {
		capab_fn = NX_PROV(nx)->nxprov_ext.nxpi_config_capab;
	}
	if (capab_fn == NULL) {
		return;
	}
	/* check/configure interface advisory notifications */
	if ((nif->nif_ifp->if_eflags & IFEF_ADV_REPORT) != 0) {
		bzero(&kncia, sizeof(kncia));
		kncia.kncia_version =
		    KERN_NEXUS_CAPAB_INTERFACE_ADVISORY_VERSION_1;
		/* fields are declared const; populate through __DECONST */
		*__DECONST(kern_nexus_capab_interface_advisory_notify_fn_t *,
		    &(kncia.kncia_notify)) = nx_netif_interface_advisory_notify;
		*__DECONST(void **, &(kncia.kncia_kern_context)) = nx;
		capab_len = sizeof(kncia);
		error = capab_fn(NX_PROV(nx), nx,
		    KERN_NEXUS_CAPAB_INTERFACE_ADVISORY, &kncia, &capab_len);
		if (error == 0) {
			VERIFY(kncia.kncia_config != NULL);
			VERIFY(kncia.kncia_provider_context != NULL);
			nif->nif_intf_adv_config = kncia.kncia_config;
			nif->nif_intf_adv_prov_ctx =
			    kncia.kncia_provider_context;
		}
	}
}
2578 
/*
 * Undo nx_netif_capabilities_init(): drop the cached interface-advisory
 * config callback and provider context.
 */
static void
nx_netif_capabilities_fini(struct nx_netif *nif)
{
	nif->nif_intf_adv_config = NULL;
	nif->nif_intf_adv_prov_ctx = NULL;
}
2585 
/*
 * Finalizer invoked once the (previously embryonic) interface is fully
 * attached.  Moves the bare ifnet parked in na_private over to na_ifp on
 * both the dev and host adapters — the dev adapter consumes the I/O
 * refcnt taken by ifnet_is_attached() below, while the host adapter
 * takes its own — then performs the remaining per-netif initialization
 * (flags, logical links, filters, flows, capabilities, agent).
 */
void
na_netif_finalize(struct nexus_netif_adapter *nifna, struct ifnet *ifp)
{
	struct nx_netif *nif = nifna->nifna_netif;
	struct kern_nexus *nx = nif->nif_nx;
	struct nexus_adapter *devna = nx_port_get_na(nx, NEXUS_PORT_NET_IF_DEV);
	struct nexus_adapter *hostna = nx_port_get_na(nx,
	    NEXUS_PORT_NET_IF_HOST);

	ASSERT(devna != NULL);
	ASSERT(hostna != NULL);

	/* interface must be attached by now; panic otherwise */
	if (!ifnet_is_attached(ifp, 1)) {
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	ASSERT(devna->na_private == ifp);
	ASSERT(devna->na_ifp == NULL);
	/* use I/O refcnt held by ifnet_is_attached() above */
	devna->na_ifp = devna->na_private;
	devna->na_private = NULL;

	ASSERT(hostna->na_private == ifp);
	ASSERT(hostna->na_ifp == NULL);
	/* host adapter holds its own I/O refcnt on the ifnet */
	hostna->na_ifp = hostna->na_private;
	hostna->na_private = NULL;
	ifnet_incr_iorefcnt(hostna->na_ifp);

	nx_netif_flags_init(nif);
	nx_netif_llink_init(nif);
	nx_netif_filter_init(nif);
	nx_netif_flow_init(nif);
	nx_netif_capabilities_init(nif);
	nx_netif_agent_init(nif);
}
2623 
2624 void
nx_netif_reap(struct nexus_netif_adapter * nifna,struct ifnet * ifp,uint32_t thres,boolean_t low)2625 nx_netif_reap(struct nexus_netif_adapter *nifna, struct ifnet *ifp,
2626     uint32_t thres, boolean_t low)
2627 {
2628 #pragma unused(ifp)
2629 	struct nx_netif *nif = nifna->nifna_netif;
2630 	struct kern_nexus *nx = nif->nif_nx;
2631 	struct nexus_adapter *devna = nx_port_get_na(nx, NEXUS_PORT_NET_IF_DEV);
2632 	uint64_t now = _net_uptime;
2633 	boolean_t purge;
2634 
2635 	ASSERT(thres != 0);
2636 
2637 	if (devna->na_work_ts == 0) {
2638 		return;
2639 	}
2640 
2641 	/*
2642 	 * Purge if it's has been inactive for some time (twice the drain
2643 	 * threshold), and clear the work timestamp to temporarily skip this
2644 	 * adapter until it's active again.  Purging cached objects can be
2645 	 * expensive since we'd need to allocate and construct them again,
2646 	 * so we do it only when necessary.
2647 	 */
2648 	if (low || (now - devna->na_work_ts) >= (thres << 1)) {
2649 		devna->na_work_ts = 0;
2650 		purge = TRUE;
2651 	} else {
2652 		purge = FALSE;
2653 	}
2654 
2655 	SK_DF(SK_VERB_NETIF, "%s: %s na %s", ifp->if_xname,
2656 	    (purge ? "purging" : "pruning"), devna->na_name);
2657 
2658 	/*
2659 	 * Device and host adapters share the same packet buffer pool,
2660 	 * so just reap the arena belonging to the device instance.
2661 	 */
2662 	skmem_arena_reap(devna->na_arena, purge);
2663 
2664 	/*
2665 	 * Reap any caches configured for classq.
2666 	 */
2667 	ifclassq_reap_caches(purge);
2668 }
2669 
2670 void
nx_netif_copy_stats(struct nexus_netif_adapter * nifna,struct if_netif_stats * if_ns)2671 nx_netif_copy_stats(struct nexus_netif_adapter *nifna,
2672     struct if_netif_stats *if_ns)
2673 {
2674 	struct nx_netif_mit *mit;
2675 	struct mit_cfg_tbl *mit_cfg;
2676 
2677 	if ((mit = nifna->nifna_rx_mit) == NULL) {
2678 		return;
2679 	}
2680 
2681 	if ((mit->mit_flags & NETIF_MITF_INITIALIZED) == 0) {
2682 		return;
2683 	}
2684 
2685 	if_ns->ifn_rx_mit_interval = mit->mit_interval;
2686 	if_ns->ifn_rx_mit_mode = mit->mit_mode;
2687 	if_ns->ifn_rx_mit_packets_avg = mit->mit_packets_avg;
2688 	if_ns->ifn_rx_mit_packets_min = mit->mit_packets_min;
2689 	if_ns->ifn_rx_mit_packets_max = mit->mit_packets_max;
2690 	if_ns->ifn_rx_mit_bytes_avg = mit->mit_bytes_avg;
2691 	if_ns->ifn_rx_mit_bytes_min = mit->mit_bytes_min;
2692 	if_ns->ifn_rx_mit_bytes_max = mit->mit_bytes_max;
2693 	if_ns->ifn_rx_mit_cfg_idx = mit->mit_cfg_idx;
2694 
2695 	VERIFY(if_ns->ifn_rx_mit_cfg_idx < mit->mit_cfg_idx_max);
2696 	mit_cfg = &mit->mit_tbl[if_ns->ifn_rx_mit_cfg_idx];
2697 	if_ns->ifn_rx_mit_cfg_packets_lowat = mit_cfg->cfg_plowat;
2698 	if_ns->ifn_rx_mit_cfg_packets_hiwat = mit_cfg->cfg_phiwat;
2699 	if_ns->ifn_rx_mit_cfg_bytes_lowat = mit_cfg->cfg_blowat;
2700 	if_ns->ifn_rx_mit_cfg_bytes_hiwat = mit_cfg->cfg_bhiwat;
2701 	if_ns->ifn_rx_mit_cfg_interval = mit_cfg->cfg_ival;
2702 }
2703 
2704 int
nx_netif_na_special(struct nexus_adapter * na,struct kern_channel * ch,struct chreq * chr,nxspec_cmd_t spec_cmd)2705 nx_netif_na_special(struct nexus_adapter *na, struct kern_channel *ch,
2706     struct chreq *chr, nxspec_cmd_t spec_cmd)
2707 {
2708 	ASSERT(na->na_type == NA_NETIF_DEV ||
2709 	    na->na_type == NA_NETIF_COMPAT_DEV);
2710 	return nx_netif_na_special_common(na, ch, chr, spec_cmd);
2711 }
2712 
/*
 * Common na_special handler shared by netif dev and host adapters
 * (native and compat).  Handles kernel-only channel connect and
 * disconnect, as well as starting/stopping traffic on the rings.
 * Returns 0 on success or an errno value; caller must hold SK_LOCK.
 */
int
nx_netif_na_special_common(struct nexus_adapter *na, struct kern_channel *ch,
    struct chreq *chr, nxspec_cmd_t spec_cmd)
{
	int error = 0;

	ASSERT(na->na_type == NA_NETIF_DEV || na->na_type == NA_NETIF_HOST ||
	    na->na_type == NA_NETIF_COMPAT_DEV ||
	    na->na_type == NA_NETIF_COMPAT_HOST);
	SK_LOCK_ASSERT_HELD();

	switch (spec_cmd) {
	case NXSPEC_CMD_CONNECT:
		/*
		 * netif adapter isn't created exclusively for kernel.
		 * We mark (and clear) NAF_KERNEL_ONLY flag upon a successful
		 * na_special() connect and disconnect.
		 */
		if (NA_KERNEL_ONLY(na)) {
			/* already connected to a kernel client */
			error = EBUSY;
			goto done;
		}
		ASSERT(!(na->na_flags & NAF_SPEC_INIT));

		/* set NAF_KERNEL_ONLY before binding; undone on failure */
		atomic_bitset_32(&na->na_flags, NAF_KERNEL_ONLY);
		error = na_bind_channel(na, ch, chr);
		if (error != 0) {
			atomic_bitclear_32(&na->na_flags, NAF_KERNEL_ONLY);
			goto done;
		}
		atomic_bitset_32(&na->na_flags, NAF_SPEC_INIT);
		break;

	case NXSPEC_CMD_DISCONNECT:
		/* must have been connected via NXSPEC_CMD_CONNECT above */
		ASSERT(NA_KERNEL_ONLY(na));
		ASSERT(na->na_channels > 0);
		ASSERT(na->na_flags & NAF_SPEC_INIT);
		na_unbind_channel(ch);
		atomic_bitclear_32(&na->na_flags,
		    (NAF_SPEC_INIT | NAF_KERNEL_ONLY));
		break;

	case NXSPEC_CMD_START:
		/* lift drop mode; let traffic flow through the rings */
		na_kr_drop(na, FALSE);
		break;

	case NXSPEC_CMD_STOP:
		/* put rings back into drop mode and advise the provider */
		na_kr_drop(na, TRUE);
		LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_NOTOWNED);
		lck_mtx_lock(&ch->ch_lock);
		nxprov_advise_disconnect(na->na_nx, ch);
		lck_mtx_unlock(&ch->ch_lock);
		break;

	default:
		error = EINVAL;
		break;
	}

done:
	SK_DF(error ? SK_VERB_ERROR : SK_VERB_NETIF,
	    "ch 0x%llx from na \"%s\" (0x%llx) naflags %b nx 0x%llx "
	    "spec_cmd %u (err %d)", SK_KVA(ch), na->na_name, SK_KVA(na),
	    na->na_flags, NAF_BITS, SK_KVA(ch->ch_nexus), spec_cmd, error);

	return error;
}
2780 
2781 /*
2782  * Get a skywalk netif adapter for the port.
2783  */
2784 int
nx_netif_na_find(struct kern_nexus * nx,struct kern_channel * ch,struct chreq * chr,struct nxbind * nxb,struct proc * p,struct nexus_adapter ** nap,boolean_t create)2785 nx_netif_na_find(struct kern_nexus *nx, struct kern_channel *ch,
2786     struct chreq *chr, struct nxbind *nxb, struct proc *p,
2787     struct nexus_adapter **nap, boolean_t create)
2788 {
2789 #pragma unused(ch)
2790 	struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
2791 	boolean_t anon = NX_ANONYMOUS_PROV(nx);
2792 	ch_endpoint_t ep = chr->cr_endpoint;
2793 	nexus_port_t nx_port = chr->cr_port;
2794 	struct nexus_adapter *na = NULL;
2795 	struct ifnet *ifp;
2796 	int err = 0;
2797 
2798 	SK_LOCK_ASSERT_HELD();
2799 	*nap = NULL; /* default */
2800 
2801 #if SK_LOG
2802 	uuid_string_t uuidstr;
2803 	SK_D("name \"%s\" spec_uuid \"%s\" port %d mode 0x%b pipe_id %u "
2804 	    "ring_id %d ring_set %u ep_type %u:%u create %u%s",
2805 	    chr->cr_name, sk_uuid_unparse(chr->cr_spec_uuid, uuidstr),
2806 	    (int)chr->cr_port, chr->cr_mode, CHMODE_BITS,
2807 	    chr->cr_pipe_id, (int)chr->cr_ring_id, chr->cr_ring_set,
2808 	    chr->cr_real_endpoint, chr->cr_endpoint, create,
2809 	    (ep != CH_ENDPOINT_NET_IF) ? " (skipped)" : "");
2810 #endif /* SK_LOG */
2811 
2812 	if (!create || ep != CH_ENDPOINT_NET_IF) {
2813 		err = ENODEV;
2814 		goto done;
2815 	}
2816 
2817 	ASSERT(NX_DOM(nx)->nxdom_type == NEXUS_TYPE_NET_IF);
2818 	if (nx_port_get_na(nx, NEXUS_PORT_NET_IF_DEV) == NULL) {
2819 		err = ENXIO;
2820 		goto done;
2821 	}
2822 	ifp = nif->nif_ifp;
2823 	if (!(SKYWALK_CAPABLE(ifp))) {
2824 		SK_ERR("interface %s is no longer usable", if_name(ifp));
2825 		err = ENOTSUP;
2826 		goto done;
2827 	}
2828 
2829 	if (chr->cr_mode & CHMODE_LOW_LATENCY) {
2830 		SK_ERR("low latency is not supported for netif channel");
2831 		err = ENOTSUP;
2832 		goto done;
2833 	}
2834 
2835 	switch (nx_port) {
2836 	case NEXUS_PORT_NET_IF_DEV:
2837 		/*
2838 		 * We have to reject direct user open that's not explicitly
2839 		 * allowed because netif nexuses do not by default have
2840 		 * user memory regions.
2841 		 */
2842 		if (p != kernproc &&
2843 		    (!skywalk_netif_direct_allowed(ifp->if_xname) ||
2844 		    (kauth_cred_issuser(kauth_cred_get()) == 0 &&
2845 		    (anon || nif->nif_dev_nxb == NULL || nxb == NULL ||
2846 		    !nxb_is_equal(nif->nif_dev_nxb, nxb))))) {
2847 			DTRACE_SKYWALK2(direct__not__allowed, struct ifnet *,
2848 			    ifp, struct chreq *, chr);
2849 			err = ENOTSUP;
2850 			goto done;
2851 		}
2852 		if (chr->cr_mode & CHMODE_EVENT_RING) {
2853 			SK_ERR("event ring is not supported for netif dev port channel");
2854 			err = ENOTSUP;
2855 			goto done;
2856 		}
2857 		na = nx_port_get_na(nx, NEXUS_PORT_NET_IF_DEV);
2858 		break;
2859 
2860 	case NEXUS_PORT_NET_IF_HOST:
2861 		if (p != kernproc) {
2862 			err = ENOTSUP;
2863 			goto done;
2864 		}
2865 		if (chr->cr_mode & CHMODE_EVENT_RING) {
2866 			SK_ERR("event ring is not supported for netif host port channel");
2867 			err = ENOTSUP;
2868 			goto done;
2869 		}
2870 		na = nx_port_get_na(nx, NEXUS_PORT_NET_IF_HOST);
2871 		break;
2872 
2873 	default:
2874 		ASSERT(!(chr->cr_mode & CHMODE_CONFIG));
2875 
2876 		NETIF_WLOCK(nif);
2877 		err = nx_port_alloc(nx, nx_port, nxb, &na, p);
2878 		if (err != 0) {
2879 			NETIF_WUNLOCK(nif);
2880 			goto done;
2881 		}
2882 
2883 		if (na == NULL) {
2884 			if (chr->cr_mode & CHMODE_FILTER) {
2885 				err = netif_filter_na_create(nx, chr, &na);
2886 			} else {
2887 				err = netif_vp_na_create(nx, chr, &na);
2888 			}
2889 			if (err != 0) {
2890 				NETIF_WUNLOCK(nif);
2891 				goto done;
2892 			}
2893 			err = nx_port_alloc(nx, nx_port, nxb, &na, p);
2894 			if (err != 0) {
2895 				NETIF_WUNLOCK(nif);
2896 				goto done;
2897 			}
2898 		}
2899 		NETIF_WUNLOCK(nif);
2900 
2901 		break;
2902 	}
2903 
2904 	ASSERT(err == 0);
2905 	ASSERT(na != NULL);
2906 
2907 #if CONFIG_NEXUS_USER_PIPE
2908 	if (NA_OWNED_BY_ANY(na) || na->na_next_pipe > 0) {
2909 #else /* !CONFIG_NEXUS_USER_PIPE */
2910 	if (NA_OWNED_BY_ANY(na)) {
2911 #endif /* !CONFIG_NEXUS_USER_PIPE */
2912 		err = EBUSY;
2913 		na = NULL;
2914 		goto done;
2915 	}
2916 
2917 	*nap = na;
2918 	na_retain_locked(na);
2919 
2920 done:
2921 	ASSERT(err != 0 || na != NULL);
2922 	if (err) {
2923 		SK_ERR("na not found, err(%d)", err);
2924 	} else {
2925 		SK_DF(SK_VERB_NETIF, "found na 0x%llu", na);
2926 	}
2927 	return err;
2928 }
2929 
2930 /* na_krings_create callback for all netif device adapters */
2931 int
2932 nx_netif_dev_krings_create(struct nexus_adapter *na, struct kern_channel *ch)
2933 {
2934 	int ret;
2935 
2936 	ASSERT(na->na_type == NA_NETIF_DEV ||
2937 	    na->na_type == NA_NETIF_COMPAT_DEV);
2938 	/*
2939 	 * Allocate context structures for native netif only, for
2940 	 * IOSkywalkFamily to store its object references.
2941 	 */
2942 	ret = na_rings_mem_setup(na, 0, (na->na_flags & NAF_NATIVE), ch);
2943 
2944 	/*
2945 	 * We mark CKRF_DROP for kernel-only rings (kernel channel
2946 	 * opened by the flowswitch, etc.) to prevent packets from
2947 	 * going thru until after the client of the kernel channel
2948 	 * has fully plumbed things on its side.  For userland-facing
2949 	 * rings (regular channel opened to netif), this is not
2950 	 * required, and so don't mark CKRF_DROP there.
2951 	 */
2952 	if (ret == 0 && NA_KERNEL_ONLY(na)) {
2953 		na_kr_drop(na, TRUE);
2954 	}
2955 
2956 	return ret;
2957 }
2958 
2959 /* call with SK_LOCK held */
2960 void
2961 nx_netif_dev_krings_delete(struct nexus_adapter *na, struct kern_channel *ch,
2962     boolean_t defunct)
2963 {
2964 	ASSERT(na->na_type == NA_NETIF_DEV ||
2965 	    na->na_type == NA_NETIF_COMPAT_DEV);
2966 
2967 	/* see comments in nx_netif_dev_krings_create() */
2968 	if (NA_KERNEL_ONLY(na)) {
2969 		na_kr_drop(na, TRUE);
2970 	}
2971 
2972 	na_rings_mem_teardown(na, ch, defunct);
2973 }
2974 
2975 struct nx_netif *
2976 nx_netif_alloc(zalloc_flags_t how)
2977 {
2978 	struct nx_netif *n;
2979 
2980 	SK_LOCK_ASSERT_HELD();
2981 
2982 	n = zalloc_flags(nx_netif_zone, how | Z_ZERO);
2983 	if (n == NULL) {
2984 		return NULL;
2985 	}
2986 
2987 	NETIF_RWINIT(n);
2988 	os_ref_init(&n->nif_refcnt, NULL);
2989 	SK_DF(SK_VERB_MEM, "netif 0x%llx", SK_KVA(n));
2990 
2991 	return n;
2992 }
2993 
2994 static void
2995 nx_netif_destroy(struct nx_netif *n)
2996 {
2997 	ASSERT(n->nif_dev_nxb == NULL);
2998 	ASSERT(n->nif_host_nxb == NULL);
2999 	ASSERT(os_ref_get_count(&n->nif_refcnt) == 0);
3000 	nx_netif_llink_config_free(n);
3001 	SK_DF(SK_VERB_MEM, "netif 0x%llx", SK_KVA(n));
3002 	NETIF_RWDESTROY(n);
3003 	zfree(nx_netif_zone, n);
3004 }
3005 
3006 void
3007 nx_netif_release(struct nx_netif *n)
3008 {
3009 	SK_LOCK_ASSERT_HELD();
3010 
3011 	SK_DF(SK_VERB_MEM, "netif 0x%llx, refcnt %d", SK_KVA(n),
3012 	    os_ref_get_count(&n->nif_refcnt));
3013 	if (os_ref_release(&n->nif_refcnt) == 0) {
3014 		nx_netif_destroy(n);
3015 	}
3016 }
3017 
3018 void
3019 nx_netif_retain(struct nx_netif *n)
3020 {
3021 	SK_LOCK_ASSERT_HELD();
3022 
3023 	/* retaining an object with a zero refcount is not allowed */
3024 	ASSERT(os_ref_get_count(&n->nif_refcnt) >= 1);
3025 	os_ref_retain(&n->nif_refcnt);
3026 	SK_DF(SK_VERB_MEM, "netif 0x%llx, refcnt %d", SK_KVA(n),
3027 	    os_ref_get_count(&n->nif_refcnt));
3028 }
3029 
/*
 * Free a nx_netif obtained from nx_netif_alloc(); simply an alias for
 * dropping one reference.
 */
void
nx_netif_free(struct nx_netif *n)
{
	nx_netif_release(n);
}
3035 
/*
 * Deliver a channel event for a transmitted packet to the VP adapter
 * owning the flow the packet belongs to.  Only supported on
 * low-latency netif instances.  Returns 0 on success or an errno
 * value; every failure path is also counted in NETIF_STATS_EV_DROP.
 */
static int
nx_netif_na_channel_event_notify(struct nexus_adapter *na,
    struct __kern_packet *kpkt, struct __kern_channel_event *ev,
    uint16_t ev_len)
{
	int err;
	struct netif_flow *nf;
	struct nexus_adapter *netif_vpna;
	struct nx_netif *nif = NIFNA(na)->nifna_netif;
	struct netif_stats *nifs = &NIFNA(na)->nifna_netif->nif_stats;

	NETIF_RLOCK(nif);
	if (!NETIF_IS_LOW_LATENCY(nif)) {
		err = ENOTSUP;
		goto error;
	}
	if (__improbable(!NA_IS_ACTIVE(na))) {
		STATS_INC(nifs, NETIF_STATS_EV_DROP_NA_INACTIVE);
		err = ENXIO;
		goto error;
	}
	if (__improbable(NA_IS_DEFUNCT(na))) {
		STATS_INC(nifs, NETIF_STATS_EV_DROP_NA_DEFUNCT);
		err = ENXIO;
		goto error;
	}
	if (__improbable(nif->nif_vp_cnt == 0)) {
		/* no VP adapter around to consume the event */
		STATS_INC(nifs, NETIF_STATS_EV_DROP_NO_VPNA);
		err = ENXIO;
		goto error;
	}
	/* The returned netif flow is refcounted. */
	nf = nx_netif_flow_classify(nif, kpkt, NETIF_FLOW_OUTBOUND);
	if (nf == NULL) {
		SK_ERR("unclassified event (%d) dropped", ev->ev_type);
		STATS_INC(nifs, NETIF_STATS_EV_DROP_DEMUX_ERR);
		err = ENOENT;
		goto error;
	}
	/* forward the event to the VP adapter registered for this flow */
	netif_vpna = (struct nexus_adapter *)nf->nf_cb_arg;
	if (netif_vpna->na_channel_event_notify != NULL) {
		err = netif_vpna->na_channel_event_notify(netif_vpna, kpkt,
		    ev, ev_len);
	} else {
		STATS_INC(nifs, NETIF_STATS_EV_DROP_EV_VPNA_NOTSUP);
		err = ENOTSUP;
	}
	nx_netif_flow_release(nif, nf);
	NETIF_RUNLOCK(nif);
	nf = NULL;
	return err;

error:
	STATS_INC(nifs, NETIF_STATS_EV_DROP);
	NETIF_RUNLOCK(nif);
	return err;
}
3093 
/*
 * Copy an interface advisory report into the shared-memory advisory
 * region visible to user channels and post a notification.  If a
 * flowswitch is attached to this netif, the flowswitch's nexus
 * advisory region is used; otherwise the netif's own region is.
 * Always returns 0.
 */
static int
nx_netif_interface_advisory_notify_common(struct kern_nexus *nx,
    const struct ifnet_interface_advisory *advisory)
{
	struct kern_nexus *notify_nx;
	struct __kern_netif_intf_advisory *intf_adv;
	struct nx_netif *nif = NX_NETIF_PRIVATE(nx);

	if (nif->nif_fsw_nxadv != NULL) {
		/* flowswitch attached: publish via its advisory region */
		ASSERT(nif->nif_fsw != NULL);
		intf_adv = &nif->nif_fsw_nxadv->_nxadv_intf_adv;
		notify_nx = nif->nif_fsw->fsw_nx;
	} else {
		intf_adv = &nif->nif_netif_nxadv->__kern_intf_adv;
		notify_nx = nif->nif_nx;
	}
	/*
	 * copy the advisory report in shared memory; the checksum is
	 * stored alongside so readers can detect a torn update
	 */
	intf_adv->cksum = os_cpu_copy_in_cksum(advisory, &intf_adv->adv,
	    sizeof(*advisory), 0);
	STATS_INC(&nif->nif_stats, NETIF_STATS_IF_ADV_UPD_RECV);
	/*
	 * notify user channels on advisory report availability
	 */
	nx_interface_advisory_notify(notify_nx);
	return 0;
}
3122 
3123 int
3124 nx_netif_interface_advisory_report(struct nexus_adapter *devna,
3125     const struct ifnet_interface_advisory *advisory)
3126 {
3127 	ASSERT(devna->na_type == NA_NETIF_DEV);
3128 	if (__improbable(!NA_IS_ACTIVE(devna))) {
3129 		return ENXIO;
3130 	}
3131 	if (__improbable(NA_IS_DEFUNCT(devna))) {
3132 		return ENXIO;
3133 	}
3134 	return nx_netif_interface_advisory_notify_common(devna->na_nx,
3135 	           advisory);
3136 }
3137 
3138 static errno_t
3139 nx_netif_interface_advisory_notify(void *kern_ctx,
3140     const struct ifnet_interface_advisory *advisory)
3141 {
3142 	if (__improbable(kern_ctx == NULL || advisory == NULL ||
3143 	    advisory->version != IF_INTERFACE_ADVISORY_VERSION_CURRENT)) {
3144 		return EINVAL;
3145 	}
3146 	if (__improbable((advisory->direction !=
3147 	    IF_INTERFACE_ADVISORY_DIRECTION_TX) &&
3148 	    (advisory->direction != IF_INTERFACE_ADVISORY_DIRECTION_RX))) {
3149 		return EINVAL;
3150 	}
3151 	return nx_netif_interface_advisory_notify_common(kern_ctx, advisory);
3152 }
3153 
3154 void
3155 nx_netif_config_interface_advisory(struct kern_nexus *nx, bool enable)
3156 {
3157 	struct kern_nexus *nx_netif;
3158 	struct nx_netif *nif;
3159 
3160 	if (NX_REJECT_ACT(nx) || (nx->nx_flags & NXF_CLOSED) != 0) {
3161 		return;
3162 	}
3163 	if (NX_PROV(nx)->nxprov_params->nxp_type == NEXUS_TYPE_FLOW_SWITCH) {
3164 		struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
3165 		nx_netif = fsw->fsw_nifna->na_nx;
3166 	} else {
3167 		nx_netif = nx;
3168 	}
3169 	ASSERT(NX_PROV(nx_netif)->nxprov_params->nxp_type == NEXUS_TYPE_NET_IF);
3170 	nif = NX_NETIF_PRIVATE(nx_netif);
3171 	if (nif->nif_intf_adv_config != NULL) {
3172 		nif->nif_intf_adv_config(nif->nif_intf_adv_prov_ctx, enable);
3173 	}
3174 }
3175 
3176 /*
3177  * This function has no use anymore since we are now passing truncated packets
3178  * to filters. We keep this logic just in case we need to prevent certain
3179  * packets from being passed to filters.
3180  */
3181 static boolean_t
3182 packet_is_filterable(struct nexus_netif_adapter *nifna,
3183     struct __kern_packet *pkt)
3184 {
3185 #pragma unused (nifna, pkt)
3186 	return TRUE;
3187 }
3188 
3189 /*
3190  * This function is only meant for supporting the RX path because the TX path
3191  * will not send packets > MTU size due to the disabling of TSO when filters
3192  * are enabled.
3193  */
3194 static void
3195 get_filterable_packets(struct nexus_netif_adapter *nifna,
3196     struct __kern_packet *pkt_chain, struct __kern_packet **fpkt_chain,
3197     struct __kern_packet **passthrough_chain)
3198 {
3199 	struct nx_netif *nif = nifna->nifna_netif;
3200 	struct netif_stats *nifs = &nif->nif_stats;
3201 	struct __kern_packet *pkt = pkt_chain, *next, *fpkt;
3202 	struct __kern_packet *fpkt_head = NULL, *passthrough_head = NULL;
3203 	struct __kern_packet **fpkt_tailp = &fpkt_head;
3204 	struct __kern_packet **passthrough_tailp = &passthrough_head;
3205 	int fcnt = 0, pcnt = 0, dcnt = 0;
3206 
3207 	while (pkt != NULL) {
3208 		next = pkt->pkt_nextpkt;
3209 		pkt->pkt_nextpkt = NULL;
3210 
3211 		if (!packet_is_filterable(nifna, pkt)) {
3212 			pcnt++;
3213 			*passthrough_tailp = pkt;
3214 			passthrough_tailp = &pkt->pkt_nextpkt;
3215 			pkt = next;
3216 			continue;
3217 		}
3218 		fpkt = nx_netif_pkt_to_filter_pkt(nifna, pkt, NETIF_CONVERT_RX);
3219 		if (fpkt != NULL) {
3220 			fcnt++;
3221 			*fpkt_tailp = fpkt;
3222 			fpkt_tailp = &fpkt->pkt_nextpkt;
3223 		} else {
3224 			dcnt++;
3225 		}
3226 		pkt = next;
3227 	}
3228 	*fpkt_chain = fpkt_head;
3229 	*passthrough_chain = passthrough_head;
3230 
3231 	/*
3232 	 * No need to increment drop stats because that's already
3233 	 * done in nx_netif_pkt_to_filter_pkt.
3234 	 */
3235 	STATS_ADD(nifs, NETIF_STATS_FILTER_RX_NOT_FILTERABLE, pcnt);
3236 	DTRACE_SKYWALK6(filterable, struct nexus_netif_adapter *, nifna,
3237 	    int, fcnt, int, pcnt, int, dcnt, struct __kern_packet *,
3238 	    fpkt_head, struct __kern_packet *, passthrough_head);
3239 }
3240 
3241 /*
3242  * This is only used by ring-based notify functions for now.
3243  * When a qset-based notify becomes available, this function can be used
3244  * unmodified.
3245  */
3246 void
3247 netif_receive(struct nexus_netif_adapter *nifna,
3248     struct __kern_packet *pkt_chain, struct nexus_pkt_stats *stats)
3249 {
3250 	struct nx_netif *nif = nifna->nifna_netif;
3251 	struct nexus_adapter *na = &nifna->nifna_up;
3252 	struct netif_stats *nifs = &nif->nif_stats;
3253 	int err, dropcnt, dropstat = -1;
3254 
3255 	/* update our work timestamp */
3256 	na->na_work_ts = _net_uptime;
3257 
3258 	if (nif->nif_filter_cnt > 0) {
3259 		struct __kern_packet *fpkt_chain = NULL;
3260 		struct __kern_packet *passthrough_chain = NULL;
3261 
3262 		get_filterable_packets(nifna, pkt_chain, &fpkt_chain,
3263 		    &passthrough_chain);
3264 		if (fpkt_chain != NULL) {
3265 			(void) nx_netif_filter_inject(nifna, NULL, fpkt_chain,
3266 			    NETIF_FILTER_RX | NETIF_FILTER_SOURCE);
3267 		}
3268 		if (passthrough_chain != NULL) {
3269 			pkt_chain = passthrough_chain;
3270 		} else {
3271 			return;
3272 		}
3273 	} else if (nx_netif_filter_default_drop != 0) {
3274 		DTRACE_SKYWALK2(rx__default__drop, struct nx_netif *, nif,
3275 		    struct __kern_packet *, pkt_chain);
3276 		dropstat = NETIF_STATS_FILTER_DROP_DEFAULT;
3277 		goto drop;
3278 	}
3279 	if (nif->nif_flow_cnt > 0) {
3280 		struct __kern_packet *remain = NULL;
3281 
3282 		err = nx_netif_demux(nifna, pkt_chain, &remain,
3283 		    NETIF_FLOW_SOURCE);
3284 		if (remain == NULL) {
3285 			return;
3286 		}
3287 		pkt_chain = remain;
3288 	}
3289 	if (na->na_rx != NULL) {
3290 		na->na_rx(na, pkt_chain, stats);
3291 	} else {
3292 		DTRACE_SKYWALK2(no__rx__cb, struct nx_netif *, nif,
3293 		    struct __kern_packet *, pkt_chain);
3294 		dropstat = NETIF_STATS_DROP_NO_RX_CB;
3295 		goto drop;
3296 	}
3297 	return;
3298 drop:
3299 	dropcnt = 0;
3300 	nx_netif_free_packet_chain(pkt_chain, &dropcnt);
3301 	if (dropstat != -1) {
3302 		STATS_ADD(nifs, dropstat, dropcnt);
3303 	}
3304 	STATS_ADD(nifs, NETIF_STATS_DROP, dropcnt);
3305 }
3306 
/*
 * Token-bucket rate limiter for the RX ring.  Returns the (possibly
 * reduced) end slot index up to which packets may be consumed; tokens
 * are replenished in proportion to the time elapsed since the last
 * visit.  A rate of 0 disables limiting entirely.  *rate_limited is
 * set to TRUE iff the interval [begin, end) was clipped.
 */
static slot_idx_t
netif_rate_limit(struct __kern_channel_ring *r, uint64_t rate,
    slot_idx_t begin, slot_idx_t end, boolean_t *rate_limited)
{
	uint64_t elapsed;
	uint64_t now;
	struct __kern_packet *pkt;
	clock_sec_t sec;
	clock_usec_t usec;
	slot_idx_t i;

	if (__probable(rate == 0)) {
		return end;
	}

	/* init tbr if not so */
	if (__improbable(r->ckr_tbr_token == CKR_TBR_TOKEN_INVALID)) {
		r->ckr_tbr_token = rate;
		r->ckr_tbr_depth = rate;
		r->ckr_tbr_last = mach_absolute_time();
	} else {
		/* replenish tokens for the elapsed time, capped at depth */
		now = mach_absolute_time();
		elapsed = now - r->ckr_tbr_last;
		absolutetime_to_microtime(elapsed, &sec, &usec);
		r->ckr_tbr_token +=
		    ((sec * USEC_PER_SEC + usec) * rate / USEC_PER_SEC);
		if (__improbable(r->ckr_tbr_token > r->ckr_tbr_depth)) {
			r->ckr_tbr_token = r->ckr_tbr_depth;
		}
		r->ckr_tbr_last = now;
	}

	/* charge tokens per packet (in bits) until the bucket runs dry */
	*rate_limited = FALSE;
	for (i = begin; i != end; i = SLOT_NEXT(i, r->ckr_lim)) {
		pkt = KR_KSD(r, i)->sd_pkt;
		if (__improbable(pkt == NULL)) {
			continue;
		}
		if (__improbable(r->ckr_tbr_token <= 0)) {
			/* out of tokens; clip the batch at this slot */
			end = i;
			*rate_limited = TRUE;
			break;
		}
		r->ckr_tbr_token -= pkt->pkt_length * 8;
	}

	SK_DF(SK_VERB_FSW | SK_VERB_RX, "ckr %p %s rate limited at %d",
	    r, r->ckr_name, i);

	return end;
}
3358 
3359 SK_NO_INLINE_ATTRIBUTE
3360 static struct __kern_packet *
3361 consume_pkts(struct __kern_channel_ring *ring, slot_idx_t end)
3362 {
3363 	struct __kern_packet *pkt_chain = NULL, **tailp = &pkt_chain;
3364 	slot_idx_t idx = ring->ckr_rhead;
3365 
3366 	while (idx != end) {
3367 		struct __kern_slot_desc *ksd = KR_KSD(ring, idx);
3368 		struct __kern_packet *pkt = ksd->sd_pkt;
3369 
3370 		ASSERT(pkt->pkt_nextpkt == NULL);
3371 		KR_SLOT_DETACH_METADATA(ring, ksd);
3372 		*tailp = pkt;
3373 		tailp = &pkt->pkt_nextpkt;
3374 		idx = SLOT_NEXT(idx, ring->ckr_lim);
3375 	}
3376 	ring->ckr_rhead = end;
3377 	ring->ckr_rtail = ring->ckr_ktail;
3378 	return pkt_chain;
3379 }
3380 
/*
 * Default RX ring notify callback used when the netif is plumbed under
 * a flowswitch (or low-latency workloop).  Syncs the ring, consumes
 * newly arrived packets (subject to the input rate limiter) and feeds
 * them through netif_receive().  Returns 0 or an errno value.
 */
int
netif_rx_notify_default(struct __kern_channel_ring *ring, struct proc *p,
    uint32_t flags)
{
	struct nexus_adapter *hwna;
	struct nexus_netif_adapter *nifna;
	struct nx_netif *nif;
	struct __kern_packet *pkt_chain;
	struct nexus_pkt_stats stats;
	sk_protect_t protect;
	slot_idx_t ktail;
	int err = 0;

	KDBG((SK_KTRACE_NETIF_RX_NOTIFY_DEFAULT | DBG_FUNC_START),
	    SK_KVA(ring));

	ASSERT(ring->ckr_tx == NR_RX);
	ASSERT(!NA_KERNEL_ONLY(KRNA(ring)) || KR_KERNEL_ONLY(ring));

	err = kr_enter(ring, ((flags & NA_NOTEF_CAN_SLEEP) != 0));
	if (err != 0) {
		/* not a serious error, so no need to be chatty here */
		SK_DF(SK_VERB_FSW,
		    "hwna \"%s\" (0x%llx) kr \"%s\" (0x%llx) krflags 0x%b "
		    "(%d)", KRNA(ring)->na_name, SK_KVA(KRNA(ring)),
		    ring->ckr_name, SK_KVA(ring), ring->ckr_flags,
		    CKRF_BITS, err);
		goto out;
	}
	if (__improbable(KR_DROP(ring))) {
		/* ring is in drop mode; bail out */
		kr_exit(ring);
		err = ENODEV;
		goto out;
	}
	hwna = KRNA(ring);
	nifna = NIFNA(hwna);
	nif = nifna->nifna_netif;
	if (__improbable(hwna->na_ifp == NULL)) {
		/* the interface is going away */
		kr_exit(ring);
		err = ENODEV;
		goto out;
	}
	protect = sk_sync_protect();
	err = ring->ckr_na_sync(ring, p, 0);
	if (err != 0 && err != EAGAIN) {
		goto put_out;
	}

	/* read the tail pointer once */
	ktail = ring->ckr_ktail;
	if (__improbable(ring->ckr_khead == ktail)) {
		SK_DF(SK_VERB_FSW | SK_VERB_NOTIFY | SK_VERB_RX,
		    "how strange, interrupt with no packets on hwna "
		    "\"%s\" (0x%llx)", KRNA(ring)->na_name, SK_KVA(KRNA(ring)));
		goto put_out;
	}
	/* clip the batch if an input rate limit is configured */
	ktail = netif_rate_limit(ring, nif->nif_input_rate, ring->ckr_rhead,
	    ktail, &ring->ckr_rate_limited);

	pkt_chain = consume_pkts(ring, ktail);
	if (pkt_chain != NULL) {
		netif_receive(nifna, pkt_chain, &stats);

		/* feed the delivered packet/byte counts to RX mitigation */
		if (ring->ckr_netif_mit_stats != NULL &&
		    stats.nps_pkts != 0 && stats.nps_bytes != 0) {
			ring->ckr_netif_mit_stats(ring, stats.nps_pkts,
			    stats.nps_bytes);
		}
	}

put_out:
	sk_sync_unprotect(protect);
	kr_exit(ring);

out:
	KDBG((SK_KTRACE_NETIF_RX_NOTIFY_DEFAULT | DBG_FUNC_END),
	    SK_KVA(ring), err);
	return err;
}
3460 
/*
 * Fast-path RX notify callback used when the driver supports
 * rx_sync_packets; packet chains are pulled directly from the driver
 * into the scratch area, bypassing the dev ring slots.  Returns 0 or
 * an errno value.
 */
int
netif_rx_notify_fast(struct __kern_channel_ring *ring, struct proc *p,
    uint32_t flags)
{
#pragma unused(p, flags)
	sk_protect_t protect;
	struct nexus_adapter *hwna;
	struct nexus_pkt_stats stats = {};
	uint32_t i, count;
	int err = 0;

	KDBG((SK_KTRACE_NETIF_RX_NOTIFY_FAST | DBG_FUNC_START),
	    SK_KVA(ring));

	/* XXX
	 * sk_sync_protect() is not needed for this case because
	 * we are not using the dev ring. Unfortunately lots of
	 * macros used by fsw still require this.
	 */
	protect = sk_sync_protect();
	hwna = KRNA(ring);
	count = na_get_nslots(hwna, NR_RX);
	err = nx_rx_sync_packets(ring, ring->ckr_scratch, &count);
	if (__improbable(err != 0)) {
		SK_ERR("nx_rx_sync_packets failed: %d", err);
		DTRACE_SKYWALK2(rx__sync__packets__failed,
		    struct __kern_channel_ring *, ring, int, err);
		goto out;
	}
	DTRACE_SKYWALK1(chain__count, uint32_t, count);
	/* each scratch entry holds one packet chain from the driver */
	for (i = 0; i < count; i++) {
		struct __kern_packet *pkt_chain;

		pkt_chain = SK_PTR_ADDR_KPKT(ring->ckr_scratch[i]);
		ASSERT(pkt_chain != NULL);
		netif_receive(NIFNA(KRNA(ring)), pkt_chain, &stats);

		/* feed the delivered packet/byte counts to RX mitigation */
		if (ring->ckr_netif_mit_stats != NULL &&
		    stats.nps_pkts != 0 && stats.nps_bytes != 0) {
			ring->ckr_netif_mit_stats(ring, stats.nps_pkts,
			    stats.nps_bytes);
		}
	}
out:
	sk_sync_unprotect(protect);
	KDBG((SK_KTRACE_NETIF_RX_NOTIFY_FAST | DBG_FUNC_END),
	    SK_KVA(ring), err);
	return err;
}
3510 
3511 
3512 /*
3513  * Configure the NA to operate in a particular mode.
3514  */
3515 static channel_ring_notify_t
3516 netif_hwna_get_notify(struct __kern_channel_ring *ring, netif_mode_t mode)
3517 {
3518 	channel_ring_notify_t notify = NULL;
3519 	boolean_t has_sync_pkts = (sk_rx_sync_packets != 0 &&
3520 	    nx_has_rx_sync_packets(ring));
3521 
3522 	if (mode == NETIF_MODE_FSW) {
3523 		notify = (has_sync_pkts ? netif_rx_notify_fast :
3524 		    netif_rx_notify_default);
3525 	} else if (mode == NETIF_MODE_LLW) {
3526 		notify = (has_sync_pkts ? netif_llw_rx_notify_fast :
3527 		    netif_llw_rx_notify_default);
3528 	}
3529 	return notify;
3530 }
3531 
3532 
3533 static uint32_t
3534 netif_mode_to_flag(netif_mode_t mode)
3535 {
3536 	uint32_t flag = 0;
3537 
3538 	if (mode == NETIF_MODE_FSW) {
3539 		flag = NAF_MODE_FSW;
3540 	} else if (mode == NETIF_MODE_LLW) {
3541 		flag = NAF_MODE_LLW;
3542 	}
3543 	return flag;
3544 }
3545 
/*
 * Install (set == TRUE) or remove (set == FALSE) the mode-specific RX
 * notify callback on every RX ring of the hardware adapter, and update
 * the adapter's NAF_MODE_* flag and RX upcall accordingly.  The
 * previous notify callback is stashed so it can be restored on clear.
 */
static void
netif_hwna_config_mode(struct nexus_adapter *hwna, netif_mode_t mode,
    void (*rx)(struct nexus_adapter *, struct __kern_packet *,
    struct nexus_pkt_stats *), boolean_t set)
{
	uint32_t i;
	uint32_t flag;

	ASSERT(hwna->na_type == NA_NETIF_DEV ||
	    hwna->na_type == NA_NETIF_COMPAT_DEV);

	for (i = 0; i < na_get_nrings(hwna, NR_RX); i++) {
		struct __kern_channel_ring *kr = &NAKR(hwna, NR_RX)[i];
		channel_ring_notify_t notify = netif_hwna_get_notify(kr, mode);

		if (set) {
			/* save the old notify so clear can restore it */
			kr->ckr_save_notify = kr->ckr_netif_notify;
			kr->ckr_netif_notify = notify;
		} else {
			kr->ckr_netif_notify = kr->ckr_save_notify;
			kr->ckr_save_notify = NULL;
		}
	}
	if (set) {
		hwna->na_rx = rx;
		flag = netif_mode_to_flag(mode);
		atomic_bitset_32(&hwna->na_flags, flag);
	} else {
		/* clearing drops the upcall and both mode flags */
		hwna->na_rx = NULL;
		atomic_bitclear_32(&hwna->na_flags,
		    (NAF_MODE_FSW | NAF_MODE_LLW));
	}
}
3579 
3580 void
3581 netif_hwna_set_mode(struct nexus_adapter *hwna, netif_mode_t mode,
3582     void (*rx)(struct nexus_adapter *, struct __kern_packet *,
3583     struct nexus_pkt_stats *))
3584 {
3585 	return netif_hwna_config_mode(hwna, mode, rx, TRUE);
3586 }
3587 
3588 void
3589 netif_hwna_clear_mode(struct nexus_adapter *hwna)
3590 {
3591 	return netif_hwna_config_mode(hwna, NETIF_MODE_NONE, NULL, FALSE);
3592 }
3593 
/*
 * Re-inject a packet chain (coming back from a filter) into the
 * regular RX path of the adapter.  The chain is dropped (and counted)
 * if the adapter is not owned by a flowswitch or the ring is in drop
 * mode.
 */
static void
netif_inject_rx(struct nexus_adapter *na, struct __kern_packet *pkt_chain)
{
	struct nexus_netif_adapter *nifna = NIFNA(na);
	struct nx_netif *nif = nifna->nifna_netif;
	struct netif_stats *nifs = &nif->nif_stats;
	struct __kern_channel_ring *r;
	struct nexus_pkt_stats stats;
	sk_protect_t protect;
	boolean_t ring_drop = FALSE;
	int err, dropcnt;

	if (!NA_OWNED_BY_FSW(na)) {
		DTRACE_SKYWALK1(fsw__disabled, struct nexus_adapter *, na);
		goto fail;
	}
	ASSERT(na->na_rx != NULL);

	/*
	 * XXX
	 * This function is called when a filter injects a packet back to the
	 * regular RX path. We can assume the ring is 0 for now because RSS
	 * is not supported. This needs to be revisited when we add support for
	 * RSS.
	 */
	r = &na->na_rx_rings[0];
	ASSERT(r->ckr_tx == NR_RX);
	err = kr_enter(r, TRUE);
	VERIFY(err == 0);

	if (__improbable(KR_DROP(r))) {
		kr_exit(r);
		DTRACE_SKYWALK2(ring__drop, struct nexus_adapter *, na,
		    struct __kern_channel_ring *, r);
		ring_drop = TRUE;
		goto fail;
	}
	protect = sk_sync_protect();
	na->na_rx(na, pkt_chain, &stats);

	/* feed the delivered packet/byte counts to RX mitigation */
	if (r->ckr_netif_mit_stats != NULL &&
	    stats.nps_pkts != 0 && stats.nps_bytes != 0) {
		r->ckr_netif_mit_stats(r, stats.nps_pkts, stats.nps_bytes);
	}
	sk_sync_unprotect(protect);

	kr_exit(r);
	return;

fail:
	dropcnt = 0;
	nx_netif_free_packet_chain(pkt_chain, &dropcnt);
	if (ring_drop) {
		STATS_ADD(nifs, NETIF_STATS_DROP_KRDROP_MODE, dropcnt);
	}
	STATS_ADD(nifs, NETIF_STATS_DROP, dropcnt);
}
3651 
3652 /*
3653  * This is called when an inbound packet has traversed all filters.
3654  */
3655 errno_t
3656 nx_netif_filter_rx_cb(struct nexus_netif_adapter *nifna,
3657     struct __kern_packet *fpkt_chain, uint32_t flags)
3658 {
3659 #pragma unused (flags)
3660 	struct nx_netif *nif = nifna->nifna_netif;
3661 	struct netif_stats *nifs = &nif->nif_stats;
3662 	struct nexus_adapter *na = &nifna->nifna_up;
3663 	struct __kern_packet *pkt_chain;
3664 	int err;
3665 
3666 	pkt_chain = nx_netif_filter_pkt_to_pkt_chain(nifna,
3667 	    fpkt_chain, NETIF_CONVERT_RX);
3668 	if (pkt_chain == NULL) {
3669 		return ENOMEM;
3670 	}
3671 	if (nif->nif_flow_cnt > 0) {
3672 		struct __kern_packet *remain = NULL;
3673 
3674 		err = nx_netif_demux(nifna, pkt_chain, &remain,
3675 		    NETIF_FLOW_INJECT);
3676 		if (remain == NULL) {
3677 			return err;
3678 		}
3679 		pkt_chain = remain;
3680 	}
3681 	if (na->na_rx != NULL) {
3682 		netif_inject_rx(na, pkt_chain);
3683 	} else {
3684 		int dropcnt = 0;
3685 		nx_netif_free_packet_chain(pkt_chain, &dropcnt);
3686 		STATS_ADD(nifs,
3687 		    NETIF_STATS_FILTER_DROP_NO_RX_CB, dropcnt);
3688 		STATS_ADD(nifs, NETIF_STATS_DROP, dropcnt);
3689 	}
3690 	return 0;
3691 }
3692 
3693 /*
3694  * This is called when an outbound packet has traversed all filters.
3695  */
3696 errno_t
3697 nx_netif_filter_tx_cb(struct nexus_netif_adapter *nifna,
3698     struct __kern_packet *fpkt_chain, uint32_t flags)
3699 {
3700 #pragma unused (flags)
3701 	struct nx_netif *nif = nifna->nifna_netif;
3702 	struct nexus_adapter *na = &nifna->nifna_up;
3703 	int err;
3704 
3705 	if (NETIF_IS_COMPAT(nif)) {
3706 		struct mbuf *m_chain;
3707 		mbuf_svc_class_t sc;
3708 
3709 		m_chain = nx_netif_filter_pkt_to_mbuf_chain(nifna,
3710 		    fpkt_chain, NETIF_CONVERT_TX);
3711 		if (m_chain == NULL) {
3712 			return ENOMEM;
3713 		}
3714 		/*
3715 		 * All packets in the chain have the same service class.
3716 		 * If the sc is missing or invalid, a valid value will be
3717 		 * returned.
3718 		 */
3719 		sc = mbuf_get_service_class(m_chain);
3720 		err = nx_netif_filter_tx_processed_mbuf_enqueue(nifna,
3721 		    sc, m_chain);
3722 	} else {
3723 		struct __kern_packet *pkt_chain;
3724 		kern_packet_svc_class_t sc;
3725 
3726 		pkt_chain = nx_netif_filter_pkt_to_pkt_chain(nifna,
3727 		    fpkt_chain, NETIF_CONVERT_TX);
3728 		if (pkt_chain == NULL) {
3729 			return ENOMEM;
3730 		}
3731 		/*
3732 		 * All packets in the chain have the same service class.
3733 		 * If the sc is missing or invalid, a valid value will be
3734 		 * returned.
3735 		 */
3736 		sc = kern_packet_get_service_class(SK_PKT2PH(pkt_chain));
3737 		err = nx_netif_filter_tx_processed_pkt_enqueue(nifna,
3738 		    sc, pkt_chain);
3739 	}
3740 	/* Tell driver to resume dequeuing */
3741 	ifnet_start(na->na_ifp);
3742 	return err;
3743 }
3744 
/*
 * Virtual-port adapters require no adjustment to the memory region
 * parameters; this exists to satisfy the adapter interface.
 */
void
nx_netif_vp_region_params_adjust(struct nexus_adapter *na,
    struct skmem_region_params *srp)
{
#pragma unused(na, srp)
}
3752 
/* returns true, if starter thread is utilized */
static bool
netif_use_starter_thread(struct ifnet *ifp, uint32_t flags)
{
#if (DEVELOPMENT || DEBUG)
	/* test knob: force every transmit through the starter thread */
	if (__improbable(nx_netif_force_ifnet_start != 0)) {
		ifnet_start(ifp);
		return true;
	}
#endif /* DEVELOPMENT || DEBUG */
	/*
	 * use starter thread in following conditions:
	 * - interface is not skywalk native
	 * - interface attached to virtual driver (ipsec, utun)
	 * - TBR is enabled
	 * - delayed start mechanism is in use
	 * - remaining stack space on the thread is not enough for driver
	 * - caller is in rx workloop context
	 * - caller is from the flowswitch path doing ARP resolving
	 * - caller requires the use of starter thread (stack usage)
	 */
	if (!SKYWALK_NATIVE(ifp) || NA(ifp) == NULL ||
	    !NA_IS_ACTIVE(&NA(ifp)->nifna_up) ||
	    ((NA(ifp)->nifna_up.na_flags & NAF_VIRTUAL_DEVICE) != 0) ||
	    IFCQ_TBR_IS_ENABLED(ifp->if_snd) ||
	    (ifp->if_eflags & IFEF_ENQUEUE_MULTI) ||
	    sk_is_rx_notify_protected() ||
	    sk_is_async_transmit_protected() ||
	    (sk_is_sync_protected() && (flags & NETIF_XMIT_FLAG_HOST) != 0)) {
		DTRACE_SKYWALK2(use__starter__thread, struct ifnet *, ifp,
		    uint32_t, flags);
		ifnet_start(ifp);
		return true;
	}
	lck_mtx_lock_spin(&ifp->if_start_lock);
	/* interface is flow controlled */
	if (__improbable(ifp->if_start_flags & IFSF_FLOW_CONTROLLED)) {
		lck_mtx_unlock(&ifp->if_start_lock);
		return true;
	}
	/* if starter thread is active, utilize it */
	if (ifp->if_start_active) {
		ifp->if_start_req++;
		lck_mtx_unlock(&ifp->if_start_lock);
		return true;
	}
	lck_mtx_unlock(&ifp->if_start_lock);
	/* Check remaining stack space */
	if ((OSKernelStackRemaining() < NX_NETIF_MIN_DRIVER_STACK_SIZE)) {
		ifnet_start(ifp);
		return true;
	}
	return false;
}
3807 
3808 void
3809 netif_transmit(struct ifnet *ifp, uint32_t flags)
3810 {
3811 	if (netif_use_starter_thread(ifp, flags)) {
3812 		return;
3813 	}
3814 	/*
3815 	 * If no longer attached, don't issue doorbell as ifp
3816 	 * is being destroyed; else hold an IO refcnt to
3817 	 * prevent the interface from being detached.
3818 	 */
3819 	if (!ifnet_datamov_begin(ifp)) {
3820 		return;
3821 	}
3822 	nx_netif_doorbell_internal(ifp, flags);
3823 	/*
3824 	 * Release the IO refcnt taken above.
3825 	 */
3826 	ifnet_datamov_end(ifp);
3827 }
3828 
3829 static struct ifclassq *
3830 netif_get_default_ifcq(struct nexus_adapter *hwna)
3831 {
3832 	struct nx_netif *nif;
3833 	struct ifclassq *ifcq;
3834 
3835 	nif = NX_NETIF_PRIVATE(hwna->na_nx);
3836 	if (NETIF_LLINK_ENABLED(nif)) {
3837 		struct netif_qset *qset;
3838 
3839 		/*
3840 		 * Use the default ifcq for now.
3841 		 * In the future this could be chosen by the caller.
3842 		 */
3843 		qset = nx_netif_get_default_qset_noref(nif);
3844 		ASSERT(qset != NULL);
3845 		ifcq = qset->nqs_ifcq;
3846 	} else {
3847 		ifcq = nif->nif_ifp->if_snd;
3848 	}
3849 	return ifcq;
3850 }
3851 
3852 static errno_t
3853 netif_deq_packets(struct nexus_adapter *hwna, struct ifclassq *ifcq,
3854     uint32_t pkt_limit, uint32_t byte_limit, struct __kern_packet **head,
3855     boolean_t *pkts_pending, kern_packet_svc_class_t sc)
3856 {
3857 	classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
3858 	struct ifnet *ifp = hwna->na_ifp;
3859 	errno_t rc;
3860 
3861 	ASSERT(ifp != NULL);
3862 	ASSERT(ifp->if_output_sched_model < IFNET_SCHED_MODEL_MAX);
3863 	ASSERT((pkt_limit != 0) && (byte_limit != 0));
3864 
3865 	if (ifcq == NULL) {
3866 		ifcq = netif_get_default_ifcq(hwna);
3867 	}
3868 	if (ifp->if_output_sched_model == IFNET_SCHED_MODEL_DRIVER_MANAGED) {
3869 		rc = ifclassq_dequeue_sc(ifcq, (mbuf_svc_class_t)sc,
3870 		    pkt_limit, byte_limit, &pkt_head, NULL, NULL, NULL);
3871 	} else {
3872 		rc = ifclassq_dequeue(ifcq, pkt_limit, byte_limit,
3873 		    &pkt_head, NULL, NULL, NULL);
3874 	}
3875 	ASSERT((rc == 0) || (rc == EAGAIN));
3876 	ASSERT((pkt_head.cp_ptype == QP_PACKET) || (pkt_head.cp_kpkt == NULL));
3877 
3878 	if (IFCQ_LEN(ifcq) != 0) {
3879 		*pkts_pending = TRUE;
3880 	} else {
3881 		*pkts_pending = FALSE;
3882 	}
3883 
3884 	*head = pkt_head.cp_kpkt;
3885 	return rc;
3886 }
3887 
#if SK_LOG
/* Hoisted out of line to reduce kernel stack footprint */
SK_LOG_ATTRIBUTE
static void
netif_no_ring_space_log(const struct nexus_adapter *na,
    const kern_channel_ring_t ring)
{
	/*
	 * The format previously repeated the per-ring conversion group
	 * twice while only one ring's worth of arguments was supplied,
	 * leaving six conversions unmatched (undefined behavior for
	 * printf-style formatting).  Log exactly one ring's state.
	 */
	SK_DF(SK_VERB_SYNC | SK_VERB_TX,
	    "no ring space: na \"%s\" [%u] "
	    "\"%s\"(kh %u kt %u kl %u | rh %u rt %u)",
	    na->na_name, ring->ckr_ring_id,
	    ring->ckr_name, ring->ckr_khead,
	    ring->ckr_ktail, ring->ckr_klease,
	    ring->ckr_rhead, ring->ckr_rtail);
}
#endif /* SK_LOG */
3905 
/*
 * netif refill function for rings
 *
 * Dequeues packets from the interface's class queues and attaches them
 * to free slots on the device TX ring, then runs the ring's sync routine
 * so the driver picks them up.  *pkts_pending is set when packets remain
 * queued after this pass.  Returns 0 on success, EAGAIN when the ring is
 * full, or another errno on failure.
 */
errno_t
netif_ring_tx_refill(const kern_channel_ring_t ring, uint32_t pkt_limit,
    uint32_t byte_limit, boolean_t tx_doorbell_ctxt, boolean_t *pkts_pending,
    boolean_t canblock)
{
	struct nexus_adapter *hwna;
	struct ifnet *ifp;
	struct __kern_packet *head = NULL;
	sk_protect_t protect;
	errno_t rc = 0;
	errno_t sync_err = 0;
	uint32_t npkts = 0, consumed = 0;
	uint32_t flags;
	slot_idx_t idx, ktail;
	int ring_space = 0;

	KDBG((SK_KTRACE_NETIF_RING_TX_REFILL | DBG_FUNC_START), SK_KVA(ring));

	VERIFY(ring != NULL);
	hwna = KRNA(ring);
	ifp = hwna->na_ifp;

	ASSERT(hwna->na_type == NA_NETIF_DEV);
	ASSERT(ring->ckr_tx == NR_TX);
	*pkts_pending = FALSE;

	if (__improbable(pkt_limit == 0 || byte_limit == 0)) {
		SK_ERR("invalid limits plim %d, blim %d",
		    pkt_limit, byte_limit);
		rc = EINVAL;
		goto out;
	}

	if (__improbable(!IF_FULLY_ATTACHED(ifp))) {
		SK_ERR("hwna 0x%llx ifp %s (0x%llx), interface not attached",
		    SK_KVA(hwna), if_name(ifp), SK_KVA(ifp));
		rc = ENXIO;
		goto out;
	}

	/* don't refill while the interface is flow-controlled */
	if (__improbable((ifp->if_start_flags & IFSF_FLOW_CONTROLLED) != 0)) {
		SK_DF(SK_VERB_SYNC | SK_VERB_TX, "hwna 0x%llx ifp %s (0x%llx), "
		    "flow control ON", SK_KVA(hwna), if_name(ifp), SK_KVA(ifp));
		rc = ENXIO;
		goto out;
	}

	/*
	 * if the ring is busy, it means another dequeue is in
	 * progress, so ignore this request and return success.
	 */
	if (kr_enter(ring, canblock) != 0) {
		rc = 0;
		goto out;
	}
	/* mark thread with sync-in-progress flag */
	protect = sk_sync_protect();

	if (__improbable(KR_DROP(ring) ||
	    !NA_IS_ACTIVE(ring->ckr_na))) {
		SK_ERR("hw-kr 0x%llx stopped", SK_KVA(ring));
		rc = ENXIO;
		goto done;
	}

	idx = ring->ckr_rhead;
	ktail = ring->ckr_ktail;
	/* calculate available space on tx ring */
	ring_space = ktail - idx;
	if (ring_space < 0) {
		/* index wrapped around the ring */
		ring_space += ring->ckr_num_slots;
	}
	if (ring_space == 0) {
		struct ifclassq *ifcq;

		/* no space in ring, driver should retry */
#if SK_LOG
		if (__improbable((sk_verbose &
		    (SK_VERB_SYNC | SK_VERB_TX)) != 0)) {
			netif_no_ring_space_log(hwna, ring);
		}
#endif /* SK_LOG */
		ifcq = netif_get_default_ifcq(hwna);
		if (IFCQ_LEN(ifcq) != 0) {
			*pkts_pending = TRUE;
		}
		/*
		 * We ran out of space in ring, most probably
		 * because the driver is slow to drain its TX queue.
		 * We want another doorbell to be generated as soon
		 * as the TX notify completion happens; mark this
		 * through ckr_pending_doorbell counter.  Do this
		 * regardless of whether there's any pending packet.
		 */
		ring->ckr_pending_doorbell++;
		rc = EAGAIN;
		goto sync_ring;
	}

	/* never dequeue more than the ring can hold */
	if ((uint32_t)ring_space < pkt_limit) {
		pkt_limit = ring_space;
	}

	/*
	 * In doorbell context on real hardware, cap the dequeue to keep
	 * the doorbell path short.
	 */
	if (tx_doorbell_ctxt &&
	    ((hwna->na_flags & NAF_VIRTUAL_DEVICE) == 0)) {
		pkt_limit = MIN(pkt_limit,
		    nx_netif_doorbell_max_dequeue);
	}

	rc = netif_deq_packets(hwna, NULL, pkt_limit, byte_limit,
	    &head, pkts_pending, ring->ckr_svc);

	/*
	 * There's room in ring; if we haven't dequeued everything,
	 * mark ckr_pending_doorbell for the next TX notify to issue
	 * a TX door bell; otherwise, clear it.  The next packet that
	 * gets enqueued will trigger a door bell again.
	 */
	if (*pkts_pending) {
		ring->ckr_pending_doorbell++;
	} else if (ring->ckr_pending_doorbell != 0) {
		ring->ckr_pending_doorbell = 0;
	}

	if (rc != 0) {
		/*
		 * This is expected sometimes as the IOSkywalkFamily
		 * errs on the side of caution to perform an extra
		 * dequeue when multiple doorbells are pending;
		 * nothing to dequeue, do a sync if there are slots
		 * to reclaim else just return.
		 */
		SK_DF(SK_VERB_SYNC | SK_VERB_TX,
		    "nothing to dequeue, err %d", rc);

		if ((uint32_t)ring_space == ring->ckr_lim) {
			/* ring is completely empty; nothing to reclaim */
			goto done;
		} else {
			goto sync_ring;
		}
	}
	/* move the dequeued packets to tx ring */
	while (head != NULL && idx != ktail) {
		ASSERT(npkts <= pkt_limit);
		struct __kern_packet *pkt = head;
		KR_SLOT_ATTACH_METADATA(ring, KR_KSD(ring, idx),
		    (struct __kern_quantum *)pkt);
		npkts++;
		if (__improbable(pkt->pkt_trace_id != 0)) {
			/* packet left AQM and entered the driver stage */
			KDBG(SK_KTRACE_PKT_TX_AQM | DBG_FUNC_END, pkt->pkt_trace_id);
			KDBG(SK_KTRACE_PKT_TX_DRV | DBG_FUNC_START, pkt->pkt_trace_id);
		}
		idx = SLOT_NEXT(idx, ring->ckr_lim);
		head = pkt->pkt_nextpkt;
		pkt->pkt_nextpkt = NULL;
	}

	/*
	 * We checked for ring space earlier so the ring should have enough
	 * space for the entire chain.
	 */
	ASSERT(head == NULL);
	ring->ckr_rhead = idx;

sync_ring:
	flags = NA_SYNCF_NETIF;
	if (ring->ckr_pending_doorbell != 0) {
		flags |= (NA_SYNCF_NETIF_DOORBELL | NA_SYNCF_NETIF_ASYNC);
	}

	/* remember khead so consumed slots can be validated after sync */
	ring->ckr_khead_pre = ring->ckr_khead;
	sync_err = ring->ckr_na_sync(ring, kernproc, flags);
	if (sync_err != 0 && sync_err != EAGAIN) {
		SK_ERR("unexpected sync err %d", sync_err);
		if (rc == 0) {
			rc = sync_err;
		}
		goto done;
	}
	/*
	 * Verify that the driver has detached packets from the consumed slots.
	 */
	idx = ring->ckr_khead_pre;
	consumed = 0;
	while (idx != ring->ckr_khead) {
		struct __kern_slot_desc *ksd = KR_KSD(ring, idx);

		consumed++;
		VERIFY(!KSD_VALID_METADATA(ksd));
		idx = SLOT_NEXT(idx, ring->ckr_lim);
	}
	ring->ckr_khead_pre = ring->ckr_khead;

done:
	sk_sync_unprotect(protect);
	kr_exit(ring);
out:
	KDBG((SK_KTRACE_NETIF_RING_TX_REFILL | DBG_FUNC_END),
	    SK_KVA(ring), rc, 0, npkts);

	return rc;
}
4111 
/*
 * Driver-facing API to enqueue a chain of received packets on a netif
 * RX queue.  Packets are staged on the queue's packet queue and only
 * pushed up the stack (netif_receive) when the FLUSH flag is set.
 * If the logical link is already destroyed, the chain is dropped.
 */
void
kern_netif_queue_rx_enqueue(kern_netif_queue_t queue, kern_packet_t ph_chain,
    uint32_t count, uint32_t flags)
{
#pragma unused (count)
	struct netif_queue *q = queue;
	struct netif_llink *llink = q->nq_qset->nqs_llink;
	struct __kern_packet *pkt_chain = SK_PTR_ADDR_KPKT(ph_chain);
	bool flush = ((flags & KERN_NETIF_QUEUE_RX_ENQUEUE_FLAG_FLUSH) != 0);
	struct pktq *pktq = &q->nq_pktq;
	struct netif_stats *nifs = &llink->nll_nif->nif_stats;
	struct nexus_pkt_stats stats;
	sk_protect_t protect;

	ASSERT((q->nq_flags & NETIF_QUEUE_IS_RX) != 0);
	if (llink->nll_state == NETIF_LLINK_STATE_DESTROYED) {
		/* link is being torn down; drop and account */
		int drop_cnt = 0;

		pp_free_packet_chain(pkt_chain, &drop_cnt);
		STATS_ADD(nifs, NETIF_STATS_LLINK_RX_DROP_BAD_STATE, drop_cnt);
		return;
	}
	/* stage the chain; delivery happens on flush */
	KPKTQ_ENQUEUE_LIST(pktq, pkt_chain);
	if (flush) {
		/* detach everything staged so far and deliver it upstack */
		pkt_chain = KPKTQ_FIRST(pktq);
		KPKTQ_INIT(pktq);

		protect = sk_sync_protect();
		netif_receive(NA(llink->nll_nif->nif_ifp), pkt_chain, &stats);
		sk_sync_unprotect(protect);
	}
}
4144 
/*
 * Driver-facing API to dequeue a chain of packets for transmission from
 * a netif TX queue.  On success *ph_chain holds the dequeued chain (if
 * any) and *pending reports whether packets remain queued.  Returns
 * ENXIO when the logical link is destroyed, else the dequeue result.
 */
errno_t
kern_netif_queue_tx_dequeue(kern_netif_queue_t queue, uint32_t pkt_limit,
    uint32_t byte_limit, boolean_t *pending, kern_packet_t *ph_chain)
{
	struct netif_queue *q = queue;
	struct netif_llink *llink = q->nq_qset->nqs_llink;
	struct netif_stats *nifs = &llink->nll_nif->nif_stats;
	struct nexus_adapter *hwna;
	struct __kern_packet *pkt_chain = NULL;
	errno_t rc;

	ASSERT((q->nq_flags & NETIF_QUEUE_IS_RX) == 0);
	if (llink->nll_state == NETIF_LLINK_STATE_DESTROYED) {
		STATS_INC(nifs, NETIF_STATS_LLINK_AQM_DEQ_BAD_STATE);
		return ENXIO;
	}
	hwna = &NA(llink->nll_nif->nif_ifp)->nifna_up;

	/*
	 * On real hardware in TX-notify (doorbell) context, cap the
	 * dequeue to keep the notify path short.
	 */
	if (((hwna->na_flags & NAF_VIRTUAL_DEVICE) == 0) &&
	    sk_is_tx_notify_protected()) {
		pkt_limit = MIN(pkt_limit, nx_netif_doorbell_max_dequeue);
	}
	rc = netif_deq_packets(hwna, q->nq_qset->nqs_ifcq, pkt_limit,
	    byte_limit, &pkt_chain, pending, q->nq_svc);

	if (pkt_chain != NULL) {
		*ph_chain = SK_PKT2PH(pkt_chain);
	}
	return rc;
}
4175 
4176 errno_t
4177 kern_nexus_netif_llink_add(struct kern_nexus *nx,
4178     struct kern_nexus_netif_llink_init *llink_init)
4179 {
4180 	errno_t err;
4181 	struct nx_netif *nif;
4182 	struct netif_llink *llink;
4183 	struct netif_stats *nifs;
4184 
4185 	VERIFY(nx != NULL);
4186 	VERIFY(llink_init != NULL);
4187 	VERIFY((nx->nx_flags & NXF_ATTACHED) != 0);
4188 
4189 	nif = NX_NETIF_PRIVATE(nx);
4190 	nifs = &nif->nif_stats;
4191 
4192 	err = nx_netif_validate_llink_config(llink_init, false);
4193 	if (err != 0) {
4194 		SK_ERR("Invalid llink init params");
4195 		STATS_INC(nifs, NETIF_STATS_LLINK_ADD_BAD_PARAMS);
4196 		return err;
4197 	}
4198 
4199 	err = nx_netif_llink_add(nif, llink_init, &llink);
4200 	return err;
4201 }
4202 
4203 errno_t
4204 kern_nexus_netif_llink_remove(struct kern_nexus *nx,
4205     kern_nexus_netif_llink_id_t llink_id)
4206 {
4207 	struct nx_netif *nif;
4208 
4209 	VERIFY(nx != NULL);
4210 	VERIFY((nx->nx_flags & NXF_ATTACHED) != 0);
4211 
4212 	nif = NX_NETIF_PRIVATE(nx);
4213 	return nx_netif_llink_remove(nif, llink_id);
4214 }
4215 
4216 errno_t
4217 kern_netif_queue_get_service_class(kern_netif_queue_t queue,
4218     kern_packet_svc_class_t *svc)
4219 {
4220 	*svc = queue->nq_svc;
4221 	return 0;
4222 }
4223