/* xref: /xnu-8020.121.3/bsd/skywalk/nexus/netif/nx_netif.c (revision fdd8201d7b966f0c3ea610489d29bd841d358941) */
1 /*
2  * Copyright (c) 2015-2022 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 /*
30  * The netif nexus domain has two domain providers: native and compat, with
31  * the latter being the default provider of this domain. The compat provider
32  * has special handlers for NXCFG_CMD_ATTACH and NXCFG_CMD_DETACH, etc.
33  *
34  * A netif nexus instance can be in a native or compat mode; in either case,
35  * it is associated with two instances of a nexus_adapter structure, and allows
 * at most two channels opened to the nexus.  The two adapters correspond to
37  * host and device ports, respectively.
38  *
39  * By itself, a netif nexus isn't associated with a network interface. The
40  * association happens by attaching a network interface to the nexus instance.
41  * A channel can only be successfully opened to a netif nexus after it has an
42  * interface attached to it.
43  *
44  * During an attach, the interface is marked as Skywalk-capable, and its ifnet
45  * structure refers to the attached netif nexus adapter via its if_na field.
46  * The nexus also holds a reference to the interface on its na_ifp field. Note
47  * that attaching to a netif_compat nexus does not alter the input/output data
48  * path, nor does it remove any of the interface's hardware offload flags. It
49  * merely associates the interface and netif nexus together.
50  *
51  * During a detach, the above references are dropped and the fields are cleared;
52  * the interface is also marked as non-Skywalk-capable. This detach can happen
53  * explicitly via a command down the nexus, or implicitly when the nexus goes
54  * away (assuming there's no channel opened to it.)
55  *
56  * A userland channel can be opened to a netif nexus via the usual ch_open()
57  * way, assuming the nexus provider is setup to allow access for the userland
58  * process (either by binding the nexus port to PID, etc. or by creating the
59  * nexus in the anonymous mode.)
60  *
61  * Alternatively, a kernel channel can also be opened to it by some kernel
62  * subsystem, via ch_open_special(), e.g. by the flowswitch. Kernel channels
63  * don't have any task mapping created, and the flag CHANF_KERNEL is used to
64  * indicate that.
65  *
66  * Opening a channel to the host port of a native or compat netif causes the
67  * ifnet output path to be redirected to nx_netif_host_transmit().  We also,
68  * at present, disable any hardware offload features.
69  *
70  * Opening a channel to the device port of a compat netif causes the ifnet
71  * input path to be redirected to nx_netif_compat_receive().  This is specific
72  * to the compat variant, as the native variant's RX path already goes to
73  * the native netif.
74  *
75  * During channel close, we restore the original I/O callbacks, as well as the
76  * interface's offload flags.
77  */
78 
79 #include <skywalk/os_skywalk_private.h>
80 #include <skywalk/nexus/netif/nx_netif.h>
81 #include <skywalk/nexus/upipe/nx_user_pipe.h>
82 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
83 #include <sys/kdebug.h>
84 #include <sys/sdt.h>
85 #include <os/refcnt.h>
86 #include <libkern/OSDebug.h>
87 
#define NX_NETIF_MAXRINGS       NX_MAX_NUM_RING_PAIR
#define NX_NETIF_MINSLOTS       2       /* min # of slots per ring */
#define NX_NETIF_MAXSLOTS       NX_MAX_NUM_SLOT_PER_RING /* max # of slots */
#define NX_NETIF_TXRINGSIZE     512     /* default TX ring size */
#define NX_NETIF_RXRINGSIZE     1024    /* default RX ring size */
#define NX_NETIF_BUFSIZE        (2 * 1024)  /* default buffer size */
#define NX_NETIF_MINBUFSIZE     (128)  /* min buffer size */
#define NX_NETIF_MAXBUFSIZE     (16 * 1024) /* max buffer size */

/*
 * TODO: minimum buflets for now; we will need to have a way to adjust
 * this based on the underlying interface's parameters, e.g. jumbo MTU,
 * large segment offload, etc.
 */
#define NX_NETIF_UMD_SIZE       _USER_PACKET_SIZE(BUFLETS_MIN)
#define NX_NETIF_KMD_SIZE       _KERN_PACKET_SIZE(BUFLETS_MIN)

/*
 * minimum stack space required for IOSkywalkFamily and Driver execution.
 */
#if XNU_TARGET_OS_OSX
#define NX_NETIF_MIN_DRIVER_STACK_SIZE    (kernel_stack_size >> 1)
#else /* !XNU_TARGET_OS_OSX */
#define NX_NETIF_MIN_DRIVER_STACK_SIZE    (kernel_stack_size >> 2)
#endif /* XNU_TARGET_OS_OSX */
113 
/* domain lifecycle callbacks */
static void nx_netif_dom_init(struct nxdom *);
static void nx_netif_dom_terminate(struct nxdom *);
static void nx_netif_dom_fini(struct nxdom *);
static int nx_netif_prov_params_adjust(
	const struct kern_nexus_domain_provider *, const struct nxprov_params *,
	struct nxprov_adjusted_params *);

/* domain port/channel management callbacks */
static int nx_netif_dom_bind_port(struct kern_nexus *, nexus_port_t *,
    struct nxbind *, void *);
static int nx_netif_dom_unbind_port(struct kern_nexus *, nexus_port_t);
static int nx_netif_dom_connect(struct kern_nexus_domain_provider *,
    struct kern_nexus *, struct kern_channel *, struct chreq *,
    struct kern_channel *, struct nxbind *, struct proc *);
static void nx_netif_dom_disconnect(struct kern_nexus_domain_provider *,
    struct kern_nexus *, struct kern_channel *);
static void nx_netif_dom_defunct(struct kern_nexus_domain_provider *,
    struct kern_nexus *, struct kern_channel *, struct proc *);
static void nx_netif_dom_defunct_finalize(struct kern_nexus_domain_provider *,
    struct kern_nexus *, struct kern_channel *, boolean_t);

/* nexus adapter (na) ring sync/notify callbacks */
static void nx_netif_doorbell(struct ifnet *);
static int nx_netif_na_txsync(struct __kern_channel_ring *, struct proc *,
    uint32_t);
static int nx_netif_na_rxsync(struct __kern_channel_ring *, struct proc *,
    uint32_t);
static void nx_netif_na_dtor(struct nexus_adapter *na);
static int nx_netif_na_notify_tx(struct __kern_channel_ring *, struct proc *,
    uint32_t);
static int nx_netif_na_notify_rx(struct __kern_channel_ring *, struct proc *,
    uint32_t);
static int nx_netif_na_activate(struct nexus_adapter *, na_activate_mode_t);

/* netif control (attach/detach/flow) helpers */
static int nx_netif_ctl(struct kern_nexus *, nxcfg_cmd_t, void *,
    struct proc *);
static int nx_netif_ctl_attach(struct kern_nexus *, struct nx_spec_req *,
    struct proc *);
static int nx_netif_ctl_detach(struct kern_nexus *, struct nx_spec_req *);
static int nx_netif_attach(struct kern_nexus *, struct ifnet *);
static void nx_netif_flags_init(struct nx_netif *);
static void nx_netif_flags_fini(struct nx_netif *);
static int nx_netif_na_channel_event_notify(struct nexus_adapter *,
    struct __kern_packet *, struct __kern_channel_event *, uint16_t);
static void nx_netif_capabilities_fini(struct nx_netif *);
static errno_t nx_netif_interface_advisory_notify(void *,
    const struct ifnet_interface_advisory *);
159 
/*
 * Nexus domain descriptor for netif: per-domain parameter limits and
 * defaults, plus the domain-wide callbacks.  The dev and host ports are
 * always present, hence nb_def/nb_min of 2 for ports.
 */
struct nxdom nx_netif_dom_s = {
	.nxdom_prov_head =
    STAILQ_HEAD_INITIALIZER(nx_netif_dom_s.nxdom_prov_head),
	.nxdom_type =           NEXUS_TYPE_NET_IF,
	.nxdom_md_type =        NEXUS_META_TYPE_PACKET,
	.nxdom_md_subtype =     NEXUS_META_SUBTYPE_RAW,
	.nxdom_name =           "netif",
	/* dev + host ports at minimum */
	.nxdom_ports = {
		.nb_def = 2,
		.nb_min = 2,
		.nb_max = NX_NETIF_MAXPORTS,
	},
	.nxdom_tx_rings = {
		.nb_def = 1,
		.nb_min = 1,
		.nb_max = NX_NETIF_MAXRINGS,
	},
	.nxdom_rx_rings = {
		.nb_def = 1,
		.nb_min = 1,
		.nb_max = NX_NETIF_MAXRINGS,
	},
	.nxdom_tx_slots = {
		.nb_def = NX_NETIF_TXRINGSIZE,
		.nb_min = NX_NETIF_MINSLOTS,
		.nb_max = NX_NETIF_MAXSLOTS,
	},
	.nxdom_rx_slots = {
		.nb_def = NX_NETIF_RXRINGSIZE,
		.nb_min = NX_NETIF_MINSLOTS,
		.nb_max = NX_NETIF_MAXSLOTS,
	},
	.nxdom_buf_size = {
		.nb_def = NX_NETIF_BUFSIZE,
		.nb_min = NX_NETIF_MINBUFSIZE,
		.nb_max = NX_NETIF_MAXBUFSIZE,
	},
	.nxdom_meta_size = {
		.nb_def = NX_NETIF_UMD_SIZE,
		.nb_min = NX_NETIF_UMD_SIZE,
		.nb_max = NX_METADATA_USR_MAX_SZ,
	},
	.nxdom_stats_size = {
		.nb_def = 0,
		.nb_min = 0,
		.nb_max = NX_STATS_MAX_SZ,
	},
	.nxdom_pipes = {
		.nb_def = 0,
		.nb_min = 0,
		.nb_max = NX_UPIPE_MAXPIPES,
	},
	.nxdom_flowadv_max = {
		.nb_def = 0,
		.nb_min = 0,
		.nb_max = NX_FLOWADV_MAX,
	},
	.nxdom_nexusadv_size = {
		.nb_def = 0,
		.nb_min = 0,
		.nb_max = NX_NEXUSADV_MAX_SZ,
	},
	.nxdom_capabilities = {
		.nb_def = NXPCAP_USER_CHANNEL,
		.nb_min = 0,
		.nb_max = NXPCAP_USER_CHANNEL,
	},
	.nxdom_qmap = {
		.nb_def = NEXUS_QMAP_TYPE_DEFAULT,
		.nb_min = NEXUS_QMAP_TYPE_DEFAULT,
		.nb_max = NEXUS_QMAP_TYPE_WMM,
	},
	.nxdom_max_frags = {
		.nb_def = NX_PBUF_FRAGS_DEFAULT,
		.nb_min = NX_PBUF_FRAGS_MIN,
		.nb_max = NX_PBUF_FRAGS_MAX,
	},
	/* domain callbacks; netif has no port lookup/reservation hooks */
	.nxdom_init =           nx_netif_dom_init,
	.nxdom_terminate =      nx_netif_dom_terminate,
	.nxdom_fini =           nx_netif_dom_fini,
	.nxdom_find_port =      NULL,
	.nxdom_port_is_reserved = NULL,
	.nxdom_bind_port =      nx_netif_dom_bind_port,
	.nxdom_unbind_port =    nx_netif_dom_unbind_port,
	.nxdom_connect =        nx_netif_dom_connect,
	.nxdom_disconnect =     nx_netif_dom_disconnect,
	.nxdom_defunct =        nx_netif_dom_defunct,
	.nxdom_defunct_finalize = nx_netif_dom_defunct_finalize,
};
249 
/* native netif domain provider; registered in nx_netif_dom_init() */
struct kern_nexus_domain_provider nx_netif_prov_s = {
	.nxdom_prov_name =              NEXUS_PROVIDER_NET_IF,
	/*
	 * Don't install this as the default domain provider, i.e.
	 * NXDOMPROVF_DEFAULT flag not set; we want netif_compat
	 * provider to be the one handling userland-issued requests
	 * coming down thru nxprov_create() instead.
	 */
	.nxdom_prov_flags =             0,
	.nxdom_prov_cb = {
		.dp_cb_init =           nx_netif_prov_init,
		.dp_cb_fini =           nx_netif_prov_fini,
		.dp_cb_params =         nx_netif_prov_params,
		.dp_cb_mem_new =        nx_netif_prov_mem_new,
		.dp_cb_config =         nx_netif_prov_config,
		.dp_cb_nx_ctor =        nx_netif_prov_nx_ctor,
		.dp_cb_nx_dtor =        nx_netif_prov_nx_dtor,
		.dp_cb_nx_mem_info =    nx_netif_prov_nx_mem_info,
		.dp_cb_nx_mib_get =     nx_netif_prov_nx_mib_get,
		.dp_cb_nx_stop =        nx_netif_prov_nx_stop,
	},
};
272 
/* ifnet-facing operations for native netif adapters */
struct nexus_ifnet_ops na_netif_ops = {
	.ni_finalize = na_netif_finalize,
	.ni_reap = nx_netif_reap,
	.ni_dequeue = nx_netif_native_tx_dequeue,
	.ni_get_len = nx_netif_native_tx_get_len
};
279 
#define NX_NETIF_DOORBELL_MAX_DEQUEUE    64
uint32_t nx_netif_doorbell_max_dequeue = NX_NETIF_DOORBELL_MAX_DEQUEUE;

SYSCTL_EXTENSIBLE_NODE(_kern_skywalk, OID_AUTO, netif,
    CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Skywalk network interface");
#if (DEVELOPMENT || DEBUG)
SYSCTL_STRING(_kern_skywalk_netif, OID_AUTO, sk_ll_prefix,
    CTLFLAG_RW | CTLFLAG_LOCKED, sk_ll_prefix, sizeof(sk_ll_prefix),
    "ifname prefix for enabling low latency support");
static uint32_t nx_netif_force_ifnet_start = 0;
SYSCTL_UINT(_kern_skywalk_netif, OID_AUTO, force_ifnet_start,
    CTLFLAG_RW | CTLFLAG_LOCKED, &nx_netif_force_ifnet_start, 0,
    "always use ifnet starter thread");
SYSCTL_UINT(_kern_skywalk_netif, OID_AUTO, doorbell_max_dequeue,
    CTLFLAG_RW | CTLFLAG_LOCKED, &nx_netif_doorbell_max_dequeue,
    NX_NETIF_DOORBELL_MAX_DEQUEUE,
    "max packets to dequeue in doorbell context");
#endif /* DEVELOPMENT || DEBUG */

/* zones for adapter and netif private structures; zeroed on free */
static ZONE_DEFINE(na_netif_zone, SKMEM_ZONE_PREFIX ".na.netif",
    sizeof(struct nexus_netif_adapter), ZC_ZFREE_CLEARMEM);

static ZONE_DEFINE(nx_netif_zone, SKMEM_ZONE_PREFIX ".nx.netif",
    sizeof(struct nx_netif), ZC_ZFREE_CLEARMEM);

/* allocation tags for the netif subsystem */
#define SKMEM_TAG_NETIF_MIT          "com.apple.skywalk.netif.mit"
static SKMEM_TAG_DEFINE(skmem_tag_netif_mit, SKMEM_TAG_NETIF_MIT);

#define SKMEM_TAG_NETIF_FILTER       "com.apple.skywalk.netif.filter"
SKMEM_TAG_DEFINE(skmem_tag_netif_filter, SKMEM_TAG_NETIF_FILTER);

#define SKMEM_TAG_NETIF_FLOW         "com.apple.skywalk.netif.flow"
SKMEM_TAG_DEFINE(skmem_tag_netif_flow, SKMEM_TAG_NETIF_FLOW);

#define SKMEM_TAG_NETIF_AGENT_FLOW   "com.apple.skywalk.netif.agent_flow"
SKMEM_TAG_DEFINE(skmem_tag_netif_agent_flow, SKMEM_TAG_NETIF_AGENT_FLOW);

#define SKMEM_TAG_NETIF_LLINK        "com.apple.skywalk.netif.llink"
SKMEM_TAG_DEFINE(skmem_tag_netif_llink, SKMEM_TAG_NETIF_LLINK);

#define SKMEM_TAG_NETIF_QSET         "com.apple.skywalk.netif.qset"
SKMEM_TAG_DEFINE(skmem_tag_netif_qset, SKMEM_TAG_NETIF_QSET);

#define SKMEM_TAG_NETIF_LLINK_INFO   "com.apple.skywalk.netif.llink_info"
SKMEM_TAG_DEFINE(skmem_tag_netif_llink_info, SKMEM_TAG_NETIF_LLINK_INFO);
325 
/*
 * Domain init: register the native and compat providers with the netif
 * domain and initialize the GSO subsystem.  Called with the SK lock held,
 * once, before the domain is marked initialized.
 */
static void
nx_netif_dom_init(struct nxdom *nxdom)
{
	SK_LOCK_ASSERT_HELD();
	ASSERT(!(nxdom->nxdom_flags & NEXUSDOMF_INITIALIZED));

	/*
	 * Compile-time sanity: the dev/host/client port numbering and the
	 * relative ordering of the mitigation-mode constants.
	 */
	_CASSERT(NEXUS_PORT_NET_IF_DEV == 0);
	_CASSERT(NEXUS_PORT_NET_IF_HOST == 1);
	_CASSERT(NEXUS_PORT_NET_IF_CLIENT == 2);
	_CASSERT(SK_NETIF_MIT_FORCE_OFF < SK_NETIF_MIT_FORCE_SIMPLE);
	_CASSERT(SK_NETIF_MIT_FORCE_SIMPLE < SK_NETIF_MIT_FORCE_ADVANCED);
	_CASSERT(SK_NETIF_MIT_FORCE_ADVANCED < SK_NETIF_MIT_AUTO);
	_CASSERT(SK_NETIF_MIT_AUTO == SK_NETIF_MIT_MAX);

	/* register the native provider */
	(void) nxdom_prov_add(nxdom, &nx_netif_prov_s);

	/* register the compat provider */
	nx_netif_compat_init(nxdom);

	/* the compat provider must have become the domain default */
	ASSERT(nxdom_prov_default[nxdom->nxdom_type] != NULL &&
	    strcmp(nxdom_prov_default[nxdom->nxdom_type]->nxdom_prov_name,
	    NEXUS_PROVIDER_NET_IF_COMPAT) == 0);

	netif_gso_init();
}
350 
/*
 * Domain terminate: undo nx_netif_dom_init() in reverse order and remove
 * every provider still registered with this domain.  SK lock held.
 */
static void
nx_netif_dom_terminate(struct nxdom *nxdom)
{
	struct kern_nexus_domain_provider *nxdom_prov, *tnxdp;

	SK_LOCK_ASSERT_HELD();

	netif_gso_fini();
	nx_netif_compat_fini();

	/* SAFE variant: nxdom_prov_del removes entries while we iterate */
	STAILQ_FOREACH_SAFE(nxdom_prov, &nxdom->nxdom_prov_head,
	    nxdom_prov_link, tnxdp) {
		(void) nxdom_prov_del(nxdom_prov);
	}
}
366 
/* Domain fini callback: nothing to release for netif. */
static void
nx_netif_dom_fini(struct nxdom *nxdom)
{
#pragma unused(nxdom)
}
372 
/*
 * Provider init callback; only logs.  The pragma covers configurations
 * where SK_D() compiles to nothing, leaving nxdom_prov unreferenced.
 */
int
nx_netif_prov_init(struct kern_nexus_domain_provider *nxdom_prov)
{
#pragma unused(nxdom_prov)
	SK_D("initializing %s", nxdom_prov->nxdom_prov_name);
	return 0;
}
380 
/*
 * Stub notify callback installed on the rings of a stopped nexus by
 * nx_netif_prov_nx_stop(); unconditionally fails with ENXIO.
 */
static int
nx_netif_na_notify_drop(struct __kern_channel_ring *kring, struct proc *p,
    uint32_t flags)
{
#pragma unused(kring, p, flags)
	return ENXIO;
}
388 
389 int
nx_netif_prov_nx_stop(struct kern_nexus * nx)390 nx_netif_prov_nx_stop(struct kern_nexus *nx)
391 {
392 	uint32_t r;
393 	struct nexus_adapter *na = nx_port_get_na(nx, NEXUS_PORT_NET_IF_DEV);
394 	struct nexus_netif_adapter *nifna = (struct nexus_netif_adapter *)na;
395 
396 	SK_LOCK_ASSERT_HELD();
397 	ASSERT(nx != NULL);
398 
399 	/* place all rings in drop mode */
400 	na_kr_drop(na, TRUE);
401 
402 	/* ensure global visibility */
403 	membar_sync();
404 
405 	/* reset all TX notify callbacks */
406 	for (r = 0; r < na_get_nrings(na, NR_TX); r++) {
407 		while (!atomic_test_set_ptr(&na->na_tx_rings[r].ckr_na_notify,
408 		    ptrauth_nop_cast(void *, na->na_tx_rings[r].ckr_na_notify),
409 		    ptrauth_nop_cast(void *, &nx_netif_na_notify_drop))) {
410 			;
411 		}
412 		membar_sync();
413 		if (nifna->nifna_tx_mit != NULL) {
414 			nx_netif_mit_cleanup(&nifna->nifna_tx_mit[r]);
415 		}
416 	}
417 	if (nifna->nifna_tx_mit != NULL) {
418 		skn_free_type_array(tx, struct nx_netif_mit,
419 		    na_get_nrings(na, NR_TX), nifna->nifna_tx_mit);
420 		nifna->nifna_tx_mit = NULL;
421 	}
422 
423 	/* reset all RX notify callbacks */
424 	for (r = 0; r < na_get_nrings(na, NR_RX); r++) {
425 		while (!atomic_test_set_ptr(&na->na_rx_rings[r].ckr_na_notify,
426 		    ptrauth_nop_cast(void *, na->na_rx_rings[r].ckr_na_notify),
427 		    ptrauth_nop_cast(void *, &nx_netif_na_notify_drop))) {
428 			;
429 		}
430 		membar_sync();
431 		if (nifna->nifna_rx_mit != NULL) {
432 			nx_netif_mit_cleanup(&nifna->nifna_rx_mit[r]);
433 		}
434 	}
435 	if (nifna->nifna_rx_mit != NULL) {
436 		skn_free_type_array(rx, struct nx_netif_mit,
437 		    na_get_nrings(na, NR_RX), nifna->nifna_rx_mit);
438 		nifna->nifna_rx_mit = NULL;
439 	}
440 	return 0;
441 }
442 
/*
 * Pick smaller compat ring sizes for interface classes that don't need
 * the defaults (auxiliary cellular, Wi-Fi AP/AWDL/infra, USB Ethernet).
 * Writes the chosen sizes through adj->adj_rx_slots / adj->adj_tx_slots;
 * interfaces not matching any class keep the defaults.
 */
static inline void
nx_netif_compat_adjust_ring_size(struct nxprov_adjusted_params *adj,
    ifnet_t ifp)
{
	/* cellular: unit 0 is assumed to be the primary data interface */
	if (IFNET_IS_CELLULAR(ifp) && (ifp->if_unit != 0)) {
		*(adj->adj_rx_slots) = sk_netif_compat_aux_cell_rx_ring_sz;
		*(adj->adj_tx_slots) = sk_netif_compat_aux_cell_tx_ring_sz;
	} else if (IFNET_IS_WIFI(ifp)) {
		if (ifp->if_name[0] == 'a' && ifp->if_name[1] == 'p' &&
		    ifp->if_name[2] == '\0') {
			/* Wi-Fi Access Point (interface named exactly "ap") */
			*(adj->adj_rx_slots) = sk_netif_compat_wap_rx_ring_sz;
			*(adj->adj_tx_slots) = sk_netif_compat_wap_tx_ring_sz;
		} else if (ifp->if_eflags & IFEF_AWDL) {
			/* AWDL */
			*(adj->adj_rx_slots) = sk_netif_compat_awdl_rx_ring_sz;
			*(adj->adj_tx_slots) = sk_netif_compat_awdl_tx_ring_sz;
		} else {
			/* Wi-Fi infrastructure */
			*(adj->adj_rx_slots) = sk_netif_compat_wif_rx_ring_sz;
			*(adj->adj_tx_slots) = sk_netif_compat_wif_tx_ring_sz;
		}
	} else if (IFNET_IS_ETHERNET(ifp)) {
#if !XNU_TARGET_OS_OSX
		/*
		 * On non-macOS platforms, treat all compat Ethernet
		 * interfaces as USB Ethernet with reduced ring sizes.
		 */
		*(adj->adj_rx_slots) = sk_netif_compat_usb_eth_rx_ring_sz;
		*(adj->adj_tx_slots) = sk_netif_compat_usb_eth_tx_ring_sz;
#else /* XNU_TARGET_OS_OSX */
		/* macOS: reduce ring sizes only for USB Ethernet */
		if (ifp->if_subfamily == IFNET_SUBFAMILY_USB) {
			*(adj->adj_rx_slots) =
			    sk_netif_compat_usb_eth_rx_ring_sz;
			*(adj->adj_tx_slots) =
			    sk_netif_compat_usb_eth_tx_ring_sz;
		}
#endif /* XNU_TARGET_OS_OSX */
	}
}
483 
/*
 * Provider-specific parameter adjustment callback, invoked from
 * nxprov_params_adjust() via nx_netif_prov_params().  Returns 0 on
 * success or EINVAL if the native buffer size is too small.
 */
static int
nx_netif_prov_params_adjust(const struct kern_nexus_domain_provider *nxdom_prov,
    const struct nxprov_params *nxp, struct nxprov_adjusted_params *adj)
{
	/*
	 * for netif compat adjust the following parameters for memory
	 * optimization:
	 * - change the size of buffer object to 128 bytes.
	 * - don't allocate rx ring for host port and tx ring for dev port.
	 * - for cellular interfaces other than pdp_ip0 reduce the ring size.
	 *   Assumption here is that pdp_ip0 is always used as the data
	 *   interface.
	 * - reduce the ring size for AWDL interface.
	 * - reduce the ring size for USB ethernet interface.
	 */
	if (strcmp(nxdom_prov->nxdom_prov_name,
	    NEXUS_PROVIDER_NET_IF_COMPAT) == 0) {
		/*
		 * Leave the parameters default if userspace access may be
		 * needed. We can't use skywalk_direct_allowed() here because
		 * the drivers have not attached yet.
		 */
		if (skywalk_netif_direct_enabled()) {
			goto done;
		}

		*(adj->adj_buf_size) = NETIF_COMPAT_BUF_SIZE;
		*(adj->adj_tx_rings) = 1;
		/* per-interface ring size tuning, if the ifnet exists */
		if (IF_INDEX_IN_RANGE(nxp->nxp_ifindex)) {
			ifnet_t ifp;
			ifnet_head_lock_shared();
			ifp = ifindex2ifnet[nxp->nxp_ifindex];
			ifnet_head_done();
			VERIFY(ifp != NULL);
			nx_netif_compat_adjust_ring_size(adj, ifp);
		}
		if (adj->adj_buf_srp->srp_r_seg_size == 0) {
			adj->adj_buf_srp->srp_r_seg_size =
			    skmem_usr_buf_seg_size;
		}
	} else { /* netif native */
		/* logical-link mode uses the minimum slot counts */
		if (nxp->nxp_flags & NXPF_NETIF_LLINK) {
			*(adj->adj_tx_slots) = NX_NETIF_MINSLOTS;
			*(adj->adj_rx_slots) = NX_NETIF_MINSLOTS;
		}
		/*
		 * Add another extra ring for host port. Note that if the
		 * nexus isn't configured to use the same pbufpool for all of
		 * its ports, we'd end up allocating extra here.
		 * Not a big deal since that case isn't the default.
		 */
		*(adj->adj_tx_rings) += 1;
		*(adj->adj_rx_rings) += 1;

		/* buffers must at least fit the largest protocol headers */
		if ((*(adj->adj_buf_size) < PKT_MAX_PROTO_HEADER_SIZE)) {
			SK_ERR("buf size too small, min (%d)",
			    PKT_MAX_PROTO_HEADER_SIZE);
			return EINVAL;
		}
		_CASSERT(sizeof(struct __kern_netif_intf_advisory) ==
		    NX_INTF_ADV_SIZE);
		*(adj->adj_nexusadv_size) = sizeof(struct netif_nexus_advisory);
	}
done:
	/* enable magazines layer for metadata */
	*(adj->adj_md_magazines) = TRUE;
	return 0;
}
552 
553 int
nx_netif_prov_params(struct kern_nexus_domain_provider * nxdom_prov,const uint32_t req,const struct nxprov_params * nxp0,struct nxprov_params * nxp,struct skmem_region_params srp[SKMEM_REGIONS])554 nx_netif_prov_params(struct kern_nexus_domain_provider *nxdom_prov,
555     const uint32_t req, const struct nxprov_params *nxp0,
556     struct nxprov_params *nxp, struct skmem_region_params srp[SKMEM_REGIONS])
557 {
558 	struct nxdom *nxdom = nxdom_prov->nxdom_prov_dom;
559 
560 	return nxprov_params_adjust(nxdom_prov, req, nxp0, nxp, srp,
561 	           nxdom, nxdom, nxdom, nx_netif_prov_params_adjust);
562 }
563 
/*
 * Create the memory arena (and packet buffer pools) backing an adapter of
 * this nexus.  Returns 0 on success or a nonzero error from
 * skmem_arena_create_for_nexus().
 */
int
nx_netif_prov_mem_new(struct kern_nexus_domain_provider *nxdom_prov,
    struct kern_nexus *nx, struct nexus_adapter *na)
{
#pragma unused(nxdom_prov)
	int err = 0;
	boolean_t pp_truncated_buf = FALSE;
	boolean_t allow_direct;
	boolean_t kernel_only;

	SK_DF(SK_VERB_NETIF,
	    "nx 0x%llx (\"%s\":\"%s\") na \"%s\" (0x%llx)", SK_KVA(nx),
	    NX_DOM(nx)->nxdom_name, nxdom_prov->nxdom_prov_name, na->na_name,
	    SK_KVA(na));

	ASSERT(na->na_arena == NULL);
	/* compat adapters use truncated (128-byte) buffers */
	if ((na->na_type == NA_NETIF_COMPAT_DEV) ||
	    (na->na_type == NA_NETIF_COMPAT_HOST)) {
		pp_truncated_buf = TRUE;
	}
	/*
	 * We do this check to determine whether to create the extra
	 * regions needed for userspace access. This is per interface.
	 * NX_USER_CHANNEL_PROV() is systemwide so it can't be used.
	 */
	allow_direct = skywalk_netif_direct_allowed(na->na_name);

	/*
	 * Both ports (host and dev) share the same packet buffer pool;
	 * the first time a port gets opened will allocate the pp that
	 * gets stored in the nexus, which will then be used by any
	 * subsequent opens.
	 */
	kernel_only = !allow_direct || !NX_USER_CHANNEL_PROV(nx);
	na->na_arena = skmem_arena_create_for_nexus(na,
	    NX_PROV(nx)->nxprov_region_params, &nx->nx_tx_pp,
	    &nx->nx_rx_pp, pp_truncated_buf, kernel_only, &nx->nx_adv, &err);
	/* exactly one of (arena, err) must indicate success/failure */
	ASSERT(na->na_arena != NULL || err != 0);
	/* the TX pool, if created, must match the domain metadata type */
	ASSERT(nx->nx_tx_pp == NULL || (nx->nx_tx_pp->pp_md_type ==
	    NX_DOM(nx)->nxdom_md_type && nx->nx_tx_pp->pp_md_subtype ==
	    NX_DOM(nx)->nxdom_md_subtype));

	return err;
}
608 
/*
 * Handle NXCFG_CMD_GET_LLINK_INFO: copy out a snapshot of this netif's
 * logical links and their queue sets into the caller-supplied buffer.
 * Returns ENOTSUP if llink mode is off or the request version mismatches,
 * ENXIO/ENOBUFS/ENOMEM on resource errors, or a copyin/copyout errno.
 */
SK_NO_INLINE_ATTRIBUTE
static int
nx_netif_get_llink_info(struct sockopt *sopt, struct kern_nexus *nx)
{
	struct nx_llink_info_req *nlir = NULL;
	struct nx_netif *nif;
	struct netif_llink *llink;
	uint16_t llink_cnt;
	size_t len, user_len;
	int err, i;

	nif = NX_NETIF_PRIVATE(nx);
	if (!NETIF_LLINK_ENABLED(nif)) {
		SK_ERR("llink mode not enabled");
		return ENOTSUP;
	}
	/* hold the llink lock shared across the whole snapshot */
	lck_rw_lock_shared(&nif->nif_llink_lock);
	llink_cnt = nif->nif_llink_cnt;
	if (llink_cnt == 0) {
		SK_ERR("zero llink cnt");
		err = ENXIO;
		goto done;
	}
	/* request header plus one nx_llink_info entry per logical link */
	len = sizeof(*nlir) + (sizeof(struct nx_llink_info) * llink_cnt);
	/* preserve sopt_valsize because it gets overwritten by copyin */
	user_len = sopt->sopt_valsize;
	if (user_len < len) {
		SK_ERR("buffer too small");
		err = ENOBUFS;
		goto done;
	}
	nlir = sk_alloc_data(len, Z_WAITOK, skmem_tag_netif_llink_info);
	if (nlir == NULL) {
		SK_ERR("failed to allocate nlir");
		err = ENOMEM;
		goto done;
	}
	/* copy in only the fixed-size header to read the request version */
	err = sooptcopyin(sopt, nlir, sizeof(*nlir), sizeof(*nlir));
	if (err != 0) {
		SK_ERR("copyin failed: %d", err);
		goto done;
	}
	if (nlir->nlir_version != NETIF_LLINK_INFO_VERSION) {
		SK_ERR("nlir version mismatch: %d != %d",
		    nlir->nlir_version, NETIF_LLINK_INFO_VERSION);
		err = ENOTSUP;
		goto done;
	}
	nlir->nlir_llink_cnt = llink_cnt;
	i = 0;
	STAILQ_FOREACH(llink, &nif->nif_llink_list, nll_link) {
		struct nx_llink_info *nli;
		struct netif_qset *qset;
		uint16_t qset_cnt;
		int j;

		nli = &nlir->nlir_llink[i];
		nli->nli_link_id = llink->nll_link_id;
		nli->nli_link_id_internal = llink->nll_link_id_internal;
		nli->nli_state = llink->nll_state;
		nli->nli_flags = llink->nll_flags;

		qset_cnt = llink->nll_qset_cnt;
		ASSERT(qset_cnt <= NETIF_LLINK_MAX_QSETS);
		nli->nli_qset_cnt = qset_cnt;

		j = 0;
		SLIST_FOREACH(qset, &llink->nll_qset_list, nqs_list) {
			struct nx_qset_info *nqi;

			nqi = &nli->nli_qset[j];
			nqi->nqi_id = qset->nqs_id;
			nqi->nqi_flags = qset->nqs_flags;
			nqi->nqi_num_rx_queues = qset->nqs_num_rx_queues;
			nqi->nqi_num_tx_queues = qset->nqs_num_tx_queues;
			j++;
		}
		ASSERT(j == qset_cnt);
		i++;
	}
	ASSERT(i == llink_cnt);
	/* restore the user buffer size before copying the snapshot out */
	sopt->sopt_valsize = user_len;
	err = sooptcopyout(sopt, nlir, len);
	if (err != 0) {
		SK_ERR("sooptcopyout failed: %d", err);
	}
done:
	lck_rw_unlock_shared(&nif->nif_llink_lock);
	if (nlir != NULL) {
		sk_free_data(nlir, len);
	}
	return err;
}
702 
/*
 * Handle a configuration request (NXCFG_CMD_*) issued against a netif
 * nexus.  Requires the PRIV_SKYWALK_REGISTER_NET_IF entitlement; the
 * user request buffer is accessed through a synthesized sockopt.
 */
int
nx_netif_prov_config(struct kern_nexus_domain_provider *nxdom_prov,
    struct kern_nexus *nx, struct nx_cfg_req *ncr, int sopt_dir,
    struct proc *p, kauth_cred_t cred)
{
#pragma unused(nxdom_prov)
	struct sockopt sopt;
	int err = 0;

	SK_LOCK_ASSERT_HELD();

	/* proceed only if the client possesses netif entitlement */
	if ((err = skywalk_priv_check_cred(p, cred,
	    PRIV_SKYWALK_REGISTER_NET_IF)) != 0) {
		goto done;
	}

	if (ncr->nc_req == USER_ADDR_NULL) {
		err = EINVAL;
		goto done;
	}

	/* to make life easier for handling copies */
	bzero(&sopt, sizeof(sopt));
	sopt.sopt_dir = sopt_dir;
	sopt.sopt_val = ncr->nc_req;
	sopt.sopt_valsize = ncr->nc_req_len;
	sopt.sopt_p = p;

	switch (ncr->nc_cmd) {
	case NXCFG_CMD_ATTACH:
	case NXCFG_CMD_DETACH: {
		struct nx_spec_req nsr;

		bzero(&nsr, sizeof(nsr));
		err = sooptcopyin(&sopt, &nsr, sizeof(nsr), sizeof(nsr));
		if (err != 0) {
			goto done;
		}

		/*
		 * Null-terminate in case this has an interface name;
		 * the union is already large enough for uuid_t.
		 */
		nsr.nsr_name[sizeof(nsr.nsr_name) - 1] = '\0';
		/* non-kernel callers may only set the public flag bits */
		if (p != kernproc) {
			nsr.nsr_flags &= NXSPECREQ_MASK;
		}

		err = nx_netif_ctl(nx, ncr->nc_cmd, &nsr, p);
		if (err != 0) {
			goto done;
		}

		/* XXX: can this copyout fail? return value is discarded */
		(void) sooptcopyout(&sopt, &nsr, sizeof(nsr));
		break;
	}
	case NXCFG_CMD_FLOW_ADD:
	case NXCFG_CMD_FLOW_DEL: {
		/* kernel-only fields must start where common fields end */
		_CASSERT(offsetof(struct nx_flow_req, _nfr_kernel_field_end) ==
		    offsetof(struct nx_flow_req, _nfr_common_field_end));
		struct nx_flow_req nfr;

		bzero(&nfr, sizeof(nfr));
		err = sooptcopyin(&sopt, &nfr, sizeof(nfr), sizeof(nfr));
		if (err != 0) {
			goto done;
		}

		err = nx_netif_ctl(nx, ncr->nc_cmd, &nfr, p);
		if (err != 0) {
			goto done;
		}

		/* XXX: can this copyout fail? return value is discarded */
		(void) sooptcopyout(&sopt, &nfr, sizeof(nfr));
		break;
	}
	case NXCFG_CMD_GET_LLINK_INFO: {
		err = nx_netif_get_llink_info(&sopt, nx);
		break;
	}
	default:
		err = EINVAL;
		goto done;
	}
done:
	SK_DF(err ? SK_VERB_ERROR : SK_VERB_NETIF,
	    "nexus 0x%llx (%s) cmd %d err %d", SK_KVA(nx),
	    NX_DOM_PROV(nx)->nxdom_prov_name, ncr->nc_cmd, err);
	return err;
}
796 
/*
 * Provider fini callback; only logs.  The pragma covers configurations
 * where SK_D() compiles to nothing, leaving nxdom_prov unreferenced.
 */
void
nx_netif_prov_fini(struct kern_nexus_domain_provider *nxdom_prov)
{
#pragma unused(nxdom_prov)
	SK_D("destroying %s", nxdom_prov->nxdom_prov_name);
}
803 
804 int
nx_netif_prov_nx_ctor(struct kern_nexus * nx)805 nx_netif_prov_nx_ctor(struct kern_nexus *nx)
806 {
807 	struct nx_netif *n;
808 	char name[64];
809 	int error;
810 
811 	SK_LOCK_ASSERT_HELD();
812 	ASSERT(nx->nx_arg == NULL);
813 
814 	SK_D("nexus 0x%llx (%s)", SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name);
815 
816 	nx->nx_arg = nx_netif_alloc(Z_WAITOK);
817 	n = NX_NETIF_PRIVATE(nx);
818 	if (NX_USER_CHANNEL_PROV(nx) &&
819 	    NX_PROV(nx)->nxprov_params->nxp_nexusadv_size != 0) {
820 		(void) snprintf(name, sizeof(name), "netif_%llu", nx->nx_id);
821 		error = nx_advisory_alloc(nx, name,
822 		    &NX_PROV(nx)->nxprov_region_params[SKMEM_REGION_NEXUSADV],
823 		    NEXUS_ADVISORY_TYPE_NETIF);
824 		if (error != 0) {
825 			nx_netif_free(n);
826 			return error;
827 		}
828 	}
829 	n->nif_nx = nx;
830 	SK_D("create new netif 0x%llx for nexus 0x%llx",
831 	    SK_KVA(NX_NETIF_PRIVATE(nx)), SK_KVA(nx));
832 	return 0;
833 }
834 
/*
 * Nexus destructor: detach any attached interface, release the nexus
 * advisory and nexus-binding state, and free the netif private state.
 * SK lock held; called from nx_detach().
 */
void
nx_netif_prov_nx_dtor(struct kern_nexus *nx)
{
	struct nx_netif *n = NX_NETIF_PRIVATE(nx);

	SK_LOCK_ASSERT_HELD();

	SK_D("nexus 0x%llx (%s) netif 0x%llx", SK_KVA(nx),
	    NX_DOM_PROV(nx)->nxdom_prov_name, SK_KVA(n));

	/*
	 * XXX
	 * detach should be done separately to be symmetrical with attach.
	 */
	nx_advisory_free(nx);
	/* a non-NULL dev-port adapter means an interface is still attached */
	if (nx_port_get_na(nx, NEXUS_PORT_NET_IF_DEV) != NULL) {
		/* we're called by nx_detach(), so this cannot fail */
		int err = nx_netif_ctl_detach(nx, NULL);
		VERIFY(err == 0);
	}
	if (n->nif_dev_nxb != NULL) {
		nxb_free(n->nif_dev_nxb);
		n->nif_dev_nxb = NULL;
	}
	if (n->nif_host_nxb != NULL) {
		nxb_free(n->nif_host_nxb);
		n->nif_host_nxb = NULL;
	}
	SK_DF(SK_VERB_NETIF, "marking netif 0x%llx as free", SK_KVA(n));
	nx_netif_free(n);
	nx->nx_arg = NULL;
}
867 
868 int
nx_netif_prov_nx_mem_info(struct kern_nexus * nx,struct kern_pbufpool ** tpp,struct kern_pbufpool ** rpp)869 nx_netif_prov_nx_mem_info(struct kern_nexus *nx, struct kern_pbufpool **tpp,
870     struct kern_pbufpool **rpp)
871 {
872 	ASSERT(nx->nx_tx_pp != NULL);
873 	ASSERT(nx->nx_rx_pp != NULL);
874 
875 	if (tpp != NULL) {
876 		*tpp = nx->nx_tx_pp;
877 	}
878 	if (rpp != NULL) {
879 		*rpp = nx->nx_rx_pp;
880 	}
881 
882 	return 0;
883 }
884 
885 static size_t
__netif_mib_get_stats(struct kern_nexus * nx,void * out,size_t len)886 __netif_mib_get_stats(struct kern_nexus *nx, void *out, size_t len)
887 {
888 	struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
889 	struct ifnet *ifp = nif->nif_ifp;
890 	struct sk_stats_net_if *sns = out;
891 	size_t actual_space = sizeof(struct sk_stats_net_if);
892 
893 	if (out != NULL && actual_space <= len) {
894 		uuid_copy(sns->sns_nx_uuid, nx->nx_uuid);
895 		if (ifp != NULL) {
896 			(void) strlcpy(sns->sns_if_name, if_name(ifp), IFNAMSIZ);
897 		}
898 		sns->sns_nifs = nif->nif_stats;
899 	}
900 
901 	return actual_space;
902 }
903 
/*
 * MIB helper: copy out one nx_llink_info record per logical link.
 * Two-pass sizing protocol: the byte count needed for all records is
 * always returned (0 when logical links are disabled), and the records
 * are only written when the caller's buffer is large enough.  The list
 * is walked under the shared llink lock so the count and contents stay
 * consistent.
 */
static size_t
__netif_mib_get_llinks(struct kern_nexus *nx, void *out, size_t len)
{
	struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
	struct nx_llink_info *nli_list = out;
	size_t actual_space = 0;
	if (NETIF_LLINK_ENABLED(nif)) {
		lck_rw_lock_shared(&nif->nif_llink_lock);
		actual_space += nif->nif_llink_cnt * sizeof(struct nx_llink_info);

		if (out != NULL && actual_space <= len) {
			struct netif_llink *llink;
			int i = 0;
			STAILQ_FOREACH(llink, &nif->nif_llink_list, nll_link) {
				struct nx_llink_info *nli;
				struct netif_qset *qset;
				uint16_t qset_cnt;
				int j;

				nli = &nli_list[i];
				uuid_copy(nli->nli_netif_uuid, nx->nx_uuid);
				nli->nli_link_id = llink->nll_link_id;
				nli->nli_link_id_internal = llink->nll_link_id_internal;
				nli->nli_state = llink->nll_state;
				nli->nli_flags = llink->nll_flags;

				/* nli_qset[] is a fixed-size array; count must fit */
				qset_cnt = llink->nll_qset_cnt;
				ASSERT(qset_cnt <= NETIF_LLINK_MAX_QSETS);
				nli->nli_qset_cnt = qset_cnt;

				j = 0;
				SLIST_FOREACH(qset, &llink->nll_qset_list, nqs_list) {
					struct nx_qset_info *nqi;

					nqi = &nli->nli_qset[j];
					nqi->nqi_id = qset->nqs_id;
					nqi->nqi_flags = qset->nqs_flags;
					nqi->nqi_num_rx_queues = qset->nqs_num_rx_queues;
					nqi->nqi_num_tx_queues = qset->nqs_num_tx_queues;
					j++;
				}
				/* list length must agree with the cached count */
				ASSERT(j == qset_cnt);
				i++;
			}
			ASSERT(i == nif->nif_llink_cnt);
		}
		lck_rw_unlock_shared(&nif->nif_llink_lock);
	}

	return actual_space;
}
955 
956 size_t
nx_netif_prov_nx_mib_get(struct kern_nexus * nx,struct nexus_mib_filter * filter,void * out,size_t len,struct proc * p)957 nx_netif_prov_nx_mib_get(struct kern_nexus *nx, struct nexus_mib_filter *filter,
958     void *out, size_t len, struct proc *p)
959 {
960 #pragma unused(p)
961 	size_t ret;
962 
963 	if ((filter->nmf_bitmap & NXMIB_FILTER_NX_UUID) &&
964 	    (uuid_compare(filter->nmf_nx_uuid, nx->nx_uuid)) != 0) {
965 		return 0;
966 	}
967 
968 	switch (filter->nmf_type) {
969 	case NXMIB_NETIF_STATS:
970 		ret = __netif_mib_get_stats(nx, out, len);
971 		break;
972 	case NXMIB_LLINK_LIST:
973 		ret = __netif_mib_get_llinks(nx, out, len);
974 		break;
975 	default:
976 		ret = 0;
977 		break;
978 	}
979 	return ret;
980 }
981 
/*
 * Domain callback: bind a client to a netif nexus port.  Accepts either a
 * specific port number or NEXUS_PORT_ANY, in which case a free port in
 * [NEXUS_PORT_NET_IF_CLIENT, max) is located and bound; the chosen port
 * is written back through *nx_port on success.
 */
static int
nx_netif_dom_bind_port(struct kern_nexus *nx, nexus_port_t *nx_port,
    struct nxbind *nxb, void *info)
{
	struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
	nexus_port_t first, last, port;
	int error;

	ASSERT(nx_port != NULL);
	ASSERT(nxb != NULL);

	port = *nx_port;

	/*
	 * If port is:
	 * != NEXUS_PORT_ANY: attempt to bind to the specified port
	 * == NEXUS_PORT_ANY: find an available port, bind to it, and
	 *                    return back the assigned port.
	 */
	first = NEXUS_PORT_NET_IF_CLIENT;
	last = NXDOM_MAX(NX_DOM(nx), ports);
	ASSERT(first <= last);

	NETIF_WLOCK(nif);

	if (__improbable(first == last)) {
		/* domain allows no client ports at all */
		error = ENOMEM;
	} else if (port != NEXUS_PORT_ANY) {
		error = nx_port_bind_info(nx, port, nxb, info);
		SK_DF(SK_VERB_NETIF, "port %d, bind err %d", port, error);
	} else {
		/* search [first, last); last is one past the valid range */
		error = nx_port_find(nx, first, last - 1, &port);
		ASSERT(error != 0 || (port >= first && port < last));
		if (error == 0) {
			error = nx_port_bind_info(nx, port, nxb, info);
			SK_DF(SK_VERB_NETIF, "found port %d, bind err %d",
			    port, error);
		}
	}
	NETIF_WUNLOCK(nif);

	/* the caller's requested port must not have been silently changed */
	ASSERT(*nx_port == NEXUS_PORT_ANY || *nx_port == port);
	if (error == 0) {
		*nx_port = port;
	}

	SK_DF(error ? SK_VERB_ERROR : SK_VERB_NETIF,
	    "+++ netif 0x%llx nx_port %d, total %u active %u (err %d)",
	    SK_KVA(nif), (int)*nx_port, NX_NETIF_MAXPORTS,
	    nx->nx_active_ports, error);

	return error;
}
1035 
1036 static int
nx_netif_dom_unbind_port(struct kern_nexus * nx,nexus_port_t nx_port)1037 nx_netif_dom_unbind_port(struct kern_nexus *nx, nexus_port_t nx_port)
1038 {
1039 	struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
1040 	int error = 0;
1041 
1042 	ASSERT(nx_port != NEXUS_PORT_ANY);
1043 
1044 	NETIF_WLOCK(nif);
1045 	error = nx_port_unbind(nx, nx_port);
1046 	NETIF_WUNLOCK(nif);
1047 
1048 	return error;
1049 }
1050 
/*
 * Domain callback: connect a channel to a netif nexus port.  Validates
 * the port/mode combination, tags the channel flags accordingly, then
 * performs the actual adapter connect (kernel-special or regular).  On
 * success a no-idle assertion is placed on the kernel slot descriptor
 * region so it survives channel defunct.
 */
static int
nx_netif_dom_connect(struct kern_nexus_domain_provider *nxdom_prov,
    struct kern_nexus *nx, struct kern_channel *ch, struct chreq *chr,
    struct kern_channel *ch0, struct nxbind *nxb, struct proc *p)
{
#pragma unused(nxdom_prov)	/* only read by the ASSERTs below */
	int err = 0;

	SK_LOCK_ASSERT_HELD();

	ASSERT(NX_DOM_PROV(nx) == nxdom_prov);
	ASSERT(nx->nx_prov->nxprov_params->nxp_type ==
	    nxdom_prov->nxdom_prov_dom->nxdom_type &&
	    nx->nx_prov->nxprov_params->nxp_type == NEXUS_TYPE_NET_IF);
	ASSERT(!(ch->ch_flags & CHANF_HOST));

	switch (chr->cr_port) {
	case NEXUS_PORT_NET_IF_DEV:
		/* host mode makes no sense on the device port */
		if (chr->cr_mode & CHMODE_HOST) {
			err = EINVAL;
			goto done;
		}
		break;

	case NEXUS_PORT_NET_IF_HOST:
		if (!(chr->cr_mode & CHMODE_HOST)) {
			/* kernel channels must ask for host mode explicitly */
			if (ch->ch_flags & CHANF_KERNEL) {
				err = EINVAL;
				goto done;
			}
			chr->cr_mode |= CHMODE_HOST;
		}
		/*
		 * This channel is exclusively opened to the host
		 * rings; don't notify the external provider.
		 */
		atomic_bitset_32(&ch->ch_flags, CHANF_HOST | CHANF_EXT_SKIP);
		break;

	default:
		/*
		 * This channel is shared between netif and user process;
		 * don't notify the external provider.
		 */
		atomic_bitset_32(&ch->ch_flags, CHANF_EXT_SKIP);
		break;
	}

	chr->cr_ring_set = RING_SET_DEFAULT;
	chr->cr_real_endpoint = chr->cr_endpoint = CH_ENDPOINT_NET_IF;
	(void) snprintf(chr->cr_name, sizeof(chr->cr_name), "netif:%llu:%.*s",
	    nx->nx_id, (int)nx->nx_prov->nxprov_params->nxp_namelen,
	    nx->nx_prov->nxprov_params->nxp_name);

	/* kernel-owned channels take the special connect path */
	if (ch->ch_flags & CHANF_KERNEL) {
		err = na_connect_spec(nx, ch, chr, p);
	} else {
		err = na_connect(nx, ch, chr, ch0, nxb, p);
	}

	if (err == 0) {
		/*
		 * Mark the kernel slot descriptor region as busy; this
		 * prevents it from being torn-down at channel defunct
		 * time, as the (external) nexus owner may be calling
		 * KPIs that require accessing the slots.
		 */
		skmem_arena_nexus_sd_set_noidle(
			skmem_arena_nexus(ch->ch_na->na_arena), 1);
	}

done:
	return err;
}
1125 
/*
 * Domain callback: disconnect a channel from a netif nexus.  Drops the
 * no-idle assertion taken at connect time, then routes to the matching
 * (kernel-special or regular) adapter disconnect.
 */
static void
nx_netif_dom_disconnect(struct kern_nexus_domain_provider *nxdom_prov,
    struct kern_nexus *nx, struct kern_channel *ch)
{
#pragma unused(nxdom_prov)	/* only read when SK_D() compiles to code */
	SK_LOCK_ASSERT_HELD();

	SK_D("channel 0x%llx -!- nexus 0x%llx (%s:\"%s\":%u:%d)", SK_KVA(ch),
	    SK_KVA(nx), nxdom_prov->nxdom_prov_name, ch->ch_na->na_name,
	    ch->ch_info->cinfo_nx_port, (int)ch->ch_info->cinfo_ch_ring_id);

	/*
	 * Release busy assertion held earlier in nx_netif_dom_connect();
	 * this allows for the final arena teardown to succeed.
	 */
	skmem_arena_nexus_sd_set_noidle(
		skmem_arena_nexus(ch->ch_na->na_arena), -1);

	if (ch->ch_flags & CHANF_KERNEL) {
		na_disconnect_spec(nx, ch);
	} else {
		na_disconnect(nx, ch);
	}
}
1150 
/*
 * Domain callback, first phase of channel defunct: mark the channel's
 * rings defunct while the channel lock is held.  Only user (non-kernel)
 * channels on netif adapters can be defunct.
 */
static void
nx_netif_dom_defunct(struct kern_nexus_domain_provider *nxdom_prov,
    struct kern_nexus *nx, struct kern_channel *ch, struct proc *p)
{
#pragma unused(nxdom_prov, nx)
	LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
	ASSERT(!(ch->ch_flags & CHANF_KERNEL));
	ASSERT(ch->ch_na->na_type == NA_NETIF_DEV ||
	    ch->ch_na->na_type == NA_NETIF_HOST ||
	    ch->ch_na->na_type == NA_NETIF_COMPAT_DEV ||
	    ch->ch_na->na_type == NA_NETIF_COMPAT_HOST);

	na_ch_rings_defunct(ch, p);
}
1165 
/*
 * Domain callback, second phase of channel defunct: finish defuncting the
 * adapter.  'locked' states whether the caller already holds the SK lock
 * and the channel lock; when it doesn't, the SK lock is taken here and the
 * channel lock must be free, mirroring the assertions on exit.
 */
static void
nx_netif_dom_defunct_finalize(struct kern_nexus_domain_provider *nxdom_prov,
    struct kern_nexus *nx, struct kern_channel *ch, boolean_t locked)
{
#pragma unused(nxdom_prov)	/* only read when SK_D() compiles to code */
	if (!locked) {
		SK_LOCK_ASSERT_NOTHELD();
		SK_LOCK();
		LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_NOTOWNED);
	} else {
		SK_LOCK_ASSERT_HELD();
		LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
	}

	ASSERT(ch->ch_na->na_type == NA_NETIF_DEV ||
	    ch->ch_na->na_type == NA_NETIF_HOST ||
	    ch->ch_na->na_type == NA_NETIF_COMPAT_DEV ||
	    ch->ch_na->na_type == NA_NETIF_COMPAT_HOST);

	na_defunct(nx, ch, ch->ch_na, locked);

	SK_D("%s(%d): ch 0x%llx -/- nx 0x%llx (%s:\"%s\":%u:%d)",
	    ch->ch_name, ch->ch_pid, SK_KVA(ch), SK_KVA(nx),
	    nxdom_prov->nxdom_prov_name, ch->ch_na->na_name,
	    ch->ch_info->cinfo_nx_port, (int)ch->ch_info->cinfo_ch_ring_id);

	/* leave the locks exactly as we found them */
	if (!locked) {
		LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_NOTOWNED);
		SK_UNLOCK();
	} else {
		LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
		SK_LOCK_ASSERT_HELD();
	}
}
1200 
/*
 * Allocate a zero-filled netif adapter from its dedicated zone.
 */
struct nexus_netif_adapter *
na_netif_alloc(zalloc_flags_t how)
{
	/*
	 * Callers cast between nexus_netif_adapter and its embedded
	 * nexus_adapter; that only works if nifna_up comes first.
	 */
	_CASSERT(offsetof(struct nexus_netif_adapter, nifna_up) == 0);

	return zalloc_flags(na_netif_zone, how | Z_ZERO);
}
1208 
/*
 * Return a netif adapter to its zone.  The adapter must be fully
 * quiesced: refcount at zero and mitigation state already torn down.
 */
void
na_netif_free(struct nexus_adapter *na)
{
	struct nexus_netif_adapter *nifna = (struct nexus_netif_adapter *)na;

	SK_LOCK_ASSERT_HELD();
	SK_DF(SK_VERB_MEM, "nifna 0x%llx FREE", SK_KVA(nifna));

	ASSERT(na->na_refcount == 0);
	ASSERT(nifna->nifna_tx_mit == NULL);
	ASSERT(nifna->nifna_rx_mit == NULL);
	/* scrub before freeing to catch stale use of the pointer */
	bzero(nifna, sizeof(*nifna));

	zfree(na_netif_zone, nifna);
}
1224 
/*
 * Process NXCFG_CMD_ATTACH: attach an ifnet to this netif nexus.  The
 * interface is identified either by name (resolved via ifunit_ref()) or,
 * for kernel callers only, by a direct ifnet pointer — never by UUID.
 * Native interfaces pair with the native provider and compat interfaces
 * with the compat provider; on success a fresh adapter UUID is generated
 * and returned through nsr->nsr_if_uuid.
 */
SK_NO_INLINE_ATTRIBUTE
static int
nx_netif_ctl_attach(struct kern_nexus *nx, struct nx_spec_req *nsr,
    struct proc *p)
{
	struct nx_netif *n = NX_NETIF_PRIVATE(nx);
	struct ifnet *ifp = NULL;
	boolean_t compat;
	int err = 0;

	SK_LOCK_ASSERT_HELD();

	ASSERT(NX_DOM(nx)->nxdom_type == NEXUS_TYPE_NET_IF);
	/* compat vs. native is derived from the domain provider's name */
	compat = (strcmp(NX_DOM_PROV(nx)->nxdom_prov_name,
	    NEXUS_PROVIDER_NET_IF_COMPAT) == 0);

	uuid_clear(nsr->nsr_if_uuid);
	/*
	 * The netif accepts either an interface name or a pointer to
	 * an ifnet, but never a UUID.
	 */
	if (nsr->nsr_flags & NXSPECREQ_UUID) {
		err = EINVAL;
		goto done;
	}
	if (nsr->nsr_flags & NXSPECREQ_IFP) {
		/* raw ifnet pointers are only trusted from the kernel */
		if (p != kernproc || (ifp = nsr->nsr_ifp) == NULL) {
			err = EINVAL;
			goto done;
		}
	} else if ((ifp = ifunit_ref(nsr->nsr_name)) == NULL) {
		err = ENXIO;
		goto done;
	}

	if ((compat && SKYWALK_NATIVE(ifp)) ||
	    (!compat && !SKYWALK_NATIVE(ifp))) {
		/* native driver for netif; non-native for netif_compat  */
		err = ENODEV;
	} else if (ifp->if_na != NULL || !uuid_is_null(n->nif_uuid)) {
		/* interface or nexus already has an attachment */
		err = EBUSY;
	} else {
		ASSERT(uuid_is_null(n->nif_uuid));
		/*
		 * Upon success, callee will hold its own ifnet iorefcnt
		 * as well as a retain count on the nexus adapter.
		 */
		if (compat) {
			err = nx_netif_compat_attach(nx, ifp);
		} else {
			err = nx_netif_attach(nx, ifp);
		}

		if (err == 0) {
			/* return the adapter UUID */
			uuid_generate_random(n->nif_uuid);
			uuid_copy(nsr->nsr_if_uuid, n->nif_uuid);
#if (DEVELOPMENT || DEBUG)
			/* per-interface sysctl subtree for debugging */
			skoid_create(&n->nif_skoid,
			    SKOID_SNODE(_kern_skywalk_netif), if_name(ifp),
			    CTLFLAG_RW);
#endif /* DEVELOPMENT || DEBUG */
		}
	}
done:
	/* drop I/O refcnt from ifunit_ref() */
	if (ifp != NULL && !(nsr->nsr_flags & NXSPECREQ_IFP)) {
		ifnet_decr_iorefcnt(ifp);
	}

#if SK_LOG
	uuid_string_t uuidstr, ifuuidstr;
	const char *nustr;
	if (nsr->nsr_flags & NXSPECREQ_UUID) {
		nustr = sk_uuid_unparse(nsr->nsr_uuid, uuidstr);
	} else if (nsr->nsr_flags & NXSPECREQ_IFP) {
		(void) snprintf((char *)uuidstr, sizeof(uuidstr), "0x%llx",
		    SK_KVA(nsr->nsr_ifp));
		nustr = uuidstr;
	} else {
		nustr = nsr->nsr_name;
	}
	SK_DF(err ? SK_VERB_ERROR : SK_VERB_NETIF,
	    "nexus 0x%llx (%s) name/uuid \"%s\" if_uuid %s flags 0x%x err %d",
	    SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name, nustr,
	    sk_uuid_unparse(nsr->nsr_if_uuid, ifuuidstr), nsr->nsr_flags, err);
#endif /* SK_LOG */

	return err;
}
1316 
/*
 * Process NXCFG_CMD_DETACH: detach the ifnet from this netif nexus.
 * 'nsr' identifies the attachment by UUID; a NULL nsr (destructor path)
 * detaches unconditionally.  Detaching fails with EBUSY while any
 * channel is still open, since the dlil input/output paths may be
 * running concurrently.
 */
SK_NO_INLINE_ATTRIBUTE
static int
nx_netif_ctl_detach(struct kern_nexus *nx, struct nx_spec_req *nsr)
{
	struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
	int err = 0;

	SK_LOCK_ASSERT_HELD();

	/*
	 * nsr is NULL when we're called from the destructor, and it
	 * implies that we'll detach whatever that is attached.
	 */
	if (nsr != NULL && uuid_is_null(nsr->nsr_if_uuid)) {
		err = EINVAL;
	} else if (nsr != NULL && uuid_compare(nsr->nsr_if_uuid,
	    nif->nif_uuid) != 0) {
		/* UUID doesn't identify the current attachment */
		err = ESRCH;
	} else if (nx_port_get_na(nx, NEXUS_PORT_NET_IF_DEV) == NULL) {
		/* nx_netif_ctl_attach() not yet done or already detached */
		err = ENXIO;
	} else if (nx->nx_ch_count != 0) {
		/*
		 * There's at least a channel opened; we can't
		 * yank the interface from underneath the nexus
		 * since our dlil input/output handler may be
		 * running now.  Bail out and come back here
		 * again when the nexus detaches.
		 */
		err = EBUSY;
	} else {
		struct ifnet *ifp;
		boolean_t suspended = FALSE;

		ifp = nif->nif_ifp;
		if (ifp == NULL) {
			err = EALREADY;
			goto done;
		}
		/*
		 * For regular kernel-attached interfaces, quiescing is handled by
		 * the ifnet detach thread, which calls dlil_quiesce_and_detach_nexuses().
		 * For interfaces created by skywalk test cases, flowswitch/netif nexuses
		 * are constructed on the fly and can also be torn down on the fly.
		 * dlil_quiesce_and_detach_nexuses() won't help here because any nexus
		 * can be detached while the interface is still attached.
		 */
		if (ifnet_datamov_suspend_if_needed(ifp)) {
			/* drop SK lock while draining to avoid deadlock */
			SK_UNLOCK();
			suspended = TRUE;
			ifnet_datamov_drain(ifp);
			SK_LOCK();
		}
		/* tear down netif sub-systems in reverse order of setup */
		nx_netif_agent_fini(nif);
		nx_netif_capabilities_fini(nif);
		nx_netif_flow_fini(nif);
		nx_netif_filter_fini(nif);
		nx_netif_llink_fini(nif);
		nx_netif_flags_fini(nif);

		uuid_clear(nif->nif_uuid);
		/* nx_netif_{compat_}attach() held both references */
		na_release_locked(nx_port_get_na(nx, NEXUS_PORT_NET_IF_DEV));
		na_release_locked(nx_port_get_na(nx, NEXUS_PORT_NET_IF_HOST));
		nx_port_free(nx, NEXUS_PORT_NET_IF_DEV);
		nx_port_free(nx, NEXUS_PORT_NET_IF_HOST);

		/* sever the ifnet <-> netif linkage */
		ifp->if_na_ops = NULL;
		ifp->if_na = NULL;
		nif->nif_ifp = NULL;
		nif->nif_netif_nxadv = NULL;
		SKYWALK_CLEAR_CAPABLE(ifp);
		if (suspended) {
			ifnet_datamov_resume(ifp);
		}

#if (DEVELOPMENT || DEBUG)
		skoid_destroy(&nif->nif_skoid);
#endif /* DEVELOPMENT || DEBUG */
	}
done:
#if SK_LOG
	if (nsr != NULL) {
		uuid_string_t ifuuidstr;
		SK_DF(err ? SK_VERB_ERROR : SK_VERB_NETIF,
		    "nexus 0x%llx (%s) if_uuid %s flags 0x%x err %d",
		    SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name,
		    sk_uuid_unparse(nsr->nsr_if_uuid, ifuuidstr),
		    nsr->nsr_flags, err);
	} else {
		SK_DF(err ? SK_VERB_ERROR : SK_VERB_NETIF,
		    "nexus 0x%llx (%s) err %d", SK_KVA(nx),
		    NX_DOM_PROV(nx)->nxdom_prov_name, err);
	}
#endif /* SK_LOG */

	return err;
}
1416 
1417 /*
1418  * XXX
1419  * These checks are copied from fsw.c
1420  * There are no tests exercising this code. Do we still need this?
1421  */
/*
 * Validate and sanitize a flow add/delete request.  Deletes only need a
 * non-null flow UUID; adds additionally require the caller to either act
 * on its own behalf (matching pid / executable UUID) or hold the
 * privileged-socket-delegate entitlement.  Returns 0 when the request
 * may proceed.
 */
SK_NO_INLINE_ATTRIBUTE
static int
nx_netif_ctl_flow_check(struct nx_netif *nif, nxcfg_cmd_t cmd,
    struct proc *p, struct nx_flow_req *req)
{
#pragma unused(nif)
	boolean_t need_check;
	int error;

	if (uuid_is_null(req->nfr_flow_uuid)) {
		return EINVAL;
	}
	/* drop any flag bits userspace isn't allowed to set */
	req->nfr_flags &= NXFLOWREQF_MASK;
	req->nfr_flowadv_idx = FLOWADV_IDX_NONE;

	if (cmd == NXCFG_CMD_FLOW_DEL) {
		return 0;
	}
	need_check = FALSE;
	if (req->nfr_epid != -1 && proc_pid(p) != req->nfr_epid) {
		/* acting on behalf of a different effective pid */
		need_check = TRUE;
	} else if (!uuid_is_null(req->nfr_euuid)) {
		uuid_t uuid;

		/* get the UUID of the issuing process */
		proc_getexecutableuuid(p, uuid, sizeof(uuid));

		/*
		 * If this is not issued by a process for its own
		 * executable UUID and if the process does not have
		 * the necessary privilege, reject the request.
		 * The logic is similar to so_set_effective_uuid().
		 */
		if (uuid_compare(req->nfr_euuid, uuid) != 0) {
			need_check = TRUE;
		}
	}
	if (need_check) {
		kauth_cred_t cred = kauth_cred_proc_ref(p);
		error = priv_check_cred(cred,
		    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0);
		kauth_cred_unref(&cred);
		if (error != 0) {
			return error;
		}
	}
	return 0;
}
1470 
1471 SK_NO_INLINE_ATTRIBUTE
1472 static int
nx_netif_ctl_flow_add(struct nx_netif * nif,struct proc * p,struct nx_flow_req * req)1473 nx_netif_ctl_flow_add(struct nx_netif *nif, struct proc *p,
1474     struct nx_flow_req *req)
1475 {
1476 	int err;
1477 
1478 	ASSERT(p != PROC_NULL);
1479 	err = nx_netif_ctl_flow_check(nif, NXCFG_CMD_FLOW_ADD, p, req);
1480 	if (err != 0) {
1481 		return err;
1482 	}
1483 
1484 	/* init kernel only fields */
1485 	nx_flow_req_internalize(req);
1486 	req->nfr_context = NULL;
1487 	req->nfr_flow_stats = NULL;
1488 	req->nfr_port_reservation = NULL;
1489 	req->nfr_pid = proc_pid(p);
1490 
1491 	err = nx_netif_netagent_flow_add(nif, req);
1492 	nx_flow_req_externalize(req);
1493 	return err;
1494 }
1495 
1496 SK_NO_INLINE_ATTRIBUTE
1497 static int
nx_netif_ctl_flow_del(struct nx_netif * nif,struct proc * p,struct nx_flow_req * req)1498 nx_netif_ctl_flow_del(struct nx_netif *nif, struct proc *p,
1499     struct nx_flow_req *req)
1500 {
1501 	int err;
1502 
1503 	err = nx_netif_ctl_flow_check(nif, NXCFG_CMD_FLOW_DEL, p, req);
1504 	if (err != 0) {
1505 		return err;
1506 	}
1507 
1508 	nx_flow_req_internalize(req);
1509 	req->nfr_pid = proc_pid(p);
1510 
1511 	err = nx_netif_netagent_flow_del(nif, req);
1512 	nx_flow_req_externalize(req);
1513 	return err;
1514 }
1515 
1516 SK_NO_INLINE_ATTRIBUTE
1517 static int
nx_netif_ctl(struct kern_nexus * nx,nxcfg_cmd_t nc_cmd,void * data,struct proc * p)1518 nx_netif_ctl(struct kern_nexus *nx, nxcfg_cmd_t nc_cmd, void *data,
1519     struct proc *p)
1520 {
1521 	struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
1522 	struct nx_spec_req *nsr = data;
1523 	struct nx_flow_req *nfr = data;
1524 	int error = 0;
1525 
1526 	SK_LOCK_ASSERT_HELD();
1527 
1528 	switch (nc_cmd) {
1529 	case NXCFG_CMD_ATTACH:
1530 		error = nx_netif_ctl_attach(nx, nsr, p);
1531 		break;
1532 
1533 	case NXCFG_CMD_DETACH:
1534 		error = nx_netif_ctl_detach(nx, nsr);
1535 		break;
1536 
1537 	case NXCFG_CMD_FLOW_ADD:
1538 		error = nx_netif_ctl_flow_add(nif, p, nfr);
1539 		break;
1540 
1541 	case NXCFG_CMD_FLOW_DEL:
1542 		error = nx_netif_ctl_flow_del(nif, p, nfr);
1543 		break;
1544 
1545 	default:
1546 		SK_ERR("invalid cmd %u", nc_cmd);
1547 		error = EINVAL;
1548 		break;
1549 	}
1550 	return error;
1551 }
1552 
1553 static void
nx_netif_llink_notify(struct kern_nexus * nx,struct netif_llink * llink,uint32_t flags)1554 nx_netif_llink_notify(struct kern_nexus *nx, struct netif_llink *llink,
1555     uint32_t flags)
1556 {
1557 #pragma unused(flags)
1558 	struct netif_qset *qset;
1559 
1560 	SLIST_FOREACH(qset, &llink->nll_qset_list, nqs_list) {
1561 		(void) nx_tx_qset_notify(nx, qset->nqs_ctx);
1562 	}
1563 }
1564 
1565 static void
nx_netif_llink_notify_all(struct kern_nexus * nx,uint32_t flags)1566 nx_netif_llink_notify_all(struct kern_nexus *nx, uint32_t flags)
1567 {
1568 	struct nx_netif *nif;
1569 	struct netif_llink *llink;
1570 
1571 	nif = NX_NETIF_PRIVATE(nx);
1572 
1573 	lck_rw_lock_shared(&nif->nif_llink_lock);
1574 	STAILQ_FOREACH(llink, &nif->nif_llink_list, nll_link) {
1575 		nx_netif_llink_notify(nx, llink, flags);
1576 	}
1577 	lck_rw_unlock_shared(&nif->nif_llink_lock);
1578 }
1579 
1580 /*
1581  * if_start() callback for native Skywalk interfaces, registered
1582  * at ifnet_allocate_extended() time, and invoked by the ifnet
1583  * starter thread.
1584  */
/*
 * Common doorbell path.  Only acts when a netif adapter exists and is
 * active (a channel is open above it); otherwise the event is counted
 * as a drop.  Logical-link providers get a per-qset notify, everyone
 * else a synchronous doorbell sync on TX ring 0.
 */
static void
nx_netif_doorbell_internal(struct ifnet *ifp, uint32_t flags)
{
	/* interface may be racing with detach; nothing to ring then */
	if (__improbable(ifp->if_na == NULL)) {
		return;
	}

	/*
	 * Do this only if the nexus adapter is active, i.e. a channel
	 * has been opened to it by the module above (flowswitch, etc.)
	 */
	struct nexus_adapter *hwna = &NA(ifp)->nifna_up;
	if (__probable(NA_IS_ACTIVE(hwna))) {
		struct kern_nexus *nx = hwna->na_nx;

		/* update our work timestamp */
		hwna->na_work_ts = _net_uptime;

		if (NX_LLINK_PROV(nx)) {
			nx_netif_llink_notify_all(nx, flags);
		} else {
			struct __kern_channel_ring *kring;

			/* for doorbell purposes, use TX ring 0 */
			kring = &hwna->na_tx_rings[0];

			/* Issue a synchronous TX doorbell on the netif device ring */
			kring->ckr_na_sync(kring, PROC_NULL,
			    (NA_SYNCF_NETIF_DOORBELL | NA_SYNCF_NETIF_IFSTART));
		}
	} else {
		struct netif_stats *nifs =
		    &NX_NETIF_PRIVATE(hwna->na_nx)->nif_stats;
		STATS_INC(nifs, NETIF_STATS_DROP_NA_INACTIVE);
	}
}
1621 
/*
 * if_start() entry point registered with the ifnet: ring the doorbell
 * on behalf of the host (networking stack) path.
 */
static void
nx_netif_doorbell(struct ifnet *ifp)
{
	nx_netif_doorbell_internal(ifp, NETIF_XMIT_FLAG_HOST);
}
1627 
1628 /*
1629  * TX sync callback, called from nx_netif_doorbell() where we'd expect to
1630  * perform synchronous TX doorbell to the driver, by invoking the driver's
1631  * doorbell callback directly in the same thread context.  It is also called
1632  * when the layer above performs a TX sync operation, where we might need
1633  * to do an asynchronous doorbell instead, by simply calling ifnet_start().
1634  */
/*
 * TX sync for the netif device ring; see the block comment above for the
 * doorbell semantics.  Returns ENXIO when the interface is detaching or
 * flow-controlled, otherwise the result of the reclaim sync.
 */
static int
nx_netif_na_txsync(struct __kern_channel_ring *kring, struct proc *p,
    uint32_t flags)
{
#pragma unused(p)	/* only read when SK_DF() compiles to code */
	struct ifnet *ifp = KRNA(kring)->na_ifp;
	boolean_t sync_only;
	int ret = 0;

	ASSERT(ifp != NULL);

	SK_DF(SK_VERB_NETIF | SK_VERB_SYNC | SK_VERB_TX,
	    "%s(%d) kr \"%s\" (0x%llx) krflags 0x%b ring %u flags 0%x",
	    sk_proc_name_address(p), sk_proc_pid(p), kring->ckr_name,
	    SK_KVA(kring), kring->ckr_flags, CKRF_BITS, kring->ckr_ring_id,
	    flags);

	if (__improbable(!IF_FULLY_ATTACHED(ifp))) {
		SK_ERR("kr 0x%llx ifp %s (0x%llx), interface not attached",
		    SK_KVA(kring), if_name(ifp), SK_KVA(ifp));
		return ENXIO;
	}

	/* don't push packets while the driver has flow control asserted */
	if (__improbable((ifp->if_start_flags & IFSF_FLOW_CONTROLLED) != 0)) {
		SK_DF(SK_VERB_SYNC | SK_VERB_TX, "kr 0x%llx ifp %s (0x%llx), "
		    "flow control ON", SK_KVA(kring), if_name(ifp),
		    SK_KVA(ifp));
		return ENXIO;
	}

	/* update our work timestamp */
	KRNA(kring)->na_work_ts = _net_uptime;

	/* direct user channels always take the sync-only path */
	sync_only = ((flags & NA_SYNCF_SYNC_ONLY) != 0) ||
	    !KR_KERNEL_ONLY(kring);
	/* regular sync (reclaim) */
	if ((flags & NA_SYNCF_NETIF) != 0 || __improbable(sync_only)) {
		ret = nx_sync_tx(kring, (flags & NA_SYNCF_FORCE_RECLAIM) ||
		    kring->ckr_pending_intr != 0);
		kring->ckr_pending_intr = 0;

		/* direct user channels do not need to use the doorbell */
		if (__improbable(sync_only)) {
			return ret;
		}
	}

	/*
	 * Doorbell call.  Here we do doorbell explicitly if the flag is
	 * set or implicitly if we're opened directly by a user channel.
	 * Synchronous vs. asynchronous depending on the context.
	 */
	if (__probable((flags & NA_SYNCF_NETIF_DOORBELL) != 0)) {
		if ((flags & NA_SYNCF_NETIF_IFSTART) != 0) {
			/* IFSTART and ASYNC are mutually exclusive */
			ASSERT(!(flags & NA_SYNCF_NETIF_IFSTART) ||
			    !(flags & NA_SYNCF_NETIF_ASYNC));
			nx_tx_doorbell(kring, (flags & NA_SYNCF_NETIF_ASYNC));
		} else {
			/* defer to the ifnet starter thread */
			ifnet_start(ifp);
		}
	}

	return ret;
}
1699 
/*
 * RX sync for the netif device ring: refresh the work timestamp, then
 * perform the receive sync, forcing a read when requested or when an
 * interrupt is pending.
 */
static int
nx_netif_na_rxsync(struct __kern_channel_ring *kring, struct proc *p,
    uint32_t flags)
{
#pragma unused(p)	/* only read when SK_DF() compiles to code */
	int ret;

	SK_DF(SK_VERB_NETIF | SK_VERB_SYNC | SK_VERB_RX,
	    "%s(%d) kr \"%s\" (0x%llx) krflags 0x%b ring %u flags 0%x",
	    sk_proc_name_address(p), sk_proc_pid(p), kring->ckr_name,
	    SK_KVA(kring), kring->ckr_flags, CKRF_BITS, kring->ckr_ring_id,
	    flags);

	ASSERT(kring->ckr_rhead <= kring->ckr_lim);

	/* update our work timestamp */
	KRNA(kring)->na_work_ts = _net_uptime;

	ret = nx_sync_rx(kring, (flags & NA_SYNCF_FORCE_READ) ||
	    kring->ckr_pending_intr != 0);
	kring->ckr_pending_intr = 0;

	return ret;
}
1724 
/*
 * Adapter destructor for the netif dev/host adapters: drops the ifnet
 * reference (or recovers the embryonic ifnet) and releases the netif
 * back-pointer.
 */
static void
nx_netif_na_dtor(struct nexus_adapter *na)
{
	struct ifnet *ifp;
	struct nexus_netif_adapter *nifna = NIFNA(na);

	SK_LOCK_ASSERT_HELD();
	ASSERT(na->na_type == NA_NETIF_DEV || na->na_type == NA_NETIF_HOST);

	SK_DF(SK_VERB_NETIF, "na \"%s\" (0x%llx)", na->na_name, SK_KVA(na));

	/*
	 * If the finalizer callback hasn't been called for whatever
	 * reasons, pick up the embryonic ifnet stored in na_private.
	 * Otherwise, release the I/O refcnt of a non-NULL na_ifp.
	 */
	if ((ifp = na->na_ifp) == NULL) {
		ifp = na->na_private;
		na->na_private = NULL;
	} else {
		ifnet_decr_iorefcnt(ifp);
		na->na_ifp = NULL;
	}

	if (nifna->nifna_netif != NULL) {
		nx_netif_release(nifna->nifna_netif);
		nifna->nifna_netif = NULL;
	}
	/* these adapters exist only for native Skywalk interfaces */
	ASSERT(SKYWALK_NATIVE(ifp));
}
1755 
1756 /*
1757  * Dispatch rx/tx interrupts to the channel rings.
1758  *
1759  * The 'notify' routine depends on what the ring is attached to.
1760  * - for a channel file descriptor, do an event wakeup on the individual
1761  *   waitqueue, plus one on the global one if needed (see na_notify)
1762  * - for a device port connected to a FlowSwitch, call the proper
1763  *   forwarding routine; see nx_fsw_tx_hwna_notify()
1764  *   or nx_fsw_rx_hwna_notify().
1765  */
/*
 * Common interrupt delivery: bump the pending-interrupt count, then hand
 * the event to the notify routine captured at activation time (or the
 * flowswitch's replacement).  Errors are folded into per-direction
 * statistics; EBUSY/EAGAIN are expected back-pressure signals.
 */
int
nx_netif_common_intr(struct __kern_channel_ring *kring, struct proc *p,
    uint32_t flags, uint32_t *work_done)
{
	struct netif_stats *nifs =
	    &NX_NETIF_PRIVATE(KRNA(kring)->na_nx)->nif_stats;
	int (*notify)(struct __kern_channel_ring *kring,
	    struct proc *, uint32_t flags);
	int ret;

	KDBG((SK_KTRACE_NETIF_COMMON_INTR | DBG_FUNC_START), SK_KVA(kring));

	SK_DF(SK_VERB_NETIF | SK_VERB_INTR |
	    ((kring->ckr_tx == NR_RX) ? SK_VERB_RX : SK_VERB_TX),
	    "na \"%s\" (0x%llx) kr \"%s\" (0x%llx) krflags 0x%b",
	    KRNA(kring)->na_name, SK_KVA(KRNA(kring)), kring->ckr_name,
	    SK_KVA(kring), kring->ckr_flags, CKRF_BITS);

	/* update our work timestamp */
	KRNA(kring)->na_work_ts = _net_uptime;

	kring->ckr_pending_intr++;
	if (work_done != NULL) {
		*work_done = 1; /* do not fire again */
	}
	/*
	 * We can't be calling ckr_na_notify here since we could already be
	 * intercepting it, else we'd end up recursively calling ourselves.
	 * Use the original na_notify callback saved during na_activate, or in
	 * the case when the module above us is the flowswitch, the notify
	 * routine that it has installed in place of our original one.
	 */
	if (__probable(!KR_DROP(kring) &&
	    (notify = kring->ckr_netif_notify) != NULL)) {
		ret = notify(kring, p, flags);
	} else {
		/*
		 * If the ring is in drop mode, pretend as if it's busy.
		 * This allows the mitigation thread to pause for a while
		 * before attempting again.
		 */
		ret = EBUSY;
	}
	if (__improbable(ret != 0)) {
		/* account the failure to the appropriate direction */
		switch (kring->ckr_tx) {
		case NR_RX:
			if (ret == EBUSY) {
				STATS_INC(nifs, NETIF_STATS_RX_IRQ_BUSY);
			} else if (ret == EAGAIN) {
				STATS_INC(nifs, NETIF_STATS_RX_IRQ_AGAIN);
			} else {
				STATS_INC(nifs, NETIF_STATS_RX_IRQ_ERR);
			}
			break;

		case NR_TX:
			if (ret == EBUSY) {
				STATS_INC(nifs, NETIF_STATS_TX_IRQ_BUSY);
			} else if (ret == EAGAIN) {
				STATS_INC(nifs, NETIF_STATS_TX_IRQ_AGAIN);
			} else {
				STATS_INC(nifs, NETIF_STATS_TX_IRQ_ERR);
			}
			break;

		default:
			break;
		}
	}

	KDBG((SK_KTRACE_NETIF_COMMON_INTR | DBG_FUNC_END), SK_KVA(kring), ret);

	return ret;
}
1840 
1841 static int
nx_netif_na_notify_tx(struct __kern_channel_ring * kring,struct proc * p,uint32_t flags)1842 nx_netif_na_notify_tx(struct __kern_channel_ring *kring, struct proc *p,
1843     uint32_t flags)
1844 {
1845 	return nx_netif_mit_tx_intr(kring, p, flags, NULL);
1846 }
1847 
1848 static int
nx_netif_na_notify_rx(struct __kern_channel_ring * kring,struct proc * p,uint32_t flags)1849 nx_netif_na_notify_rx(struct __kern_channel_ring *kring, struct proc *p,
1850     uint32_t flags)
1851 {
1852 	int ret;
1853 
1854 	/*
1855 	 * In the event the mitigation thread is disabled, protect
1856 	 * against recursion by detecting if we're already in the
1857 	 * context of an RX notify.  IOSkywalkFamily may invoke the
1858 	 * notify callback as part of its RX sync callback.
1859 	 */
1860 	if (__probable(!sk_is_rx_notify_protected())) {
1861 		sk_protect_t protect;
1862 		uint32_t work_done;
1863 
1864 		protect = sk_rx_notify_protect();
1865 		ret = nx_netif_mit_rx_intr(kring, p, flags, &work_done);
1866 		sk_sync_unprotect(protect);
1867 	} else {
1868 		ret = EAGAIN;
1869 	}
1870 
1871 	return ret;
1872 }
1873 
/*
 * Decide whether TX/RX interrupt mitigation should be enabled for this
 * adapter, and if so whether the "simple" variant applies.  The result
 * is driven by the sk_netif_tx_mit/sk_netif_rx_mit boot-args and, for
 * RX auto mode, by the interface type.  All four output flags are
 * always written.
 */
void
nx_netif_mit_config(struct nexus_netif_adapter *nifna,
    boolean_t *tx_mit, boolean_t *tx_mit_simple,
    boolean_t *rx_mit, boolean_t *rx_mit_simple)
{
	struct nx_netif *nif = nifna->nifna_netif;

	/*
	 * TX mitigation is disabled by default, but can be
	 * overridden via "sk_netif_tx_mit=N" boot-arg, where
	 * N is one of SK_NETIF_MIT_FORCE_* values.
	 */
	*tx_mit = *tx_mit_simple = FALSE;
	switch (sk_netif_tx_mit) {
	case SK_NETIF_MIT_FORCE_SIMPLE:
		*tx_mit_simple = TRUE;
		OS_FALLTHROUGH;
	case SK_NETIF_MIT_FORCE_ADVANCED:
		*tx_mit = TRUE;
		break;
	case SK_NETIF_MIT_FORCE_OFF:
	case SK_NETIF_MIT_AUTO:
		ASSERT(*tx_mit == FALSE);
		break;
	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	/*
	 * RX mitigation is enabled by default only for BSD-style
	 * virtual network interfaces, but can be overridden
	 * via "sk_netif_rx_mit=N" boot-arg, where N is one of
	 * SK_NETIF_MIT_FORCE_* values.
	 */
	*rx_mit = *rx_mit_simple = FALSE;
	switch (sk_netif_rx_mit) {
	case SK_NETIF_MIT_FORCE_OFF:
		ASSERT(*rx_mit == FALSE);
		break;
	case SK_NETIF_MIT_FORCE_SIMPLE:
		*rx_mit_simple = TRUE;
		OS_FALLTHROUGH;
	case SK_NETIF_MIT_FORCE_ADVANCED:
		*rx_mit = TRUE;
		break;
	case SK_NETIF_MIT_AUTO:
		/* auto mode always uses the simple variant when enabled */
		*rx_mit_simple = TRUE;
#if !XNU_TARGET_OS_OSX
		/*
		 * On non-macOS platforms, enable RX mitigation
		 * thread only for BSD-style virtual (and regular)
		 * interfaces, since otherwise we may run out of
		 * stack when subjected to IPsec processing, etc.
		 */
		*rx_mit = (NX_PROV(nifna->nifna_up.na_nx)->nxprov_flags &
		    NXPROVF_VIRTUAL_DEVICE) && !NETIF_IS_LOW_LATENCY(nif);
#else /* XNU_TARGET_OS_OSX */
		/*
		 * On macOS platform, enable RX mitigation on all but
		 * low-latency interfaces, since we could potentially
		 * have filter providers, etc.  Ideally this should
		 * be detected and dealt with dynamically.
		 */
		*rx_mit = !NETIF_IS_LOW_LATENCY(nif);
#endif /* XNU_TARGET_OS_OSX */
		break;
	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}
}
1948 
/*
 * Activate/deactivate the device netif adapter.
 *
 * ON: allocates per-ring mitigation state (as decided by
 * nx_netif_mit_config()), intercepts each ring's na_notify callback
 * (saving the original in ckr_netif_notify), enables filters/flows,
 * marks the adapter active, and steers the ifnet start handler to
 * nx_netif_doorbell.
 *
 * OFF: undoes all of the above in reverse order, draining any
 * in-flight if_start activity before resetting the start handler.
 *
 * Returns 0 on success, ENOMEM if mitigation state cannot be allocated.
 */
static int
nx_netif_na_activate(struct nexus_adapter *na, na_activate_mode_t mode)
{
	struct nexus_netif_adapter *nifna = (struct nexus_netif_adapter *)na;
	boolean_t tx_mit, rx_mit, tx_mit_simple, rx_mit_simple;
	struct nx_netif *nif = nifna->nifna_netif;
	struct ifnet *ifp = na->na_ifp;
	int error = 0;
	uint32_t r;

	ASSERT(na->na_type == NA_NETIF_DEV);
	ASSERT(!(na->na_flags & NAF_HOST_ONLY));

	SK_DF(SK_VERB_NETIF, "na \"%s\" (0x%llx) %s [%s]", na->na_name,
	    SK_KVA(na), ifp->if_xname, na_activate_mode2str(mode));

	switch (mode) {
	case NA_ACTIVATE_MODE_ON:
		ASSERT(SKYWALK_CAPABLE(ifp));

		nx_netif_mit_config(nifna, &tx_mit, &tx_mit_simple,
		    &rx_mit, &rx_mit_simple);

		/*
		 * Init the mitigation support on all the dev TX rings.
		 */
		if (tx_mit) {
			nifna->nifna_tx_mit =
			    skn_alloc_type_array(tx_on, struct nx_netif_mit,
			    na_get_nrings(na, NR_TX), Z_WAITOK,
			    skmem_tag_netif_mit);
			if (nifna->nifna_tx_mit == NULL) {
				SK_ERR("TX mitigation allocation failed");
				error = ENOMEM;
				goto out;
			}
		} else {
			ASSERT(nifna->nifna_tx_mit == NULL);
		}

		/*
		 * Init the mitigation support on all the dev RX rings.
		 */
		if (rx_mit) {
			nifna->nifna_rx_mit =
			    skn_alloc_type_array(rx_on, struct nx_netif_mit,
			    na_get_nrings(na, NR_RX), Z_WAITOK,
			    skmem_tag_netif_mit);
			if (nifna->nifna_rx_mit == NULL) {
				SK_ERR("RX mitigation allocation failed");
				/* unwind the TX allocation made above */
				if (nifna->nifna_tx_mit != NULL) {
					skn_free_type_array(rx_fail,
					    struct nx_netif_mit,
					    na_get_nrings(na, NR_TX),
					    nifna->nifna_tx_mit);
					nifna->nifna_tx_mit = NULL;
				}
				error = ENOMEM;
				goto out;
			}
		} else {
			ASSERT(nifna->nifna_rx_mit == NULL);
		}

		/* intercept na_notify callback on the TX rings */
		for (r = 0; r < na_get_nrings(na, NR_TX); r++) {
			na->na_tx_rings[r].ckr_netif_notify =
			    na->na_tx_rings[r].ckr_na_notify;
			na->na_tx_rings[r].ckr_na_notify =
			    nx_netif_na_notify_tx;
			if (nifna->nifna_tx_mit != NULL) {
				nx_netif_mit_init(nif, ifp,
				    &nifna->nifna_tx_mit[r],
				    &na->na_tx_rings[r], tx_mit_simple);
			}
		}

		/* intercept na_notify callback on the RX rings */
		for (r = 0; r < na_get_nrings(na, NR_RX); r++) {
			na->na_rx_rings[r].ckr_netif_notify =
			    na->na_rx_rings[r].ckr_na_notify;
			na->na_rx_rings[r].ckr_na_notify =
			    nx_netif_na_notify_rx;
			if (nifna->nifna_rx_mit != NULL) {
				nx_netif_mit_init(nif, ifp,
				    &nifna->nifna_rx_mit[r],
				    &na->na_rx_rings[r], rx_mit_simple);
			}
		}
		nx_netif_filter_enable(nif);
		nx_netif_flow_enable(nif);
		atomic_bitset_32(&na->na_flags, NAF_ACTIVE);

		/* steer all start requests to netif; this must not fail */
		lck_mtx_lock(&ifp->if_start_lock);
		error = ifnet_set_start_handler(ifp, nx_netif_doorbell);
		VERIFY(error == 0);
		lck_mtx_unlock(&ifp->if_start_lock);
		break;

	case NA_ACTIVATE_MODE_DEFUNCT:
		ASSERT(SKYWALK_CAPABLE(ifp));
		break;

	case NA_ACTIVATE_MODE_OFF:
		/*
		 * Note that here we cannot assert SKYWALK_CAPABLE()
		 * as we're called in the destructor path.
		 */
		atomic_bitclear_32(&na->na_flags, NAF_ACTIVE);
		nx_netif_flow_disable(nif);
		nx_netif_filter_disable(nif);

		/*
		 * Here we may block while holding sk_lock, but because
		 * we've cleared NAF_ACTIVE above, kern_channel_tx_refill()
		 * should immediately return.  A better approach would be
		 * to drop sk_lock and add a monitor for this routine.
		 */
		lck_mtx_lock(&ifp->if_start_lock);
		while (ifp->if_start_active != 0) {
			++ifp->if_start_waiters;
			(void) msleep(&ifp->if_start_waiters,
			    &ifp->if_start_lock, (PZERO - 1),
			    na->na_name, NULL);
		}
		/* steer all start requests to default handler */
		ifnet_reset_start_handler(ifp);
		lck_mtx_unlock(&ifp->if_start_lock);

		/* reset all TX notify callbacks */
		for (r = 0; r < na_get_nrings(na, NR_TX); r++) {
			na->na_tx_rings[r].ckr_na_notify =
			    na->na_tx_rings[r].ckr_netif_notify;
			na->na_tx_rings[r].ckr_netif_notify = NULL;
			if (nifna->nifna_tx_mit != NULL) {
				na->na_tx_rings[r].ckr_netif_mit_stats = NULL;
				nx_netif_mit_cleanup(&nifna->nifna_tx_mit[r]);
			}
		}

		if (nifna->nifna_tx_mit != NULL) {
			skn_free_type_array(tx_off, struct nx_netif_mit,
			    na_get_nrings(na, NR_TX), nifna->nifna_tx_mit);
			nifna->nifna_tx_mit = NULL;
		}

		/* reset all RX notify callbacks */
		for (r = 0; r < na_get_nrings(na, NR_RX); r++) {
			na->na_rx_rings[r].ckr_na_notify =
			    na->na_rx_rings[r].ckr_netif_notify;
			na->na_rx_rings[r].ckr_netif_notify = NULL;
			if (nifna->nifna_rx_mit != NULL) {
				na->na_rx_rings[r].ckr_netif_mit_stats = NULL;
				nx_netif_mit_cleanup(&nifna->nifna_rx_mit[r]);
			}
		}
		if (nifna->nifna_rx_mit != NULL) {
			skn_free_type_array(rx_off, struct nx_netif_mit,
			    na_get_nrings(na, NR_RX), nifna->nifna_rx_mit);
			nifna->nifna_rx_mit = NULL;
		}
		break;

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}
out:
	return error;
}
2121 
/*
 * Attach a native netif nexus instance to an ifnet: allocates and
 * initializes the device and host netif adapters, sets up their
 * memory arenas and packet-copy routines, binds them to the
 * NEXUS_PORT_NET_IF_DEV/HOST ports, and marks the interface
 * Skywalk-capable.  On any failure the partially-constructed state
 * is fully unwound before returning.
 *
 * Returns 0 on success; ENXIO if the interface is neither attached
 * nor embryonic, or an error from the nexus memory provider.
 */
SK_NO_INLINE_ATTRIBUTE
static int
nx_netif_attach(struct kern_nexus *nx, struct ifnet *ifp)
__attribute__((optnone))
{
	struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
	struct nxprov_params *nxp = NX_PROV(nx)->nxprov_params;
	struct nexus_netif_adapter *devnifna = NULL;
	struct nexus_netif_adapter *hostnifna = NULL;
	struct nexus_adapter *devna = NULL;
	struct nexus_adapter *hostna = NULL;
	boolean_t embryonic = FALSE;
	int retval = 0;
	uint32_t na_flags;

	SK_LOCK_ASSERT_HELD();
	ASSERT(SKYWALK_NATIVE(ifp));
	ASSERT(!SKYWALK_CAPABLE(ifp));
	ASSERT(ifp->if_na == NULL);
	ASSERT(ifp->if_na_ops == NULL);

	devnifna = na_netif_alloc(Z_WAITOK);
	hostnifna = na_netif_alloc(Z_WAITOK);

	/*
	 * We can be called for two different interface states:
	 *
	 * Fully attached: get an io ref count; upon success, this
	 * holds a reference to the ifnet for the ifp pointer stored
	 * in 'na_ifp' down below for both adapters.
	 *
	 * Embryonic: temporary hold the ifnet in na_private, which
	 * upon a successful ifnet_attach(), will be moved over to
	 * the 'na_ifp' with an io ref count held.
	 *
	 * The ifnet in 'na_ifp' will be released by na_release_locked().
	 */
	if (!ifnet_is_attached(ifp, 1)) {
		if (!(ifp->if_refflags & IFRF_EMBRYONIC)) {
			/* ifp is NULLed so the err path won't drop a ref */
			ifp = NULL;
			retval = ENXIO;
			goto err;
		}
		embryonic = TRUE;
	}

	/* initialize the device netif adapter */
	devnifna->nifna_netif = nif;
	nx_netif_retain(nif);
	devna = &devnifna->nifna_up;
	devna->na_type = NA_NETIF_DEV;
	devna->na_free = na_netif_free;
	(void) strncpy(devna->na_name, ifp->if_xname, sizeof(devna->na_name) - 1);
	devna->na_name[sizeof(devna->na_name) - 1] = '\0';
	uuid_generate_random(devna->na_uuid);
	if (embryonic) {
		/*
		 * We will move this over to na_ifp once
		 * the interface is fully attached.
		 */
		devna->na_private = ifp;
		ASSERT(devna->na_ifp == NULL);
	} else {
		ASSERT(devna->na_private == NULL);
		/* use I/O refcnt from ifnet_is_attached() */
		devna->na_ifp = ifp;
	}
	devna->na_activate = nx_netif_na_activate;
	devna->na_channel_event_notify = nx_netif_na_channel_event_notify;
	devna->na_txsync = nx_netif_na_txsync;
	devna->na_rxsync = nx_netif_na_rxsync;
	devna->na_dtor = nx_netif_na_dtor;
	devna->na_krings_create = nx_netif_dev_krings_create;
	devna->na_krings_delete = nx_netif_dev_krings_delete;
	devna->na_special = nx_netif_na_special;

	na_flags = NAF_NATIVE;
	if (NX_PROV(nx)->nxprov_flags & NXPROVF_VIRTUAL_DEVICE) {
		na_flags |= NAF_VIRTUAL_DEVICE;
	}
	if (NX_LLINK_PROV(nx)) {
		/*
		 * while operating in logical link mode, we don't need to
		 * create backing memory regions for the rings as they are
		 * not used.
		 */
		na_flags |= NAF_MEM_NO_INIT;
	}
	atomic_bitset_32(&devna->na_flags, na_flags);
	/* na_stats_type is declared const; cast away to initialize it */
	*(nexus_stats_type_t *)(uintptr_t)&devna->na_stats_type =
	    NEXUS_STATS_TYPE_INVALID;

	na_set_nrings(devna, NR_TX, nxp->nxp_tx_rings);
	na_set_nrings(devna, NR_RX, nxp->nxp_rx_rings);
	na_set_nslots(devna, NR_TX, nxp->nxp_tx_slots);
	na_set_nslots(devna, NR_RX, nxp->nxp_rx_slots);
	/*
	 * Verify upper bounds; the parameters must have already been
	 * validated by nxdom_prov_params() by the time we get here.
	 */
	ASSERT(na_get_nrings(devna, NR_TX) <= NX_DOM(nx)->nxdom_tx_rings.nb_max);
	ASSERT(na_get_nrings(devna, NR_RX) <= NX_DOM(nx)->nxdom_rx_rings.nb_max);
	ASSERT(na_get_nslots(devna, NR_TX) <= NX_DOM(nx)->nxdom_tx_slots.nb_max);
	ASSERT(na_get_nslots(devna, NR_RX) <= NX_DOM(nx)->nxdom_rx_slots.nb_max);

	na_attach_common(devna, nx, &nx_netif_prov_s);

	if ((retval = NX_DOM_PROV(nx)->nxdom_prov_mem_new(NX_DOM_PROV(nx),
	    nx, devna)) != 0) {
		ASSERT(devna->na_arena == NULL);
		goto err;
	}
	ASSERT(devna->na_arena != NULL);

	*(uint32_t *)(uintptr_t)&devna->na_flowadv_max = nxp->nxp_flowadv_max;
	ASSERT(devna->na_flowadv_max == 0 ||
	    skmem_arena_nexus(devna->na_arena)->arn_flowadv_obj != NULL);

	/* setup packet copy routines */
	if (skmem_arena_nexus(devna->na_arena)->arn_rx_pp->pp_max_frags > 1) {
		nif->nif_pkt_copy_from_mbuf = pkt_copy_multi_buflet_from_mbuf;
		nif->nif_pkt_copy_to_mbuf = pkt_copy_multi_buflet_to_mbuf;
		nif->nif_pkt_copy_from_pkt = pkt_copy_multi_buflet_from_pkt;
	} else {
		nif->nif_pkt_copy_from_mbuf = pkt_copy_from_mbuf;
		nif->nif_pkt_copy_to_mbuf = pkt_copy_to_mbuf;
		nif->nif_pkt_copy_from_pkt = pkt_copy_from_pkt;
	}

	/* initialize the host netif adapter */
	hostnifna->nifna_netif = nif;
	nx_netif_retain(nif);
	hostna = &hostnifna->nifna_up;
	/* host adapter is named after the device adapter plus a "^" suffix */
	(void) snprintf(hostna->na_name, sizeof(hostna->na_name),
	    "%s^", devna->na_name);
	uuid_generate_random(hostna->na_uuid);
	if (embryonic) {
		/*
		 * We will move this over to na_ifp once
		 * the interface is fully attached.
		 */
		hostna->na_private = ifp;
		ASSERT(hostna->na_ifp == NULL);
	} else {
		ASSERT(hostna->na_private == NULL);
		/* the host adapter holds its own I/O refcnt on the ifnet */
		hostna->na_ifp = devna->na_ifp;
		ifnet_incr_iorefcnt(hostna->na_ifp);
	}
	hostna->na_type = NA_NETIF_HOST;
	hostna->na_free = na_netif_free;
	hostna->na_activate = nx_netif_host_na_activate;
	hostna->na_txsync = nx_netif_host_na_txsync;
	hostna->na_rxsync = nx_netif_host_na_rxsync;
	hostna->na_dtor = nx_netif_na_dtor;
	hostna->na_krings_create = nx_netif_host_krings_create;
	hostna->na_krings_delete = nx_netif_host_krings_delete;
	hostna->na_special = nx_netif_host_na_special;

	na_flags = NAF_HOST_ONLY | NAF_NATIVE;
	if (NX_LLINK_PROV(nx)) {
		/*
		 * while operating in logical link mode, we don't need to
		 * create backing memory regions for the rings as they are
		 * not used.
		 */
		na_flags |= NAF_MEM_NO_INIT;
	}
	atomic_bitset_32(&hostna->na_flags, na_flags);
	*(nexus_stats_type_t *)(uintptr_t)&hostna->na_stats_type =
	    NEXUS_STATS_TYPE_INVALID;

	/* the host adapter always uses a single TX and RX ring */
	na_set_nrings(hostna, NR_TX, 1);
	na_set_nrings(hostna, NR_RX, 1);
	na_set_nslots(hostna, NR_TX, nxp->nxp_tx_slots);
	na_set_nslots(hostna, NR_RX, nxp->nxp_rx_slots);

	na_attach_common(hostna, nx, &nx_netif_prov_s);

	if ((retval = NX_DOM_PROV(nx)->nxdom_prov_mem_new(NX_DOM_PROV(nx),
	    nx, hostna)) != 0) {
		ASSERT(hostna->na_arena == NULL);
		goto err;
	}
	ASSERT(hostna->na_arena != NULL);

	*(uint32_t *)(uintptr_t)&hostna->na_flowadv_max = nxp->nxp_flowadv_max;
	ASSERT(hostna->na_flowadv_max == 0 ||
	    skmem_arena_nexus(hostna->na_arena)->arn_flowadv_obj != NULL);

	/* adjust the classq packet drop limit */
	if (embryonic) {
		uint32_t drop_lim;
		struct kern_pbufpool_memory_info pp_info;

		retval = kern_pbufpool_get_memory_info(nx->nx_tx_pp, &pp_info);
		VERIFY(retval == 0);

		/* set the drop limit as 80% of size of packet pool */
		drop_lim = (pp_info.kpm_packets * 4) / 5;
		VERIFY(drop_lim != 0);
		IFCQ_PKT_DROP_LIMIT(ifp->if_snd) = drop_lim;
	}

	/* these will be undone by destructor  */
	ifp->if_na_ops = &na_netif_ops;
	ifp->if_na = devnifna;
	na_retain_locked(devna);
	na_retain_locked(hostna);

	SKYWALK_SET_CAPABLE(ifp);

	NETIF_WLOCK(nif);
	nif->nif_ifp = ifp;
	nif->nif_netif_nxadv = nx->nx_adv.netif_nxv_adv;
	/* bind both adapters to their well-known netif ports */
	retval = nx_port_alloc(nx, NEXUS_PORT_NET_IF_DEV, NULL, &devna,
	    kernproc);
	ASSERT(retval == 0);
	retval = nx_port_alloc(nx, NEXUS_PORT_NET_IF_HOST, NULL, &hostna,
	    kernproc);
	ASSERT(retval == 0);
	NETIF_WUNLOCK(nif);

#if SK_LOG
	uuid_string_t uuidstr;
	SK_DF(SK_VERB_NETIF, "devna: \"%s\"", devna->na_name);
	SK_DF(SK_VERB_NETIF, "  UUID:        %s",
	    sk_uuid_unparse(devna->na_uuid, uuidstr));
	SK_DF(SK_VERB_NETIF, "  nx:          0x%llx (\"%s\":\"%s\")",
	    SK_KVA(devna->na_nx), NX_DOM(devna->na_nx)->nxdom_name,
	    NX_DOM_PROV(devna->na_nx)->nxdom_prov_name);
	SK_DF(SK_VERB_NETIF, "  flags:       0x%b", devna->na_flags, NAF_BITS);
	SK_DF(SK_VERB_NETIF, "  flowadv_max: %u", devna->na_flowadv_max);
	SK_DF(SK_VERB_NETIF, "  rings:       tx %u rx %u",
	    na_get_nrings(devna, NR_TX), na_get_nrings(devna, NR_RX));
	SK_DF(SK_VERB_NETIF, "  slots:       tx %u rx %u",
	    na_get_nslots(devna, NR_TX), na_get_nslots(devna, NR_RX));
#if CONFIG_NEXUS_USER_PIPE
	SK_DF(SK_VERB_NETIF, "  next_pipe:   %u", devna->na_next_pipe);
	SK_DF(SK_VERB_NETIF, "  max_pipes:   %u", devna->na_max_pipes);
#endif /* CONFIG_NEXUS_USER_PIPE */
	SK_DF(SK_VERB_NETIF, "  ifp:         0x%llx %s [ioref %u]",
	    SK_KVA(ifp), ifp->if_xname, ifp->if_refio);
	SK_DF(SK_VERB_NETIF, "hostna: \"%s\"", hostna->na_name);
	SK_DF(SK_VERB_NETIF, "  UUID:        %s",
	    sk_uuid_unparse(hostna->na_uuid, uuidstr));
	SK_DF(SK_VERB_NETIF, "  nx:          0x%llx (\"%s\":\"%s\")",
	    SK_KVA(hostna->na_nx), NX_DOM(hostna->na_nx)->nxdom_name,
	    NX_DOM_PROV(hostna->na_nx)->nxdom_prov_name);
	SK_DF(SK_VERB_NETIF, "  flags:       0x%b",
	    hostna->na_flags, NAF_BITS);
	SK_DF(SK_VERB_NETIF, "  flowadv_max: %u", hostna->na_flowadv_max);
	SK_DF(SK_VERB_NETIF, "  rings:       tx %u rx %u",
	    na_get_nrings(hostna, NR_TX), na_get_nrings(hostna, NR_RX));
	SK_DF(SK_VERB_NETIF, "  slots:       tx %u rx %u",
	    na_get_nslots(hostna, NR_TX), na_get_nslots(hostna, NR_RX));
#if CONFIG_NEXUS_USER_PIPE
	SK_DF(SK_VERB_NETIF, "  next_pipe:   %u", hostna->na_next_pipe);
	SK_DF(SK_VERB_NETIF, "  max_pipes:   %u", hostna->na_max_pipes);
#endif /* CONFIG_NEXUS_USER_PIPE */
	SK_DF(SK_VERB_NETIF, "  ifp:         0x%llx %s [ioref %u]",
	    SK_KVA(ifp), ifp->if_xname, ifp->if_refio);
#endif /* SK_LOG */

err:
	/* unwind everything constructed above on failure */
	if (retval != 0) {
		if (ifp != NULL) {
			if (!embryonic) {
				/* drop the ifnet_is_attached() I/O refcnt */
				ifnet_decr_iorefcnt(ifp);
			}
			ifp = NULL;
		}
		if (devna != NULL) {
			if (devna->na_arena != NULL) {
				skmem_arena_release(devna->na_arena);
				devna->na_arena = NULL;
			}
			if (devna->na_ifp != NULL) {
				ifnet_decr_iorefcnt(devna->na_ifp);
				devna->na_ifp = NULL;
			}
			devna->na_private = NULL;
		}
		if (hostna != NULL) {
			if (hostna->na_arena != NULL) {
				skmem_arena_release(hostna->na_arena);
				hostna->na_arena = NULL;
			}
			if (hostna->na_ifp != NULL) {
				ifnet_decr_iorefcnt(hostna->na_ifp);
				hostna->na_ifp = NULL;
			}
			hostna->na_private = NULL;
		}
		if (devnifna != NULL) {
			if (devnifna->nifna_netif != NULL) {
				nx_netif_release(devnifna->nifna_netif);
				devnifna->nifna_netif = NULL;
			}
			na_netif_free((struct nexus_adapter *)devnifna);
		}
		if (hostnifna != NULL) {
			if (hostnifna->nifna_netif != NULL) {
				nx_netif_release(hostnifna->nifna_netif);
				hostnifna->nifna_netif = NULL;
			}
			na_netif_free((struct nexus_adapter *)hostnifna);
		}
	}
	return retval;
}
2432 
2433 /*
2434  * Any per-netif state that can be discovered at attach time should be
2435  * initialized here.
2436  */
2437 static void
nx_netif_flags_init(struct nx_netif * nif)2438 nx_netif_flags_init(struct nx_netif *nif)
2439 {
2440 	ifnet_t ifp = nif->nif_ifp;
2441 	struct kern_nexus *nx = nif->nif_nx;
2442 	struct nexus_adapter *devna = nx_port_get_na(nx, NEXUS_PORT_NET_IF_DEV);
2443 
2444 	switch (devna->na_type) {
2445 	case NA_NETIF_DEV:
2446 		if (strcmp(ifp->if_name, sk_ll_prefix) == 0) {
2447 			nif->nif_flags |= NETIF_FLAG_LOW_LATENCY;
2448 			if_set_xflags(ifp, IFXF_LOW_LATENCY);
2449 		}
2450 		break;
2451 	case NA_NETIF_COMPAT_DEV:
2452 		nif->nif_flags |= NETIF_FLAG_COMPAT;
2453 		break;
2454 	default:
2455 		break;
2456 	}
2457 }
2458 
2459 /*
2460  * This is also supposed to check for any inconsistent state at detach time.
2461  */
2462 static void
nx_netif_flags_fini(struct nx_netif * nif)2463 nx_netif_flags_fini(struct nx_netif *nif)
2464 {
2465 	ifnet_t ifp = nif->nif_ifp;
2466 
2467 	if (ifp != NULL) {
2468 		if_clear_xflags(ifp, IFXF_LOW_LATENCY);
2469 	}
2470 	nif->nif_flags = 0;
2471 }
2472 
/*
 * Query the nexus provider for optional capabilities and record the
 * ones we support.  Currently this only covers interface advisory
 * notifications, which are probed when the interface has
 * IFEF_ADV_REPORT set.  Absence of a capability-config callback (or a
 * non-zero return from it) simply leaves the capability disabled.
 */
static void
nx_netif_capabilities_init(struct nx_netif *nif)
{
	struct kern_nexus_capab_interface_advisory kncia;
	struct kern_nexus *nx = nif->nif_nx;
	nxprov_capab_config_fn_t capab_fn;
	uint32_t capab_len;
	int error;

	/*
	 * Newer (netif-versioned) providers supply the callback via the
	 * netif extension and must have it set; otherwise fall back to
	 * the generic provider extension, where it may be absent.
	 */
	if ((NX_PROV(nx)->nxprov_netif_ext.nxnpi_version) ==
	    KERN_NEXUS_PROVIDER_VERSION_NETIF) {
		capab_fn = NX_PROV(nx)->nxprov_netif_ext.nxnpi_config_capab;
		ASSERT(capab_fn != NULL);
	} else {
		capab_fn = NX_PROV(nx)->nxprov_ext.nxpi_config_capab;
	}
	if (capab_fn == NULL) {
		return;
	}
	/* check/configure interface advisory notifications */
	if ((nif->nif_ifp->if_eflags & IFEF_ADV_REPORT) != 0) {
		bzero(&kncia, sizeof(kncia));
		kncia.kncia_version =
		    KERN_NEXUS_CAPAB_INTERFACE_ADVISORY_VERSION_1;
		/* the notify/context fields are const; cast away to fill in */
		*__DECONST(kern_nexus_capab_interface_advisory_notify_fn_t *,
		    &(kncia.kncia_notify)) = nx_netif_interface_advisory_notify;
		*__DECONST(void **, &(kncia.kncia_kern_context)) = nx;
		capab_len = sizeof(kncia);
		error = capab_fn(NX_PROV(nx), nx,
		    KERN_NEXUS_CAPAB_INTERFACE_ADVISORY, &kncia, &capab_len);
		if (error == 0) {
			VERIFY(kncia.kncia_config != NULL);
			VERIFY(kncia.kncia_provider_context != NULL);
			nif->nif_intf_adv_config = kncia.kncia_config;
			nif->nif_intf_adv_prov_ctx =
			    kncia.kncia_provider_context;
		}
	}
}
2512 
2513 static void
nx_netif_capabilities_fini(struct nx_netif * nif)2514 nx_netif_capabilities_fini(struct nx_netif *nif)
2515 {
2516 	nif->nif_intf_adv_config = NULL;
2517 	nif->nif_intf_adv_prov_ctx = NULL;
2518 }
2519 
/*
 * Complete the attach of an embryonic interface: move the ifnet pointer
 * parked in na_private (by nx_netif_attach()) over to na_ifp for both
 * the device and host adapters, taking the required I/O refcounts, and
 * then initialize the remaining per-netif state.  The interface must be
 * fully attached by the time this is called.
 */
void
na_netif_finalize(struct nexus_netif_adapter *nifna, struct ifnet *ifp)
{
	struct nx_netif *nif = nifna->nifna_netif;
	struct kern_nexus *nx = nif->nif_nx;
	struct nexus_adapter *devna = nx_port_get_na(nx, NEXUS_PORT_NET_IF_DEV);
	struct nexus_adapter *hostna = nx_port_get_na(nx,
	    NEXUS_PORT_NET_IF_HOST);

	ASSERT(devna != NULL);
	ASSERT(hostna != NULL);

	/* takes one I/O refcnt on success; failure here is fatal */
	if (!ifnet_is_attached(ifp, 1)) {
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	ASSERT(devna->na_private == ifp);
	ASSERT(devna->na_ifp == NULL);
	/* use I/O refcnt held by ifnet_is_attached() above */
	devna->na_ifp = devna->na_private;
	devna->na_private = NULL;

	ASSERT(hostna->na_private == ifp);
	ASSERT(hostna->na_ifp == NULL);
	/* the host adapter takes its own additional I/O refcnt */
	hostna->na_ifp = hostna->na_private;
	hostna->na_private = NULL;
	ifnet_incr_iorefcnt(hostna->na_ifp);

	/* bring up the rest of the per-netif state */
	nx_netif_flags_init(nif);
	nx_netif_llink_init(nif);
	nx_netif_filter_init(nif);
	nx_netif_flow_init(nif);
	nx_netif_capabilities_init(nif);
	nx_netif_agent_init(nif);
}
2557 
/*
 * Reclaim cached memory from the netif's arena and the classq caches.
 * 'thres' is the drain threshold in seconds of inactivity; 'low' forces
 * a purge regardless of activity (e.g. under memory pressure).  Skips
 * adapters that have never done any work (na_work_ts == 0).
 */
void
nx_netif_reap(struct nexus_netif_adapter *nifna, struct ifnet *ifp,
    uint32_t thres, boolean_t low)
{
#pragma unused(ifp)
	struct nx_netif *nif = nifna->nifna_netif;
	struct kern_nexus *nx = nif->nif_nx;
	struct nexus_adapter *devna = nx_port_get_na(nx, NEXUS_PORT_NET_IF_DEV);
	uint64_t now = _net_uptime;
	boolean_t purge;

	ASSERT(thres != 0);

	/* nothing to reap if the adapter has never been active */
	if (devna->na_work_ts == 0) {
		return;
	}

	/*
	 * Purge if it has been inactive for some time (twice the drain
	 * threshold), and clear the work timestamp to temporarily skip this
	 * adapter until it's active again.  Purging cached objects can be
	 * expensive since we'd need to allocate and construct them again,
	 * so we do it only when necessary.
	 */
	if (low || (now - devna->na_work_ts) >= (thres << 1)) {
		devna->na_work_ts = 0;
		purge = TRUE;
	} else {
		purge = FALSE;
	}

	SK_DF(SK_VERB_NETIF, "%s: %s na %s", ifp->if_xname,
	    (purge ? "purging" : "pruning"), devna->na_name);

	/*
	 * Device and host adapters share the same packet buffer pool,
	 * so just reap the arena belonging to the device instance.
	 */
	skmem_arena_reap(devna->na_arena, purge);

	/*
	 * Reap any caches configured for classq.
	 */
	ifclassq_reap_caches(purge);
}
2603 
2604 void
nx_netif_copy_stats(struct nexus_netif_adapter * nifna,struct if_netif_stats * if_ns)2605 nx_netif_copy_stats(struct nexus_netif_adapter *nifna,
2606     struct if_netif_stats *if_ns)
2607 {
2608 	struct nx_netif_mit *mit;
2609 	struct mit_cfg_tbl *mit_cfg;
2610 
2611 	if ((mit = nifna->nifna_rx_mit) == NULL) {
2612 		return;
2613 	}
2614 
2615 	if ((mit->mit_flags & NETIF_MITF_INITIALIZED) == 0) {
2616 		return;
2617 	}
2618 
2619 	if_ns->ifn_rx_mit_interval = mit->mit_interval;
2620 	if_ns->ifn_rx_mit_mode = mit->mit_mode;
2621 	if_ns->ifn_rx_mit_packets_avg = mit->mit_packets_avg;
2622 	if_ns->ifn_rx_mit_packets_min = mit->mit_packets_min;
2623 	if_ns->ifn_rx_mit_packets_max = mit->mit_packets_max;
2624 	if_ns->ifn_rx_mit_bytes_avg = mit->mit_bytes_avg;
2625 	if_ns->ifn_rx_mit_bytes_min = mit->mit_bytes_min;
2626 	if_ns->ifn_rx_mit_bytes_max = mit->mit_bytes_max;
2627 	if_ns->ifn_rx_mit_cfg_idx = mit->mit_cfg_idx;
2628 
2629 	VERIFY(if_ns->ifn_rx_mit_cfg_idx < mit->mit_cfg_idx_max);
2630 	mit_cfg = &mit->mit_tbl[if_ns->ifn_rx_mit_cfg_idx];
2631 	if_ns->ifn_rx_mit_cfg_packets_lowat = mit_cfg->cfg_plowat;
2632 	if_ns->ifn_rx_mit_cfg_packets_hiwat = mit_cfg->cfg_phiwat;
2633 	if_ns->ifn_rx_mit_cfg_bytes_lowat = mit_cfg->cfg_blowat;
2634 	if_ns->ifn_rx_mit_cfg_bytes_hiwat = mit_cfg->cfg_bhiwat;
2635 	if_ns->ifn_rx_mit_cfg_interval = mit_cfg->cfg_ival;
2636 }
2637 
2638 int
nx_netif_na_special(struct nexus_adapter * na,struct kern_channel * ch,struct chreq * chr,nxspec_cmd_t spec_cmd)2639 nx_netif_na_special(struct nexus_adapter *na, struct kern_channel *ch,
2640     struct chreq *chr, nxspec_cmd_t spec_cmd)
2641 {
2642 	ASSERT(na->na_type == NA_NETIF_DEV ||
2643 	    na->na_type == NA_NETIF_COMPAT_DEV);
2644 	return nx_netif_na_special_common(na, ch, chr, spec_cmd);
2645 }
2646 
/*
 * Shared na_special handler for all netif adapter flavors.  Handles
 * kernel-only channel connect/disconnect (bracketed by the
 * NAF_KERNEL_ONLY and NAF_SPEC_INIT flags) as well as start/stop,
 * which toggle the rings' drop mode.  Must be called with sk_lock
 * held.  Returns 0 on success, EBUSY if already kernel-bound on
 * connect, EINVAL for an unknown command, or an na_bind_channel()
 * error.
 */
int
nx_netif_na_special_common(struct nexus_adapter *na, struct kern_channel *ch,
    struct chreq *chr, nxspec_cmd_t spec_cmd)
{
	int error = 0;

	ASSERT(na->na_type == NA_NETIF_DEV || na->na_type == NA_NETIF_HOST ||
	    na->na_type == NA_NETIF_COMPAT_DEV ||
	    na->na_type == NA_NETIF_COMPAT_HOST);
	SK_LOCK_ASSERT_HELD();

	switch (spec_cmd) {
	case NXSPEC_CMD_CONNECT:
		/*
		 * netif adapter isn't created exclusively for kernel.
		 * We mark (and clear) NAF_KERNEL_ONLY flag upon a successful
		 * na_special() connect and disconnect.
		 */
		if (NA_KERNEL_ONLY(na)) {
			error = EBUSY;
			goto done;
		}
		ASSERT(!(na->na_flags & NAF_SPEC_INIT));

		/* set NAF_KERNEL_ONLY before binding; undone on failure */
		atomic_bitset_32(&na->na_flags, NAF_KERNEL_ONLY);
		error = na_bind_channel(na, ch, chr);
		if (error != 0) {
			atomic_bitclear_32(&na->na_flags, NAF_KERNEL_ONLY);
			goto done;
		}
		atomic_bitset_32(&na->na_flags, NAF_SPEC_INIT);
		break;

	case NXSPEC_CMD_DISCONNECT:
		ASSERT(NA_KERNEL_ONLY(na));
		ASSERT(na->na_channels > 0);
		ASSERT(na->na_flags & NAF_SPEC_INIT);
		na_unbind_channel(ch);
		atomic_bitclear_32(&na->na_flags,
		    (NAF_SPEC_INIT | NAF_KERNEL_ONLY));
		break;

	case NXSPEC_CMD_START:
		/* take the rings out of drop mode */
		na_kr_drop(na, FALSE);
		break;

	case NXSPEC_CMD_STOP:
		/* put the rings in drop mode and advise disconnect */
		na_kr_drop(na, TRUE);
		LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_NOTOWNED);
		lck_mtx_lock(&ch->ch_lock);
		nxprov_advise_disconnect(na->na_nx, ch);
		lck_mtx_unlock(&ch->ch_lock);
		break;

	default:
		error = EINVAL;
		break;
	}

done:
	SK_DF(error ? SK_VERB_ERROR : SK_VERB_NETIF,
	    "ch 0x%llx from na \"%s\" (0x%llx) naflags %b nx 0x%llx "
	    "spec_cmd %u (err %d)", SK_KVA(ch), na->na_name, SK_KVA(na),
	    na->na_flags, NAF_BITS, SK_KVA(ch->ch_nexus), spec_cmd, error);

	return error;
}
2714 
2715 /*
2716  * Get a skywalk netif adapter for the port.
2717  */
2718 int
nx_netif_na_find(struct kern_nexus * nx,struct kern_channel * ch,struct chreq * chr,struct nxbind * nxb,struct proc * p,struct nexus_adapter ** nap,boolean_t create)2719 nx_netif_na_find(struct kern_nexus *nx, struct kern_channel *ch,
2720     struct chreq *chr, struct nxbind *nxb, struct proc *p,
2721     struct nexus_adapter **nap, boolean_t create)
2722 {
2723 #pragma unused(ch)
2724 	struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
2725 	boolean_t anon = NX_ANONYMOUS_PROV(nx);
2726 	ch_endpoint_t ep = chr->cr_endpoint;
2727 	nexus_port_t nx_port = chr->cr_port;
2728 	struct nexus_adapter *na = NULL;
2729 	struct ifnet *ifp;
2730 	int err = 0;
2731 
2732 	SK_LOCK_ASSERT_HELD();
2733 	*nap = NULL; /* default */
2734 
2735 #if SK_LOG
2736 	uuid_string_t uuidstr;
2737 	SK_D("name \"%s\" spec_uuid \"%s\" port %d mode 0x%b pipe_id %u "
2738 	    "ring_id %d ring_set %u ep_type %u:%u create %u%s",
2739 	    chr->cr_name, sk_uuid_unparse(chr->cr_spec_uuid, uuidstr),
2740 	    (int)chr->cr_port, chr->cr_mode, CHMODE_BITS,
2741 	    chr->cr_pipe_id, (int)chr->cr_ring_id, chr->cr_ring_set,
2742 	    chr->cr_real_endpoint, chr->cr_endpoint, create,
2743 	    (ep != CH_ENDPOINT_NET_IF) ? " (skipped)" : "");
2744 #endif /* SK_LOG */
2745 
2746 	if (!create || ep != CH_ENDPOINT_NET_IF) {
2747 		err = ENODEV;
2748 		goto done;
2749 	}
2750 
2751 	ASSERT(NX_DOM(nx)->nxdom_type == NEXUS_TYPE_NET_IF);
2752 	if (nx_port_get_na(nx, NEXUS_PORT_NET_IF_DEV) == NULL) {
2753 		err = ENXIO;
2754 		goto done;
2755 	}
2756 	ifp = nif->nif_ifp;
2757 	if (!(SKYWALK_CAPABLE(ifp))) {
2758 		SK_ERR("interface %s is no longer usable", if_name(ifp));
2759 		err = ENOTSUP;
2760 		goto done;
2761 	}
2762 
2763 	if (chr->cr_mode & CHMODE_LOW_LATENCY) {
2764 		SK_ERR("low latency is not supported for netif channel");
2765 		err = ENOTSUP;
2766 		goto done;
2767 	}
2768 
2769 	switch (nx_port) {
2770 	case NEXUS_PORT_NET_IF_DEV:
2771 		/*
2772 		 * We have to reject direct user open that's not explicitly
2773 		 * allowed because netif nexuses do not by default have
2774 		 * user memory regions.
2775 		 */
2776 		if (p != kernproc &&
2777 		    (!skywalk_netif_direct_allowed(ifp->if_xname) ||
2778 		    (kauth_cred_issuser(kauth_cred_get()) == 0 &&
2779 		    (anon || nif->nif_dev_nxb == NULL || nxb == NULL ||
2780 		    !nxb_is_equal(nif->nif_dev_nxb, nxb))))) {
2781 			DTRACE_SKYWALK2(direct__not__allowed, struct ifnet *,
2782 			    ifp, struct chreq *, chr);
2783 			err = ENOTSUP;
2784 			goto done;
2785 		}
2786 		if (chr->cr_mode & CHMODE_EVENT_RING) {
2787 			SK_ERR("event ring is not supported for netif dev port channel");
2788 			err = ENOTSUP;
2789 			goto done;
2790 		}
2791 		na = nx_port_get_na(nx, NEXUS_PORT_NET_IF_DEV);
2792 		break;
2793 
2794 	case NEXUS_PORT_NET_IF_HOST:
2795 		if (p != kernproc) {
2796 			err = ENOTSUP;
2797 			goto done;
2798 		}
2799 		if (chr->cr_mode & CHMODE_EVENT_RING) {
2800 			SK_ERR("event ring is not supported for netif host port channel");
2801 			err = ENOTSUP;
2802 			goto done;
2803 		}
2804 		na = nx_port_get_na(nx, NEXUS_PORT_NET_IF_HOST);
2805 		break;
2806 
2807 	default:
2808 		ASSERT(!(chr->cr_mode & CHMODE_CONFIG));
2809 
2810 		NETIF_WLOCK(nif);
2811 		err = nx_port_alloc(nx, nx_port, nxb, &na, p);
2812 		if (err != 0) {
2813 			NETIF_WUNLOCK(nif);
2814 			goto done;
2815 		}
2816 
2817 		if (na == NULL) {
2818 			if (chr->cr_mode & CHMODE_FILTER) {
2819 				err = netif_filter_na_create(nx, chr, &na);
2820 			} else {
2821 				err = netif_vp_na_create(nx, chr, &na);
2822 			}
2823 			if (err != 0) {
2824 				NETIF_WUNLOCK(nif);
2825 				goto done;
2826 			}
2827 			err = nx_port_alloc(nx, nx_port, nxb, &na, p);
2828 			if (err != 0) {
2829 				NETIF_WUNLOCK(nif);
2830 				goto done;
2831 			}
2832 		}
2833 		NETIF_WUNLOCK(nif);
2834 
2835 		break;
2836 	}
2837 
2838 	ASSERT(err == 0);
2839 	ASSERT(na != NULL);
2840 
2841 #if CONFIG_NEXUS_USER_PIPE
2842 	if (NA_OWNED_BY_ANY(na) || na->na_next_pipe > 0) {
2843 #else /* !CONFIG_NEXUS_USER_PIPE */
2844 	if (NA_OWNED_BY_ANY(na)) {
2845 #endif /* !CONFIG_NEXUS_USER_PIPE */
2846 		err = EBUSY;
2847 		na = NULL;
2848 		goto done;
2849 	}
2850 
2851 	*nap = na;
2852 	na_retain_locked(na);
2853 
2854 done:
2855 	ASSERT(err != 0 || na != NULL);
2856 	if (err) {
2857 		SK_ERR("na not found, err(%d)", err);
2858 	} else {
2859 		SK_DF(SK_VERB_NETIF, "found na 0x%llu", na);
2860 	}
2861 	return err;
2862 }
2863 
2864 /* na_krings_create callback for all netif device adapters */
2865 int
2866 nx_netif_dev_krings_create(struct nexus_adapter *na, struct kern_channel *ch)
2867 {
2868 	int ret;
2869 
2870 	ASSERT(na->na_type == NA_NETIF_DEV ||
2871 	    na->na_type == NA_NETIF_COMPAT_DEV);
2872 	/*
2873 	 * Allocate context structures for native netif only, for
2874 	 * IOSkywalkFamily to store its object references.
2875 	 */
2876 	ret = na_rings_mem_setup(na, 0, (na->na_flags & NAF_NATIVE), ch);
2877 
2878 	/*
2879 	 * We mark CKRF_DROP for kernel-only rings (kernel channel
2880 	 * opened by the flowswitch, etc.) to prevent packets from
2881 	 * going thru until after the client of the kernel channel
2882 	 * has fully plumbed things on its side.  For userland-facing
2883 	 * rings (regular channel opened to netif), this is not
2884 	 * required, and so don't mark CKRF_DROP there.
2885 	 */
2886 	if (ret == 0 && NA_KERNEL_ONLY(na)) {
2887 		na_kr_drop(na, TRUE);
2888 	}
2889 
2890 	return ret;
2891 }
2892 
2893 /* call with SK_LOCK held */
2894 void
2895 nx_netif_dev_krings_delete(struct nexus_adapter *na, struct kern_channel *ch,
2896     boolean_t defunct)
2897 {
2898 	ASSERT(na->na_type == NA_NETIF_DEV ||
2899 	    na->na_type == NA_NETIF_COMPAT_DEV);
2900 
2901 	/* see comments in nx_netif_dev_krings_create() */
2902 	if (NA_KERNEL_ONLY(na)) {
2903 		na_kr_drop(na, TRUE);
2904 	}
2905 
2906 	na_rings_mem_teardown(na, ch, defunct);
2907 }
2908 
2909 struct nx_netif *
2910 nx_netif_alloc(zalloc_flags_t how)
2911 {
2912 	struct nx_netif *n;
2913 
2914 	SK_LOCK_ASSERT_HELD();
2915 
2916 	n = zalloc_flags(nx_netif_zone, how | Z_ZERO);
2917 	if (n == NULL) {
2918 		return NULL;
2919 	}
2920 
2921 	NETIF_RWINIT(n);
2922 	os_ref_init(&n->nif_refcnt, NULL);
2923 	SK_DF(SK_VERB_MEM, "netif 0x%llx", SK_KVA(n));
2924 
2925 	return n;
2926 }
2927 
2928 static void
2929 nx_netif_destroy(struct nx_netif *n)
2930 {
2931 	ASSERT(n->nif_dev_nxb == NULL);
2932 	ASSERT(n->nif_host_nxb == NULL);
2933 	ASSERT(os_ref_get_count(&n->nif_refcnt) == 0);
2934 	nx_netif_llink_config_free(n);
2935 	SK_DF(SK_VERB_MEM, "netif 0x%llx", SK_KVA(n));
2936 	NETIF_RWDESTROY(n);
2937 	zfree(nx_netif_zone, n);
2938 }
2939 
2940 void
2941 nx_netif_release(struct nx_netif *n)
2942 {
2943 	SK_LOCK_ASSERT_HELD();
2944 
2945 	SK_DF(SK_VERB_MEM, "netif 0x%llx, refcnt %d", SK_KVA(n),
2946 	    os_ref_get_count(&n->nif_refcnt));
2947 	if (os_ref_release(&n->nif_refcnt) == 0) {
2948 		nx_netif_destroy(n);
2949 	}
2950 }
2951 
2952 void
2953 nx_netif_retain(struct nx_netif *n)
2954 {
2955 	SK_LOCK_ASSERT_HELD();
2956 
2957 	/* retaining an object with a zero refcount is not allowed */
2958 	ASSERT(os_ref_get_count(&n->nif_refcnt) >= 1);
2959 	os_ref_retain(&n->nif_refcnt);
2960 	SK_DF(SK_VERB_MEM, "netif 0x%llx, refcnt %d", SK_KVA(n),
2961 	    os_ref_get_count(&n->nif_refcnt));
2962 }
2963 
/*
 * Drop the caller's reference on the netif; equivalent to
 * nx_netif_release(), which frees the object at refcount zero.
 */
void
nx_netif_free(struct nx_netif *n)
{
	nx_netif_release(n);
}
2969 
/*
 * Deliver a channel event for a packet to the VP adapter that owns the
 * packet's outbound flow.  Only supported on low-latency netifs; the
 * event is classified against the netif's flows and forwarded to the
 * matching VP adapter's own na_channel_event_notify().
 *
 * Returns 0 on success or an errno describing why the event was
 * dropped (each drop reason is also counted in nif_stats).
 */
static int
nx_netif_na_channel_event_notify(struct nexus_adapter *na,
    struct __kern_packet *kpkt, struct __kern_channel_event *ev,
    uint16_t ev_len)
{
	int err;
	struct netif_flow *nf;
	struct nexus_adapter *netif_vpna;
	struct nx_netif *nif = NIFNA(na)->nifna_netif;
	struct netif_stats *nifs = &NIFNA(na)->nifna_netif->nif_stats;

	NETIF_RLOCK(nif);
	/* channel events are only delivered on low-latency netifs */
	if (!NETIF_IS_LOW_LATENCY(nif)) {
		err = ENOTSUP;
		goto error;
	}
	if (__improbable(!NA_IS_ACTIVE(na))) {
		STATS_INC(nifs, NETIF_STATS_EV_DROP_NA_INACTIVE);
		err = ENXIO;
		goto error;
	}
	if (__improbable(NA_IS_DEFUNCT(na))) {
		STATS_INC(nifs, NETIF_STATS_EV_DROP_NA_DEFUNCT);
		err = ENXIO;
		goto error;
	}
	/* no VP adapters means there is nobody to deliver the event to */
	if (__improbable(nif->nif_vp_cnt == 0)) {
		STATS_INC(nifs, NETIF_STATS_EV_DROP_NO_VPNA);
		err = ENXIO;
		goto error;
	}
	/* The returned netif flow is refcounted. */
	nf = nx_netif_flow_classify(nif, kpkt, NETIF_FLOW_OUTBOUND);
	if (nf == NULL) {
		SK_ERR("unclassified event (%d) dropped", ev->ev_type);
		STATS_INC(nifs, NETIF_STATS_EV_DROP_DEMUX_ERR);
		err = ENOENT;
		goto error;
	}
	/* the flow's callback argument is the owning VP adapter */
	netif_vpna = (struct nexus_adapter *)nf->nf_cb_arg;
	if (netif_vpna->na_channel_event_notify != NULL) {
		err = netif_vpna->na_channel_event_notify(netif_vpna, kpkt,
		    ev, ev_len);
	} else {
		STATS_INC(nifs, NETIF_STATS_EV_DROP_EV_VPNA_NOTSUP);
		err = ENOTSUP;
	}
	/* release the reference taken by nx_netif_flow_classify() */
	nx_netif_flow_release(nif, nf);
	NETIF_RUNLOCK(nif);
	nf = NULL;
	return err;

error:
	STATS_INC(nifs, NETIF_STATS_EV_DROP);
	NETIF_RUNLOCK(nif);
	return err;
}
3027 
/*
 * Publish an interface advisory report into shared memory and notify
 * interested user channels.
 *
 * If a flowswitch is attached to this netif, the report is written to
 * the flowswitch's nexus advisory region and its channels are
 * notified; otherwise the netif's own advisory region and channels are
 * used.  Always returns 0.
 */
static int
nx_netif_interface_advisory_notify_common(struct kern_nexus *nx,
    const struct ifnet_interface_advisory *advisory)
{
	struct kern_nexus *notify_nx;
	struct __kern_netif_intf_advisory *intf_adv;
	struct nx_netif *nif = NX_NETIF_PRIVATE(nx);

	if (nif->nif_fsw_nxadv != NULL) {
		ASSERT(nif->nif_fsw != NULL);
		intf_adv = &nif->nif_fsw_nxadv->_nxadv_intf_adv;
		notify_nx = nif->nif_fsw->fsw_nx;
	} else {
		intf_adv = &nif->nif_netif_nxadv->__kern_intf_adv;
		notify_nx = nif->nif_nx;
	}
	/*
	 * copy the advisory report in shared memory; the stored checksum
	 * is computed over the copy as it is written
	 */
	intf_adv->cksum = os_cpu_copy_in_cksum(advisory, &intf_adv->adv,
	    sizeof(*advisory), 0);
	STATS_INC(&nif->nif_stats, NETIF_STATS_IF_ADV_UPD_RECV);
	/*
	 * notify user channels on advisory report availability
	 */
	nx_interface_advisory_notify(notify_nx);
	return 0;
}
3056 
3057 int
3058 nx_netif_interface_advisory_report(struct nexus_adapter *devna,
3059     const struct ifnet_interface_advisory *advisory)
3060 {
3061 	ASSERT(devna->na_type == NA_NETIF_DEV);
3062 	if (__improbable(!NA_IS_ACTIVE(devna))) {
3063 		return ENXIO;
3064 	}
3065 	if (__improbable(NA_IS_DEFUNCT(devna))) {
3066 		return ENXIO;
3067 	}
3068 	return nx_netif_interface_advisory_notify_common(devna->na_nx,
3069 	           advisory);
3070 }
3071 
3072 static errno_t
3073 nx_netif_interface_advisory_notify(void *kern_ctx,
3074     const struct ifnet_interface_advisory *advisory)
3075 {
3076 	if (__improbable(kern_ctx == NULL || advisory == NULL ||
3077 	    advisory->version != IF_INTERFACE_ADVISORY_VERSION_CURRENT)) {
3078 		return EINVAL;
3079 	}
3080 	if (__improbable((advisory->direction !=
3081 	    IF_INTERFACE_ADVISORY_DIRECTION_TX) &&
3082 	    (advisory->direction != IF_INTERFACE_ADVISORY_DIRECTION_RX))) {
3083 		return EINVAL;
3084 	}
3085 	return nx_netif_interface_advisory_notify_common(kern_ctx, advisory);
3086 }
3087 
3088 void
3089 nx_netif_config_interface_advisory(struct kern_nexus *nx, bool enable)
3090 {
3091 	struct kern_nexus *nx_netif;
3092 	struct nx_netif *nif;
3093 
3094 	if (NX_REJECT_ACT(nx) || (nx->nx_flags & NXF_CLOSED) != 0) {
3095 		return;
3096 	}
3097 	if (NX_PROV(nx)->nxprov_params->nxp_type == NEXUS_TYPE_FLOW_SWITCH) {
3098 		struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
3099 		nx_netif = fsw->fsw_nifna->na_nx;
3100 	} else {
3101 		nx_netif = nx;
3102 	}
3103 	ASSERT(NX_PROV(nx_netif)->nxprov_params->nxp_type == NEXUS_TYPE_NET_IF);
3104 	nif = NX_NETIF_PRIVATE(nx_netif);
3105 	if (nif->nif_intf_adv_config != NULL) {
3106 		nif->nif_intf_adv_config(nif->nif_intf_adv_prov_ctx, enable);
3107 	}
3108 }
3109 
3110 /*
3111  * This function has no use anymore since we are now passing truncated packets
3112  * to filters. We keep this logic just in case we need to prevent certain
3113  * packets from being passed to filters.
3114  */
3115 static boolean_t
3116 packet_is_filterable(struct nexus_netif_adapter *nifna,
3117     struct __kern_packet *pkt)
3118 {
3119 #pragma unused (nifna, pkt)
3120 	return TRUE;
3121 }
3122 
3123 /*
3124  * This function is only meant for supporting the RX path because the TX path
3125  * will not send packets > MTU size due to the disabling of TSO when filters
3126  * are enabled.
3127  */
3128 static void
3129 get_filterable_packets(struct nexus_netif_adapter *nifna,
3130     struct __kern_packet *pkt_chain, struct __kern_packet **fpkt_chain,
3131     struct __kern_packet **passthrough_chain)
3132 {
3133 	struct nx_netif *nif = nifna->nifna_netif;
3134 	struct netif_stats *nifs = &nif->nif_stats;
3135 	struct __kern_packet *pkt = pkt_chain, *next, *fpkt;
3136 	struct __kern_packet *fpkt_head = NULL, *passthrough_head = NULL;
3137 	struct __kern_packet **fpkt_tailp = &fpkt_head;
3138 	struct __kern_packet **passthrough_tailp = &passthrough_head;
3139 	int fcnt = 0, pcnt = 0, dcnt = 0;
3140 
3141 	while (pkt != NULL) {
3142 		next = pkt->pkt_nextpkt;
3143 		pkt->pkt_nextpkt = NULL;
3144 
3145 		if (!packet_is_filterable(nifna, pkt)) {
3146 			pcnt++;
3147 			*passthrough_tailp = pkt;
3148 			passthrough_tailp = &pkt->pkt_nextpkt;
3149 			pkt = next;
3150 			continue;
3151 		}
3152 		fpkt = nx_netif_pkt_to_filter_pkt(nifna, pkt, NETIF_CONVERT_RX);
3153 		if (fpkt != NULL) {
3154 			fcnt++;
3155 			*fpkt_tailp = fpkt;
3156 			fpkt_tailp = &fpkt->pkt_nextpkt;
3157 		} else {
3158 			dcnt++;
3159 		}
3160 		pkt = next;
3161 	}
3162 	*fpkt_chain = fpkt_head;
3163 	*passthrough_chain = passthrough_head;
3164 
3165 	/*
3166 	 * No need to increment drop stats because that's already
3167 	 * done in nx_netif_pkt_to_filter_pkt.
3168 	 */
3169 	STATS_ADD(nifs, NETIF_STATS_FILTER_RX_NOT_FILTERABLE, pcnt);
3170 	DTRACE_SKYWALK6(filterable, struct nexus_netif_adapter *, nifna,
3171 	    int, fcnt, int, pcnt, int, dcnt, struct __kern_packet *,
3172 	    fpkt_head, struct __kern_packet *, passthrough_head);
3173 }
3174 
3175 /*
3176  * This is only used by ring-based notify functions for now.
3177  * When a qset-based notify becomes available, this function can be used
3178  * unmodified.
3179  */
3180 void
3181 netif_receive(struct nexus_netif_adapter *nifna,
3182     struct __kern_packet *pkt_chain, struct nexus_pkt_stats *stats)
3183 {
3184 	struct nx_netif *nif = nifna->nifna_netif;
3185 	struct nexus_adapter *na = &nifna->nifna_up;
3186 	struct netif_stats *nifs = &nif->nif_stats;
3187 	int err, dropcnt, dropstat = -1;
3188 
3189 	/* update our work timestamp */
3190 	na->na_work_ts = _net_uptime;
3191 
3192 	if (nif->nif_filter_cnt > 0) {
3193 		struct __kern_packet *fpkt_chain = NULL;
3194 		struct __kern_packet *passthrough_chain = NULL;
3195 
3196 		get_filterable_packets(nifna, pkt_chain, &fpkt_chain,
3197 		    &passthrough_chain);
3198 		if (fpkt_chain != NULL) {
3199 			(void) nx_netif_filter_inject(nifna, NULL, fpkt_chain,
3200 			    NETIF_FILTER_RX | NETIF_FILTER_SOURCE);
3201 		}
3202 		if (passthrough_chain != NULL) {
3203 			pkt_chain = passthrough_chain;
3204 		} else {
3205 			return;
3206 		}
3207 	} else if (nx_netif_filter_default_drop != 0) {
3208 		DTRACE_SKYWALK2(rx__default__drop, struct nx_netif *, nif,
3209 		    struct __kern_packet *, pkt_chain);
3210 		dropstat = NETIF_STATS_FILTER_DROP_DEFAULT;
3211 		goto drop;
3212 	}
3213 	if (nif->nif_flow_cnt > 0) {
3214 		struct __kern_packet *remain = NULL;
3215 
3216 		err = nx_netif_demux(nifna, pkt_chain, &remain,
3217 		    NETIF_FLOW_SOURCE);
3218 		if (remain == NULL) {
3219 			return;
3220 		}
3221 		pkt_chain = remain;
3222 	}
3223 	if (na->na_rx != NULL) {
3224 		na->na_rx(na, pkt_chain, stats);
3225 	} else {
3226 		DTRACE_SKYWALK2(no__rx__cb, struct nx_netif *, nif,
3227 		    struct __kern_packet *, pkt_chain);
3228 		dropstat = NETIF_STATS_DROP_NO_RX_CB;
3229 		goto drop;
3230 	}
3231 	return;
3232 drop:
3233 	dropcnt = 0;
3234 	nx_netif_free_packet_chain(pkt_chain, &dropcnt);
3235 	if (dropstat != -1) {
3236 		STATS_ADD(nifs, dropstat, dropcnt);
3237 	}
3238 	STATS_ADD(nifs, NETIF_STATS_DROP, dropcnt);
3239 }
3240 
/*
 * Token-bucket rate limiter for an input ring.  Given the slot range
 * [begin, end), returns a possibly-earlier end index so that only the
 * packets fitting within the configured rate are admitted;
 * *rate_limited is set to TRUE when packets were held back.  A rate of
 * 0 disables limiting.  Tokens are debited as pkt_length * 8, so the
 * bucket presumably counts bits — TODO confirm units against callers.
 */
static slot_idx_t
netif_rate_limit(struct __kern_channel_ring *r, uint64_t rate,
    slot_idx_t begin, slot_idx_t end, boolean_t *rate_limited)
{
	uint64_t elapsed;
	uint64_t now;
	struct __kern_packet *pkt;
	clock_sec_t sec;
	clock_usec_t usec;
	slot_idx_t i;

	if (__probable(rate == 0)) {
		return end;
	}

	/* init tbr if not so */
	if (__improbable(r->ckr_tbr_token == CKR_TBR_TOKEN_INVALID)) {
		r->ckr_tbr_token = rate;
		r->ckr_tbr_depth = rate;
		r->ckr_tbr_last = mach_absolute_time();
	} else {
		/* refill tokens for the elapsed time, capped at depth */
		now = mach_absolute_time();
		elapsed = now - r->ckr_tbr_last;
		absolutetime_to_microtime(elapsed, &sec, &usec);
		r->ckr_tbr_token +=
		    ((sec * USEC_PER_SEC + usec) * rate / USEC_PER_SEC);
		if (__improbable(r->ckr_tbr_token > r->ckr_tbr_depth)) {
			r->ckr_tbr_token = r->ckr_tbr_depth;
		}
		r->ckr_tbr_last = now;
	}

	*rate_limited = FALSE;
	for (i = begin; i != end; i = SLOT_NEXT(i, r->ckr_lim)) {
		pkt = KR_KSD(r, i)->sd_pkt;
		if (__improbable(pkt == NULL)) {
			continue;
		}
		/* out of tokens: stop admitting packets at this slot */
		if (__improbable(r->ckr_tbr_token <= 0)) {
			end = i;
			*rate_limited = TRUE;
			break;
		}
		r->ckr_tbr_token -= pkt->pkt_length * 8;
	}

	/* NOTE(review): this is logged even when no limiting occurred */
	SK_DF(SK_VERB_FSW | SK_VERB_RX, "ckr %p %s rate limited at %d",
	    r, r->ckr_name, i);

	return end;
}
3292 
3293 SK_NO_INLINE_ATTRIBUTE
3294 static struct __kern_packet *
3295 consume_pkts(struct __kern_channel_ring *ring, slot_idx_t end)
3296 {
3297 	struct __kern_packet *pkt_chain = NULL, **tailp = &pkt_chain;
3298 	slot_idx_t idx = ring->ckr_rhead;
3299 
3300 	while (idx != end) {
3301 		struct __kern_slot_desc *ksd = KR_KSD(ring, idx);
3302 		struct __kern_packet *pkt = ksd->sd_pkt;
3303 
3304 		ASSERT(pkt->pkt_nextpkt == NULL);
3305 		KR_SLOT_DETACH_METADATA(ring, ksd);
3306 		*tailp = pkt;
3307 		tailp = &pkt->pkt_nextpkt;
3308 		idx = SLOT_NEXT(idx, ring->ckr_lim);
3309 	}
3310 	ring->ckr_rhead = end;
3311 	ring->ckr_rtail = ring->ckr_ktail;
3312 	return pkt_chain;
3313 }
3314 
3315 int
3316 netif_rx_notify_default(struct __kern_channel_ring *ring, struct proc *p,
3317     uint32_t flags)
3318 {
3319 	struct nexus_adapter *hwna;
3320 	struct nexus_netif_adapter *nifna;
3321 	struct nx_netif *nif;
3322 	struct __kern_packet *pkt_chain;
3323 	struct nexus_pkt_stats stats;
3324 	sk_protect_t protect;
3325 	slot_idx_t ktail;
3326 	int err = 0;
3327 
3328 	KDBG((SK_KTRACE_NETIF_RX_NOTIFY_DEFAULT | DBG_FUNC_START),
3329 	    SK_KVA(ring));
3330 
3331 	ASSERT(ring->ckr_tx == NR_RX);
3332 	ASSERT(!NA_KERNEL_ONLY(KRNA(ring)) || KR_KERNEL_ONLY(ring));
3333 
3334 	err = kr_enter(ring, ((flags & NA_NOTEF_CAN_SLEEP) != 0));
3335 	if (err != 0) {
3336 		/* not a serious error, so no need to be chatty here */
3337 		SK_DF(SK_VERB_FSW,
3338 		    "hwna \"%s\" (0x%llx) kr \"%s\" (0x%llx) krflags 0x%b "
3339 		    "(%d)", KRNA(ring)->na_name, SK_KVA(KRNA(ring)),
3340 		    ring->ckr_name, SK_KVA(ring), ring->ckr_flags,
3341 		    CKRF_BITS, err);
3342 		goto out;
3343 	}
3344 	if (__improbable(KR_DROP(ring))) {
3345 		kr_exit(ring);
3346 		err = ENODEV;
3347 		goto out;
3348 	}
3349 	hwna = KRNA(ring);
3350 	nifna = NIFNA(hwna);
3351 	nif = nifna->nifna_netif;
3352 	if (__improbable(hwna->na_ifp == NULL)) {
3353 		kr_exit(ring);
3354 		err = ENODEV;
3355 		goto out;
3356 	}
3357 	protect = sk_sync_protect();
3358 	err = ring->ckr_na_sync(ring, p, 0);
3359 	if (err != 0 && err != EAGAIN) {
3360 		goto put_out;
3361 	}
3362 
3363 	/* read the tail pointer once */
3364 	ktail = ring->ckr_ktail;
3365 	if (__improbable(ring->ckr_khead == ktail)) {
3366 		SK_DF(SK_VERB_FSW | SK_VERB_NOTIFY | SK_VERB_RX,
3367 		    "how strange, interrupt with no packets on hwna "
3368 		    "\"%s\" (0x%llx)", KRNA(ring)->na_name, SK_KVA(KRNA(ring)));
3369 		goto put_out;
3370 	}
3371 	ktail = netif_rate_limit(ring, nif->nif_input_rate, ring->ckr_rhead,
3372 	    ktail, &ring->ckr_rate_limited);
3373 
3374 	pkt_chain = consume_pkts(ring, ktail);
3375 	if (pkt_chain != NULL) {
3376 		netif_receive(nifna, pkt_chain, &stats);
3377 
3378 		if (ring->ckr_netif_mit_stats != NULL &&
3379 		    stats.nps_pkts != 0 && stats.nps_bytes != 0) {
3380 			ring->ckr_netif_mit_stats(ring, stats.nps_pkts,
3381 			    stats.nps_bytes);
3382 		}
3383 	}
3384 
3385 put_out:
3386 	sk_sync_unprotect(protect);
3387 	kr_exit(ring);
3388 
3389 out:
3390 	KDBG((SK_KTRACE_NETIF_RX_NOTIFY_DEFAULT | DBG_FUNC_END),
3391 	    SK_KVA(ring), err);
3392 	return err;
3393 }
3394 
/*
 * Fast RX notify callback: instead of syncing the dev ring, pulls the
 * ready packet chains directly from the driver via nx_rx_sync_packets()
 * and feeds each chain to netif_receive().
 */
int
netif_rx_notify_fast(struct __kern_channel_ring *ring, struct proc *p,
    uint32_t flags)
{
#pragma unused(p, flags)
	sk_protect_t protect;
	struct nexus_adapter *hwna;
	struct nexus_pkt_stats stats = {};
	uint32_t i, count;
	int err = 0;

	KDBG((SK_KTRACE_NETIF_RX_NOTIFY_FAST | DBG_FUNC_START),
	    SK_KVA(ring));

	/* XXX
	 * sk_sync_protect() is not needed for this case because
	 * we are not using the dev ring. Unfortunately lots of
	 * macros used by fsw still require this.
	 */
	protect = sk_sync_protect();
	hwna = KRNA(ring);
	/* bound the number of returned chains by the RX slot count */
	count = na_get_nslots(hwna, NR_RX);
	err = nx_rx_sync_packets(ring, ring->ckr_scratch, &count);
	if (__improbable(err != 0)) {
		SK_ERR("nx_rx_sync_packets failed: %d", err);
		DTRACE_SKYWALK2(rx__sync__packets__failed,
		    struct __kern_channel_ring *, ring, int, err);
		goto out;
	}
	DTRACE_SKYWALK1(chain__count, uint32_t, count);
	for (i = 0; i < count; i++) {
		struct __kern_packet *pkt_chain;

		/* each scratch entry encodes a packet-chain pointer */
		pkt_chain = SK_PTR_ADDR_KPKT(ring->ckr_scratch[i]);
		ASSERT(pkt_chain != NULL);
		netif_receive(NIFNA(KRNA(ring)), pkt_chain, &stats);

		/* report per-chain packet/byte counts for RX mitigation */
		if (ring->ckr_netif_mit_stats != NULL &&
		    stats.nps_pkts != 0 && stats.nps_bytes != 0) {
			ring->ckr_netif_mit_stats(ring, stats.nps_pkts,
			    stats.nps_bytes);
		}
	}
out:
	sk_sync_unprotect(protect);
	KDBG((SK_KTRACE_NETIF_RX_NOTIFY_FAST | DBG_FUNC_END),
	    SK_KVA(ring), err);
	return err;
}
3444 
3445 
3446 /*
3447  * Configure the NA to operate in a particular mode.
3448  */
3449 static channel_ring_notify_t
3450 netif_hwna_get_notify(struct __kern_channel_ring *ring, netif_mode_t mode)
3451 {
3452 	channel_ring_notify_t notify = NULL;
3453 	boolean_t has_sync_pkts = (sk_rx_sync_packets != 0 &&
3454 	    nx_has_rx_sync_packets(ring));
3455 
3456 	if (mode == NETIF_MODE_FSW) {
3457 		notify = (has_sync_pkts ? netif_rx_notify_fast :
3458 		    netif_rx_notify_default);
3459 	} else if (mode == NETIF_MODE_LLW) {
3460 		notify = (has_sync_pkts ? netif_llw_rx_notify_fast :
3461 		    netif_llw_rx_notify_default);
3462 	}
3463 	return notify;
3464 }
3465 
3466 
3467 static uint32_t
3468 netif_mode_to_flag(netif_mode_t mode)
3469 {
3470 	uint32_t flag = 0;
3471 
3472 	if (mode == NETIF_MODE_FSW) {
3473 		flag = NAF_MODE_FSW;
3474 	} else if (mode == NETIF_MODE_LLW) {
3475 		flag = NAF_MODE_LLW;
3476 	}
3477 	return flag;
3478 }
3479 
/*
 * Install (set == TRUE) or restore (set == FALSE) the per-ring netif
 * notify callbacks and the adapter-level RX callback / mode flag for
 * the given mode.
 */
static void
netif_hwna_config_mode(struct nexus_adapter *hwna, netif_mode_t mode,
    void (*rx)(struct nexus_adapter *, struct __kern_packet *,
    struct nexus_pkt_stats *), boolean_t set)
{
	uint32_t i;
	uint32_t flag;

	ASSERT(hwna->na_type == NA_NETIF_DEV ||
	    hwna->na_type == NA_NETIF_COMPAT_DEV);

	for (i = 0; i < na_get_nrings(hwna, NR_RX); i++) {
		struct __kern_channel_ring *kr = &NAKR(hwna, NR_RX)[i];
		channel_ring_notify_t notify = netif_hwna_get_notify(kr, mode);

		if (set) {
			/* stash the current notify so it can be restored */
			kr->ckr_save_notify = kr->ckr_netif_notify;
			kr->ckr_netif_notify = notify;
		} else {
			kr->ckr_netif_notify = kr->ckr_save_notify;
			kr->ckr_save_notify = NULL;
		}
	}
	if (set) {
		hwna->na_rx = rx;
		flag = netif_mode_to_flag(mode);
		atomic_bitset_32(&hwna->na_flags, flag);
	} else {
		/* clear both possible mode flags */
		hwna->na_rx = NULL;
		atomic_bitclear_32(&hwna->na_flags,
		    (NAF_MODE_FSW | NAF_MODE_LLW));
	}
}
3513 
3514 void
3515 netif_hwna_set_mode(struct nexus_adapter *hwna, netif_mode_t mode,
3516     void (*rx)(struct nexus_adapter *, struct __kern_packet *,
3517     struct nexus_pkt_stats *))
3518 {
3519 	return netif_hwna_config_mode(hwna, mode, rx, TRUE);
3520 }
3521 
3522 void
3523 netif_hwna_clear_mode(struct nexus_adapter *hwna)
3524 {
3525 	return netif_hwna_config_mode(hwna, NETIF_MODE_NONE, NULL, FALSE);
3526 }
3527 
/*
 * Re-inject a packet chain that has already traversed the filters back
 * into the regular RX path of an adapter owned by a flowswitch.  Drops
 * the chain (with stats accounting) if the flowswitch is detached or
 * the ring is in drop mode.
 */
static void
netif_inject_rx(struct nexus_adapter *na, struct __kern_packet *pkt_chain)
{
	struct nexus_netif_adapter *nifna = NIFNA(na);
	struct nx_netif *nif = nifna->nifna_netif;
	struct netif_stats *nifs = &nif->nif_stats;
	struct __kern_channel_ring *r;
	struct nexus_pkt_stats stats;
	sk_protect_t protect;
	boolean_t ring_drop = FALSE;
	int err, dropcnt;

	if (!NA_OWNED_BY_FSW(na)) {
		DTRACE_SKYWALK1(fsw__disabled, struct nexus_adapter *, na);
		goto fail;
	}
	ASSERT(na->na_rx != NULL);

	/*
	 * XXX
	 * This function is called when a filter injects a packet back to the
	 * regular RX path. We can assume the ring is 0 for now because RSS
	 * is not supported. This needs to be revisited when we add support for
	 * RSS.
	 */
	r = &na->na_rx_rings[0];
	ASSERT(r->ckr_tx == NR_RX);
	/* sleepable kr_enter; expected to always succeed */
	err = kr_enter(r, TRUE);
	VERIFY(err == 0);

	if (__improbable(KR_DROP(r))) {
		kr_exit(r);
		DTRACE_SKYWALK2(ring__drop, struct nexus_adapter *, na,
		    struct __kern_channel_ring *, r);
		ring_drop = TRUE;
		goto fail;
	}
	protect = sk_sync_protect();
	/* NOTE(review): assumes na_rx always fills stats — confirm */
	na->na_rx(na, pkt_chain, &stats);

	/* report packet/byte counts for RX mitigation */
	if (r->ckr_netif_mit_stats != NULL &&
	    stats.nps_pkts != 0 && stats.nps_bytes != 0) {
		r->ckr_netif_mit_stats(r, stats.nps_pkts, stats.nps_bytes);
	}
	sk_sync_unprotect(protect);

	kr_exit(r);
	return;

fail:
	/* free the chain and account for the drops */
	dropcnt = 0;
	nx_netif_free_packet_chain(pkt_chain, &dropcnt);
	if (ring_drop) {
		STATS_ADD(nifs, NETIF_STATS_DROP_KRDROP_MODE, dropcnt);
	}
	STATS_ADD(nifs, NETIF_STATS_DROP, dropcnt);
}
3585 
3586 /*
3587  * This is called when an inbound packet has traversed all filters.
3588  */
3589 errno_t
3590 nx_netif_filter_rx_cb(struct nexus_netif_adapter *nifna,
3591     struct __kern_packet *fpkt_chain, uint32_t flags)
3592 {
3593 #pragma unused (flags)
3594 	struct nx_netif *nif = nifna->nifna_netif;
3595 	struct netif_stats *nifs = &nif->nif_stats;
3596 	struct nexus_adapter *na = &nifna->nifna_up;
3597 	struct __kern_packet *pkt_chain;
3598 	int err;
3599 
3600 	pkt_chain = nx_netif_filter_pkt_to_pkt_chain(nifna,
3601 	    fpkt_chain, NETIF_CONVERT_RX);
3602 	if (pkt_chain == NULL) {
3603 		return ENOMEM;
3604 	}
3605 	if (nif->nif_flow_cnt > 0) {
3606 		struct __kern_packet *remain = NULL;
3607 
3608 		err = nx_netif_demux(nifna, pkt_chain, &remain,
3609 		    NETIF_FLOW_INJECT);
3610 		if (remain == NULL) {
3611 			return err;
3612 		}
3613 		pkt_chain = remain;
3614 	}
3615 	if (na->na_rx != NULL) {
3616 		netif_inject_rx(na, pkt_chain);
3617 	} else {
3618 		int dropcnt = 0;
3619 		nx_netif_free_packet_chain(pkt_chain, &dropcnt);
3620 		STATS_ADD(nifs,
3621 		    NETIF_STATS_FILTER_DROP_NO_RX_CB, dropcnt);
3622 		STATS_ADD(nifs, NETIF_STATS_DROP, dropcnt);
3623 	}
3624 	return 0;
3625 }
3626 
3627 /*
3628  * This is called when an outbound packet has traversed all filters.
3629  */
3630 errno_t
3631 nx_netif_filter_tx_cb(struct nexus_netif_adapter *nifna,
3632     struct __kern_packet *fpkt_chain, uint32_t flags)
3633 {
3634 #pragma unused (flags)
3635 	struct nx_netif *nif = nifna->nifna_netif;
3636 	struct nexus_adapter *na = &nifna->nifna_up;
3637 	int err;
3638 
3639 	if (NETIF_IS_COMPAT(nif)) {
3640 		struct mbuf *m_chain;
3641 		mbuf_svc_class_t sc;
3642 
3643 		m_chain = nx_netif_filter_pkt_to_mbuf_chain(nifna,
3644 		    fpkt_chain, NETIF_CONVERT_TX);
3645 		if (m_chain == NULL) {
3646 			return ENOMEM;
3647 		}
3648 		/*
3649 		 * All packets in the chain have the same service class.
3650 		 * If the sc is missing or invalid, a valid value will be
3651 		 * returned.
3652 		 */
3653 		sc = mbuf_get_service_class(m_chain);
3654 		err = nx_netif_filter_tx_processed_mbuf_enqueue(nifna,
3655 		    sc, m_chain);
3656 	} else {
3657 		struct __kern_packet *pkt_chain;
3658 		kern_packet_svc_class_t sc;
3659 
3660 		pkt_chain = nx_netif_filter_pkt_to_pkt_chain(nifna,
3661 		    fpkt_chain, NETIF_CONVERT_TX);
3662 		if (pkt_chain == NULL) {
3663 			return ENOMEM;
3664 		}
3665 		/*
3666 		 * All packets in the chain have the same service class.
3667 		 * If the sc is missing or invalid, a valid value will be
3668 		 * returned.
3669 		 */
3670 		sc = kern_packet_get_service_class(SK_PKT2PH(pkt_chain));
3671 		err = nx_netif_filter_tx_processed_pkt_enqueue(nifna,
3672 		    sc, pkt_chain);
3673 	}
3674 	/* Tell driver to resume dequeuing */
3675 	ifnet_start(na->na_ifp);
3676 	return err;
3677 }
3678 
/*
 * Hook for adjusting skmem region parameters for VP adapters.
 * Intentionally a no-op at present.
 */
void
nx_netif_vp_region_params_adjust(struct nexus_adapter *na,
    struct skmem_region_params *srp)
{
#pragma unused(na, srp)
}
3686 
/* returns true, if starter thread is utilized */
static bool
netif_use_starter_thread(struct ifnet *ifp, uint32_t flags)
{
#if (DEVELOPMENT || DEBUG)
	/* test knob: force every transmit through the starter thread */
	if (__improbable(nx_netif_force_ifnet_start != 0)) {
		ifnet_start(ifp);
		return true;
	}
#endif /* DEVELOPMENT || DEBUG */
	/*
	 * use starter thread in following conditions:
	 * - interface is not skywalk native
	 * - interface attached to virtual driver (ipsec, utun)
	 * - TBR is enabled
	 * - delayed start mechanism is in use
	 * - remaining stack space on the thread is not enough for driver
	 * - caller is in rx workloop context
	 * - caller is from the flowswitch path doing ARP resolving
	 * - caller requires the use of starter thread (stack usage)
	 */
	if (!SKYWALK_NATIVE(ifp) || NA(ifp) == NULL ||
	    !NA_IS_ACTIVE(&NA(ifp)->nifna_up) ||
	    ((NA(ifp)->nifna_up.na_flags & NAF_VIRTUAL_DEVICE) != 0) ||
	    IFCQ_TBR_IS_ENABLED(ifp->if_snd) ||
	    (ifp->if_eflags & IFEF_ENQUEUE_MULTI) ||
	    sk_is_rx_notify_protected() ||
	    sk_is_async_transmit_protected() ||
	    (sk_is_sync_protected() && (flags & NETIF_XMIT_FLAG_HOST) != 0)) {
		DTRACE_SKYWALK2(use__starter__thread, struct ifnet *, ifp,
		    uint32_t, flags);
		ifnet_start(ifp);
		return true;
	}
	lck_mtx_lock_spin(&ifp->if_start_lock);
	/* interface is flow controlled; do not issue a doorbell */
	if (__improbable(ifp->if_start_flags & IFSF_FLOW_CONTROLLED)) {
		lck_mtx_unlock(&ifp->if_start_lock);
		return true;
	}
	/* if starter thread is active, utilize it */
	if (ifp->if_start_active) {
		ifp->if_start_req++;
		lck_mtx_unlock(&ifp->if_start_lock);
		return true;
	}
	lck_mtx_unlock(&ifp->if_start_lock);
	/* Check remaining stack space */
	if ((OSKernelStackRemaining() < NX_NETIF_MIN_DRIVER_STACK_SIZE)) {
		ifnet_start(ifp);
		return true;
	}
	/* caller may ring the driver doorbell directly */
	return false;
}
3741 
3742 void
3743 netif_transmit(struct ifnet *ifp, uint32_t flags)
3744 {
3745 	if (netif_use_starter_thread(ifp, flags)) {
3746 		return;
3747 	}
3748 	/*
3749 	 * If no longer attached, don't issue doorbell as ifp
3750 	 * is being destroyed; else hold an IO refcnt to
3751 	 * prevent the interface from being detached.
3752 	 */
3753 	if (!ifnet_datamov_begin(ifp)) {
3754 		return;
3755 	}
3756 	nx_netif_doorbell_internal(ifp, flags);
3757 	/*
3758 	 * Release the IO refcnt taken above.
3759 	 */
3760 	ifnet_datamov_end(ifp);
3761 }
3762 
3763 static struct ifclassq *
3764 netif_get_default_ifcq(struct nexus_adapter *hwna)
3765 {
3766 	struct nx_netif *nif;
3767 	struct ifclassq *ifcq;
3768 
3769 	nif = NX_NETIF_PRIVATE(hwna->na_nx);
3770 	if (NETIF_LLINK_ENABLED(nif)) {
3771 		struct netif_qset *qset;
3772 
3773 		/*
3774 		 * Use the default ifcq for now.
3775 		 * In the future this could be chosen by the caller.
3776 		 */
3777 		qset = nx_netif_get_default_qset_noref(nif);
3778 		ASSERT(qset != NULL);
3779 		ifcq = qset->nqs_ifcq;
3780 	} else {
3781 		ifcq = nif->nif_ifp->if_snd;
3782 	}
3783 	return ifcq;
3784 }
3785 
3786 static errno_t
3787 netif_deq_packets(struct nexus_adapter *hwna, struct ifclassq *ifcq,
3788     uint32_t pkt_limit, uint32_t byte_limit, struct __kern_packet **head,
3789     boolean_t *pkts_pending, kern_packet_svc_class_t sc)
3790 {
3791 	classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
3792 	struct ifnet *ifp = hwna->na_ifp;
3793 	errno_t rc;
3794 
3795 	ASSERT(ifp != NULL);
3796 	ASSERT(ifp->if_output_sched_model < IFNET_SCHED_MODEL_MAX);
3797 	ASSERT((pkt_limit != 0) && (byte_limit != 0));
3798 
3799 	if (ifcq == NULL) {
3800 		ifcq = netif_get_default_ifcq(hwna);
3801 	}
3802 	if (ifp->if_output_sched_model == IFNET_SCHED_MODEL_DRIVER_MANAGED) {
3803 		rc = ifclassq_dequeue_sc(ifcq, (mbuf_svc_class_t)sc,
3804 		    pkt_limit, byte_limit, &pkt_head, NULL, NULL, NULL);
3805 	} else {
3806 		rc = ifclassq_dequeue(ifcq, pkt_limit, byte_limit,
3807 		    &pkt_head, NULL, NULL, NULL);
3808 	}
3809 	ASSERT((rc == 0) || (rc == EAGAIN));
3810 	ASSERT((pkt_head.cp_ptype == QP_PACKET) || (pkt_head.cp_kpkt == NULL));
3811 
3812 	if (IFCQ_LEN(ifcq) != 0) {
3813 		*pkts_pending = TRUE;
3814 	} else {
3815 		*pkts_pending = FALSE;
3816 	}
3817 
3818 	*head = pkt_head.cp_kpkt;
3819 	return rc;
3820 }
3821 
#if SK_LOG
/* Hoisted out of line to reduce kernel stack footprint */
SK_LOG_ATTRIBUTE
static void
netif_no_ring_space_log(const struct nexus_adapter *na,
    const kern_channel_ring_t ring)
{
	/*
	 * The ring-state group "(kh kt kl | rh rt)" previously appeared
	 * twice in the format string while only one set of ring values
	 * was passed, so the second group consumed nonexistent varargs
	 * (undefined behavior).  Exactly one conversion per argument:
	 * name, ring id, ring name, khead, ktail, klease, rhead, rtail.
	 */
	SK_DF(SK_VERB_SYNC | SK_VERB_TX,
	    "no ring space: na \"%s\" [%u] "
	    "\"%s\"(kh %u kt %u kl %u | rh %u rt %u)",
	    na->na_name, ring->ckr_ring_id,
	    ring->ckr_name, ring->ckr_khead,
	    ring->ckr_ktail, ring->ckr_klease,
	    ring->ckr_rhead, ring->ckr_rtail);
}
#endif /* SK_LOG */
3839 
3840 /*
3841  * netif refill function for rings
3842  */
3843 errno_t
3844 netif_ring_tx_refill(const kern_channel_ring_t ring, uint32_t pkt_limit,
3845     uint32_t byte_limit, boolean_t tx_doorbell_ctxt, boolean_t *pkts_pending,
3846     boolean_t canblock)
3847 {
3848 	struct nexus_adapter *hwna;
3849 	struct ifnet *ifp;
3850 	struct __kern_packet *head = NULL;
3851 	sk_protect_t protect;
3852 	errno_t rc = 0;
3853 	errno_t sync_err = 0;
3854 	uint32_t npkts = 0, consumed = 0;
3855 	uint32_t flags;
3856 	slot_idx_t idx, ktail;
3857 	int ring_space = 0;
3858 
3859 	KDBG((SK_KTRACE_NETIF_RING_TX_REFILL | DBG_FUNC_START), SK_KVA(ring));
3860 
3861 	VERIFY(ring != NULL);
3862 	hwna = KRNA(ring);
3863 	ifp = hwna->na_ifp;
3864 
3865 	ASSERT(hwna->na_type == NA_NETIF_DEV);
3866 	ASSERT(ring->ckr_tx == NR_TX);
3867 	*pkts_pending = FALSE;
3868 
3869 	if (__improbable(pkt_limit == 0 || byte_limit == 0)) {
3870 		SK_ERR("invalid limits plim %d, blim %d",
3871 		    pkt_limit, byte_limit);
3872 		rc = EINVAL;
3873 		goto out;
3874 	}
3875 
3876 	if (__improbable(!IF_FULLY_ATTACHED(ifp))) {
3877 		SK_ERR("hwna 0x%llx ifp %s (0x%llx), interface not attached",
3878 		    SK_KVA(hwna), if_name(ifp), SK_KVA(ifp));
3879 		rc = ENXIO;
3880 		goto out;
3881 	}
3882 
3883 	if (__improbable((ifp->if_start_flags & IFSF_FLOW_CONTROLLED) != 0)) {
3884 		SK_DF(SK_VERB_SYNC | SK_VERB_TX, "hwna 0x%llx ifp %s (0x%llx), "
3885 		    "flow control ON", SK_KVA(hwna), if_name(ifp), SK_KVA(ifp));
3886 		rc = ENXIO;
3887 		goto out;
3888 	}
3889 
3890 	/*
3891 	 * if the ring is busy, it means another dequeue is in
3892 	 * progress, so ignore this request and return success.
3893 	 */
3894 	if (kr_enter(ring, canblock) != 0) {
3895 		rc = 0;
3896 		goto out;
3897 	}
3898 	/* mark thread with sync-in-progress flag */
3899 	protect = sk_sync_protect();
3900 
3901 	if (__improbable(KR_DROP(ring) ||
3902 	    !NA_IS_ACTIVE(ring->ckr_na))) {
3903 		SK_ERR("hw-kr 0x%llx stopped", SK_KVA(ring));
3904 		rc = ENXIO;
3905 		goto done;
3906 	}
3907 
3908 	idx = ring->ckr_rhead;
3909 	ktail = ring->ckr_ktail;
3910 	/* calculate available space on tx ring */
3911 	ring_space = ktail - idx;
3912 	if (ring_space < 0) {
3913 		ring_space += ring->ckr_num_slots;
3914 	}
3915 	if (ring_space == 0) {
3916 		struct ifclassq *ifcq;
3917 
3918 		/* no space in ring, driver should retry */
3919 #if SK_LOG
3920 		if (__improbable((sk_verbose &
3921 		    (SK_VERB_SYNC | SK_VERB_TX)) != 0)) {
3922 			netif_no_ring_space_log(hwna, ring);
3923 		}
3924 #endif /* SK_LOG */
3925 		ifcq = netif_get_default_ifcq(hwna);
3926 		if (IFCQ_LEN(ifcq) != 0) {
3927 			*pkts_pending = TRUE;
3928 		}
3929 		/*
3930 		 * We ran out of space in ring, most probably
3931 		 * because the driver is slow to drain its TX queue.
3932 		 * We want another doorbell to be generated as soon
3933 		 * as the TX notify completion happens; mark this
3934 		 * through ckr_pending_doorbell counter.  Do this
3935 		 * regardless of whether there's any pending packet.
3936 		 */
3937 		ring->ckr_pending_doorbell++;
3938 		rc = EAGAIN;
3939 		goto sync_ring;
3940 	}
3941 
3942 	if ((uint32_t)ring_space < pkt_limit) {
3943 		pkt_limit = ring_space;
3944 	}
3945 
3946 	if (tx_doorbell_ctxt &&
3947 	    ((hwna->na_flags & NAF_VIRTUAL_DEVICE) == 0)) {
3948 		pkt_limit = MIN(pkt_limit,
3949 		    nx_netif_doorbell_max_dequeue);
3950 	}
3951 
3952 	rc = netif_deq_packets(hwna, NULL, pkt_limit, byte_limit,
3953 	    &head, pkts_pending, ring->ckr_svc);
3954 
3955 	/*
3956 	 * There's room in ring; if we haven't dequeued everything,
3957 	 * mark ckr_pending_doorbell for the next TX notify to issue
3958 	 * a TX door bell; otherwise, clear it.  The next packet that
3959 	 * gets enqueued will trigger a door bell again.
3960 	 */
3961 	if (*pkts_pending) {
3962 		ring->ckr_pending_doorbell++;
3963 	} else if (ring->ckr_pending_doorbell != 0) {
3964 		ring->ckr_pending_doorbell = 0;
3965 	}
3966 
3967 	if (rc != 0) {
3968 		/*
3969 		 * This is expected sometimes as the IOSkywalkFamily
3970 		 * errs on the side of caution to perform an extra
3971 		 * dequeue when multiple doorbells are pending;
3972 		 * nothing to dequeue, do a sync if there are slots
3973 		 * to reclaim else just return.
3974 		 */
3975 		SK_DF(SK_VERB_SYNC | SK_VERB_TX,
3976 		    "nothing to dequeue, err %d", rc);
3977 
3978 		if ((uint32_t)ring_space == ring->ckr_lim) {
3979 			goto done;
3980 		} else {
3981 			goto sync_ring;
3982 		}
3983 	}
3984 	/* move the dequeued packets to tx ring */
3985 	while (head != NULL && idx != ktail) {
3986 		ASSERT(npkts <= pkt_limit);
3987 		struct __kern_packet *pkt = head;
3988 		KR_SLOT_ATTACH_METADATA(ring, KR_KSD(ring, idx),
3989 		    (struct __kern_quantum *)pkt);
3990 		npkts++;
3991 		if (__improbable(pkt->pkt_trace_id != 0)) {
3992 			KDBG(SK_KTRACE_PKT_TX_AQM | DBG_FUNC_END, pkt->pkt_trace_id);
3993 			KDBG(SK_KTRACE_PKT_TX_DRV | DBG_FUNC_START, pkt->pkt_trace_id);
3994 		}
3995 		idx = SLOT_NEXT(idx, ring->ckr_lim);
3996 		head = pkt->pkt_nextpkt;
3997 		pkt->pkt_nextpkt = NULL;
3998 	}
3999 
4000 	/*
4001 	 * We checked for ring space earlier so the ring should have enough
4002 	 * space for the entire chain.
4003 	 */
4004 	ASSERT(head == NULL);
4005 	ring->ckr_rhead = idx;
4006 
4007 sync_ring:
4008 	flags = NA_SYNCF_NETIF;
4009 	if (ring->ckr_pending_doorbell != 0) {
4010 		flags |= (NA_SYNCF_NETIF_DOORBELL | NA_SYNCF_NETIF_ASYNC);
4011 	}
4012 
4013 	ring->ckr_khead_pre = ring->ckr_khead;
4014 	sync_err = ring->ckr_na_sync(ring, kernproc, flags);
4015 	if (sync_err != 0 && sync_err != EAGAIN) {
4016 		SK_ERR("unexpected sync err %d", sync_err);
4017 		if (rc == 0) {
4018 			rc = sync_err;
4019 		}
4020 		goto done;
4021 	}
4022 	/*
4023 	 * Verify that the driver has detached packets from the consumed slots.
4024 	 */
4025 	idx = ring->ckr_khead_pre;
4026 	consumed = 0;
4027 	while (idx != ring->ckr_khead) {
4028 		struct __kern_slot_desc *ksd = KR_KSD(ring, idx);
4029 
4030 		consumed++;
4031 		VERIFY(!KSD_VALID_METADATA(ksd));
4032 		idx = SLOT_NEXT(idx, ring->ckr_lim);
4033 	}
4034 	ring->ckr_khead_pre = ring->ckr_khead;
4035 
4036 done:
4037 	sk_sync_unprotect(protect);
4038 	kr_exit(ring);
4039 out:
4040 	KDBG((SK_KTRACE_NETIF_RING_TX_REFILL | DBG_FUNC_END),
4041 	    SK_KVA(ring), rc, 0, npkts);
4042 
4043 	return rc;
4044 }
4045 
4046 void
4047 kern_netif_queue_rx_enqueue(kern_netif_queue_t queue, kern_packet_t ph_chain,
4048     uint32_t count, uint32_t flags)
4049 {
4050 #pragma unused (count)
4051 	struct netif_queue *q = queue;
4052 	struct netif_llink *llink = q->nq_qset->nqs_llink;
4053 	struct __kern_packet *pkt_chain = SK_PTR_ADDR_KPKT(ph_chain);
4054 	bool flush = ((flags & KERN_NETIF_QUEUE_RX_ENQUEUE_FLAG_FLUSH) != 0);
4055 	struct pktq *pktq = &q->nq_pktq;
4056 	struct netif_stats *nifs = &llink->nll_nif->nif_stats;
4057 	struct nexus_pkt_stats stats;
4058 	sk_protect_t protect;
4059 
4060 	ASSERT((q->nq_flags & NETIF_QUEUE_IS_RX) != 0);
4061 	if (llink->nll_state == NETIF_LLINK_STATE_DESTROYED) {
4062 		int drop_cnt = 0;
4063 
4064 		pp_free_packet_chain(pkt_chain, &drop_cnt);
4065 		STATS_ADD(nifs, NETIF_STATS_LLINK_RX_DROP_BAD_STATE, drop_cnt);
4066 		return;
4067 	}
4068 	KPKTQ_ENQUEUE_LIST(pktq, pkt_chain);
4069 	if (flush) {
4070 		pkt_chain = KPKTQ_FIRST(pktq);
4071 		KPKTQ_INIT(pktq);
4072 
4073 		protect = sk_sync_protect();
4074 		netif_receive(NA(llink->nll_nif->nif_ifp), pkt_chain, &stats);
4075 		sk_sync_unprotect(protect);
4076 	}
4077 }
4078 
4079 errno_t
4080 kern_netif_queue_tx_dequeue(kern_netif_queue_t queue, uint32_t pkt_limit,
4081     uint32_t byte_limit, boolean_t *pending, kern_packet_t *ph_chain)
4082 {
4083 	struct netif_queue *q = queue;
4084 	struct netif_llink *llink = q->nq_qset->nqs_llink;
4085 	struct netif_stats *nifs = &llink->nll_nif->nif_stats;
4086 	struct nexus_adapter *hwna;
4087 	struct __kern_packet *pkt_chain = NULL;
4088 	errno_t rc;
4089 
4090 	ASSERT((q->nq_flags & NETIF_QUEUE_IS_RX) == 0);
4091 	if (llink->nll_state == NETIF_LLINK_STATE_DESTROYED) {
4092 		STATS_INC(nifs, NETIF_STATS_LLINK_AQM_DEQ_BAD_STATE);
4093 		return ENXIO;
4094 	}
4095 	hwna = &NA(llink->nll_nif->nif_ifp)->nifna_up;
4096 
4097 	if (((hwna->na_flags & NAF_VIRTUAL_DEVICE) == 0) &&
4098 	    sk_is_tx_notify_protected()) {
4099 		pkt_limit = MIN(pkt_limit, nx_netif_doorbell_max_dequeue);
4100 	}
4101 	rc = netif_deq_packets(hwna, q->nq_qset->nqs_ifcq, pkt_limit,
4102 	    byte_limit, &pkt_chain, pending, q->nq_svc);
4103 
4104 	if (pkt_chain != NULL) {
4105 		*ph_chain = SK_PKT2PH(pkt_chain);
4106 	}
4107 	return rc;
4108 }
4109 
4110 errno_t
4111 kern_nexus_netif_llink_add(struct kern_nexus *nx,
4112     struct kern_nexus_netif_llink_init *llink_init)
4113 {
4114 	errno_t err;
4115 	struct nx_netif *nif;
4116 	struct netif_llink *llink;
4117 	struct netif_stats *nifs;
4118 
4119 	VERIFY(nx != NULL);
4120 	VERIFY(llink_init != NULL);
4121 	VERIFY((nx->nx_flags & NXF_ATTACHED) != 0);
4122 
4123 	nif = NX_NETIF_PRIVATE(nx);
4124 	nifs = &nif->nif_stats;
4125 
4126 	err = nx_netif_validate_llink_config(llink_init, false);
4127 	if (err != 0) {
4128 		SK_ERR("Invalid llink init params");
4129 		STATS_INC(nifs, NETIF_STATS_LLINK_ADD_BAD_PARAMS);
4130 		return err;
4131 	}
4132 
4133 	err = nx_netif_llink_add(nif, llink_init, &llink);
4134 	return err;
4135 }
4136 
4137 errno_t
4138 kern_nexus_netif_llink_remove(struct kern_nexus *nx,
4139     kern_nexus_netif_llink_id_t llink_id)
4140 {
4141 	struct nx_netif *nif;
4142 
4143 	VERIFY(nx != NULL);
4144 	VERIFY((nx->nx_flags & NXF_ATTACHED) != 0);
4145 
4146 	nif = NX_NETIF_PRIVATE(nx);
4147 	return nx_netif_llink_remove(nif, llink_id);
4148 }
4149 
4150 errno_t
4151 kern_netif_queue_get_service_class(kern_netif_queue_t queue,
4152     kern_packet_svc_class_t *svc)
4153 {
4154 	*svc = queue->nq_svc;
4155 	return 0;
4156 }
4157