xref: /xnu-12377.41.6/bsd/skywalk/nexus/netif/nx_netif.c (revision bbb1b6f9e71b8cdde6e5cd6f4841f207dee3d828)
1 /*
2  * Copyright (c) 2015-2023 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 /*
30  * The netif nexus domain has two domain providers: native and compat, with
31  * the latter being the default provider of this domain. The compat provider
32  * has special handlers for NXCFG_CMD_ATTACH and NXCFG_CMD_DETACH, etc.
33  *
34  * A netif nexus instance can be in a native or compat mode; in either case,
35  * it is associated with two instances of a nexus_adapter structure, and allows
36  * at most two channels opened to the nexus.  The two adapters correspond to
37  * host and device ports, respectively.
38  *
39  * By itself, a netif nexus isn't associated with a network interface. The
40  * association happens by attaching a network interface to the nexus instance.
41  * A channel can only be successfully opened to a netif nexus after it has an
42  * interface attached to it.
43  *
44  * During an attach, the interface is marked as Skywalk-capable, and its ifnet
45  * structure refers to the attached netif nexus adapter via its if_na field.
46  * The nexus also holds a reference to the interface on its na_ifp field. Note
47  * that attaching to a netif_compat nexus does not alter the input/output data
48  * path, nor does it remove any of the interface's hardware offload flags. It
49  * merely associates the interface and netif nexus together.
50  *
51  * During a detach, the above references are dropped and the fields are cleared;
52  * the interface is also marked as non-Skywalk-capable. This detach can happen
53  * explicitly via a command down the nexus, or implicitly when the nexus goes
54  * away (assuming there's no channel opened to it.)
55  *
56  * A userland channel can be opened to a netif nexus via the usual ch_open()
57  * way, assuming the nexus provider is setup to allow access for the userland
58  * process (either by binding the nexus port to PID, etc. or by creating the
59  * nexus in the anonymous mode.)
60  *
61  * Alternatively, a kernel channel can also be opened to it by some kernel
62  * subsystem, via ch_open_special(), e.g. by the flowswitch. Kernel channels
63  * don't have any task mapping created, and the flag CHANF_KERNEL is used to
64  * indicate that.
65  *
66  * Opening a channel to the host port of a native or compat netif causes the
67  * ifnet output path to be redirected to nx_netif_host_transmit().  We also,
68  * at present, disable any hardware offload features.
69  *
70  * Opening a channel to the device port of a compat netif causes the ifnet
71  * input path to be redirected to nx_netif_compat_receive().  This is specific
72  * to the compat variant, as the native variant's RX path already goes to
73  * the native netif.
74  *
75  * During channel close, we restore the original I/O callbacks, as well as the
76  * interface's offload flags.
77  */
78 
79 #include <skywalk/os_skywalk_private.h>
80 #include <skywalk/nexus/netif/nx_netif.h>
81 #include <skywalk/nexus/upipe/nx_user_pipe.h>
82 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
83 #include <sys/kdebug.h>
84 #include <sys/sdt.h>
85 #include <os/refcnt.h>
86 #include <libkern/OSDebug.h>
87 #include <kern/uipc_domain.h>
88 
/* ring/slot/buffer bounds for the netif nexus domain */
#define NX_NETIF_MAXRINGS       NX_MAX_NUM_RING_PAIR /* max # of ring pairs */
#define NX_NETIF_MINSLOTS       2       /* min # of slots per ring (XXX) */
#define NX_NETIF_MAXSLOTS       NX_MAX_NUM_SLOT_PER_RING /* max # of slots */
#define NX_NETIF_TXRINGSIZE     512     /* default TX ring size */
#define NX_NETIF_RXRINGSIZE     1024    /* default RX ring size */
#define NX_NETIF_BUFSIZE        (2 * 1024)  /* default buffer size */
#define NX_NETIF_MINBUFSIZE     (128)  /* min buffer size */
#define NX_NETIF_MAXBUFSIZE     (32 * 1024) /* max buffer size */
97 
/*
 * User/kernel metadata sizes, computed for the minimum buflet count.
 * TODO: [email protected] -- minimum buflets for now; we will need to
 * have a way to adjust this based on the underlying interface's
 * parameters, e.g. jumbo MTU, large segment offload, etc.
 */
#define NX_NETIF_UMD_SIZE       _USER_PACKET_SIZE(BUFLETS_MIN)
#define NX_NETIF_KMD_SIZE       _KERN_PACKET_SIZE(BUFLETS_MIN)
105 
/*
 * Minimum stack space required for IOSkywalkFamily and driver execution.
 * Expressed as a fraction of the kernel stack size: half on macOS,
 * a quarter on embedded platforms.
 */
#if XNU_TARGET_OS_OSX
#define NX_NETIF_MIN_DRIVER_STACK_SIZE    (kernel_stack_size >> 1)
#else /* !XNU_TARGET_OS_OSX */
#define NX_NETIF_MIN_DRIVER_STACK_SIZE    (kernel_stack_size >> 2)
#endif /* XNU_TARGET_OS_OSX */
114 
115 static void nx_netif_dom_init(struct nxdom *);
116 static void nx_netif_dom_terminate(struct nxdom *);
117 static void nx_netif_dom_fini(struct nxdom *);
118 static int nx_netif_prov_params_adjust(
119 	const struct kern_nexus_domain_provider *, const struct nxprov_params *,
120 	struct nxprov_adjusted_params *);
121 
122 static int nx_netif_dom_bind_port(struct kern_nexus *, nexus_port_t *,
123     struct nxbind *, void *);
124 static int nx_netif_dom_unbind_port(struct kern_nexus *, nexus_port_t);
125 static int nx_netif_dom_connect(struct kern_nexus_domain_provider *,
126     struct kern_nexus *, struct kern_channel *, struct chreq *, struct nxbind *,
127     struct proc *);
128 static void nx_netif_dom_disconnect(struct kern_nexus_domain_provider *,
129     struct kern_nexus *, struct kern_channel *);
130 static void nx_netif_dom_defunct(struct kern_nexus_domain_provider *,
131     struct kern_nexus *, struct kern_channel *, struct proc *);
132 static void nx_netif_dom_defunct_finalize(struct kern_nexus_domain_provider *,
133     struct kern_nexus *, struct kern_channel *, boolean_t);
134 
135 static void nx_netif_doorbell(struct ifnet *);
136 static int nx_netif_na_txsync(struct __kern_channel_ring *, struct proc *,
137     uint32_t);
138 static int nx_netif_na_rxsync(struct __kern_channel_ring *, struct proc *,
139     uint32_t);
140 static void nx_netif_na_dtor(struct nexus_adapter *na);
141 static int nx_netif_na_notify_tx(struct __kern_channel_ring *, struct proc *,
142     uint32_t);
143 static int nx_netif_na_notify_rx(struct __kern_channel_ring *, struct proc *,
144     uint32_t);
145 static int nx_netif_na_activate(struct nexus_adapter *, na_activate_mode_t);
146 
147 static int nx_netif_ctl(struct kern_nexus *, nxcfg_cmd_t, void *,
148     struct proc *);
149 static int nx_netif_ctl_attach(struct kern_nexus *, struct nx_spec_req *,
150     struct proc *);
151 static int nx_netif_ctl_detach(struct kern_nexus *, struct nx_spec_req *);
152 static int nx_netif_attach(struct kern_nexus *, struct ifnet *);
153 static void nx_netif_flags_init(struct nx_netif *);
154 static void nx_netif_flags_fini(struct nx_netif *);
155 static void nx_netif_callbacks_init(struct nx_netif *);
156 static void nx_netif_callbacks_fini(struct nx_netif *);
157 static void nx_netif_capabilities_fini(struct nx_netif *);
158 static errno_t nx_netif_interface_advisory_notify(void *,
159     const struct ifnet_interface_advisory *);
160 
/*
 * Domain descriptor for the netif nexus type.  Each nxdom_* triple below
 * gives the {default, minimum, maximum} bound that provider parameters
 * are clamped against (see nx_netif_prov_params_adjust()).
 */
struct nxdom nx_netif_dom_s = {
	.nxdom_prov_head =
    STAILQ_HEAD_INITIALIZER(nx_netif_dom_s.nxdom_prov_head),
	.nxdom_type =           NEXUS_TYPE_NET_IF,
	.nxdom_md_type =        NEXUS_META_TYPE_PACKET,
	.nxdom_md_subtype =     NEXUS_META_SUBTYPE_RAW,
	.nxdom_name =           "netif",
	/* at least two ports: dev and host (see NEXUS_PORT_NET_IF_*) */
	.nxdom_ports = {
		.nb_def = 2,
		.nb_min = 2,
		.nb_max = NX_NETIF_MAXPORTS,
	},
	.nxdom_tx_rings = {
		.nb_def = 1,
		.nb_min = 1,
		.nb_max = NX_NETIF_MAXRINGS,
	},
	.nxdom_rx_rings = {
		.nb_def = 1,
		.nb_min = 1,
		.nb_max = NX_NETIF_MAXRINGS,
	},
	.nxdom_tx_slots = {
		.nb_def = NX_NETIF_TXRINGSIZE,
		.nb_min = NX_NETIF_MINSLOTS,
		.nb_max = NX_NETIF_MAXSLOTS,
	},
	.nxdom_rx_slots = {
		.nb_def = NX_NETIF_RXRINGSIZE,
		.nb_min = NX_NETIF_MINSLOTS,
		.nb_max = NX_NETIF_MAXSLOTS,
	},
	.nxdom_buf_size = {
		.nb_def = NX_NETIF_BUFSIZE,
		.nb_min = NX_NETIF_MINBUFSIZE,
		.nb_max = NX_NETIF_MAXBUFSIZE,
	},
	/* large buffers not used by netif */
	.nxdom_large_buf_size = {
		.nb_def = 0,
		.nb_min = 0,
		.nb_max = 0,
	},
	.nxdom_meta_size = {
		.nb_def = NX_NETIF_UMD_SIZE,
		.nb_min = NX_NETIF_UMD_SIZE,
		.nb_max = NX_METADATA_USR_MAX_SZ,
	},
	.nxdom_stats_size = {
		.nb_def = 0,
		.nb_min = 0,
		.nb_max = NX_STATS_MAX_SZ,
	},
	.nxdom_pipes = {
		.nb_def = 0,
		.nb_min = 0,
		.nb_max = NX_UPIPE_MAXPIPES,
	},
	.nxdom_flowadv_max = {
		.nb_def = 0,
		.nb_min = 0,
		.nb_max = NX_FLOWADV_MAX,
	},
	.nxdom_nexusadv_size = {
		.nb_def = 0,
		.nb_min = 0,
		.nb_max = NX_NEXUSADV_MAX_SZ,
	},
	.nxdom_capabilities = {
		.nb_def = NXPCAP_USER_CHANNEL,
		.nb_min = 0,
		.nb_max = NXPCAP_USER_CHANNEL,
	},
	.nxdom_qmap = {
		.nb_def = NEXUS_QMAP_TYPE_DEFAULT,
		.nb_min = NEXUS_QMAP_TYPE_DEFAULT,
		.nb_max = NEXUS_QMAP_TYPE_WMM,
	},
	.nxdom_max_frags = {
		.nb_def = NX_PBUF_FRAGS_DEFAULT,
		.nb_min = NX_PBUF_FRAGS_MIN,
		.nb_max = NX_PBUF_FRAGS_MAX,
	},
	/* domain lifecycle and port/channel management callbacks */
	.nxdom_init =           nx_netif_dom_init,
	.nxdom_terminate =      nx_netif_dom_terminate,
	.nxdom_fini =           nx_netif_dom_fini,
	.nxdom_find_port =      NULL,
	.nxdom_port_is_reserved = NULL,
	.nxdom_bind_port =      nx_netif_dom_bind_port,
	.nxdom_unbind_port =    nx_netif_dom_unbind_port,
	.nxdom_connect =        nx_netif_dom_connect,
	.nxdom_disconnect =     nx_netif_dom_disconnect,
	.nxdom_defunct =        nx_netif_dom_defunct,
	.nxdom_defunct_finalize = nx_netif_dom_defunct_finalize,
};
255 
/*
 * Native netif domain provider.  Registered with the domain by
 * nx_netif_dom_init(); the compat provider (registered separately)
 * is the domain default.
 */
struct kern_nexus_domain_provider nx_netif_prov_s = {
	.nxdom_prov_name =              NEXUS_PROVIDER_NET_IF,
	/*
	 * Don't install this as the default domain provider, i.e.
	 * NXDOMPROVF_DEFAULT flag not set; we want netif_compat
	 * provider to be the one handling userland-issued requests
	 * coming down thru nxprov_create() instead.
	 */
	.nxdom_prov_flags =             0,
	.nxdom_prov_cb = {
		.dp_cb_init =           nx_netif_prov_init,
		.dp_cb_fini =           nx_netif_prov_fini,
		.dp_cb_params =         nx_netif_prov_params,
		.dp_cb_mem_new =        nx_netif_prov_mem_new,
		.dp_cb_config =         nx_netif_prov_config,
		.dp_cb_nx_ctor =        nx_netif_prov_nx_ctor,
		.dp_cb_nx_dtor =        nx_netif_prov_nx_dtor,
		.dp_cb_nx_mem_info =    nx_netif_prov_nx_mem_info,
		.dp_cb_nx_mib_get =     nx_netif_prov_nx_mib_get,
		.dp_cb_nx_stop =        nx_netif_prov_nx_stop,
	},
};
278 
/* ifnet-facing operations for native netif adapters */
struct nexus_ifnet_ops na_netif_ops = {
	.ni_finalize = na_netif_finalize,
	.ni_reap = nx_netif_reap,
	.ni_dequeue = nx_netif_native_tx_dequeue,
	.ni_get_len = nx_netif_native_tx_get_len,
};
285 
/* cap on packets dequeued per doorbell invocation */
#define NX_NETIF_DOORBELL_MAX_DEQUEUE    64
uint32_t nx_netif_doorbell_max_dequeue = NX_NETIF_DOORBELL_MAX_DEQUEUE;

#define NQ_TRANSFER_DECAY       2               /* ilog2 of EWMA decay rate (4) */
static uint32_t nq_transfer_decay = NQ_TRANSFER_DECAY;

#define NQ_ACCUMULATE_INTERVAL  2 /* 2 seconds */
static uint32_t nq_accumulate_interval = NQ_ACCUMULATE_INTERVAL;

SYSCTL_EXTENSIBLE_NODE(_kern_skywalk, OID_AUTO, netif,
    CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Skywalk network interface");
/* the tunables below are exposed only on DEVELOPMENT/DEBUG kernels */
#if (DEVELOPMENT || DEBUG)
SYSCTL_STRING(_kern_skywalk_netif, OID_AUTO, sk_ll_prefix,
    CTLFLAG_RW | CTLFLAG_LOCKED, sk_ll_prefix, sizeof(sk_ll_prefix),
    "ifname prefix for enabling low latency support");
static uint32_t nx_netif_force_ifnet_start = 0;
SYSCTL_UINT(_kern_skywalk_netif, OID_AUTO, force_ifnet_start,
    CTLFLAG_RW | CTLFLAG_LOCKED, &nx_netif_force_ifnet_start, 0,
    "always use ifnet starter thread");
SYSCTL_UINT(_kern_skywalk_netif, OID_AUTO, doorbell_max_dequeue,
    CTLFLAG_RW | CTLFLAG_LOCKED, &nx_netif_doorbell_max_dequeue,
    NX_NETIF_DOORBELL_MAX_DEQUEUE,
    "max packets to dequeue in doorbell context");
SYSCTL_UINT(_kern_skywalk_netif, OID_AUTO, netif_queue_transfer_decay,
    CTLFLAG_RW | CTLFLAG_LOCKED, &nq_transfer_decay,
    NQ_TRANSFER_DECAY, "ilog2 of EWMA decay rate of netif queue transfers");
SYSCTL_UINT(_kern_skywalk_netif, OID_AUTO, netif_queue_stat_accumulate_interval,
    CTLFLAG_RW | CTLFLAG_LOCKED, &nq_accumulate_interval,
    NQ_ACCUMULATE_INTERVAL, "accumulation interval for netif queue stats");
#endif /* DEVELOPMENT || DEBUG */

SYSCTL_UINT(_kern_skywalk_netif, OID_AUTO, netif_queue_stat_enable,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sk_netif_queue_stat_enable,
    0, "enable/disable stats collection for netif queue");
320 
/* typed zones for adapter and netif instance allocations */
static SKMEM_TYPE_DEFINE(na_netif_zone, struct nexus_netif_adapter);

static SKMEM_TYPE_DEFINE(nx_netif_zone, struct nx_netif);

/* allocation tags, one per netif subsystem, for memory accounting */
#define SKMEM_TAG_NETIF_MIT          "com.apple.skywalk.netif.mit"
static SKMEM_TAG_DEFINE(skmem_tag_netif_mit, SKMEM_TAG_NETIF_MIT);

#define SKMEM_TAG_NETIF_FILTER       "com.apple.skywalk.netif.filter"
SKMEM_TAG_DEFINE(skmem_tag_netif_filter, SKMEM_TAG_NETIF_FILTER);

#define SKMEM_TAG_NETIF_FLOW         "com.apple.skywalk.netif.flow"
SKMEM_TAG_DEFINE(skmem_tag_netif_flow, SKMEM_TAG_NETIF_FLOW);

#define SKMEM_TAG_NETIF_AGENT_FLOW   "com.apple.skywalk.netif.agent_flow"
SKMEM_TAG_DEFINE(skmem_tag_netif_agent_flow, SKMEM_TAG_NETIF_AGENT_FLOW);

#define SKMEM_TAG_NETIF_LLINK        "com.apple.skywalk.netif.llink"
SKMEM_TAG_DEFINE(skmem_tag_netif_llink, SKMEM_TAG_NETIF_LLINK);

#define SKMEM_TAG_NETIF_QSET         "com.apple.skywalk.netif.qset"
SKMEM_TAG_DEFINE(skmem_tag_netif_qset, SKMEM_TAG_NETIF_QSET);

#define SKMEM_TAG_NETIF_LLINK_INFO   "com.apple.skywalk.netif.llink_info"
SKMEM_TAG_DEFINE(skmem_tag_netif_llink_info, SKMEM_TAG_NETIF_LLINK_INFO);

/* use this for any temporary allocations */
#define SKMEM_TAG_NETIF_TEMP         "com.apple.skywalk.netif.temp"
static SKMEM_TAG_DEFINE(skmem_tag_netif_temp, SKMEM_TAG_NETIF_TEMP);
349 
/*
 * Domain init callback: registers the native provider, initializes the
 * compat provider (which becomes the domain default), and brings up GSO.
 * Runs once, with the SK lock held, before the domain is usable.
 */
static void
nx_netif_dom_init(struct nxdom *nxdom)
{
	SK_LOCK_ASSERT_HELD();
	ASSERT(!(nxdom->nxdom_flags & NEXUSDOMF_INITIALIZED));

	/*
	 * Compile-time sanity: fixed port numbering and the ordering of
	 * the mitigation-mode enum values are relied upon elsewhere.
	 */
	static_assert(NEXUS_PORT_NET_IF_DEV == 0);
	static_assert(NEXUS_PORT_NET_IF_HOST == 1);
	static_assert(NEXUS_PORT_NET_IF_CLIENT == 2);
	static_assert(SK_NETIF_MIT_FORCE_OFF < SK_NETIF_MIT_FORCE_SIMPLE);
	static_assert(SK_NETIF_MIT_FORCE_SIMPLE < SK_NETIF_MIT_FORCE_ADVANCED);
	static_assert(SK_NETIF_MIT_FORCE_ADVANCED < SK_NETIF_MIT_AUTO);
	static_assert(SK_NETIF_MIT_AUTO == SK_NETIF_MIT_MAX);

	(void) nxdom_prov_add(nxdom, &nx_netif_prov_s);

	/* registers the compat provider and installs it as domain default */
	nx_netif_compat_init(nxdom);

	ASSERT(nxdom_prov_default[nxdom->nxdom_type] != NULL &&
	    strbufcmp(nxdom_prov_default[nxdom->nxdom_type]->nxdom_prov_name,
	    NEXUS_PROVIDER_NET_IF_COMPAT) == 0);

	netif_gso_init();
}
374 
/*
 * Domain terminate callback: undoes nx_netif_dom_init() in reverse
 * order (GSO, compat provider, then every registered provider).
 * Called with the SK lock held.
 */
static void
nx_netif_dom_terminate(struct nxdom *nxdom)
{
	struct kern_nexus_domain_provider *nxdom_prov, *tnxdp;

	SK_LOCK_ASSERT_HELD();

	netif_gso_fini();
	nx_netif_compat_fini();

	/* _SAFE variant: nxdom_prov_del() unlinks entries as we iterate */
	STAILQ_FOREACH_SAFE(nxdom_prov, &nxdom->nxdom_prov_head,
	    nxdom_prov_link, tnxdp) {
		(void) nxdom_prov_del(nxdom_prov);
	}
}
390 
/* Domain fini callback: nothing to do for netif. */
static void
nx_netif_dom_fini(struct nxdom *nxdom)
{
#pragma unused(nxdom)
}
396 
/*
 * Provider init callback; logging only.  The pragma covers release
 * builds where SK_D() compiles away and the parameter goes unused.
 */
int
nx_netif_prov_init(struct kern_nexus_domain_provider *nxdom_prov)
{
#pragma unused(nxdom_prov)
	SK_D("initializing %s", nxdom_prov->nxdom_prov_name);
	return 0;
}
404 
/*
 * Sink notify callback installed on every ring by
 * nx_netif_prov_nx_stop(); rejects all notifications once the nexus
 * is being torn down.
 */
static int
nx_netif_na_notify_drop(struct __kern_channel_ring *kring, struct proc *p,
    uint32_t flags)
{
#pragma unused(kring, p, flags)
	return ENXIO;
}
412 
/*
 * Provider stop callback: quiesces the nexus by putting all rings in
 * drop mode, atomically swapping every TX/RX notify callback for
 * nx_netif_na_notify_drop(), and tearing down per-ring mitigation
 * (MIT) state.  Called with the SK lock held.
 */
int
nx_netif_prov_nx_stop(struct kern_nexus *nx)
{
	uint32_t r;
	struct nexus_adapter *na = nx_port_get_na(nx, NEXUS_PORT_NET_IF_DEV);
	struct nexus_netif_adapter *nifna = NIFNA(na);

	SK_LOCK_ASSERT_HELD();
	ASSERT(nx != NULL);

	/* place all rings in drop mode */
	na_kr_drop(na, TRUE);

	/* ensure global visibility */
	os_atomic_thread_fence(seq_cst);

	/* reset all TX notify callbacks */
	for (r = 0; r < na_get_nrings(na, NR_TX); r++) {
		/*
		 * CAS loop: re-read the current callback each attempt so
		 * the swap to the drop handler succeeds even if a racing
		 * updater changed the pointer between read and exchange.
		 */
		while (!os_atomic_cmpxchg((void * volatile *)&na->na_tx_rings[r].ckr_na_notify,
		    ptrauth_nop_cast(void *__single, na->na_tx_rings[r].ckr_na_notify),
		    ptrauth_nop_cast(void *__single, &nx_netif_na_notify_drop), acq_rel)) {
			;
		}
		os_atomic_thread_fence(seq_cst);
		if (nifna->nifna_tx_mit != NULL) {
			nx_netif_mit_cleanup(&nifna->nifna_tx_mit[r]);
		}
	}
	if (nifna->nifna_tx_mit != NULL) {
		skn_free_type_array_counted_by(tx, struct nx_netif_mit,
		    nifna->nifna_tx_mit_count, nifna->nifna_tx_mit);
	}

	/* reset all RX notify callbacks */
	for (r = 0; r < na_get_nrings(na, NR_RX); r++) {
		while (!os_atomic_cmpxchg((void * volatile *)&na->na_rx_rings[r].ckr_na_notify,
		    ptrauth_nop_cast(void *__single, na->na_rx_rings[r].ckr_na_notify),
		    ptrauth_nop_cast(void *__single, &nx_netif_na_notify_drop), acq_rel)) {
			;
		}
		os_atomic_thread_fence(seq_cst);
		if (nifna->nifna_rx_mit != NULL) {
			nx_netif_mit_cleanup(&nifna->nifna_rx_mit[r]);
		}
	}
	if (nifna->nifna_rx_mit != NULL) {
		skn_free_type_array_counted_by(rx, struct nx_netif_mit,
		    nifna->nifna_rx_mit_count, nifna->nifna_rx_mit);
	}
	return 0;
}
464 
465 static inline void
nx_netif_compat_adjust_ring_size(struct nxprov_adjusted_params * adj,ifnet_t ifp)466 nx_netif_compat_adjust_ring_size(struct nxprov_adjusted_params *adj,
467     ifnet_t ifp)
468 {
469 	const char *ifname;
470 
471 	ifname = __terminated_by_to_indexable(ifp->if_name);
472 	if (IFNET_IS_CELLULAR(ifp) && (ifp->if_unit != 0)) {
473 		*(adj->adj_rx_slots) = sk_netif_compat_aux_cell_rx_ring_sz;
474 		*(adj->adj_tx_slots) = sk_netif_compat_aux_cell_tx_ring_sz;
475 	} else if (IFNET_IS_WIFI(ifp)) {
476 		if (ifname[0] == 'a' && ifname[1] == 'p' &&
477 		    ifname[2] == '\0') {
478 			/* Wi-Fi Access Point */
479 			*(adj->adj_rx_slots) = sk_netif_compat_wap_rx_ring_sz;
480 			*(adj->adj_tx_slots) = sk_netif_compat_wap_tx_ring_sz;
481 		} else if (ifp->if_eflags & IFEF_AWDL) {
482 			/* AWDL */
483 			*(adj->adj_rx_slots) = sk_netif_compat_awdl_rx_ring_sz;
484 			*(adj->adj_tx_slots) = sk_netif_compat_awdl_tx_ring_sz;
485 		} else {
486 			/* Wi-Fi infrastructure */
487 			*(adj->adj_rx_slots) = sk_netif_compat_wif_rx_ring_sz;
488 			*(adj->adj_tx_slots) = sk_netif_compat_wif_tx_ring_sz;
489 		}
490 	} else if (IFNET_IS_ETHERNET(ifp)) {
491 #if !XNU_TARGET_OS_OSX
492 		/*
493 		 * On non-macOS platforms, treat all compat Ethernet
494 		 * interfaces as USB Ethernet with reduced ring sizes.
495 		 */
496 		*(adj->adj_rx_slots) = sk_netif_compat_usb_eth_rx_ring_sz;
497 		*(adj->adj_tx_slots) = sk_netif_compat_usb_eth_tx_ring_sz;
498 #else /* XNU_TARGET_OS_OSX */
499 		if (ifp->if_subfamily == IFNET_SUBFAMILY_USB) {
500 			*(adj->adj_rx_slots) =
501 			    sk_netif_compat_usb_eth_rx_ring_sz;
502 			*(adj->adj_tx_slots) =
503 			    sk_netif_compat_usb_eth_tx_ring_sz;
504 		}
505 #endif /* XNU_TARGET_OS_OSX */
506 	}
507 }
508 
/*
 * Per-provider parameter adjustment callback, invoked from
 * nxprov_params_adjust() via nx_netif_prov_params().
 * Returns 0 on success or EINVAL if the buffer size is too small
 * for the native case.
 */
static int
nx_netif_prov_params_adjust(const struct kern_nexus_domain_provider *nxdom_prov,
    const struct nxprov_params *nxp, struct nxprov_adjusted_params *adj)
{
	/*
	 * for netif compat adjust the following parameters for memory
	 * optimization:
	 * - change the size of buffer object to 128 bytes.
	 * - don't allocate rx ring for host port and tx ring for dev port.
	 * - for cellular interfaces other than pdp_ip0 reduce the ring size.
	 *   Assumption here is that pdp_ip0 is always used as the data
	 *   interface.
	 * - reduce the ring size for AWDL interface.
	 * - reduce the ring size for USB ethernet interface.
	 */
	if (strbufcmp(nxdom_prov->nxdom_prov_name,
	    NEXUS_PROVIDER_NET_IF_COMPAT) == 0) {
		/*
		 * Leave the parameters default if userspace access may be
		 * needed. We can't use skywalk_direct_allowed() here because
		 * the drivers have not attached yet.
		 */
		if (skywalk_netif_direct_enabled()) {
			goto done;
		}

		*(adj->adj_buf_size) = NETIF_COMPAT_BUF_SIZE;
		*(adj->adj_tx_rings) = 1;
		if (IF_INDEX_IN_RANGE(nxp->nxp_ifindex)) {
			ifnet_t ifp;
			/*
			 * Hold the ifnet head lock only for the table
			 * lookup; the index was validated just above.
			 */
			ifnet_head_lock_shared();
			ifp = ifindex2ifnet[nxp->nxp_ifindex];
			ifnet_head_done();
			VERIFY(ifp != NULL);
			nx_netif_compat_adjust_ring_size(adj, ifp);
		}
	} else { /* netif native */
		/* logical-link mode manages its own queues; minimal rings */
		if (nxp->nxp_flags & NXPF_NETIF_LLINK) {
			*(adj->adj_tx_slots) = NX_NETIF_MINSLOTS;
			*(adj->adj_rx_slots) = NX_NETIF_MINSLOTS;
		}
		/*
		 * Add another extra ring for host port. Note that if the
		 * nexus isn't configured to use the same pbufpool for all of
		 * its ports, we'd end up allocating extra here.
		 * Not a big deal since that case isn't the default.
		 */
		*(adj->adj_tx_rings) += 1;
		*(adj->adj_rx_rings) += 1;

		if ((*(adj->adj_buf_size) < PKT_MAX_PROTO_HEADER_SIZE)) {
			SK_ERR("buf size too small, min (%d)",
			    PKT_MAX_PROTO_HEADER_SIZE);
			return EINVAL;
		}
		static_assert(sizeof(struct __kern_netif_intf_advisory) == NX_INTF_ADV_SIZE);
		*(adj->adj_nexusadv_size) = sizeof(struct netif_nexus_advisory);
	}
done:
	return 0;
}
570 
571 int
nx_netif_prov_params(struct kern_nexus_domain_provider * nxdom_prov,const uint32_t req,const struct nxprov_params * nxp0,struct nxprov_params * nxp,struct skmem_region_params srp[SKMEM_REGIONS],uint32_t pp_region_config_flags)572 nx_netif_prov_params(struct kern_nexus_domain_provider *nxdom_prov,
573     const uint32_t req, const struct nxprov_params *nxp0,
574     struct nxprov_params *nxp, struct skmem_region_params srp[SKMEM_REGIONS],
575     uint32_t pp_region_config_flags)
576 {
577 	struct nxdom *nxdom = nxdom_prov->nxdom_prov_dom;
578 
579 	return nxprov_params_adjust(nxdom_prov, req, nxp0, nxp, srp,
580 	           nxdom, nxdom, nxdom, pp_region_config_flags,
581 	           nx_netif_prov_params_adjust);
582 }
583 
/*
 * Provider mem_new callback: creates the skmem arena backing an
 * adapter's rings/buffers.  Both netif ports share the packet buffer
 * pool stored in the nexus (nx_tx_pp/nx_rx_pp); whichever port is
 * opened first allocates it.  Returns 0 or an errno from arena
 * creation.
 */
int
nx_netif_prov_mem_new(struct kern_nexus_domain_provider *nxdom_prov,
    struct kern_nexus *nx, struct nexus_adapter *na)
{
#pragma unused(nxdom_prov)
	int err = 0;
	boolean_t pp_truncated_buf = FALSE;
	boolean_t allow_direct;
	boolean_t kernel_only;

	SK_DF(SK_VERB_NETIF,
	    "nx %p (\"%s\":\"%s\") na \"%s\" (%p)", SK_KVA(nx),
	    NX_DOM(nx)->nxdom_name, nxdom_prov->nxdom_prov_name, na->na_name,
	    SK_KVA(na));

	ASSERT(na->na_arena == NULL);
	/* compat adapters use the truncated 128-byte buffer objects */
	if ((na->na_type == NA_NETIF_COMPAT_DEV) ||
	    (na->na_type == NA_NETIF_COMPAT_HOST)) {
		pp_truncated_buf = TRUE;
	}
	/*
	 * We do this check to determine whether to create the extra
	 * regions needed for userspace access. This is per interface.
	 * NX_USER_CHANNEL_PROV() is systemwide so it can't be used.
	 */
	allow_direct = skywalk_netif_direct_allowed(
		__unsafe_null_terminated_from_indexable(na->na_name));

	/*
	 * Both ports (host and dev) share the same packet buffer pool;
	 * the first time a port gets opened will allocate the pp that
	 * gets stored in the nexus, which will then be used by any
	 * subsequent opens.
	 */
	kernel_only = !allow_direct || !NX_USER_CHANNEL_PROV(nx);
	na->na_arena = skmem_arena_create_for_nexus(na,
	    NX_PROV(nx)->nxprov_region_params, &nx->nx_tx_pp,
	    &nx->nx_rx_pp, pp_truncated_buf, kernel_only, &nx->nx_adv, &err);
	/* exactly one of (arena, err) indicates failure */
	ASSERT(na->na_arena != NULL || err != 0);
	ASSERT(nx->nx_tx_pp == NULL || (nx->nx_tx_pp->pp_md_type ==
	    NX_DOM(nx)->nxdom_md_type && nx->nx_tx_pp->pp_md_subtype ==
	    NX_DOM(nx)->nxdom_md_subtype));

	return err;
}
629 
/*
 * Handle NXCFG_CMD_GET_LLINK_INFO: copy out a snapshot of the nexus'
 * logical links and their queue sets to the caller-supplied buffer.
 * Returns 0 on success; ENOTSUP if llink mode is off or the request
 * version mismatches, ENXIO/ENOBUFS/ENOMEM or a copy error otherwise.
 *
 * NOTE(review): nif_llink_lock is held shared across the Z_WAITOK
 * allocation and the copyin/copyout — presumably acceptable here since
 * it's a sleepable rw lock; confirm against locking conventions.
 */
SK_NO_INLINE_ATTRIBUTE
static int
nx_netif_get_llink_info(struct sockopt *sopt, struct kern_nexus *nx)
{
	struct nx_llink_info_req *nlir = NULL;
	struct nx_netif *nif;
	struct netif_llink *llink;
	uint16_t llink_cnt;
	size_t len, user_len;
	int err, i;

	nif = NX_NETIF_PRIVATE(nx);
	if (!NETIF_LLINK_ENABLED(nif)) {
		SK_ERR("llink mode not enabled");
		return ENOTSUP;
	}
	lck_rw_lock_shared(&nif->nif_llink_lock);
	llink_cnt = nif->nif_llink_cnt;
	if (llink_cnt == 0) {
		SK_ERR("zero llink cnt");
		err = ENXIO;
		goto done;
	}
	/* header plus one nx_llink_info entry per logical link */
	len = sizeof(*nlir) + (sizeof(struct nx_llink_info) * llink_cnt);
	/* preserve sopt_valsize because it gets overwritten by copyin */
	user_len = sopt->sopt_valsize;
	if (user_len < len) {
		SK_ERR("buffer too small");
		err = ENOBUFS;
		goto done;
	}
	nlir = sk_alloc_data(len, Z_WAITOK, skmem_tag_netif_llink_info);
	if (nlir == NULL) {
		SK_ERR("failed to allocate nlir");
		err = ENOMEM;
		goto done;
	}
	/* only the fixed-size header is copied in (for the version) */
	err = sooptcopyin(sopt, nlir, sizeof(*nlir), sizeof(*nlir));
	if (err != 0) {
		SK_ERR("copyin failed: %d", err);
		goto done;
	}
	if (nlir->nlir_version != NETIF_LLINK_INFO_VERSION) {
		SK_ERR("nlir version mismatch: %d != %d",
		    nlir->nlir_version, NETIF_LLINK_INFO_VERSION);
		err = ENOTSUP;
		goto done;
	}
	/* populate one entry per llink, each with its qset summaries */
	nlir->nlir_llink_cnt = llink_cnt;
	i = 0;
	STAILQ_FOREACH(llink, &nif->nif_llink_list, nll_link) {
		struct nx_llink_info *nli;
		struct netif_qset *qset;
		uint16_t qset_cnt;
		int j;

		nli = &nlir->nlir_llink[i];
		nli->nli_link_id = llink->nll_link_id;
		nli->nli_link_id_internal = llink->nll_link_id_internal;
		nli->nli_state = llink->nll_state;
		nli->nli_flags = llink->nll_flags;

		qset_cnt = llink->nll_qset_cnt;
		ASSERT(qset_cnt <= NETIF_LLINK_MAX_QSETS);
		nli->nli_qset_cnt = qset_cnt;

		j = 0;
		SLIST_FOREACH(qset, &llink->nll_qset_list, nqs_list) {
			struct nx_qset_info *nqi;

			nqi = &nli->nli_qset[j];
			nqi->nqi_id = qset->nqs_id;
			nqi->nqi_flags = qset->nqs_flags;
			nqi->nqi_num_rx_queues = qset->nqs_num_rx_queues;
			nqi->nqi_num_tx_queues = qset->nqs_num_tx_queues;
			j++;
		}
		ASSERT(j == qset_cnt);
		i++;
	}
	ASSERT(i == llink_cnt);
	/* restore the user's buffer size before copying out */
	sopt->sopt_valsize = user_len;
	err = sooptcopyout(sopt, nlir, len);
	if (err != 0) {
		SK_ERR("sooptcopyout failed: %d", err);
	}
done:
	lck_rw_unlock_shared(&nif->nif_llink_lock);
	if (nlir != NULL) {
		/* len is always initialized whenever nlir is non-NULL */
		sk_free_data(nlir, len);
	}
	return err;
}
723 
/*
 * Provider config callback: entry point for nexus configuration
 * commands (attach/detach, flow add/del, llink info).  Verifies the
 * caller holds the netif registration entitlement, wraps the request
 * in a sockopt for copyin/copyout, then dispatches on nc_cmd.
 */
int
nx_netif_prov_config(struct kern_nexus_domain_provider *nxdom_prov,
    struct kern_nexus *nx, struct nx_cfg_req *ncr, int sopt_dir,
    struct proc *p, kauth_cred_t cred)
{
#pragma unused(nxdom_prov)
	struct sockopt sopt;
	int err = 0;

	SK_LOCK_ASSERT_HELD();

	/* proceed only if the client possesses netif entitlement */
	if ((err = skywalk_priv_check_cred(p, cred,
	    PRIV_SKYWALK_REGISTER_NET_IF)) != 0) {
		goto done;
	}

	if (ncr->nc_req == USER_ADDR_NULL) {
		err = EINVAL;
		goto done;
	}

	/* to make life easier for handling copies */
	bzero(&sopt, sizeof(sopt));
	sopt.sopt_dir = sopt_dir;
	sopt.sopt_val = ncr->nc_req;
	sopt.sopt_valsize = ncr->nc_req_len;
	sopt.sopt_p = p;

	switch (ncr->nc_cmd) {
	case NXCFG_CMD_ATTACH:
	case NXCFG_CMD_DETACH: {
		struct nx_spec_req nsr;

		bzero(&nsr, sizeof(nsr));
		err = sooptcopyin(&sopt, &nsr, sizeof(nsr), sizeof(nsr));
		if (err != 0) {
			goto done;
		}

		/*
		 * Null-terminate in case this has an interface name;
		 * the union is already large enough for uuid_t.
		 */
		nsr.nsr_name[sizeof(nsr.nsr_name) - 1] = '\0';
		/* user-originated requests may not set kernel-only flags */
		if (p != kernproc) {
			nsr.nsr_flags &= NXSPECREQ_MASK;
		}

		err = nx_netif_ctl(nx, ncr->nc_cmd, &nsr, p);
		if (err != 0) {
			goto done;
		}

		/* XXX: [email protected] -- can this copyout fail? */
		(void) sooptcopyout(&sopt, &nsr, sizeof(nsr));
		break;
	}
	case NXCFG_CMD_FLOW_ADD:
	case NXCFG_CMD_FLOW_DEL: {
		/* kernel-only tail fields must not extend the shared layout */
		static_assert(offsetof(struct nx_flow_req, _nfr_kernel_field_end) == offsetof(struct nx_flow_req, _nfr_common_field_end));
		struct nx_flow_req nfr;

		bzero(&nfr, sizeof(nfr));
		err = sooptcopyin(&sopt, &nfr, sizeof(nfr), sizeof(nfr));
		if (err != 0) {
			goto done;
		}

		err = nx_netif_ctl(nx, ncr->nc_cmd, &nfr, p);
		if (err != 0) {
			goto done;
		}

		/* XXX: [email protected] -- can this copyout fail? */
		(void) sooptcopyout(&sopt, &nfr, sizeof(nfr));
		break;
	}
	case NXCFG_CMD_GET_LLINK_INFO: {
		err = nx_netif_get_llink_info(&sopt, nx);
		break;
	}
	default:
		err = EINVAL;
		goto done;
	}
done:
	SK_DF(err ? SK_VERB_ERROR : SK_VERB_NETIF,
	    "nexus %p (%s) cmd %d err %d", SK_KVA(nx),
	    NX_DOM_PROV(nx)->nxdom_prov_name, ncr->nc_cmd, err);
	return err;
}
816 
/*
 * Provider fini callback; logging only.  The pragma covers release
 * builds where SK_D() compiles away and the parameter goes unused.
 */
void
nx_netif_prov_fini(struct kern_nexus_domain_provider *nxdom_prov)
{
#pragma unused(nxdom_prov)
	SK_D("destroying %s", nxdom_prov->nxdom_prov_name);
}
823 
824 int
nx_netif_prov_nx_ctor(struct kern_nexus * nx)825 nx_netif_prov_nx_ctor(struct kern_nexus *nx)
826 {
827 	struct nx_netif *n;
828 	char name[64];
829 	const char *__null_terminated nxadv_name = NULL;
830 	int error;
831 
832 	SK_LOCK_ASSERT_HELD();
833 	ASSERT(nx->nx_arg == NULL);
834 
835 	SK_D("nexus %p (%s)", SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name);
836 
837 	nx->nx_arg = nx_netif_alloc(Z_WAITOK);
838 	n = NX_NETIF_PRIVATE(nx);
839 	if (NX_USER_CHANNEL_PROV(nx) &&
840 	    NX_PROV(nx)->nxprov_params->nxp_nexusadv_size != 0) {
841 		nxadv_name = tsnprintf(name, sizeof(name), "netif_%llu", nx->nx_id);
842 		error = nx_advisory_alloc(nx, nxadv_name,
843 		    &NX_PROV(nx)->nxprov_region_params[SKMEM_REGION_NEXUSADV],
844 		    NEXUS_ADVISORY_TYPE_NETIF);
845 		if (error != 0) {
846 			nx_netif_free(n);
847 			return error;
848 		}
849 	}
850 	n->nif_nx = nx;
851 	SK_D("create new netif %p for nexus %p",
852 	    SK_KVA(NX_NETIF_PRIVATE(nx)), SK_KVA(nx));
853 	return 0;
854 }
855 
/*
 * Nexus destructor for the netif domain; undoes the constructor and
 * any interface attach still in effect.  Called with the SK lock held
 * from the nexus detach path.
 */
void
nx_netif_prov_nx_dtor(struct kern_nexus *nx)
{
	struct nx_netif *n = NX_NETIF_PRIVATE(nx);

	SK_LOCK_ASSERT_HELD();

	SK_D("nexus %p (%s) netif %p", SK_KVA(nx),
	    NX_DOM_PROV(nx)->nxdom_prov_name, SK_KVA(n));

	/*
	 * XXX
	 * detach should be done separately to be symmetrical with attach.
	 */
	nx_advisory_free(nx);
	/* a live dev-port adapter means an interface is still attached */
	if (nx_port_get_na(nx, NEXUS_PORT_NET_IF_DEV) != NULL) {
		/* we're called by nx_detach(), so this cannot fail */
		int err = nx_netif_ctl_detach(nx, NULL);
		VERIFY(err == 0);
	}
	/* release any dev/host port bindings that were kept around */
	if (n->nif_dev_nxb != NULL) {
		nxb_free(n->nif_dev_nxb);
		n->nif_dev_nxb = NULL;
	}
	if (n->nif_host_nxb != NULL) {
		nxb_free(n->nif_host_nxb);
		n->nif_host_nxb = NULL;
	}
	SK_DF(SK_VERB_NETIF, "marking netif %p as free", SK_KVA(n));
	nx_netif_free(n);
	nx->nx_arg = NULL;
}
888 
889 int
nx_netif_prov_nx_mem_info(struct kern_nexus * nx,struct kern_pbufpool ** tpp,struct kern_pbufpool ** rpp)890 nx_netif_prov_nx_mem_info(struct kern_nexus *nx, struct kern_pbufpool **tpp,
891     struct kern_pbufpool **rpp)
892 {
893 	ASSERT(nx->nx_tx_pp != NULL);
894 	ASSERT(nx->nx_rx_pp != NULL);
895 
896 	if (tpp != NULL) {
897 		*tpp = nx->nx_tx_pp;
898 	}
899 	if (rpp != NULL) {
900 		*rpp = nx->nx_rx_pp;
901 	}
902 
903 	return 0;
904 }
905 
906 static size_t
__netif_mib_get_stats(struct kern_nexus * nx,void * out,size_t len)907 __netif_mib_get_stats(struct kern_nexus *nx, void *out, size_t len)
908 {
909 	struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
910 	struct ifnet *ifp = nif->nif_ifp;
911 	struct sk_stats_net_if *__single sns = out;
912 	size_t actual_space = sizeof(struct sk_stats_net_if);
913 
914 	if (out != NULL && actual_space <= len) {
915 		uuid_copy(sns->sns_nx_uuid, nx->nx_uuid);
916 		if (ifp != NULL) {
917 			(void) strlcpy(sns->sns_if_name, if_name(ifp), IFNAMSIZ);
918 		}
919 		sns->sns_nifs = nif->nif_stats;
920 	}
921 
922 	return actual_space;
923 }
924 
/*
 * MIB export of the logical-link table: writes one nx_llink_info per
 * logical link into `out' when the buffer is large enough.  Returns
 * the number of bytes needed for the full table (0 when logical links
 * are disabled) whether or not anything was copied.
 */
static size_t
__netif_mib_get_llinks(struct kern_nexus *nx, void *__sized_by(len) out, size_t len)
{
	struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
	struct nx_llink_info *nli_list = out;
	size_t actual_space = 0;
	if (NETIF_LLINK_ENABLED(nif)) {
		/* hold the llink lock so count and list stay consistent */
		lck_rw_lock_shared(&nif->nif_llink_lock);
		actual_space += nif->nif_llink_cnt * sizeof(struct nx_llink_info);

		if (out != NULL && actual_space <= len) {
			struct netif_llink *llink;
			int i = 0;
			STAILQ_FOREACH(llink, &nif->nif_llink_list, nll_link) {
				struct nx_llink_info *nli;
				struct netif_qset *qset;
				uint16_t qset_cnt;
				int j;

				/* one nx_llink_info record per logical link */
				nli = &nli_list[i];
				uuid_copy(nli->nli_netif_uuid, nx->nx_uuid);
				nli->nli_link_id = llink->nll_link_id;
				nli->nli_link_id_internal = llink->nll_link_id_internal;
				nli->nli_state = llink->nll_state;
				nli->nli_flags = llink->nll_flags;

				qset_cnt = llink->nll_qset_cnt;
				ASSERT(qset_cnt <= NETIF_LLINK_MAX_QSETS);
				nli->nli_qset_cnt = qset_cnt;

				/* summarize each queue set of this link */
				j = 0;
				SLIST_FOREACH(qset, &llink->nll_qset_list, nqs_list) {
					struct nx_qset_info *nqi;

					nqi = &nli->nli_qset[j];
					nqi->nqi_id = qset->nqs_id;
					nqi->nqi_flags = qset->nqs_flags;
					nqi->nqi_num_rx_queues = qset->nqs_num_rx_queues;
					nqi->nqi_num_tx_queues = qset->nqs_num_tx_queues;
					j++;
				}
				ASSERT(j == qset_cnt);
				i++;
			}
			ASSERT(i == nif->nif_llink_cnt);
		}
		lck_rw_unlock_shared(&nif->nif_llink_lock);
	}

	return actual_space;
}
976 
/*
 * MIB export of per-queue statistics.  Two passes under a shared hold
 * of the llink lock: the first sizes the output (one netif_qstats_info
 * per RX/TX driver queue across all qsets of all logical links), the
 * second fills `out' only if the caller's buffer is large enough.
 * Returns the required size in bytes either way.
 */
static size_t
__netif_mib_get_queue_stats(struct kern_nexus *nx, void *__sized_by(len) out, size_t len)
{
	struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
	uint8_t *itr = out;
	size_t actual_space = 0;
	if (!NETIF_LLINK_ENABLED(nif)) {
		return actual_space;
	}

	lck_rw_lock_shared(&nif->nif_llink_lock);
	struct netif_llink *llink;
	struct netif_qset *qset;
	/* pass 1: compute total space for all driver queues */
	STAILQ_FOREACH(llink, &nif->nif_llink_list, nll_link) {
		SLIST_FOREACH(qset, &llink->nll_qset_list, nqs_list) {
			actual_space += sizeof(struct netif_qstats_info) *
			    (qset->nqs_num_rx_queues + qset->nqs_num_tx_queues);
		}
	}
	/* size-probe mode, or buffer too small: report requirement only */
	if (out == NULL || actual_space > len) {
		lck_rw_unlock_shared(&nif->nif_llink_lock);
		return actual_space;
	}

	/* pass 2: emit one record per queue, advancing itr through out */
	llink = NULL;
	qset = NULL;
	uint16_t i = 0, j = 0;
	STAILQ_FOREACH(llink, &nif->nif_llink_list, nll_link) {
		uint16_t qset_cnt;
		j = 0;
		qset_cnt = llink->nll_qset_cnt;
		ASSERT(qset_cnt <= NETIF_LLINK_MAX_QSETS);
		SLIST_FOREACH(qset, &llink->nll_qset_list, nqs_list) {
			/* RX queues precede TX in nqs_driver_queues[] */
			int queue_cnt = qset->nqs_num_rx_queues +
			    qset->nqs_num_tx_queues;
			for (uint16_t k = 0; k < queue_cnt; k++) {
				struct netif_qstats_info *nqi =
				    (struct netif_qstats_info *)(void *)itr;
				struct netif_queue *nq = &qset->nqs_driver_queues[k];
				nqi->nqi_qset_id = qset->nqs_id;
				nqi->nqi_queue_idx = k;
				if (KPKT_VALID_SVC(nq->nq_svc)) {
					nqi->nqi_svc = (packet_svc_class_t)nq->nq_svc;
				}
				if (nq->nq_flags & NETIF_QUEUE_IS_RX) {
					nqi->nqi_queue_flag = NQI_QUEUE_FLAG_IS_RX;
				}

				struct netif_qstats *nq_out = &nqi->nqi_stats;
				struct netif_qstats *nq_src = &nq->nq_stats;
				memcpy(nq_out, nq_src, sizeof(struct netif_qstats));

				itr += sizeof(struct netif_qstats_info);
			}
			j++;
		}
		ASSERT(j == qset_cnt);
		i++;
	}
	ASSERT(i == nif->nif_llink_cnt);

	lck_rw_unlock_shared(&nif->nif_llink_lock);
	return actual_space;
}
1041 
1042 size_t
nx_netif_prov_nx_mib_get(struct kern_nexus * nx,struct nexus_mib_filter * filter,void * __sized_by (len)out,size_t len,struct proc * p)1043 nx_netif_prov_nx_mib_get(struct kern_nexus *nx, struct nexus_mib_filter *filter,
1044     void *__sized_by(len) out, size_t len, struct proc *p)
1045 {
1046 #pragma unused(p)
1047 	size_t ret;
1048 
1049 	if ((filter->nmf_bitmap & NXMIB_FILTER_NX_UUID) &&
1050 	    (uuid_compare(filter->nmf_nx_uuid, nx->nx_uuid)) != 0) {
1051 		return 0;
1052 	}
1053 
1054 	switch (filter->nmf_type) {
1055 	case NXMIB_NETIF_STATS:
1056 		ret = __netif_mib_get_stats(nx, out, len);
1057 		break;
1058 	case NXMIB_LLINK_LIST:
1059 		ret = __netif_mib_get_llinks(nx, out, len);
1060 		break;
1061 	case NXMIB_NETIF_QUEUE_STATS:
1062 		ret = __netif_mib_get_queue_stats(nx, out, len);
1063 		break;
1064 	default:
1065 		ret = 0;
1066 		break;
1067 	}
1068 	return ret;
1069 }
1070 
/*
 * Bind a client to a netif nexus port, recording the credentials in
 * `nxb' (and optional `info') against the chosen port.  On success
 * *nx_port holds the bound port.  Returns 0 or an errno; ENOMEM means
 * the domain has no client ports at all.
 */
static int
nx_netif_dom_bind_port(struct kern_nexus *nx, nexus_port_t *nx_port,
    struct nxbind *nxb, void *info)
{
	struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
	nexus_port_t first, last, port;
	int error;

	ASSERT(nx_port != NULL);
	ASSERT(nxb != NULL);

	port = *nx_port;

	/*
	 * If port is:
	 * != NEXUS_PORT_ANY: attempt to bind to the specified port
	 * == NEXUS_PORT_ANY: find an available port, bind to it, and
	 *                    return back the assigned port.
	 */
	first = NEXUS_PORT_NET_IF_CLIENT;
	ASSERT(NXDOM_MAX(NX_DOM(nx), ports) <= NEXUS_PORT_MAX);
	last = (nexus_port_size_t)NXDOM_MAX(NX_DOM(nx), ports);
	ASSERT(first <= last);

	NETIF_WLOCK(nif);

	if (__improbable(first == last)) {
		/* no client port range configured for this domain */
		error = ENOMEM;
	} else if (port != NEXUS_PORT_ANY) {
		error = nx_port_bind_info(nx, port, nxb, info);
		SK_DF(SK_VERB_NETIF, "port %d, bind err %d", port, error);
	} else {
		/* search [first, last) for a free port, then bind it */
		error = nx_port_find(nx, first, last - 1, &port);
		ASSERT(error != 0 || (port >= first && port < last));
		if (error == 0) {
			error = nx_port_bind_info(nx, port, nxb, info);
			SK_DF(SK_VERB_NETIF, "found port %d, bind err %d",
			    port, error);
		}
	}
	NETIF_WUNLOCK(nif);

	/* *nx_port is only updated on success */
	ASSERT(*nx_port == NEXUS_PORT_ANY || *nx_port == port);
	if (error == 0) {
		*nx_port = port;
	}

	SK_DF(error ? SK_VERB_ERROR : SK_VERB_NETIF,
	    "+++ netif %p nx_port %d, total %u active %u (err %d)",
	    SK_KVA(nif), (int)*nx_port, NX_NETIF_MAXPORTS,
	    nx->nx_active_ports, error);

	return error;
}
1125 
1126 static int
nx_netif_dom_unbind_port(struct kern_nexus * nx,nexus_port_t nx_port)1127 nx_netif_dom_unbind_port(struct kern_nexus *nx, nexus_port_t nx_port)
1128 {
1129 	struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
1130 	int error = 0;
1131 
1132 	ASSERT(nx_port != NEXUS_PORT_ANY);
1133 
1134 	NETIF_WLOCK(nif);
1135 	error = nx_port_unbind(nx, nx_port);
1136 	NETIF_WUNLOCK(nif);
1137 
1138 	return error;
1139 }
1140 
/*
 * Domain connect handler: validates the requested port against the
 * channel mode, tags host/user channels so the external provider is
 * skipped where appropriate, then hands off to na_connect{_spec}() to
 * wire the channel to an adapter.  Called with the SK lock held.
 * Returns 0 or an errno.
 */
static int
nx_netif_dom_connect(struct kern_nexus_domain_provider *nxdom_prov,
    struct kern_nexus *nx, struct kern_channel *ch, struct chreq *chr,
    struct nxbind *nxb, struct proc *p)
{
#pragma unused(nxdom_prov)
	int err = 0;

	SK_LOCK_ASSERT_HELD();

	ASSERT(NX_DOM_PROV(nx) == nxdom_prov);
	ASSERT(nx->nx_prov->nxprov_params->nxp_type ==
	    nxdom_prov->nxdom_prov_dom->nxdom_type &&
	    nx->nx_prov->nxprov_params->nxp_type == NEXUS_TYPE_NET_IF);
	ASSERT(!(ch->ch_flags & CHANF_HOST));

	switch (chr->cr_port) {
	case NEXUS_PORT_NET_IF_DEV:
		/* the device port may not be opened in host mode */
		if (chr->cr_mode & CHMODE_HOST) {
			err = EINVAL;
			goto done;
		}
		break;

	case NEXUS_PORT_NET_IF_HOST:
		if (!(chr->cr_mode & CHMODE_HOST)) {
			/* kernel channels must ask for host mode explicitly */
			if (ch->ch_flags & CHANF_KERNEL) {
				err = EINVAL;
				goto done;
			}
			chr->cr_mode |= CHMODE_HOST;
		}
		/*
		 * This channel is exclusively opened to the host
		 * rings; don't notify the external provider.
		 */
		os_atomic_or(&ch->ch_flags, CHANF_HOST | CHANF_EXT_SKIP, relaxed);
		break;

	default:
		/*
		 * This channel is shared between netif and user process;
		 * don't notify the external provider.
		 */
		os_atomic_or(&ch->ch_flags, CHANF_EXT_SKIP, relaxed);
		break;
	}

	chr->cr_ring_set = RING_SET_DEFAULT;
	chr->cr_endpoint = CH_ENDPOINT_NET_IF;
	(void) snprintf(chr->cr_name, sizeof(chr->cr_name), "netif:%llu:%.*s",
	    nx->nx_id, (int)nx->nx_prov->nxprov_params->nxp_namelen,
	    nx->nx_prov->nxprov_params->nxp_name);

	/* kernel channels take the special (spec) connect path */
	if (ch->ch_flags & CHANF_KERNEL) {
		err = na_connect_spec(nx, ch, chr, p);
	} else {
		err = na_connect(nx, ch, chr, nxb, p);
	}

	if (err == 0) {
		/*
		 * Mark the kernel slot descriptor region as busy; this
		 * prevents it from being torn-down at channel defunct
		 * time, as the (external) nexus owner may be calling
		 * KPIs that require accessing the slots.
		 */
		skmem_arena_nexus_sd_set_noidle(
			skmem_arena_nexus(ch->ch_na->na_arena), 1);
	}

done:
	return err;
}
1215 
/*
 * Domain disconnect handler; the inverse of nx_netif_dom_connect().
 * Drops the slot-descriptor busy assertion first so the arena can be
 * torn down, then detaches the channel from its adapter.  Called with
 * the SK lock held.
 */
static void
nx_netif_dom_disconnect(struct kern_nexus_domain_provider *nxdom_prov,
    struct kern_nexus *nx, struct kern_channel *ch)
{
#pragma unused(nxdom_prov)
	SK_LOCK_ASSERT_HELD();

	SK_D("channel %p -!- nexus %p (%s:\"%s\":%u:%d)", SK_KVA(ch),
	    SK_KVA(nx), nxdom_prov->nxdom_prov_name, ch->ch_na->na_name,
	    ch->ch_info->cinfo_nx_port, (int)ch->ch_info->cinfo_ch_ring_id);

	/*
	 * Release busy assertion held earlier in nx_netif_dom_connect();
	 * this allows for the final arena teardown to succeed.
	 */
	skmem_arena_nexus_sd_set_noidle(
		skmem_arena_nexus(ch->ch_na->na_arena), -1);

	/* mirror the connect path: spec for kernel, regular otherwise */
	if (ch->ch_flags & CHANF_KERNEL) {
		na_disconnect_spec(nx, ch);
	} else {
		na_disconnect(nx, ch);
	}
}
1240 
/*
 * First stage of channel defunct for netif: only the channel's rings
 * are marked defunct here (with the channel lock held); the heavier
 * teardown happens later in nx_netif_dom_defunct_finalize().  Kernel
 * channels are never defuncted this way.
 */
static void
nx_netif_dom_defunct(struct kern_nexus_domain_provider *nxdom_prov,
    struct kern_nexus *nx, struct kern_channel *ch, struct proc *p)
{
#pragma unused(nxdom_prov, nx)
	LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
	ASSERT(!(ch->ch_flags & CHANF_KERNEL));
	/* only netif adapter flavors are expected on this nexus */
	ASSERT(ch->ch_na->na_type == NA_NETIF_DEV ||
	    ch->ch_na->na_type == NA_NETIF_HOST ||
	    ch->ch_na->na_type == NA_NETIF_COMPAT_DEV ||
	    ch->ch_na->na_type == NA_NETIF_COMPAT_HOST ||
	    ch->ch_na->na_type == NA_NETIF_VP);

	na_ch_rings_defunct(ch, p);
}
1256 
/*
 * Second stage of channel defunct: actually defunct the adapter via
 * na_defunct().  The caller may invoke this either with or without
 * the SK lock and channel lock held; `locked' says which, and the
 * assertions at entry/exit enforce the matching discipline.  For a
 * low-latency VPNA, the adapter's ifnet reference is dropped here
 * rather than at channel close.
 */
static void
nx_netif_dom_defunct_finalize(struct kern_nexus_domain_provider *nxdom_prov,
    struct kern_nexus *nx, struct kern_channel *ch, boolean_t locked)
{
#pragma unused(nxdom_prov)
	struct ifnet *ifp;

	if (!locked) {
		SK_LOCK_ASSERT_NOTHELD();
		SK_LOCK();
		LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_NOTOWNED);
	} else {
		SK_LOCK_ASSERT_HELD();
		LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
	}

	/* only netif adapter flavors are expected on this nexus */
	ASSERT(ch->ch_na->na_type == NA_NETIF_DEV ||
	    ch->ch_na->na_type == NA_NETIF_HOST ||
	    ch->ch_na->na_type == NA_NETIF_COMPAT_DEV ||
	    ch->ch_na->na_type == NA_NETIF_COMPAT_HOST ||
	    ch->ch_na->na_type == NA_NETIF_VP);

	na_defunct(nx, ch, ch->ch_na, locked);
	ifp = ch->ch_na->na_ifp;
	if (ch->ch_na->na_type == NA_NETIF_VP && ifp != NULL &&
	    ifnet_is_low_latency(ifp)) {
		/*
		 * We release the VPNA's ifp here instead of waiting for the
		 * application to close the channel to trigger the release.
		 */
		DTRACE_SKYWALK2(release__vpna__ifp, struct nexus_adapter *,
		    ch->ch_na, struct ifnet *, ifp);
		ifnet_decr_iorefcnt(ifp);
		ch->ch_na->na_ifp = NULL;
	}
	SK_D("%s(%d): ch %p -/- nx %p (%s:\"%s\":%u:%d)",
	    ch->ch_name, ch->ch_pid, SK_KVA(ch), SK_KVA(nx),
	    nxdom_prov->nxdom_prov_name, ch->ch_na->na_name,
	    ch->ch_info->cinfo_nx_port, (int)ch->ch_info->cinfo_ch_ring_id);

	/* restore the caller's locking state on the way out */
	if (!locked) {
		LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_NOTOWNED);
		SK_UNLOCK();
	} else {
		LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
		SK_LOCK_ASSERT_HELD();
	}
}
1305 
1306 struct nexus_netif_adapter *
na_netif_alloc(zalloc_flags_t how)1307 na_netif_alloc(zalloc_flags_t how)
1308 {
1309 	static_assert(offsetof(struct nexus_netif_adapter, nifna_up) == 0);
1310 
1311 	return zalloc_flags(na_netif_zone, how | Z_ZERO);
1312 }
1313 
1314 void
na_netif_free(struct nexus_adapter * na)1315 na_netif_free(struct nexus_adapter *na)
1316 {
1317 	struct nexus_netif_adapter *nifna = (struct nexus_netif_adapter *)na;
1318 
1319 	SK_LOCK_ASSERT_HELD();
1320 	SK_DF(SK_VERB_MEM, "nifna %p FREE", SK_KVA(nifna));
1321 
1322 	ASSERT(na->na_refcount == 0);
1323 	ASSERT(nifna->nifna_tx_mit == NULL);
1324 	ASSERT(nifna->nifna_rx_mit == NULL);
1325 	bzero(nifna, sizeof(*nifna));
1326 
1327 	zfree(na_netif_zone, nifna);
1328 }
1329 
/*
 * Process NXCFG_CMD_ATTACH: attach an ifnet to this netif nexus.
 * The request names the interface either by name or (kernel callers
 * only) by ifnet pointer — never by UUID.  On success the adapter
 * UUID is generated and returned in nsr->nsr_if_uuid.  Called with
 * the SK lock held.
 */
SK_NO_INLINE_ATTRIBUTE
static int
nx_netif_ctl_attach(struct kern_nexus *nx, struct nx_spec_req *nsr,
    struct proc *p)
{
	struct nx_netif *n = NX_NETIF_PRIVATE(nx);
	struct ifnet *ifp = NULL;
	boolean_t compat;
	int err = 0;

	SK_LOCK_ASSERT_HELD();

	ASSERT(NX_DOM(nx)->nxdom_type == NEXUS_TYPE_NET_IF);
	/* which domain provider are we: compat or native? */
	compat = (strbufcmp(NX_DOM_PROV(nx)->nxdom_prov_name,
	    NEXUS_PROVIDER_NET_IF_COMPAT) == 0);

	uuid_clear(nsr->nsr_if_uuid);
	/*
	 * The netif accepts either an interface name or a pointer to
	 * an ifnet, but never a UUID.
	 */
	if (nsr->nsr_flags & NXSPECREQ_UUID) {
		err = EINVAL;
		goto done;
	}
	if (nsr->nsr_flags & NXSPECREQ_IFP) {
		/* raw ifnet pointers are only acceptable from the kernel */
		if (p != kernproc || (ifp = nsr->nsr_ifp) == NULL) {
			err = EINVAL;
			goto done;
		}
	} else if ((ifp = ifunit_ref(__unsafe_null_terminated_from_indexable(
		    nsr->nsr_name))) == NULL) {
		err = ENXIO;
		goto done;
	}

	if ((compat && SKYWALK_NATIVE(ifp)) ||
	    (!compat && !SKYWALK_NATIVE(ifp))) {
		/* native driver for netif; non-native for netif_compat  */
		err = ENODEV;
	} else if (ifp->if_na != NULL || !uuid_is_null(n->nif_uuid)) {
		/* interface or nexus already attached elsewhere */
		err = EBUSY;
	} else {
		ASSERT(uuid_is_null(n->nif_uuid));
		/*
		 * Upon success, callee will hold its own ifnet iorefcnt
		 * as well as a retain count on the nexus adapter.
		 */
		if (compat) {
			err = nx_netif_compat_attach(nx, ifp);
		} else {
			err = nx_netif_attach(nx, ifp);
		}

		if (err == 0) {
			/* return the adapter UUID */
			uuid_generate_random(n->nif_uuid);
			uuid_copy(nsr->nsr_if_uuid, n->nif_uuid);
#if (DEVELOPMENT || DEBUG)
			skoid_create(&n->nif_skoid,
			    SKOID_SNODE(_kern_skywalk_netif), if_name(ifp),
			    CTLFLAG_RW);
#endif /* DEVELOPMENT || DEBUG */
		}
	}
done:
	/* drop I/O refcnt from ifunit_ref() */
	if (ifp != NULL && !(nsr->nsr_flags & NXSPECREQ_IFP)) {
		ifnet_decr_iorefcnt(ifp);
	}

#if SK_LOG
	uuid_string_t uuidstr, ifuuidstr;
	const char *nustr;
	if (nsr->nsr_flags & NXSPECREQ_UUID) {
		nustr = sk_uuid_unparse(nsr->nsr_uuid, uuidstr);
	} else if (nsr->nsr_flags & NXSPECREQ_IFP) {
		(void) snprintf((char *)uuidstr, sizeof(uuidstr), "%p",
		    SK_KVA(nsr->nsr_ifp));
		nustr = uuidstr;
	} else {
		nustr = nsr->nsr_name;
	}
	SK_DF(err ? SK_VERB_ERROR : SK_VERB_NETIF,
	    "nexus %p (%s) name/uuid \"%s\" if_uuid %s flags 0x%x err %d",
	    SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name, nustr,
	    sk_uuid_unparse(nsr->nsr_if_uuid, ifuuidstr), nsr->nsr_flags, err);
#endif /* SK_LOG */

	return err;
}
1422 
/*
 * Tear down all netif state attached to the interface: callbacks,
 * agent, capabilities, flows, filters, logical links and flags, then
 * release the dev/host adapters and detach from the ifnet.  When
 * quiesce_needed is set, datapath traffic is suspended and drained
 * first (SK lock is dropped around the drain).  Returns EALREADY if
 * there is no interface attached any more.
 */
SK_NO_INLINE_ATTRIBUTE
static int
nx_netif_clean(struct nx_netif *nif, boolean_t quiesce_needed)
{
	struct kern_nexus *nx = nif->nif_nx;
	struct ifnet *ifp;
	boolean_t suspended = FALSE;

	ifp = nif->nif_ifp;
	if (ifp == NULL) {
		return EALREADY;
	}
	/*
	 * For regular kernel-attached interfaces, quiescing is handled by
	 * the ifnet detach thread, which calls dlil_quiesce_and_detach_nexuses().
	 * For interfaces created by skywalk test cases, flowswitch/netif nexuses
	 * are constructed on the fly and can also be torn down on the fly.
	 * dlil_quiesce_and_detach_nexuses() won't help here because any nexus
	 * can be detached while the interface is still attached.
	 */
	if (quiesce_needed && ifnet_datamov_suspend_if_needed(ifp)) {
		/* drain without holding the SK lock to avoid stalls */
		SK_UNLOCK();
		suspended = TRUE;
		ifnet_datamov_drain(ifp);
		SK_LOCK();
	}
	nx_netif_callbacks_fini(nif);
	nx_netif_agent_fini(nif);
	nx_netif_capabilities_fini(nif);
	nx_netif_flow_fini(nif);
	nx_netif_filter_fini(nif);
	nx_netif_llink_fini(nif);
	nx_netif_flags_fini(nif);

	uuid_clear(nif->nif_uuid);
	/* nx_netif_{compat_}attach() held both references */
	na_release_locked(nx_port_get_na(nx, NEXUS_PORT_NET_IF_DEV));
	na_release_locked(nx_port_get_na(nx, NEXUS_PORT_NET_IF_HOST));
	nx_port_free(nx, NEXUS_PORT_NET_IF_DEV);
	nx_port_free(nx, NEXUS_PORT_NET_IF_HOST);

	/* sever the ifnet <-> netif linkage */
	ifp->if_na_ops = NULL;
	ifp->if_na = NULL;
	nif->nif_ifp = NULL;
	nif->nif_netif_nxadv = NULL;
	SKYWALK_CLEAR_CAPABLE(ifp);
	if (suspended) {
		ifnet_datamov_resume(ifp);
	}

#if (DEVELOPMENT || DEBUG)
	skoid_destroy(&nif->nif_skoid);
#endif /* DEVELOPMENT || DEBUG */
	return 0;
}
1478 
/*
 * Process NXCFG_CMD_DETACH: detach the interface from this netif
 * nexus.  A NULL nsr means we were invoked from the nexus destructor
 * and should detach unconditionally; otherwise the request's UUID
 * must match the attached adapter.  Fails with EBUSY while channels
 * are still open.  Called with the SK lock held.
 */
SK_NO_INLINE_ATTRIBUTE
static int
nx_netif_ctl_detach(struct kern_nexus *nx, struct nx_spec_req *nsr)
{
	struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
	int err = 0;

	SK_LOCK_ASSERT_HELD();

	/*
	 * nsr is NULL when we're called from the destructor, and it
	 * implies that we'll detach whatever that is attached.
	 */
	if (nsr != NULL && uuid_is_null(nsr->nsr_if_uuid)) {
		err = EINVAL;
	} else if (nsr != NULL && uuid_compare(nsr->nsr_if_uuid,
	    nif->nif_uuid) != 0) {
		/* request names a different adapter */
		err = ESRCH;
	} else if (nx_port_get_na(nx, NEXUS_PORT_NET_IF_DEV) == NULL) {
		/* nx_netif_ctl_attach() not yet done or already detached */
		err = ENXIO;
	} else if (nx->nx_ch_count != 0) {
		/*
		 * There's at least a channel opened; we can't
		 * yank the interface from underneath the nexus
		 * since our dlil input/output handler may be
		 * running now.  Bail out and come back here
		 * again when the nexus detaches.
		 */
		err = EBUSY;
	} else {
		err = nx_netif_clean(nif, TRUE);
	}

#if SK_LOG
	if (nsr != NULL) {
		uuid_string_t ifuuidstr;
		SK_DF(err ? SK_VERB_ERROR : SK_VERB_NETIF,
		    "nexus %p (%s) if_uuid %s flags 0x%x err %d",
		    SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name,
		    sk_uuid_unparse(nsr->nsr_if_uuid, ifuuidstr),
		    nsr->nsr_flags, err);
	} else {
		SK_DF(err ? SK_VERB_ERROR : SK_VERB_NETIF,
		    "nexus %p (%s) err %d", SK_KVA(nx),
		    NX_DOM_PROV(nx)->nxdom_prov_name, err);
	}
#endif /* SK_LOG */

	return err;
}
1531 
1532 /*
1533  * XXX
1534  * These checks are copied from fsw.c
1535  * There are no tests exercising this code. Do we still need this?
1536  */
/*
 * Validate a flow add/delete request.  Normalizes the request's flags
 * and flow-advisory index, then — for adds only — requires the
 * socket-delegate privilege when the caller acts on behalf of another
 * pid or another executable UUID.  Returns 0 when the request may
 * proceed.
 */
SK_NO_INLINE_ATTRIBUTE
static int
nx_netif_ctl_flow_check(struct nx_netif *nif, nxcfg_cmd_t cmd,
    struct proc *p, struct nx_flow_req *req)
{
#pragma unused(nif)
	boolean_t need_check;
	int error;

	if (uuid_is_null(req->nfr_flow_uuid)) {
		return EINVAL;
	}
	/* strip any flag bits callers aren't allowed to set */
	req->nfr_flags &= NXFLOWREQF_MASK;
	req->nfr_flowadv_idx = FLOWADV_IDX_NONE;

	/* deletes need no delegate-privilege check */
	if (cmd == NXCFG_CMD_FLOW_DEL) {
		return 0;
	}
	need_check = FALSE;
	if (req->nfr_epid != -1 && proc_pid(p) != req->nfr_epid) {
		need_check = TRUE;
	} else if (!uuid_is_null(req->nfr_euuid)) {
		uuid_t uuid;

		/* get the UUID of the issuing process */
		proc_getexecutableuuid(p, uuid, sizeof(uuid));

		/*
		 * If this is not issued by a process for its own
		 * executable UUID and if the process does not have
		 * the necessary privilege, reject the request.
		 * The logic is similar to so_set_effective_uuid().
		 */
		if (uuid_compare(req->nfr_euuid, uuid) != 0) {
			need_check = TRUE;
		}
	}
	if (need_check) {
		kauth_cred_t cred = kauth_cred_proc_ref(p);
		error = priv_check_cred(cred,
		    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0);
		kauth_cred_unref(&cred);
		if (error != 0) {
			return error;
		}
	}
	return 0;
}
1585 
/*
 * Process NXCFG_CMD_FLOW_ADD: validate the request, scrub its
 * kernel-only fields, and hand it to the netagent.  The request is
 * re-externalized before returning so it is safe to copy back out to
 * user space.
 */
SK_NO_INLINE_ATTRIBUTE
static int
nx_netif_ctl_flow_add(struct nx_netif *nif, struct proc *p,
    struct nx_flow_req *req)
{
	int err;

	ASSERT(p != PROC_NULL);
	err = nx_netif_ctl_flow_check(nif, NXCFG_CMD_FLOW_ADD, p, req);
	if (err != 0) {
		return err;
	}

	/* init kernel only fields */
	nx_flow_req_internalize(req);
	req->nfr_context = NULL;
	req->nfr_flow_stats = NULL;
	req->nfr_port_reservation = NULL;
	req->nfr_pid = proc_pid(p);

	err = nx_netif_netagent_flow_add(nif, req);
	nx_flow_req_externalize(req);
	return err;
}
1610 
/*
 * Process NXCFG_CMD_FLOW_DEL: validate the request, then ask the
 * netagent to remove the flow.  Mirrors nx_netif_ctl_flow_add() in
 * its internalize/externalize pairing.
 */
SK_NO_INLINE_ATTRIBUTE
static int
nx_netif_ctl_flow_del(struct nx_netif *nif, struct proc *p,
    struct nx_flow_req *req)
{
	int err;

	err = nx_netif_ctl_flow_check(nif, NXCFG_CMD_FLOW_DEL, p, req);
	if (err != 0) {
		return err;
	}

	nx_flow_req_internalize(req);
	req->nfr_pid = proc_pid(p);

	err = nx_netif_netagent_flow_del(nif, req);
	nx_flow_req_externalize(req);
	return err;
}
1630 
1631 SK_NO_INLINE_ATTRIBUTE
1632 static int
nx_netif_ctl(struct kern_nexus * nx,nxcfg_cmd_t nc_cmd,void * data,struct proc * p)1633 nx_netif_ctl(struct kern_nexus *nx, nxcfg_cmd_t nc_cmd, void *data,
1634     struct proc *p)
1635 {
1636 	struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
1637 	struct nx_spec_req *__single nsr = data;
1638 	struct nx_flow_req *__single nfr = data;
1639 	int error = 0;
1640 
1641 	SK_LOCK_ASSERT_HELD();
1642 
1643 	switch (nc_cmd) {
1644 	case NXCFG_CMD_ATTACH:
1645 		error = nx_netif_ctl_attach(nx, nsr, p);
1646 		break;
1647 
1648 	case NXCFG_CMD_DETACH:
1649 		error = nx_netif_ctl_detach(nx, nsr);
1650 		break;
1651 
1652 	case NXCFG_CMD_FLOW_ADD:
1653 		error = nx_netif_ctl_flow_add(nif, p, nfr);
1654 		break;
1655 
1656 	case NXCFG_CMD_FLOW_DEL:
1657 		error = nx_netif_ctl_flow_del(nif, p, nfr);
1658 		break;
1659 
1660 	default:
1661 		SK_ERR("invalid cmd %u", nc_cmd);
1662 		error = EINVAL;
1663 		break;
1664 	}
1665 	return error;
1666 }
1667 
/*
 * Kick a TX doorbell on every queue set of one logical link; notify
 * results are intentionally ignored (best effort).  `flags' is
 * currently unused by this path.
 */
static void
nx_netif_llink_notify(struct kern_nexus *nx, struct netif_llink *llink,
    uint32_t flags)
{
#pragma unused(flags)
	struct netif_qset *qset;

	SLIST_FOREACH(qset, &llink->nll_qset_list, nqs_list) {
		(void) nx_tx_qset_notify(nx, qset->nqs_ctx);
	}
}
1679 
/*
 * Kick a TX doorbell on every queue set of every logical link, under
 * a shared hold of the llink lock so the list is stable while we
 * iterate.
 */
static void
nx_netif_llink_notify_all(struct kern_nexus *nx, uint32_t flags)
{
	struct nx_netif *nif;
	struct netif_llink *llink;

	nif = NX_NETIF_PRIVATE(nx);

	lck_rw_lock_shared(&nif->nif_llink_lock);
	STAILQ_FOREACH(llink, &nif->nif_llink_list, nll_link) {
		nx_netif_llink_notify(nx, llink, flags);
	}
	lck_rw_unlock_shared(&nif->nif_llink_lock);
}
1694 
1695 /*
1696  * if_start() callback for native Skywalk interfaces, registered
1697  * at ifnet_allocate_extended() time, and invoked by the ifnet
1698  * starter thread.
1699  */
/*
 * Common doorbell path: when the adapter is active, either notify all
 * logical-link queue sets (logical-link providers) or issue a
 * synchronous TX sync with doorbell flags on TX ring 0; otherwise the
 * event is counted as a drop.  Bails out quietly if no adapter is
 * attached to the ifnet yet.
 */
static void
nx_netif_doorbell_internal(struct ifnet *ifp, uint32_t flags)
{
	if (__improbable(ifp->if_na == NULL)) {
		return;
	}

	/*
	 * Do this only if the nexus adapter is active, i.e. a channel
	 * has been opened to it by the module above (flowswitch, etc.)
	 */
	struct nexus_adapter *hwna = &NA(ifp)->nifna_up;
	if (__probable(NA_IS_ACTIVE(hwna))) {
		struct kern_nexus *nx = hwna->na_nx;

		/* update our work timestamp */
		hwna->na_work_ts = net_uptime();

		if (NX_LLINK_PROV(nx)) {
			nx_netif_llink_notify_all(nx, flags);
		} else {
			struct __kern_channel_ring *kring;

			/* for doorbell purposes, use TX ring 0 */
			kring = &hwna->na_tx_rings[0];

			/* Issue a synchronous TX doorbell on the netif device ring */
			kring->ckr_na_sync(kring, PROC_NULL,
			    (NA_SYNCF_NETIF_DOORBELL | NA_SYNCF_NETIF_IFSTART));
		}
	} else {
		/* inactive adapter: account the dropped doorbell */
		struct netif_stats *nifs =
		    &NX_NETIF_PRIVATE(hwna->na_nx)->nif_stats;
		STATS_INC(nifs, NETIF_STATS_DROP_NA_INACTIVE);
	}
}
1736 
/*
 * if_start() entry point registered with the ifnet (see comment
 * above); forwards to the common doorbell path on behalf of the host.
 */
static void
nx_netif_doorbell(struct ifnet *ifp)
{
	nx_netif_doorbell_internal(ifp, NETIF_XMIT_FLAG_HOST);
}
1742 
1743 /*
1744  * TX sync callback, called from nx_netif_doorbell() where we'd expect to
1745  * perform synchronous TX doorbell to the driver, by invoking the driver's
1746  * doorbell callback directly in the same thread context.  It is also called
1747  * when the layer above performs a TX sync operation, where we might need
1748  * to do an asynchronous doorbell instead, by simply calling ifnet_start().
1749  */
/*
 * TX sync for the netif device ring (see comment above).  Performs a
 * reclaim sync when requested (or always for direct user channels),
 * then rings the driver doorbell — synchronously via nx_tx_doorbell()
 * on the if_start path, or asynchronously via ifnet_start() otherwise.
 * Fails with ENXIO when the interface is detaching or flow-controlled.
 */
static int
nx_netif_na_txsync(struct __kern_channel_ring *kring, struct proc *p,
    uint32_t flags)
{
#pragma unused(p)
	struct ifnet *ifp = KRNA(kring)->na_ifp;
	boolean_t sync_only;
	int ret = 0;

	ASSERT(ifp != NULL);

	SK_DF(SK_VERB_NETIF | SK_VERB_SYNC | SK_VERB_TX,
	    "%s(%d) kr \"%s\" (%p) krflags 0x%x ring %u flags 0%x",
	    sk_proc_name(p), sk_proc_pid(p), kring->ckr_name,
	    SK_KVA(kring), kring->ckr_flags, kring->ckr_ring_id, flags);

	if (__improbable(!ifnet_is_fully_attached(ifp))) {
		SK_ERR("kr %p ifp %s (%p), interface not attached",
		    SK_KVA(kring), if_name(ifp), SK_KVA(ifp));
		return ENXIO;
	}

	if (__improbable((ifp->if_start_flags & IFSF_FLOW_CONTROLLED) != 0)) {
		SK_DF(SK_VERB_SYNC | SK_VERB_TX, "kr %p ifp %s (%p), "
		    "flow control ON", SK_KVA(kring), if_name(ifp),
		    SK_KVA(ifp));
		return ENXIO;
	}

	/* update our work timestamp */
	KRNA(kring)->na_work_ts = net_uptime();

	/* user channels (non-kernel-only rings) always sync */
	sync_only = ((flags & NA_SYNCF_SYNC_ONLY) != 0) ||
	    !KR_KERNEL_ONLY(kring);
	/* regular sync (reclaim) */
	if ((flags & NA_SYNCF_NETIF) != 0 || __improbable(sync_only)) {
		ret = nx_sync_tx(kring, (flags & NA_SYNCF_FORCE_RECLAIM) ||
		    kring->ckr_pending_intr != 0);
		kring->ckr_pending_intr = 0;

		/* direct user channels do not need to use the doorbell */
		if (__improbable(sync_only)) {
			return ret;
		}
	}

	/*
	 * Doorbell call.  Here we do doorbell explicitly if the flag is
	 * set or implicitly if we're opened directly by a user channel.
	 * Synchronous vs. asynchronous depending on the context.
	 */
	if (__probable((flags & NA_SYNCF_NETIF_DOORBELL) != 0)) {
		if ((flags & NA_SYNCF_NETIF_IFSTART) != 0) {
			ASSERT(!(flags & NA_SYNCF_NETIF_IFSTART) ||
			    !(flags & NA_SYNCF_NETIF_ASYNC));
			nx_tx_doorbell(kring, (flags & NA_SYNCF_NETIF_ASYNC));
		} else {
			ifnet_start(ifp);
		}
	}

	return ret;
}
1813 
1814 static int
nx_netif_na_rxsync(struct __kern_channel_ring * kring,struct proc * p,uint32_t flags)1815 nx_netif_na_rxsync(struct __kern_channel_ring *kring, struct proc *p,
1816     uint32_t flags)
1817 {
1818 #pragma unused(p)
1819 	int ret;
1820 
1821 	SK_DF(SK_VERB_NETIF | SK_VERB_SYNC | SK_VERB_RX,
1822 	    "%s(%d) kr \"%s\" (%p) krflags 0x%x ring %u flags 0%x",
1823 	    sk_proc_name(p), sk_proc_pid(p), kring->ckr_name,
1824 	    SK_KVA(kring), kring->ckr_flags, kring->ckr_ring_id, flags);
1825 
1826 	ASSERT(kring->ckr_rhead <= kring->ckr_lim);
1827 
1828 	/* update our work timestamp */
1829 	KRNA(kring)->na_work_ts = net_uptime();
1830 
1831 	ret = nx_sync_rx(kring, (flags & NA_SYNCF_FORCE_READ) ||
1832 	    kring->ckr_pending_intr != 0);
1833 	kring->ckr_pending_intr = 0;
1834 
1835 	return ret;
1836 }
1837 
/*
 * Destructor shared by the dev and host netif adapters.  Releases the
 * adapter's hold on the ifnet (either the embryonic ifnet stashed in
 * na_private, or the I/O-refcounted na_ifp) and drops the nx_netif
 * reference taken when the adapter was initialized.
 */
static void
nx_netif_na_dtor(struct nexus_adapter *na)
{
	struct ifnet *__single ifp;
	struct nexus_netif_adapter *nifna = NIFNA(na);

	SK_LOCK_ASSERT_HELD();
	ASSERT(na->na_type == NA_NETIF_DEV || na->na_type == NA_NETIF_HOST);

	SK_DF(SK_VERB_NETIF, "na \"%s\" (%p)", na->na_name, SK_KVA(na));

	/*
	 * If the finalizer callback hasn't been called for whatever
	 * reasons, pick up the embryonic ifnet stored in na_private.
	 * Otherwise, release the I/O refcnt of a non-NULL na_ifp.
	 */
	if ((ifp = na->na_ifp) == NULL) {
		ifp = na->na_private;
		na->na_private = NULL;
	} else {
		ifnet_decr_iorefcnt(ifp);
		na->na_ifp = NULL;
	}

	/* drop the nx_netif reference held by this adapter */
	if (nifna->nifna_netif != NULL) {
		nx_netif_release(nifna->nifna_netif);
		nifna->nifna_netif = NULL;
	}
	ASSERT(SKYWALK_NATIVE(ifp));
}
1868 
1869 /*
1870  * Dispatch rx/tx interrupts to the channel rings.
1871  *
1872  * The 'notify' routine depends on what the ring is attached to.
1873  * - for a channel file descriptor, do an event wakeup on the individual
1874  *   waitqueue, plus one on the global one if needed (see na_notify)
1875  * - for a device port connected to a FlowSwitch, call the proper
1876  *   forwarding routine; see nx_fsw_tx_hwna_notify()
1877  *   or nx_fsw_rx_hwna_notify().
1878  */
int
nx_netif_common_intr(struct __kern_channel_ring *kring, struct proc *p,
    uint32_t flags, uint32_t *work_done)
{
	struct netif_stats *nifs =
	    &NX_NETIF_PRIVATE(KRNA(kring)->na_nx)->nif_stats;
	int (*notify)(struct __kern_channel_ring *kring,
	    struct proc *, uint32_t flags);
	int ret;

	KDBG((SK_KTRACE_NETIF_COMMON_INTR | DBG_FUNC_START), SK_KVA(kring));

	SK_DF(SK_VERB_NETIF | SK_VERB_INTR |
	    ((kring->ckr_tx == NR_RX) ? SK_VERB_RX : SK_VERB_TX),
	    "na \"%s\" (%p) kr \"%s\" (%p) krflags 0x%x",
	    KRNA(kring)->na_name, SK_KVA(KRNA(kring)), kring->ckr_name,
	    SK_KVA(kring), kring->ckr_flags);

	/* update our work timestamp */
	KRNA(kring)->na_work_ts = net_uptime();

	/* remember the interrupt for the next sync pass */
	kring->ckr_pending_intr++;
	if (work_done != NULL) {
		*work_done = 1; /* do not fire again */
	}
	/*
	 * We can't be calling ckr_na_notify here since we could already be
	 * intercepting it, else we'd end up recursively calling ourselves.
	 * Use the original na_notify callback saved during na_activate, or in
	 * the case when the module above us is the flowswitch, the notify
	 * routine that it has installed in place of our original one.
	 */
	if (__probable(!KR_DROP(kring) &&
	    (notify = kring->ckr_netif_notify) != NULL)) {
		ret = notify(kring, p, flags);
	} else {
		/*
		 * If the ring is in drop mode, pretend as if it's busy.
		 * This allows the mitigation thread to pause for a while
		 * before attempting again.
		 */
		ret = EBUSY;
	}
	/* account the failure per ring direction and per reason */
	if (__improbable(ret != 0)) {
		switch (kring->ckr_tx) {
		case NR_RX:
			if (ret == EBUSY) {
				STATS_INC(nifs, NETIF_STATS_RX_IRQ_BUSY);
			} else if (ret == EAGAIN) {
				STATS_INC(nifs, NETIF_STATS_RX_IRQ_AGAIN);
			} else {
				STATS_INC(nifs, NETIF_STATS_RX_IRQ_ERR);
			}
			break;

		case NR_TX:
			if (ret == EBUSY) {
				STATS_INC(nifs, NETIF_STATS_TX_IRQ_BUSY);
			} else if (ret == EAGAIN) {
				STATS_INC(nifs, NETIF_STATS_TX_IRQ_AGAIN);
			} else {
				STATS_INC(nifs, NETIF_STATS_TX_IRQ_ERR);
			}
			break;

		default:
			break;
		}
	}

	KDBG((SK_KTRACE_NETIF_COMMON_INTR | DBG_FUNC_END), SK_KVA(kring), ret);

	return ret;
}
1953 
/*
 * TX notify intercept installed on the dev TX rings by
 * nx_netif_na_activate(); funnels the event through the TX interrupt
 * mitigation layer.
 */
static int
nx_netif_na_notify_tx(struct __kern_channel_ring *kring, struct proc *p,
    uint32_t flags)
{
	return nx_netif_mit_tx_intr(kring, p, flags, NULL);
}
1960 
1961 static int
nx_netif_na_notify_rx(struct __kern_channel_ring * kring,struct proc * p,uint32_t flags)1962 nx_netif_na_notify_rx(struct __kern_channel_ring *kring, struct proc *p,
1963     uint32_t flags)
1964 {
1965 	int ret;
1966 
1967 	/*
1968 	 * In the event the mitigation thread is disabled, protect
1969 	 * against recursion by detecting if we're already in the
1970 	 * context of an RX notify.  IOSkywalkFamily may invoke the
1971 	 * notify callback as part of its RX sync callback.
1972 	 */
1973 	if (__probable(!sk_is_rx_notify_protected())) {
1974 		sk_protect_t protect;
1975 		uint32_t work_done;
1976 
1977 		protect = sk_rx_notify_protect();
1978 		ret = nx_netif_mit_rx_intr(kring, p, flags, &work_done);
1979 		sk_sync_unprotect(protect);
1980 	} else {
1981 		ret = EAGAIN;
1982 	}
1983 
1984 	return ret;
1985 }
1986 
1987 static int
nx_netif_na_notify_rx_redirect(struct __kern_channel_ring * kring,struct proc * p,uint32_t flags)1988 nx_netif_na_notify_rx_redirect(struct __kern_channel_ring *kring, struct proc *p,
1989     uint32_t flags)
1990 {
1991 	struct netif_stats *nifs =
1992 	    &NX_NETIF_PRIVATE(KRNA(kring)->na_nx)->nif_stats;
1993 	uint32_t work_done;
1994 
1995 	ASSERT(kring->ckr_tx == NR_RX);
1996 	STATS_INC(nifs, NETIF_STATS_RX_IRQ);
1997 	return nx_netif_common_intr(kring, p, flags, &work_done);
1998 }
1999 
/*
 * Decide whether TX/RX interrupt mitigation should be enabled for this
 * adapter, and whether the "simple" mitigation variant should be used.
 * Results are returned through the four boolean out-parameters; policy
 * is driven by the sk_netif_tx_mit/sk_netif_rx_mit boot-args and, for
 * RX auto mode, by whether the device is virtual and not low-latency.
 */
void
nx_netif_mit_config(struct nexus_netif_adapter *nifna,
    boolean_t *tx_mit, boolean_t *tx_mit_simple,
    boolean_t *rx_mit, boolean_t *rx_mit_simple)
{
	struct nx_netif *nif = nifna->nifna_netif;

	/*
	 * TX mitigation is disabled by default, but can be
	 * overridden via "sk_netif_tx_mit=N" boot-arg, where
	 * N is one of SK_NETIF_MIT_FORCE_* values.
	 */
	*tx_mit = *tx_mit_simple = FALSE;
	switch (sk_netif_tx_mit) {
	case SK_NETIF_MIT_FORCE_SIMPLE:
		*tx_mit_simple = TRUE;
		OS_FALLTHROUGH;
	case SK_NETIF_MIT_FORCE_ADVANCED:
		*tx_mit = TRUE;
		break;
	case SK_NETIF_MIT_FORCE_OFF:
	case SK_NETIF_MIT_AUTO:
		ASSERT(*tx_mit == FALSE);
		break;
	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	/*
	 * RX mitigation is enabled by default only for BSD-style
	 * virtual network interfaces, but can be overridden
	 * via "sk_netif_rx_mit=N" boot-arg, where N is one of
	 * SK_NETIF_MIT_FORCE_* values.
	 */
	*rx_mit = *rx_mit_simple = FALSE;
	switch (sk_netif_rx_mit) {
	case SK_NETIF_MIT_FORCE_OFF:
		ASSERT(*rx_mit == FALSE);
		break;
	case SK_NETIF_MIT_FORCE_SIMPLE:
		*rx_mit_simple = TRUE;
		OS_FALLTHROUGH;
	case SK_NETIF_MIT_FORCE_ADVANCED:
		*rx_mit = TRUE;
		break;
	case SK_NETIF_MIT_AUTO:
		*rx_mit_simple = TRUE;
		/*
		 * Enable RX mitigation thread only for BSD-style virtual (and
		 * regular) interfaces, since otherwise we may run out of stack
		 * when subjected to IPsec processing, etc.
		 */
		*rx_mit = (NX_PROV(nifna->nifna_up.na_nx)->nxprov_flags &
		    NXPROVF_VIRTUAL_DEVICE) && !NETIF_IS_LOW_LATENCY(nif);
		break;
	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}
}
2063 
/*
 * Activation callback for the dev netif adapter.  ON installs interrupt
 * mitigation state and intercepts the per-ring na_notify callbacks, then
 * steers the interface start handler to nx_netif_doorbell().  OFF undoes
 * all of the above; DEFUNCT is a no-op here.
 */
static int
nx_netif_na_activate(struct nexus_adapter *na, na_activate_mode_t mode)
{
	struct nexus_netif_adapter *nifna = (struct nexus_netif_adapter *)na;
	boolean_t tx_mit, rx_mit, tx_mit_simple, rx_mit_simple;
	struct nx_netif *nif = nifna->nifna_netif;
	struct ifnet *ifp = na->na_ifp;
	int error = 0;
	uint32_t r;
	/* TODO -fbounds-safety: Remove tmp and use __counted_by_or_null */
	struct nx_netif_mit *mit_tmp;
	uint32_t nrings;

	ASSERT(na->na_type == NA_NETIF_DEV);
	ASSERT(!(na->na_flags & NAF_HOST_ONLY));

	SK_DF(SK_VERB_NETIF, "na \"%s\" (%p) %s [%s]", na->na_name,
	    SK_KVA(na), ifp->if_xname, na_activate_mode2str(mode));

	switch (mode) {
	case NA_ACTIVATE_MODE_ON:
		ASSERT(SKYWALK_CAPABLE(ifp));

		nx_netif_mit_config(nifna, &tx_mit, &tx_mit_simple,
		    &rx_mit, &rx_mit_simple);

		/*
		 * Init the mitigation support on all the dev TX rings.
		 */
		if (tx_mit) {
			nrings = na_get_nrings(na, NR_TX);
			mit_tmp = skn_alloc_type_array(tx_on, struct nx_netif_mit,
			    nrings, Z_WAITOK, skmem_tag_netif_mit);
			if (mit_tmp == NULL) {
				SK_ERR("TX mitigation allocation failed");
				error = ENOMEM;
				goto out;
			}
			nifna->nifna_tx_mit = mit_tmp;
			nifna->nifna_tx_mit_count = nrings;
		} else {
			ASSERT(nifna->nifna_tx_mit == NULL);
		}

		/*
		 * Init the mitigation support on all the dev RX rings.
		 */
		if (rx_mit) {
			nrings = na_get_nrings(na, NR_RX);
			mit_tmp = skn_alloc_type_array(rx_on, struct nx_netif_mit,
			    nrings, Z_WAITOK, skmem_tag_netif_mit);
			if (mit_tmp == NULL) {
				SK_ERR("RX mitigation allocation failed");
				/*
				 * NOTE(review): this assumes the free macro
				 * also resets nifna_tx_mit (and its count);
				 * otherwise a stale pointer would remain for
				 * the OFF path to see — verify the semantics
				 * of skn_free_type_array_counted_by().
				 */
				if (nifna->nifna_tx_mit != NULL) {
					skn_free_type_array_counted_by(rx_fail,
					    struct nx_netif_mit,
					    nifna->nifna_tx_mit_count,
					    nifna->nifna_tx_mit);
				}
				error = ENOMEM;
				goto out;
			}
			nifna->nifna_rx_mit = mit_tmp;
			nifna->nifna_rx_mit_count = nrings;
		} else {
			ASSERT(nifna->nifna_rx_mit == NULL);
		}

		/* intercept na_notify callback on the TX rings */
		for (r = 0; r < na_get_nrings(na, NR_TX); r++) {
			na->na_tx_rings[r].ckr_netif_notify =
			    na->na_tx_rings[r].ckr_na_notify;
			na->na_tx_rings[r].ckr_na_notify =
			    nx_netif_na_notify_tx;
			if (nifna->nifna_tx_mit != NULL) {
				nx_netif_mit_init(nif, ifp,
				    &nifna->nifna_tx_mit[r],
				    &na->na_tx_rings[r], tx_mit_simple);
			}
		}

		/* intercept na_notify callback on the RX rings */
		for (r = 0; r < na_get_nrings(na, NR_RX); r++) {
			na->na_rx_rings[r].ckr_netif_notify =
			    na->na_rx_rings[r].ckr_na_notify;
			na->na_rx_rings[r].ckr_na_notify = IFNET_IS_REDIRECT(ifp) ?
			    nx_netif_na_notify_rx_redirect : nx_netif_na_notify_rx;
			if (nifna->nifna_rx_mit != NULL) {
				nx_netif_mit_init(nif, ifp,
				    &nifna->nifna_rx_mit[r],
				    &na->na_rx_rings[r], rx_mit_simple);
			}
		}
		nx_netif_filter_enable(nif);
		nx_netif_flow_enable(nif);
		os_atomic_or(&na->na_flags, NAF_ACTIVE, relaxed);

		/* steer all start requests to netif; this must not fail */
		lck_mtx_lock(&ifp->if_start_lock);
		error = ifnet_set_start_handler(ifp, nx_netif_doorbell);
		VERIFY(error == 0);
		lck_mtx_unlock(&ifp->if_start_lock);
		break;

	case NA_ACTIVATE_MODE_DEFUNCT:
		ASSERT(SKYWALK_CAPABLE(ifp));
		break;

	case NA_ACTIVATE_MODE_OFF:
		/*
		 * Note that here we cannot assert SKYWALK_CAPABLE()
		 * as we're called in the destructor path.
		 */
		os_atomic_andnot(&na->na_flags, NAF_ACTIVE, relaxed);
		nx_netif_flow_disable(nif);
		nx_netif_filter_disable(nif);

		/*
		 * Here we may block while holding sk_lock, but because
		 * we've cleared NAF_ACTIVE above, kern_channel_tx_refill()
		 * should immediately return.  A better approach would be
		 * to drop sk_lock and add a monitor for this routine.
		 */
		lck_mtx_lock(&ifp->if_start_lock);
		while (ifp->if_start_active != 0) {
			++ifp->if_start_waiters;
			(void) msleep(&ifp->if_start_waiters,
			    &ifp->if_start_lock, (PZERO - 1),
			    na->na_name, NULL);
		}
		/* steer all start requests to default handler */
		ifnet_reset_start_handler(ifp);
		lck_mtx_unlock(&ifp->if_start_lock);

		/* reset all TX notify callbacks */
		for (r = 0; r < na_get_nrings(na, NR_TX); r++) {
			na->na_tx_rings[r].ckr_na_notify =
			    na->na_tx_rings[r].ckr_netif_notify;
			na->na_tx_rings[r].ckr_netif_notify = NULL;
			if (nifna->nifna_tx_mit != NULL) {
				na->na_tx_rings[r].ckr_netif_mit_stats = NULL;
				nx_netif_mit_cleanup(&nifna->nifna_tx_mit[r]);
			}
		}

		if (nifna->nifna_tx_mit != NULL) {
			skn_free_type_array_counted_by(tx_off, struct nx_netif_mit,
			    nifna->nifna_tx_mit_count, nifna->nifna_tx_mit);
		}

		/* reset all RX notify callbacks */
		for (r = 0; r < na_get_nrings(na, NR_RX); r++) {
			na->na_rx_rings[r].ckr_na_notify =
			    na->na_rx_rings[r].ckr_netif_notify;
			na->na_rx_rings[r].ckr_netif_notify = NULL;
			if (nifna->nifna_rx_mit != NULL) {
				na->na_rx_rings[r].ckr_netif_mit_stats = NULL;
				nx_netif_mit_cleanup(&nifna->nifna_rx_mit[r]);
			}
		}
		if (nifna->nifna_rx_mit != NULL) {
			skn_free_type_array_counted_by(rx_off, struct nx_netif_mit,
			    nifna->nifna_rx_mit_count, nifna->nifna_rx_mit);
		}
		break;

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}
out:
	return error;
}
2238 
/*
 * Attach a native netif nexus to an ifnet: allocates and initializes the
 * dev and host adapters, creates their backing memory arenas, binds them
 * to the NET_IF dev/host nexus ports, and marks the interface Skywalk-
 * capable.  Works for both fully-attached and embryonic interfaces; on
 * failure every partially-initialized resource is unwound.
 */
SK_NO_INLINE_ATTRIBUTE
static int
nx_netif_attach(struct kern_nexus *nx, struct ifnet *ifp)
{
	struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
	struct nxprov_params *nxp = NX_PROV(nx)->nxprov_params;
	struct nexus_netif_adapter *devnifna = NULL;
	struct nexus_netif_adapter *hostnifna = NULL;
	struct nexus_adapter *__single devna = NULL;
	struct nexus_adapter *__single hostna = NULL;
	boolean_t embryonic = FALSE;
	int retval = 0;
	uint32_t na_flags;

	SK_LOCK_ASSERT_HELD();
	ASSERT(SKYWALK_NATIVE(ifp));
	ASSERT(!SKYWALK_CAPABLE(ifp));
	ASSERT(ifp->if_na == NULL);
	ASSERT(ifp->if_na_ops == NULL);

	devnifna = na_netif_alloc(Z_WAITOK);
	hostnifna = na_netif_alloc(Z_WAITOK);

	/*
	 * We can be called for two different interface states:
	 *
	 * Fully attached: get an io ref count; upon success, this
	 * holds a reference to the ifnet for the ifp pointer stored
	 * in 'na_ifp' down below for both adapters.
	 *
	 * Embryonic: temporary hold the ifnet in na_private, which
	 * upon a successful ifnet_attach(), will be moved over to
	 * the 'na_ifp' with an io ref count held.
	 *
	 * The ifnet in 'na_ifp' will be released by na_release_locked().
	 */
	if (!ifnet_get_ioref(ifp)) {
		if (!(ifp->if_refflags & IFRF_EMBRYONIC)) {
			ifp = NULL;
			retval = ENXIO;
			goto err;
		}
		embryonic = TRUE;
	}

	/* initialize the device netif adapter */
	devnifna->nifna_netif = nif;
	nx_netif_retain(nif);
	devna = &devnifna->nifna_up;
	devna->na_type = NA_NETIF_DEV;
	devna->na_free = na_netif_free;
	(void) strlcpy(devna->na_name, ifp->if_xname, sizeof(devna->na_name));
	uuid_generate_random(devna->na_uuid);
	if (embryonic) {
		/*
		 * We will move this over to na_ifp once
		 * the interface is fully attached.
		 */
		devna->na_private = ifp;
		ASSERT(devna->na_ifp == NULL);
	} else {
		ASSERT(devna->na_private == NULL);
		/* use I/O refcnt from ifnet_get_ioref() */
		devna->na_ifp = ifp;
	}
	devna->na_activate = nx_netif_na_activate;
	devna->na_txsync = nx_netif_na_txsync;
	devna->na_rxsync = nx_netif_na_rxsync;
	devna->na_dtor = nx_netif_na_dtor;
	devna->na_krings_create = nx_netif_dev_krings_create;
	devna->na_krings_delete = nx_netif_dev_krings_delete;
	devna->na_special = nx_netif_na_special;

	na_flags = NAF_NATIVE;
	if (NX_PROV(nx)->nxprov_flags & NXPROVF_VIRTUAL_DEVICE) {
		na_flags |= NAF_VIRTUAL_DEVICE;
	}
	if (NX_LLINK_PROV(nx)) {
		/*
		 * while operating in logical link mode, we don't need to
		 * create backing memory regions for the rings as they are
		 * not used.
		 */
		na_flags |= NAF_MEM_NO_INIT;
	}
	os_atomic_or(&devna->na_flags, na_flags, relaxed);
	*(nexus_stats_type_t *)(uintptr_t)&devna->na_stats_type =
	    NEXUS_STATS_TYPE_INVALID;

	na_set_nrings(devna, NR_TX, nxp->nxp_tx_rings);
	na_set_nrings(devna, NR_RX, nxp->nxp_rx_rings);
	na_set_nslots(devna, NR_TX, nxp->nxp_tx_slots);
	na_set_nslots(devna, NR_RX, nxp->nxp_rx_slots);
	/*
	 * Verify upper bounds; the parameters must have already been
	 * validated by nxdom_prov_params() by the time we get here.
	 */
	ASSERT(na_get_nrings(devna, NR_TX) <= NX_DOM(nx)->nxdom_tx_rings.nb_max);
	ASSERT(na_get_nrings(devna, NR_RX) <= NX_DOM(nx)->nxdom_rx_rings.nb_max);
	ASSERT(na_get_nslots(devna, NR_TX) <= NX_DOM(nx)->nxdom_tx_slots.nb_max);
	ASSERT(na_get_nslots(devna, NR_RX) <= NX_DOM(nx)->nxdom_rx_slots.nb_max);

	na_attach_common(devna, nx, &nx_netif_prov_s);

	/* create the backing memory arena for the dev adapter */
	if ((retval = NX_DOM_PROV(nx)->nxdom_prov_mem_new(NX_DOM_PROV(nx),
	    nx, devna)) != 0) {
		ASSERT(devna->na_arena == NULL);
		goto err;
	}
	ASSERT(devna->na_arena != NULL);

	*(uint32_t *)(uintptr_t)&devna->na_flowadv_max = nxp->nxp_flowadv_max;
	ASSERT(devna->na_flowadv_max == 0 ||
	    skmem_arena_nexus(devna->na_arena)->arn_flowadv_obj != NULL);

	/* setup packet copy routines */
	if (skmem_arena_nexus(devna->na_arena)->arn_rx_pp->pp_max_frags > 1) {
		nif->nif_pkt_copy_from_mbuf = pkt_copy_multi_buflet_from_mbuf;
		nif->nif_pkt_copy_to_mbuf = pkt_copy_multi_buflet_to_mbuf;
		nif->nif_pkt_copy_from_pkt = pkt_copy_multi_buflet_from_pkt;
	} else {
		nif->nif_pkt_copy_from_mbuf = pkt_copy_from_mbuf;
		nif->nif_pkt_copy_to_mbuf = pkt_copy_to_mbuf;
		nif->nif_pkt_copy_from_pkt = pkt_copy_from_pkt;
	}

	/* initialize the host netif adapter */
	hostnifna->nifna_netif = nif;
	nx_netif_retain(nif);
	hostna = &hostnifna->nifna_up;
	(void) snprintf(hostna->na_name, sizeof(hostna->na_name),
	    "%s^", devna->na_name);
	uuid_generate_random(hostna->na_uuid);
	if (embryonic) {
		/*
		 * We will move this over to na_ifp once
		 * the interface is fully attached.
		 */
		hostna->na_private = ifp;
		ASSERT(hostna->na_ifp == NULL);
	} else {
		ASSERT(hostna->na_private == NULL);
		hostna->na_ifp = devna->na_ifp;
		ifnet_incr_iorefcnt(hostna->na_ifp);
	}
	hostna->na_type = NA_NETIF_HOST;
	hostna->na_free = na_netif_free;
	hostna->na_activate = nx_netif_host_na_activate;
	hostna->na_txsync = nx_netif_host_na_txsync;
	hostna->na_rxsync = nx_netif_host_na_rxsync;
	hostna->na_dtor = nx_netif_na_dtor;
	hostna->na_krings_create = nx_netif_host_krings_create;
	hostna->na_krings_delete = nx_netif_host_krings_delete;
	hostna->na_special = nx_netif_host_na_special;

	na_flags = NAF_HOST_ONLY | NAF_NATIVE;
	if (NX_LLINK_PROV(nx)) {
		/*
		 * while operating in logical link mode, we don't need to
		 * create backing memory regions for the rings as they are
		 * not used.
		 */
		na_flags |= NAF_MEM_NO_INIT;
	}
	os_atomic_or(&hostna->na_flags, na_flags, relaxed);
	*(nexus_stats_type_t *)(uintptr_t)&hostna->na_stats_type =
	    NEXUS_STATS_TYPE_INVALID;

	/* the host adapter always has a single TX and a single RX ring */
	na_set_nrings(hostna, NR_TX, 1);
	na_set_nrings(hostna, NR_RX, 1);
	na_set_nslots(hostna, NR_TX, nxp->nxp_tx_slots);
	na_set_nslots(hostna, NR_RX, nxp->nxp_rx_slots);

	na_attach_common(hostna, nx, &nx_netif_prov_s);

	/* create the backing memory arena for the host adapter */
	if ((retval = NX_DOM_PROV(nx)->nxdom_prov_mem_new(NX_DOM_PROV(nx),
	    nx, hostna)) != 0) {
		ASSERT(hostna->na_arena == NULL);
		goto err;
	}
	ASSERT(hostna->na_arena != NULL);

	*(uint32_t *)(uintptr_t)&hostna->na_flowadv_max = nxp->nxp_flowadv_max;
	ASSERT(hostna->na_flowadv_max == 0 ||
	    skmem_arena_nexus(hostna->na_arena)->arn_flowadv_obj != NULL);

	/* adjust the classq packet drop limit */
	if (embryonic) {
		uint32_t drop_lim;
		struct kern_pbufpool_memory_info pp_info;

		retval = kern_pbufpool_get_memory_info(nx->nx_tx_pp, &pp_info);
		VERIFY(retval == 0);

		/* set the drop limit as 75% of size of packet pool */
		drop_lim = (pp_info.kpm_packets * 3) / 4;
		VERIFY(drop_lim != 0);
		IFCQ_PKT_DROP_LIMIT(ifp->if_snd) = drop_lim;
	}

	/* these will be undone by destructor  */
	ifp->if_na_ops = &na_netif_ops;
	ifp->if_na = devnifna;
	na_retain_locked(devna);
	na_retain_locked(hostna);

	SKYWALK_SET_CAPABLE(ifp);

	/* bind the adapters to the NET_IF dev/host nexus ports */
	NETIF_WLOCK(nif);
	nif->nif_ifp = ifp;
	nif->nif_netif_nxadv = nx->nx_adv.netif_nxv_adv;
	retval = nx_port_alloc(nx, NEXUS_PORT_NET_IF_DEV, NULL, &devna,
	    kernproc);
	ASSERT(retval == 0);
	retval = nx_port_alloc(nx, NEXUS_PORT_NET_IF_HOST, NULL, &hostna,
	    kernproc);
	ASSERT(retval == 0);
	NETIF_WUNLOCK(nif);

#if SK_LOG
	uuid_string_t uuidstr;
	SK_DF(SK_VERB_NETIF, "devna: \"%s\"", devna->na_name);
	SK_DF(SK_VERB_NETIF, "  UUID:        %s",
	    sk_uuid_unparse(devna->na_uuid, uuidstr));
	SK_DF(SK_VERB_NETIF, "  nx:          %p (\"%s\":\"%s\")",
	    SK_KVA(devna->na_nx), NX_DOM(devna->na_nx)->nxdom_name,
	    NX_DOM_PROV(devna->na_nx)->nxdom_prov_name);
	SK_DF(SK_VERB_NETIF, "  flags:       0x%x", devna->na_flags);
	SK_DF(SK_VERB_NETIF, "  flowadv_max: %u", devna->na_flowadv_max);
	SK_DF(SK_VERB_NETIF, "  rings:       tx %u rx %u",
	    na_get_nrings(devna, NR_TX), na_get_nrings(devna, NR_RX));
	SK_DF(SK_VERB_NETIF, "  slots:       tx %u rx %u",
	    na_get_nslots(devna, NR_TX), na_get_nslots(devna, NR_RX));
#if CONFIG_NEXUS_USER_PIPE
	SK_DF(SK_VERB_NETIF, "  next_pipe:   %u", devna->na_next_pipe);
	SK_DF(SK_VERB_NETIF, "  max_pipes:   %u", devna->na_max_pipes);
#endif /* CONFIG_NEXUS_USER_PIPE */
	SK_DF(SK_VERB_NETIF, "  ifp:         %p %s [ioref %u]",
	    SK_KVA(ifp), ifp->if_xname, os_ref_get_count(&ifp->if_refio));
	SK_DF(SK_VERB_NETIF, "hostna: \"%s\"", hostna->na_name);
	SK_DF(SK_VERB_NETIF, "  UUID:        %s",
	    sk_uuid_unparse(hostna->na_uuid, uuidstr));
	SK_DF(SK_VERB_NETIF, "  nx:          %p (\"%s\":\"%s\")",
	    SK_KVA(hostna->na_nx), NX_DOM(hostna->na_nx)->nxdom_name,
	    NX_DOM_PROV(hostna->na_nx)->nxdom_prov_name);
	SK_DF(SK_VERB_NETIF, "  flags:       0x%x", hostna->na_flags);
	SK_DF(SK_VERB_NETIF, "  flowadv_max: %u", hostna->na_flowadv_max);
	SK_DF(SK_VERB_NETIF, "  rings:       tx %u rx %u",
	    na_get_nrings(hostna, NR_TX), na_get_nrings(hostna, NR_RX));
	SK_DF(SK_VERB_NETIF, "  slots:       tx %u rx %u",
	    na_get_nslots(hostna, NR_TX), na_get_nslots(hostna, NR_RX));
#if CONFIG_NEXUS_USER_PIPE
	SK_DF(SK_VERB_NETIF, "  next_pipe:   %u", hostna->na_next_pipe);
	SK_DF(SK_VERB_NETIF, "  max_pipes:   %u", hostna->na_max_pipes);
#endif /* CONFIG_NEXUS_USER_PIPE */
	SK_DF(SK_VERB_NETIF, "  ifp:         %p %s [ioref %u]",
	    SK_KVA(ifp), ifp->if_xname, os_ref_get_count(&ifp->if_refio));
#endif /* SK_LOG */

err:
	if (retval != 0) {
		/*
		 * NOTE(review): in the !embryonic failure path this
		 * decrement plus the devna->na_ifp decrement below act
		 * on the same ifnet; verify against ifnet_get_ioref()
		 * ownership that this does not over-release the I/O ref.
		 */
		if (ifp != NULL) {
			if (!embryonic) {
				ifnet_decr_iorefcnt(ifp);
			}
			ifp = NULL;
		}
		if (devna != NULL) {
			if (devna->na_arena != NULL) {
				skmem_arena_release(devna->na_arena);
				devna->na_arena = NULL;
			}
			if (devna->na_ifp != NULL) {
				ifnet_decr_iorefcnt(devna->na_ifp);
				devna->na_ifp = NULL;
			}
			devna->na_private = NULL;
		}
		if (hostna != NULL) {
			if (hostna->na_arena != NULL) {
				skmem_arena_release(hostna->na_arena);
				hostna->na_arena = NULL;
			}
			if (hostna->na_ifp != NULL) {
				ifnet_decr_iorefcnt(hostna->na_ifp);
				hostna->na_ifp = NULL;
			}
			hostna->na_private = NULL;
		}
		if (devnifna != NULL) {
			if (devnifna->nifna_netif != NULL) {
				nx_netif_release(devnifna->nifna_netif);
				devnifna->nifna_netif = NULL;
			}
			na_netif_free((struct nexus_adapter *)devnifna);
		}
		if (hostnifna != NULL) {
			if (hostnifna->nifna_netif != NULL) {
				nx_netif_release(hostnifna->nifna_netif);
				hostnifna->nifna_netif = NULL;
			}
			na_netif_free((struct nexus_adapter *)hostnifna);
		}
	}
	return retval;
}
2545 
2546 /*
2547  * Any per-netif state that can be discovered at attach time should be
2548  * initialized here.
2549  */
2550 static void
nx_netif_flags_init(struct nx_netif * nif)2551 nx_netif_flags_init(struct nx_netif *nif)
2552 {
2553 	ifnet_t ifp = nif->nif_ifp;
2554 	struct kern_nexus *nx = nif->nif_nx;
2555 	struct nexus_adapter *devna = nx_port_get_na(nx, NEXUS_PORT_NET_IF_DEV);
2556 
2557 	switch (devna->na_type) {
2558 	case NA_NETIF_DEV:
2559 		if (strlcmp(sk_ll_prefix, ifp->if_name, sizeof(sk_ll_prefix)) == 0) {
2560 			nif->nif_flags |= NETIF_FLAG_LOW_LATENCY;
2561 			if_set_xflags(ifp, IFXF_LOW_LATENCY);
2562 		}
2563 		break;
2564 	case NA_NETIF_COMPAT_DEV:
2565 		nif->nif_flags |= NETIF_FLAG_COMPAT;
2566 		break;
2567 	default:
2568 		break;
2569 	}
2570 }
2571 
2572 /*
2573  * This is also supposed to check for any inconsistent state at detach time.
2574  */
2575 static void
nx_netif_flags_fini(struct nx_netif * nif)2576 nx_netif_flags_fini(struct nx_netif *nif)
2577 {
2578 	ifnet_t ifp = nif->nif_ifp;
2579 
2580 	if (ifp != NULL) {
2581 		if_clear_xflags(ifp, IFXF_LOW_LATENCY);
2582 	}
2583 	nif->nif_flags = 0;
2584 }
2585 
/*
 * Install interface callbacks that depend on the finalized ifnet state;
 * currently this registers the low-latency detach notification.
 */
SK_NO_INLINE_ATTRIBUTE
static void
nx_netif_callbacks_init(struct nx_netif *nif)
{
	ifnet_t ifp = nif->nif_ifp;

	/*
	 * XXX
	 * This function is meant to be called by na_netif_finalize(), which is
	 * called by ifnet_attach() while holding if_lock exclusively.
	 */
	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_EXCLUSIVE);
	if (ifnet_is_low_latency(ifp)) {
		ifnet_set_detach_notify_locked(ifp,
		    nx_netif_llw_detach_notify, ifp->if_na);
	}
}
2603 
2604 SK_NO_INLINE_ATTRIBUTE
2605 static void
nx_netif_callbacks_fini(struct nx_netif * nif)2606 nx_netif_callbacks_fini(struct nx_netif *nif)
2607 {
2608 	ifnet_t ifp = nif->nif_ifp;
2609 
2610 	if (ifnet_is_low_latency(ifp)) {
2611 		ifnet_set_detach_notify(ifp, NULL, NULL);
2612 	}
2613 }
2614 
/*
 * Query the nexus provider for the interface-advisory capability and,
 * when supported, record the provider's config callback and context on
 * the netif.  No-op unless the interface has IFEF_ADV_REPORT set, or
 * when the provider declines the capability query.
 */
static void
configure_capab_interface_advisory(struct nx_netif *nif,
    nxprov_capab_config_fn_t capab_fn)
{
	struct kern_nexus_capab_interface_advisory capab;
	struct kern_nexus *nx = nif->nif_nx;
	uint32_t capab_len;
	int error;

	/* check/configure interface advisory notifications */
	if ((nif->nif_ifp->if_eflags & IFEF_ADV_REPORT) == 0) {
		return;
	}
	bzero(&capab, sizeof(capab));
	capab.kncia_version =
	    KERN_NEXUS_CAPAB_INTERFACE_ADVISORY_VERSION_1;
	/* const-qualified fields; write through __DECONST as elsewhere */
	*__DECONST(kern_nexus_capab_interface_advisory_notify_fn_t *,
	    &(capab.kncia_notify)) = nx_netif_interface_advisory_notify;
	*__DECONST(void **, &(capab.kncia_kern_context)) = nx;
	capab_len = sizeof(capab);
	error = capab_fn(NX_PROV(nx), nx,
	    KERN_NEXUS_CAPAB_INTERFACE_ADVISORY, &capab, &capab_len);
	if (error != 0) {
		DTRACE_SKYWALK2(interface__advisory__capab__error,
		    struct nx_netif *, nif, int, error);
		return;
	}
	VERIFY(capab.kncia_config != NULL);
	VERIFY(capab.kncia_provider_context != NULL);
	nif->nif_intf_adv_config = capab.kncia_config;
	nif->nif_intf_adv_prov_ctx = capab.kncia_provider_context;
	nif->nif_extended_capabilities |= NETIF_CAPAB_INTERFACE_ADVISORY;
}
2648 
2649 static void
unconfigure_capab_interface_advisory(struct nx_netif * nif)2650 unconfigure_capab_interface_advisory(struct nx_netif *nif)
2651 {
2652 	if ((nif->nif_extended_capabilities & NETIF_CAPAB_INTERFACE_ADVISORY) == 0) {
2653 		return;
2654 	}
2655 	nif->nif_intf_adv_config = NULL;
2656 	nif->nif_intf_adv_prov_ctx = NULL;
2657 	nif->nif_extended_capabilities &= ~NETIF_CAPAB_INTERFACE_ADVISORY;
2658 }
2659 
/*
 * Query the nexus provider for the queue-set extensions capability and,
 * when supported, record the provider's steering-info callback and
 * context on the netif.  Only meaningful for logical-link providers.
 */
static void
configure_capab_qset_extensions(struct nx_netif *nif,
    nxprov_capab_config_fn_t capab_fn)
{
	struct kern_nexus_capab_qset_extensions capab;
	struct kern_nexus *nx = nif->nif_nx;
	uint32_t capab_len;
	int error;

	if (!NX_LLINK_PROV(nx)) {
		DTRACE_SKYWALK1(not__llink__prov, struct nx_netif *, nif);
		return;
	}
	bzero(&capab, sizeof(capab));
	capab.cqe_version = KERN_NEXUS_CAPAB_QSET_EXTENSIONS_VERSION_1;
	capab_len = sizeof(capab);
	error = capab_fn(NX_PROV(nx), nx,
	    KERN_NEXUS_CAPAB_QSET_EXTENSIONS, &capab, &capab_len);
	if (error != 0) {
		DTRACE_SKYWALK2(qset__extensions__capab__error,
		    struct nx_netif *, nif, int, error);
		return;
	}
	VERIFY(capab.cqe_notify_steering_info != NULL);
	VERIFY(capab.cqe_prov_ctx != NULL);
	nif->nif_qset_extensions.qe_notify_steering_info =
	    capab.cqe_notify_steering_info;
	nif->nif_qset_extensions.qe_prov_ctx = capab.cqe_prov_ctx;
	nif->nif_extended_capabilities |= NETIF_CAPAB_QSET_EXTENSIONS;
}
2690 
2691 static void
unconfigure_capab_qset_extensions(struct nx_netif * nif)2692 unconfigure_capab_qset_extensions(struct nx_netif *nif)
2693 {
2694 	if ((nif->nif_extended_capabilities & NETIF_CAPAB_QSET_EXTENSIONS) == 0) {
2695 		return;
2696 	}
2697 	bzero(&nif->nif_qset_extensions, sizeof(nif->nif_qset_extensions));
2698 	nif->nif_extended_capabilities &= ~NETIF_CAPAB_QSET_EXTENSIONS;
2699 }
2700 
2701 int
nx_netif_notify_steering_info(struct nx_netif * nif,struct netif_qset * qset,struct ifnet_traffic_descriptor_common * td,bool add)2702 nx_netif_notify_steering_info(struct nx_netif *nif, struct netif_qset *qset,
2703     struct ifnet_traffic_descriptor_common *td, bool add)
2704 {
2705 	struct netif_qset_extensions *qset_ext;
2706 	int err;
2707 
2708 	if ((nif->nif_extended_capabilities & NETIF_CAPAB_QSET_EXTENSIONS) == 0) {
2709 		return ENOTSUP;
2710 	}
2711 	qset_ext = &nif->nif_qset_extensions;
2712 	VERIFY(qset_ext->qe_prov_ctx != NULL);
2713 	VERIFY(qset_ext->qe_notify_steering_info != NULL);
2714 	err = qset_ext->qe_notify_steering_info(qset_ext->qe_prov_ctx,
2715 	    qset->nqs_ctx, td, add);
2716 	return err;
2717 }
2718 
/*
 * Query the nexus provider for the Rx flow steering capability and, on
 * success, record the provider's config callback and its context on the
 * netif.  Only attempted when the interface advertises
 * IFXF_RX_FLOW_STEERING.
 */
static void
configure_capab_rx_flow_steering(struct nx_netif *nif,
    nxprov_capab_config_fn_t capab_fn)
{
	struct kern_nexus_capab_rx_flow_steering capab;
	struct kern_nexus *nx = nif->nif_nx;
	uint32_t capab_len;
	int error;

	/* check/configure Rx flow steering */
	if ((nif->nif_ifp->if_xflags & IFXF_RX_FLOW_STEERING) == 0) {
		return;
	}
	bzero(&capab, sizeof(capab));
	capab.kncrxfs_version =
	    KERN_NEXUS_CAPAB_RX_FLOW_STEERING_VERSION_1;
	capab_len = sizeof(capab);
	/* ask the provider to fill in its Rx flow steering callbacks */
	error = capab_fn(NX_PROV(nx), nx,
	    KERN_NEXUS_CAPAB_RX_FLOW_STEERING, &capab, &capab_len);
	if (error != 0) {
		/* provider doesn't support this capability; not fatal */
		DTRACE_SKYWALK2(rx__flow__steering__capab__error,
		    struct nx_netif *, nif, int, error);
		return;
	}
	/* a successful provider must supply both callback and context */
	VERIFY(capab.kncrxfs_config != NULL);
	VERIFY(capab.kncrxfs_prov_ctx != NULL);
	nif->nif_rx_flow_steering.config_fn = capab.kncrxfs_config;
	nif->nif_rx_flow_steering.prov_ctx = capab.kncrxfs_prov_ctx;
	nif->nif_extended_capabilities |= NETIF_CAPAB_RX_FLOW_STEERING;
}
2749 
2750 static void
unconfigure_capab_rx_flow_steering(struct nx_netif * nif)2751 unconfigure_capab_rx_flow_steering(struct nx_netif *nif)
2752 {
2753 	if ((nif->nif_extended_capabilities & NETIF_CAPAB_RX_FLOW_STEERING) == 0) {
2754 		return;
2755 	}
2756 	bzero(&nif->nif_rx_flow_steering, sizeof(nif->nif_rx_flow_steering));
2757 	nif->nif_extended_capabilities &= ~NETIF_CAPAB_RX_FLOW_STEERING;
2758 }
2759 
/*
 * Forward an Rx flow steering request (traffic descriptor plus steering
 * action for flow `id') to the provider callback registered via the
 * Rx flow steering capability.  Returns ENXIO if the nexus is closed,
 * ENOTSUP if the capability was never configured, otherwise the
 * provider callback's result.
 */
int
nx_netif_configure_rx_flow_steering(struct kern_nexus *nx, uint32_t id,
    struct ifnet_traffic_descriptor_common *td,
    rx_flow_steering_action_t action)
{
	struct netif_rx_flow_steering *rx_flow_steering = NULL;
	struct nx_netif *nif;
	int err = 0;

	if ((nx->nx_flags & NXF_CLOSED) != 0) {
		return ENXIO;
	}

	ASSERT(NX_PROV(nx)->nxprov_params->nxp_type == NEXUS_TYPE_NET_IF);
	nif = NX_NETIF_PRIVATE(nx);

	if ((nif->nif_extended_capabilities & NETIF_CAPAB_RX_FLOW_STEERING) == 0) {
		return ENOTSUP;
	}

	rx_flow_steering = &nif->nif_rx_flow_steering;
	/* both were validated when the capability was configured */
	VERIFY(rx_flow_steering->prov_ctx != NULL);
	VERIFY(rx_flow_steering->config_fn != NULL);
	err = rx_flow_steering->config_fn(rx_flow_steering->prov_ctx, id,
	    td, action);
	return err;
}
2787 
/*
 * Discover and configure the extended capabilities (interface advisory,
 * qset extensions, Rx flow steering) exposed by the nexus provider.
 * The provider's capability-config callback comes from the netif
 * extension when the provider speaks KERN_NEXUS_PROVIDER_VERSION_NETIF,
 * else from the generic provider extension; if neither registered one,
 * no capabilities are configured.
 */
static void
nx_netif_capabilities_init(struct nx_netif *nif)
{
	struct kern_nexus *nx = nif->nif_nx;
	nxprov_capab_config_fn_t capab_fn;

	if ((NX_PROV(nx)->nxprov_netif_ext.nxnpi_version) ==
	    KERN_NEXUS_PROVIDER_VERSION_NETIF) {
		/* netif-version providers must supply the callback */
		capab_fn = NX_PROV(nx)->nxprov_netif_ext.nxnpi_config_capab;
		ASSERT(capab_fn != NULL);
	} else {
		capab_fn = NX_PROV(nx)->nxprov_ext.nxpi_config_capab;
	}
	if (capab_fn == NULL) {
		return;
	}
	configure_capab_interface_advisory(nif, capab_fn);
	configure_capab_qset_extensions(nif, capab_fn);
	configure_capab_rx_flow_steering(nif, capab_fn);
}
2808 
/*
 * Undo nx_netif_capabilities_init(): release any extended capability
 * state recorded on the netif.  Each helper is a no-op if its
 * capability was never configured.
 */
static void
nx_netif_capabilities_fini(struct nx_netif *nif)
{
	unconfigure_capab_interface_advisory(nif);
	unconfigure_capab_qset_extensions(nif);
	unconfigure_capab_rx_flow_steering(nif);
}
2816 
2817 static void
nx_netif_verify_tso_config(struct nx_netif * nif)2818 nx_netif_verify_tso_config(struct nx_netif *nif)
2819 {
2820 	ifnet_t ifp = nif->nif_ifp;
2821 	uint32_t tso_v4_mtu = 0;
2822 	uint32_t tso_v6_mtu = 0;
2823 
2824 	/*
2825 	 * compat interfaces always use 128-byte buffers on the device packet
2826 	 * pool side (for holding headers for classification) so no need to check
2827 	 * the size here.
2828 	 */
2829 	if (!SKYWALK_NATIVE(ifp)) {
2830 		return;
2831 	}
2832 
2833 	if ((ifp->if_hwassist & IFNET_TSO_IPV4) != 0) {
2834 		tso_v4_mtu = ifp->if_tso_v4_mtu;
2835 	}
2836 	if ((ifp->if_hwassist & IFNET_TSO_IPV6) != 0) {
2837 		tso_v6_mtu = ifp->if_tso_v6_mtu;
2838 	}
2839 	VERIFY(PP_BUF_SIZE_DEF(nif->nif_nx->nx_tx_pp) >=
2840 	    max(tso_v4_mtu, tso_v6_mtu));
2841 }
2842 
/*
 * Final step of binding a netif nexus to its ifnet: transfer ifnet I/O
 * references to the dev and host adapters and bring up the netif
 * subsystems (flags, llink, filter, flow, capabilities, agent,
 * callbacks).  Panics if the interface can no longer take I/O refs.
 */
void
na_netif_finalize(struct nexus_netif_adapter *nifna, struct ifnet *ifp)
{
	struct nx_netif *nif = nifna->nifna_netif;
	struct kern_nexus *nx = nif->nif_nx;
	struct nexus_adapter *devna = nx_port_get_na(nx, NEXUS_PORT_NET_IF_DEV);
	struct nexus_adapter *hostna = nx_port_get_na(nx,
	    NEXUS_PORT_NET_IF_HOST);

	ASSERT(devna != NULL);
	ASSERT(hostna != NULL);

	if (!ifnet_get_ioref(ifp)) {
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	/* move the ifnet pointer from na_private to na_ifp on the dev NA */
	ASSERT(devna->na_private == ifp);
	ASSERT(devna->na_ifp == NULL);
	/* use I/O refcnt held by ifnet_get_ioref() above */
	devna->na_ifp = devna->na_private;
	devna->na_private = NULL;

	/* likewise for the host NA, which takes its own I/O reference */
	ASSERT(hostna->na_private == ifp);
	ASSERT(hostna->na_ifp == NULL);
	hostna->na_ifp = hostna->na_private;
	hostna->na_private = NULL;
	ifnet_incr_iorefcnt(hostna->na_ifp);

	nx_netif_flags_init(nif);
	nx_netif_llink_init(nif);
	nx_netif_filter_init(nif);
	nx_netif_flow_init(nif);
	nx_netif_capabilities_init(nif);
	nx_netif_agent_init(nif);
	/* seed the interface's cached traffic rule counts */
	(void) nxctl_inet_traffic_rule_get_count(ifp->if_xname,
	    &ifp->if_inet_traffic_rule_count);
	(void) nxctl_eth_traffic_rule_get_count(ifp->if_xname,
	    &ifp->if_eth_traffic_rule_count);
	nx_netif_verify_tso_config(nif);
	nx_netif_callbacks_init(nif);
}
2886 
/*
 * Memory reclaim callback for a netif adapter: prune (or, when idle or
 * under low-memory pressure, purge) the cached objects in the device
 * adapter's arena.  `thres' is the drain threshold in seconds; `low'
 * forces a purge regardless of recent activity.
 */
void
nx_netif_reap(struct nexus_netif_adapter *nifna, struct ifnet *ifp,
    uint32_t thres, boolean_t low)
{
#pragma unused(ifp)
	struct nx_netif *nif = nifna->nifna_netif;
	struct kern_nexus *nx = nif->nif_nx;
	struct nexus_adapter *devna = nx_port_get_na(nx, NEXUS_PORT_NET_IF_DEV);
	uint64_t now = net_uptime();
	boolean_t purge;

	ASSERT(thres != 0);

	/* a zero work timestamp means the adapter was already purged */
	if (devna->na_work_ts == 0) {
		return;
	}

	/*
	 * Purge if it has been inactive for some time (twice the drain
	 * threshold), and clear the work timestamp to temporarily skip this
	 * adapter until it's active again.  Purging cached objects can be
	 * expensive since we'd need to allocate and construct them again,
	 * so we do it only when necessary.
	 */
	if (low || (now - devna->na_work_ts) >= (thres << 1)) {
		devna->na_work_ts = 0;
		purge = TRUE;
	} else {
		purge = FALSE;
	}

	SK_DF(SK_VERB_NETIF, "%s: %s na %s", ifp->if_xname,
	    (purge ? "purging" : "pruning"), devna->na_name);

	/*
	 * Device and host adapters share the same packet buffer pool,
	 * so just reap the arena belonging to the device instance.
	 */
	skmem_arena_reap(devna->na_arena, purge);
}
2927 
2928 /*
 * The purpose of this callback is to forcibly remove resources held by VPNAs
 * in the event of an interface detach. Without this callback an application can
 * prevent the detach from completing indefinitely. Note that this is only needed
 * for low-latency VPNAs. Userspace does get notified about interface detach events
 * for other NA types (custom ether and filter) and will do the necessary cleanup.
2934  * The cleanup is done in two phases:
2935  * 1) VPNAs channels are defuncted. This releases the resources held by VPNAs and
2936  *    causes the device channel to be closed. All ifnet references held by VPNAs
2937  *    are also released.
2938  * 2) This cleans up the netif nexus and releases the two remaining ifnet
2939  *    references held by the device and host ports (nx_netif_clean()).
2940  */
void
nx_netif_llw_detach_notify(void *arg)
{
	struct nexus_netif_adapter *__single nifna = arg;
	struct nx_netif *nif = nifna->nifna_netif;
	struct kern_nexus *nx = nif->nif_nx;
	struct kern_channel **ch_list = NULL;
	struct kern_channel *ch;
	int err, i, all_ch_cnt = 0, vp_ch_cnt = 0;
	struct proc *p;

	/* only low-latency netifs register this detach notification */
	ASSERT(NETIF_IS_LOW_LATENCY(nif));
	/*
	 * kern_channel_defunct() requires sk_lock to be not held. We
	 * will first find the list of channels we want to defunct and
	 * then call kern_channel_defunct() on each of them. The number
	 * of channels cannot increase after sk_lock is released since
	 * this interface is being detached.
	 */
	SK_LOCK();
	all_ch_cnt = nx->nx_ch_count;
	if (all_ch_cnt == 0) {
		DTRACE_SKYWALK1(no__channel, struct kern_nexus *, nx);
		SK_UNLOCK();
		return;
	}
	ch_list = sk_alloc_type_array(struct kern_channel *, all_ch_cnt,
	    Z_WAITOK | Z_NOFAIL, skmem_tag_netif_temp);

	/* collect the VPNA channels under sk_lock */
	STAILQ_FOREACH(ch, &nx->nx_ch_head, ch_link) {
		struct nexus_adapter *na = ch->ch_na;

		if (na != NULL && na->na_type == NA_NETIF_VP) {
			ASSERT(vp_ch_cnt < all_ch_cnt);

			/* retain channel to prevent it from being freed */
			ch_retain_locked(ch);
			ch_list[vp_ch_cnt] = ch;
			DTRACE_SKYWALK3(vp__ch__found, struct kern_nexus *, nx,
			    struct kern_channel *, ch, struct nexus_adapter *, na);
			vp_ch_cnt++;
		}
	}
	if (vp_ch_cnt == 0) {
		DTRACE_SKYWALK1(vp__ch__not__found, struct kern_nexus *, nx);
		sk_free_type_array(struct kern_channel *, all_ch_cnt, ch_list);
		SK_UNLOCK();
		return;
	}
	/* prevents the netif from being freed */
	nx_netif_retain(nif);
	SK_UNLOCK();

	/* phase 1: defunct each VPNA channel, with sk_lock dropped */
	for (i = 0; i < vp_ch_cnt; i++) {
		ch = ch_list[i];
		p = proc_find(ch->ch_pid);
		if (p == NULL) {
			SK_ERR("ch %p pid %d not found", SK_KVA(ch), ch->ch_pid);
			DTRACE_SKYWALK3(ch__pid__not__found, struct kern_nexus *, nx,
			    struct kern_channel *, ch, pid_t, ch->ch_pid);
			ch_release(ch);
			continue;
		}
		/*
		 * It is possible for the channel to be closed before defunct gets
		 * called. We need to get the fd lock here to ensure that the check
		 * for the closed state and the calling of channel defunct are done
		 * atomically.
		 */
		proc_fdlock(p);
		if ((ch->ch_flags & CHANF_ATTACHED) != 0) {
			kern_channel_defunct(p, ch);
		}
		proc_fdunlock(p);
		proc_rele(p);
		ch_release(ch);
	}
	sk_free_type_array(struct kern_channel *, all_ch_cnt, ch_list);

	/* phase 2: clean up the netif nexus itself */
	SK_LOCK();
	/*
	 * Quiescing is not needed because:
	 * The defuncting above ensures that no more tx syncs could enter.
	 * The driver layer ensures that ifnet_detach() (this path) does not get
	 * called until RX upcalls have returned.
	 *
	 * Before sk_lock is reacquired above, userspace could close its channels
	 * and cause the nexus's destructor to be called. This is fine because we
	 * have retained the nif so it can't disappear.
	 */
	err = nx_netif_clean(nif, FALSE);
	if (err != 0) {
		SK_ERR("netif clean failed: err %d", err);
		DTRACE_SKYWALK2(nif__clean__failed, struct nx_netif *, nif, int, err);
	}
	nx_netif_release(nif);
	SK_UNLOCK();
}
3039 
/*
 * Export this adapter's RX mitigation state into if_netif_stats for
 * interface statistics reporting.  Leaves `if_ns' untouched when RX
 * mitigation is absent or not yet initialized.
 */
void
nx_netif_copy_stats(struct nexus_netif_adapter *nifna,
    struct if_netif_stats *if_ns)
{
	struct nx_netif_mit *mit;
	struct mit_cfg_tbl *mit_cfg;

	if ((mit = nifna->nifna_rx_mit) == NULL) {
		return;
	}

	if ((mit->mit_flags & NETIF_MITF_INITIALIZED) == 0) {
		return;
	}

	/* current mitigation mode plus measured packet/byte statistics */
	if_ns->ifn_rx_mit_interval = mit->mit_interval;
	if_ns->ifn_rx_mit_mode = mit->mit_mode;
	if_ns->ifn_rx_mit_packets_avg = mit->mit_packets_avg;
	if_ns->ifn_rx_mit_packets_min = mit->mit_packets_min;
	if_ns->ifn_rx_mit_packets_max = mit->mit_packets_max;
	if_ns->ifn_rx_mit_bytes_avg = mit->mit_bytes_avg;
	if_ns->ifn_rx_mit_bytes_min = mit->mit_bytes_min;
	if_ns->ifn_rx_mit_bytes_max = mit->mit_bytes_max;
	if_ns->ifn_rx_mit_cfg_idx = mit->mit_cfg_idx;

	/* parameters of the currently-selected mitigation config entry */
	VERIFY(if_ns->ifn_rx_mit_cfg_idx < mit->mit_cfg_idx_max);
	mit_cfg = &mit->mit_tbl[if_ns->ifn_rx_mit_cfg_idx];
	if_ns->ifn_rx_mit_cfg_packets_lowat = mit_cfg->cfg_plowat;
	if_ns->ifn_rx_mit_cfg_packets_hiwat = mit_cfg->cfg_phiwat;
	if_ns->ifn_rx_mit_cfg_bytes_lowat = mit_cfg->cfg_blowat;
	if_ns->ifn_rx_mit_cfg_bytes_hiwat = mit_cfg->cfg_bhiwat;
	if_ns->ifn_rx_mit_cfg_interval = mit_cfg->cfg_ival;
}
3073 
/*
 * na_special callback for netif device adapters (native or compat);
 * asserts the adapter type and delegates to the shared implementation.
 */
int
nx_netif_na_special(struct nexus_adapter *na, struct kern_channel *ch,
    struct chreq *chr, nxspec_cmd_t spec_cmd)
{
	ASSERT(na->na_type == NA_NETIF_DEV ||
	    na->na_type == NA_NETIF_COMPAT_DEV);
	return nx_netif_na_special_common(na, ch, chr, spec_cmd);
}
3082 
/*
 * Handle special (kernel-only) channel operations on netif dev/host
 * adapters: connect and disconnect of a kernel channel, plus start and
 * stop of its rings.  Called with SK_LOCK held.
 */
int
nx_netif_na_special_common(struct nexus_adapter *na, struct kern_channel *ch,
    struct chreq *chr, nxspec_cmd_t spec_cmd)
{
	int error = 0;

	ASSERT(na->na_type == NA_NETIF_DEV || na->na_type == NA_NETIF_HOST ||
	    na->na_type == NA_NETIF_COMPAT_DEV ||
	    na->na_type == NA_NETIF_COMPAT_HOST);
	SK_LOCK_ASSERT_HELD();

	switch (spec_cmd) {
	case NXSPEC_CMD_CONNECT:
		/*
		 * netif adapter isn't created exclusively for kernel.
		 * We mark (and clear) NAF_KERNEL_ONLY flag upon a successful
		 * na_special() connect and disconnect.
		 */
		if (NA_KERNEL_ONLY(na)) {
			error = EBUSY;
			goto done;
		}
		ASSERT(!(na->na_flags & NAF_SPEC_INIT));

		os_atomic_or(&na->na_flags, NAF_KERNEL_ONLY, relaxed);
		error = na_bind_channel(na, ch, chr);
		if (error != 0) {
			/* roll back the kernel-only marking on bind failure */
			os_atomic_andnot(&na->na_flags, NAF_KERNEL_ONLY, relaxed);
			goto done;
		}
		os_atomic_or(&na->na_flags, NAF_SPEC_INIT, relaxed);
		break;

	case NXSPEC_CMD_DISCONNECT:
		ASSERT(NA_KERNEL_ONLY(na));
		ASSERT(na->na_channels > 0);
		ASSERT(na->na_flags & NAF_SPEC_INIT);
		na_unbind_channel(ch);
		os_atomic_andnot(&na->na_flags, (NAF_SPEC_INIT | NAF_KERNEL_ONLY), relaxed);
		break;

	case NXSPEC_CMD_START:
		/* lift drop mode on the rings */
		na_kr_drop(na, FALSE);
		break;

	case NXSPEC_CMD_STOP:
		/* put rings in drop mode, then advise the provider */
		na_kr_drop(na, TRUE);
		LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_NOTOWNED);
		lck_mtx_lock(&ch->ch_lock);
		nxprov_advise_disconnect(na->na_nx, ch);
		lck_mtx_unlock(&ch->ch_lock);
		break;

	default:
		error = EINVAL;
		break;
	}

done:
	SK_DF(error ? SK_VERB_ERROR : SK_VERB_NETIF,
	    "ch %p from na \"%s\" (%p) naflags %x nx %p "
	    "spec_cmd %u (err %d)", SK_KVA(ch), na->na_name, SK_KVA(na),
	    na->na_flags, SK_KVA(ch->ch_nexus), spec_cmd, error);

	return error;
}
3149 
3150 /*
3151  * Get a skywalk netif adapter for the port.
3152  */
/*
 * Resolve the nexus adapter backing `chr->cr_port' of a netif nexus:
 * the dev or host adapter for the reserved ports, or a filter/VP
 * adapter (created on demand) for any other port.  On success, stores
 * the adapter in *nap with a reference held for the caller.  Called
 * with SK_LOCK held.
 */
int
nx_netif_na_find(struct kern_nexus *nx, struct kern_channel *ch,
    struct chreq *chr, struct nxbind *nxb, struct proc *p,
    struct nexus_adapter **nap, boolean_t create)
{
#pragma unused(ch)
	struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
	boolean_t anon = NX_ANONYMOUS_PROV(nx);
	ch_endpoint_t ep = chr->cr_endpoint;
	nexus_port_t nx_port = chr->cr_port;
	struct nexus_adapter *__single na = NULL;
	struct ifnet *ifp;
	int err = 0;

	SK_LOCK_ASSERT_HELD();
	*nap = NULL; /* default */

#if SK_LOG
	uuid_string_t uuidstr;
	SK_D("name \"%s\" spec_uuid \"%s\" port %d mode 0x%x pipe_id %u "
	    "ring_id %d ring_set %u ep_type %u create %u%s",
	    chr->cr_name, sk_uuid_unparse(chr->cr_spec_uuid, uuidstr),
	    (int)chr->cr_port, chr->cr_mode, chr->cr_pipe_id,
	    (int)chr->cr_ring_id, chr->cr_ring_set, chr->cr_endpoint, create,
	    (ep != CH_ENDPOINT_NET_IF) ? " (skipped)" : "");
#endif /* SK_LOG */

	/* lookups are only served for netif endpoints, and only on create */
	if (!create || ep != CH_ENDPOINT_NET_IF) {
		err = ENODEV;
		goto done;
	}

	ASSERT(NX_DOM(nx)->nxdom_type == NEXUS_TYPE_NET_IF);
	if (nx_port_get_na(nx, NEXUS_PORT_NET_IF_DEV) == NULL) {
		err = ENXIO;
		goto done;
	}
	ifp = nif->nif_ifp;
	if (!(SKYWALK_CAPABLE(ifp))) {
		SK_ERR("interface %s is no longer usable", if_name(ifp));
		err = ENOTSUP;
		goto done;
	}

	if (chr->cr_mode & CHMODE_LOW_LATENCY) {
		SK_ERR("low latency is not supported for netif channel");
		err = ENOTSUP;
		goto done;
	}

	switch (nx_port) {
	case NEXUS_PORT_NET_IF_DEV:
		/*
		 * We have to reject direct user open that's not explicitly
		 * allowed because netif nexuses do not by default have
		 * user memory regions.
		 */
		if (p != kernproc &&
		    (!skywalk_netif_direct_allowed(ifp->if_xname) ||
		    (kauth_cred_issuser(kauth_cred_get()) == 0 &&
		    (anon || nif->nif_dev_nxb == NULL || nxb == NULL ||
		    !nxb_is_equal(nif->nif_dev_nxb, nxb))))) {
			DTRACE_SKYWALK2(direct__not__allowed, struct ifnet *,
			    ifp, struct chreq *, chr);
			err = ENOTSUP;
			goto done;
		}
		if (chr->cr_mode & CHMODE_EVENT_RING) {
			SK_ERR("event ring is not supported for netif dev port channel");
			err = ENOTSUP;
			goto done;
		}
		na = nx_port_get_na(nx, NEXUS_PORT_NET_IF_DEV);
		break;

	case NEXUS_PORT_NET_IF_HOST:
		/* the host port is reserved for kernel clients */
		if (p != kernproc) {
			err = ENOTSUP;
			goto done;
		}
		if (chr->cr_mode & CHMODE_EVENT_RING) {
			SK_ERR("event ring is not supported for netif host port channel");
			err = ENOTSUP;
			goto done;
		}
		na = nx_port_get_na(nx, NEXUS_PORT_NET_IF_HOST);
		break;

	default:
		/* filter or VP port; create the adapter on first open */
		ASSERT(!(chr->cr_mode & CHMODE_CONFIG));

		NETIF_WLOCK(nif);
		err = nx_port_alloc(nx, nx_port, nxb, &na, p);
		if (err != 0) {
			NETIF_WUNLOCK(nif);
			goto done;
		}

		if (na == NULL) {
			if (chr->cr_mode & CHMODE_FILTER) {
				err = netif_filter_na_create(nx, chr, &na);
			} else {
				err = netif_vp_na_create(nx, chr, &na);
			}
			if (err != 0) {
				NETIF_WUNLOCK(nif);
				goto done;
			}
			/* bind the freshly created adapter to the port */
			err = nx_port_alloc(nx, nx_port, nxb, &na, p);
			if (err != 0) {
				NETIF_WUNLOCK(nif);
				goto done;
			}
		}
		NETIF_WUNLOCK(nif);

		break;
	}

	ASSERT(err == 0);
	ASSERT(na != NULL);

	/* the adapter must not already be owned by another channel */
#if CONFIG_NEXUS_USER_PIPE
	if (NA_OWNED_BY_ANY(na) || na->na_next_pipe > 0) {
#else /* !CONFIG_NEXUS_USER_PIPE */
	if (NA_OWNED_BY_ANY(na)) {
#endif /* !CONFIG_NEXUS_USER_PIPE */
		err = EBUSY;
		na = NULL;
		goto done;
	}

	*nap = na;
	na_retain_locked(na);

done:
	ASSERT(err != 0 || na != NULL);
	if (err) {
		SK_ERR("na not found, err(%d)", err);
	} else {
		SK_DF(SK_VERB_NETIF, "found na %p", SK_KVA(na));
	}
	return err;
}
3297 
3298 /* na_krings_create callback for all netif device adapters */
int
nx_netif_dev_krings_create(struct nexus_adapter *na, struct kern_channel *ch)
{
	int ret;

	ASSERT(na->na_type == NA_NETIF_DEV ||
	    na->na_type == NA_NETIF_COMPAT_DEV);
	/*
	 * Allocate context structures for native netif only, for
	 * IOSkywalkFamily to store its object references.
	 */
	ret = na_rings_mem_setup(na, (na->na_flags & NAF_NATIVE), ch);

	/*
	 * We mark CKRF_DROP for kernel-only rings (kernel channel
	 * opened by the flowswitch, etc.) to prevent packets from
	 * going thru until after the client of the kernel channel
	 * has fully plumbed things on its side.  For userland-facing
	 * rings (regular channel opened to netif), this is not
	 * required, and so don't mark CKRF_DROP there.
	 */
	if (ret == 0 && NA_KERNEL_ONLY(na)) {
		na_kr_drop(na, TRUE);
	}

	/* 0 on success, else the error from na_rings_mem_setup() */
	return ret;
}
3326 
3327 /* call with SK_LOCK held */
/* na_krings_delete counterpart of nx_netif_dev_krings_create() */
void
nx_netif_dev_krings_delete(struct nexus_adapter *na, struct kern_channel *ch,
    boolean_t defunct)
{
	ASSERT(na->na_type == NA_NETIF_DEV ||
	    na->na_type == NA_NETIF_COMPAT_DEV);

	/* see comments in nx_netif_dev_krings_create() */
	if (NA_KERNEL_ONLY(na)) {
		na_kr_drop(na, TRUE);
	}

	na_rings_mem_teardown(na, ch, defunct);
}
3342 
3343 struct nx_netif *
3344 nx_netif_alloc(zalloc_flags_t how)
3345 {
3346 	struct nx_netif *n;
3347 
3348 	SK_LOCK_ASSERT_HELD();
3349 
3350 	n = zalloc_flags(nx_netif_zone, how | Z_ZERO);
3351 	if (n == NULL) {
3352 		return NULL;
3353 	}
3354 
3355 	NETIF_RWINIT(n);
3356 	os_ref_init(&n->nif_refcnt, NULL);
3357 	SK_DF(SK_VERB_MEM, "netif %p", SK_KVA(n));
3358 
3359 	return n;
3360 }
3361 
/*
 * Free a netif instance once its last reference has been released.
 * The dev/host nexus bindings must already be gone at this point.
 */
static void
nx_netif_destroy(struct nx_netif *n)
{
	ASSERT(n->nif_dev_nxb == NULL);
	ASSERT(n->nif_host_nxb == NULL);
	ASSERT(os_ref_get_count(&n->nif_refcnt) == 0);
	nx_netif_llink_config_free(n);
	SK_DF(SK_VERB_MEM, "netif %p", SK_KVA(n));
	NETIF_RWDESTROY(n);
	zfree(nx_netif_zone, n);
}
3373 
/*
 * Drop one reference on the netif; destroys it when the count reaches
 * zero.  Called with SK_LOCK held.
 */
void
nx_netif_release(struct nx_netif *n)
{
	SK_LOCK_ASSERT_HELD();

	SK_DF(SK_VERB_MEM, "netif %p, refcnt %d", SK_KVA(n),
	    os_ref_get_count(&n->nif_refcnt));
	if (os_ref_release(&n->nif_refcnt) == 0) {
		nx_netif_destroy(n);
	}
}
3385 
/*
 * Take an additional reference on the netif.  Called with SK_LOCK
 * held.
 */
void
nx_netif_retain(struct nx_netif *n)
{
	SK_LOCK_ASSERT_HELD();

	/* retaining an object with a zero refcount is not allowed */
	ASSERT(os_ref_get_count(&n->nif_refcnt) >= 1);
	os_ref_retain(&n->nif_refcnt);
	SK_DF(SK_VERB_MEM, "netif %p, refcnt %d", SK_KVA(n),
	    os_ref_get_count(&n->nif_refcnt));
}
3397 
/* Convenience alias for nx_netif_release() */
void
nx_netif_free(struct nx_netif *n)
{
	nx_netif_release(n);
}
3403 
/*
 * Copy an interface advisory report into the shared-memory advisory
 * region and notify the channels watching it.  For a delegate
 * interface the report is redirected to the parent's netif; when a
 * flowswitch is attached, its advisory region and nexus are used
 * instead of the netif's own.  Always returns 0.
 */
static int
nx_netif_interface_advisory_report(struct kern_nexus *nx,
    const struct ifnet_interface_advisory *advisory)
{
	struct kern_nexus *notify_nx;
	struct __kern_netif_intf_advisory *intf_adv;
	struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
	ifnet_t difp = nif->nif_ifp;
	ifnet_t __single parent = NULL;

	/* If we are a delegate, notify the parent instead */
	if (ifnet_get_delegate_parent(difp, &parent) == 0) {
		nif = parent->if_na->nifna_netif;
	}
	if (nif->nif_fsw_nxadv != NULL) {
		ASSERT(nif->nif_fsw != NULL);
		intf_adv = &nif->nif_fsw_nxadv->_nxadv_intf_adv;
		notify_nx = nif->nif_fsw->fsw_nx;
	} else {
		intf_adv = &nif->nif_netif_nxadv->__kern_intf_adv;
		notify_nx = nif->nif_nx;
	}
	/*
	 * copy the advisory report in shared memory
	 */
	intf_adv->cksum = os_cpu_copy_in_cksum(advisory, &intf_adv->adv,
	    sizeof(*advisory), 0);
	STATS_INC(&nif->nif_stats, NETIF_STATS_IF_ADV_UPD_RECV);
	/*
	 * notify user channels on advisory report availability
	 */
	nx_interface_advisory_notify(notify_nx);
	if (parent != NULL) {
		ifnet_release_delegate_parent(difp);
	}
	return 0;
}
3441 
/*
 * Driver-facing entry point registered through the interface advisory
 * capability; `kern_ctx' is the kern_nexus pointer that was stashed in
 * the capability structure.  Validates the advisory header (version,
 * direction, and — for version >= 2 — interface type) before handing
 * the report to nx_netif_interface_advisory_report().
 */
static errno_t
nx_netif_interface_advisory_notify(void *kern_ctx,
    const struct ifnet_interface_advisory *advisory)
{
	/* legacy flat fields must alias the header fields exactly */
	static_assert(offsetof(struct ifnet_interface_advisory, version) == offsetof(struct ifnet_interface_advisory, header.version));
	static_assert(offsetof(struct ifnet_interface_advisory, direction) == offsetof(struct ifnet_interface_advisory, header.direction));
	static_assert(offsetof(struct ifnet_interface_advisory, _reserved) == offsetof(struct ifnet_interface_advisory, header.interface_type));

	if (__improbable(kern_ctx == NULL || advisory == NULL)) {
		return EINVAL;
	}
	if (__improbable((advisory->header.version <
	    IF_INTERFACE_ADVISORY_VERSION_MIN) ||
	    (advisory->header.version > IF_INTERFACE_ADVISORY_VERSION_MAX))) {
		SK_ERR("Invalid advisory version %d", advisory->header.version);
		return EINVAL;
	}
	if (__improbable((advisory->header.direction !=
	    IF_INTERFACE_ADVISORY_DIRECTION_TX) &&
	    (advisory->header.direction !=
	    IF_INTERFACE_ADVISORY_DIRECTION_RX))) {
		SK_ERR("Invalid advisory direction %d",
		    advisory->header.direction);
		return EINVAL;
	}
	/* interface type is only checked for version 2 and later */
	if (__improbable(((advisory->header.interface_type <
	    IF_INTERFACE_ADVISORY_INTERFACE_TYPE_MIN) ||
	    (advisory->header.interface_type >
	    IF_INTERFACE_ADVISORY_INTERFACE_TYPE_MAX)) &&
	    (advisory->header.version >= IF_INTERFACE_ADVISORY_VERSION_2))) {
		SK_ERR("Invalid advisory interface type %d",
		    advisory->header.interface_type);
		return EINVAL;
	}
	return nx_netif_interface_advisory_report(kern_ctx, advisory);
}
3478 
/*
 * Enable or disable interface advisory report generation for the netif
 * behind `nx'.  Accepts either a flowswitch nexus (resolved to the
 * netif it is attached to) or a netif nexus directly; a no-op if the
 * nexus is rejecting/closed or the driver never registered an advisory
 * config callback.
 */
void
nx_netif_config_interface_advisory(struct kern_nexus *nx, bool enable)
{
	struct kern_nexus *nx_netif;
	struct nx_netif *nif;

	if (NX_REJECT_ACT(nx) || (nx->nx_flags & NXF_CLOSED) != 0) {
		return;
	}
	if (NX_PROV(nx)->nxprov_params->nxp_type == NEXUS_TYPE_FLOW_SWITCH) {
		/* resolve the flowswitch down to its attached netif nexus */
		struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
		nx_netif = fsw->fsw_nifna->na_nx;
	} else {
		nx_netif = nx;
	}
	ASSERT(NX_PROV(nx_netif)->nxprov_params->nxp_type == NEXUS_TYPE_NET_IF);
	nif = NX_NETIF_PRIVATE(nx_netif);
	if (nif->nif_intf_adv_config != NULL) {
		nif->nif_intf_adv_config(nif->nif_intf_adv_prov_ctx, enable);
	}
}
3500 
3501 /*
3502  * This function has no use anymore since we are now passing truncated packets
3503  * to filters. We keep this logic just in case we need to prevent certain
3504  * packets from being passed to filters.
3505  */
static boolean_t
packet_is_filterable(struct nexus_netif_adapter *nifna,
    struct __kern_packet *pkt)
{
#pragma unused (nifna, pkt)
	/* currently every packet is considered filterable */
	return TRUE;
}
3513 
3514 /*
3515  * This function is only meant for supporting the RX path because the TX path
3516  * will not send packets > MTU size due to the disabling of TSO when filters
3517  * are enabled.
3518  */
/*
 * Split an RX packet chain into two chains: packets to be handed to
 * interface filters (each converted to a filter packet) and packets
 * that pass through filtering untouched.  Packets that fail conversion
 * are dropped (stats for those are handled inside
 * nx_netif_pkt_to_filter_pkt).
 */
static void
get_filterable_packets(struct nexus_netif_adapter *nifna,
    struct __kern_packet *pkt_chain, struct __kern_packet **fpkt_chain,
    struct __kern_packet **passthrough_chain)
{
	struct nx_netif *nif = nifna->nifna_netif;
	struct netif_stats *nifs = &nif->nif_stats;
	struct __kern_packet *pkt = pkt_chain, *next, *fpkt;
	struct __kern_packet *__single fpkt_head = NULL;
	struct __kern_packet *__single passthrough_head = NULL;
	struct __kern_packet **fpkt_tailp = &fpkt_head;
	struct __kern_packet **passthrough_tailp = &passthrough_head;
	int fcnt = 0, pcnt = 0, dcnt = 0;   /* filtered/passthrough/dropped */

	while (pkt != NULL) {
		/* unlink the packet before appending it to an output chain */
		next = pkt->pkt_nextpkt;
		pkt->pkt_nextpkt = NULL;

		if (!packet_is_filterable(nifna, pkt)) {
			pcnt++;
			*passthrough_tailp = pkt;
			passthrough_tailp = &pkt->pkt_nextpkt;
			pkt = next;
			continue;
		}
		fpkt = nx_netif_pkt_to_filter_pkt(nifna, pkt, NETIF_CONVERT_RX);
		if (fpkt != NULL) {
			fcnt++;
			*fpkt_tailp = fpkt;
			fpkt_tailp = &fpkt->pkt_nextpkt;
		} else {
			/* conversion failed; original packet was consumed */
			dcnt++;
		}
		pkt = next;
	}
	*fpkt_chain = fpkt_head;
	*passthrough_chain = passthrough_head;

	/*
	 * No need to increment drop stats because that's already
	 * done in nx_netif_pkt_to_filter_pkt.
	 */
	STATS_ADD(nifs, NETIF_STATS_FILTER_RX_NOT_FILTERABLE, pcnt);
	DTRACE_SKYWALK6(filterable, struct nexus_netif_adapter *, nifna,
	    int, fcnt, int, pcnt, int, dcnt, struct __kern_packet *,
	    fpkt_head, struct __kern_packet *, passthrough_head);
}
3566 
3567 /*
3568  * This is only used by ring-based notify functions for now.
3569  * When a qset-based notify becomes available, this function can be used
3570  * unmodified.
3571  */
/*
 * Deliver an RX packet chain to the adapter's RX handler, after passing
 * it through the interface-filter and netif-flow stages.  Any packet the
 * filters or flows claim is consumed here; only the remainder reaches
 * na_rx.  On error paths the whole remaining chain is freed and counted
 * in the netif drop stats.
 */
void
netif_receive(struct nexus_netif_adapter *nifna,
    struct __kern_packet *pkt_chain, struct nexus_pkt_stats *stats)
{
	struct nx_netif *nif = nifna->nifna_netif;
	struct nexus_adapter *na = &nifna->nifna_up;
	struct netif_stats *nifs = &nif->nif_stats;
	int err, dropcnt, dropstat = -1;

	/*
	 * Input administratively disabled: still account the packets and
	 * bytes as received on the interface, then drop the entire chain.
	 */
	if ((nif->nif_ifp->if_xflags & IFXF_DISABLE_INPUT) != 0) {
		uint64_t byte_cnt = 0;
		struct __kern_packet *pkt;
		struct ifnet *ifp = nif->nif_ifp;

		dropcnt = 0;
		for (pkt = pkt_chain; pkt != NULL; pkt = pkt->pkt_nextpkt) {
			dropcnt++;
			/* packets backed by an mbuf report the mbuf length */
			byte_cnt += ((pkt->pkt_pflags & PKT_F_MBUF_DATA) != 0) ?
			    m_pktlen(pkt->pkt_mbuf) : pkt->pkt_length;
		}
		os_atomic_add(&ifp->if_data.ifi_ipackets, dropcnt, relaxed);
		os_atomic_add(&ifp->if_data.ifi_ibytes, byte_cnt, relaxed);

		dropstat = NETIF_STATS_DROP_INPUT_DISABLED;
		goto drop;
	}

	/* update our work timestamp */
	na->na_work_ts = net_uptime();

	/*
	 * Interface filters present: split the chain into filterable and
	 * passthrough portions; the filterable part is injected into the
	 * filter path and the passthrough part continues below.
	 */
	if (nif->nif_filter_cnt > 0) {
		struct __kern_packet *__single fpkt_chain = NULL;
		struct __kern_packet *__single passthrough_chain = NULL;

		get_filterable_packets(nifna, pkt_chain, &fpkt_chain,
		    &passthrough_chain);
		if (fpkt_chain != NULL) {
			(void) nx_netif_filter_inject(nifna, NULL, fpkt_chain,
			    NETIF_FILTER_RX | NETIF_FILTER_SOURCE);
		}
		if (passthrough_chain != NULL) {
			pkt_chain = passthrough_chain;
		} else {
			/* everything was handed to the filter path */
			return;
		}
	} else if (!NETIF_IS_LOW_LATENCY(nif) && nx_netif_filter_default_drop != 0) {
		/*
		 * Default drop is meant for dropping packets on interfaces without
		 * interface filters attached. It can be skipped for LLW because it
		 * doesn't have a network stack path.
		 */
		DTRACE_SKYWALK2(rx__default__drop, struct nx_netif *, nif,
		    struct __kern_packet *, pkt_chain);
		dropstat = NETIF_STATS_FILTER_DROP_DEFAULT;
		goto drop;
	}

	/*
	 * netif flows get first claim on the chain; only packets the demux
	 * does not consume (returned in "remain") continue to na_rx.
	 */
	if (nif->nif_flow_cnt > 0) {
		struct __kern_packet *__single remain = NULL;

		err = nx_netif_demux(nifna, pkt_chain, &remain, stats, NETIF_FLOW_SOURCE);
		if (remain == NULL) {
			return;
		}
		pkt_chain = remain;
	}

	if (na->na_rx != NULL) {
		na->na_rx(na, pkt_chain, stats);
	} else {
		/* no RX callback installed; count and drop */
		DTRACE_SKYWALK2(no__rx__cb, struct nx_netif *, nif,
		    struct __kern_packet *, pkt_chain);
		dropstat = NETIF_STATS_DROP_NO_RX_CB;
		goto drop;
	}

	return;

drop:
	dropcnt = 0;
	nx_netif_free_packet_chain(pkt_chain, &dropcnt);
	if (dropstat != -1) {
		STATS_ADD(nifs, dropstat, dropcnt);
	}
	STATS_ADD(nifs, NETIF_STATS_DROP, dropcnt);
}
3658 
/*
 * Token-bucket rate limiter for an RX ring.  Returns the (possibly
 * truncated) end index such that the slots in [begin, end) fit within
 * the configured rate; *rate_limited is set when truncation occurred.
 * Tokens are charged as pkt_length * 8, so "rate" is presumably in
 * bits per second — confirm against callers setting nif_input_rate.
 */
static slot_idx_t
netif_rate_limit(struct __kern_channel_ring *r, uint64_t rate,
    slot_idx_t begin, slot_idx_t end, boolean_t *rate_limited)
{
	uint64_t elapsed;
	uint64_t now;
	struct __kern_packet *pkt;
	clock_sec_t sec;
	clock_usec_t usec;
	slot_idx_t i;

	/* rate == 0 means unlimited; admit the whole range */
	if (__probable(rate == 0)) {
		return end;
	}

	/* init tbr if not so */
	if (__improbable(r->ckr_tbr_token == CKR_TBR_TOKEN_INVALID)) {
		r->ckr_tbr_token = rate;
		r->ckr_tbr_depth = rate;
		r->ckr_tbr_last = mach_absolute_time();
	} else {
		/* refill tokens in proportion to elapsed time, capped at depth */
		now = mach_absolute_time();
		elapsed = now - r->ckr_tbr_last;
		absolutetime_to_microtime(elapsed, &sec, &usec);
		r->ckr_tbr_token +=
		    ((sec * USEC_PER_SEC + usec) * rate / USEC_PER_SEC);
		if (__improbable(r->ckr_tbr_token > r->ckr_tbr_depth)) {
			r->ckr_tbr_token = r->ckr_tbr_depth;
		}
		r->ckr_tbr_last = now;
	}

	/* charge tokens per packet; stop at the first unaffordable slot */
	*rate_limited = FALSE;
	for (i = begin; i != end; i = SLOT_NEXT(i, r->ckr_lim)) {
		pkt = KR_KSD(r, i)->sd_pkt;
		if (__improbable(pkt == NULL)) {
			continue;
		}
		if (__improbable(r->ckr_tbr_token <= 0)) {
			end = i;
			*rate_limited = TRUE;
			break;
		}
		r->ckr_tbr_token -= pkt->pkt_length * 8;
	}

	SK_DF(SK_VERB_FSW | SK_VERB_RX, "ckr %p %s rate limited at %d",
	    r, r->ckr_name, i);

	return end;
}
3710 
3711 SK_NO_INLINE_ATTRIBUTE
3712 static struct __kern_packet *
3713 consume_pkts(struct __kern_channel_ring *ring, slot_idx_t end)
3714 {
3715 	struct __kern_packet *__single pkt_chain = NULL;
3716 	struct __kern_packet **tailp = &pkt_chain;
3717 	slot_idx_t idx = ring->ckr_rhead;
3718 
3719 	while (idx != end) {
3720 		struct __kern_slot_desc *ksd = KR_KSD(ring, idx);
3721 		struct __kern_packet *pkt = ksd->sd_pkt;
3722 
3723 		ASSERT(pkt->pkt_nextpkt == NULL);
3724 		KR_SLOT_DETACH_METADATA(ring, ksd);
3725 		*tailp = pkt;
3726 		tailp = &pkt->pkt_nextpkt;
3727 		idx = SLOT_NEXT(idx, ring->ckr_lim);
3728 	}
3729 	ring->ckr_rhead = end;
3730 	ring->ckr_rtail = ring->ckr_ktail;
3731 	return pkt_chain;
3732 }
3733 
/*
 * RX notify handler used when the netif is plumbed under a flowswitch:
 * syncs the ring with the driver, applies the optional input rate
 * limit, detaches the newly arrived packets and hands them to
 * netif_receive().  Returns 0 or an errno-style error.
 */
int
netif_rx_notify(struct __kern_channel_ring *ring, struct proc *p,
    uint32_t flags)
{
	struct nexus_adapter *hwna;
	struct nexus_netif_adapter *nifna;
	struct nx_netif *nif;
	struct __kern_packet *pkt_chain;
	struct nexus_pkt_stats stats = {0};
	sk_protect_t protect;
	slot_idx_t ktail;
	int err = 0;

	KDBG((SK_KTRACE_NETIF_RX_NOTIFY_DEFAULT | DBG_FUNC_START),
	    SK_KVA(ring));

	ASSERT(ring->ckr_tx == NR_RX);
	ASSERT(!NA_KERNEL_ONLY(KRNA(ring)) || KR_KERNEL_ONLY(ring));

	/* serialize against other users of this ring */
	err = kr_enter(ring, ((flags & NA_NOTEF_CAN_SLEEP) != 0));
	if (err != 0) {
		/* not a serious error, so no need to be chatty here */
		SK_DF(SK_VERB_FSW,
		    "hwna \"%s\" (%p) kr \"%s\" (%p) krflags 0x%x "
		    "(%d)", KRNA(ring)->na_name, SK_KVA(KRNA(ring)),
		    ring->ckr_name, SK_KVA(ring), ring->ckr_flags, err);
		goto out;
	}
	if (__improbable(KR_DROP(ring))) {
		kr_exit(ring);
		err = ENODEV;
		goto out;
	}
	hwna = KRNA(ring);
	nifna = NIFNA(hwna);
	nif = nifna->nifna_netif;
	if (__improbable(hwna->na_ifp == NULL)) {
		kr_exit(ring);
		err = ENODEV;
		goto out;
	}
	/* mark this thread as being in a sync context */
	protect = sk_sync_protect();
	err = ring->ckr_na_sync(ring, p, 0);
	if (err != 0 && err != EAGAIN) {
		goto put_out;
	}

	/* read the tail pointer once */
	ktail = ring->ckr_ktail;
	if (__improbable(ring->ckr_khead == ktail)) {
		SK_DF(SK_VERB_FSW | SK_VERB_NOTIFY | SK_VERB_RX,
		    "how strange, interrupt with no packets on hwna "
		    "\"%s\" (%p)", KRNA(ring)->na_name, SK_KVA(KRNA(ring)));
		goto put_out;
	}
	/* possibly truncate the batch to honor the input rate limit */
	ktail = netif_rate_limit(ring, nif->nif_input_rate, ring->ckr_rhead,
	    ktail, &ring->ckr_rate_limited);

	pkt_chain = consume_pkts(ring, ktail);
	if (pkt_chain != NULL) {
		netif_receive(nifna, pkt_chain, &stats);

		/* feed delivery stats into the interrupt mitigation logic */
		if (ring->ckr_netif_mit_stats != NULL &&
		    stats.nps_pkts != 0 && stats.nps_bytes != 0) {
			ring->ckr_netif_mit_stats(ring, stats.nps_pkts,
			    stats.nps_bytes);
		}
	}

put_out:
	sk_sync_unprotect(protect);
	kr_exit(ring);

out:
	KDBG((SK_KTRACE_NETIF_RX_NOTIFY_DEFAULT | DBG_FUNC_END),
	    SK_KVA(ring), err);
	return err;
}
3812 
3813 /*
3814  * Configure the NA to operate in a particular mode.
3815  */
3816 static channel_ring_notify_t
3817 netif_hwna_get_notify(netif_mode_t mode)
3818 {
3819 	channel_ring_notify_t notify = NULL;
3820 
3821 	if (mode == NETIF_MODE_FSW) {
3822 		notify = netif_rx_notify;
3823 	} else if (mode == NETIF_MODE_LLW) {
3824 		notify = netif_llw_rx_notify;
3825 	}
3826 	return notify;
3827 }
3828 
3829 static uint32_t
3830 netif_mode_to_flag(netif_mode_t mode)
3831 {
3832 	uint32_t flag = 0;
3833 
3834 	if (mode == NETIF_MODE_FSW) {
3835 		flag = NAF_MODE_FSW;
3836 	} else if (mode == NETIF_MODE_LLW) {
3837 		flag = NAF_MODE_LLW;
3838 	}
3839 	return flag;
3840 }
3841 
3842 static void
3843 netif_hwna_config_mode(struct nexus_adapter *hwna, netif_mode_t mode,
3844     void (*rx)(struct nexus_adapter *, struct __kern_packet *,
3845     struct nexus_pkt_stats *), boolean_t set)
3846 {
3847 	uint32_t i;
3848 	uint32_t flag;
3849 
3850 	ASSERT(hwna->na_type == NA_NETIF_DEV ||
3851 	    hwna->na_type == NA_NETIF_COMPAT_DEV);
3852 
3853 	for (i = 0; i < na_get_nrings(hwna, NR_RX); i++) {
3854 		struct __kern_channel_ring *kr = &NAKR(hwna, NR_RX)[i];
3855 		channel_ring_notify_t notify = netif_hwna_get_notify(mode);
3856 
3857 		if (set) {
3858 			kr->ckr_save_notify = kr->ckr_netif_notify;
3859 			kr->ckr_netif_notify = notify;
3860 		} else {
3861 			kr->ckr_netif_notify = kr->ckr_save_notify;
3862 			kr->ckr_save_notify = NULL;
3863 		}
3864 	}
3865 	if (set) {
3866 		hwna->na_rx = rx;
3867 		flag = netif_mode_to_flag(mode);
3868 		os_atomic_or(&hwna->na_flags, flag, relaxed);
3869 	} else {
3870 		hwna->na_rx = NULL;
3871 		os_atomic_andnot(&hwna->na_flags, (NAF_MODE_FSW | NAF_MODE_LLW), relaxed);
3872 	}
3873 }
3874 
3875 void
3876 netif_hwna_set_mode(struct nexus_adapter *hwna, netif_mode_t mode,
3877     void (*rx)(struct nexus_adapter *, struct __kern_packet *,
3878     struct nexus_pkt_stats *))
3879 {
3880 	return netif_hwna_config_mode(hwna, mode, rx, TRUE);
3881 }
3882 
3883 void
3884 netif_hwna_clear_mode(struct nexus_adapter *hwna)
3885 {
3886 	return netif_hwna_config_mode(hwna, NETIF_MODE_NONE, NULL, FALSE);
3887 }
3888 
3889 static void
3890 netif_inject_rx(struct nexus_adapter *na, struct __kern_packet *pkt_chain)
3891 {
3892 	struct nexus_netif_adapter *nifna = NIFNA(na);
3893 	struct nx_netif *nif = nifna->nifna_netif;
3894 	struct netif_stats *nifs = &nif->nif_stats;
3895 	struct __kern_channel_ring *r;
3896 	struct nexus_pkt_stats stats;
3897 	sk_protect_t protect;
3898 	boolean_t ring_drop = FALSE;
3899 	int err, dropcnt;
3900 
3901 	if (!NA_OWNED_BY_FSW(na)) {
3902 		DTRACE_SKYWALK1(fsw__disabled, struct nexus_adapter *, na);
3903 		goto fail;
3904 	}
3905 	ASSERT(na->na_rx != NULL);
3906 
3907 	/*
3908 	 * XXX
3909 	 * This function is called when a filter injects a packet back to the
3910 	 * regular RX path. We can assume the ring is 0 for now because RSS
3911 	 * is not supported. This needs to be revisited when we add support for
3912 	 * RSS.
3913 	 */
3914 	r = &na->na_rx_rings[0];
3915 	ASSERT(r->ckr_tx == NR_RX);
3916 	err = kr_enter(r, TRUE);
3917 	VERIFY(err == 0);
3918 
3919 	if (__improbable(KR_DROP(r))) {
3920 		kr_exit(r);
3921 		DTRACE_SKYWALK2(ring__drop, struct nexus_adapter *, na,
3922 		    struct __kern_channel_ring *, r);
3923 		ring_drop = TRUE;
3924 		goto fail;
3925 	}
3926 	protect = sk_sync_protect();
3927 	na->na_rx(na, pkt_chain, &stats);
3928 
3929 	if (r->ckr_netif_mit_stats != NULL &&
3930 	    stats.nps_pkts != 0 && stats.nps_bytes != 0) {
3931 		r->ckr_netif_mit_stats(r, stats.nps_pkts, stats.nps_bytes);
3932 	}
3933 	sk_sync_unprotect(protect);
3934 
3935 	kr_exit(r);
3936 	return;
3937 
3938 fail:
3939 	dropcnt = 0;
3940 	nx_netif_free_packet_chain(pkt_chain, &dropcnt);
3941 	if (ring_drop) {
3942 		STATS_ADD(nifs, NETIF_STATS_DROP_KRDROP_MODE, dropcnt);
3943 	}
3944 	STATS_ADD(nifs, NETIF_STATS_DROP, dropcnt);
3945 }
3946 
3947 /*
3948  * This is called when an inbound packet has traversed all filters.
3949  */
errno_t
nx_netif_filter_rx_cb(struct nexus_netif_adapter *nifna,
    struct __kern_packet *fpkt_chain, uint32_t flags)
{
#pragma unused (flags)
	struct nx_netif *nif = nifna->nifna_netif;
	struct netif_stats *nifs = &nif->nif_stats;
	struct nexus_adapter *na = &nifna->nifna_up;
	struct __kern_packet *pkt_chain;
	int err;

	/* convert filter packets back into regular RX packets */
	pkt_chain = nx_netif_filter_pkt_to_pkt_chain(nifna,
	    fpkt_chain, NETIF_CONVERT_RX);
	if (pkt_chain == NULL) {
		return ENOMEM;
	}
	/* netif flows get first claim; only the remainder continues */
	if (nif->nif_flow_cnt > 0) {
		struct __kern_packet *__single remain = NULL;

		err = nx_netif_demux(nifna, pkt_chain, &remain,
		    NULL, NETIF_FLOW_INJECT);
		if (remain == NULL) {
			return err;
		}
		pkt_chain = remain;
	}
	if (na->na_rx != NULL) {
		netif_inject_rx(na, pkt_chain);
	} else {
		/* no RX callback installed: free the chain and count drops */
		int dropcnt = 0;
		nx_netif_free_packet_chain(pkt_chain, &dropcnt);
		STATS_ADD(nifs,
		    NETIF_STATS_FILTER_DROP_NO_RX_CB, dropcnt);
		STATS_ADD(nifs, NETIF_STATS_DROP, dropcnt);
	}
	return 0;
}
3987 
3988 /*
3989  * This is called when an outbound packet has traversed all filters.
3990  */
errno_t
nx_netif_filter_tx_cb(struct nexus_netif_adapter *nifna,
    struct __kern_packet *fpkt_chain, uint32_t flags)
{
#pragma unused (flags)
	struct nx_netif *nif = nifna->nifna_netif;
	struct nexus_adapter *na = &nifna->nifna_up;
	int err;

	/*
	 * Compat interfaces enqueue mbufs; native interfaces enqueue
	 * kernel packets.  Either way the filtered chain is converted
	 * back to its TX representation and re-enqueued for the driver.
	 */
	if (NETIF_IS_COMPAT(nif)) {
		struct mbuf *m_chain;
		mbuf_svc_class_t sc;

		m_chain = nx_netif_filter_pkt_to_mbuf_chain(nifna,
		    fpkt_chain, NETIF_CONVERT_TX);
		if (m_chain == NULL) {
			return ENOMEM;
		}
		/*
		 * All packets in the chain have the same service class.
		 * If the sc is missing or invalid, a valid value will be
		 * returned.
		 */
		sc = mbuf_get_service_class(m_chain);
		err = nx_netif_filter_tx_processed_mbuf_enqueue(nifna,
		    sc, m_chain);
	} else {
		struct __kern_packet *pkt_chain;
		kern_packet_svc_class_t sc;

		pkt_chain = nx_netif_filter_pkt_to_pkt_chain(nifna,
		    fpkt_chain, NETIF_CONVERT_TX);
		if (pkt_chain == NULL) {
			return ENOMEM;
		}
		/*
		 * All packets in the chain have the same service class.
		 * If the sc is missing or invalid, a valid value will be
		 * returned.
		 */
		sc = kern_packet_get_service_class(SK_PKT2PH(pkt_chain));
		err = nx_netif_filter_tx_processed_pkt_enqueue(nifna,
		    sc, pkt_chain);
	}
	/* Tell driver to resume dequeuing */
	ifnet_start(na->na_ifp);
	return err;
}
4039 
/*
 * No netif-specific adjustments are needed for virtual-port memory
 * region parameters; this hook exists to satisfy the nexus interface.
 */
void
nx_netif_vp_region_params_adjust(struct nexus_adapter *na,
    struct skmem_region_params *srp)
{
#pragma unused(na, srp)
}
4047 
4048 /* returns true, if starter thread is utilized */
static bool
netif_use_starter_thread(struct ifnet *ifp, uint32_t flags)
{
#if (DEVELOPMENT || DEBUG)
	/* test knob: force every transmit through the starter thread */
	if (__improbable(nx_netif_force_ifnet_start != 0)) {
		ifnet_start(ifp);
		return true;
	}
#endif /* DEVELOPMENT || DEBUG */
	/*
	 * use starter thread in following conditions:
	 * - interface is not skywalk native
	 * - interface attached to virtual driver (ipsec, utun)
	 * - TBR is enabled
	 * - delayed start mechanism is in use
	 * - remaining stack space on the thread is not enough for driver
	 * - caller is in rx workloop context
	 * - caller is from the flowswitch path doing ARP resolving
	 * - caller requires the use of starter thread (stack usage)
	 * - caller requires starter thread for pacing
	 */
	if (!SKYWALK_NATIVE(ifp) || NA(ifp) == NULL ||
	    !NA_IS_ACTIVE(&NA(ifp)->nifna_up) ||
	    ((NA(ifp)->nifna_up.na_flags & NAF_VIRTUAL_DEVICE) != 0) ||
	    IFCQ_TBR_IS_ENABLED(ifp->if_snd) ||
	    (ifp->if_eflags & IFEF_ENQUEUE_MULTI) ||
	    (flags & NETIF_XMIT_FLAG_PACING) != 0 ||
	    sk_is_rx_notify_protected() ||
	    sk_is_async_transmit_protected() ||
	    (sk_is_sync_protected() && (flags & NETIF_XMIT_FLAG_HOST) != 0)) {
		DTRACE_SKYWALK2(use__starter__thread, struct ifnet *, ifp,
		    uint32_t, flags);
		ifnet_start(ifp);
		return true;
	}
	lck_mtx_lock_spin(&ifp->if_start_lock);
	/* interface is flow controlled */
	if (__improbable(ifp->if_start_flags & IFSF_FLOW_CONTROLLED)) {
		lck_mtx_unlock(&ifp->if_start_lock);
		return true;
	}
	/* if starter thread is active, utilize it */
	if (ifp->if_start_active) {
		ifp->if_start_req++;
		lck_mtx_unlock(&ifp->if_start_lock);
		return true;
	}
	lck_mtx_unlock(&ifp->if_start_lock);
	/* Check remaining stack space */
	if ((OSKernelStackRemaining() < NX_NETIF_MIN_DRIVER_STACK_SIZE)) {
		ifnet_start(ifp);
		return true;
	}
	return false;
}
4104 
4105 void
4106 netif_transmit(struct ifnet *ifp, uint32_t flags)
4107 {
4108 	if (netif_use_starter_thread(ifp, flags)) {
4109 		return;
4110 	}
4111 	nx_netif_doorbell_internal(ifp, flags);
4112 }
4113 
4114 static struct ifclassq *
4115 netif_get_default_ifcq(struct nexus_adapter *hwna)
4116 {
4117 	struct nx_netif *nif;
4118 	struct ifclassq *ifcq;
4119 
4120 	nif = NX_NETIF_PRIVATE(hwna->na_nx);
4121 	if (NETIF_LLINK_ENABLED(nif)) {
4122 		struct netif_qset *qset;
4123 
4124 		/*
4125 		 * Use the default ifcq for now.
4126 		 * In the future this could be chosen by the caller.
4127 		 */
4128 		qset = nx_netif_get_default_qset_noref(nif);
4129 		ASSERT(qset != NULL);
4130 		ifcq = qset->nqs_ifcq;
4131 	} else {
4132 		ifcq = nif->nif_ifp->if_snd;
4133 	}
4134 	return ifcq;
4135 }
4136 
/*
 * Dequeue up to pkt_limit/byte_limit worth of packets from "ifcq" (or
 * from the adapter's default ifcq when NULL) into *head, honoring the
 * requested service class when the scheduler model is driver-managed.
 * *pkts_pending reports whether packets remain enqueued afterwards;
 * *pkt_cnt / *bytes (may be NULL) receive the dequeued totals.
 */
static errno_t
netif_deq_packets(struct nexus_adapter *hwna, struct ifclassq *ifcq,
    uint32_t pkt_limit, uint32_t byte_limit, struct __kern_packet **head,
    boolean_t *pkts_pending, kern_packet_svc_class_t sc,
    uint32_t *pkt_cnt, uint32_t *bytes, uint8_t qset_idx)
{
	classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
	struct ifnet *ifp = hwna->na_ifp;
	uint32_t pkts_cnt;
	uint32_t bytes_cnt;
	uint32_t sch_model = ifp->if_output_sched_model;
	mbuf_svc_class_t svc;
	errno_t rc;

	ASSERT(ifp != NULL);
	ASSERT(IFNET_MODEL_IS_VALID(ifp->if_output_sched_model));
	ASSERT((pkt_limit != 0) && (byte_limit != 0));

	if (ifcq == NULL) {
		ifcq = netif_get_default_ifcq(hwna);
	}

	/* per-service-class dequeue only for driver-managed sched models */
	svc = (sch_model & IFNET_SCHED_DRIVER_MANGED_MODELS) ?
	    (mbuf_svc_class_t)sc : MBUF_SC_UNSPEC;
	rc = ifclassq_dequeue(ifcq, svc, pkt_limit, byte_limit, &pkt_head, NULL,
	    pkt_cnt, bytes, qset_idx);

	ASSERT((rc == 0) || (rc == EAGAIN));
	ASSERT((pkt_head.cp_ptype == QP_PACKET) || (pkt_head.cp_kpkt == NULL));

	/* locals (not the output params) hold the residual queue length */
	ifclassq_get_len(ifcq, (mbuf_svc_class_t)sc, qset_idx,
	    &pkts_cnt, &bytes_cnt);
	*pkts_pending = pkts_cnt > 0;

	*head = pkt_head.cp_kpkt;
	return rc;
}
4174 
#if SK_LOG
/* Hoisted out of line to reduce kernel stack footprint */
SK_LOG_ATTRIBUTE
static void
netif_no_ring_space_log(const struct nexus_adapter *na,
    const kern_channel_ring_t ring)
{
	/* log ring indices when a TX refill finds no available slots */
	SK_DF(SK_VERB_SYNC | SK_VERB_TX,
	    "no ring space: na \"%s\" [%u] "
	    "\"%s\"(kh %u kt %u | rh %u rt %u)",
	    na->na_name, ring->ckr_ring_id,
	    ring->ckr_name, ring->ckr_khead,
	    ring->ckr_ktail, ring->ckr_rhead,
	    ring->ckr_rtail);
}
#endif /* SK_LOG */
4191 
4192 /*
4193  * netif refill function for rings
4194  */
4195 errno_t
4196 netif_ring_tx_refill(const kern_channel_ring_t ring, uint32_t pkt_limit,
4197     uint32_t byte_limit, boolean_t tx_doorbell_ctxt, boolean_t *pkts_pending,
4198     boolean_t canblock)
4199 {
4200 	struct nexus_adapter *hwna;
4201 	struct ifnet *ifp;
4202 	struct __kern_packet *__single head = NULL;
4203 	sk_protect_t protect;
4204 	errno_t rc = 0;
4205 	errno_t sync_err = 0;
4206 	uint32_t npkts = 0, consumed = 0;
4207 	uint32_t flags;
4208 	slot_idx_t idx, ktail;
4209 	int ring_space = 0;
4210 
4211 	KDBG((SK_KTRACE_NETIF_RING_TX_REFILL | DBG_FUNC_START), SK_KVA(ring));
4212 
4213 	VERIFY(ring != NULL);
4214 	hwna = KRNA(ring);
4215 	ifp = hwna->na_ifp;
4216 
4217 	ASSERT(hwna->na_type == NA_NETIF_DEV);
4218 	ASSERT(ring->ckr_tx == NR_TX);
4219 	*pkts_pending = FALSE;
4220 
4221 	if (__improbable(pkt_limit == 0 || byte_limit == 0)) {
4222 		SK_ERR("invalid limits plim %d, blim %d",
4223 		    pkt_limit, byte_limit);
4224 		rc = EINVAL;
4225 		goto out;
4226 	}
4227 
4228 	if (__improbable(!ifnet_is_fully_attached(ifp))) {
4229 		SK_ERR("hwna %p ifp %s (%p), interface not attached",
4230 		    SK_KVA(hwna), if_name(ifp), SK_KVA(ifp));
4231 		rc = ENXIO;
4232 		goto out;
4233 	}
4234 
4235 	if (__improbable((ifp->if_start_flags & IFSF_FLOW_CONTROLLED) != 0)) {
4236 		SK_DF(SK_VERB_SYNC | SK_VERB_TX, "hwna %p ifp %s (%p), "
4237 		    "flow control ON", SK_KVA(hwna), if_name(ifp), SK_KVA(ifp));
4238 		rc = ENXIO;
4239 		goto out;
4240 	}
4241 
4242 	/*
4243 	 * if the ring is busy, it means another dequeue is in
4244 	 * progress, so ignore this request and return success.
4245 	 */
4246 	if (kr_enter(ring, canblock) != 0) {
4247 		rc = 0;
4248 		goto out;
4249 	}
4250 	/* mark thread with sync-in-progress flag */
4251 	protect = sk_sync_protect();
4252 
4253 	if (__improbable(KR_DROP(ring) ||
4254 	    !NA_IS_ACTIVE(ring->ckr_na))) {
4255 		SK_ERR("hw-kr %p stopped", SK_KVA(ring));
4256 		rc = ENXIO;
4257 		goto done;
4258 	}
4259 
4260 	idx = ring->ckr_rhead;
4261 	ktail = ring->ckr_ktail;
4262 	/* calculate available space on tx ring */
4263 	ring_space = ktail - idx;
4264 	if (ring_space < 0) {
4265 		ring_space += ring->ckr_num_slots;
4266 	}
4267 	if (ring_space == 0) {
4268 		struct ifclassq *ifcq;
4269 
4270 		/* no space in ring, driver should retry */
4271 #if SK_LOG
4272 		if (__improbable((sk_verbose &
4273 		    (SK_VERB_SYNC | SK_VERB_TX)) != 0)) {
4274 			netif_no_ring_space_log(hwna, ring);
4275 		}
4276 #endif /* SK_LOG */
4277 		ifcq = netif_get_default_ifcq(hwna);
4278 		if (IFCQ_LEN(ifcq) != 0) {
4279 			*pkts_pending = TRUE;
4280 		}
4281 		/*
4282 		 * We ran out of space in ring, most probably
4283 		 * because the driver is slow to drain its TX queue.
4284 		 * We want another doorbell to be generated as soon
4285 		 * as the TX notify completion happens; mark this
4286 		 * through ckr_pending_doorbell counter.  Do this
4287 		 * regardless of whether there's any pending packet.
4288 		 */
4289 		ring->ckr_pending_doorbell++;
4290 		rc = EAGAIN;
4291 		goto sync_ring;
4292 	}
4293 
4294 	if ((uint32_t)ring_space < pkt_limit) {
4295 		pkt_limit = ring_space;
4296 	}
4297 
4298 	if (tx_doorbell_ctxt &&
4299 	    ((hwna->na_flags & NAF_VIRTUAL_DEVICE) == 0)) {
4300 		pkt_limit = MIN(pkt_limit,
4301 		    nx_netif_doorbell_max_dequeue);
4302 	}
4303 
4304 	rc = netif_deq_packets(hwna, NULL, pkt_limit, byte_limit,
4305 	    &head, pkts_pending, ring->ckr_svc, NULL, NULL, 0);
4306 
4307 	/*
4308 	 * There's room in ring; if we haven't dequeued everything,
4309 	 * mark ckr_pending_doorbell for the next TX notify to issue
4310 	 * a TX door bell; otherwise, clear it.  The next packet that
4311 	 * gets enqueued will trigger a door bell again.
4312 	 */
4313 	if (*pkts_pending) {
4314 		ring->ckr_pending_doorbell++;
4315 	} else if (ring->ckr_pending_doorbell != 0) {
4316 		ring->ckr_pending_doorbell = 0;
4317 	}
4318 
4319 	if (rc != 0) {
4320 		/*
4321 		 * This is expected sometimes as the IOSkywalkFamily
4322 		 * errs on the side of caution to perform an extra
4323 		 * dequeue when multiple doorbells are pending;
4324 		 * nothing to dequeue, do a sync if there are slots
4325 		 * to reclaim else just return.
4326 		 */
4327 		SK_DF(SK_VERB_SYNC | SK_VERB_TX,
4328 		    "nothing to dequeue, err %d", rc);
4329 
4330 		if ((uint32_t)ring_space == ring->ckr_lim) {
4331 			goto done;
4332 		} else {
4333 			goto sync_ring;
4334 		}
4335 	}
4336 	/* move the dequeued packets to tx ring */
4337 	while (head != NULL && idx != ktail) {
4338 		ASSERT(npkts <= pkt_limit);
4339 		struct __kern_packet *pkt = head;
4340 		KR_SLOT_ATTACH_METADATA(ring, KR_KSD(ring, idx),
4341 		    (struct __kern_quantum *)pkt);
4342 		npkts++;
4343 		if (__improbable(pkt->pkt_trace_id != 0)) {
4344 			KDBG(SK_KTRACE_PKT_TX_AQM | DBG_FUNC_END, pkt->pkt_trace_id);
4345 			KDBG(SK_KTRACE_PKT_TX_DRV | DBG_FUNC_START, pkt->pkt_trace_id);
4346 		}
4347 		idx = SLOT_NEXT(idx, ring->ckr_lim);
4348 		head = pkt->pkt_nextpkt;
4349 		pkt->pkt_nextpkt = NULL;
4350 	}
4351 
4352 	/*
4353 	 * We checked for ring space earlier so the ring should have enough
4354 	 * space for the entire chain.
4355 	 */
4356 	ASSERT(head == NULL);
4357 	ring->ckr_rhead = idx;
4358 
4359 sync_ring:
4360 	flags = NA_SYNCF_NETIF;
4361 	if (ring->ckr_pending_doorbell != 0) {
4362 		flags |= (NA_SYNCF_NETIF_DOORBELL | NA_SYNCF_NETIF_ASYNC);
4363 	}
4364 
4365 	ring->ckr_khead_pre = ring->ckr_khead;
4366 	sync_err = ring->ckr_na_sync(ring, kernproc, flags);
4367 	if (sync_err != 0 && sync_err != EAGAIN) {
4368 		SK_ERR("unexpected sync err %d", sync_err);
4369 		if (rc == 0) {
4370 			rc = sync_err;
4371 		}
4372 		goto done;
4373 	}
4374 	/*
4375 	 * Verify that the driver has detached packets from the consumed slots.
4376 	 */
4377 	idx = ring->ckr_khead_pre;
4378 	consumed = 0;
4379 	while (idx != ring->ckr_khead) {
4380 		struct __kern_slot_desc *ksd = KR_KSD(ring, idx);
4381 
4382 		consumed++;
4383 		VERIFY(!KSD_VALID_METADATA(ksd));
4384 		idx = SLOT_NEXT(idx, ring->ckr_lim);
4385 	}
4386 	ring->ckr_khead_pre = ring->ckr_khead;
4387 
4388 done:
4389 	sk_sync_unprotect(protect);
4390 	kr_exit(ring);
4391 out:
4392 	KDBG((SK_KTRACE_NETIF_RING_TX_REFILL | DBG_FUNC_END),
4393 	    SK_KVA(ring), rc, 0, npkts);
4394 
4395 	return rc;
4396 }
4397 
/*
 * Exponentially-weighted moving average: folds "new" into "old" with
 * weight 1/2^decay, i.e. avg = avg * (2^decay - 1)/2^decay + new/2^decay,
 * seeding the average with the first sample when "old" is still zero.
 */
#define NQ_EWMA(old, new, decay) do {                               \
	u_int64_t _avg;                                                 \
	if (__probable((_avg = (old)) > 0))                             \
	        _avg = (((_avg << (decay)) - _avg) + (new)) >> (decay); \
	else                                                            \
	        _avg = (new);                                           \
	(old) = _avg;                                                   \
} while (0)
4406 
/*
 * Account a completed packet transfer against the interface counters
 * and (when enabled via sk_netif_queue_stat_enable) the per-queue
 * transfer statistics, maintaining EWMA throughput rates over
 * nq_accumulate_interval-second windows.
 */
void
kern_netif_increment_queue_stats(kern_netif_queue_t queue,
    uint32_t pkt_count, uint32_t byte_count)
{
	struct netif_llink *llink = queue->nq_qset->nqs_llink;
	struct ifnet *ifp = llink->nll_nif->nif_ifp;
	/* direction determines whether input or output counters move */
	if ((queue->nq_flags & NETIF_QUEUE_IS_RX) == 0) {
		os_atomic_add(&ifp->if_data.ifi_opackets, pkt_count, relaxed);
		os_atomic_add(&ifp->if_data.ifi_obytes, byte_count, relaxed);
	} else {
		os_atomic_add(&ifp->if_data.ifi_ipackets, pkt_count, relaxed);
		os_atomic_add(&ifp->if_data.ifi_ibytes, byte_count, relaxed);
	}

	if (ifp->if_data_threshold != 0) {
		ifnet_notify_data_threshold(ifp);
	}

	uint64_t now;
	uint64_t diff_secs;
	struct netif_qstats *stats = &queue->nq_stats;

	/* per-queue stats are optional; bail out early when disabled */
	if (sk_netif_queue_stat_enable == 0) {
		return;
	}

	if (__improbable(pkt_count == 0)) {
		return;
	}

	stats->nq_num_xfers++;
	stats->nq_total_bytes += byte_count;
	stats->nq_total_pkts += pkt_count;
	if (pkt_count > stats->nq_max_pkts) {
		stats->nq_max_pkts = pkt_count;
	}
	if (stats->nq_min_pkts == 0 ||
	    pkt_count < stats->nq_min_pkts) {
		stats->nq_min_pkts = pkt_count;
	}

	now = net_uptime();
	if (__probable(queue->nq_accumulate_start != 0)) {
		diff_secs = now - queue->nq_accumulate_start;
		if (diff_secs >= nq_accumulate_interval) {
			uint64_t        bps;
			uint64_t        pps;
			uint64_t        pps_ma;

			/* bytes per second */
			bps = queue->nq_accumulated_bytes / diff_secs;
			NQ_EWMA(stats->nq_bytes_ps_ma,
			    bps, nq_transfer_decay);
			stats->nq_bytes_ps = bps;

			/* pkts per second */
			pps = queue->nq_accumulated_pkts / diff_secs;
			pps_ma = stats->nq_pkts_ps_ma;
			NQ_EWMA(pps_ma, pps, nq_transfer_decay);
			stats->nq_pkts_ps_ma = (uint32_t)pps_ma;
			stats->nq_pkts_ps = (uint32_t)pps;

			/* start over */
			queue->nq_accumulate_start = now;
			queue->nq_accumulated_bytes = 0;
			queue->nq_accumulated_pkts = 0;

			stats->nq_min_pkts = 0;
			stats->nq_max_pkts = 0;
		}
	} else {
		/* first sample: open the accumulation window */
		queue->nq_accumulate_start = now;
	}
	queue->nq_accumulated_bytes += byte_count;
	queue->nq_accumulated_pkts += pkt_count;
}
4483 
/*
 * Driver-facing RX enqueue for a logical-link queue: stage the chain on
 * the queue's packet queue and, when the FLUSH flag is set, deliver the
 * accumulated packets via netif_receive() under sync protection.
 */
void
kern_netif_queue_rx_enqueue(kern_netif_queue_t queue, kern_packet_t ph_chain,
    uint32_t count, uint32_t flags)
{
#pragma unused (count)
	struct netif_queue *q = queue;
	struct netif_llink *llink = q->nq_qset->nqs_llink;
	struct __kern_packet *pkt_chain = SK_PTR_ADDR_KPKT(ph_chain);
	bool flush = ((flags & KERN_NETIF_QUEUE_RX_ENQUEUE_FLAG_FLUSH) != 0);
	struct pktq *pktq = &q->nq_pktq;
	struct netif_stats *nifs = &llink->nll_nif->nif_stats;
	struct nexus_pkt_stats stats = {0};
	sk_protect_t protect;

	ASSERT((q->nq_flags & NETIF_QUEUE_IS_RX) != 0);
	/* logical link being torn down: drop the chain and count it */
	if (llink->nll_state == NETIF_LLINK_STATE_DESTROYED) {
		int drop_cnt = 0;

		pp_free_packet_chain(pkt_chain, &drop_cnt);
		STATS_ADD(nifs, NETIF_STATS_LLINK_RX_DROP_BAD_STATE, drop_cnt);
		return;
	}
	KPKTQ_ENQUEUE_LIST(pktq, pkt_chain);
	if (flush) {
		/* take ownership of everything staged so far */
		pkt_chain = KPKTQ_FIRST(pktq);
		KPKTQ_INIT(pktq);

		protect = sk_sync_protect();
		netif_receive(NA(llink->nll_nif->nif_ifp), pkt_chain, &stats);
		sk_sync_unprotect(protect);
		kern_netif_increment_queue_stats(queue, (uint32_t)stats.nps_pkts,
		    (uint32_t)stats.nps_bytes);
	}
}
4518 
/*
 * Driver-facing TX dequeue for a logical-link queue: pull up to
 * pkt_limit/byte_limit worth of packets from the queue's AQM into
 * *ph_chain, updating per-queue stats.  *pending reports whether
 * packets remain enqueued.  Returns ENXIO if the llink is destroyed,
 * otherwise the netif_deq_packets() result.
 */
errno_t
kern_netif_queue_tx_dequeue(kern_netif_queue_t queue, uint32_t pkt_limit,
    uint32_t byte_limit, boolean_t *pending, kern_packet_t *ph_chain)
{
	struct netif_queue *q = queue;
	struct netif_llink *llink = q->nq_qset->nqs_llink;
	struct netif_stats *nifs = &llink->nll_nif->nif_stats;
	struct nexus_adapter *hwna;
	struct __kern_packet *__single pkt_chain = NULL;
	uint32_t bytes = 0, pkt_cnt = 0;
	errno_t rc;

	ASSERT((q->nq_flags & NETIF_QUEUE_IS_RX) == 0);
	if (llink->nll_state == NETIF_LLINK_STATE_DESTROYED) {
		STATS_INC(nifs, NETIF_STATS_LLINK_AQM_DEQ_BAD_STATE);
		return ENXIO;
	}
	hwna = &NA(llink->nll_nif->nif_ifp)->nifna_up;

	/* cap TX-notify-context dequeues for real hardware drivers */
	if (((hwna->na_flags & NAF_VIRTUAL_DEVICE) == 0) &&
	    sk_is_tx_notify_protected()) {
		pkt_limit = MIN(pkt_limit, nx_netif_doorbell_max_dequeue);
	}
	rc = netif_deq_packets(hwna, q->nq_qset->nqs_ifcq, pkt_limit,
	    byte_limit, &pkt_chain, pending, q->nq_svc, &pkt_cnt, &bytes,
	    q->nq_qset->nqs_idx);

	if (pkt_cnt > 0) {
		kern_netif_increment_queue_stats(queue, pkt_cnt, bytes);
	}
	if (pkt_chain != NULL) {
		*ph_chain = SK_PKT2PH(pkt_chain);
	}
	return rc;
}
4554 
/*
 * Report the number of packets and bytes currently queued in the AQM
 * for the given qset and service class.
 */
errno_t
kern_netif_qset_tx_queue_len(kern_netif_qset_t qset, uint32_t svc,
    uint32_t * pkts_cnt, uint32_t * bytes_cnt)
{
	VERIFY(qset != NULL);
	VERIFY(pkts_cnt != NULL);
	VERIFY(bytes_cnt != NULL);

	return ifclassq_get_len(qset->nqs_ifcq, svc, qset->nqs_idx, pkts_cnt,
	           bytes_cnt);
}
4566 
4567 errno_t
4568 kern_nexus_netif_llink_add(struct kern_nexus *nx,
4569     struct kern_nexus_netif_llink_init *llink_init)
4570 {
4571 	errno_t err;
4572 	struct nx_netif *nif;
4573 	struct netif_llink *__single llink;
4574 	struct netif_stats *nifs;
4575 
4576 	VERIFY(nx != NULL);
4577 	VERIFY(llink_init != NULL);
4578 	VERIFY((nx->nx_flags & NXF_ATTACHED) != 0);
4579 
4580 	nif = NX_NETIF_PRIVATE(nx);
4581 	nifs = &nif->nif_stats;
4582 
4583 	err = nx_netif_validate_llink_config(llink_init, false);
4584 	if (err != 0) {
4585 		SK_ERR("Invalid llink init params");
4586 		STATS_INC(nifs, NETIF_STATS_LLINK_ADD_BAD_PARAMS);
4587 		return err;
4588 	}
4589 
4590 	err = nx_netif_llink_add(nif, llink_init, &llink);
4591 	return err;
4592 }
4593 
4594 errno_t
4595 kern_nexus_netif_llink_remove(struct kern_nexus *nx,
4596     kern_nexus_netif_llink_id_t llink_id)
4597 {
4598 	struct nx_netif *nif;
4599 
4600 	VERIFY(nx != NULL);
4601 	VERIFY((nx->nx_flags & NXF_ATTACHED) != 0);
4602 
4603 	nif = NX_NETIF_PRIVATE(nx);
4604 	return nx_netif_llink_remove(nif, llink_id);
4605 }
4606 
/*
 * Return the service class the given netif queue was configured with.
 * Always succeeds.
 */
errno_t
kern_netif_queue_get_service_class(kern_netif_queue_t queue,
    kern_packet_svc_class_t *svc)
{
	*svc = queue->nq_svc;
	return 0;
}
4614