1 /*
2 * Copyright (c) 2015-2023 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 /*
30 * The netif nexus domain has two domain providers: native and compat, with
31 * the latter being the default provider of this domain. The compat provider
32 * has special handlers for NXCFG_CMD_ATTACH and NXCFG_CMD_DETACH, etc.
33 *
34 * A netif nexus instance can be in a native or compat mode; in either case,
35 * it is associated with two instances of a nexus_adapter structure, and allows
 * at most two channels opened to the nexus. The two adapters correspond to
37 * host and device ports, respectively.
38 *
39 * By itself, a netif nexus isn't associated with a network interface. The
40 * association happens by attaching a network interface to the nexus instance.
41 * A channel can only be successfully opened to a netif nexus after it has an
42 * interface attached to it.
43 *
44 * During an attach, the interface is marked as Skywalk-capable, and its ifnet
45 * structure refers to the attached netif nexus adapter via its if_na field.
46 * The nexus also holds a reference to the interface on its na_ifp field. Note
47 * that attaching to a netif_compat nexus does not alter the input/output data
48 * path, nor does it remove any of the interface's hardware offload flags. It
49 * merely associates the interface and netif nexus together.
50 *
51 * During a detach, the above references are dropped and the fields are cleared;
52 * the interface is also marked as non-Skywalk-capable. This detach can happen
53 * explicitly via a command down the nexus, or implicitly when the nexus goes
54 * away (assuming there's no channel opened to it.)
55 *
56 * A userland channel can be opened to a netif nexus via the usual ch_open()
57 * way, assuming the nexus provider is setup to allow access for the userland
58 * process (either by binding the nexus port to PID, etc. or by creating the
59 * nexus in the anonymous mode.)
60 *
61 * Alternatively, a kernel channel can also be opened to it by some kernel
62 * subsystem, via ch_open_special(), e.g. by the flowswitch. Kernel channels
63 * don't have any task mapping created, and the flag CHANF_KERNEL is used to
64 * indicate that.
65 *
66 * Opening a channel to the host port of a native or compat netif causes the
67 * ifnet output path to be redirected to nx_netif_host_transmit(). We also,
68 * at present, disable any hardware offload features.
69 *
70 * Opening a channel to the device port of a compat netif causes the ifnet
71 * input path to be redirected to nx_netif_compat_receive(). This is specific
72 * to the compat variant, as the native variant's RX path already goes to
73 * the native netif.
74 *
75 * During channel close, we restore the original I/O callbacks, as well as the
76 * interface's offload flags.
77 */
78
79 #include <skywalk/os_skywalk_private.h>
80 #include <skywalk/nexus/netif/nx_netif.h>
81 #include <skywalk/nexus/upipe/nx_user_pipe.h>
82 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
83 #include <sys/kdebug.h>
84 #include <sys/sdt.h>
85 #include <os/refcnt.h>
86 #include <libkern/OSDebug.h>
87
/*
 * Limits and defaults for the netif nexus domain; these feed the
 * default/min/max parameter bounds in nx_netif_dom_s.
 */
#define NX_NETIF_MAXRINGS NX_MAX_NUM_RING_PAIR /* max # of TX or RX rings */
#define NX_NETIF_MINSLOTS 2 /* min # of slots per ring */
#define NX_NETIF_MAXSLOTS NX_MAX_NUM_SLOT_PER_RING /* max # of slots */
#define NX_NETIF_TXRINGSIZE 512 /* default TX ring size */
#define NX_NETIF_RXRINGSIZE 1024 /* default RX ring size */
#define NX_NETIF_BUFSIZE (2 * 1024) /* default buffer size */
#define NX_NETIF_MINBUFSIZE (128) /* min buffer size */
#define NX_NETIF_MAXBUFSIZE (32 * 1024) /* max buffer size */

/*
 * TODO: [email protected] -- minimum buflets for now; we will need to
 * have a way to adjust this based on the underlying interface's
 * parameters, e.g. jumbo MTU, large segment offload, etc.
 */
#define NX_NETIF_UMD_SIZE _USER_PACKET_SIZE(BUFLETS_MIN)
#define NX_NETIF_KMD_SIZE _KERN_PACKET_SIZE(BUFLETS_MIN)

/*
 * Minimum stack space required for IOSkywalkFamily and driver execution:
 * half of kernel_stack_size on macOS, a quarter of it on other targets.
 */
#if XNU_TARGET_OS_OSX
#define NX_NETIF_MIN_DRIVER_STACK_SIZE (kernel_stack_size >> 1)
#else /* !XNU_TARGET_OS_OSX */
#define NX_NETIF_MIN_DRIVER_STACK_SIZE (kernel_stack_size >> 2)
#endif /* XNU_TARGET_OS_OSX */
113
/* nexus domain lifecycle callbacks (installed in nx_netif_dom_s) */
static void nx_netif_dom_init(struct nxdom *);
static void nx_netif_dom_terminate(struct nxdom *);
static void nx_netif_dom_fini(struct nxdom *);
/* per-provider parameter adjustment, handed to nxprov_params_adjust() */
static int nx_netif_prov_params_adjust(
	const struct kern_nexus_domain_provider *, const struct nxprov_params *,
	struct nxprov_adjusted_params *);

/* nexus domain port/channel callbacks (installed in nx_netif_dom_s) */
static int nx_netif_dom_bind_port(struct kern_nexus *, nexus_port_t *,
    struct nxbind *, void *);
static int nx_netif_dom_unbind_port(struct kern_nexus *, nexus_port_t);
static int nx_netif_dom_connect(struct kern_nexus_domain_provider *,
    struct kern_nexus *, struct kern_channel *, struct chreq *,
    struct kern_channel *, struct nxbind *, struct proc *);
static void nx_netif_dom_disconnect(struct kern_nexus_domain_provider *,
    struct kern_nexus *, struct kern_channel *);
static void nx_netif_dom_defunct(struct kern_nexus_domain_provider *,
    struct kern_nexus *, struct kern_channel *, struct proc *);
static void nx_netif_dom_defunct_finalize(struct kern_nexus_domain_provider *,
    struct kern_nexus *, struct kern_channel *, boolean_t);

/* nexus adapter (na) and ring callbacks */
static void nx_netif_doorbell(struct ifnet *);
static int nx_netif_na_txsync(struct __kern_channel_ring *, struct proc *,
    uint32_t);
static int nx_netif_na_rxsync(struct __kern_channel_ring *, struct proc *,
    uint32_t);
static void nx_netif_na_dtor(struct nexus_adapter *na);
static int nx_netif_na_notify_tx(struct __kern_channel_ring *, struct proc *,
    uint32_t);
static int nx_netif_na_notify_rx(struct __kern_channel_ring *, struct proc *,
    uint32_t);
static int nx_netif_na_activate(struct nexus_adapter *, na_activate_mode_t);

/* nexus control commands (attach/detach, etc.) and attach-time helpers */
static int nx_netif_ctl(struct kern_nexus *, nxcfg_cmd_t, void *,
    struct proc *);
static int nx_netif_ctl_attach(struct kern_nexus *, struct nx_spec_req *,
    struct proc *);
static int nx_netif_ctl_detach(struct kern_nexus *, struct nx_spec_req *);
static int nx_netif_attach(struct kern_nexus *, struct ifnet *);
static void nx_netif_flags_init(struct nx_netif *);
static void nx_netif_flags_fini(struct nx_netif *);
static void nx_netif_callbacks_init(struct nx_netif *);
static void nx_netif_callbacks_fini(struct nx_netif *);
static void nx_netif_capabilities_fini(struct nx_netif *);
static errno_t nx_netif_interface_advisory_notify(void *,
    const struct ifnet_interface_advisory *);
159
/*
 * The netif nexus domain descriptor.  Each parameter group below gives the
 * default (nb_def), minimum (nb_min) and maximum (nb_max) value for the
 * domain; the trailing members are the domain's callbacks.
 */
struct nxdom nx_netif_dom_s = {
	.nxdom_prov_head =
    STAILQ_HEAD_INITIALIZER(nx_netif_dom_s.nxdom_prov_head),
	.nxdom_type = NEXUS_TYPE_NET_IF,
	.nxdom_md_type = NEXUS_META_TYPE_PACKET,
	.nxdom_md_subtype = NEXUS_META_SUBTYPE_RAW,
	.nxdom_name = "netif",
	.nxdom_ports = {
		.nb_def = 2,
		.nb_min = 2,
		.nb_max = NX_NETIF_MAXPORTS,
	},
	.nxdom_tx_rings = {
		.nb_def = 1,
		.nb_min = 1,
		.nb_max = NX_NETIF_MAXRINGS,
	},
	.nxdom_rx_rings = {
		.nb_def = 1,
		.nb_min = 1,
		.nb_max = NX_NETIF_MAXRINGS,
	},
	.nxdom_tx_slots = {
		.nb_def = NX_NETIF_TXRINGSIZE,
		.nb_min = NX_NETIF_MINSLOTS,
		.nb_max = NX_NETIF_MAXSLOTS,
	},
	.nxdom_rx_slots = {
		.nb_def = NX_NETIF_RXRINGSIZE,
		.nb_min = NX_NETIF_MINSLOTS,
		.nb_max = NX_NETIF_MAXSLOTS,
	},
	.nxdom_buf_size = {
		.nb_def = NX_NETIF_BUFSIZE,
		.nb_min = NX_NETIF_MINBUFSIZE,
		.nb_max = NX_NETIF_MAXBUFSIZE,
	},
	/* all bounds are zero for the large buffer size */
	.nxdom_large_buf_size = {
		.nb_def = 0,
		.nb_min = 0,
		.nb_max = 0,
	},
	.nxdom_meta_size = {
		.nb_def = NX_NETIF_UMD_SIZE,
		.nb_min = NX_NETIF_UMD_SIZE,
		.nb_max = NX_METADATA_USR_MAX_SZ,
	},
	.nxdom_stats_size = {
		.nb_def = 0,
		.nb_min = 0,
		.nb_max = NX_STATS_MAX_SZ,
	},
	.nxdom_pipes = {
		.nb_def = 0,
		.nb_min = 0,
		.nb_max = NX_UPIPE_MAXPIPES,
	},
	.nxdom_flowadv_max = {
		.nb_def = 0,
		.nb_min = 0,
		.nb_max = NX_FLOWADV_MAX,
	},
	.nxdom_nexusadv_size = {
		.nb_def = 0,
		.nb_min = 0,
		.nb_max = NX_NEXUSADV_MAX_SZ,
	},
	.nxdom_capabilities = {
		.nb_def = NXPCAP_USER_CHANNEL,
		.nb_min = 0,
		.nb_max = NXPCAP_USER_CHANNEL,
	},
	.nxdom_qmap = {
		.nb_def = NEXUS_QMAP_TYPE_DEFAULT,
		.nb_min = NEXUS_QMAP_TYPE_DEFAULT,
		.nb_max = NEXUS_QMAP_TYPE_WMM,
	},
	.nxdom_max_frags = {
		.nb_def = NX_PBUF_FRAGS_DEFAULT,
		.nb_min = NX_PBUF_FRAGS_MIN,
		.nb_max = NX_PBUF_FRAGS_MAX,
	},
	/* domain callbacks; find_port and port_is_reserved are not provided */
	.nxdom_init = nx_netif_dom_init,
	.nxdom_terminate = nx_netif_dom_terminate,
	.nxdom_fini = nx_netif_dom_fini,
	.nxdom_find_port = NULL,
	.nxdom_port_is_reserved = NULL,
	.nxdom_bind_port = nx_netif_dom_bind_port,
	.nxdom_unbind_port = nx_netif_dom_unbind_port,
	.nxdom_connect = nx_netif_dom_connect,
	.nxdom_disconnect = nx_netif_dom_disconnect,
	.nxdom_defunct = nx_netif_dom_defunct,
	.nxdom_defunct_finalize = nx_netif_dom_defunct_finalize,
};
254
/*
 * The native netif domain provider; it is added to the netif domain by
 * nx_netif_dom_init().
 */
struct kern_nexus_domain_provider nx_netif_prov_s = {
	.nxdom_prov_name = NEXUS_PROVIDER_NET_IF,
	/*
	 * Don't install this as the default domain provider, i.e.
	 * NXDOMPROVF_DEFAULT flag not set; we want netif_compat
	 * provider to be the one handling userland-issued requests
	 * coming down thru nxprov_create() instead.
	 */
	.nxdom_prov_flags = 0,
	.nxdom_prov_cb = {
		.dp_cb_init = nx_netif_prov_init,
		.dp_cb_fini = nx_netif_prov_fini,
		.dp_cb_params = nx_netif_prov_params,
		.dp_cb_mem_new = nx_netif_prov_mem_new,
		.dp_cb_config = nx_netif_prov_config,
		.dp_cb_nx_ctor = nx_netif_prov_nx_ctor,
		.dp_cb_nx_dtor = nx_netif_prov_nx_dtor,
		.dp_cb_nx_mem_info = nx_netif_prov_nx_mem_info,
		.dp_cb_nx_mib_get = nx_netif_prov_nx_mib_get,
		.dp_cb_nx_stop = nx_netif_prov_nx_stop,
	},
};
277
/*
 * ifnet-facing operations table for the netif adapter (finalize, reap,
 * TX dequeue and TX length query).
 */
struct nexus_ifnet_ops na_netif_ops = {
	.ni_finalize = na_netif_finalize,
	.ni_reap = nx_netif_reap,
	.ni_dequeue = nx_netif_native_tx_dequeue,
	.ni_get_len = nx_netif_native_tx_get_len,
};
284
/* max packets to dequeue in doorbell context (default; sysctl-tunable) */
#define NX_NETIF_DOORBELL_MAX_DEQUEUE 64
uint32_t nx_netif_doorbell_max_dequeue = NX_NETIF_DOORBELL_MAX_DEQUEUE;

#define NQ_TRANSFER_DECAY 2 /* ilog2 of EWMA decay rate (4) */
static uint32_t nq_transfer_decay = NQ_TRANSFER_DECAY;

#define NQ_ACCUMULATE_INTERVAL 2 /* 2 seconds */
static uint32_t nq_accumulate_interval = NQ_ACCUMULATE_INTERVAL;

/* netif queue stats collection switch; exported via sysctl on all builds */
static uint32_t nq_stat_enable = 0;

SYSCTL_EXTENSIBLE_NODE(_kern_skywalk, OID_AUTO, netif,
    CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Skywalk network interface");
/* the knobs below are only exported on DEVELOPMENT or DEBUG kernels */
#if (DEVELOPMENT || DEBUG)
SYSCTL_STRING(_kern_skywalk_netif, OID_AUTO, sk_ll_prefix,
    CTLFLAG_RW | CTLFLAG_LOCKED, sk_ll_prefix, sizeof(sk_ll_prefix),
    "ifname prefix for enabling low latency support");
static uint32_t nx_netif_force_ifnet_start = 0;
SYSCTL_UINT(_kern_skywalk_netif, OID_AUTO, force_ifnet_start,
    CTLFLAG_RW | CTLFLAG_LOCKED, &nx_netif_force_ifnet_start, 0,
    "always use ifnet starter thread");
SYSCTL_UINT(_kern_skywalk_netif, OID_AUTO, doorbell_max_dequeue,
    CTLFLAG_RW | CTLFLAG_LOCKED, &nx_netif_doorbell_max_dequeue,
    NX_NETIF_DOORBELL_MAX_DEQUEUE,
    "max packets to dequeue in doorbell context");
SYSCTL_UINT(_kern_skywalk_netif, OID_AUTO, netif_queue_transfer_decay,
    CTLFLAG_RW | CTLFLAG_LOCKED, &nq_transfer_decay,
    NQ_TRANSFER_DECAY, "ilog2 of EWMA decay rate of netif queue transfers");
SYSCTL_UINT(_kern_skywalk_netif, OID_AUTO, netif_queue_stat_accumulate_interval,
    CTLFLAG_RW | CTLFLAG_LOCKED, &nq_accumulate_interval,
    NQ_ACCUMULATE_INTERVAL, "accumulation interval for netif queue stats");
#endif /* DEVELOPMENT || DEBUG */

SYSCTL_UINT(_kern_skywalk_netif, OID_AUTO, netif_queue_stat_enable,
    CTLFLAG_RW | CTLFLAG_LOCKED, &nq_stat_enable,
    0, "enable/disable stats collection for netif queue");
321
/* typed allocation zones for the netif adapter and the netif instance */
static SKMEM_TYPE_DEFINE(na_netif_zone, struct nexus_netif_adapter);

static SKMEM_TYPE_DEFINE(nx_netif_zone, struct nx_netif);

/*
 * Allocation tags.  The non-static tags are visible to other translation
 * units; the static ones are private to this file.
 */
#define SKMEM_TAG_NETIF_MIT "com.apple.skywalk.netif.mit"
static SKMEM_TAG_DEFINE(skmem_tag_netif_mit, SKMEM_TAG_NETIF_MIT);

#define SKMEM_TAG_NETIF_FILTER "com.apple.skywalk.netif.filter"
SKMEM_TAG_DEFINE(skmem_tag_netif_filter, SKMEM_TAG_NETIF_FILTER);

#define SKMEM_TAG_NETIF_FLOW "com.apple.skywalk.netif.flow"
SKMEM_TAG_DEFINE(skmem_tag_netif_flow, SKMEM_TAG_NETIF_FLOW);

#define SKMEM_TAG_NETIF_AGENT_FLOW "com.apple.skywalk.netif.agent_flow"
SKMEM_TAG_DEFINE(skmem_tag_netif_agent_flow, SKMEM_TAG_NETIF_AGENT_FLOW);

#define SKMEM_TAG_NETIF_LLINK "com.apple.skywalk.netif.llink"
SKMEM_TAG_DEFINE(skmem_tag_netif_llink, SKMEM_TAG_NETIF_LLINK);

#define SKMEM_TAG_NETIF_QSET "com.apple.skywalk.netif.qset"
SKMEM_TAG_DEFINE(skmem_tag_netif_qset, SKMEM_TAG_NETIF_QSET);

#define SKMEM_TAG_NETIF_LLINK_INFO "com.apple.skywalk.netif.llink_info"
SKMEM_TAG_DEFINE(skmem_tag_netif_llink_info, SKMEM_TAG_NETIF_LLINK_INFO);

/* use this for any temporary allocations */
#define SKMEM_TAG_NETIF_TEMP "com.apple.skywalk.netif.temp"
static SKMEM_TAG_DEFINE(skmem_tag_netif_temp, SKMEM_TAG_NETIF_TEMP);
350
351 static void
nx_netif_dom_init(struct nxdom * nxdom)352 nx_netif_dom_init(struct nxdom *nxdom)
353 {
354 SK_LOCK_ASSERT_HELD();
355 ASSERT(!(nxdom->nxdom_flags & NEXUSDOMF_INITIALIZED));
356
357 _CASSERT(NEXUS_PORT_NET_IF_DEV == 0);
358 _CASSERT(NEXUS_PORT_NET_IF_HOST == 1);
359 _CASSERT(NEXUS_PORT_NET_IF_CLIENT == 2);
360 _CASSERT(SK_NETIF_MIT_FORCE_OFF < SK_NETIF_MIT_FORCE_SIMPLE);
361 _CASSERT(SK_NETIF_MIT_FORCE_SIMPLE < SK_NETIF_MIT_FORCE_ADVANCED);
362 _CASSERT(SK_NETIF_MIT_FORCE_ADVANCED < SK_NETIF_MIT_AUTO);
363 _CASSERT(SK_NETIF_MIT_AUTO == SK_NETIF_MIT_MAX);
364
365 (void) nxdom_prov_add(nxdom, &nx_netif_prov_s);
366
367 nx_netif_compat_init(nxdom);
368
369 ASSERT(nxdom_prov_default[nxdom->nxdom_type] != NULL &&
370 strbufcmp(nxdom_prov_default[nxdom->nxdom_type]->nxdom_prov_name,
371 NEXUS_PROVIDER_NET_IF_COMPAT) == 0);
372
373 netif_gso_init();
374 }
375
376 static void
nx_netif_dom_terminate(struct nxdom * nxdom)377 nx_netif_dom_terminate(struct nxdom *nxdom)
378 {
379 struct kern_nexus_domain_provider *nxdom_prov, *tnxdp;
380
381 SK_LOCK_ASSERT_HELD();
382
383 netif_gso_fini();
384 nx_netif_compat_fini();
385
386 STAILQ_FOREACH_SAFE(nxdom_prov, &nxdom->nxdom_prov_head,
387 nxdom_prov_link, tnxdp) {
388 (void) nxdom_prov_del(nxdom_prov);
389 }
390 }
391
/*
 * Domain fini callback (nx_netif_dom_s.nxdom_fini); nothing to do.
 */
static void
nx_netif_dom_fini(struct nxdom *nxdom)
{
	(void)nxdom;
}
397
398 int
nx_netif_prov_init(struct kern_nexus_domain_provider * nxdom_prov)399 nx_netif_prov_init(struct kern_nexus_domain_provider *nxdom_prov)
400 {
401 #pragma unused(nxdom_prov)
402 SK_D("initializing %s", nxdom_prov->nxdom_prov_name);
403 return 0;
404 }
405
406 static int
nx_netif_na_notify_drop(struct __kern_channel_ring * kring,struct proc * p,uint32_t flags)407 nx_netif_na_notify_drop(struct __kern_channel_ring *kring, struct proc *p,
408 uint32_t flags)
409 {
410 #pragma unused(kring, p, flags)
411 return ENXIO;
412 }
413
414 int
nx_netif_prov_nx_stop(struct kern_nexus * nx)415 nx_netif_prov_nx_stop(struct kern_nexus *nx)
416 {
417 uint32_t r;
418 struct nexus_adapter *na = nx_port_get_na(nx, NEXUS_PORT_NET_IF_DEV);
419 struct nexus_netif_adapter *nifna = NIFNA(na);
420
421 SK_LOCK_ASSERT_HELD();
422 ASSERT(nx != NULL);
423
424 /* place all rings in drop mode */
425 na_kr_drop(na, TRUE);
426
427 /* ensure global visibility */
428 os_atomic_thread_fence(seq_cst);
429
430 /* reset all TX notify callbacks */
431 for (r = 0; r < na_get_nrings(na, NR_TX); r++) {
432 while (!os_atomic_cmpxchg((void * volatile *)&na->na_tx_rings[r].ckr_na_notify,
433 ptrauth_nop_cast(void *__single, na->na_tx_rings[r].ckr_na_notify),
434 ptrauth_nop_cast(void *__single, &nx_netif_na_notify_drop), acq_rel)) {
435 ;
436 }
437 os_atomic_thread_fence(seq_cst);
438 if (nifna->nifna_tx_mit != NULL) {
439 nx_netif_mit_cleanup(&nifna->nifna_tx_mit[r]);
440 }
441 }
442 if (nifna->nifna_tx_mit != NULL) {
443 skn_free_type_array_counted_by(tx, struct nx_netif_mit,
444 nifna->nifna_tx_mit_count, nifna->nifna_tx_mit);
445 }
446
447 /* reset all RX notify callbacks */
448 for (r = 0; r < na_get_nrings(na, NR_RX); r++) {
449 while (!os_atomic_cmpxchg((void * volatile *)&na->na_rx_rings[r].ckr_na_notify,
450 ptrauth_nop_cast(void *__single, na->na_rx_rings[r].ckr_na_notify),
451 ptrauth_nop_cast(void *__single, &nx_netif_na_notify_drop), acq_rel)) {
452 ;
453 }
454 os_atomic_thread_fence(seq_cst);
455 if (nifna->nifna_rx_mit != NULL) {
456 nx_netif_mit_cleanup(&nifna->nifna_rx_mit[r]);
457 }
458 }
459 if (nifna->nifna_rx_mit != NULL) {
460 skn_free_type_array_counted_by(rx, struct nx_netif_mit,
461 nifna->nifna_rx_mit_count, nifna->nifna_rx_mit);
462 }
463 return 0;
464 }
465
466 static inline void
nx_netif_compat_adjust_ring_size(struct nxprov_adjusted_params * adj,ifnet_t ifp)467 nx_netif_compat_adjust_ring_size(struct nxprov_adjusted_params *adj,
468 ifnet_t ifp)
469 {
470 const char *ifname;
471
472 ifname = __terminated_by_to_indexable(ifp->if_name);
473 if (IFNET_IS_CELLULAR(ifp) && (ifp->if_unit != 0)) {
474 *(adj->adj_rx_slots) = sk_netif_compat_aux_cell_rx_ring_sz;
475 *(adj->adj_tx_slots) = sk_netif_compat_aux_cell_tx_ring_sz;
476 } else if (IFNET_IS_WIFI(ifp)) {
477 if (ifname[0] == 'a' && ifname[1] == 'p' &&
478 ifname[2] == '\0') {
479 /* Wi-Fi Access Point */
480 *(adj->adj_rx_slots) = sk_netif_compat_wap_rx_ring_sz;
481 *(adj->adj_tx_slots) = sk_netif_compat_wap_tx_ring_sz;
482 } else if (ifp->if_eflags & IFEF_AWDL) {
483 /* AWDL */
484 *(adj->adj_rx_slots) = sk_netif_compat_awdl_rx_ring_sz;
485 *(adj->adj_tx_slots) = sk_netif_compat_awdl_tx_ring_sz;
486 } else {
487 /* Wi-Fi infrastructure */
488 *(adj->adj_rx_slots) = sk_netif_compat_wif_rx_ring_sz;
489 *(adj->adj_tx_slots) = sk_netif_compat_wif_tx_ring_sz;
490 }
491 } else if (IFNET_IS_ETHERNET(ifp)) {
492 #if !XNU_TARGET_OS_OSX
493 /*
494 * On non-macOS platforms, treat all compat Ethernet
495 * interfaces as USB Ethernet with reduced ring sizes.
496 */
497 *(adj->adj_rx_slots) = sk_netif_compat_usb_eth_rx_ring_sz;
498 *(adj->adj_tx_slots) = sk_netif_compat_usb_eth_tx_ring_sz;
499 #else /* XNU_TARGET_OS_OSX */
500 if (ifp->if_subfamily == IFNET_SUBFAMILY_USB) {
501 *(adj->adj_rx_slots) =
502 sk_netif_compat_usb_eth_rx_ring_sz;
503 *(adj->adj_tx_slots) =
504 sk_netif_compat_usb_eth_tx_ring_sz;
505 }
506 #endif /* XNU_TARGET_OS_OSX */
507 }
508 }
509
510 static int
nx_netif_prov_params_adjust(const struct kern_nexus_domain_provider * nxdom_prov,const struct nxprov_params * nxp,struct nxprov_adjusted_params * adj)511 nx_netif_prov_params_adjust(const struct kern_nexus_domain_provider *nxdom_prov,
512 const struct nxprov_params *nxp, struct nxprov_adjusted_params *adj)
513 {
514 /*
515 * for netif compat adjust the following parameters for memory
516 * optimization:
517 * - change the size of buffer object to 128 bytes.
518 * - don't allocate rx ring for host port and tx ring for dev port.
519 * - for cellular interfaces other than pdp_ip0 reduce the ring size.
520 * Assumption here is that pdp_ip0 is always used as the data
521 * interface.
522 * - reduce the ring size for AWDL interface.
523 * - reduce the ring size for USB ethernet interface.
524 */
525 if (strbufcmp(nxdom_prov->nxdom_prov_name,
526 NEXUS_PROVIDER_NET_IF_COMPAT) == 0) {
527 /*
528 * Leave the parameters default if userspace access may be
529 * needed. We can't use skywalk_direct_allowed() here because
530 * the drivers have not attached yet.
531 */
532 if (skywalk_netif_direct_enabled()) {
533 goto done;
534 }
535
536 *(adj->adj_buf_size) = NETIF_COMPAT_BUF_SIZE;
537 *(adj->adj_tx_rings) = 1;
538 if (IF_INDEX_IN_RANGE(nxp->nxp_ifindex)) {
539 ifnet_t ifp;
540 ifnet_head_lock_shared();
541 ifp = ifindex2ifnet[nxp->nxp_ifindex];
542 ifnet_head_done();
543 VERIFY(ifp != NULL);
544 nx_netif_compat_adjust_ring_size(adj, ifp);
545 }
546 } else { /* netif native */
547 if (nxp->nxp_flags & NXPF_NETIF_LLINK) {
548 *(adj->adj_tx_slots) = NX_NETIF_MINSLOTS;
549 *(adj->adj_rx_slots) = NX_NETIF_MINSLOTS;
550 }
551 /*
552 * Add another extra ring for host port. Note that if the
553 * nexus isn't configured to use the same pbufpool for all of
554 * its ports, we'd end up allocating extra here.
555 * Not a big deal since that case isn't the default.
556 */
557 *(adj->adj_tx_rings) += 1;
558 *(adj->adj_rx_rings) += 1;
559
560 if ((*(adj->adj_buf_size) < PKT_MAX_PROTO_HEADER_SIZE)) {
561 SK_ERR("buf size too small, min (%d)",
562 PKT_MAX_PROTO_HEADER_SIZE);
563 return EINVAL;
564 }
565 _CASSERT(sizeof(struct __kern_netif_intf_advisory) ==
566 NX_INTF_ADV_SIZE);
567 *(adj->adj_nexusadv_size) = sizeof(struct netif_nexus_advisory);
568 }
569 done:
570 return 0;
571 }
572
573 int
nx_netif_prov_params(struct kern_nexus_domain_provider * nxdom_prov,const uint32_t req,const struct nxprov_params * nxp0,struct nxprov_params * nxp,struct skmem_region_params srp[SKMEM_REGIONS],uint32_t pp_region_config_flags)574 nx_netif_prov_params(struct kern_nexus_domain_provider *nxdom_prov,
575 const uint32_t req, const struct nxprov_params *nxp0,
576 struct nxprov_params *nxp, struct skmem_region_params srp[SKMEM_REGIONS],
577 uint32_t pp_region_config_flags)
578 {
579 struct nxdom *nxdom = nxdom_prov->nxdom_prov_dom;
580
581 return nxprov_params_adjust(nxdom_prov, req, nxp0, nxp, srp,
582 nxdom, nxdom, nxdom, pp_region_config_flags,
583 nx_netif_prov_params_adjust);
584 }
585
586 int
nx_netif_prov_mem_new(struct kern_nexus_domain_provider * nxdom_prov,struct kern_nexus * nx,struct nexus_adapter * na)587 nx_netif_prov_mem_new(struct kern_nexus_domain_provider *nxdom_prov,
588 struct kern_nexus *nx, struct nexus_adapter *na)
589 {
590 #pragma unused(nxdom_prov)
591 int err = 0;
592 boolean_t pp_truncated_buf = FALSE;
593 boolean_t allow_direct;
594 boolean_t kernel_only;
595
596 SK_DF(SK_VERB_NETIF,
597 "nx 0x%llx (\"%s\":\"%s\") na \"%s\" (0x%llx)", SK_KVA(nx),
598 NX_DOM(nx)->nxdom_name, nxdom_prov->nxdom_prov_name, na->na_name,
599 SK_KVA(na));
600
601 ASSERT(na->na_arena == NULL);
602 if ((na->na_type == NA_NETIF_COMPAT_DEV) ||
603 (na->na_type == NA_NETIF_COMPAT_HOST)) {
604 pp_truncated_buf = TRUE;
605 }
606 /*
607 * We do this check to determine whether to create the extra
608 * regions needed for userspace access. This is per interface.
609 * NX_USER_CHANNEL_PROV() is systemwide so it can't be used.
610 */
611 allow_direct = skywalk_netif_direct_allowed(
612 __unsafe_null_terminated_from_indexable(na->na_name));
613
614 /*
615 * Both ports (host and dev) share the same packet buffer pool;
616 * the first time a port gets opened will allocate the pp that
617 * gets stored in the nexus, which will then be used by any
618 * subsequent opens.
619 */
620 kernel_only = !allow_direct || !NX_USER_CHANNEL_PROV(nx);
621 na->na_arena = skmem_arena_create_for_nexus(na,
622 NX_PROV(nx)->nxprov_region_params, &nx->nx_tx_pp,
623 &nx->nx_rx_pp, pp_truncated_buf, kernel_only, &nx->nx_adv, &err);
624 ASSERT(na->na_arena != NULL || err != 0);
625 ASSERT(nx->nx_tx_pp == NULL || (nx->nx_tx_pp->pp_md_type ==
626 NX_DOM(nx)->nxdom_md_type && nx->nx_tx_pp->pp_md_subtype ==
627 NX_DOM(nx)->nxdom_md_subtype));
628
629 return err;
630 }
631
632 SK_NO_INLINE_ATTRIBUTE
633 static int
nx_netif_get_llink_info(struct sockopt * sopt,struct kern_nexus * nx)634 nx_netif_get_llink_info(struct sockopt *sopt, struct kern_nexus *nx)
635 {
636 struct nx_llink_info_req *nlir = NULL;
637 struct nx_netif *nif;
638 struct netif_llink *llink;
639 uint16_t llink_cnt;
640 size_t len, user_len;
641 int err, i;
642
643 nif = NX_NETIF_PRIVATE(nx);
644 if (!NETIF_LLINK_ENABLED(nif)) {
645 SK_ERR("llink mode not enabled");
646 return ENOTSUP;
647 }
648 lck_rw_lock_shared(&nif->nif_llink_lock);
649 llink_cnt = nif->nif_llink_cnt;
650 if (llink_cnt == 0) {
651 SK_ERR("zero llink cnt");
652 err = ENXIO;
653 goto done;
654 }
655 len = sizeof(*nlir) + (sizeof(struct nx_llink_info) * llink_cnt);
656 /* preserve sopt_valsize because it gets overwritten by copyin */
657 user_len = sopt->sopt_valsize;
658 if (user_len < len) {
659 SK_ERR("buffer too small");
660 err = ENOBUFS;
661 goto done;
662 }
663 nlir = sk_alloc_data(len, Z_WAITOK, skmem_tag_netif_llink_info);
664 if (nlir == NULL) {
665 SK_ERR("failed to allocate nlir");
666 err = ENOMEM;
667 goto done;
668 }
669 err = sooptcopyin(sopt, nlir, sizeof(*nlir), sizeof(*nlir));
670 if (err != 0) {
671 SK_ERR("copyin failed: %d", err);
672 goto done;
673 }
674 if (nlir->nlir_version != NETIF_LLINK_INFO_VERSION) {
675 SK_ERR("nlir version mismatch: %d != %d",
676 nlir->nlir_version, NETIF_LLINK_INFO_VERSION);
677 err = ENOTSUP;
678 goto done;
679 }
680 nlir->nlir_llink_cnt = llink_cnt;
681 i = 0;
682 STAILQ_FOREACH(llink, &nif->nif_llink_list, nll_link) {
683 struct nx_llink_info *nli;
684 struct netif_qset *qset;
685 uint16_t qset_cnt;
686 int j;
687
688 nli = &nlir->nlir_llink[i];
689 nli->nli_link_id = llink->nll_link_id;
690 nli->nli_link_id_internal = llink->nll_link_id_internal;
691 nli->nli_state = llink->nll_state;
692 nli->nli_flags = llink->nll_flags;
693
694 qset_cnt = llink->nll_qset_cnt;
695 ASSERT(qset_cnt <= NETIF_LLINK_MAX_QSETS);
696 nli->nli_qset_cnt = qset_cnt;
697
698 j = 0;
699 SLIST_FOREACH(qset, &llink->nll_qset_list, nqs_list) {
700 struct nx_qset_info *nqi;
701
702 nqi = &nli->nli_qset[j];
703 nqi->nqi_id = qset->nqs_id;
704 nqi->nqi_flags = qset->nqs_flags;
705 nqi->nqi_num_rx_queues = qset->nqs_num_rx_queues;
706 nqi->nqi_num_tx_queues = qset->nqs_num_tx_queues;
707 j++;
708 }
709 ASSERT(j == qset_cnt);
710 i++;
711 }
712 ASSERT(i == llink_cnt);
713 sopt->sopt_valsize = user_len;
714 err = sooptcopyout(sopt, nlir, len);
715 if (err != 0) {
716 SK_ERR("sooptcopyout failed: %d", err);
717 }
718 done:
719 lck_rw_unlock_shared(&nif->nif_llink_lock);
720 if (nlir != NULL) {
721 sk_free_data(nlir, len);
722 }
723 return err;
724 }
725
726 int
nx_netif_prov_config(struct kern_nexus_domain_provider * nxdom_prov,struct kern_nexus * nx,struct nx_cfg_req * ncr,int sopt_dir,struct proc * p,kauth_cred_t cred)727 nx_netif_prov_config(struct kern_nexus_domain_provider *nxdom_prov,
728 struct kern_nexus *nx, struct nx_cfg_req *ncr, int sopt_dir,
729 struct proc *p, kauth_cred_t cred)
730 {
731 #pragma unused(nxdom_prov)
732 struct sockopt sopt;
733 int err = 0;
734
735 SK_LOCK_ASSERT_HELD();
736
737 /* proceed only if the client possesses netif entitlement */
738 if ((err = skywalk_priv_check_cred(p, cred,
739 PRIV_SKYWALK_REGISTER_NET_IF)) != 0) {
740 goto done;
741 }
742
743 if (ncr->nc_req == USER_ADDR_NULL) {
744 err = EINVAL;
745 goto done;
746 }
747
748 /* to make life easier for handling copies */
749 bzero(&sopt, sizeof(sopt));
750 sopt.sopt_dir = sopt_dir;
751 sopt.sopt_val = ncr->nc_req;
752 sopt.sopt_valsize = ncr->nc_req_len;
753 sopt.sopt_p = p;
754
755 switch (ncr->nc_cmd) {
756 case NXCFG_CMD_ATTACH:
757 case NXCFG_CMD_DETACH: {
758 struct nx_spec_req nsr;
759
760 bzero(&nsr, sizeof(nsr));
761 err = sooptcopyin(&sopt, &nsr, sizeof(nsr), sizeof(nsr));
762 if (err != 0) {
763 goto done;
764 }
765
766 /*
767 * Null-terminate in case this has an interface name;
768 * the union is already large enough for uuid_t.
769 */
770 nsr.nsr_name[sizeof(nsr.nsr_name) - 1] = '\0';
771 if (p != kernproc) {
772 nsr.nsr_flags &= NXSPECREQ_MASK;
773 }
774
775 err = nx_netif_ctl(nx, ncr->nc_cmd, &nsr, p);
776 if (err != 0) {
777 goto done;
778 }
779
780 /* XXX: [email protected] -- can this copyout fail? */
781 (void) sooptcopyout(&sopt, &nsr, sizeof(nsr));
782 break;
783 }
784 case NXCFG_CMD_FLOW_ADD:
785 case NXCFG_CMD_FLOW_DEL: {
786 _CASSERT(offsetof(struct nx_flow_req, _nfr_kernel_field_end) ==
787 offsetof(struct nx_flow_req, _nfr_common_field_end));
788 struct nx_flow_req nfr;
789
790 bzero(&nfr, sizeof(nfr));
791 err = sooptcopyin(&sopt, &nfr, sizeof(nfr), sizeof(nfr));
792 if (err != 0) {
793 goto done;
794 }
795
796 err = nx_netif_ctl(nx, ncr->nc_cmd, &nfr, p);
797 if (err != 0) {
798 goto done;
799 }
800
801 /* XXX: [email protected] -- can this copyout fail? */
802 (void) sooptcopyout(&sopt, &nfr, sizeof(nfr));
803 break;
804 }
805 case NXCFG_CMD_GET_LLINK_INFO: {
806 err = nx_netif_get_llink_info(&sopt, nx);
807 break;
808 }
809 default:
810 err = EINVAL;
811 goto done;
812 }
813 done:
814 SK_DF(err ? SK_VERB_ERROR : SK_VERB_NETIF,
815 "nexus 0x%llx (%s) cmd %d err %d", SK_KVA(nx),
816 NX_DOM_PROV(nx)->nxdom_prov_name, ncr->nc_cmd, err);
817 return err;
818 }
819
820 void
nx_netif_prov_fini(struct kern_nexus_domain_provider * nxdom_prov)821 nx_netif_prov_fini(struct kern_nexus_domain_provider *nxdom_prov)
822 {
823 #pragma unused(nxdom_prov)
824 SK_D("destroying %s", nxdom_prov->nxdom_prov_name);
825 }
826
827 int
nx_netif_prov_nx_ctor(struct kern_nexus * nx)828 nx_netif_prov_nx_ctor(struct kern_nexus *nx)
829 {
830 struct nx_netif *n;
831 char name[64];
832 const char *__null_terminated nxadv_name = NULL;
833 int error;
834
835 SK_LOCK_ASSERT_HELD();
836 ASSERT(nx->nx_arg == NULL);
837
838 SK_D("nexus 0x%llx (%s)", SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name);
839
840 nx->nx_arg = nx_netif_alloc(Z_WAITOK);
841 n = NX_NETIF_PRIVATE(nx);
842 if (NX_USER_CHANNEL_PROV(nx) &&
843 NX_PROV(nx)->nxprov_params->nxp_nexusadv_size != 0) {
844 nxadv_name = tsnprintf(name, sizeof(name), "netif_%llu", nx->nx_id);
845 error = nx_advisory_alloc(nx, nxadv_name,
846 &NX_PROV(nx)->nxprov_region_params[SKMEM_REGION_NEXUSADV],
847 NEXUS_ADVISORY_TYPE_NETIF);
848 if (error != 0) {
849 nx_netif_free(n);
850 return error;
851 }
852 }
853 n->nif_nx = nx;
854 SK_D("create new netif 0x%llx for nexus 0x%llx",
855 SK_KVA(NX_NETIF_PRIVATE(nx)), SK_KVA(nx));
856 return 0;
857 }
858
859 void
nx_netif_prov_nx_dtor(struct kern_nexus * nx)860 nx_netif_prov_nx_dtor(struct kern_nexus *nx)
861 {
862 struct nx_netif *n = NX_NETIF_PRIVATE(nx);
863
864 SK_LOCK_ASSERT_HELD();
865
866 SK_D("nexus 0x%llx (%s) netif 0x%llx", SK_KVA(nx),
867 NX_DOM_PROV(nx)->nxdom_prov_name, SK_KVA(n));
868
869 /*
870 * XXX
871 * detach should be done separately to be symmetrical with attach.
872 */
873 nx_advisory_free(nx);
874 if (nx_port_get_na(nx, NEXUS_PORT_NET_IF_DEV) != NULL) {
875 /* we're called by nx_detach(), so this cannot fail */
876 int err = nx_netif_ctl_detach(nx, NULL);
877 VERIFY(err == 0);
878 }
879 if (n->nif_dev_nxb != NULL) {
880 nxb_free(n->nif_dev_nxb);
881 n->nif_dev_nxb = NULL;
882 }
883 if (n->nif_host_nxb != NULL) {
884 nxb_free(n->nif_host_nxb);
885 n->nif_host_nxb = NULL;
886 }
887 SK_DF(SK_VERB_NETIF, "marking netif 0x%llx as free", SK_KVA(n));
888 nx_netif_free(n);
889 nx->nx_arg = NULL;
890 }
891
892 int
nx_netif_prov_nx_mem_info(struct kern_nexus * nx,struct kern_pbufpool ** tpp,struct kern_pbufpool ** rpp)893 nx_netif_prov_nx_mem_info(struct kern_nexus *nx, struct kern_pbufpool **tpp,
894 struct kern_pbufpool **rpp)
895 {
896 ASSERT(nx->nx_tx_pp != NULL);
897 ASSERT(nx->nx_rx_pp != NULL);
898
899 if (tpp != NULL) {
900 *tpp = nx->nx_tx_pp;
901 }
902 if (rpp != NULL) {
903 *rpp = nx->nx_rx_pp;
904 }
905
906 return 0;
907 }
908
909 static size_t
__netif_mib_get_stats(struct kern_nexus * nx,void * out,size_t len)910 __netif_mib_get_stats(struct kern_nexus *nx, void *out, size_t len)
911 {
912 struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
913 struct ifnet *ifp = nif->nif_ifp;
914 struct sk_stats_net_if *__single sns = out;
915 size_t actual_space = sizeof(struct sk_stats_net_if);
916
917 if (out != NULL && actual_space <= len) {
918 uuid_copy(sns->sns_nx_uuid, nx->nx_uuid);
919 if (ifp != NULL) {
920 (void) strlcpy(sns->sns_if_name, if_name(ifp), IFNAMSIZ);
921 }
922 sns->sns_nifs = nif->nif_stats;
923 }
924
925 return actual_space;
926 }
927
928 static size_t
__netif_mib_get_llinks(struct kern_nexus * nx,void * __sized_by (len)out,size_t len)929 __netif_mib_get_llinks(struct kern_nexus *nx, void *__sized_by(len) out, size_t len)
930 {
931 struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
932 struct nx_llink_info *nli_list = out;
933 size_t actual_space = 0;
934 if (NETIF_LLINK_ENABLED(nif)) {
935 lck_rw_lock_shared(&nif->nif_llink_lock);
936 actual_space += nif->nif_llink_cnt * sizeof(struct nx_llink_info);
937
938 if (out != NULL && actual_space <= len) {
939 struct netif_llink *llink;
940 int i = 0;
941 STAILQ_FOREACH(llink, &nif->nif_llink_list, nll_link) {
942 struct nx_llink_info *nli;
943 struct netif_qset *qset;
944 uint16_t qset_cnt;
945 int j;
946
947 nli = &nli_list[i];
948 uuid_copy(nli->nli_netif_uuid, nx->nx_uuid);
949 nli->nli_link_id = llink->nll_link_id;
950 nli->nli_link_id_internal = llink->nll_link_id_internal;
951 nli->nli_state = llink->nll_state;
952 nli->nli_flags = llink->nll_flags;
953
954 qset_cnt = llink->nll_qset_cnt;
955 ASSERT(qset_cnt <= NETIF_LLINK_MAX_QSETS);
956 nli->nli_qset_cnt = qset_cnt;
957
958 j = 0;
959 SLIST_FOREACH(qset, &llink->nll_qset_list, nqs_list) {
960 struct nx_qset_info *nqi;
961
962 nqi = &nli->nli_qset[j];
963 nqi->nqi_id = qset->nqs_id;
964 nqi->nqi_flags = qset->nqs_flags;
965 nqi->nqi_num_rx_queues = qset->nqs_num_rx_queues;
966 nqi->nqi_num_tx_queues = qset->nqs_num_tx_queues;
967 j++;
968 }
969 ASSERT(j == qset_cnt);
970 i++;
971 }
972 ASSERT(i == nif->nif_llink_cnt);
973 }
974 lck_rw_unlock_shared(&nif->nif_llink_lock);
975 }
976
977 return actual_space;
978 }
979
980 static size_t
__netif_mib_get_queue_stats(struct kern_nexus * nx,void * __sized_by (len)out,size_t len)981 __netif_mib_get_queue_stats(struct kern_nexus *nx, void *__sized_by(len) out, size_t len)
982 {
983 struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
984 uint8_t *itr = out;
985 size_t actual_space = 0;
986 if (!NETIF_LLINK_ENABLED(nif)) {
987 return actual_space;
988 }
989
990 lck_rw_lock_shared(&nif->nif_llink_lock);
991 struct netif_llink *llink;
992 struct netif_qset *qset;
993 STAILQ_FOREACH(llink, &nif->nif_llink_list, nll_link) {
994 SLIST_FOREACH(qset, &llink->nll_qset_list, nqs_list) {
995 actual_space += sizeof(struct netif_qstats_info) *
996 (qset->nqs_num_rx_queues + qset->nqs_num_tx_queues);
997 }
998 }
999 if (out == NULL || actual_space > len) {
1000 lck_rw_unlock_shared(&nif->nif_llink_lock);
1001 return actual_space;
1002 }
1003
1004 llink = NULL;
1005 qset = NULL;
1006 uint16_t i = 0, j = 0;
1007 STAILQ_FOREACH(llink, &nif->nif_llink_list, nll_link) {
1008 uint16_t qset_cnt;
1009 j = 0;
1010 qset_cnt = llink->nll_qset_cnt;
1011 ASSERT(qset_cnt <= NETIF_LLINK_MAX_QSETS);
1012 SLIST_FOREACH(qset, &llink->nll_qset_list, nqs_list) {
1013 int queue_cnt = qset->nqs_num_rx_queues +
1014 qset->nqs_num_tx_queues;
1015 for (uint16_t k = 0; k < queue_cnt; k++) {
1016 struct netif_qstats_info *nqi =
1017 (struct netif_qstats_info *)(void *)itr;
1018 struct netif_queue *nq = &qset->nqs_driver_queues[k];
1019 nqi->nqi_qset_id = qset->nqs_id;
1020 nqi->nqi_queue_idx = k;
1021 if (KPKT_VALID_SVC(nq->nq_svc)) {
1022 nqi->nqi_svc = (packet_svc_class_t)nq->nq_svc;
1023 }
1024 if (nq->nq_flags & NETIF_QUEUE_IS_RX) {
1025 nqi->nqi_queue_flag = NQI_QUEUE_FLAG_IS_RX;
1026 }
1027
1028 struct netif_qstats *nq_out = &nqi->nqi_stats;
1029 struct netif_qstats *nq_src = &nq->nq_stats;
1030 memcpy(nq_out, nq_src, sizeof(struct netif_qstats));
1031
1032 itr += sizeof(struct netif_qstats_info);
1033 }
1034 j++;
1035 }
1036 ASSERT(j == qset_cnt);
1037 i++;
1038 }
1039 ASSERT(i == nif->nif_llink_cnt);
1040
1041 lck_rw_unlock_shared(&nif->nif_llink_lock);
1042 return actual_space;
1043 }
1044
1045 size_t
nx_netif_prov_nx_mib_get(struct kern_nexus * nx,struct nexus_mib_filter * filter,void * __sized_by (len)out,size_t len,struct proc * p)1046 nx_netif_prov_nx_mib_get(struct kern_nexus *nx, struct nexus_mib_filter *filter,
1047 void *__sized_by(len) out, size_t len, struct proc *p)
1048 {
1049 #pragma unused(p)
1050 size_t ret;
1051
1052 if ((filter->nmf_bitmap & NXMIB_FILTER_NX_UUID) &&
1053 (uuid_compare(filter->nmf_nx_uuid, nx->nx_uuid)) != 0) {
1054 return 0;
1055 }
1056
1057 switch (filter->nmf_type) {
1058 case NXMIB_NETIF_STATS:
1059 ret = __netif_mib_get_stats(nx, out, len);
1060 break;
1061 case NXMIB_LLINK_LIST:
1062 ret = __netif_mib_get_llinks(nx, out, len);
1063 break;
1064 case NXMIB_NETIF_QUEUE_STATS:
1065 ret = __netif_mib_get_queue_stats(nx, out, len);
1066 break;
1067 default:
1068 ret = 0;
1069 break;
1070 }
1071 return ret;
1072 }
1073
1074 static int
nx_netif_dom_bind_port(struct kern_nexus * nx,nexus_port_t * nx_port,struct nxbind * nxb,void * info)1075 nx_netif_dom_bind_port(struct kern_nexus *nx, nexus_port_t *nx_port,
1076 struct nxbind *nxb, void *info)
1077 {
1078 struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
1079 nexus_port_t first, last, port;
1080 int error;
1081
1082 ASSERT(nx_port != NULL);
1083 ASSERT(nxb != NULL);
1084
1085 port = *nx_port;
1086
1087 /*
1088 * If port is:
1089 * != NEXUS_PORT_ANY: attempt to bind to the specified port
1090 * == NEXUS_PORT_ANY: find an available port, bind to it, and
1091 * return back the assigned port.
1092 */
1093 first = NEXUS_PORT_NET_IF_CLIENT;
1094 ASSERT(NXDOM_MAX(NX_DOM(nx), ports) <= NEXUS_PORT_MAX);
1095 last = (nexus_port_size_t)NXDOM_MAX(NX_DOM(nx), ports);
1096 ASSERT(first <= last);
1097
1098 NETIF_WLOCK(nif);
1099
1100 if (__improbable(first == last)) {
1101 error = ENOMEM;
1102 } else if (port != NEXUS_PORT_ANY) {
1103 error = nx_port_bind_info(nx, port, nxb, info);
1104 SK_DF(SK_VERB_NETIF, "port %d, bind err %d", port, error);
1105 } else {
1106 error = nx_port_find(nx, first, last - 1, &port);
1107 ASSERT(error != 0 || (port >= first && port < last));
1108 if (error == 0) {
1109 error = nx_port_bind_info(nx, port, nxb, info);
1110 SK_DF(SK_VERB_NETIF, "found port %d, bind err %d",
1111 port, error);
1112 }
1113 }
1114 NETIF_WUNLOCK(nif);
1115
1116 ASSERT(*nx_port == NEXUS_PORT_ANY || *nx_port == port);
1117 if (error == 0) {
1118 *nx_port = port;
1119 }
1120
1121 SK_DF(error ? SK_VERB_ERROR : SK_VERB_NETIF,
1122 "+++ netif 0x%llx nx_port %d, total %u active %u (err %d)",
1123 SK_KVA(nif), (int)*nx_port, NX_NETIF_MAXPORTS,
1124 nx->nx_active_ports, error);
1125
1126 return error;
1127 }
1128
1129 static int
nx_netif_dom_unbind_port(struct kern_nexus * nx,nexus_port_t nx_port)1130 nx_netif_dom_unbind_port(struct kern_nexus *nx, nexus_port_t nx_port)
1131 {
1132 struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
1133 int error = 0;
1134
1135 ASSERT(nx_port != NEXUS_PORT_ANY);
1136
1137 NETIF_WLOCK(nif);
1138 error = nx_port_unbind(nx, nx_port);
1139 NETIF_WUNLOCK(nif);
1140
1141 return error;
1142 }
1143
1144 static int
nx_netif_dom_connect(struct kern_nexus_domain_provider * nxdom_prov,struct kern_nexus * nx,struct kern_channel * ch,struct chreq * chr,struct kern_channel * ch0,struct nxbind * nxb,struct proc * p)1145 nx_netif_dom_connect(struct kern_nexus_domain_provider *nxdom_prov,
1146 struct kern_nexus *nx, struct kern_channel *ch, struct chreq *chr,
1147 struct kern_channel *ch0, struct nxbind *nxb, struct proc *p)
1148 {
1149 #pragma unused(nxdom_prov)
1150 int err = 0;
1151
1152 SK_LOCK_ASSERT_HELD();
1153
1154 ASSERT(NX_DOM_PROV(nx) == nxdom_prov);
1155 ASSERT(nx->nx_prov->nxprov_params->nxp_type ==
1156 nxdom_prov->nxdom_prov_dom->nxdom_type &&
1157 nx->nx_prov->nxprov_params->nxp_type == NEXUS_TYPE_NET_IF);
1158 ASSERT(!(ch->ch_flags & CHANF_HOST));
1159
1160 switch (chr->cr_port) {
1161 case NEXUS_PORT_NET_IF_DEV:
1162 if (chr->cr_mode & CHMODE_HOST) {
1163 err = EINVAL;
1164 goto done;
1165 }
1166 break;
1167
1168 case NEXUS_PORT_NET_IF_HOST:
1169 if (!(chr->cr_mode & CHMODE_HOST)) {
1170 if (ch->ch_flags & CHANF_KERNEL) {
1171 err = EINVAL;
1172 goto done;
1173 }
1174 chr->cr_mode |= CHMODE_HOST;
1175 }
1176 /*
1177 * This channel is exclusively opened to the host
1178 * rings; don't notify the external provider.
1179 */
1180 os_atomic_or(&ch->ch_flags, CHANF_HOST | CHANF_EXT_SKIP, relaxed);
1181 break;
1182
1183 default:
1184 /*
1185 * This channel is shared between netif and user process;
1186 * don't notify the external provider.
1187 */
1188 os_atomic_or(&ch->ch_flags, CHANF_EXT_SKIP, relaxed);
1189 break;
1190 }
1191
1192 chr->cr_ring_set = RING_SET_DEFAULT;
1193 chr->cr_real_endpoint = chr->cr_endpoint = CH_ENDPOINT_NET_IF;
1194 (void) snprintf(chr->cr_name, sizeof(chr->cr_name), "netif:%llu:%.*s",
1195 nx->nx_id, (int)nx->nx_prov->nxprov_params->nxp_namelen,
1196 nx->nx_prov->nxprov_params->nxp_name);
1197
1198 if (ch->ch_flags & CHANF_KERNEL) {
1199 err = na_connect_spec(nx, ch, chr, p);
1200 } else {
1201 err = na_connect(nx, ch, chr, ch0, nxb, p);
1202 }
1203
1204 if (err == 0) {
1205 /*
1206 * Mark the kernel slot descriptor region as busy; this
1207 * prevents it from being torn-down at channel defunct
1208 * time, as the (external) nexus owner may be calling
1209 * KPIs that require accessing the slots.
1210 */
1211 skmem_arena_nexus_sd_set_noidle(
1212 skmem_arena_nexus(ch->ch_na->na_arena), 1);
1213 }
1214
1215 done:
1216 return err;
1217 }
1218
1219 static void
nx_netif_dom_disconnect(struct kern_nexus_domain_provider * nxdom_prov,struct kern_nexus * nx,struct kern_channel * ch)1220 nx_netif_dom_disconnect(struct kern_nexus_domain_provider *nxdom_prov,
1221 struct kern_nexus *nx, struct kern_channel *ch)
1222 {
1223 #pragma unused(nxdom_prov)
1224 SK_LOCK_ASSERT_HELD();
1225
1226 SK_D("channel 0x%llx -!- nexus 0x%llx (%s:\"%s\":%u:%d)", SK_KVA(ch),
1227 SK_KVA(nx), nxdom_prov->nxdom_prov_name, ch->ch_na->na_name,
1228 ch->ch_info->cinfo_nx_port, (int)ch->ch_info->cinfo_ch_ring_id);
1229
1230 /*
1231 * Release busy assertion held earlier in nx_netif_dom_connect();
1232 * this allows for the final arena teardown to succeed.
1233 */
1234 skmem_arena_nexus_sd_set_noidle(
1235 skmem_arena_nexus(ch->ch_na->na_arena), -1);
1236
1237 if (ch->ch_flags & CHANF_KERNEL) {
1238 na_disconnect_spec(nx, ch);
1239 } else {
1240 na_disconnect(nx, ch);
1241 }
1242 }
1243
1244 static void
nx_netif_dom_defunct(struct kern_nexus_domain_provider * nxdom_prov,struct kern_nexus * nx,struct kern_channel * ch,struct proc * p)1245 nx_netif_dom_defunct(struct kern_nexus_domain_provider *nxdom_prov,
1246 struct kern_nexus *nx, struct kern_channel *ch, struct proc *p)
1247 {
1248 #pragma unused(nxdom_prov, nx)
1249 LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
1250 ASSERT(!(ch->ch_flags & CHANF_KERNEL));
1251 ASSERT(ch->ch_na->na_type == NA_NETIF_DEV ||
1252 ch->ch_na->na_type == NA_NETIF_HOST ||
1253 ch->ch_na->na_type == NA_NETIF_COMPAT_DEV ||
1254 ch->ch_na->na_type == NA_NETIF_COMPAT_HOST ||
1255 ch->ch_na->na_type == NA_NETIF_VP);
1256
1257 na_ch_rings_defunct(ch, p);
1258 }
1259
1260 static void
nx_netif_dom_defunct_finalize(struct kern_nexus_domain_provider * nxdom_prov,struct kern_nexus * nx,struct kern_channel * ch,boolean_t locked)1261 nx_netif_dom_defunct_finalize(struct kern_nexus_domain_provider *nxdom_prov,
1262 struct kern_nexus *nx, struct kern_channel *ch, boolean_t locked)
1263 {
1264 #pragma unused(nxdom_prov)
1265 struct ifnet *ifp;
1266
1267 if (!locked) {
1268 SK_LOCK_ASSERT_NOTHELD();
1269 SK_LOCK();
1270 LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_NOTOWNED);
1271 } else {
1272 SK_LOCK_ASSERT_HELD();
1273 LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
1274 }
1275
1276 ASSERT(ch->ch_na->na_type == NA_NETIF_DEV ||
1277 ch->ch_na->na_type == NA_NETIF_HOST ||
1278 ch->ch_na->na_type == NA_NETIF_COMPAT_DEV ||
1279 ch->ch_na->na_type == NA_NETIF_COMPAT_HOST ||
1280 ch->ch_na->na_type == NA_NETIF_VP);
1281
1282 na_defunct(nx, ch, ch->ch_na, locked);
1283 ifp = ch->ch_na->na_ifp;
1284 if (ch->ch_na->na_type == NA_NETIF_VP && ifp != NULL &&
1285 ifnet_is_low_latency(ifp)) {
1286 /*
1287 * We release the VPNA's ifp here instead of waiting for the
1288 * application to close the channel to trigger the release.
1289 */
1290 DTRACE_SKYWALK2(release__vpna__ifp, struct nexus_adapter *,
1291 ch->ch_na, struct ifnet *, ifp);
1292 ifnet_decr_iorefcnt(ifp);
1293 ch->ch_na->na_ifp = NULL;
1294 }
1295 SK_D("%s(%d): ch 0x%llx -/- nx 0x%llx (%s:\"%s\":%u:%d)",
1296 ch->ch_name, ch->ch_pid, SK_KVA(ch), SK_KVA(nx),
1297 nxdom_prov->nxdom_prov_name, ch->ch_na->na_name,
1298 ch->ch_info->cinfo_nx_port, (int)ch->ch_info->cinfo_ch_ring_id);
1299
1300 if (!locked) {
1301 LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_NOTOWNED);
1302 SK_UNLOCK();
1303 } else {
1304 LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
1305 SK_LOCK_ASSERT_HELD();
1306 }
1307 }
1308
1309 struct nexus_netif_adapter *
na_netif_alloc(zalloc_flags_t how)1310 na_netif_alloc(zalloc_flags_t how)
1311 {
1312 _CASSERT(offsetof(struct nexus_netif_adapter, nifna_up) == 0);
1313
1314 return zalloc_flags(na_netif_zone, how | Z_ZERO);
1315 }
1316
1317 void
na_netif_free(struct nexus_adapter * na)1318 na_netif_free(struct nexus_adapter *na)
1319 {
1320 struct nexus_netif_adapter *nifna = (struct nexus_netif_adapter *)na;
1321
1322 SK_LOCK_ASSERT_HELD();
1323 SK_DF(SK_VERB_MEM, "nifna 0x%llx FREE", SK_KVA(nifna));
1324
1325 ASSERT(na->na_refcount == 0);
1326 ASSERT(nifna->nifna_tx_mit == NULL);
1327 ASSERT(nifna->nifna_rx_mit == NULL);
1328 bzero(nifna, sizeof(*nifna));
1329
1330 zfree(na_netif_zone, nifna);
1331 }
1332
1333 /* Process NXCFG_CMD_ATTACH */
1334 SK_NO_INLINE_ATTRIBUTE
1335 static int
nx_netif_ctl_attach(struct kern_nexus * nx,struct nx_spec_req * nsr,struct proc * p)1336 nx_netif_ctl_attach(struct kern_nexus *nx, struct nx_spec_req *nsr,
1337 struct proc *p)
1338 {
1339 struct nx_netif *n = NX_NETIF_PRIVATE(nx);
1340 struct ifnet *ifp = NULL;
1341 boolean_t compat;
1342 int err = 0;
1343
1344 SK_LOCK_ASSERT_HELD();
1345
1346 ASSERT(NX_DOM(nx)->nxdom_type == NEXUS_TYPE_NET_IF);
1347 compat = (strbufcmp(NX_DOM_PROV(nx)->nxdom_prov_name,
1348 NEXUS_PROVIDER_NET_IF_COMPAT) == 0);
1349
1350 uuid_clear(nsr->nsr_if_uuid);
1351 /*
1352 * The netif accepts either an interface name or a pointer to
1353 * an ifnet, but never a UUID.
1354 */
1355 if (nsr->nsr_flags & NXSPECREQ_UUID) {
1356 err = EINVAL;
1357 goto done;
1358 }
1359 if (nsr->nsr_flags & NXSPECREQ_IFP) {
1360 if (p != kernproc || (ifp = nsr->nsr_ifp) == NULL) {
1361 err = EINVAL;
1362 goto done;
1363 }
1364 } else if ((ifp = ifunit_ref(__unsafe_null_terminated_from_indexable(
1365 nsr->nsr_name))) == NULL) {
1366 err = ENXIO;
1367 goto done;
1368 }
1369
1370 if ((compat && SKYWALK_NATIVE(ifp)) ||
1371 (!compat && !SKYWALK_NATIVE(ifp))) {
1372 /* native driver for netif; non-native for netif_compat */
1373 err = ENODEV;
1374 } else if (ifp->if_na != NULL || !uuid_is_null(n->nif_uuid)) {
1375 err = EBUSY;
1376 } else {
1377 ASSERT(uuid_is_null(n->nif_uuid));
1378 /*
1379 * Upon success, callee will hold its own ifnet iorefcnt
1380 * as well as a retain count on the nexus adapter.
1381 */
1382 if (compat) {
1383 err = nx_netif_compat_attach(nx, ifp);
1384 } else {
1385 err = nx_netif_attach(nx, ifp);
1386 }
1387
1388 if (err == 0) {
1389 /* return the adapter UUID */
1390 uuid_generate_random(n->nif_uuid);
1391 uuid_copy(nsr->nsr_if_uuid, n->nif_uuid);
1392 #if (DEVELOPMENT || DEBUG)
1393 skoid_create(&n->nif_skoid,
1394 SKOID_SNODE(_kern_skywalk_netif), if_name(ifp),
1395 CTLFLAG_RW);
1396 #endif /* !DEVELOPMENT && !DEBUG */
1397 }
1398 }
1399 done:
1400 /* drop I/O refcnt from ifunit_ref() */
1401 if (ifp != NULL && !(nsr->nsr_flags & NXSPECREQ_IFP)) {
1402 ifnet_decr_iorefcnt(ifp);
1403 }
1404
1405 #if SK_LOG
1406 uuid_string_t uuidstr, ifuuidstr;
1407 const char *nustr;
1408 if (nsr->nsr_flags & NXSPECREQ_UUID) {
1409 nustr = sk_uuid_unparse(nsr->nsr_uuid, uuidstr);
1410 } else if (nsr->nsr_flags & NXSPECREQ_IFP) {
1411 (void) snprintf((char *)uuidstr, sizeof(uuidstr), "0x%llx",
1412 SK_KVA(nsr->nsr_ifp));
1413 nustr = uuidstr;
1414 } else {
1415 nustr = nsr->nsr_name;
1416 }
1417 SK_DF(err ? SK_VERB_ERROR : SK_VERB_NETIF,
1418 "nexus 0x%llx (%s) name/uuid \"%s\" if_uuid %s flags 0x%x err %d",
1419 SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name, nustr,
1420 sk_uuid_unparse(nsr->nsr_if_uuid, ifuuidstr), nsr->nsr_flags, err);
1421 #endif /* SK_LOG */
1422
1423 return err;
1424 }
1425
1426 SK_NO_INLINE_ATTRIBUTE
1427 static int
nx_netif_clean(struct nx_netif * nif,boolean_t quiesce_needed)1428 nx_netif_clean(struct nx_netif *nif, boolean_t quiesce_needed)
1429 {
1430 struct kern_nexus *nx = nif->nif_nx;
1431 struct ifnet *ifp;
1432 boolean_t suspended = FALSE;
1433
1434 ifp = nif->nif_ifp;
1435 if (ifp == NULL) {
1436 return EALREADY;
1437 }
1438 /*
1439 * For regular kernel-attached interfaces, quiescing is handled by
1440 * the ifnet detach thread, which calls dlil_quiesce_and_detach_nexuses().
1441 * For interfaces created by skywalk test cases, flowswitch/netif nexuses
1442 * are constructed on the fly and can also be torn down on the fly.
1443 * dlil_quiesce_and_detach_nexuses() won't help here because any nexus
1444 * can be detached while the interface is still attached.
1445 */
1446 if (quiesce_needed && ifnet_datamov_suspend_if_needed(ifp)) {
1447 SK_UNLOCK();
1448 suspended = TRUE;
1449 ifnet_datamov_drain(ifp);
1450 SK_LOCK();
1451 }
1452 nx_netif_callbacks_fini(nif);
1453 nx_netif_agent_fini(nif);
1454 nx_netif_capabilities_fini(nif);
1455 nx_netif_flow_fini(nif);
1456 nx_netif_filter_fini(nif);
1457 nx_netif_llink_fini(nif);
1458 nx_netif_flags_fini(nif);
1459
1460 uuid_clear(nif->nif_uuid);
1461 /* nx_netif_{compat_}attach() held both references */
1462 na_release_locked(nx_port_get_na(nx, NEXUS_PORT_NET_IF_DEV));
1463 na_release_locked(nx_port_get_na(nx, NEXUS_PORT_NET_IF_HOST));
1464 nx_port_free(nx, NEXUS_PORT_NET_IF_DEV);
1465 nx_port_free(nx, NEXUS_PORT_NET_IF_HOST);
1466
1467 ifp->if_na_ops = NULL;
1468 ifp->if_na = NULL;
1469 nif->nif_ifp = NULL;
1470 nif->nif_netif_nxadv = NULL;
1471 SKYWALK_CLEAR_CAPABLE(ifp);
1472 if (suspended) {
1473 ifnet_datamov_resume(ifp);
1474 }
1475
1476 #if (DEVELOPMENT || DEBUG)
1477 skoid_destroy(&nif->nif_skoid);
1478 #endif /* !DEVELOPMENT && !DEBUG */
1479 return 0;
1480 }
1481
1482 /* process NXCFG_CMD_DETACH */
1483 SK_NO_INLINE_ATTRIBUTE
1484 static int
nx_netif_ctl_detach(struct kern_nexus * nx,struct nx_spec_req * nsr)1485 nx_netif_ctl_detach(struct kern_nexus *nx, struct nx_spec_req *nsr)
1486 {
1487 struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
1488 int err = 0;
1489
1490 SK_LOCK_ASSERT_HELD();
1491
1492 /*
1493 * nsr is NULL when we're called from the destructor, and it
1494 * implies that we'll detach whatever that is attached.
1495 */
1496 if (nsr != NULL && uuid_is_null(nsr->nsr_if_uuid)) {
1497 err = EINVAL;
1498 } else if (nsr != NULL && uuid_compare(nsr->nsr_if_uuid,
1499 nif->nif_uuid) != 0) {
1500 err = ESRCH;
1501 } else if (nx_port_get_na(nx, NEXUS_PORT_NET_IF_DEV) == NULL) {
1502 /* nx_netif_ctl_attach() not yet done or already detached */
1503 err = ENXIO;
1504 } else if (nx->nx_ch_count != 0) {
1505 /*
1506 * There's at least a channel opened; we can't
1507 * yank the interface from underneath the nexus
1508 * since our dlil input/output handler may be
1509 * running now. Bail out and come back here
1510 * again when the nexus detaches.
1511 */
1512 err = EBUSY;
1513 } else {
1514 err = nx_netif_clean(nif, TRUE);
1515 }
1516
1517 #if SK_LOG
1518 if (nsr != NULL) {
1519 uuid_string_t ifuuidstr;
1520 SK_DF(err ? SK_VERB_ERROR : SK_VERB_NETIF,
1521 "nexus 0x%llx (%s) if_uuid %s flags 0x%x err %d",
1522 SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name,
1523 sk_uuid_unparse(nsr->nsr_if_uuid, ifuuidstr),
1524 nsr->nsr_flags, err);
1525 } else {
1526 SK_DF(err ? SK_VERB_ERROR : SK_VERB_NETIF,
1527 "nexus 0x%llx (%s) err %d", SK_KVA(nx),
1528 NX_DOM_PROV(nx)->nxdom_prov_name, err);
1529 }
1530 #endif /* SK_LOG */
1531
1532 return err;
1533 }
1534
1535 /*
1536 * XXX
1537 * These checks are copied from fsw.c
1538 * There are no tests exercising this code. Do we still need this?
1539 */
1540 SK_NO_INLINE_ATTRIBUTE
1541 static int
nx_netif_ctl_flow_check(struct nx_netif * nif,nxcfg_cmd_t cmd,struct proc * p,struct nx_flow_req * req)1542 nx_netif_ctl_flow_check(struct nx_netif *nif, nxcfg_cmd_t cmd,
1543 struct proc *p, struct nx_flow_req *req)
1544 {
1545 #pragma unused(nif)
1546 boolean_t need_check;
1547 int error;
1548
1549 if (uuid_is_null(req->nfr_flow_uuid)) {
1550 return EINVAL;
1551 }
1552 req->nfr_flags &= NXFLOWREQF_MASK;
1553 req->nfr_flowadv_idx = FLOWADV_IDX_NONE;
1554
1555 if (cmd == NXCFG_CMD_FLOW_DEL) {
1556 return 0;
1557 }
1558 need_check = FALSE;
1559 if (req->nfr_epid != -1 && proc_pid(p) != req->nfr_epid) {
1560 need_check = TRUE;
1561 } else if (!uuid_is_null(req->nfr_euuid)) {
1562 uuid_t uuid;
1563
1564 /* get the UUID of the issuing process */
1565 proc_getexecutableuuid(p, uuid, sizeof(uuid));
1566
1567 /*
1568 * If this is not issued by a process for its own
1569 * executable UUID and if the process does not have
1570 * the necessary privilege, reject the request.
1571 * The logic is similar to so_set_effective_uuid().
1572 */
1573 if (uuid_compare(req->nfr_euuid, uuid) != 0) {
1574 need_check = TRUE;
1575 }
1576 }
1577 if (need_check) {
1578 kauth_cred_t cred = kauth_cred_proc_ref(p);
1579 error = priv_check_cred(cred,
1580 PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0);
1581 kauth_cred_unref(&cred);
1582 if (error != 0) {
1583 return error;
1584 }
1585 }
1586 return 0;
1587 }
1588
1589 SK_NO_INLINE_ATTRIBUTE
1590 static int
nx_netif_ctl_flow_add(struct nx_netif * nif,struct proc * p,struct nx_flow_req * req)1591 nx_netif_ctl_flow_add(struct nx_netif *nif, struct proc *p,
1592 struct nx_flow_req *req)
1593 {
1594 int err;
1595
1596 ASSERT(p != PROC_NULL);
1597 err = nx_netif_ctl_flow_check(nif, NXCFG_CMD_FLOW_ADD, p, req);
1598 if (err != 0) {
1599 return err;
1600 }
1601
1602 /* init kernel only fields */
1603 nx_flow_req_internalize(req);
1604 req->nfr_context = NULL;
1605 req->nfr_flow_stats = NULL;
1606 req->nfr_port_reservation = NULL;
1607 req->nfr_pid = proc_pid(p);
1608
1609 err = nx_netif_netagent_flow_add(nif, req);
1610 nx_flow_req_externalize(req);
1611 return err;
1612 }
1613
1614 SK_NO_INLINE_ATTRIBUTE
1615 static int
nx_netif_ctl_flow_del(struct nx_netif * nif,struct proc * p,struct nx_flow_req * req)1616 nx_netif_ctl_flow_del(struct nx_netif *nif, struct proc *p,
1617 struct nx_flow_req *req)
1618 {
1619 int err;
1620
1621 err = nx_netif_ctl_flow_check(nif, NXCFG_CMD_FLOW_DEL, p, req);
1622 if (err != 0) {
1623 return err;
1624 }
1625
1626 nx_flow_req_internalize(req);
1627 req->nfr_pid = proc_pid(p);
1628
1629 err = nx_netif_netagent_flow_del(nif, req);
1630 nx_flow_req_externalize(req);
1631 return err;
1632 }
1633
1634 SK_NO_INLINE_ATTRIBUTE
1635 static int
nx_netif_ctl(struct kern_nexus * nx,nxcfg_cmd_t nc_cmd,void * data,struct proc * p)1636 nx_netif_ctl(struct kern_nexus *nx, nxcfg_cmd_t nc_cmd, void *data,
1637 struct proc *p)
1638 {
1639 struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
1640 struct nx_spec_req *__single nsr = data;
1641 struct nx_flow_req *__single nfr = data;
1642 int error = 0;
1643
1644 SK_LOCK_ASSERT_HELD();
1645
1646 switch (nc_cmd) {
1647 case NXCFG_CMD_ATTACH:
1648 error = nx_netif_ctl_attach(nx, nsr, p);
1649 break;
1650
1651 case NXCFG_CMD_DETACH:
1652 error = nx_netif_ctl_detach(nx, nsr);
1653 break;
1654
1655 case NXCFG_CMD_FLOW_ADD:
1656 error = nx_netif_ctl_flow_add(nif, p, nfr);
1657 break;
1658
1659 case NXCFG_CMD_FLOW_DEL:
1660 error = nx_netif_ctl_flow_del(nif, p, nfr);
1661 break;
1662
1663 default:
1664 SK_ERR("invalid cmd %u", nc_cmd);
1665 error = EINVAL;
1666 break;
1667 }
1668 return error;
1669 }
1670
1671 static void
nx_netif_llink_notify(struct kern_nexus * nx,struct netif_llink * llink,uint32_t flags)1672 nx_netif_llink_notify(struct kern_nexus *nx, struct netif_llink *llink,
1673 uint32_t flags)
1674 {
1675 #pragma unused(flags)
1676 struct netif_qset *qset;
1677
1678 SLIST_FOREACH(qset, &llink->nll_qset_list, nqs_list) {
1679 (void) nx_tx_qset_notify(nx, qset->nqs_ctx);
1680 }
1681 }
1682
1683 static void
nx_netif_llink_notify_all(struct kern_nexus * nx,uint32_t flags)1684 nx_netif_llink_notify_all(struct kern_nexus *nx, uint32_t flags)
1685 {
1686 struct nx_netif *nif;
1687 struct netif_llink *llink;
1688
1689 nif = NX_NETIF_PRIVATE(nx);
1690
1691 lck_rw_lock_shared(&nif->nif_llink_lock);
1692 STAILQ_FOREACH(llink, &nif->nif_llink_list, nll_link) {
1693 nx_netif_llink_notify(nx, llink, flags);
1694 }
1695 lck_rw_unlock_shared(&nif->nif_llink_lock);
1696 }
1697
1698 /*
1699 * if_start() callback for native Skywalk interfaces, registered
1700 * at ifnet_allocate_extended() time, and invoked by the ifnet
1701 * starter thread.
1702 */
1703 static void
nx_netif_doorbell_internal(struct ifnet * ifp,uint32_t flags)1704 nx_netif_doorbell_internal(struct ifnet *ifp, uint32_t flags)
1705 {
1706 if (__improbable(ifp->if_na == NULL)) {
1707 return;
1708 }
1709
1710 /*
1711 * Do this only if the nexus adapter is active, i.e. a channel
1712 * has been opened to it by the module above (flowswitch, etc.)
1713 */
1714 struct nexus_adapter *hwna = &NA(ifp)->nifna_up;
1715 if (__probable(NA_IS_ACTIVE(hwna))) {
1716 struct kern_nexus *nx = hwna->na_nx;
1717
1718 /* update our work timestamp */
1719 hwna->na_work_ts = _net_uptime;
1720
1721 if (NX_LLINK_PROV(nx)) {
1722 nx_netif_llink_notify_all(nx, flags);
1723 } else {
1724 struct __kern_channel_ring *kring;
1725
1726 /* for doorbell purposes, use TX ring 0 */
1727 kring = &hwna->na_tx_rings[0];
1728
1729 /* Issue a synchronous TX doorbell on the netif device ring */
1730 kring->ckr_na_sync(kring, PROC_NULL,
1731 (NA_SYNCF_NETIF_DOORBELL | NA_SYNCF_NETIF_IFSTART));
1732 }
1733 } else {
1734 struct netif_stats *nifs =
1735 &NX_NETIF_PRIVATE(hwna->na_nx)->nif_stats;
1736 STATS_INC(nifs, NETIF_STATS_DROP_NA_INACTIVE);
1737 }
1738 }
1739
1740 static void
nx_netif_doorbell(struct ifnet * ifp)1741 nx_netif_doorbell(struct ifnet *ifp)
1742 {
1743 nx_netif_doorbell_internal(ifp, NETIF_XMIT_FLAG_HOST);
1744 }
1745
1746 /*
1747 * TX sync callback, called from nx_netif_doorbell() where we'd expect to
1748 * perform synchronous TX doorbell to the driver, by invoking the driver's
1749 * doorbell callback directly in the same thread context. It is also called
1750 * when the layer above performs a TX sync operation, where we might need
1751 * to do an asynchronous doorbell instead, by simply calling ifnet_start().
1752 */
1753 static int
nx_netif_na_txsync(struct __kern_channel_ring * kring,struct proc * p,uint32_t flags)1754 nx_netif_na_txsync(struct __kern_channel_ring *kring, struct proc *p,
1755 uint32_t flags)
1756 {
1757 #pragma unused(p)
1758 struct ifnet *ifp = KRNA(kring)->na_ifp;
1759 boolean_t sync_only;
1760 int ret = 0;
1761
1762 ASSERT(ifp != NULL);
1763
1764 SK_DF(SK_VERB_NETIF | SK_VERB_SYNC | SK_VERB_TX,
1765 "%s(%d) kr \"%s\" (0x%llx) krflags 0x%b ring %u flags 0%x",
1766 sk_proc_name_address(p), sk_proc_pid(p), kring->ckr_name,
1767 SK_KVA(kring), kring->ckr_flags, CKRF_BITS, kring->ckr_ring_id,
1768 flags);
1769
1770 if (__improbable(!IF_FULLY_ATTACHED(ifp))) {
1771 SK_ERR("kr 0x%llx ifp %s (0x%llx), interface not attached",
1772 SK_KVA(kring), if_name(ifp), SK_KVA(ifp));
1773 return ENXIO;
1774 }
1775
1776 if (__improbable((ifp->if_start_flags & IFSF_FLOW_CONTROLLED) != 0)) {
1777 SK_DF(SK_VERB_SYNC | SK_VERB_TX, "kr 0x%llx ifp %s (0x%llx), "
1778 "flow control ON", SK_KVA(kring), if_name(ifp),
1779 SK_KVA(ifp));
1780 return ENXIO;
1781 }
1782
1783 /* update our work timestamp */
1784 KRNA(kring)->na_work_ts = _net_uptime;
1785
1786 sync_only = ((flags & NA_SYNCF_SYNC_ONLY) != 0) ||
1787 !KR_KERNEL_ONLY(kring);
1788 /* regular sync (reclaim) */
1789 if ((flags & NA_SYNCF_NETIF) != 0 || __improbable(sync_only)) {
1790 ret = nx_sync_tx(kring, (flags & NA_SYNCF_FORCE_RECLAIM) ||
1791 kring->ckr_pending_intr != 0);
1792 kring->ckr_pending_intr = 0;
1793
1794 /* direct user channels do not need to use the doorbell */
1795 if (__improbable(sync_only)) {
1796 return ret;
1797 }
1798 }
1799
1800 /*
1801 * Doorbell call. Here we do doorbell explicitly if the flag is
1802 * set or implicitly if we're opened directly by a user channel.
1803 * Synchronous vs. asynchronous depending on the context.
1804 */
1805 if (__probable((flags & NA_SYNCF_NETIF_DOORBELL) != 0)) {
1806 if ((flags & NA_SYNCF_NETIF_IFSTART) != 0) {
1807 ASSERT(!(flags & NA_SYNCF_NETIF_IFSTART) ||
1808 !(flags & NA_SYNCF_NETIF_ASYNC));
1809 nx_tx_doorbell(kring, (flags & NA_SYNCF_NETIF_ASYNC));
1810 } else {
1811 ifnet_start(ifp);
1812 }
1813 }
1814
1815 return ret;
1816 }
1817
1818 static int
nx_netif_na_rxsync(struct __kern_channel_ring * kring,struct proc * p,uint32_t flags)1819 nx_netif_na_rxsync(struct __kern_channel_ring *kring, struct proc *p,
1820 uint32_t flags)
1821 {
1822 #pragma unused(p)
1823 int ret;
1824
1825 SK_DF(SK_VERB_NETIF | SK_VERB_SYNC | SK_VERB_RX,
1826 "%s(%d) kr \"%s\" (0x%llx) krflags 0x%b ring %u flags 0%x",
1827 sk_proc_name_address(p), sk_proc_pid(p), kring->ckr_name,
1828 SK_KVA(kring), kring->ckr_flags, CKRF_BITS, kring->ckr_ring_id,
1829 flags);
1830
1831 ASSERT(kring->ckr_rhead <= kring->ckr_lim);
1832
1833 /* update our work timestamp */
1834 KRNA(kring)->na_work_ts = _net_uptime;
1835
1836 ret = nx_sync_rx(kring, (flags & NA_SYNCF_FORCE_READ) ||
1837 kring->ckr_pending_intr != 0);
1838 kring->ckr_pending_intr = 0;
1839
1840 return ret;
1841 }
1842
1843 static void
nx_netif_na_dtor(struct nexus_adapter * na)1844 nx_netif_na_dtor(struct nexus_adapter *na)
1845 {
1846 struct ifnet *__single ifp;
1847 struct nexus_netif_adapter *nifna = NIFNA(na);
1848
1849 SK_LOCK_ASSERT_HELD();
1850 ASSERT(na->na_type == NA_NETIF_DEV || na->na_type == NA_NETIF_HOST);
1851
1852 SK_DF(SK_VERB_NETIF, "na \"%s\" (0x%llx)", na->na_name, SK_KVA(na));
1853
1854 /*
1855 * If the finalizer callback hasn't been called for whatever
1856 * reasons, pick up the embryonic ifnet stored in na_private.
1857 * Otherwise, release the I/O refcnt of a non-NULL na_ifp.
1858 */
1859 if ((ifp = na->na_ifp) == NULL) {
1860 ifp = na->na_private;
1861 na->na_private = NULL;
1862 } else {
1863 ifnet_decr_iorefcnt(ifp);
1864 na->na_ifp = NULL;
1865 }
1866
1867 if (nifna->nifna_netif != NULL) {
1868 nx_netif_release(nifna->nifna_netif);
1869 nifna->nifna_netif = NULL;
1870 }
1871 ASSERT(SKYWALK_NATIVE(ifp));
1872 }
1873
1874 /*
1875 * Dispatch rx/tx interrupts to the channel rings.
1876 *
1877 * The 'notify' routine depends on what the ring is attached to.
1878 * - for a channel file descriptor, do an event wakeup on the individual
1879 * waitqueue, plus one on the global one if needed (see na_notify)
1880 * - for a device port connected to a FlowSwitch, call the proper
1881 * forwarding routine; see nx_fsw_tx_hwna_notify()
1882 * or nx_fsw_rx_hwna_notify().
1883 */
1884 int
nx_netif_common_intr(struct __kern_channel_ring * kring,struct proc * p,uint32_t flags,uint32_t * work_done)1885 nx_netif_common_intr(struct __kern_channel_ring *kring, struct proc *p,
1886 uint32_t flags, uint32_t *work_done)
1887 {
1888 struct netif_stats *nifs =
1889 &NX_NETIF_PRIVATE(KRNA(kring)->na_nx)->nif_stats;
1890 int (*notify)(struct __kern_channel_ring *kring,
1891 struct proc *, uint32_t flags);
1892 int ret;
1893
1894 KDBG((SK_KTRACE_NETIF_COMMON_INTR | DBG_FUNC_START), SK_KVA(kring));
1895
1896 SK_DF(SK_VERB_NETIF | SK_VERB_INTR |
1897 ((kring->ckr_tx == NR_RX) ? SK_VERB_RX : SK_VERB_TX),
1898 "na \"%s\" (0x%llx) kr \"%s\" (0x%llx) krflags 0x%b",
1899 KRNA(kring)->na_name, SK_KVA(KRNA(kring)), kring->ckr_name,
1900 SK_KVA(kring), kring->ckr_flags, CKRF_BITS);
1901
1902 /* update our work timestamp */
1903 KRNA(kring)->na_work_ts = _net_uptime;
1904
1905 kring->ckr_pending_intr++;
1906 if (work_done != NULL) {
1907 *work_done = 1; /* do not fire again */
1908 }
1909 /*
1910 * We can't be calling ckr_na_notify here since we could already be
1911 * intercepting it, else we'd end up recursively calling ourselves.
1912 * Use the original na_notify callback saved during na_activate, or in
1913 * the case when the module above us is the flowswitch, the notify
1914 * routine that it has installed in place of our original one.
1915 */
1916 if (__probable(!KR_DROP(kring) &&
1917 (notify = kring->ckr_netif_notify) != NULL)) {
1918 ret = notify(kring, p, flags);
1919 } else {
1920 /*
1921 * If the ring is in drop mode, pretend as if it's busy.
1922 * This allows the mitigation thread to pause for a while
1923 * before attempting again.
1924 */
1925 ret = EBUSY;
1926 }
1927 if (__improbable(ret != 0)) {
1928 switch (kring->ckr_tx) {
1929 case NR_RX:
1930 if (ret == EBUSY) {
1931 STATS_INC(nifs, NETIF_STATS_RX_IRQ_BUSY);
1932 } else if (ret == EAGAIN) {
1933 STATS_INC(nifs, NETIF_STATS_RX_IRQ_AGAIN);
1934 } else {
1935 STATS_INC(nifs, NETIF_STATS_RX_IRQ_ERR);
1936 }
1937 break;
1938
1939 case NR_TX:
1940 if (ret == EBUSY) {
1941 STATS_INC(nifs, NETIF_STATS_TX_IRQ_BUSY);
1942 } else if (ret == EAGAIN) {
1943 STATS_INC(nifs, NETIF_STATS_TX_IRQ_AGAIN);
1944 } else {
1945 STATS_INC(nifs, NETIF_STATS_TX_IRQ_ERR);
1946 }
1947 break;
1948
1949 default:
1950 break;
1951 }
1952 }
1953
1954 KDBG((SK_KTRACE_NETIF_COMMON_INTR | DBG_FUNC_END), SK_KVA(kring), ret);
1955
1956 return ret;
1957 }
1958
1959 static int
nx_netif_na_notify_tx(struct __kern_channel_ring * kring,struct proc * p,uint32_t flags)1960 nx_netif_na_notify_tx(struct __kern_channel_ring *kring, struct proc *p,
1961 uint32_t flags)
1962 {
1963 return nx_netif_mit_tx_intr(kring, p, flags, NULL);
1964 }
1965
1966 static int
nx_netif_na_notify_rx(struct __kern_channel_ring * kring,struct proc * p,uint32_t flags)1967 nx_netif_na_notify_rx(struct __kern_channel_ring *kring, struct proc *p,
1968 uint32_t flags)
1969 {
1970 int ret;
1971
1972 /*
1973 * In the event the mitigation thread is disabled, protect
1974 * against recursion by detecting if we're already in the
1975 * context of an RX notify. IOSkywalkFamily may invoke the
1976 * notify callback as part of its RX sync callback.
1977 */
1978 if (__probable(!sk_is_rx_notify_protected())) {
1979 sk_protect_t protect;
1980 uint32_t work_done;
1981
1982 protect = sk_rx_notify_protect();
1983 ret = nx_netif_mit_rx_intr(kring, p, flags, &work_done);
1984 sk_sync_unprotect(protect);
1985 } else {
1986 ret = EAGAIN;
1987 }
1988
1989 return ret;
1990 }
1991
1992 static int
nx_netif_na_notify_rx_redirect(struct __kern_channel_ring * kring,struct proc * p,uint32_t flags)1993 nx_netif_na_notify_rx_redirect(struct __kern_channel_ring *kring, struct proc *p,
1994 uint32_t flags)
1995 {
1996 struct netif_stats *nifs =
1997 &NX_NETIF_PRIVATE(KRNA(kring)->na_nx)->nif_stats;
1998 uint32_t work_done;
1999
2000 ASSERT(kring->ckr_tx == NR_RX);
2001 STATS_INC(nifs, NETIF_STATS_RX_IRQ);
2002 return nx_netif_common_intr(kring, p, flags, &work_done);
2003 }
2004
2005 void
nx_netif_mit_config(struct nexus_netif_adapter * nifna,boolean_t * tx_mit,boolean_t * tx_mit_simple,boolean_t * rx_mit,boolean_t * rx_mit_simple)2006 nx_netif_mit_config(struct nexus_netif_adapter *nifna,
2007 boolean_t *tx_mit, boolean_t *tx_mit_simple,
2008 boolean_t *rx_mit, boolean_t *rx_mit_simple)
2009 {
2010 struct nx_netif *nif = nifna->nifna_netif;
2011
2012 /*
2013 * TX mitigation is disabled by default, but can be
2014 * overridden via "sk_netif_tx_mit=N" boot-arg, where
2015 * N is one of SK_NETIF_MIT_FORCE_* values.
2016 */
2017 *tx_mit = *tx_mit_simple = FALSE;
2018 switch (sk_netif_tx_mit) {
2019 case SK_NETIF_MIT_FORCE_SIMPLE:
2020 *tx_mit_simple = TRUE;
2021 OS_FALLTHROUGH;
2022 case SK_NETIF_MIT_FORCE_ADVANCED:
2023 *tx_mit = TRUE;
2024 break;
2025 case SK_NETIF_MIT_FORCE_OFF:
2026 case SK_NETIF_MIT_AUTO:
2027 ASSERT(*tx_mit == FALSE);
2028 break;
2029 default:
2030 VERIFY(0);
2031 /* NOTREACHED */
2032 __builtin_unreachable();
2033 }
2034
2035 /*
2036 * RX mitigation is enabled by default only for BSD-style
2037 * virtual network interfaces, but can be overridden
2038 * via "sk_netif_rx_mit=N" boot-arg, where N is one of
2039 * SK_NETIF_MIT_FORCE_* values.
2040 */
2041 *rx_mit = *rx_mit_simple = FALSE;
2042 switch (sk_netif_rx_mit) {
2043 case SK_NETIF_MIT_FORCE_OFF:
2044 ASSERT(*rx_mit == FALSE);
2045 break;
2046 case SK_NETIF_MIT_FORCE_SIMPLE:
2047 *rx_mit_simple = TRUE;
2048 OS_FALLTHROUGH;
2049 case SK_NETIF_MIT_FORCE_ADVANCED:
2050 *rx_mit = TRUE;
2051 break;
2052 case SK_NETIF_MIT_AUTO:
2053 *rx_mit_simple = TRUE;
2054 /*
2055 * Enable RX mitigation thread only for BSD-style virtual (and
2056 * regular) interfaces, since otherwise we may run out of stack
2057 * when subjected to IPsec processing, etc.
2058 */
2059 *rx_mit = (NX_PROV(nifna->nifna_up.na_nx)->nxprov_flags &
2060 NXPROVF_VIRTUAL_DEVICE) && !NETIF_IS_LOW_LATENCY(nif);
2061 break;
2062 default:
2063 VERIFY(0);
2064 /* NOTREACHED */
2065 __builtin_unreachable();
2066 }
2067 }
2068
2069 static int
nx_netif_na_activate(struct nexus_adapter * na,na_activate_mode_t mode)2070 nx_netif_na_activate(struct nexus_adapter *na, na_activate_mode_t mode)
2071 {
2072 struct nexus_netif_adapter *nifna = (struct nexus_netif_adapter *)na;
2073 boolean_t tx_mit, rx_mit, tx_mit_simple, rx_mit_simple;
2074 struct nx_netif *nif = nifna->nifna_netif;
2075 struct ifnet *ifp = na->na_ifp;
2076 int error = 0;
2077 uint32_t r;
2078 /* TODO -fbounds-safety: Remove tmp and use __counted_by_or_null */
2079 struct nx_netif_mit *mit_tmp;
2080 uint32_t nrings;
2081
2082 ASSERT(na->na_type == NA_NETIF_DEV);
2083 ASSERT(!(na->na_flags & NAF_HOST_ONLY));
2084
2085 SK_DF(SK_VERB_NETIF, "na \"%s\" (0x%llx) %s [%s]", na->na_name,
2086 SK_KVA(na), ifp->if_xname, na_activate_mode2str(mode));
2087
2088 switch (mode) {
2089 case NA_ACTIVATE_MODE_ON:
2090 ASSERT(SKYWALK_CAPABLE(ifp));
2091
2092 nx_netif_mit_config(nifna, &tx_mit, &tx_mit_simple,
2093 &rx_mit, &rx_mit_simple);
2094
2095 /*
2096 * Init the mitigation support on all the dev TX rings.
2097 */
2098 if (tx_mit) {
2099 nrings = na_get_nrings(na, NR_TX);
2100 mit_tmp = skn_alloc_type_array(tx_on, struct nx_netif_mit,
2101 nrings, Z_WAITOK, skmem_tag_netif_mit);
2102 if (mit_tmp == NULL) {
2103 SK_ERR("TX mitigation allocation failed");
2104 error = ENOMEM;
2105 goto out;
2106 }
2107 nifna->nifna_tx_mit = mit_tmp;
2108 nifna->nifna_tx_mit_count = nrings;
2109 } else {
2110 ASSERT(nifna->nifna_tx_mit == NULL);
2111 }
2112
2113 /*
2114 * Init the mitigation support on all the dev RX rings.
2115 */
2116 if (rx_mit) {
2117 nrings = na_get_nrings(na, NR_RX);
2118 mit_tmp = skn_alloc_type_array(rx_on, struct nx_netif_mit,
2119 nrings, Z_WAITOK, skmem_tag_netif_mit);
2120 if (mit_tmp == NULL) {
2121 SK_ERR("RX mitigation allocation failed");
2122 if (nifna->nifna_tx_mit != NULL) {
2123 skn_free_type_array_counted_by(rx_fail,
2124 struct nx_netif_mit,
2125 nifna->nifna_tx_mit_count,
2126 nifna->nifna_tx_mit);
2127 }
2128 error = ENOMEM;
2129 goto out;
2130 }
2131 nifna->nifna_rx_mit = mit_tmp;
2132 nifna->nifna_rx_mit_count = nrings;
2133 } else {
2134 ASSERT(nifna->nifna_rx_mit == NULL);
2135 }
2136
2137 /* intercept na_notify callback on the TX rings */
2138 for (r = 0; r < na_get_nrings(na, NR_TX); r++) {
2139 na->na_tx_rings[r].ckr_netif_notify =
2140 na->na_tx_rings[r].ckr_na_notify;
2141 na->na_tx_rings[r].ckr_na_notify =
2142 nx_netif_na_notify_tx;
2143 if (nifna->nifna_tx_mit != NULL) {
2144 nx_netif_mit_init(nif, ifp,
2145 &nifna->nifna_tx_mit[r],
2146 &na->na_tx_rings[r], tx_mit_simple);
2147 }
2148 }
2149
2150 /* intercept na_notify callback on the RX rings */
2151 for (r = 0; r < na_get_nrings(na, NR_RX); r++) {
2152 na->na_rx_rings[r].ckr_netif_notify =
2153 na->na_rx_rings[r].ckr_na_notify;
2154 na->na_rx_rings[r].ckr_na_notify = IFNET_IS_REDIRECT(ifp) ?
2155 nx_netif_na_notify_rx_redirect : nx_netif_na_notify_rx;
2156 if (nifna->nifna_rx_mit != NULL) {
2157 nx_netif_mit_init(nif, ifp,
2158 &nifna->nifna_rx_mit[r],
2159 &na->na_rx_rings[r], rx_mit_simple);
2160 }
2161 }
2162 nx_netif_filter_enable(nif);
2163 nx_netif_flow_enable(nif);
2164 os_atomic_or(&na->na_flags, NAF_ACTIVE, relaxed);
2165
2166 /* steer all start requests to netif; this must not fail */
2167 lck_mtx_lock(&ifp->if_start_lock);
2168 error = ifnet_set_start_handler(ifp, nx_netif_doorbell);
2169 VERIFY(error == 0);
2170 lck_mtx_unlock(&ifp->if_start_lock);
2171 break;
2172
2173 case NA_ACTIVATE_MODE_DEFUNCT:
2174 ASSERT(SKYWALK_CAPABLE(ifp));
2175 break;
2176
2177 case NA_ACTIVATE_MODE_OFF:
2178 /*
2179 * Note that here we cannot assert SKYWALK_CAPABLE()
2180 * as we're called in the destructor path.
2181 */
2182 os_atomic_andnot(&na->na_flags, NAF_ACTIVE, relaxed);
2183 nx_netif_flow_disable(nif);
2184 nx_netif_filter_disable(nif);
2185
2186 /*
2187 * Here we may block while holding sk_lock, but because
2188 * we've cleared NAF_ACTIVE above, kern_channel_tx_refill()
2189 * should immediately return. A better approach would be
2190 * to drop sk_lock and add a monitor for this routine.
2191 */
2192 lck_mtx_lock(&ifp->if_start_lock);
2193 while (ifp->if_start_active != 0) {
2194 ++ifp->if_start_waiters;
2195 (void) msleep(&ifp->if_start_waiters,
2196 &ifp->if_start_lock, (PZERO - 1),
2197 na->na_name, NULL);
2198 }
2199 /* steer all start requests to default handler */
2200 ifnet_reset_start_handler(ifp);
2201 lck_mtx_unlock(&ifp->if_start_lock);
2202
2203 /* reset all TX notify callbacks */
2204 for (r = 0; r < na_get_nrings(na, NR_TX); r++) {
2205 na->na_tx_rings[r].ckr_na_notify =
2206 na->na_tx_rings[r].ckr_netif_notify;
2207 na->na_tx_rings[r].ckr_netif_notify = NULL;
2208 if (nifna->nifna_tx_mit != NULL) {
2209 na->na_tx_rings[r].ckr_netif_mit_stats = NULL;
2210 nx_netif_mit_cleanup(&nifna->nifna_tx_mit[r]);
2211 }
2212 }
2213
2214 if (nifna->nifna_tx_mit != NULL) {
2215 skn_free_type_array_counted_by(tx_off, struct nx_netif_mit,
2216 nifna->nifna_tx_mit_count, nifna->nifna_tx_mit);
2217 }
2218
2219 /* reset all RX notify callbacks */
2220 for (r = 0; r < na_get_nrings(na, NR_RX); r++) {
2221 na->na_rx_rings[r].ckr_na_notify =
2222 na->na_rx_rings[r].ckr_netif_notify;
2223 na->na_rx_rings[r].ckr_netif_notify = NULL;
2224 if (nifna->nifna_rx_mit != NULL) {
2225 na->na_rx_rings[r].ckr_netif_mit_stats = NULL;
2226 nx_netif_mit_cleanup(&nifna->nifna_rx_mit[r]);
2227 }
2228 }
2229 if (nifna->nifna_rx_mit != NULL) {
2230 skn_free_type_array_counted_by(rx_off, struct nx_netif_mit,
2231 nifna->nifna_rx_mit_count, nifna->nifna_rx_mit);
2232 }
2233 break;
2234
2235 default:
2236 VERIFY(0);
2237 /* NOTREACHED */
2238 __builtin_unreachable();
2239 }
2240 out:
2241 return error;
2242 }
2243
2244 SK_NO_INLINE_ATTRIBUTE
2245 static int
nx_netif_attach(struct kern_nexus * nx,struct ifnet * ifp)2246 nx_netif_attach(struct kern_nexus *nx, struct ifnet *ifp)
2247 {
2248 struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
2249 struct nxprov_params *nxp = NX_PROV(nx)->nxprov_params;
2250 struct nexus_netif_adapter *devnifna = NULL;
2251 struct nexus_netif_adapter *hostnifna = NULL;
2252 struct nexus_adapter *__single devna = NULL;
2253 struct nexus_adapter *__single hostna = NULL;
2254 boolean_t embryonic = FALSE;
2255 int retval = 0;
2256 uint32_t na_flags;
2257
2258 SK_LOCK_ASSERT_HELD();
2259 ASSERT(SKYWALK_NATIVE(ifp));
2260 ASSERT(!SKYWALK_CAPABLE(ifp));
2261 ASSERT(ifp->if_na == NULL);
2262 ASSERT(ifp->if_na_ops == NULL);
2263
2264 devnifna = na_netif_alloc(Z_WAITOK);
2265 hostnifna = na_netif_alloc(Z_WAITOK);
2266
2267 /*
2268 * We can be called for two different interface states:
2269 *
2270 * Fully attached: get an io ref count; upon success, this
2271 * holds a reference to the ifnet for the ifp pointer stored
2272 * in 'na_ifp' down below for both adapters.
2273 *
2274 * Embryonic: temporary hold the ifnet in na_private, which
2275 * upon a successful ifnet_attach(), will be moved over to
2276 * the 'na_ifp' with an io ref count held.
2277 *
2278 * The ifnet in 'na_ifp' will be released by na_release_locked().
2279 */
2280 if (!ifnet_is_attached(ifp, 1)) {
2281 if (!(ifp->if_refflags & IFRF_EMBRYONIC)) {
2282 ifp = NULL;
2283 retval = ENXIO;
2284 goto err;
2285 }
2286 embryonic = TRUE;
2287 }
2288
2289 /* initialize the device netif adapter */
2290 devnifna->nifna_netif = nif;
2291 nx_netif_retain(nif);
2292 devna = &devnifna->nifna_up;
2293 devna->na_type = NA_NETIF_DEV;
2294 devna->na_free = na_netif_free;
2295 (void) strlcpy(devna->na_name, ifp->if_xname, sizeof(devna->na_name));
2296 uuid_generate_random(devna->na_uuid);
2297 if (embryonic) {
2298 /*
2299 * We will move this over to na_ifp once
2300 * the interface is fully attached.
2301 */
2302 devna->na_private = ifp;
2303 ASSERT(devna->na_ifp == NULL);
2304 } else {
2305 ASSERT(devna->na_private == NULL);
2306 /* use I/O refcnt from ifnet_is_attached() */
2307 devna->na_ifp = ifp;
2308 }
2309 devna->na_activate = nx_netif_na_activate;
2310 devna->na_txsync = nx_netif_na_txsync;
2311 devna->na_rxsync = nx_netif_na_rxsync;
2312 devna->na_dtor = nx_netif_na_dtor;
2313 devna->na_krings_create = nx_netif_dev_krings_create;
2314 devna->na_krings_delete = nx_netif_dev_krings_delete;
2315 devna->na_special = nx_netif_na_special;
2316
2317 na_flags = NAF_NATIVE;
2318 if (NX_PROV(nx)->nxprov_flags & NXPROVF_VIRTUAL_DEVICE) {
2319 na_flags |= NAF_VIRTUAL_DEVICE;
2320 }
2321 if (NX_LLINK_PROV(nx)) {
2322 /*
2323 * while operating in logical link mode, we don't need to
2324 * create backing memory regions for the rings as they are
2325 * not used.
2326 */
2327 na_flags |= NAF_MEM_NO_INIT;
2328 }
2329 os_atomic_or(&devna->na_flags, na_flags, relaxed);
2330 *(nexus_stats_type_t *)(uintptr_t)&devna->na_stats_type =
2331 NEXUS_STATS_TYPE_INVALID;
2332
2333 na_set_nrings(devna, NR_TX, nxp->nxp_tx_rings);
2334 na_set_nrings(devna, NR_RX, nxp->nxp_rx_rings);
2335 na_set_nslots(devna, NR_TX, nxp->nxp_tx_slots);
2336 na_set_nslots(devna, NR_RX, nxp->nxp_rx_slots);
2337 /*
2338 * Verify upper bounds; the parameters must have already been
2339 * validated by nxdom_prov_params() by the time we get here.
2340 */
2341 ASSERT(na_get_nrings(devna, NR_TX) <= NX_DOM(nx)->nxdom_tx_rings.nb_max);
2342 ASSERT(na_get_nrings(devna, NR_RX) <= NX_DOM(nx)->nxdom_rx_rings.nb_max);
2343 ASSERT(na_get_nslots(devna, NR_TX) <= NX_DOM(nx)->nxdom_tx_slots.nb_max);
2344 ASSERT(na_get_nslots(devna, NR_RX) <= NX_DOM(nx)->nxdom_rx_slots.nb_max);
2345
2346 na_attach_common(devna, nx, &nx_netif_prov_s);
2347
2348 if ((retval = NX_DOM_PROV(nx)->nxdom_prov_mem_new(NX_DOM_PROV(nx),
2349 nx, devna)) != 0) {
2350 ASSERT(devna->na_arena == NULL);
2351 goto err;
2352 }
2353 ASSERT(devna->na_arena != NULL);
2354
2355 *(uint32_t *)(uintptr_t)&devna->na_flowadv_max = nxp->nxp_flowadv_max;
2356 ASSERT(devna->na_flowadv_max == 0 ||
2357 skmem_arena_nexus(devna->na_arena)->arn_flowadv_obj != NULL);
2358
2359 /* setup packet copy routines */
2360 if (skmem_arena_nexus(devna->na_arena)->arn_rx_pp->pp_max_frags > 1) {
2361 nif->nif_pkt_copy_from_mbuf = pkt_copy_multi_buflet_from_mbuf;
2362 nif->nif_pkt_copy_to_mbuf = pkt_copy_multi_buflet_to_mbuf;
2363 nif->nif_pkt_copy_from_pkt = pkt_copy_multi_buflet_from_pkt;
2364 } else {
2365 nif->nif_pkt_copy_from_mbuf = pkt_copy_from_mbuf;
2366 nif->nif_pkt_copy_to_mbuf = pkt_copy_to_mbuf;
2367 nif->nif_pkt_copy_from_pkt = pkt_copy_from_pkt;
2368 }
2369
2370 /* initialize the host netif adapter */
2371 hostnifna->nifna_netif = nif;
2372 nx_netif_retain(nif);
2373 hostna = &hostnifna->nifna_up;
2374 (void) snprintf(hostna->na_name, sizeof(hostna->na_name),
2375 "%s^", devna->na_name);
2376 uuid_generate_random(hostna->na_uuid);
2377 if (embryonic) {
2378 /*
2379 * We will move this over to na_ifp once
2380 * the interface is fully attached.
2381 */
2382 hostna->na_private = ifp;
2383 ASSERT(hostna->na_ifp == NULL);
2384 } else {
2385 ASSERT(hostna->na_private == NULL);
2386 hostna->na_ifp = devna->na_ifp;
2387 ifnet_incr_iorefcnt(hostna->na_ifp);
2388 }
2389 hostna->na_type = NA_NETIF_HOST;
2390 hostna->na_free = na_netif_free;
2391 hostna->na_activate = nx_netif_host_na_activate;
2392 hostna->na_txsync = nx_netif_host_na_txsync;
2393 hostna->na_rxsync = nx_netif_host_na_rxsync;
2394 hostna->na_dtor = nx_netif_na_dtor;
2395 hostna->na_krings_create = nx_netif_host_krings_create;
2396 hostna->na_krings_delete = nx_netif_host_krings_delete;
2397 hostna->na_special = nx_netif_host_na_special;
2398
2399 na_flags = NAF_HOST_ONLY | NAF_NATIVE;
2400 if (NX_LLINK_PROV(nx)) {
2401 /*
2402 * while operating in logical link mode, we don't need to
2403 * create backing memory regions for the rings as they are
2404 * not used.
2405 */
2406 na_flags |= NAF_MEM_NO_INIT;
2407 }
2408 os_atomic_or(&hostna->na_flags, na_flags, relaxed);
2409 *(nexus_stats_type_t *)(uintptr_t)&hostna->na_stats_type =
2410 NEXUS_STATS_TYPE_INVALID;
2411
2412 na_set_nrings(hostna, NR_TX, 1);
2413 na_set_nrings(hostna, NR_RX, 1);
2414 na_set_nslots(hostna, NR_TX, nxp->nxp_tx_slots);
2415 na_set_nslots(hostna, NR_RX, nxp->nxp_rx_slots);
2416
2417 na_attach_common(hostna, nx, &nx_netif_prov_s);
2418
2419 if ((retval = NX_DOM_PROV(nx)->nxdom_prov_mem_new(NX_DOM_PROV(nx),
2420 nx, hostna)) != 0) {
2421 ASSERT(hostna->na_arena == NULL);
2422 goto err;
2423 }
2424 ASSERT(hostna->na_arena != NULL);
2425
2426 *(uint32_t *)(uintptr_t)&hostna->na_flowadv_max = nxp->nxp_flowadv_max;
2427 ASSERT(hostna->na_flowadv_max == 0 ||
2428 skmem_arena_nexus(hostna->na_arena)->arn_flowadv_obj != NULL);
2429
2430 /* adjust the classq packet drop limit */
2431 if (embryonic) {
2432 uint32_t drop_lim;
2433 struct kern_pbufpool_memory_info pp_info;
2434
2435 retval = kern_pbufpool_get_memory_info(nx->nx_tx_pp, &pp_info);
2436 VERIFY(retval == 0);
2437
2438 /* set the drop limit as 80% of size of packet pool */
2439 drop_lim = (pp_info.kpm_packets * 4) / 5;
2440 VERIFY(drop_lim != 0);
2441 IFCQ_PKT_DROP_LIMIT(ifp->if_snd) = drop_lim;
2442 }
2443
2444 /* these will be undone by destructor */
2445 ifp->if_na_ops = &na_netif_ops;
2446 ifp->if_na = devnifna;
2447 na_retain_locked(devna);
2448 na_retain_locked(hostna);
2449
2450 SKYWALK_SET_CAPABLE(ifp);
2451
2452 NETIF_WLOCK(nif);
2453 nif->nif_ifp = ifp;
2454 nif->nif_netif_nxadv = nx->nx_adv.netif_nxv_adv;
2455 retval = nx_port_alloc(nx, NEXUS_PORT_NET_IF_DEV, NULL, &devna,
2456 kernproc);
2457 ASSERT(retval == 0);
2458 retval = nx_port_alloc(nx, NEXUS_PORT_NET_IF_HOST, NULL, &hostna,
2459 kernproc);
2460 ASSERT(retval == 0);
2461 NETIF_WUNLOCK(nif);
2462
2463 #if SK_LOG
2464 uuid_string_t uuidstr;
2465 SK_DF(SK_VERB_NETIF, "devna: \"%s\"", devna->na_name);
2466 SK_DF(SK_VERB_NETIF, " UUID: %s",
2467 sk_uuid_unparse(devna->na_uuid, uuidstr));
2468 SK_DF(SK_VERB_NETIF, " nx: 0x%llx (\"%s\":\"%s\")",
2469 SK_KVA(devna->na_nx), NX_DOM(devna->na_nx)->nxdom_name,
2470 NX_DOM_PROV(devna->na_nx)->nxdom_prov_name);
2471 SK_DF(SK_VERB_NETIF, " flags: 0x%b", devna->na_flags, NAF_BITS);
2472 SK_DF(SK_VERB_NETIF, " flowadv_max: %u", devna->na_flowadv_max);
2473 SK_DF(SK_VERB_NETIF, " rings: tx %u rx %u",
2474 na_get_nrings(devna, NR_TX), na_get_nrings(devna, NR_RX));
2475 SK_DF(SK_VERB_NETIF, " slots: tx %u rx %u",
2476 na_get_nslots(devna, NR_TX), na_get_nslots(devna, NR_RX));
2477 #if CONFIG_NEXUS_USER_PIPE
2478 SK_DF(SK_VERB_NETIF, " next_pipe: %u", devna->na_next_pipe);
2479 SK_DF(SK_VERB_NETIF, " max_pipes: %u", devna->na_max_pipes);
2480 #endif /* CONFIG_NEXUS_USER_PIPE */
2481 SK_DF(SK_VERB_NETIF, " ifp: 0x%llx %s [ioref %u]",
2482 SK_KVA(ifp), ifp->if_xname, ifp->if_refio);
2483 SK_DF(SK_VERB_NETIF, "hostna: \"%s\"", hostna->na_name);
2484 SK_DF(SK_VERB_NETIF, " UUID: %s",
2485 sk_uuid_unparse(hostna->na_uuid, uuidstr));
2486 SK_DF(SK_VERB_NETIF, " nx: 0x%llx (\"%s\":\"%s\")",
2487 SK_KVA(hostna->na_nx), NX_DOM(hostna->na_nx)->nxdom_name,
2488 NX_DOM_PROV(hostna->na_nx)->nxdom_prov_name);
2489 SK_DF(SK_VERB_NETIF, " flags: 0x%b",
2490 hostna->na_flags, NAF_BITS);
2491 SK_DF(SK_VERB_NETIF, " flowadv_max: %u", hostna->na_flowadv_max);
2492 SK_DF(SK_VERB_NETIF, " rings: tx %u rx %u",
2493 na_get_nrings(hostna, NR_TX), na_get_nrings(hostna, NR_RX));
2494 SK_DF(SK_VERB_NETIF, " slots: tx %u rx %u",
2495 na_get_nslots(hostna, NR_TX), na_get_nslots(hostna, NR_RX));
2496 #if CONFIG_NEXUS_USER_PIPE
2497 SK_DF(SK_VERB_NETIF, " next_pipe: %u", hostna->na_next_pipe);
2498 SK_DF(SK_VERB_NETIF, " max_pipes: %u", hostna->na_max_pipes);
2499 #endif /* CONFIG_NEXUS_USER_PIPE */
2500 SK_DF(SK_VERB_NETIF, " ifp: 0x%llx %s [ioref %u]",
2501 SK_KVA(ifp), ifp->if_xname, ifp->if_refio);
2502 #endif /* SK_LOG */
2503
2504 err:
2505 if (retval != 0) {
2506 if (ifp != NULL) {
2507 if (!embryonic) {
2508 ifnet_decr_iorefcnt(ifp);
2509 }
2510 ifp = NULL;
2511 }
2512 if (devna != NULL) {
2513 if (devna->na_arena != NULL) {
2514 skmem_arena_release(devna->na_arena);
2515 devna->na_arena = NULL;
2516 }
2517 if (devna->na_ifp != NULL) {
2518 ifnet_decr_iorefcnt(devna->na_ifp);
2519 devna->na_ifp = NULL;
2520 }
2521 devna->na_private = NULL;
2522 }
2523 if (hostna != NULL) {
2524 if (hostna->na_arena != NULL) {
2525 skmem_arena_release(hostna->na_arena);
2526 hostna->na_arena = NULL;
2527 }
2528 if (hostna->na_ifp != NULL) {
2529 ifnet_decr_iorefcnt(hostna->na_ifp);
2530 hostna->na_ifp = NULL;
2531 }
2532 hostna->na_private = NULL;
2533 }
2534 if (devnifna != NULL) {
2535 if (devnifna->nifna_netif != NULL) {
2536 nx_netif_release(devnifna->nifna_netif);
2537 devnifna->nifna_netif = NULL;
2538 }
2539 na_netif_free((struct nexus_adapter *)devnifna);
2540 }
2541 if (hostnifna != NULL) {
2542 if (hostnifna->nifna_netif != NULL) {
2543 nx_netif_release(hostnifna->nifna_netif);
2544 hostnifna->nifna_netif = NULL;
2545 }
2546 na_netif_free((struct nexus_adapter *)hostnifna);
2547 }
2548 }
2549 return retval;
2550 }
2551
2552 /*
2553 * Any per-netif state that can be discovered at attach time should be
2554 * initialized here.
2555 */
2556 static void
nx_netif_flags_init(struct nx_netif * nif)2557 nx_netif_flags_init(struct nx_netif *nif)
2558 {
2559 ifnet_t ifp = nif->nif_ifp;
2560 struct kern_nexus *nx = nif->nif_nx;
2561 struct nexus_adapter *devna = nx_port_get_na(nx, NEXUS_PORT_NET_IF_DEV);
2562
2563 switch (devna->na_type) {
2564 case NA_NETIF_DEV:
2565 if (strlcmp(sk_ll_prefix, ifp->if_name, sizeof(sk_ll_prefix)) == 0) {
2566 nif->nif_flags |= NETIF_FLAG_LOW_LATENCY;
2567 if_set_xflags(ifp, IFXF_LOW_LATENCY);
2568 }
2569 break;
2570 case NA_NETIF_COMPAT_DEV:
2571 nif->nif_flags |= NETIF_FLAG_COMPAT;
2572 break;
2573 default:
2574 break;
2575 }
2576 }
2577
2578 /*
2579 * This is also supposed to check for any inconsistent state at detach time.
2580 */
2581 static void
nx_netif_flags_fini(struct nx_netif * nif)2582 nx_netif_flags_fini(struct nx_netif *nif)
2583 {
2584 ifnet_t ifp = nif->nif_ifp;
2585
2586 if (ifp != NULL) {
2587 if_clear_xflags(ifp, IFXF_LOW_LATENCY);
2588 }
2589 nif->nif_flags = 0;
2590 }
2591
2592 SK_NO_INLINE_ATTRIBUTE
2593 static void
nx_netif_callbacks_init(struct nx_netif * nif)2594 nx_netif_callbacks_init(struct nx_netif *nif)
2595 {
2596 ifnet_t ifp = nif->nif_ifp;
2597
2598 /*
2599 * XXX
2600 * This function is meant to be called by na_netif_finalize(), which is
2601 * called by ifnet_attach() while holding if_lock exclusively.
2602 */
2603 ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_EXCLUSIVE);
2604 if (ifnet_is_low_latency(ifp)) {
2605 ifnet_set_detach_notify_locked(ifp,
2606 nx_netif_llw_detach_notify, ifp->if_na);
2607 }
2608 }
2609
2610 SK_NO_INLINE_ATTRIBUTE
2611 static void
nx_netif_callbacks_fini(struct nx_netif * nif)2612 nx_netif_callbacks_fini(struct nx_netif *nif)
2613 {
2614 ifnet_t ifp = nif->nif_ifp;
2615
2616 if (ifnet_is_low_latency(ifp)) {
2617 ifnet_set_detach_notify(ifp, NULL, NULL);
2618 }
2619 }
2620
2621 static void
configure_capab_interface_advisory(struct nx_netif * nif,nxprov_capab_config_fn_t capab_fn)2622 configure_capab_interface_advisory(struct nx_netif *nif,
2623 nxprov_capab_config_fn_t capab_fn)
2624 {
2625 struct kern_nexus_capab_interface_advisory capab;
2626 struct kern_nexus *nx = nif->nif_nx;
2627 uint32_t capab_len;
2628 int error;
2629
2630 /* check/configure interface advisory notifications */
2631 if ((nif->nif_ifp->if_eflags & IFEF_ADV_REPORT) == 0) {
2632 return;
2633 }
2634 bzero(&capab, sizeof(capab));
2635 capab.kncia_version =
2636 KERN_NEXUS_CAPAB_INTERFACE_ADVISORY_VERSION_1;
2637 *__DECONST(kern_nexus_capab_interface_advisory_notify_fn_t *,
2638 &(capab.kncia_notify)) = nx_netif_interface_advisory_notify;
2639 *__DECONST(void **, &(capab.kncia_kern_context)) = nx;
2640 capab_len = sizeof(capab);
2641 error = capab_fn(NX_PROV(nx), nx,
2642 KERN_NEXUS_CAPAB_INTERFACE_ADVISORY, &capab, &capab_len);
2643 if (error != 0) {
2644 DTRACE_SKYWALK2(interface__advisory__capab__error,
2645 struct nx_netif *, nif, int, error);
2646 return;
2647 }
2648 VERIFY(capab.kncia_config != NULL);
2649 VERIFY(capab.kncia_provider_context != NULL);
2650 nif->nif_intf_adv_config = capab.kncia_config;
2651 nif->nif_intf_adv_prov_ctx = capab.kncia_provider_context;
2652 nif->nif_extended_capabilities |= NETIF_CAPAB_INTERFACE_ADVISORY;
2653 }
2654
2655 static void
unconfigure_capab_interface_advisory(struct nx_netif * nif)2656 unconfigure_capab_interface_advisory(struct nx_netif *nif)
2657 {
2658 if ((nif->nif_extended_capabilities & NETIF_CAPAB_INTERFACE_ADVISORY) == 0) {
2659 return;
2660 }
2661 nif->nif_intf_adv_config = NULL;
2662 nif->nif_intf_adv_prov_ctx = NULL;
2663 nif->nif_extended_capabilities &= ~NETIF_CAPAB_INTERFACE_ADVISORY;
2664 }
2665
2666 static void
configure_capab_qset_extensions(struct nx_netif * nif,nxprov_capab_config_fn_t capab_fn)2667 configure_capab_qset_extensions(struct nx_netif *nif,
2668 nxprov_capab_config_fn_t capab_fn)
2669 {
2670 struct kern_nexus_capab_qset_extensions capab;
2671 struct kern_nexus *nx = nif->nif_nx;
2672 uint32_t capab_len;
2673 int error;
2674
2675 if (!NX_LLINK_PROV(nx)) {
2676 DTRACE_SKYWALK1(not__llink__prov, struct nx_netif *, nif);
2677 return;
2678 }
2679 bzero(&capab, sizeof(capab));
2680 capab.cqe_version = KERN_NEXUS_CAPAB_QSET_EXTENSIONS_VERSION_1;
2681 capab_len = sizeof(capab);
2682 error = capab_fn(NX_PROV(nx), nx,
2683 KERN_NEXUS_CAPAB_QSET_EXTENSIONS, &capab, &capab_len);
2684 if (error != 0) {
2685 DTRACE_SKYWALK2(qset__extensions__capab__error,
2686 struct nx_netif *, nif, int, error);
2687 return;
2688 }
2689 VERIFY(capab.cqe_notify_steering_info != NULL);
2690 VERIFY(capab.cqe_prov_ctx != NULL);
2691 nif->nif_qset_extensions.qe_notify_steering_info =
2692 capab.cqe_notify_steering_info;
2693 nif->nif_qset_extensions.qe_prov_ctx = capab.cqe_prov_ctx;
2694 nif->nif_extended_capabilities |= NETIF_CAPAB_QSET_EXTENSIONS;
2695 }
2696
2697 static void
unconfigure_capab_qset_extensions(struct nx_netif * nif)2698 unconfigure_capab_qset_extensions(struct nx_netif *nif)
2699 {
2700 if ((nif->nif_extended_capabilities & NETIF_CAPAB_QSET_EXTENSIONS) == 0) {
2701 return;
2702 }
2703 bzero(&nif->nif_qset_extensions, sizeof(nif->nif_qset_extensions));
2704 nif->nif_extended_capabilities &= ~NETIF_CAPAB_QSET_EXTENSIONS;
2705 }
2706
2707 int
nx_netif_notify_steering_info(struct nx_netif * nif,struct netif_qset * qset,struct ifnet_traffic_descriptor_common * td,bool add)2708 nx_netif_notify_steering_info(struct nx_netif *nif, struct netif_qset *qset,
2709 struct ifnet_traffic_descriptor_common *td, bool add)
2710 {
2711 struct netif_qset_extensions *qset_ext;
2712 int err;
2713
2714 if ((nif->nif_extended_capabilities & NETIF_CAPAB_QSET_EXTENSIONS) == 0) {
2715 return ENOTSUP;
2716 }
2717 qset_ext = &nif->nif_qset_extensions;
2718 VERIFY(qset_ext->qe_prov_ctx != NULL);
2719 VERIFY(qset_ext->qe_notify_steering_info != NULL);
2720 err = qset_ext->qe_notify_steering_info(qset_ext->qe_prov_ctx,
2721 qset->nqs_ctx, td, add);
2722 return err;
2723 }
2724
2725 static void
nx_netif_capabilities_init(struct nx_netif * nif)2726 nx_netif_capabilities_init(struct nx_netif *nif)
2727 {
2728 struct kern_nexus *nx = nif->nif_nx;
2729 nxprov_capab_config_fn_t capab_fn;
2730
2731 if ((NX_PROV(nx)->nxprov_netif_ext.nxnpi_version) ==
2732 KERN_NEXUS_PROVIDER_VERSION_NETIF) {
2733 capab_fn = NX_PROV(nx)->nxprov_netif_ext.nxnpi_config_capab;
2734 ASSERT(capab_fn != NULL);
2735 } else {
2736 capab_fn = NX_PROV(nx)->nxprov_ext.nxpi_config_capab;
2737 }
2738 if (capab_fn == NULL) {
2739 return;
2740 }
2741 configure_capab_interface_advisory(nif, capab_fn);
2742 configure_capab_qset_extensions(nif, capab_fn);
2743 }
2744
/*
 * Counterpart of nx_netif_capabilities_init(): drops the state of every
 * optional capability that may have been negotiated.
 */
static void
nx_netif_capabilities_fini(struct nx_netif *nif)
{
	/* interface advisory first, then qset extensions */
	unconfigure_capab_interface_advisory(nif);
	unconfigure_capab_qset_extensions(nif);
}
2751
2752 static void
nx_netif_verify_tso_config(struct nx_netif * nif)2753 nx_netif_verify_tso_config(struct nx_netif *nif)
2754 {
2755 ifnet_t ifp = nif->nif_ifp;
2756 uint32_t tso_v4_mtu = 0;
2757 uint32_t tso_v6_mtu = 0;
2758
2759 /*
2760 * compat interfaces always use 128-byte buffers on the device packet
2761 * pool side (for holding headers for classification) so no need to check
2762 * the size here.
2763 */
2764 if (!SKYWALK_NATIVE(ifp)) {
2765 return;
2766 }
2767
2768 if ((ifp->if_hwassist & IFNET_TSO_IPV4) != 0) {
2769 tso_v4_mtu = ifp->if_tso_v4_mtu;
2770 }
2771 if ((ifp->if_hwassist & IFNET_TSO_IPV6) != 0) {
2772 tso_v6_mtu = ifp->if_tso_v6_mtu;
2773 }
2774 VERIFY(PP_BUF_SIZE_DEF(nif->nif_nx->nx_tx_pp) >=
2775 max(tso_v4_mtu, tso_v6_mtu));
2776 }
2777
2778 void
na_netif_finalize(struct nexus_netif_adapter * nifna,struct ifnet * ifp)2779 na_netif_finalize(struct nexus_netif_adapter *nifna, struct ifnet *ifp)
2780 {
2781 struct nx_netif *nif = nifna->nifna_netif;
2782 struct kern_nexus *nx = nif->nif_nx;
2783 struct nexus_adapter *devna = nx_port_get_na(nx, NEXUS_PORT_NET_IF_DEV);
2784 struct nexus_adapter *hostna = nx_port_get_na(nx,
2785 NEXUS_PORT_NET_IF_HOST);
2786
2787 ASSERT(devna != NULL);
2788 ASSERT(hostna != NULL);
2789
2790 if (!ifnet_is_attached(ifp, 1)) {
2791 VERIFY(0);
2792 /* NOTREACHED */
2793 __builtin_unreachable();
2794 }
2795
2796 ASSERT(devna->na_private == ifp);
2797 ASSERT(devna->na_ifp == NULL);
2798 /* use I/O refcnt held by ifnet_is_attached() above */
2799 devna->na_ifp = devna->na_private;
2800 devna->na_private = NULL;
2801
2802 ASSERT(hostna->na_private == ifp);
2803 ASSERT(hostna->na_ifp == NULL);
2804 hostna->na_ifp = hostna->na_private;
2805 hostna->na_private = NULL;
2806 ifnet_incr_iorefcnt(hostna->na_ifp);
2807
2808 nx_netif_flags_init(nif);
2809 nx_netif_llink_init(nif);
2810 nx_netif_filter_init(nif);
2811 nx_netif_flow_init(nif);
2812 nx_netif_capabilities_init(nif);
2813 nx_netif_agent_init(nif);
2814 (void) nxctl_inet_traffic_rule_get_count(ifp->if_xname,
2815 &ifp->if_traffic_rule_count);
2816 nx_netif_verify_tso_config(nif);
2817 nx_netif_callbacks_init(nif);
2818 }
2819
2820 void
nx_netif_reap(struct nexus_netif_adapter * nifna,struct ifnet * ifp,uint32_t thres,boolean_t low)2821 nx_netif_reap(struct nexus_netif_adapter *nifna, struct ifnet *ifp,
2822 uint32_t thres, boolean_t low)
2823 {
2824 #pragma unused(ifp)
2825 struct nx_netif *nif = nifna->nifna_netif;
2826 struct kern_nexus *nx = nif->nif_nx;
2827 struct nexus_adapter *devna = nx_port_get_na(nx, NEXUS_PORT_NET_IF_DEV);
2828 uint64_t now = _net_uptime;
2829 boolean_t purge;
2830
2831 ASSERT(thres != 0);
2832
2833 if (devna->na_work_ts == 0) {
2834 return;
2835 }
2836
2837 /*
2838 * Purge if it's has been inactive for some time (twice the drain
2839 * threshold), and clear the work timestamp to temporarily skip this
2840 * adapter until it's active again. Purging cached objects can be
2841 * expensive since we'd need to allocate and construct them again,
2842 * so we do it only when necessary.
2843 */
2844 if (low || (now - devna->na_work_ts) >= (thres << 1)) {
2845 devna->na_work_ts = 0;
2846 purge = TRUE;
2847 } else {
2848 purge = FALSE;
2849 }
2850
2851 SK_DF(SK_VERB_NETIF, "%s: %s na %s", ifp->if_xname,
2852 (purge ? "purging" : "pruning"), devna->na_name);
2853
2854 /*
2855 * Device and host adapters share the same packet buffer pool,
2856 * so just reap the arena belonging to the device instance.
2857 */
2858 skmem_arena_reap(devna->na_arena, purge);
2859 }
2860
2861 /*
2862 * The purpose of this callback is to forceably remove resources held by VPNAs
2863 * in event of an interface detach. Without this callback an application can
2864 * prevent the detach from completing indefinitely. Note that this is only needed
2865 * for low latency VPNAs. Userspace do get notified about interface detach events
2866 * for other NA types (custom ether and filter) and will do the necessary cleanup.
2867 * The cleanup is done in two phases:
2868 * 1) VPNAs channels are defuncted. This releases the resources held by VPNAs and
2869 * causes the device channel to be closed. All ifnet references held by VPNAs
2870 * are also released.
2871 * 2) This cleans up the netif nexus and releases the two remaining ifnet
2872 * references held by the device and host ports (nx_netif_clean()).
2873 */
2874 void
nx_netif_llw_detach_notify(void * arg)2875 nx_netif_llw_detach_notify(void *arg)
2876 {
2877 struct nexus_netif_adapter *__single nifna = arg;
2878 struct nx_netif *nif = nifna->nifna_netif;
2879 struct kern_nexus *nx = nif->nif_nx;
2880 struct kern_channel **ch_list = NULL;
2881 struct kern_channel *ch;
2882 int err, i, all_ch_cnt = 0, vp_ch_cnt = 0;
2883 struct proc *p;
2884
2885 ASSERT(NETIF_IS_LOW_LATENCY(nif));
2886 /*
2887 * kern_channel_defunct() requires sk_lock to be not held. We
2888 * will first find the list of channels we want to defunct and
2889 * then call kern_channel_defunct() on each of them. The number
2890 * of channels cannot increase after sk_lock is released since
2891 * this interface is being detached.
2892 */
2893 SK_LOCK();
2894 all_ch_cnt = nx->nx_ch_count;
2895 if (all_ch_cnt == 0) {
2896 DTRACE_SKYWALK1(no__channel, struct kern_nexus *, nx);
2897 SK_UNLOCK();
2898 return;
2899 }
2900 ch_list = sk_alloc_type_array(struct kern_channel *, all_ch_cnt,
2901 Z_WAITOK | Z_NOFAIL, skmem_tag_netif_temp);
2902
2903 STAILQ_FOREACH(ch, &nx->nx_ch_head, ch_link) {
2904 struct nexus_adapter *na = ch->ch_na;
2905
2906 if (na != NULL && na->na_type == NA_NETIF_VP) {
2907 ASSERT(vp_ch_cnt < all_ch_cnt);
2908
2909 /* retain channel to prevent it from being freed */
2910 ch_retain_locked(ch);
2911 ch_list[vp_ch_cnt] = ch;
2912 DTRACE_SKYWALK3(vp__ch__found, struct kern_nexus *, nx,
2913 struct kern_channel *, ch, struct nexus_adapter *, na);
2914 vp_ch_cnt++;
2915 }
2916 }
2917 if (vp_ch_cnt == 0) {
2918 DTRACE_SKYWALK1(vp__ch__not__found, struct kern_nexus *, nx);
2919 sk_free_type_array(struct kern_channel *, all_ch_cnt, ch_list);
2920 SK_UNLOCK();
2921 return;
2922 }
2923 /* prevents the netif from being freed */
2924 nx_netif_retain(nif);
2925 SK_UNLOCK();
2926
2927 for (i = 0; i < vp_ch_cnt; i++) {
2928 ch = ch_list[i];
2929 p = proc_find(ch->ch_pid);
2930 if (p == NULL) {
2931 SK_ERR("ch 0x%llx pid %d not found", SK_KVA(ch), ch->ch_pid);
2932 DTRACE_SKYWALK3(ch__pid__not__found, struct kern_nexus *, nx,
2933 struct kern_channel *, ch, pid_t, ch->ch_pid);
2934 ch_release(ch);
2935 continue;
2936 }
2937 /*
2938 * It is possible for the channel to be closed before defunct gets
2939 * called. We need to get the fd lock here to ensure that the check
2940 * for the closed state and the calling of channel defunct are done
2941 * atomically.
2942 */
2943 proc_fdlock(p);
2944 if ((ch->ch_flags & CHANF_ATTACHED) != 0) {
2945 kern_channel_defunct(p, ch);
2946 }
2947 proc_fdunlock(p);
2948 proc_rele(p);
2949 ch_release(ch);
2950 }
2951 sk_free_type_array(struct kern_channel *, all_ch_cnt, ch_list);
2952
2953 SK_LOCK();
2954 /*
2955 * Quiescing is not needed because:
2956 * The defuncting above ensures that no more tx syncs could enter.
2957 * The driver layer ensures that ifnet_detach() (this path) does not get
2958 * called until RX upcalls have returned.
2959 *
2960 * Before sk_lock is reacquired above, userspace could close its channels
2961 * and cause the nexus's destructor to be called. This is fine because we
2962 * have retained the nif so it can't disappear.
2963 */
2964 err = nx_netif_clean(nif, FALSE);
2965 if (err != 0) {
2966 SK_ERR("netif clean failed: err %d", err);
2967 DTRACE_SKYWALK2(nif__clean__failed, struct nx_netif *, nif, int, err);
2968 }
2969 nx_netif_release(nif);
2970 SK_UNLOCK();
2971 }
2972
2973 void
nx_netif_copy_stats(struct nexus_netif_adapter * nifna,struct if_netif_stats * if_ns)2974 nx_netif_copy_stats(struct nexus_netif_adapter *nifna,
2975 struct if_netif_stats *if_ns)
2976 {
2977 struct nx_netif_mit *mit;
2978 struct mit_cfg_tbl *mit_cfg;
2979
2980 if ((mit = nifna->nifna_rx_mit) == NULL) {
2981 return;
2982 }
2983
2984 if ((mit->mit_flags & NETIF_MITF_INITIALIZED) == 0) {
2985 return;
2986 }
2987
2988 if_ns->ifn_rx_mit_interval = mit->mit_interval;
2989 if_ns->ifn_rx_mit_mode = mit->mit_mode;
2990 if_ns->ifn_rx_mit_packets_avg = mit->mit_packets_avg;
2991 if_ns->ifn_rx_mit_packets_min = mit->mit_packets_min;
2992 if_ns->ifn_rx_mit_packets_max = mit->mit_packets_max;
2993 if_ns->ifn_rx_mit_bytes_avg = mit->mit_bytes_avg;
2994 if_ns->ifn_rx_mit_bytes_min = mit->mit_bytes_min;
2995 if_ns->ifn_rx_mit_bytes_max = mit->mit_bytes_max;
2996 if_ns->ifn_rx_mit_cfg_idx = mit->mit_cfg_idx;
2997
2998 VERIFY(if_ns->ifn_rx_mit_cfg_idx < mit->mit_cfg_idx_max);
2999 mit_cfg = &mit->mit_tbl[if_ns->ifn_rx_mit_cfg_idx];
3000 if_ns->ifn_rx_mit_cfg_packets_lowat = mit_cfg->cfg_plowat;
3001 if_ns->ifn_rx_mit_cfg_packets_hiwat = mit_cfg->cfg_phiwat;
3002 if_ns->ifn_rx_mit_cfg_bytes_lowat = mit_cfg->cfg_blowat;
3003 if_ns->ifn_rx_mit_cfg_bytes_hiwat = mit_cfg->cfg_bhiwat;
3004 if_ns->ifn_rx_mit_cfg_interval = mit_cfg->cfg_ival;
3005 }
3006
3007 int
nx_netif_na_special(struct nexus_adapter * na,struct kern_channel * ch,struct chreq * chr,nxspec_cmd_t spec_cmd)3008 nx_netif_na_special(struct nexus_adapter *na, struct kern_channel *ch,
3009 struct chreq *chr, nxspec_cmd_t spec_cmd)
3010 {
3011 ASSERT(na->na_type == NA_NETIF_DEV ||
3012 na->na_type == NA_NETIF_COMPAT_DEV);
3013 return nx_netif_na_special_common(na, ch, chr, spec_cmd);
3014 }
3015
3016 int
nx_netif_na_special_common(struct nexus_adapter * na,struct kern_channel * ch,struct chreq * chr,nxspec_cmd_t spec_cmd)3017 nx_netif_na_special_common(struct nexus_adapter *na, struct kern_channel *ch,
3018 struct chreq *chr, nxspec_cmd_t spec_cmd)
3019 {
3020 int error = 0;
3021
3022 ASSERT(na->na_type == NA_NETIF_DEV || na->na_type == NA_NETIF_HOST ||
3023 na->na_type == NA_NETIF_COMPAT_DEV ||
3024 na->na_type == NA_NETIF_COMPAT_HOST);
3025 SK_LOCK_ASSERT_HELD();
3026
3027 switch (spec_cmd) {
3028 case NXSPEC_CMD_CONNECT:
3029 /*
3030 * netif adapter isn't created exclusively for kernel.
3031 * We mark (and clear) NAF_KERNEL_ONLY flag upon a succesful
3032 * na_special() connect and disconnect.
3033 */
3034 if (NA_KERNEL_ONLY(na)) {
3035 error = EBUSY;
3036 goto done;
3037 }
3038 ASSERT(!(na->na_flags & NAF_SPEC_INIT));
3039
3040 os_atomic_or(&na->na_flags, NAF_KERNEL_ONLY, relaxed);
3041 error = na_bind_channel(na, ch, chr);
3042 if (error != 0) {
3043 os_atomic_andnot(&na->na_flags, NAF_KERNEL_ONLY, relaxed);
3044 goto done;
3045 }
3046 os_atomic_or(&na->na_flags, NAF_SPEC_INIT, relaxed);
3047 break;
3048
3049 case NXSPEC_CMD_DISCONNECT:
3050 ASSERT(NA_KERNEL_ONLY(na));
3051 ASSERT(na->na_channels > 0);
3052 ASSERT(na->na_flags & NAF_SPEC_INIT);
3053 na_unbind_channel(ch);
3054 os_atomic_andnot(&na->na_flags, (NAF_SPEC_INIT | NAF_KERNEL_ONLY), relaxed);
3055 break;
3056
3057 case NXSPEC_CMD_START:
3058 na_kr_drop(na, FALSE);
3059 break;
3060
3061 case NXSPEC_CMD_STOP:
3062 na_kr_drop(na, TRUE);
3063 LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_NOTOWNED);
3064 lck_mtx_lock(&ch->ch_lock);
3065 nxprov_advise_disconnect(na->na_nx, ch);
3066 lck_mtx_unlock(&ch->ch_lock);
3067 break;
3068
3069 default:
3070 error = EINVAL;
3071 break;
3072 }
3073
3074 done:
3075 SK_DF(error ? SK_VERB_ERROR : SK_VERB_NETIF,
3076 "ch 0x%llx from na \"%s\" (0x%llx) naflags %b nx 0x%llx "
3077 "spec_cmd %u (err %d)", SK_KVA(ch), na->na_name, SK_KVA(na),
3078 na->na_flags, NAF_BITS, SK_KVA(ch->ch_nexus), spec_cmd, error);
3079
3080 return error;
3081 }
3082
3083 /*
3084 * Get a skywalk netif adapter for the port.
3085 */
3086 int
nx_netif_na_find(struct kern_nexus * nx,struct kern_channel * ch,struct chreq * chr,struct nxbind * nxb,struct proc * p,struct nexus_adapter ** nap,boolean_t create)3087 nx_netif_na_find(struct kern_nexus *nx, struct kern_channel *ch,
3088 struct chreq *chr, struct nxbind *nxb, struct proc *p,
3089 struct nexus_adapter **nap, boolean_t create)
3090 {
3091 #pragma unused(ch)
3092 struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
3093 boolean_t anon = NX_ANONYMOUS_PROV(nx);
3094 ch_endpoint_t ep = chr->cr_endpoint;
3095 nexus_port_t nx_port = chr->cr_port;
3096 struct nexus_adapter *__single na = NULL;
3097 struct ifnet *ifp;
3098 int err = 0;
3099
3100 SK_LOCK_ASSERT_HELD();
3101 *nap = NULL; /* default */
3102
3103 #if SK_LOG
3104 uuid_string_t uuidstr;
3105 SK_D("name \"%s\" spec_uuid \"%s\" port %d mode 0x%b pipe_id %u "
3106 "ring_id %d ring_set %u ep_type %u:%u create %u%s",
3107 chr->cr_name, sk_uuid_unparse(chr->cr_spec_uuid, uuidstr),
3108 (int)chr->cr_port, chr->cr_mode, CHMODE_BITS,
3109 chr->cr_pipe_id, (int)chr->cr_ring_id, chr->cr_ring_set,
3110 chr->cr_real_endpoint, chr->cr_endpoint, create,
3111 (ep != CH_ENDPOINT_NET_IF) ? " (skipped)" : "");
3112 #endif /* SK_LOG */
3113
3114 if (!create || ep != CH_ENDPOINT_NET_IF) {
3115 err = ENODEV;
3116 goto done;
3117 }
3118
3119 ASSERT(NX_DOM(nx)->nxdom_type == NEXUS_TYPE_NET_IF);
3120 if (nx_port_get_na(nx, NEXUS_PORT_NET_IF_DEV) == NULL) {
3121 err = ENXIO;
3122 goto done;
3123 }
3124 ifp = nif->nif_ifp;
3125 if (!(SKYWALK_CAPABLE(ifp))) {
3126 SK_ERR("interface %s is no longer usable", if_name(ifp));
3127 err = ENOTSUP;
3128 goto done;
3129 }
3130
3131 if (chr->cr_mode & CHMODE_LOW_LATENCY) {
3132 SK_ERR("low latency is not supported for netif channel");
3133 err = ENOTSUP;
3134 goto done;
3135 }
3136
3137 switch (nx_port) {
3138 case NEXUS_PORT_NET_IF_DEV:
3139 /*
3140 * We have to reject direct user open that's not explicitly
3141 * allowed because netif nexuses do not by default have
3142 * user memory regions.
3143 */
3144 if (p != kernproc &&
3145 (!skywalk_netif_direct_allowed(ifp->if_xname) ||
3146 (kauth_cred_issuser(kauth_cred_get()) == 0 &&
3147 (anon || nif->nif_dev_nxb == NULL || nxb == NULL ||
3148 !nxb_is_equal(nif->nif_dev_nxb, nxb))))) {
3149 DTRACE_SKYWALK2(direct__not__allowed, struct ifnet *,
3150 ifp, struct chreq *, chr);
3151 err = ENOTSUP;
3152 goto done;
3153 }
3154 if (chr->cr_mode & CHMODE_EVENT_RING) {
3155 SK_ERR("event ring is not supported for netif dev port channel");
3156 err = ENOTSUP;
3157 goto done;
3158 }
3159 na = nx_port_get_na(nx, NEXUS_PORT_NET_IF_DEV);
3160 break;
3161
3162 case NEXUS_PORT_NET_IF_HOST:
3163 if (p != kernproc) {
3164 err = ENOTSUP;
3165 goto done;
3166 }
3167 if (chr->cr_mode & CHMODE_EVENT_RING) {
3168 SK_ERR("event ring is not supported for netif host port channel");
3169 err = ENOTSUP;
3170 goto done;
3171 }
3172 na = nx_port_get_na(nx, NEXUS_PORT_NET_IF_HOST);
3173 break;
3174
3175 default:
3176 ASSERT(!(chr->cr_mode & CHMODE_CONFIG));
3177
3178 NETIF_WLOCK(nif);
3179 err = nx_port_alloc(nx, nx_port, nxb, &na, p);
3180 if (err != 0) {
3181 NETIF_WUNLOCK(nif);
3182 goto done;
3183 }
3184
3185 if (na == NULL) {
3186 if (chr->cr_mode & CHMODE_FILTER) {
3187 err = netif_filter_na_create(nx, chr, &na);
3188 } else {
3189 err = netif_vp_na_create(nx, chr, &na);
3190 }
3191 if (err != 0) {
3192 NETIF_WUNLOCK(nif);
3193 goto done;
3194 }
3195 err = nx_port_alloc(nx, nx_port, nxb, &na, p);
3196 if (err != 0) {
3197 NETIF_WUNLOCK(nif);
3198 goto done;
3199 }
3200 }
3201 NETIF_WUNLOCK(nif);
3202
3203 break;
3204 }
3205
3206 ASSERT(err == 0);
3207 ASSERT(na != NULL);
3208
3209 #if CONFIG_NEXUS_USER_PIPE
3210 if (NA_OWNED_BY_ANY(na) || na->na_next_pipe > 0) {
3211 #else /* !CONFIG_NEXUS_USER_PIPE */
3212 if (NA_OWNED_BY_ANY(na)) {
3213 #endif /* !CONFIG_NEXUS_USER_PIPE */
3214 err = EBUSY;
3215 na = NULL;
3216 goto done;
3217 }
3218
3219 *nap = na;
3220 na_retain_locked(na);
3221
3222 done:
3223 ASSERT(err != 0 || na != NULL);
3224 if (err) {
3225 SK_ERR("na not found, err(%d)", err);
3226 } else {
3227 SK_DF(SK_VERB_NETIF, "found na 0x%llu", na);
3228 }
3229 return err;
3230 }
3231
3232 /* na_krings_create callback for all netif device adapters */
3233 int
3234 nx_netif_dev_krings_create(struct nexus_adapter *na, struct kern_channel *ch)
3235 {
3236 int ret;
3237
3238 ASSERT(na->na_type == NA_NETIF_DEV ||
3239 na->na_type == NA_NETIF_COMPAT_DEV);
3240 /*
3241 * Allocate context structures for native netif only, for
3242 * IOSkywalkFamily to store its object references.
3243 */
3244 ret = na_rings_mem_setup(na, (na->na_flags & NAF_NATIVE), ch);
3245
3246 /*
3247 * We mark CKRF_DROP for kernel-only rings (kernel channel
3248 * opened by the flowswitch, etc.) to prevent packets from
3249 * going thru until after the client of the kernel channel
3250 * has fully plumbed things on its side. For userland-facing
3251 * rings (regular channel opened to netif), this is not
3252 * required, and so don't mark CKRF_DROP there.
3253 */
3254 if (ret == 0 && NA_KERNEL_ONLY(na)) {
3255 na_kr_drop(na, TRUE);
3256 }
3257
3258 return ret;
3259 }
3260
3261 /* call with SK_LOCK held */
3262 void
3263 nx_netif_dev_krings_delete(struct nexus_adapter *na, struct kern_channel *ch,
3264 boolean_t defunct)
3265 {
3266 ASSERT(na->na_type == NA_NETIF_DEV ||
3267 na->na_type == NA_NETIF_COMPAT_DEV);
3268
3269 /* see comments in nx_netif_dev_krings_create() */
3270 if (NA_KERNEL_ONLY(na)) {
3271 na_kr_drop(na, TRUE);
3272 }
3273
3274 na_rings_mem_teardown(na, ch, defunct);
3275 }
3276
3277 struct nx_netif *
3278 nx_netif_alloc(zalloc_flags_t how)
3279 {
3280 struct nx_netif *n;
3281
3282 SK_LOCK_ASSERT_HELD();
3283
3284 n = zalloc_flags(nx_netif_zone, how | Z_ZERO);
3285 if (n == NULL) {
3286 return NULL;
3287 }
3288
3289 NETIF_RWINIT(n);
3290 os_ref_init(&n->nif_refcnt, NULL);
3291 SK_DF(SK_VERB_MEM, "netif 0x%llx", SK_KVA(n));
3292
3293 return n;
3294 }
3295
3296 static void
3297 nx_netif_destroy(struct nx_netif *n)
3298 {
3299 ASSERT(n->nif_dev_nxb == NULL);
3300 ASSERT(n->nif_host_nxb == NULL);
3301 ASSERT(os_ref_get_count(&n->nif_refcnt) == 0);
3302 nx_netif_llink_config_free(n);
3303 SK_DF(SK_VERB_MEM, "netif 0x%llx", SK_KVA(n));
3304 NETIF_RWDESTROY(n);
3305 zfree(nx_netif_zone, n);
3306 }
3307
3308 void
3309 nx_netif_release(struct nx_netif *n)
3310 {
3311 SK_LOCK_ASSERT_HELD();
3312
3313 SK_DF(SK_VERB_MEM, "netif 0x%llx, refcnt %d", SK_KVA(n),
3314 os_ref_get_count(&n->nif_refcnt));
3315 if (os_ref_release(&n->nif_refcnt) == 0) {
3316 nx_netif_destroy(n);
3317 }
3318 }
3319
3320 void
3321 nx_netif_retain(struct nx_netif *n)
3322 {
3323 SK_LOCK_ASSERT_HELD();
3324
3325 /* retaining an object with a zero refcount is not allowed */
3326 ASSERT(os_ref_get_count(&n->nif_refcnt) >= 1);
3327 os_ref_retain(&n->nif_refcnt);
3328 SK_DF(SK_VERB_MEM, "netif 0x%llx, refcnt %d", SK_KVA(n),
3329 os_ref_get_count(&n->nif_refcnt));
3330 }
3331
/*
 * Give up a reference on the netif; this is simply a call to
 * nx_netif_release(), which frees the instance when the last reference
 * is dropped.
 */
void
nx_netif_free(struct nx_netif *n)
{
	nx_netif_release(n);
}
3337
3338 static int
3339 nx_netif_interface_advisory_report(struct kern_nexus *nx,
3340 const struct ifnet_interface_advisory *advisory)
3341 {
3342 struct kern_nexus *notify_nx;
3343 struct __kern_netif_intf_advisory *intf_adv;
3344 struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
3345 ifnet_t difp = nif->nif_ifp;
3346 ifnet_t __single parent = NULL;
3347
3348 /* If we are a delegate, notify the parent instead */
3349 if (ifnet_get_delegate_parent(difp, &parent) == 0) {
3350 nif = parent->if_na->nifna_netif;
3351 }
3352 if (nif->nif_fsw_nxadv != NULL) {
3353 ASSERT(nif->nif_fsw != NULL);
3354 intf_adv = &nif->nif_fsw_nxadv->_nxadv_intf_adv;
3355 notify_nx = nif->nif_fsw->fsw_nx;
3356 } else {
3357 intf_adv = &nif->nif_netif_nxadv->__kern_intf_adv;
3358 notify_nx = nif->nif_nx;
3359 }
3360 /*
3361 * copy the advisory report in shared memory
3362 */
3363 intf_adv->cksum = os_cpu_copy_in_cksum(advisory, &intf_adv->adv,
3364 sizeof(*advisory), 0);
3365 STATS_INC(&nif->nif_stats, NETIF_STATS_IF_ADV_UPD_RECV);
3366 /*
3367 * notify user channels on advisory report availability
3368 */
3369 nx_interface_advisory_notify(notify_nx);
3370 if (parent != NULL) {
3371 ifnet_release_delegate_parent(difp);
3372 }
3373 return 0;
3374 }
3375
3376 static errno_t
3377 nx_netif_interface_advisory_notify(void *kern_ctx,
3378 const struct ifnet_interface_advisory *advisory)
3379 {
3380 _CASSERT(offsetof(struct ifnet_interface_advisory, version) ==
3381 offsetof(struct ifnet_interface_advisory, header.version));
3382 _CASSERT(offsetof(struct ifnet_interface_advisory, direction) ==
3383 offsetof(struct ifnet_interface_advisory, header.direction));
3384 _CASSERT(offsetof(struct ifnet_interface_advisory, _reserved) ==
3385 offsetof(struct ifnet_interface_advisory, header.interface_type));
3386
3387 if (__improbable(kern_ctx == NULL || advisory == NULL)) {
3388 return EINVAL;
3389 }
3390 if (__improbable((advisory->header.version <
3391 IF_INTERFACE_ADVISORY_VERSION_MIN) ||
3392 (advisory->header.version > IF_INTERFACE_ADVISORY_VERSION_MAX))) {
3393 SK_ERR("Invalid advisory version %d", advisory->header.version);
3394 return EINVAL;
3395 }
3396 if (__improbable((advisory->header.direction !=
3397 IF_INTERFACE_ADVISORY_DIRECTION_TX) &&
3398 (advisory->header.direction !=
3399 IF_INTERFACE_ADVISORY_DIRECTION_RX))) {
3400 SK_ERR("Invalid advisory direction %d",
3401 advisory->header.direction);
3402 return EINVAL;
3403 }
3404 if (__improbable(((advisory->header.interface_type <
3405 IF_INTERFACE_ADVISORY_INTERFACE_TYPE_MIN) ||
3406 (advisory->header.interface_type >
3407 IF_INTERFACE_ADVISORY_INTERFACE_TYPE_MAX)) &&
3408 (advisory->header.version >= IF_INTERFACE_ADVISORY_VERSION_2))) {
3409 SK_ERR("Invalid advisory interface type %d",
3410 advisory->header.interface_type);
3411 return EINVAL;
3412 }
3413 return nx_netif_interface_advisory_report(kern_ctx, advisory);
3414 }
3415
3416 void
3417 nx_netif_config_interface_advisory(struct kern_nexus *nx, bool enable)
3418 {
3419 struct kern_nexus *nx_netif;
3420 struct nx_netif *nif;
3421
3422 if (NX_REJECT_ACT(nx) || (nx->nx_flags & NXF_CLOSED) != 0) {
3423 return;
3424 }
3425 if (NX_PROV(nx)->nxprov_params->nxp_type == NEXUS_TYPE_FLOW_SWITCH) {
3426 struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
3427 nx_netif = fsw->fsw_nifna->na_nx;
3428 } else {
3429 nx_netif = nx;
3430 }
3431 ASSERT(NX_PROV(nx_netif)->nxprov_params->nxp_type == NEXUS_TYPE_NET_IF);
3432 nif = NX_NETIF_PRIVATE(nx_netif);
3433 if (nif->nif_intf_adv_config != NULL) {
3434 nif->nif_intf_adv_config(nif->nif_intf_adv_prov_ctx, enable);
3435 }
3436 }
3437
3438 /*
3439 * This function has no use anymore since we are now passing truncated packets
3440 * to filters. We keep this logic just in case we need to prevent certain
3441 * packets from being passed to filters.
3442 */
3443 static boolean_t
3444 packet_is_filterable(struct nexus_netif_adapter *nifna,
3445 struct __kern_packet *pkt)
3446 {
3447 #pragma unused (nifna, pkt)
3448 return TRUE;
3449 }
3450
3451 /*
3452 * This function is only meant for supporting the RX path because the TX path
3453 * will not send packets > MTU size due to the disabling of TSO when filters
3454 * are enabled.
3455 */
3456 static void
3457 get_filterable_packets(struct nexus_netif_adapter *nifna,
3458 struct __kern_packet *pkt_chain, struct __kern_packet **fpkt_chain,
3459 struct __kern_packet **passthrough_chain)
3460 {
3461 struct nx_netif *nif = nifna->nifna_netif;
3462 struct netif_stats *nifs = &nif->nif_stats;
3463 struct __kern_packet *pkt = pkt_chain, *next, *fpkt;
3464 struct __kern_packet *__single fpkt_head = NULL;
3465 struct __kern_packet *__single passthrough_head = NULL;
3466 struct __kern_packet **fpkt_tailp = &fpkt_head;
3467 struct __kern_packet **passthrough_tailp = &passthrough_head;
3468 int fcnt = 0, pcnt = 0, dcnt = 0;
3469
3470 while (pkt != NULL) {
3471 next = pkt->pkt_nextpkt;
3472 pkt->pkt_nextpkt = NULL;
3473
3474 if (!packet_is_filterable(nifna, pkt)) {
3475 pcnt++;
3476 *passthrough_tailp = pkt;
3477 passthrough_tailp = &pkt->pkt_nextpkt;
3478 pkt = next;
3479 continue;
3480 }
3481 fpkt = nx_netif_pkt_to_filter_pkt(nifna, pkt, NETIF_CONVERT_RX);
3482 if (fpkt != NULL) {
3483 fcnt++;
3484 *fpkt_tailp = fpkt;
3485 fpkt_tailp = &fpkt->pkt_nextpkt;
3486 } else {
3487 dcnt++;
3488 }
3489 pkt = next;
3490 }
3491 *fpkt_chain = fpkt_head;
3492 *passthrough_chain = passthrough_head;
3493
3494 /*
3495 * No need to increment drop stats because that's already
3496 * done in nx_netif_pkt_to_filter_pkt.
3497 */
3498 STATS_ADD(nifs, NETIF_STATS_FILTER_RX_NOT_FILTERABLE, pcnt);
3499 DTRACE_SKYWALK6(filterable, struct nexus_netif_adapter *, nifna,
3500 int, fcnt, int, pcnt, int, dcnt, struct __kern_packet *,
3501 fpkt_head, struct __kern_packet *, passthrough_head);
3502 }
3503
3504 /*
3505 * This is only used by ring-based notify functions for now.
3506 * When a qset-based notify becomes available, this function can be used
3507 * unmodified.
3508 */
3509 void
3510 netif_receive(struct nexus_netif_adapter *nifna,
3511 struct __kern_packet *pkt_chain, struct nexus_pkt_stats *stats)
3512 {
3513 struct nx_netif *nif = nifna->nifna_netif;
3514 struct nexus_adapter *na = &nifna->nifna_up;
3515 struct netif_stats *nifs = &nif->nif_stats;
3516 int err, dropcnt, dropstat = -1;
3517
3518 if ((nif->nif_ifp->if_xflags & IFXF_DISABLE_INPUT) != 0) {
3519 uint64_t byte_cnt = 0;
3520 struct __kern_packet *pkt;
3521 struct ifnet *ifp = nif->nif_ifp;
3522
3523 dropcnt = 0;
3524 for (pkt = pkt_chain; pkt != NULL; pkt = pkt->pkt_nextpkt) {
3525 dropcnt++;
3526 byte_cnt += ((pkt->pkt_pflags & PKT_F_MBUF_DATA) != 0) ?
3527 m_pktlen(pkt->pkt_mbuf) : pkt->pkt_length;
3528 }
3529 os_atomic_add(&ifp->if_data.ifi_ipackets, dropcnt, relaxed);
3530 os_atomic_add(&ifp->if_data.ifi_ibytes, byte_cnt, relaxed);
3531
3532 dropstat = NETIF_STATS_DROP_INPUT_DISABLED;
3533 goto drop;
3534 }
3535
3536 /* update our work timestamp */
3537 na->na_work_ts = _net_uptime;
3538
3539 if (nif->nif_filter_cnt > 0) {
3540 struct __kern_packet *__single fpkt_chain = NULL;
3541 struct __kern_packet *__single passthrough_chain = NULL;
3542
3543 get_filterable_packets(nifna, pkt_chain, &fpkt_chain,
3544 &passthrough_chain);
3545 if (fpkt_chain != NULL) {
3546 (void) nx_netif_filter_inject(nifna, NULL, fpkt_chain,
3547 NETIF_FILTER_RX | NETIF_FILTER_SOURCE);
3548 }
3549 if (passthrough_chain != NULL) {
3550 pkt_chain = passthrough_chain;
3551 } else {
3552 return;
3553 }
3554 } else if (!NETIF_IS_LOW_LATENCY(nif) && nx_netif_filter_default_drop != 0) {
3555 /*
3556 * Default drop is meant for dropping packets on interfaces without
3557 * interface filters attached. It can be skipped for LLW because it
3558 * doesn't have a network stack path.
3559 */
3560 DTRACE_SKYWALK2(rx__default__drop, struct nx_netif *, nif,
3561 struct __kern_packet *, pkt_chain);
3562 dropstat = NETIF_STATS_FILTER_DROP_DEFAULT;
3563 goto drop;
3564 }
3565
3566 if (nif->nif_flow_cnt > 0) {
3567 struct __kern_packet *__single remain = NULL;
3568
3569 err = nx_netif_demux(nifna, pkt_chain, &remain, stats, NETIF_FLOW_SOURCE);
3570 if (remain == NULL) {
3571 return;
3572 }
3573 pkt_chain = remain;
3574 }
3575
3576 if (na->na_rx != NULL) {
3577 na->na_rx(na, pkt_chain, stats);
3578 } else {
3579 DTRACE_SKYWALK2(no__rx__cb, struct nx_netif *, nif,
3580 struct __kern_packet *, pkt_chain);
3581 dropstat = NETIF_STATS_DROP_NO_RX_CB;
3582 goto drop;
3583 }
3584
3585 return;
3586
3587 drop:
3588 dropcnt = 0;
3589 nx_netif_free_packet_chain(pkt_chain, &dropcnt);
3590 if (dropstat != -1) {
3591 STATS_ADD(nifs, dropstat, dropcnt);
3592 }
3593 STATS_ADD(nifs, NETIF_STATS_DROP, dropcnt);
3594 }
3595
3596 static slot_idx_t
3597 netif_rate_limit(struct __kern_channel_ring *r, uint64_t rate,
3598 slot_idx_t begin, slot_idx_t end, boolean_t *rate_limited)
3599 {
3600 uint64_t elapsed;
3601 uint64_t now;
3602 struct __kern_packet *pkt;
3603 clock_sec_t sec;
3604 clock_usec_t usec;
3605 slot_idx_t i;
3606
3607 if (__probable(rate == 0)) {
3608 return end;
3609 }
3610
3611 /* init tbr if not so */
3612 if (__improbable(r->ckr_tbr_token == CKR_TBR_TOKEN_INVALID)) {
3613 r->ckr_tbr_token = rate;
3614 r->ckr_tbr_depth = rate;
3615 r->ckr_tbr_last = mach_absolute_time();
3616 } else {
3617 now = mach_absolute_time();
3618 elapsed = now - r->ckr_tbr_last;
3619 absolutetime_to_microtime(elapsed, &sec, &usec);
3620 r->ckr_tbr_token +=
3621 ((sec * USEC_PER_SEC + usec) * rate / USEC_PER_SEC);
3622 if (__improbable(r->ckr_tbr_token > r->ckr_tbr_depth)) {
3623 r->ckr_tbr_token = r->ckr_tbr_depth;
3624 }
3625 r->ckr_tbr_last = now;
3626 }
3627
3628 *rate_limited = FALSE;
3629 for (i = begin; i != end; i = SLOT_NEXT(i, r->ckr_lim)) {
3630 pkt = KR_KSD(r, i)->sd_pkt;
3631 if (__improbable(pkt == NULL)) {
3632 continue;
3633 }
3634 if (__improbable(r->ckr_tbr_token <= 0)) {
3635 end = i;
3636 *rate_limited = TRUE;
3637 break;
3638 }
3639 r->ckr_tbr_token -= pkt->pkt_length * 8;
3640 }
3641
3642 SK_DF(SK_VERB_FSW | SK_VERB_RX, "ckr %p %s rate limited at %d",
3643 r, r->ckr_name, i);
3644
3645 return end;
3646 }
3647
3648 SK_NO_INLINE_ATTRIBUTE
3649 static struct __kern_packet *
3650 consume_pkts(struct __kern_channel_ring *ring, slot_idx_t end)
3651 {
3652 struct __kern_packet *__single pkt_chain = NULL;
3653 struct __kern_packet **tailp = &pkt_chain;
3654 slot_idx_t idx = ring->ckr_rhead;
3655
3656 while (idx != end) {
3657 struct __kern_slot_desc *ksd = KR_KSD(ring, idx);
3658 struct __kern_packet *pkt = ksd->sd_pkt;
3659
3660 ASSERT(pkt->pkt_nextpkt == NULL);
3661 KR_SLOT_DETACH_METADATA(ring, ksd);
3662 *tailp = pkt;
3663 tailp = &pkt->pkt_nextpkt;
3664 idx = SLOT_NEXT(idx, ring->ckr_lim);
3665 }
3666 ring->ckr_rhead = end;
3667 ring->ckr_rtail = ring->ckr_ktail;
3668 return pkt_chain;
3669 }
3670
3671 int
3672 netif_rx_notify_default(struct __kern_channel_ring *ring, struct proc *p,
3673 uint32_t flags)
3674 {
3675 struct nexus_adapter *hwna;
3676 struct nexus_netif_adapter *nifna;
3677 struct nx_netif *nif;
3678 struct __kern_packet *pkt_chain;
3679 struct nexus_pkt_stats stats = {0};
3680 sk_protect_t protect;
3681 slot_idx_t ktail;
3682 int err = 0;
3683
3684 KDBG((SK_KTRACE_NETIF_RX_NOTIFY_DEFAULT | DBG_FUNC_START),
3685 SK_KVA(ring));
3686
3687 ASSERT(ring->ckr_tx == NR_RX);
3688 ASSERT(!NA_KERNEL_ONLY(KRNA(ring)) || KR_KERNEL_ONLY(ring));
3689
3690 err = kr_enter(ring, ((flags & NA_NOTEF_CAN_SLEEP) != 0));
3691 if (err != 0) {
3692 /* not a serious error, so no need to be chatty here */
3693 SK_DF(SK_VERB_FSW,
3694 "hwna \"%s\" (0x%llx) kr \"%s\" (0x%llx) krflags 0x%b "
3695 "(%d)", KRNA(ring)->na_name, SK_KVA(KRNA(ring)),
3696 ring->ckr_name, SK_KVA(ring), ring->ckr_flags,
3697 CKRF_BITS, err);
3698 goto out;
3699 }
3700 if (__improbable(KR_DROP(ring))) {
3701 kr_exit(ring);
3702 err = ENODEV;
3703 goto out;
3704 }
3705 hwna = KRNA(ring);
3706 nifna = NIFNA(hwna);
3707 nif = nifna->nifna_netif;
3708 if (__improbable(hwna->na_ifp == NULL)) {
3709 kr_exit(ring);
3710 err = ENODEV;
3711 goto out;
3712 }
3713 protect = sk_sync_protect();
3714 err = ring->ckr_na_sync(ring, p, 0);
3715 if (err != 0 && err != EAGAIN) {
3716 goto put_out;
3717 }
3718
3719 /* read the tail pointer once */
3720 ktail = ring->ckr_ktail;
3721 if (__improbable(ring->ckr_khead == ktail)) {
3722 SK_DF(SK_VERB_FSW | SK_VERB_NOTIFY | SK_VERB_RX,
3723 "how strange, interrupt with no packets on hwna "
3724 "\"%s\" (0x%llx)", KRNA(ring)->na_name, SK_KVA(KRNA(ring)));
3725 goto put_out;
3726 }
3727 ktail = netif_rate_limit(ring, nif->nif_input_rate, ring->ckr_rhead,
3728 ktail, &ring->ckr_rate_limited);
3729
3730 pkt_chain = consume_pkts(ring, ktail);
3731 if (pkt_chain != NULL) {
3732 netif_receive(nifna, pkt_chain, &stats);
3733
3734 if (ring->ckr_netif_mit_stats != NULL &&
3735 stats.nps_pkts != 0 && stats.nps_bytes != 0) {
3736 ring->ckr_netif_mit_stats(ring, stats.nps_pkts,
3737 stats.nps_bytes);
3738 }
3739 }
3740
3741 put_out:
3742 sk_sync_unprotect(protect);
3743 kr_exit(ring);
3744
3745 out:
3746 KDBG((SK_KTRACE_NETIF_RX_NOTIFY_DEFAULT | DBG_FUNC_END),
3747 SK_KVA(ring), err);
3748 return err;
3749 }
3750
3751 int
3752 netif_rx_notify_fast(struct __kern_channel_ring *ring, struct proc *p,
3753 uint32_t flags)
3754 {
3755 #pragma unused(p, flags)
3756 sk_protect_t protect;
3757 struct nexus_adapter *hwna;
3758 struct nexus_pkt_stats stats = {0};
3759 uint32_t i, count;
3760 int err = 0;
3761
3762 KDBG((SK_KTRACE_NETIF_RX_NOTIFY_FAST | DBG_FUNC_START),
3763 SK_KVA(ring));
3764
3765 /* XXX
3766 * sk_sync_protect() is not needed for this case because
3767 * we are not using the dev ring. Unfortunately lots of
3768 * macros used by fsw still require this.
3769 */
3770 protect = sk_sync_protect();
3771 hwna = KRNA(ring);
3772 count = na_get_nslots(hwna, NR_RX);
3773 err = nx_rx_sync_packets(ring, ring->ckr_scratch, &count);
3774 if (__improbable(err != 0)) {
3775 SK_ERR("nx_rx_sync_packets failed: %d", err);
3776 DTRACE_SKYWALK2(rx__sync__packets__failed,
3777 struct __kern_channel_ring *, ring, int, err);
3778 goto out;
3779 }
3780 DTRACE_SKYWALK1(chain__count, uint32_t, count);
3781 for (i = 0; i < count; i++) {
3782 struct __kern_packet *pkt_chain;
3783
3784 pkt_chain = SK_PTR_ADDR_KPKT(ring->ckr_scratch[i]);
3785 ASSERT(pkt_chain != NULL);
3786 netif_receive(NIFNA(KRNA(ring)), pkt_chain, &stats);
3787
3788 if (ring->ckr_netif_mit_stats != NULL &&
3789 stats.nps_pkts != 0 && stats.nps_bytes != 0) {
3790 ring->ckr_netif_mit_stats(ring, stats.nps_pkts,
3791 stats.nps_bytes);
3792 }
3793 }
3794 out:
3795 sk_sync_unprotect(protect);
3796 KDBG((SK_KTRACE_NETIF_RX_NOTIFY_FAST | DBG_FUNC_END),
3797 SK_KVA(ring), err);
3798 return err;
3799 }
3800
3801
3802 /*
3803 * Configure the NA to operate in a particular mode.
3804 */
3805 static channel_ring_notify_t
3806 netif_hwna_get_notify(struct __kern_channel_ring *ring, netif_mode_t mode)
3807 {
3808 channel_ring_notify_t notify = NULL;
3809 boolean_t has_sync_pkts = (sk_rx_sync_packets != 0 &&
3810 nx_has_rx_sync_packets(ring));
3811
3812 if (mode == NETIF_MODE_FSW) {
3813 notify = (has_sync_pkts ? netif_rx_notify_fast :
3814 netif_rx_notify_default);
3815 } else if (mode == NETIF_MODE_LLW) {
3816 notify = (has_sync_pkts ? netif_llw_rx_notify_fast :
3817 netif_llw_rx_notify_default);
3818 }
3819 return notify;
3820 }
3821
3822
3823 static uint32_t
3824 netif_mode_to_flag(netif_mode_t mode)
3825 {
3826 uint32_t flag = 0;
3827
3828 if (mode == NETIF_MODE_FSW) {
3829 flag = NAF_MODE_FSW;
3830 } else if (mode == NETIF_MODE_LLW) {
3831 flag = NAF_MODE_LLW;
3832 }
3833 return flag;
3834 }
3835
3836 static void
3837 netif_hwna_config_mode(struct nexus_adapter *hwna, netif_mode_t mode,
3838 void (*rx)(struct nexus_adapter *, struct __kern_packet *,
3839 struct nexus_pkt_stats *), boolean_t set)
3840 {
3841 uint32_t i;
3842 uint32_t flag;
3843
3844 ASSERT(hwna->na_type == NA_NETIF_DEV ||
3845 hwna->na_type == NA_NETIF_COMPAT_DEV);
3846
3847 for (i = 0; i < na_get_nrings(hwna, NR_RX); i++) {
3848 struct __kern_channel_ring *kr = &NAKR(hwna, NR_RX)[i];
3849 channel_ring_notify_t notify = netif_hwna_get_notify(kr, mode);
3850
3851 if (set) {
3852 kr->ckr_save_notify = kr->ckr_netif_notify;
3853 kr->ckr_netif_notify = notify;
3854 } else {
3855 kr->ckr_netif_notify = kr->ckr_save_notify;
3856 kr->ckr_save_notify = NULL;
3857 }
3858 }
3859 if (set) {
3860 hwna->na_rx = rx;
3861 flag = netif_mode_to_flag(mode);
3862 os_atomic_or(&hwna->na_flags, flag, relaxed);
3863 } else {
3864 hwna->na_rx = NULL;
3865 os_atomic_andnot(&hwna->na_flags, (NAF_MODE_FSW | NAF_MODE_LLW), relaxed);
3866 }
3867 }
3868
3869 void
3870 netif_hwna_set_mode(struct nexus_adapter *hwna, netif_mode_t mode,
3871 void (*rx)(struct nexus_adapter *, struct __kern_packet *,
3872 struct nexus_pkt_stats *))
3873 {
3874 return netif_hwna_config_mode(hwna, mode, rx, TRUE);
3875 }
3876
3877 void
3878 netif_hwna_clear_mode(struct nexus_adapter *hwna)
3879 {
3880 return netif_hwna_config_mode(hwna, NETIF_MODE_NONE, NULL, FALSE);
3881 }
3882
3883 static void
3884 netif_inject_rx(struct nexus_adapter *na, struct __kern_packet *pkt_chain)
3885 {
3886 struct nexus_netif_adapter *nifna = NIFNA(na);
3887 struct nx_netif *nif = nifna->nifna_netif;
3888 struct netif_stats *nifs = &nif->nif_stats;
3889 struct __kern_channel_ring *r;
3890 struct nexus_pkt_stats stats;
3891 sk_protect_t protect;
3892 boolean_t ring_drop = FALSE;
3893 int err, dropcnt;
3894
3895 if (!NA_OWNED_BY_FSW(na)) {
3896 DTRACE_SKYWALK1(fsw__disabled, struct nexus_adapter *, na);
3897 goto fail;
3898 }
3899 ASSERT(na->na_rx != NULL);
3900
3901 /*
3902 * XXX
3903 * This function is called when a filter injects a packet back to the
3904 * regular RX path. We can assume the ring is 0 for now because RSS
3905 * is not supported. This needs to be revisited when we add support for
3906 * RSS.
3907 */
3908 r = &na->na_rx_rings[0];
3909 ASSERT(r->ckr_tx == NR_RX);
3910 err = kr_enter(r, TRUE);
3911 VERIFY(err == 0);
3912
3913 if (__improbable(KR_DROP(r))) {
3914 kr_exit(r);
3915 DTRACE_SKYWALK2(ring__drop, struct nexus_adapter *, na,
3916 struct __kern_channel_ring *, r);
3917 ring_drop = TRUE;
3918 goto fail;
3919 }
3920 protect = sk_sync_protect();
3921 na->na_rx(na, pkt_chain, &stats);
3922
3923 if (r->ckr_netif_mit_stats != NULL &&
3924 stats.nps_pkts != 0 && stats.nps_bytes != 0) {
3925 r->ckr_netif_mit_stats(r, stats.nps_pkts, stats.nps_bytes);
3926 }
3927 sk_sync_unprotect(protect);
3928
3929 kr_exit(r);
3930 return;
3931
3932 fail:
3933 dropcnt = 0;
3934 nx_netif_free_packet_chain(pkt_chain, &dropcnt);
3935 if (ring_drop) {
3936 STATS_ADD(nifs, NETIF_STATS_DROP_KRDROP_MODE, dropcnt);
3937 }
3938 STATS_ADD(nifs, NETIF_STATS_DROP, dropcnt);
3939 }
3940
3941 /*
3942 * This is called when an inbound packet has traversed all filters.
3943 */
3944 errno_t
3945 nx_netif_filter_rx_cb(struct nexus_netif_adapter *nifna,
3946 struct __kern_packet *fpkt_chain, uint32_t flags)
3947 {
3948 #pragma unused (flags)
3949 struct nx_netif *nif = nifna->nifna_netif;
3950 struct netif_stats *nifs = &nif->nif_stats;
3951 struct nexus_adapter *na = &nifna->nifna_up;
3952 struct __kern_packet *pkt_chain;
3953 int err;
3954
3955 pkt_chain = nx_netif_filter_pkt_to_pkt_chain(nifna,
3956 fpkt_chain, NETIF_CONVERT_RX);
3957 if (pkt_chain == NULL) {
3958 return ENOMEM;
3959 }
3960 if (nif->nif_flow_cnt > 0) {
3961 struct __kern_packet *__single remain = NULL;
3962
3963 err = nx_netif_demux(nifna, pkt_chain, &remain,
3964 NULL, NETIF_FLOW_INJECT);
3965 if (remain == NULL) {
3966 return err;
3967 }
3968 pkt_chain = remain;
3969 }
3970 if (na->na_rx != NULL) {
3971 netif_inject_rx(na, pkt_chain);
3972 } else {
3973 int dropcnt = 0;
3974 nx_netif_free_packet_chain(pkt_chain, &dropcnt);
3975 STATS_ADD(nifs,
3976 NETIF_STATS_FILTER_DROP_NO_RX_CB, dropcnt);
3977 STATS_ADD(nifs, NETIF_STATS_DROP, dropcnt);
3978 }
3979 return 0;
3980 }
3981
3982 /*
3983 * This is called when an outbound packet has traversed all filters.
3984 */
3985 errno_t
3986 nx_netif_filter_tx_cb(struct nexus_netif_adapter *nifna,
3987 struct __kern_packet *fpkt_chain, uint32_t flags)
3988 {
3989 #pragma unused (flags)
3990 struct nx_netif *nif = nifna->nifna_netif;
3991 struct nexus_adapter *na = &nifna->nifna_up;
3992 int err;
3993
3994 if (NETIF_IS_COMPAT(nif)) {
3995 struct mbuf *m_chain;
3996 mbuf_svc_class_t sc;
3997
3998 m_chain = nx_netif_filter_pkt_to_mbuf_chain(nifna,
3999 fpkt_chain, NETIF_CONVERT_TX);
4000 if (m_chain == NULL) {
4001 return ENOMEM;
4002 }
4003 /*
4004 * All packets in the chain have the same service class.
4005 * If the sc is missing or invalid, a valid value will be
4006 * returned.
4007 */
4008 sc = mbuf_get_service_class(m_chain);
4009 err = nx_netif_filter_tx_processed_mbuf_enqueue(nifna,
4010 sc, m_chain);
4011 } else {
4012 struct __kern_packet *pkt_chain;
4013 kern_packet_svc_class_t sc;
4014
4015 pkt_chain = nx_netif_filter_pkt_to_pkt_chain(nifna,
4016 fpkt_chain, NETIF_CONVERT_TX);
4017 if (pkt_chain == NULL) {
4018 return ENOMEM;
4019 }
4020 /*
4021 * All packets in the chain have the same service class.
4022 * If the sc is missing or invalid, a valid value will be
4023 * returned.
4024 */
4025 sc = kern_packet_get_service_class(SK_PKT2PH(pkt_chain));
4026 err = nx_netif_filter_tx_processed_pkt_enqueue(nifna,
4027 sc, pkt_chain);
4028 }
4029 /* Tell driver to resume dequeuing */
4030 ifnet_start(na->na_ifp);
4031 return err;
4032 }
4033
/*
 * Region parameter adjustment hook for netif virtual ports.
 * Intentionally a no-op: neither argument is examined or modified.
 */
void
nx_netif_vp_region_params_adjust(struct nexus_adapter *na,
    struct skmem_region_params *srp)
{
#pragma unused(na, srp)
}
4041
4042 /* returns true, if starter thread is utilized */
4043 static bool
4044 netif_use_starter_thread(struct ifnet *ifp, uint32_t flags)
4045 {
4046 #if (DEVELOPMENT || DEBUG)
4047 if (__improbable(nx_netif_force_ifnet_start != 0)) {
4048 ifnet_start(ifp);
4049 return true;
4050 }
4051 #endif /* !DEVELOPMENT && !DEBUG */
4052 /*
4053 * use starter thread in following conditions:
4054 * - interface is not skywalk native
4055 * - interface attached to virtual driver (ipsec, utun)
4056 * - TBR is enabled
4057 * - delayed start mechanism is in use
4058 * - remaining stack space on the thread is not enough for driver
4059 * - caller is in rx workloop context
4060 * - caller is from the flowswitch path doing ARP resolving
4061 * - caller requires the use of starter thread (stack usage)
4062 * - caller requires starter thread for pacing
4063 */
4064 if (!SKYWALK_NATIVE(ifp) || NA(ifp) == NULL ||
4065 !NA_IS_ACTIVE(&NA(ifp)->nifna_up) ||
4066 ((NA(ifp)->nifna_up.na_flags & NAF_VIRTUAL_DEVICE) != 0) ||
4067 IFCQ_TBR_IS_ENABLED(ifp->if_snd) ||
4068 (ifp->if_eflags & IFEF_ENQUEUE_MULTI) ||
4069 (flags & NETIF_XMIT_FLAG_PACING) != 0 ||
4070 sk_is_rx_notify_protected() ||
4071 sk_is_async_transmit_protected() ||
4072 (sk_is_sync_protected() && (flags & NETIF_XMIT_FLAG_HOST) != 0)) {
4073 DTRACE_SKYWALK2(use__starter__thread, struct ifnet *, ifp,
4074 uint32_t, flags);
4075 ifnet_start(ifp);
4076 return true;
4077 }
4078 lck_mtx_lock_spin(&ifp->if_start_lock);
4079 /* interface is flow controlled */
4080 if (__improbable(ifp->if_start_flags & IFSF_FLOW_CONTROLLED)) {
4081 lck_mtx_unlock(&ifp->if_start_lock);
4082 return true;
4083 }
4084 /* if starter thread is active, utilize it */
4085 if (ifp->if_start_active) {
4086 ifp->if_start_req++;
4087 lck_mtx_unlock(&ifp->if_start_lock);
4088 return true;
4089 }
4090 lck_mtx_unlock(&ifp->if_start_lock);
4091 /* Check remaining stack space */
4092 if ((OSKernelStackRemaining() < NX_NETIF_MIN_DRIVER_STACK_SIZE)) {
4093 ifnet_start(ifp);
4094 return true;
4095 }
4096 return false;
4097 }
4098
/*
 * Kick transmission on an interface: ring the doorbell inline unless
 * netif_use_starter_thread() reports that the kick was handed off.
 */
void
netif_transmit(struct ifnet *ifp, uint32_t flags)
{
	if (!netif_use_starter_thread(ifp, flags)) {
		nx_netif_doorbell_internal(ifp, flags);
	}
}
4107
4108 static struct ifclassq *
4109 netif_get_default_ifcq(struct nexus_adapter *hwna)
4110 {
4111 struct nx_netif *nif;
4112 struct ifclassq *ifcq;
4113
4114 nif = NX_NETIF_PRIVATE(hwna->na_nx);
4115 if (NETIF_LLINK_ENABLED(nif)) {
4116 struct netif_qset *qset;
4117
4118 /*
4119 * Use the default ifcq for now.
4120 * In the future this could be chosen by the caller.
4121 */
4122 qset = nx_netif_get_default_qset_noref(nif);
4123 ASSERT(qset != NULL);
4124 ifcq = qset->nqs_ifcq;
4125 } else {
4126 ifcq = nif->nif_ifp->if_snd;
4127 }
4128 return ifcq;
4129 }
4130
4131 static errno_t
4132 netif_deq_packets(struct nexus_adapter *hwna, struct ifclassq *ifcq,
4133 uint32_t pkt_limit, uint32_t byte_limit, struct __kern_packet **head,
4134 boolean_t *pkts_pending, kern_packet_svc_class_t sc,
4135 uint32_t *pkt_cnt, uint32_t *bytes, uint8_t qset_idx)
4136 {
4137 classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
4138 struct ifnet *ifp = hwna->na_ifp;
4139 uint32_t pkts_cnt;
4140 uint32_t bytes_cnt;
4141 errno_t rc;
4142
4143 ASSERT(ifp != NULL);
4144 ASSERT(ifp->if_output_sched_model < IFNET_SCHED_MODEL_MAX);
4145 ASSERT((pkt_limit != 0) && (byte_limit != 0));
4146
4147 if (ifcq == NULL) {
4148 ifcq = netif_get_default_ifcq(hwna);
4149 }
4150 if (ifp->if_output_sched_model == IFNET_SCHED_MODEL_DRIVER_MANAGED) {
4151 rc = ifclassq_dequeue_sc(ifcq, (mbuf_svc_class_t)sc,
4152 pkt_limit, byte_limit, &pkt_head, NULL, pkt_cnt, bytes, qset_idx);
4153 } else {
4154 rc = ifclassq_dequeue(ifcq, pkt_limit, byte_limit,
4155 &pkt_head, NULL, pkt_cnt, bytes, qset_idx);
4156 }
4157 ASSERT((rc == 0) || (rc == EAGAIN));
4158 ASSERT((pkt_head.cp_ptype == QP_PACKET) || (pkt_head.cp_kpkt == NULL));
4159
4160 ifclassq_get_len(ifcq, (mbuf_svc_class_t)sc, qset_idx,
4161 &pkts_cnt, &bytes_cnt);
4162 *pkts_pending = pkts_cnt > 0;
4163
4164 *head = pkt_head.cp_kpkt;
4165 return rc;
4166 }
4167
#if SK_LOG
/*
 * Emit a verbose (SYNC | TX) log line describing a TX ring that has no
 * free slots: adapter name, ring id and name, and the kernel/user
 * head and tail indices.
 *
 * Hoisted out of line to reduce kernel stack footprint
 */
SK_LOG_ATTRIBUTE
static void
netif_no_ring_space_log(const struct nexus_adapter *na,
    const kern_channel_ring_t ring)
{
	SK_DF(SK_VERB_SYNC | SK_VERB_TX,
	    "no ring space: na \"%s\" [%u] "
	    "\"%s\"(kh %u kt %u | rh %u rt %u)",
	    na->na_name,
	    ring->ckr_ring_id,
	    ring->ckr_name,
	    ring->ckr_khead, ring->ckr_ktail,
	    ring->ckr_rhead, ring->ckr_rtail);
}
#endif /* SK_LOG */
4184
4185 /*
4186 * netif refill function for rings
4187 */
4188 errno_t
4189 netif_ring_tx_refill(const kern_channel_ring_t ring, uint32_t pkt_limit,
4190 uint32_t byte_limit, boolean_t tx_doorbell_ctxt, boolean_t *pkts_pending,
4191 boolean_t canblock)
4192 {
4193 struct nexus_adapter *hwna;
4194 struct ifnet *ifp;
4195 struct __kern_packet *__single head = NULL;
4196 sk_protect_t protect;
4197 errno_t rc = 0;
4198 errno_t sync_err = 0;
4199 uint32_t npkts = 0, consumed = 0;
4200 uint32_t flags;
4201 slot_idx_t idx, ktail;
4202 int ring_space = 0;
4203
4204 KDBG((SK_KTRACE_NETIF_RING_TX_REFILL | DBG_FUNC_START), SK_KVA(ring));
4205
4206 VERIFY(ring != NULL);
4207 hwna = KRNA(ring);
4208 ifp = hwna->na_ifp;
4209
4210 ASSERT(hwna->na_type == NA_NETIF_DEV);
4211 ASSERT(ring->ckr_tx == NR_TX);
4212 *pkts_pending = FALSE;
4213
4214 if (__improbable(pkt_limit == 0 || byte_limit == 0)) {
4215 SK_ERR("invalid limits plim %d, blim %d",
4216 pkt_limit, byte_limit);
4217 rc = EINVAL;
4218 goto out;
4219 }
4220
4221 if (__improbable(!IF_FULLY_ATTACHED(ifp))) {
4222 SK_ERR("hwna 0x%llx ifp %s (0x%llx), interface not attached",
4223 SK_KVA(hwna), if_name(ifp), SK_KVA(ifp));
4224 rc = ENXIO;
4225 goto out;
4226 }
4227
4228 if (__improbable((ifp->if_start_flags & IFSF_FLOW_CONTROLLED) != 0)) {
4229 SK_DF(SK_VERB_SYNC | SK_VERB_TX, "hwna 0x%llx ifp %s (0x%llx), "
4230 "flow control ON", SK_KVA(hwna), if_name(ifp), SK_KVA(ifp));
4231 rc = ENXIO;
4232 goto out;
4233 }
4234
4235 /*
4236 * if the ring is busy, it means another dequeue is in
4237 * progress, so ignore this request and return success.
4238 */
4239 if (kr_enter(ring, canblock) != 0) {
4240 rc = 0;
4241 goto out;
4242 }
4243 /* mark thread with sync-in-progress flag */
4244 protect = sk_sync_protect();
4245
4246 if (__improbable(KR_DROP(ring) ||
4247 !NA_IS_ACTIVE(ring->ckr_na))) {
4248 SK_ERR("hw-kr 0x%llx stopped", SK_KVA(ring));
4249 rc = ENXIO;
4250 goto done;
4251 }
4252
4253 idx = ring->ckr_rhead;
4254 ktail = ring->ckr_ktail;
4255 /* calculate available space on tx ring */
4256 ring_space = ktail - idx;
4257 if (ring_space < 0) {
4258 ring_space += ring->ckr_num_slots;
4259 }
4260 if (ring_space == 0) {
4261 struct ifclassq *ifcq;
4262
4263 /* no space in ring, driver should retry */
4264 #if SK_LOG
4265 if (__improbable((sk_verbose &
4266 (SK_VERB_SYNC | SK_VERB_TX)) != 0)) {
4267 netif_no_ring_space_log(hwna, ring);
4268 }
4269 #endif /* SK_LOG */
4270 ifcq = netif_get_default_ifcq(hwna);
4271 if (IFCQ_LEN(ifcq) != 0) {
4272 *pkts_pending = TRUE;
4273 }
4274 /*
4275 * We ran out of space in ring, most probably
4276 * because the driver is slow to drain its TX queue.
4277 * We want another doorbell to be generated as soon
4278 * as the TX notify completion happens; mark this
4279 * through ckr_pending_doorbell counter. Do this
4280 * regardless of whether there's any pending packet.
4281 */
4282 ring->ckr_pending_doorbell++;
4283 rc = EAGAIN;
4284 goto sync_ring;
4285 }
4286
4287 if ((uint32_t)ring_space < pkt_limit) {
4288 pkt_limit = ring_space;
4289 }
4290
4291 if (tx_doorbell_ctxt &&
4292 ((hwna->na_flags & NAF_VIRTUAL_DEVICE) == 0)) {
4293 pkt_limit = MIN(pkt_limit,
4294 nx_netif_doorbell_max_dequeue);
4295 }
4296
4297 rc = netif_deq_packets(hwna, NULL, pkt_limit, byte_limit,
4298 &head, pkts_pending, ring->ckr_svc, NULL, NULL, 0);
4299
4300 /*
4301 * There's room in ring; if we haven't dequeued everything,
4302 * mark ckr_pending_doorbell for the next TX notify to issue
4303 * a TX door bell; otherwise, clear it. The next packet that
4304 * gets enqueued will trigger a door bell again.
4305 */
4306 if (*pkts_pending) {
4307 ring->ckr_pending_doorbell++;
4308 } else if (ring->ckr_pending_doorbell != 0) {
4309 ring->ckr_pending_doorbell = 0;
4310 }
4311
4312 if (rc != 0) {
4313 /*
4314 * This is expected sometimes as the IOSkywalkFamily
4315 * errs on the side of caution to perform an extra
4316 * dequeue when multiple doorbells are pending;
4317 * nothing to dequeue, do a sync if there are slots
4318 * to reclaim else just return.
4319 */
4320 SK_DF(SK_VERB_SYNC | SK_VERB_TX,
4321 "nothing to dequeue, err %d", rc);
4322
4323 if ((uint32_t)ring_space == ring->ckr_lim) {
4324 goto done;
4325 } else {
4326 goto sync_ring;
4327 }
4328 }
4329 /* move the dequeued packets to tx ring */
4330 while (head != NULL && idx != ktail) {
4331 ASSERT(npkts <= pkt_limit);
4332 struct __kern_packet *pkt = head;
4333 KR_SLOT_ATTACH_METADATA(ring, KR_KSD(ring, idx),
4334 (struct __kern_quantum *)pkt);
4335 npkts++;
4336 if (__improbable(pkt->pkt_trace_id != 0)) {
4337 KDBG(SK_KTRACE_PKT_TX_AQM | DBG_FUNC_END, pkt->pkt_trace_id);
4338 KDBG(SK_KTRACE_PKT_TX_DRV | DBG_FUNC_START, pkt->pkt_trace_id);
4339 }
4340 idx = SLOT_NEXT(idx, ring->ckr_lim);
4341 head = pkt->pkt_nextpkt;
4342 pkt->pkt_nextpkt = NULL;
4343 }
4344
4345 /*
4346 * We checked for ring space earlier so the ring should have enough
4347 * space for the entire chain.
4348 */
4349 ASSERT(head == NULL);
4350 ring->ckr_rhead = idx;
4351
4352 sync_ring:
4353 flags = NA_SYNCF_NETIF;
4354 if (ring->ckr_pending_doorbell != 0) {
4355 flags |= (NA_SYNCF_NETIF_DOORBELL | NA_SYNCF_NETIF_ASYNC);
4356 }
4357
4358 ring->ckr_khead_pre = ring->ckr_khead;
4359 sync_err = ring->ckr_na_sync(ring, kernproc, flags);
4360 if (sync_err != 0 && sync_err != EAGAIN) {
4361 SK_ERR("unexpected sync err %d", sync_err);
4362 if (rc == 0) {
4363 rc = sync_err;
4364 }
4365 goto done;
4366 }
4367 /*
4368 * Verify that the driver has detached packets from the consumed slots.
4369 */
4370 idx = ring->ckr_khead_pre;
4371 consumed = 0;
4372 while (idx != ring->ckr_khead) {
4373 struct __kern_slot_desc *ksd = KR_KSD(ring, idx);
4374
4375 consumed++;
4376 VERIFY(!KSD_VALID_METADATA(ksd));
4377 idx = SLOT_NEXT(idx, ring->ckr_lim);
4378 }
4379 ring->ckr_khead_pre = ring->ckr_khead;
4380
4381 done:
4382 sk_sync_unprotect(protect);
4383 kr_exit(ring);
4384 out:
4385 KDBG((SK_KTRACE_NETIF_RING_TX_REFILL | DBG_FUNC_END),
4386 SK_KVA(ring), rc, 0, npkts);
4387
4388 return rc;
4389 }
4390
/*
 * Exponentially weighted moving average with a power-of-two weight.
 * When `old' is non-zero it becomes
 *     ((old << decay) - old + new) >> decay
 * computed in 64 bits; when `old' is zero it is seeded with `new'.
 * `old' must be an lvalue; it is read once and written once.
 */
#define NQ_EWMA(old, new, decay) do {                                   \
	u_int64_t _old = (old);                                         \
	u_int64_t _avg;                                                 \
	if (__probable(_old > 0)) {                                     \
	        _avg = (((_old << (decay)) - _old) + (new)) >> (decay); \
	} else {                                                        \
	        _avg = (new);                                           \
	}                                                               \
	(old) = _avg;                                                   \
} while (0)
4399
4400 void
4401 kern_netif_increment_queue_stats(kern_netif_queue_t queue,
4402 uint32_t pkt_count, uint32_t byte_count)
4403 {
4404 struct netif_llink *llink = queue->nq_qset->nqs_llink;
4405 struct ifnet *ifp = llink->nll_nif->nif_ifp;
4406 if ((queue->nq_flags & NETIF_QUEUE_IS_RX) == 0) {
4407 os_atomic_add(&ifp->if_data.ifi_opackets, pkt_count, relaxed);
4408 os_atomic_add(&ifp->if_data.ifi_obytes, byte_count, relaxed);
4409 } else {
4410 os_atomic_add(&ifp->if_data.ifi_ipackets, pkt_count, relaxed);
4411 os_atomic_add(&ifp->if_data.ifi_ibytes, byte_count, relaxed);
4412 }
4413
4414 if (ifp->if_data_threshold != 0) {
4415 ifnet_notify_data_threshold(ifp);
4416 }
4417
4418 uint64_t now;
4419 uint64_t diff_secs;
4420 struct netif_qstats *stats = &queue->nq_stats;
4421
4422 if (nq_stat_enable == 0) {
4423 return;
4424 }
4425
4426 if (__improbable(pkt_count == 0)) {
4427 return;
4428 }
4429
4430 stats->nq_num_xfers++;
4431 stats->nq_total_bytes += byte_count;
4432 stats->nq_total_pkts += pkt_count;
4433 if (pkt_count > stats->nq_max_pkts) {
4434 stats->nq_max_pkts = pkt_count;
4435 }
4436 if (stats->nq_min_pkts == 0 ||
4437 pkt_count < stats->nq_min_pkts) {
4438 stats->nq_min_pkts = pkt_count;
4439 }
4440
4441 now = net_uptime();
4442 if (__probable(queue->nq_accumulate_start != 0)) {
4443 diff_secs = now - queue->nq_accumulate_start;
4444 if (diff_secs >= nq_accumulate_interval) {
4445 uint64_t bps;
4446 uint64_t pps;
4447 uint64_t pps_ma;
4448
4449 /* bytes per second */
4450 bps = queue->nq_accumulated_bytes / diff_secs;
4451 NQ_EWMA(stats->nq_bytes_ps_ma,
4452 bps, nq_transfer_decay);
4453 stats->nq_bytes_ps = bps;
4454
4455 /* pkts per second */
4456 pps = queue->nq_accumulated_pkts / diff_secs;
4457 pps_ma = stats->nq_pkts_ps_ma;
4458 NQ_EWMA(pps_ma, pps, nq_transfer_decay);
4459 stats->nq_pkts_ps_ma = (uint32_t)pps_ma;
4460 stats->nq_pkts_ps = (uint32_t)pps;
4461
4462 /* start over */
4463 queue->nq_accumulate_start = now;
4464 queue->nq_accumulated_bytes = 0;
4465 queue->nq_accumulated_pkts = 0;
4466
4467 stats->nq_min_pkts = 0;
4468 stats->nq_max_pkts = 0;
4469 }
4470 } else {
4471 queue->nq_accumulate_start = now;
4472 }
4473 queue->nq_accumulated_bytes += byte_count;
4474 queue->nq_accumulated_pkts += pkt_count;
4475 }
4476
4477 void
4478 kern_netif_queue_rx_enqueue(kern_netif_queue_t queue, kern_packet_t ph_chain,
4479 uint32_t count, uint32_t flags)
4480 {
4481 #pragma unused (count)
4482 struct netif_queue *q = queue;
4483 struct netif_llink *llink = q->nq_qset->nqs_llink;
4484 struct __kern_packet *pkt_chain = SK_PTR_ADDR_KPKT(ph_chain);
4485 bool flush = ((flags & KERN_NETIF_QUEUE_RX_ENQUEUE_FLAG_FLUSH) != 0);
4486 struct pktq *pktq = &q->nq_pktq;
4487 struct netif_stats *nifs = &llink->nll_nif->nif_stats;
4488 struct nexus_pkt_stats stats = {0};
4489 sk_protect_t protect;
4490
4491 ASSERT((q->nq_flags & NETIF_QUEUE_IS_RX) != 0);
4492 if (llink->nll_state == NETIF_LLINK_STATE_DESTROYED) {
4493 int drop_cnt = 0;
4494
4495 pp_free_packet_chain(pkt_chain, &drop_cnt);
4496 STATS_ADD(nifs, NETIF_STATS_LLINK_RX_DROP_BAD_STATE, drop_cnt);
4497 return;
4498 }
4499 KPKTQ_ENQUEUE_LIST(pktq, pkt_chain);
4500 if (flush) {
4501 pkt_chain = KPKTQ_FIRST(pktq);
4502 KPKTQ_INIT(pktq);
4503
4504 protect = sk_sync_protect();
4505 netif_receive(NA(llink->nll_nif->nif_ifp), pkt_chain, &stats);
4506 sk_sync_unprotect(protect);
4507 kern_netif_increment_queue_stats(queue, (uint32_t)stats.nps_pkts,
4508 (uint32_t)stats.nps_bytes);
4509 }
4510 }
4511
4512 errno_t
4513 kern_netif_queue_tx_dequeue(kern_netif_queue_t queue, uint32_t pkt_limit,
4514 uint32_t byte_limit, boolean_t *pending, kern_packet_t *ph_chain)
4515 {
4516 struct netif_queue *q = queue;
4517 struct netif_llink *llink = q->nq_qset->nqs_llink;
4518 struct netif_stats *nifs = &llink->nll_nif->nif_stats;
4519 struct nexus_adapter *hwna;
4520 struct __kern_packet *__single pkt_chain = NULL;
4521 uint32_t bytes = 0, pkt_cnt = 0;
4522 errno_t rc;
4523
4524 ASSERT((q->nq_flags & NETIF_QUEUE_IS_RX) == 0);
4525 if (llink->nll_state == NETIF_LLINK_STATE_DESTROYED) {
4526 STATS_INC(nifs, NETIF_STATS_LLINK_AQM_DEQ_BAD_STATE);
4527 return ENXIO;
4528 }
4529 hwna = &NA(llink->nll_nif->nif_ifp)->nifna_up;
4530
4531 if (((hwna->na_flags & NAF_VIRTUAL_DEVICE) == 0) &&
4532 sk_is_tx_notify_protected()) {
4533 pkt_limit = MIN(pkt_limit, nx_netif_doorbell_max_dequeue);
4534 }
4535 rc = netif_deq_packets(hwna, q->nq_qset->nqs_ifcq, pkt_limit,
4536 byte_limit, &pkt_chain, pending, q->nq_svc, &pkt_cnt, &bytes,
4537 q->nq_qset->nqs_idx);
4538
4539 if (pkt_cnt > 0) {
4540 kern_netif_increment_queue_stats(queue, pkt_cnt, bytes);
4541 }
4542 if (pkt_chain != NULL) {
4543 *ph_chain = SK_PKT2PH(pkt_chain);
4544 }
4545 return rc;
4546 }
4547
4548 errno_t
4549 kern_netif_qset_tx_queue_len(kern_netif_qset_t qset, uint32_t svc,
4550 uint32_t * pkts_cnt, uint32_t * bytes_cnt)
4551 {
4552 VERIFY(qset != NULL);
4553 VERIFY(pkts_cnt != NULL);
4554 VERIFY(bytes_cnt != NULL);
4555
4556 return ifclassq_get_len(qset->nqs_ifcq, svc, qset->nqs_idx, pkts_cnt,
4557 bytes_cnt);
4558 }
4559
4560 void
4561 kern_netif_set_qset_combined(kern_netif_qset_t qset)
4562 {
4563 VERIFY(qset != NULL);
4564 VERIFY(qset->nqs_ifcq != NULL);
4565
4566 ifclassq_set_grp_combined(qset->nqs_ifcq, qset->nqs_idx);
4567 }
4568
4569 void
4570 kern_netif_set_qset_separate(kern_netif_qset_t qset)
4571 {
4572 VERIFY(qset != NULL);
4573 VERIFY(qset->nqs_ifcq != NULL);
4574
4575 ifclassq_set_grp_separated(qset->nqs_ifcq, qset->nqs_idx);
4576 }
4577
4578 errno_t
4579 kern_nexus_netif_llink_add(struct kern_nexus *nx,
4580 struct kern_nexus_netif_llink_init *llink_init)
4581 {
4582 errno_t err;
4583 struct nx_netif *nif;
4584 struct netif_llink *__single llink;
4585 struct netif_stats *nifs;
4586
4587 VERIFY(nx != NULL);
4588 VERIFY(llink_init != NULL);
4589 VERIFY((nx->nx_flags & NXF_ATTACHED) != 0);
4590
4591 nif = NX_NETIF_PRIVATE(nx);
4592 nifs = &nif->nif_stats;
4593
4594 err = nx_netif_validate_llink_config(llink_init, false);
4595 if (err != 0) {
4596 SK_ERR("Invalid llink init params");
4597 STATS_INC(nifs, NETIF_STATS_LLINK_ADD_BAD_PARAMS);
4598 return err;
4599 }
4600
4601 err = nx_netif_llink_add(nif, llink_init, &llink);
4602 return err;
4603 }
4604
4605 errno_t
4606 kern_nexus_netif_llink_remove(struct kern_nexus *nx,
4607 kern_nexus_netif_llink_id_t llink_id)
4608 {
4609 struct nx_netif *nif;
4610
4611 VERIFY(nx != NULL);
4612 VERIFY((nx->nx_flags & NXF_ATTACHED) != 0);
4613
4614 nif = NX_NETIF_PRIVATE(nx);
4615 return nx_netif_llink_remove(nif, llink_id);
4616 }
4617
4618 errno_t
4619 kern_netif_queue_get_service_class(kern_netif_queue_t queue,
4620 kern_packet_svc_class_t *svc)
4621 {
4622 *svc = queue->nq_svc;
4623 return 0;
4624 }
4625