1 /*
2 * Copyright (c) 2015-2023 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 /*
30 * The netif nexus domain has two domain providers: native and compat, with
31 * the latter being the default provider of this domain. The compat provider
32 * has special handlers for NXCFG_CMD_ATTACH and NXCFG_CMD_DETACH, etc.
33 *
34 * A netif nexus instance can be in a native or compat mode; in either case,
35 * it is associated with two instances of a nexus_adapter structure, and allows
36 * at most two channels opened to the nexus. The two adapters correspond to
37 * host and device ports, respectively.
38 *
39 * By itself, a netif nexus isn't associated with a network interface. The
40 * association happens by attaching a network interface to the nexus instance.
41 * A channel can only be successfully opened to a netif nexus after it has an
42 * interface attached to it.
43 *
44 * During an attach, the interface is marked as Skywalk-capable, and its ifnet
45 * structure refers to the attached netif nexus adapter via its if_na field.
46 * The nexus also holds a reference to the interface on its na_ifp field. Note
47 * that attaching to a netif_compat nexus does not alter the input/output data
48 * path, nor does it remove any of the interface's hardware offload flags. It
49 * merely associates the interface and netif nexus together.
50 *
51 * During a detach, the above references are dropped and the fields are cleared;
52 * the interface is also marked as non-Skywalk-capable. This detach can happen
53 * explicitly via a command down the nexus, or implicitly when the nexus goes
54 * away (assuming there's no channel opened to it.)
55 *
56 * A userland channel can be opened to a netif nexus via the usual ch_open()
57 * way, assuming the nexus provider is setup to allow access for the userland
58 * process (either by binding the nexus port to PID, etc. or by creating the
59 * nexus in the anonymous mode.)
60 *
61 * Alternatively, a kernel channel can also be opened to it by some kernel
62 * subsystem, via ch_open_special(), e.g. by the flowswitch. Kernel channels
63 * don't have any task mapping created, and the flag CHANF_KERNEL is used to
64 * indicate that.
65 *
66 * Opening a channel to the host port of a native or compat netif causes the
67 * ifnet output path to be redirected to nx_netif_host_transmit(). We also,
68 * at present, disable any hardware offload features.
69 *
70 * Opening a channel to the device port of a compat netif causes the ifnet
71 * input path to be redirected to nx_netif_compat_receive(). This is specific
72 * to the compat variant, as the native variant's RX path already goes to
73 * the native netif.
74 *
75 * During channel close, we restore the original I/O callbacks, as well as the
76 * interface's offload flags.
77 */
78
79 #include <skywalk/os_skywalk_private.h>
80 #include <skywalk/nexus/netif/nx_netif.h>
81 #include <skywalk/nexus/upipe/nx_user_pipe.h>
82 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
83 #include <sys/kdebug.h>
84 #include <sys/sdt.h>
85 #include <os/refcnt.h>
86 #include <libkern/OSDebug.h>
87
/*
 * Limits and defaults advertised by the netif domain; these feed the
 * {default, min, max} bounds in nx_netif_dom_s.
 */
#define NX_NETIF_MAXRINGS       NX_MAX_NUM_RING_PAIR     /* max # of rings */
#define NX_NETIF_MINSLOTS       2                        /* min # of slots */
#define NX_NETIF_MAXSLOTS       NX_MAX_NUM_SLOT_PER_RING /* max # of slots */
#define NX_NETIF_TXRINGSIZE     512                      /* default TX ring size */
#define NX_NETIF_RXRINGSIZE     1024                     /* default RX ring size */
#define NX_NETIF_BUFSIZE        (2 * 1024)               /* default buffer size */
#define NX_NETIF_MINBUFSIZE     (128)                    /* min buffer size */
#define NX_NETIF_MAXBUFSIZE     (32 * 1024)              /* max buffer size */

/*
 * TODO: [email protected] -- minimum buflets for now; we will need to
 * have a way to adjust this based on the underlying interface's
 * parameters, e.g. jumbo MTU, large segment offload, etc.
 */
#define NX_NETIF_UMD_SIZE       _USER_PACKET_SIZE(BUFLETS_MIN)
#define NX_NETIF_KMD_SIZE       _KERN_PACKET_SIZE(BUFLETS_MIN)

/*
 * Minimum stack space required for IOSkywalkFamily and driver execution:
 * half of the kernel stack on macOS, a quarter of it elsewhere.
 */
#if XNU_TARGET_OS_OSX
#define NX_NETIF_MIN_DRIVER_STACK_SIZE  (kernel_stack_size >> 1)
#else /* !XNU_TARGET_OS_OSX */
#define NX_NETIF_MIN_DRIVER_STACK_SIZE  (kernel_stack_size >> 2)
#endif /* XNU_TARGET_OS_OSX */
113
114 static void nx_netif_dom_init(struct nxdom *);
115 static void nx_netif_dom_terminate(struct nxdom *);
116 static void nx_netif_dom_fini(struct nxdom *);
117 static int nx_netif_prov_params_adjust(
118 const struct kern_nexus_domain_provider *, const struct nxprov_params *,
119 struct nxprov_adjusted_params *);
120
121 static int nx_netif_dom_bind_port(struct kern_nexus *, nexus_port_t *,
122 struct nxbind *, void *);
123 static int nx_netif_dom_unbind_port(struct kern_nexus *, nexus_port_t);
124 static int nx_netif_dom_connect(struct kern_nexus_domain_provider *,
125 struct kern_nexus *, struct kern_channel *, struct chreq *,
126 struct kern_channel *, struct nxbind *, struct proc *);
127 static void nx_netif_dom_disconnect(struct kern_nexus_domain_provider *,
128 struct kern_nexus *, struct kern_channel *);
129 static void nx_netif_dom_defunct(struct kern_nexus_domain_provider *,
130 struct kern_nexus *, struct kern_channel *, struct proc *);
131 static void nx_netif_dom_defunct_finalize(struct kern_nexus_domain_provider *,
132 struct kern_nexus *, struct kern_channel *, boolean_t);
133
134 static void nx_netif_doorbell(struct ifnet *);
135 static int nx_netif_na_txsync(struct __kern_channel_ring *, struct proc *,
136 uint32_t);
137 static int nx_netif_na_rxsync(struct __kern_channel_ring *, struct proc *,
138 uint32_t);
139 static void nx_netif_na_dtor(struct nexus_adapter *na);
140 static int nx_netif_na_notify_tx(struct __kern_channel_ring *, struct proc *,
141 uint32_t);
142 static int nx_netif_na_notify_rx(struct __kern_channel_ring *, struct proc *,
143 uint32_t);
144 static int nx_netif_na_activate(struct nexus_adapter *, na_activate_mode_t);
145
146 static int nx_netif_ctl(struct kern_nexus *, nxcfg_cmd_t, void *,
147 struct proc *);
148 static int nx_netif_ctl_attach(struct kern_nexus *, struct nx_spec_req *,
149 struct proc *);
150 static int nx_netif_ctl_detach(struct kern_nexus *, struct nx_spec_req *);
151 static int nx_netif_attach(struct kern_nexus *, struct ifnet *);
152 static void nx_netif_flags_init(struct nx_netif *);
153 static void nx_netif_flags_fini(struct nx_netif *);
154 static void nx_netif_callbacks_init(struct nx_netif *);
155 static void nx_netif_callbacks_fini(struct nx_netif *);
156 static void nx_netif_capabilities_fini(struct nx_netif *);
157 static errno_t nx_netif_interface_advisory_notify(void *,
158 const struct ifnet_interface_advisory *);
159
160 struct nxdom nx_netif_dom_s = {
161 .nxdom_prov_head =
162 STAILQ_HEAD_INITIALIZER(nx_netif_dom_s.nxdom_prov_head),
163 .nxdom_type = NEXUS_TYPE_NET_IF,
164 .nxdom_md_type = NEXUS_META_TYPE_PACKET,
165 .nxdom_md_subtype = NEXUS_META_SUBTYPE_RAW,
166 .nxdom_name = "netif",
167 .nxdom_ports = {
168 .nb_def = 2,
169 .nb_min = 2,
170 .nb_max = NX_NETIF_MAXPORTS,
171 },
172 .nxdom_tx_rings = {
173 .nb_def = 1,
174 .nb_min = 1,
175 .nb_max = NX_NETIF_MAXRINGS,
176 },
177 .nxdom_rx_rings = {
178 .nb_def = 1,
179 .nb_min = 1,
180 .nb_max = NX_NETIF_MAXRINGS,
181 },
182 .nxdom_tx_slots = {
183 .nb_def = NX_NETIF_TXRINGSIZE,
184 .nb_min = NX_NETIF_MINSLOTS,
185 .nb_max = NX_NETIF_MAXSLOTS,
186 },
187 .nxdom_rx_slots = {
188 .nb_def = NX_NETIF_RXRINGSIZE,
189 .nb_min = NX_NETIF_MINSLOTS,
190 .nb_max = NX_NETIF_MAXSLOTS,
191 },
192 .nxdom_buf_size = {
193 .nb_def = NX_NETIF_BUFSIZE,
194 .nb_min = NX_NETIF_MINBUFSIZE,
195 .nb_max = NX_NETIF_MAXBUFSIZE,
196 },
197 .nxdom_large_buf_size = {
198 .nb_def = 0,
199 .nb_min = 0,
200 .nb_max = 0,
201 },
202 .nxdom_meta_size = {
203 .nb_def = NX_NETIF_UMD_SIZE,
204 .nb_min = NX_NETIF_UMD_SIZE,
205 .nb_max = NX_METADATA_USR_MAX_SZ,
206 },
207 .nxdom_stats_size = {
208 .nb_def = 0,
209 .nb_min = 0,
210 .nb_max = NX_STATS_MAX_SZ,
211 },
212 .nxdom_pipes = {
213 .nb_def = 0,
214 .nb_min = 0,
215 .nb_max = NX_UPIPE_MAXPIPES,
216 },
217 .nxdom_flowadv_max = {
218 .nb_def = 0,
219 .nb_min = 0,
220 .nb_max = NX_FLOWADV_MAX,
221 },
222 .nxdom_nexusadv_size = {
223 .nb_def = 0,
224 .nb_min = 0,
225 .nb_max = NX_NEXUSADV_MAX_SZ,
226 },
227 .nxdom_capabilities = {
228 .nb_def = NXPCAP_USER_CHANNEL,
229 .nb_min = 0,
230 .nb_max = NXPCAP_USER_CHANNEL,
231 },
232 .nxdom_qmap = {
233 .nb_def = NEXUS_QMAP_TYPE_DEFAULT,
234 .nb_min = NEXUS_QMAP_TYPE_DEFAULT,
235 .nb_max = NEXUS_QMAP_TYPE_WMM,
236 },
237 .nxdom_max_frags = {
238 .nb_def = NX_PBUF_FRAGS_DEFAULT,
239 .nb_min = NX_PBUF_FRAGS_MIN,
240 .nb_max = NX_PBUF_FRAGS_MAX,
241 },
242 .nxdom_init = nx_netif_dom_init,
243 .nxdom_terminate = nx_netif_dom_terminate,
244 .nxdom_fini = nx_netif_dom_fini,
245 .nxdom_find_port = NULL,
246 .nxdom_port_is_reserved = NULL,
247 .nxdom_bind_port = nx_netif_dom_bind_port,
248 .nxdom_unbind_port = nx_netif_dom_unbind_port,
249 .nxdom_connect = nx_netif_dom_connect,
250 .nxdom_disconnect = nx_netif_dom_disconnect,
251 .nxdom_defunct = nx_netif_dom_defunct,
252 .nxdom_defunct_finalize = nx_netif_dom_defunct_finalize,
253 };
254
255 struct kern_nexus_domain_provider nx_netif_prov_s = {
256 .nxdom_prov_name = NEXUS_PROVIDER_NET_IF,
257 /*
258 * Don't install this as the default domain provider, i.e.
259 * NXDOMPROVF_DEFAULT flag not set; we want netif_compat
260 * provider to be the one handling userland-issued requests
261 * coming down thru nxprov_create() instead.
262 */
263 .nxdom_prov_flags = 0,
264 .nxdom_prov_cb = {
265 .dp_cb_init = nx_netif_prov_init,
266 .dp_cb_fini = nx_netif_prov_fini,
267 .dp_cb_params = nx_netif_prov_params,
268 .dp_cb_mem_new = nx_netif_prov_mem_new,
269 .dp_cb_config = nx_netif_prov_config,
270 .dp_cb_nx_ctor = nx_netif_prov_nx_ctor,
271 .dp_cb_nx_dtor = nx_netif_prov_nx_dtor,
272 .dp_cb_nx_mem_info = nx_netif_prov_nx_mem_info,
273 .dp_cb_nx_mib_get = nx_netif_prov_nx_mib_get,
274 .dp_cb_nx_stop = nx_netif_prov_nx_stop,
275 },
276 };
277
278 struct nexus_ifnet_ops na_netif_ops = {
279 .ni_finalize = na_netif_finalize,
280 .ni_reap = nx_netif_reap,
281 .ni_dequeue = nx_netif_native_tx_dequeue,
282 .ni_get_len = nx_netif_native_tx_get_len,
283 };
284
285 #define NX_NETIF_DOORBELL_MAX_DEQUEUE 64
286 uint32_t nx_netif_doorbell_max_dequeue = NX_NETIF_DOORBELL_MAX_DEQUEUE;
287
288 #define NQ_TRANSFER_DECAY 2 /* ilog2 of EWMA decay rate (4) */
289 static uint32_t nq_transfer_decay = NQ_TRANSFER_DECAY;
290
291 #define NQ_ACCUMULATE_INTERVAL 2 /* 2 seconds */
292 static uint32_t nq_accumulate_interval = NQ_ACCUMULATE_INTERVAL;
293
294 SYSCTL_EXTENSIBLE_NODE(_kern_skywalk, OID_AUTO, netif,
295 CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Skywalk network interface");
296 #if (DEVELOPMENT || DEBUG)
297 SYSCTL_STRING(_kern_skywalk_netif, OID_AUTO, sk_ll_prefix,
298 CTLFLAG_RW | CTLFLAG_LOCKED, sk_ll_prefix, sizeof(sk_ll_prefix),
299 "ifname prefix for enabling low latency support");
300 static uint32_t nx_netif_force_ifnet_start = 0;
301 SYSCTL_UINT(_kern_skywalk_netif, OID_AUTO, force_ifnet_start,
302 CTLFLAG_RW | CTLFLAG_LOCKED, &nx_netif_force_ifnet_start, 0,
303 "always use ifnet starter thread");
304 SYSCTL_UINT(_kern_skywalk_netif, OID_AUTO, doorbell_max_dequeue,
305 CTLFLAG_RW | CTLFLAG_LOCKED, &nx_netif_doorbell_max_dequeue,
306 NX_NETIF_DOORBELL_MAX_DEQUEUE,
307 "max packets to dequeue in doorbell context");
308 SYSCTL_UINT(_kern_skywalk_netif, OID_AUTO, netif_queue_transfer_decay,
309 CTLFLAG_RW | CTLFLAG_LOCKED, &nq_transfer_decay,
310 NQ_TRANSFER_DECAY, "ilog2 of EWMA decay rate of netif queue transfers");
311 SYSCTL_UINT(_kern_skywalk_netif, OID_AUTO, netif_queue_stat_accumulate_interval,
312 CTLFLAG_RW | CTLFLAG_LOCKED, &nq_accumulate_interval,
313 NQ_ACCUMULATE_INTERVAL, "accumulation interval for netif queue stats");
314 #endif /* !DEVELOPMENT && !DEBUG */
315
316 SYSCTL_UINT(_kern_skywalk_netif, OID_AUTO, netif_queue_stat_enable,
317 CTLFLAG_RW | CTLFLAG_LOCKED, &sk_netif_queue_stat_enable,
318 0, "enable/disable stats collection for netif queue");
319
320 static SKMEM_TYPE_DEFINE(na_netif_zone, struct nexus_netif_adapter);
321
322 static SKMEM_TYPE_DEFINE(nx_netif_zone, struct nx_netif);
323
324 #define SKMEM_TAG_NETIF_MIT "com.apple.skywalk.netif.mit"
325 static SKMEM_TAG_DEFINE(skmem_tag_netif_mit, SKMEM_TAG_NETIF_MIT);
326
327 #define SKMEM_TAG_NETIF_FILTER "com.apple.skywalk.netif.filter"
328 SKMEM_TAG_DEFINE(skmem_tag_netif_filter, SKMEM_TAG_NETIF_FILTER);
329
330 #define SKMEM_TAG_NETIF_FLOW "com.apple.skywalk.netif.flow"
331 SKMEM_TAG_DEFINE(skmem_tag_netif_flow, SKMEM_TAG_NETIF_FLOW);
332
333 #define SKMEM_TAG_NETIF_AGENT_FLOW "com.apple.skywalk.netif.agent_flow"
334 SKMEM_TAG_DEFINE(skmem_tag_netif_agent_flow, SKMEM_TAG_NETIF_AGENT_FLOW);
335
336 #define SKMEM_TAG_NETIF_LLINK "com.apple.skywalk.netif.llink"
337 SKMEM_TAG_DEFINE(skmem_tag_netif_llink, SKMEM_TAG_NETIF_LLINK);
338
339 #define SKMEM_TAG_NETIF_QSET "com.apple.skywalk.netif.qset"
340 SKMEM_TAG_DEFINE(skmem_tag_netif_qset, SKMEM_TAG_NETIF_QSET);
341
342 #define SKMEM_TAG_NETIF_LLINK_INFO "com.apple.skywalk.netif.llink_info"
343 SKMEM_TAG_DEFINE(skmem_tag_netif_llink_info, SKMEM_TAG_NETIF_LLINK_INFO);
344
345 /* use this for any temporary allocations */
346 #define SKMEM_TAG_NETIF_TEMP "com.apple.skywalk.netif.temp"
347 static SKMEM_TAG_DEFINE(skmem_tag_netif_temp, SKMEM_TAG_NETIF_TEMP);
348
349 static void
nx_netif_dom_init(struct nxdom * nxdom)350 nx_netif_dom_init(struct nxdom *nxdom)
351 {
352 SK_LOCK_ASSERT_HELD();
353 ASSERT(!(nxdom->nxdom_flags & NEXUSDOMF_INITIALIZED));
354
355 _CASSERT(NEXUS_PORT_NET_IF_DEV == 0);
356 _CASSERT(NEXUS_PORT_NET_IF_HOST == 1);
357 _CASSERT(NEXUS_PORT_NET_IF_CLIENT == 2);
358 _CASSERT(SK_NETIF_MIT_FORCE_OFF < SK_NETIF_MIT_FORCE_SIMPLE);
359 _CASSERT(SK_NETIF_MIT_FORCE_SIMPLE < SK_NETIF_MIT_FORCE_ADVANCED);
360 _CASSERT(SK_NETIF_MIT_FORCE_ADVANCED < SK_NETIF_MIT_AUTO);
361 _CASSERT(SK_NETIF_MIT_AUTO == SK_NETIF_MIT_MAX);
362
363 (void) nxdom_prov_add(nxdom, &nx_netif_prov_s);
364
365 nx_netif_compat_init(nxdom);
366
367 ASSERT(nxdom_prov_default[nxdom->nxdom_type] != NULL &&
368 strbufcmp(nxdom_prov_default[nxdom->nxdom_type]->nxdom_prov_name,
369 NEXUS_PROVIDER_NET_IF_COMPAT) == 0);
370
371 netif_gso_init();
372 }
373
374 static void
nx_netif_dom_terminate(struct nxdom * nxdom)375 nx_netif_dom_terminate(struct nxdom *nxdom)
376 {
377 struct kern_nexus_domain_provider *nxdom_prov, *tnxdp;
378
379 SK_LOCK_ASSERT_HELD();
380
381 netif_gso_fini();
382 nx_netif_compat_fini();
383
384 STAILQ_FOREACH_SAFE(nxdom_prov, &nxdom->nxdom_prov_head,
385 nxdom_prov_link, tnxdp) {
386 (void) nxdom_prov_del(nxdom_prov);
387 }
388 }
389
/*
 * Domain fini callback; the netif domain has nothing to release here.
 */
static void
nx_netif_dom_fini(struct nxdom *nxdom)
{
#pragma unused(nxdom)
	/* intentionally empty */
}
395
396 int
nx_netif_prov_init(struct kern_nexus_domain_provider * nxdom_prov)397 nx_netif_prov_init(struct kern_nexus_domain_provider *nxdom_prov)
398 {
399 #pragma unused(nxdom_prov)
400 SK_D("initializing %s", nxdom_prov->nxdom_prov_name);
401 return 0;
402 }
403
404 static int
nx_netif_na_notify_drop(struct __kern_channel_ring * kring,struct proc * p,uint32_t flags)405 nx_netif_na_notify_drop(struct __kern_channel_ring *kring, struct proc *p,
406 uint32_t flags)
407 {
408 #pragma unused(kring, p, flags)
409 return ENXIO;
410 }
411
412 int
nx_netif_prov_nx_stop(struct kern_nexus * nx)413 nx_netif_prov_nx_stop(struct kern_nexus *nx)
414 {
415 uint32_t r;
416 struct nexus_adapter *na = nx_port_get_na(nx, NEXUS_PORT_NET_IF_DEV);
417 struct nexus_netif_adapter *nifna = NIFNA(na);
418
419 SK_LOCK_ASSERT_HELD();
420 ASSERT(nx != NULL);
421
422 /* place all rings in drop mode */
423 na_kr_drop(na, TRUE);
424
425 /* ensure global visibility */
426 os_atomic_thread_fence(seq_cst);
427
428 /* reset all TX notify callbacks */
429 for (r = 0; r < na_get_nrings(na, NR_TX); r++) {
430 while (!os_atomic_cmpxchg((void * volatile *)&na->na_tx_rings[r].ckr_na_notify,
431 ptrauth_nop_cast(void *__single, na->na_tx_rings[r].ckr_na_notify),
432 ptrauth_nop_cast(void *__single, &nx_netif_na_notify_drop), acq_rel)) {
433 ;
434 }
435 os_atomic_thread_fence(seq_cst);
436 if (nifna->nifna_tx_mit != NULL) {
437 nx_netif_mit_cleanup(&nifna->nifna_tx_mit[r]);
438 }
439 }
440 if (nifna->nifna_tx_mit != NULL) {
441 skn_free_type_array_counted_by(tx, struct nx_netif_mit,
442 nifna->nifna_tx_mit_count, nifna->nifna_tx_mit);
443 }
444
445 /* reset all RX notify callbacks */
446 for (r = 0; r < na_get_nrings(na, NR_RX); r++) {
447 while (!os_atomic_cmpxchg((void * volatile *)&na->na_rx_rings[r].ckr_na_notify,
448 ptrauth_nop_cast(void *__single, na->na_rx_rings[r].ckr_na_notify),
449 ptrauth_nop_cast(void *__single, &nx_netif_na_notify_drop), acq_rel)) {
450 ;
451 }
452 os_atomic_thread_fence(seq_cst);
453 if (nifna->nifna_rx_mit != NULL) {
454 nx_netif_mit_cleanup(&nifna->nifna_rx_mit[r]);
455 }
456 }
457 if (nifna->nifna_rx_mit != NULL) {
458 skn_free_type_array_counted_by(rx, struct nx_netif_mit,
459 nifna->nifna_rx_mit_count, nifna->nifna_rx_mit);
460 }
461 return 0;
462 }
463
464 static inline void
nx_netif_compat_adjust_ring_size(struct nxprov_adjusted_params * adj,ifnet_t ifp)465 nx_netif_compat_adjust_ring_size(struct nxprov_adjusted_params *adj,
466 ifnet_t ifp)
467 {
468 const char *ifname;
469
470 ifname = __terminated_by_to_indexable(ifp->if_name);
471 if (IFNET_IS_CELLULAR(ifp) && (ifp->if_unit != 0)) {
472 *(adj->adj_rx_slots) = sk_netif_compat_aux_cell_rx_ring_sz;
473 *(adj->adj_tx_slots) = sk_netif_compat_aux_cell_tx_ring_sz;
474 } else if (IFNET_IS_WIFI(ifp)) {
475 if (ifname[0] == 'a' && ifname[1] == 'p' &&
476 ifname[2] == '\0') {
477 /* Wi-Fi Access Point */
478 *(adj->adj_rx_slots) = sk_netif_compat_wap_rx_ring_sz;
479 *(adj->adj_tx_slots) = sk_netif_compat_wap_tx_ring_sz;
480 } else if (ifp->if_eflags & IFEF_AWDL) {
481 /* AWDL */
482 *(adj->adj_rx_slots) = sk_netif_compat_awdl_rx_ring_sz;
483 *(adj->adj_tx_slots) = sk_netif_compat_awdl_tx_ring_sz;
484 } else {
485 /* Wi-Fi infrastructure */
486 *(adj->adj_rx_slots) = sk_netif_compat_wif_rx_ring_sz;
487 *(adj->adj_tx_slots) = sk_netif_compat_wif_tx_ring_sz;
488 }
489 } else if (IFNET_IS_ETHERNET(ifp)) {
490 #if !XNU_TARGET_OS_OSX
491 /*
492 * On non-macOS platforms, treat all compat Ethernet
493 * interfaces as USB Ethernet with reduced ring sizes.
494 */
495 *(adj->adj_rx_slots) = sk_netif_compat_usb_eth_rx_ring_sz;
496 *(adj->adj_tx_slots) = sk_netif_compat_usb_eth_tx_ring_sz;
497 #else /* XNU_TARGET_OS_OSX */
498 if (ifp->if_subfamily == IFNET_SUBFAMILY_USB) {
499 *(adj->adj_rx_slots) =
500 sk_netif_compat_usb_eth_rx_ring_sz;
501 *(adj->adj_tx_slots) =
502 sk_netif_compat_usb_eth_tx_ring_sz;
503 }
504 #endif /* XNU_TARGET_OS_OSX */
505 }
506 }
507
508 static int
nx_netif_prov_params_adjust(const struct kern_nexus_domain_provider * nxdom_prov,const struct nxprov_params * nxp,struct nxprov_adjusted_params * adj)509 nx_netif_prov_params_adjust(const struct kern_nexus_domain_provider *nxdom_prov,
510 const struct nxprov_params *nxp, struct nxprov_adjusted_params *adj)
511 {
512 /*
513 * for netif compat adjust the following parameters for memory
514 * optimization:
515 * - change the size of buffer object to 128 bytes.
516 * - don't allocate rx ring for host port and tx ring for dev port.
517 * - for cellular interfaces other than pdp_ip0 reduce the ring size.
518 * Assumption here is that pdp_ip0 is always used as the data
519 * interface.
520 * - reduce the ring size for AWDL interface.
521 * - reduce the ring size for USB ethernet interface.
522 */
523 if (strbufcmp(nxdom_prov->nxdom_prov_name,
524 NEXUS_PROVIDER_NET_IF_COMPAT) == 0) {
525 /*
526 * Leave the parameters default if userspace access may be
527 * needed. We can't use skywalk_direct_allowed() here because
528 * the drivers have not attached yet.
529 */
530 if (skywalk_netif_direct_enabled()) {
531 goto done;
532 }
533
534 *(adj->adj_buf_size) = NETIF_COMPAT_BUF_SIZE;
535 *(adj->adj_tx_rings) = 1;
536 if (IF_INDEX_IN_RANGE(nxp->nxp_ifindex)) {
537 ifnet_t ifp;
538 ifnet_head_lock_shared();
539 ifp = ifindex2ifnet[nxp->nxp_ifindex];
540 ifnet_head_done();
541 VERIFY(ifp != NULL);
542 nx_netif_compat_adjust_ring_size(adj, ifp);
543 }
544 } else { /* netif native */
545 if (nxp->nxp_flags & NXPF_NETIF_LLINK) {
546 *(adj->adj_tx_slots) = NX_NETIF_MINSLOTS;
547 *(adj->adj_rx_slots) = NX_NETIF_MINSLOTS;
548 }
549 /*
550 * Add another extra ring for host port. Note that if the
551 * nexus isn't configured to use the same pbufpool for all of
552 * its ports, we'd end up allocating extra here.
553 * Not a big deal since that case isn't the default.
554 */
555 *(adj->adj_tx_rings) += 1;
556 *(adj->adj_rx_rings) += 1;
557
558 if ((*(adj->adj_buf_size) < PKT_MAX_PROTO_HEADER_SIZE)) {
559 SK_ERR("buf size too small, min (%d)",
560 PKT_MAX_PROTO_HEADER_SIZE);
561 return EINVAL;
562 }
563 _CASSERT(sizeof(struct __kern_netif_intf_advisory) ==
564 NX_INTF_ADV_SIZE);
565 *(adj->adj_nexusadv_size) = sizeof(struct netif_nexus_advisory);
566 }
567 done:
568 return 0;
569 }
570
571 int
nx_netif_prov_params(struct kern_nexus_domain_provider * nxdom_prov,const uint32_t req,const struct nxprov_params * nxp0,struct nxprov_params * nxp,struct skmem_region_params srp[SKMEM_REGIONS],uint32_t pp_region_config_flags)572 nx_netif_prov_params(struct kern_nexus_domain_provider *nxdom_prov,
573 const uint32_t req, const struct nxprov_params *nxp0,
574 struct nxprov_params *nxp, struct skmem_region_params srp[SKMEM_REGIONS],
575 uint32_t pp_region_config_flags)
576 {
577 struct nxdom *nxdom = nxdom_prov->nxdom_prov_dom;
578
579 return nxprov_params_adjust(nxdom_prov, req, nxp0, nxp, srp,
580 nxdom, nxdom, nxdom, pp_region_config_flags,
581 nx_netif_prov_params_adjust);
582 }
583
584 int
nx_netif_prov_mem_new(struct kern_nexus_domain_provider * nxdom_prov,struct kern_nexus * nx,struct nexus_adapter * na)585 nx_netif_prov_mem_new(struct kern_nexus_domain_provider *nxdom_prov,
586 struct kern_nexus *nx, struct nexus_adapter *na)
587 {
588 #pragma unused(nxdom_prov)
589 int err = 0;
590 boolean_t pp_truncated_buf = FALSE;
591 boolean_t allow_direct;
592 boolean_t kernel_only;
593
594 SK_DF(SK_VERB_NETIF,
595 "nx 0x%llx (\"%s\":\"%s\") na \"%s\" (0x%llx)", SK_KVA(nx),
596 NX_DOM(nx)->nxdom_name, nxdom_prov->nxdom_prov_name, na->na_name,
597 SK_KVA(na));
598
599 ASSERT(na->na_arena == NULL);
600 if ((na->na_type == NA_NETIF_COMPAT_DEV) ||
601 (na->na_type == NA_NETIF_COMPAT_HOST)) {
602 pp_truncated_buf = TRUE;
603 }
604 /*
605 * We do this check to determine whether to create the extra
606 * regions needed for userspace access. This is per interface.
607 * NX_USER_CHANNEL_PROV() is systemwide so it can't be used.
608 */
609 allow_direct = skywalk_netif_direct_allowed(
610 __unsafe_null_terminated_from_indexable(na->na_name));
611
612 /*
613 * Both ports (host and dev) share the same packet buffer pool;
614 * the first time a port gets opened will allocate the pp that
615 * gets stored in the nexus, which will then be used by any
616 * subsequent opens.
617 */
618 kernel_only = !allow_direct || !NX_USER_CHANNEL_PROV(nx);
619 na->na_arena = skmem_arena_create_for_nexus(na,
620 NX_PROV(nx)->nxprov_region_params, &nx->nx_tx_pp,
621 &nx->nx_rx_pp, pp_truncated_buf, kernel_only, &nx->nx_adv, &err);
622 ASSERT(na->na_arena != NULL || err != 0);
623 ASSERT(nx->nx_tx_pp == NULL || (nx->nx_tx_pp->pp_md_type ==
624 NX_DOM(nx)->nxdom_md_type && nx->nx_tx_pp->pp_md_subtype ==
625 NX_DOM(nx)->nxdom_md_subtype));
626
627 return err;
628 }
629
630 SK_NO_INLINE_ATTRIBUTE
631 static int
nx_netif_get_llink_info(struct sockopt * sopt,struct kern_nexus * nx)632 nx_netif_get_llink_info(struct sockopt *sopt, struct kern_nexus *nx)
633 {
634 struct nx_llink_info_req *nlir = NULL;
635 struct nx_netif *nif;
636 struct netif_llink *llink;
637 uint16_t llink_cnt;
638 size_t len, user_len;
639 int err, i;
640
641 nif = NX_NETIF_PRIVATE(nx);
642 if (!NETIF_LLINK_ENABLED(nif)) {
643 SK_ERR("llink mode not enabled");
644 return ENOTSUP;
645 }
646 lck_rw_lock_shared(&nif->nif_llink_lock);
647 llink_cnt = nif->nif_llink_cnt;
648 if (llink_cnt == 0) {
649 SK_ERR("zero llink cnt");
650 err = ENXIO;
651 goto done;
652 }
653 len = sizeof(*nlir) + (sizeof(struct nx_llink_info) * llink_cnt);
654 /* preserve sopt_valsize because it gets overwritten by copyin */
655 user_len = sopt->sopt_valsize;
656 if (user_len < len) {
657 SK_ERR("buffer too small");
658 err = ENOBUFS;
659 goto done;
660 }
661 nlir = sk_alloc_data(len, Z_WAITOK, skmem_tag_netif_llink_info);
662 if (nlir == NULL) {
663 SK_ERR("failed to allocate nlir");
664 err = ENOMEM;
665 goto done;
666 }
667 err = sooptcopyin(sopt, nlir, sizeof(*nlir), sizeof(*nlir));
668 if (err != 0) {
669 SK_ERR("copyin failed: %d", err);
670 goto done;
671 }
672 if (nlir->nlir_version != NETIF_LLINK_INFO_VERSION) {
673 SK_ERR("nlir version mismatch: %d != %d",
674 nlir->nlir_version, NETIF_LLINK_INFO_VERSION);
675 err = ENOTSUP;
676 goto done;
677 }
678 nlir->nlir_llink_cnt = llink_cnt;
679 i = 0;
680 STAILQ_FOREACH(llink, &nif->nif_llink_list, nll_link) {
681 struct nx_llink_info *nli;
682 struct netif_qset *qset;
683 uint16_t qset_cnt;
684 int j;
685
686 nli = &nlir->nlir_llink[i];
687 nli->nli_link_id = llink->nll_link_id;
688 nli->nli_link_id_internal = llink->nll_link_id_internal;
689 nli->nli_state = llink->nll_state;
690 nli->nli_flags = llink->nll_flags;
691
692 qset_cnt = llink->nll_qset_cnt;
693 ASSERT(qset_cnt <= NETIF_LLINK_MAX_QSETS);
694 nli->nli_qset_cnt = qset_cnt;
695
696 j = 0;
697 SLIST_FOREACH(qset, &llink->nll_qset_list, nqs_list) {
698 struct nx_qset_info *nqi;
699
700 nqi = &nli->nli_qset[j];
701 nqi->nqi_id = qset->nqs_id;
702 nqi->nqi_flags = qset->nqs_flags;
703 nqi->nqi_num_rx_queues = qset->nqs_num_rx_queues;
704 nqi->nqi_num_tx_queues = qset->nqs_num_tx_queues;
705 j++;
706 }
707 ASSERT(j == qset_cnt);
708 i++;
709 }
710 ASSERT(i == llink_cnt);
711 sopt->sopt_valsize = user_len;
712 err = sooptcopyout(sopt, nlir, len);
713 if (err != 0) {
714 SK_ERR("sooptcopyout failed: %d", err);
715 }
716 done:
717 lck_rw_unlock_shared(&nif->nif_llink_lock);
718 if (nlir != NULL) {
719 sk_free_data(nlir, len);
720 }
721 return err;
722 }
723
724 int
nx_netif_prov_config(struct kern_nexus_domain_provider * nxdom_prov,struct kern_nexus * nx,struct nx_cfg_req * ncr,int sopt_dir,struct proc * p,kauth_cred_t cred)725 nx_netif_prov_config(struct kern_nexus_domain_provider *nxdom_prov,
726 struct kern_nexus *nx, struct nx_cfg_req *ncr, int sopt_dir,
727 struct proc *p, kauth_cred_t cred)
728 {
729 #pragma unused(nxdom_prov)
730 struct sockopt sopt;
731 int err = 0;
732
733 SK_LOCK_ASSERT_HELD();
734
735 /* proceed only if the client possesses netif entitlement */
736 if ((err = skywalk_priv_check_cred(p, cred,
737 PRIV_SKYWALK_REGISTER_NET_IF)) != 0) {
738 goto done;
739 }
740
741 if (ncr->nc_req == USER_ADDR_NULL) {
742 err = EINVAL;
743 goto done;
744 }
745
746 /* to make life easier for handling copies */
747 bzero(&sopt, sizeof(sopt));
748 sopt.sopt_dir = sopt_dir;
749 sopt.sopt_val = ncr->nc_req;
750 sopt.sopt_valsize = ncr->nc_req_len;
751 sopt.sopt_p = p;
752
753 switch (ncr->nc_cmd) {
754 case NXCFG_CMD_ATTACH:
755 case NXCFG_CMD_DETACH: {
756 struct nx_spec_req nsr;
757
758 bzero(&nsr, sizeof(nsr));
759 err = sooptcopyin(&sopt, &nsr, sizeof(nsr), sizeof(nsr));
760 if (err != 0) {
761 goto done;
762 }
763
764 /*
765 * Null-terminate in case this has an interface name;
766 * the union is already large enough for uuid_t.
767 */
768 nsr.nsr_name[sizeof(nsr.nsr_name) - 1] = '\0';
769 if (p != kernproc) {
770 nsr.nsr_flags &= NXSPECREQ_MASK;
771 }
772
773 err = nx_netif_ctl(nx, ncr->nc_cmd, &nsr, p);
774 if (err != 0) {
775 goto done;
776 }
777
778 /* XXX: [email protected] -- can this copyout fail? */
779 (void) sooptcopyout(&sopt, &nsr, sizeof(nsr));
780 break;
781 }
782 case NXCFG_CMD_FLOW_ADD:
783 case NXCFG_CMD_FLOW_DEL: {
784 _CASSERT(offsetof(struct nx_flow_req, _nfr_kernel_field_end) ==
785 offsetof(struct nx_flow_req, _nfr_common_field_end));
786 struct nx_flow_req nfr;
787
788 bzero(&nfr, sizeof(nfr));
789 err = sooptcopyin(&sopt, &nfr, sizeof(nfr), sizeof(nfr));
790 if (err != 0) {
791 goto done;
792 }
793
794 err = nx_netif_ctl(nx, ncr->nc_cmd, &nfr, p);
795 if (err != 0) {
796 goto done;
797 }
798
799 /* XXX: [email protected] -- can this copyout fail? */
800 (void) sooptcopyout(&sopt, &nfr, sizeof(nfr));
801 break;
802 }
803 case NXCFG_CMD_GET_LLINK_INFO: {
804 err = nx_netif_get_llink_info(&sopt, nx);
805 break;
806 }
807 default:
808 err = EINVAL;
809 goto done;
810 }
811 done:
812 SK_DF(err ? SK_VERB_ERROR : SK_VERB_NETIF,
813 "nexus 0x%llx (%s) cmd %d err %d", SK_KVA(nx),
814 NX_DOM_PROV(nx)->nxdom_prov_name, ncr->nc_cmd, err);
815 return err;
816 }
817
818 void
nx_netif_prov_fini(struct kern_nexus_domain_provider * nxdom_prov)819 nx_netif_prov_fini(struct kern_nexus_domain_provider *nxdom_prov)
820 {
821 #pragma unused(nxdom_prov)
822 SK_D("destroying %s", nxdom_prov->nxdom_prov_name);
823 }
824
825 int
nx_netif_prov_nx_ctor(struct kern_nexus * nx)826 nx_netif_prov_nx_ctor(struct kern_nexus *nx)
827 {
828 struct nx_netif *n;
829 char name[64];
830 const char *__null_terminated nxadv_name = NULL;
831 int error;
832
833 SK_LOCK_ASSERT_HELD();
834 ASSERT(nx->nx_arg == NULL);
835
836 SK_D("nexus 0x%llx (%s)", SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name);
837
838 nx->nx_arg = nx_netif_alloc(Z_WAITOK);
839 n = NX_NETIF_PRIVATE(nx);
840 if (NX_USER_CHANNEL_PROV(nx) &&
841 NX_PROV(nx)->nxprov_params->nxp_nexusadv_size != 0) {
842 nxadv_name = tsnprintf(name, sizeof(name), "netif_%llu", nx->nx_id);
843 error = nx_advisory_alloc(nx, nxadv_name,
844 &NX_PROV(nx)->nxprov_region_params[SKMEM_REGION_NEXUSADV],
845 NEXUS_ADVISORY_TYPE_NETIF);
846 if (error != 0) {
847 nx_netif_free(n);
848 return error;
849 }
850 }
851 n->nif_nx = nx;
852 SK_D("create new netif 0x%llx for nexus 0x%llx",
853 SK_KVA(NX_NETIF_PRIVATE(nx)), SK_KVA(nx));
854 return 0;
855 }
856
857 void
nx_netif_prov_nx_dtor(struct kern_nexus * nx)858 nx_netif_prov_nx_dtor(struct kern_nexus *nx)
859 {
860 struct nx_netif *n = NX_NETIF_PRIVATE(nx);
861
862 SK_LOCK_ASSERT_HELD();
863
864 SK_D("nexus 0x%llx (%s) netif 0x%llx", SK_KVA(nx),
865 NX_DOM_PROV(nx)->nxdom_prov_name, SK_KVA(n));
866
867 /*
868 * XXX
869 * detach should be done separately to be symmetrical with attach.
870 */
871 nx_advisory_free(nx);
872 if (nx_port_get_na(nx, NEXUS_PORT_NET_IF_DEV) != NULL) {
873 /* we're called by nx_detach(), so this cannot fail */
874 int err = nx_netif_ctl_detach(nx, NULL);
875 VERIFY(err == 0);
876 }
877 if (n->nif_dev_nxb != NULL) {
878 nxb_free(n->nif_dev_nxb);
879 n->nif_dev_nxb = NULL;
880 }
881 if (n->nif_host_nxb != NULL) {
882 nxb_free(n->nif_host_nxb);
883 n->nif_host_nxb = NULL;
884 }
885 SK_DF(SK_VERB_NETIF, "marking netif 0x%llx as free", SK_KVA(n));
886 nx_netif_free(n);
887 nx->nx_arg = NULL;
888 }
889
890 int
nx_netif_prov_nx_mem_info(struct kern_nexus * nx,struct kern_pbufpool ** tpp,struct kern_pbufpool ** rpp)891 nx_netif_prov_nx_mem_info(struct kern_nexus *nx, struct kern_pbufpool **tpp,
892 struct kern_pbufpool **rpp)
893 {
894 ASSERT(nx->nx_tx_pp != NULL);
895 ASSERT(nx->nx_rx_pp != NULL);
896
897 if (tpp != NULL) {
898 *tpp = nx->nx_tx_pp;
899 }
900 if (rpp != NULL) {
901 *rpp = nx->nx_rx_pp;
902 }
903
904 return 0;
905 }
906
907 static size_t
__netif_mib_get_stats(struct kern_nexus * nx,void * out,size_t len)908 __netif_mib_get_stats(struct kern_nexus *nx, void *out, size_t len)
909 {
910 struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
911 struct ifnet *ifp = nif->nif_ifp;
912 struct sk_stats_net_if *__single sns = out;
913 size_t actual_space = sizeof(struct sk_stats_net_if);
914
915 if (out != NULL && actual_space <= len) {
916 uuid_copy(sns->sns_nx_uuid, nx->nx_uuid);
917 if (ifp != NULL) {
918 (void) strlcpy(sns->sns_if_name, if_name(ifp), IFNAMSIZ);
919 }
920 sns->sns_nifs = nif->nif_stats;
921 }
922
923 return actual_space;
924 }
925
926 static size_t
__netif_mib_get_llinks(struct kern_nexus * nx,void * __sized_by (len)out,size_t len)927 __netif_mib_get_llinks(struct kern_nexus *nx, void *__sized_by(len) out, size_t len)
928 {
929 struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
930 struct nx_llink_info *nli_list = out;
931 size_t actual_space = 0;
932 if (NETIF_LLINK_ENABLED(nif)) {
933 lck_rw_lock_shared(&nif->nif_llink_lock);
934 actual_space += nif->nif_llink_cnt * sizeof(struct nx_llink_info);
935
936 if (out != NULL && actual_space <= len) {
937 struct netif_llink *llink;
938 int i = 0;
939 STAILQ_FOREACH(llink, &nif->nif_llink_list, nll_link) {
940 struct nx_llink_info *nli;
941 struct netif_qset *qset;
942 uint16_t qset_cnt;
943 int j;
944
945 nli = &nli_list[i];
946 uuid_copy(nli->nli_netif_uuid, nx->nx_uuid);
947 nli->nli_link_id = llink->nll_link_id;
948 nli->nli_link_id_internal = llink->nll_link_id_internal;
949 nli->nli_state = llink->nll_state;
950 nli->nli_flags = llink->nll_flags;
951
952 qset_cnt = llink->nll_qset_cnt;
953 ASSERT(qset_cnt <= NETIF_LLINK_MAX_QSETS);
954 nli->nli_qset_cnt = qset_cnt;
955
956 j = 0;
957 SLIST_FOREACH(qset, &llink->nll_qset_list, nqs_list) {
958 struct nx_qset_info *nqi;
959
960 nqi = &nli->nli_qset[j];
961 nqi->nqi_id = qset->nqs_id;
962 nqi->nqi_flags = qset->nqs_flags;
963 nqi->nqi_num_rx_queues = qset->nqs_num_rx_queues;
964 nqi->nqi_num_tx_queues = qset->nqs_num_tx_queues;
965 j++;
966 }
967 ASSERT(j == qset_cnt);
968 i++;
969 }
970 ASSERT(i == nif->nif_llink_cnt);
971 }
972 lck_rw_unlock_shared(&nif->nif_llink_lock);
973 }
974
975 return actual_space;
976 }
977
978 static size_t
__netif_mib_get_queue_stats(struct kern_nexus * nx,void * __sized_by (len)out,size_t len)979 __netif_mib_get_queue_stats(struct kern_nexus *nx, void *__sized_by(len) out, size_t len)
980 {
981 struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
982 uint8_t *itr = out;
983 size_t actual_space = 0;
984 if (!NETIF_LLINK_ENABLED(nif)) {
985 return actual_space;
986 }
987
988 lck_rw_lock_shared(&nif->nif_llink_lock);
989 struct netif_llink *llink;
990 struct netif_qset *qset;
991 STAILQ_FOREACH(llink, &nif->nif_llink_list, nll_link) {
992 SLIST_FOREACH(qset, &llink->nll_qset_list, nqs_list) {
993 actual_space += sizeof(struct netif_qstats_info) *
994 (qset->nqs_num_rx_queues + qset->nqs_num_tx_queues);
995 }
996 }
997 if (out == NULL || actual_space > len) {
998 lck_rw_unlock_shared(&nif->nif_llink_lock);
999 return actual_space;
1000 }
1001
1002 llink = NULL;
1003 qset = NULL;
1004 uint16_t i = 0, j = 0;
1005 STAILQ_FOREACH(llink, &nif->nif_llink_list, nll_link) {
1006 uint16_t qset_cnt;
1007 j = 0;
1008 qset_cnt = llink->nll_qset_cnt;
1009 ASSERT(qset_cnt <= NETIF_LLINK_MAX_QSETS);
1010 SLIST_FOREACH(qset, &llink->nll_qset_list, nqs_list) {
1011 int queue_cnt = qset->nqs_num_rx_queues +
1012 qset->nqs_num_tx_queues;
1013 for (uint16_t k = 0; k < queue_cnt; k++) {
1014 struct netif_qstats_info *nqi =
1015 (struct netif_qstats_info *)(void *)itr;
1016 struct netif_queue *nq = &qset->nqs_driver_queues[k];
1017 nqi->nqi_qset_id = qset->nqs_id;
1018 nqi->nqi_queue_idx = k;
1019 if (KPKT_VALID_SVC(nq->nq_svc)) {
1020 nqi->nqi_svc = (packet_svc_class_t)nq->nq_svc;
1021 }
1022 if (nq->nq_flags & NETIF_QUEUE_IS_RX) {
1023 nqi->nqi_queue_flag = NQI_QUEUE_FLAG_IS_RX;
1024 }
1025
1026 struct netif_qstats *nq_out = &nqi->nqi_stats;
1027 struct netif_qstats *nq_src = &nq->nq_stats;
1028 memcpy(nq_out, nq_src, sizeof(struct netif_qstats));
1029
1030 itr += sizeof(struct netif_qstats_info);
1031 }
1032 j++;
1033 }
1034 ASSERT(j == qset_cnt);
1035 i++;
1036 }
1037 ASSERT(i == nif->nif_llink_cnt);
1038
1039 lck_rw_unlock_shared(&nif->nif_llink_lock);
1040 return actual_space;
1041 }
1042
1043 size_t
nx_netif_prov_nx_mib_get(struct kern_nexus * nx,struct nexus_mib_filter * filter,void * __sized_by (len)out,size_t len,struct proc * p)1044 nx_netif_prov_nx_mib_get(struct kern_nexus *nx, struct nexus_mib_filter *filter,
1045 void *__sized_by(len) out, size_t len, struct proc *p)
1046 {
1047 #pragma unused(p)
1048 size_t ret;
1049
1050 if ((filter->nmf_bitmap & NXMIB_FILTER_NX_UUID) &&
1051 (uuid_compare(filter->nmf_nx_uuid, nx->nx_uuid)) != 0) {
1052 return 0;
1053 }
1054
1055 switch (filter->nmf_type) {
1056 case NXMIB_NETIF_STATS:
1057 ret = __netif_mib_get_stats(nx, out, len);
1058 break;
1059 case NXMIB_LLINK_LIST:
1060 ret = __netif_mib_get_llinks(nx, out, len);
1061 break;
1062 case NXMIB_NETIF_QUEUE_STATS:
1063 ret = __netif_mib_get_queue_stats(nx, out, len);
1064 break;
1065 default:
1066 ret = 0;
1067 break;
1068 }
1069 return ret;
1070 }
1071
1072 static int
nx_netif_dom_bind_port(struct kern_nexus * nx,nexus_port_t * nx_port,struct nxbind * nxb,void * info)1073 nx_netif_dom_bind_port(struct kern_nexus *nx, nexus_port_t *nx_port,
1074 struct nxbind *nxb, void *info)
1075 {
1076 struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
1077 nexus_port_t first, last, port;
1078 int error;
1079
1080 ASSERT(nx_port != NULL);
1081 ASSERT(nxb != NULL);
1082
1083 port = *nx_port;
1084
1085 /*
1086 * If port is:
1087 * != NEXUS_PORT_ANY: attempt to bind to the specified port
1088 * == NEXUS_PORT_ANY: find an available port, bind to it, and
1089 * return back the assigned port.
1090 */
1091 first = NEXUS_PORT_NET_IF_CLIENT;
1092 ASSERT(NXDOM_MAX(NX_DOM(nx), ports) <= NEXUS_PORT_MAX);
1093 last = (nexus_port_size_t)NXDOM_MAX(NX_DOM(nx), ports);
1094 ASSERT(first <= last);
1095
1096 NETIF_WLOCK(nif);
1097
1098 if (__improbable(first == last)) {
1099 error = ENOMEM;
1100 } else if (port != NEXUS_PORT_ANY) {
1101 error = nx_port_bind_info(nx, port, nxb, info);
1102 SK_DF(SK_VERB_NETIF, "port %d, bind err %d", port, error);
1103 } else {
1104 error = nx_port_find(nx, first, last - 1, &port);
1105 ASSERT(error != 0 || (port >= first && port < last));
1106 if (error == 0) {
1107 error = nx_port_bind_info(nx, port, nxb, info);
1108 SK_DF(SK_VERB_NETIF, "found port %d, bind err %d",
1109 port, error);
1110 }
1111 }
1112 NETIF_WUNLOCK(nif);
1113
1114 ASSERT(*nx_port == NEXUS_PORT_ANY || *nx_port == port);
1115 if (error == 0) {
1116 *nx_port = port;
1117 }
1118
1119 SK_DF(error ? SK_VERB_ERROR : SK_VERB_NETIF,
1120 "+++ netif 0x%llx nx_port %d, total %u active %u (err %d)",
1121 SK_KVA(nif), (int)*nx_port, NX_NETIF_MAXPORTS,
1122 nx->nx_active_ports, error);
1123
1124 return error;
1125 }
1126
1127 static int
nx_netif_dom_unbind_port(struct kern_nexus * nx,nexus_port_t nx_port)1128 nx_netif_dom_unbind_port(struct kern_nexus *nx, nexus_port_t nx_port)
1129 {
1130 struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
1131 int error = 0;
1132
1133 ASSERT(nx_port != NEXUS_PORT_ANY);
1134
1135 NETIF_WLOCK(nif);
1136 error = nx_port_unbind(nx, nx_port);
1137 NETIF_WUNLOCK(nif);
1138
1139 return error;
1140 }
1141
1142 static int
nx_netif_dom_connect(struct kern_nexus_domain_provider * nxdom_prov,struct kern_nexus * nx,struct kern_channel * ch,struct chreq * chr,struct kern_channel * ch0,struct nxbind * nxb,struct proc * p)1143 nx_netif_dom_connect(struct kern_nexus_domain_provider *nxdom_prov,
1144 struct kern_nexus *nx, struct kern_channel *ch, struct chreq *chr,
1145 struct kern_channel *ch0, struct nxbind *nxb, struct proc *p)
1146 {
1147 #pragma unused(nxdom_prov)
1148 int err = 0;
1149
1150 SK_LOCK_ASSERT_HELD();
1151
1152 ASSERT(NX_DOM_PROV(nx) == nxdom_prov);
1153 ASSERT(nx->nx_prov->nxprov_params->nxp_type ==
1154 nxdom_prov->nxdom_prov_dom->nxdom_type &&
1155 nx->nx_prov->nxprov_params->nxp_type == NEXUS_TYPE_NET_IF);
1156 ASSERT(!(ch->ch_flags & CHANF_HOST));
1157
1158 switch (chr->cr_port) {
1159 case NEXUS_PORT_NET_IF_DEV:
1160 if (chr->cr_mode & CHMODE_HOST) {
1161 err = EINVAL;
1162 goto done;
1163 }
1164 break;
1165
1166 case NEXUS_PORT_NET_IF_HOST:
1167 if (!(chr->cr_mode & CHMODE_HOST)) {
1168 if (ch->ch_flags & CHANF_KERNEL) {
1169 err = EINVAL;
1170 goto done;
1171 }
1172 chr->cr_mode |= CHMODE_HOST;
1173 }
1174 /*
1175 * This channel is exclusively opened to the host
1176 * rings; don't notify the external provider.
1177 */
1178 os_atomic_or(&ch->ch_flags, CHANF_HOST | CHANF_EXT_SKIP, relaxed);
1179 break;
1180
1181 default:
1182 /*
1183 * This channel is shared between netif and user process;
1184 * don't notify the external provider.
1185 */
1186 os_atomic_or(&ch->ch_flags, CHANF_EXT_SKIP, relaxed);
1187 break;
1188 }
1189
1190 chr->cr_ring_set = RING_SET_DEFAULT;
1191 chr->cr_real_endpoint = chr->cr_endpoint = CH_ENDPOINT_NET_IF;
1192 (void) snprintf(chr->cr_name, sizeof(chr->cr_name), "netif:%llu:%.*s",
1193 nx->nx_id, (int)nx->nx_prov->nxprov_params->nxp_namelen,
1194 nx->nx_prov->nxprov_params->nxp_name);
1195
1196 if (ch->ch_flags & CHANF_KERNEL) {
1197 err = na_connect_spec(nx, ch, chr, p);
1198 } else {
1199 err = na_connect(nx, ch, chr, ch0, nxb, p);
1200 }
1201
1202 if (err == 0) {
1203 /*
1204 * Mark the kernel slot descriptor region as busy; this
1205 * prevents it from being torn-down at channel defunct
1206 * time, as the (external) nexus owner may be calling
1207 * KPIs that require accessing the slots.
1208 */
1209 skmem_arena_nexus_sd_set_noidle(
1210 skmem_arena_nexus(ch->ch_na->na_arena), 1);
1211 }
1212
1213 done:
1214 return err;
1215 }
1216
1217 static void
nx_netif_dom_disconnect(struct kern_nexus_domain_provider * nxdom_prov,struct kern_nexus * nx,struct kern_channel * ch)1218 nx_netif_dom_disconnect(struct kern_nexus_domain_provider *nxdom_prov,
1219 struct kern_nexus *nx, struct kern_channel *ch)
1220 {
1221 #pragma unused(nxdom_prov)
1222 SK_LOCK_ASSERT_HELD();
1223
1224 SK_D("channel 0x%llx -!- nexus 0x%llx (%s:\"%s\":%u:%d)", SK_KVA(ch),
1225 SK_KVA(nx), nxdom_prov->nxdom_prov_name, ch->ch_na->na_name,
1226 ch->ch_info->cinfo_nx_port, (int)ch->ch_info->cinfo_ch_ring_id);
1227
1228 /*
1229 * Release busy assertion held earlier in nx_netif_dom_connect();
1230 * this allows for the final arena teardown to succeed.
1231 */
1232 skmem_arena_nexus_sd_set_noidle(
1233 skmem_arena_nexus(ch->ch_na->na_arena), -1);
1234
1235 if (ch->ch_flags & CHANF_KERNEL) {
1236 na_disconnect_spec(nx, ch);
1237 } else {
1238 na_disconnect(nx, ch);
1239 }
1240 }
1241
1242 static void
nx_netif_dom_defunct(struct kern_nexus_domain_provider * nxdom_prov,struct kern_nexus * nx,struct kern_channel * ch,struct proc * p)1243 nx_netif_dom_defunct(struct kern_nexus_domain_provider *nxdom_prov,
1244 struct kern_nexus *nx, struct kern_channel *ch, struct proc *p)
1245 {
1246 #pragma unused(nxdom_prov, nx)
1247 LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
1248 ASSERT(!(ch->ch_flags & CHANF_KERNEL));
1249 ASSERT(ch->ch_na->na_type == NA_NETIF_DEV ||
1250 ch->ch_na->na_type == NA_NETIF_HOST ||
1251 ch->ch_na->na_type == NA_NETIF_COMPAT_DEV ||
1252 ch->ch_na->na_type == NA_NETIF_COMPAT_HOST ||
1253 ch->ch_na->na_type == NA_NETIF_VP);
1254
1255 na_ch_rings_defunct(ch, p);
1256 }
1257
1258 static void
nx_netif_dom_defunct_finalize(struct kern_nexus_domain_provider * nxdom_prov,struct kern_nexus * nx,struct kern_channel * ch,boolean_t locked)1259 nx_netif_dom_defunct_finalize(struct kern_nexus_domain_provider *nxdom_prov,
1260 struct kern_nexus *nx, struct kern_channel *ch, boolean_t locked)
1261 {
1262 #pragma unused(nxdom_prov)
1263 struct ifnet *ifp;
1264
1265 if (!locked) {
1266 SK_LOCK_ASSERT_NOTHELD();
1267 SK_LOCK();
1268 LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_NOTOWNED);
1269 } else {
1270 SK_LOCK_ASSERT_HELD();
1271 LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
1272 }
1273
1274 ASSERT(ch->ch_na->na_type == NA_NETIF_DEV ||
1275 ch->ch_na->na_type == NA_NETIF_HOST ||
1276 ch->ch_na->na_type == NA_NETIF_COMPAT_DEV ||
1277 ch->ch_na->na_type == NA_NETIF_COMPAT_HOST ||
1278 ch->ch_na->na_type == NA_NETIF_VP);
1279
1280 na_defunct(nx, ch, ch->ch_na, locked);
1281 ifp = ch->ch_na->na_ifp;
1282 if (ch->ch_na->na_type == NA_NETIF_VP && ifp != NULL &&
1283 ifnet_is_low_latency(ifp)) {
1284 /*
1285 * We release the VPNA's ifp here instead of waiting for the
1286 * application to close the channel to trigger the release.
1287 */
1288 DTRACE_SKYWALK2(release__vpna__ifp, struct nexus_adapter *,
1289 ch->ch_na, struct ifnet *, ifp);
1290 ifnet_decr_iorefcnt(ifp);
1291 ch->ch_na->na_ifp = NULL;
1292 }
1293 SK_D("%s(%d): ch 0x%llx -/- nx 0x%llx (%s:\"%s\":%u:%d)",
1294 ch->ch_name, ch->ch_pid, SK_KVA(ch), SK_KVA(nx),
1295 nxdom_prov->nxdom_prov_name, ch->ch_na->na_name,
1296 ch->ch_info->cinfo_nx_port, (int)ch->ch_info->cinfo_ch_ring_id);
1297
1298 if (!locked) {
1299 LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_NOTOWNED);
1300 SK_UNLOCK();
1301 } else {
1302 LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
1303 SK_LOCK_ASSERT_HELD();
1304 }
1305 }
1306
1307 struct nexus_netif_adapter *
na_netif_alloc(zalloc_flags_t how)1308 na_netif_alloc(zalloc_flags_t how)
1309 {
1310 _CASSERT(offsetof(struct nexus_netif_adapter, nifna_up) == 0);
1311
1312 return zalloc_flags(na_netif_zone, how | Z_ZERO);
1313 }
1314
1315 void
na_netif_free(struct nexus_adapter * na)1316 na_netif_free(struct nexus_adapter *na)
1317 {
1318 struct nexus_netif_adapter *nifna = (struct nexus_netif_adapter *)na;
1319
1320 SK_LOCK_ASSERT_HELD();
1321 SK_DF(SK_VERB_MEM, "nifna 0x%llx FREE", SK_KVA(nifna));
1322
1323 ASSERT(na->na_refcount == 0);
1324 ASSERT(nifna->nifna_tx_mit == NULL);
1325 ASSERT(nifna->nifna_rx_mit == NULL);
1326 bzero(nifna, sizeof(*nifna));
1327
1328 zfree(na_netif_zone, nifna);
1329 }
1330
1331 /* Process NXCFG_CMD_ATTACH */
1332 SK_NO_INLINE_ATTRIBUTE
1333 static int
nx_netif_ctl_attach(struct kern_nexus * nx,struct nx_spec_req * nsr,struct proc * p)1334 nx_netif_ctl_attach(struct kern_nexus *nx, struct nx_spec_req *nsr,
1335 struct proc *p)
1336 {
1337 struct nx_netif *n = NX_NETIF_PRIVATE(nx);
1338 struct ifnet *ifp = NULL;
1339 boolean_t compat;
1340 int err = 0;
1341
1342 SK_LOCK_ASSERT_HELD();
1343
1344 ASSERT(NX_DOM(nx)->nxdom_type == NEXUS_TYPE_NET_IF);
1345 compat = (strbufcmp(NX_DOM_PROV(nx)->nxdom_prov_name,
1346 NEXUS_PROVIDER_NET_IF_COMPAT) == 0);
1347
1348 uuid_clear(nsr->nsr_if_uuid);
1349 /*
1350 * The netif accepts either an interface name or a pointer to
1351 * an ifnet, but never a UUID.
1352 */
1353 if (nsr->nsr_flags & NXSPECREQ_UUID) {
1354 err = EINVAL;
1355 goto done;
1356 }
1357 if (nsr->nsr_flags & NXSPECREQ_IFP) {
1358 if (p != kernproc || (ifp = nsr->nsr_ifp) == NULL) {
1359 err = EINVAL;
1360 goto done;
1361 }
1362 } else if ((ifp = ifunit_ref(__unsafe_null_terminated_from_indexable(
1363 nsr->nsr_name))) == NULL) {
1364 err = ENXIO;
1365 goto done;
1366 }
1367
1368 if ((compat && SKYWALK_NATIVE(ifp)) ||
1369 (!compat && !SKYWALK_NATIVE(ifp))) {
1370 /* native driver for netif; non-native for netif_compat */
1371 err = ENODEV;
1372 } else if (ifp->if_na != NULL || !uuid_is_null(n->nif_uuid)) {
1373 err = EBUSY;
1374 } else {
1375 ASSERT(uuid_is_null(n->nif_uuid));
1376 /*
1377 * Upon success, callee will hold its own ifnet iorefcnt
1378 * as well as a retain count on the nexus adapter.
1379 */
1380 if (compat) {
1381 err = nx_netif_compat_attach(nx, ifp);
1382 } else {
1383 err = nx_netif_attach(nx, ifp);
1384 }
1385
1386 if (err == 0) {
1387 /* return the adapter UUID */
1388 uuid_generate_random(n->nif_uuid);
1389 uuid_copy(nsr->nsr_if_uuid, n->nif_uuid);
1390 #if (DEVELOPMENT || DEBUG)
1391 skoid_create(&n->nif_skoid,
1392 SKOID_SNODE(_kern_skywalk_netif), if_name(ifp),
1393 CTLFLAG_RW);
1394 #endif /* !DEVELOPMENT && !DEBUG */
1395 }
1396 }
1397 done:
1398 /* drop I/O refcnt from ifunit_ref() */
1399 if (ifp != NULL && !(nsr->nsr_flags & NXSPECREQ_IFP)) {
1400 ifnet_decr_iorefcnt(ifp);
1401 }
1402
1403 #if SK_LOG
1404 uuid_string_t uuidstr, ifuuidstr;
1405 const char *nustr;
1406 if (nsr->nsr_flags & NXSPECREQ_UUID) {
1407 nustr = sk_uuid_unparse(nsr->nsr_uuid, uuidstr);
1408 } else if (nsr->nsr_flags & NXSPECREQ_IFP) {
1409 (void) snprintf((char *)uuidstr, sizeof(uuidstr), "0x%llx",
1410 SK_KVA(nsr->nsr_ifp));
1411 nustr = uuidstr;
1412 } else {
1413 nustr = nsr->nsr_name;
1414 }
1415 SK_DF(err ? SK_VERB_ERROR : SK_VERB_NETIF,
1416 "nexus 0x%llx (%s) name/uuid \"%s\" if_uuid %s flags 0x%x err %d",
1417 SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name, nustr,
1418 sk_uuid_unparse(nsr->nsr_if_uuid, ifuuidstr), nsr->nsr_flags, err);
1419 #endif /* SK_LOG */
1420
1421 return err;
1422 }
1423
1424 SK_NO_INLINE_ATTRIBUTE
1425 static int
nx_netif_clean(struct nx_netif * nif,boolean_t quiesce_needed)1426 nx_netif_clean(struct nx_netif *nif, boolean_t quiesce_needed)
1427 {
1428 struct kern_nexus *nx = nif->nif_nx;
1429 struct ifnet *ifp;
1430 boolean_t suspended = FALSE;
1431
1432 ifp = nif->nif_ifp;
1433 if (ifp == NULL) {
1434 return EALREADY;
1435 }
1436 /*
1437 * For regular kernel-attached interfaces, quiescing is handled by
1438 * the ifnet detach thread, which calls dlil_quiesce_and_detach_nexuses().
1439 * For interfaces created by skywalk test cases, flowswitch/netif nexuses
1440 * are constructed on the fly and can also be torn down on the fly.
1441 * dlil_quiesce_and_detach_nexuses() won't help here because any nexus
1442 * can be detached while the interface is still attached.
1443 */
1444 if (quiesce_needed && ifnet_datamov_suspend_if_needed(ifp)) {
1445 SK_UNLOCK();
1446 suspended = TRUE;
1447 ifnet_datamov_drain(ifp);
1448 SK_LOCK();
1449 }
1450 nx_netif_callbacks_fini(nif);
1451 nx_netif_agent_fini(nif);
1452 nx_netif_capabilities_fini(nif);
1453 nx_netif_flow_fini(nif);
1454 nx_netif_filter_fini(nif);
1455 nx_netif_llink_fini(nif);
1456 nx_netif_flags_fini(nif);
1457
1458 uuid_clear(nif->nif_uuid);
1459 /* nx_netif_{compat_}attach() held both references */
1460 na_release_locked(nx_port_get_na(nx, NEXUS_PORT_NET_IF_DEV));
1461 na_release_locked(nx_port_get_na(nx, NEXUS_PORT_NET_IF_HOST));
1462 nx_port_free(nx, NEXUS_PORT_NET_IF_DEV);
1463 nx_port_free(nx, NEXUS_PORT_NET_IF_HOST);
1464
1465 ifp->if_na_ops = NULL;
1466 ifp->if_na = NULL;
1467 nif->nif_ifp = NULL;
1468 nif->nif_netif_nxadv = NULL;
1469 SKYWALK_CLEAR_CAPABLE(ifp);
1470 if (suspended) {
1471 ifnet_datamov_resume(ifp);
1472 }
1473
1474 #if (DEVELOPMENT || DEBUG)
1475 skoid_destroy(&nif->nif_skoid);
1476 #endif /* !DEVELOPMENT && !DEBUG */
1477 return 0;
1478 }
1479
1480 /* process NXCFG_CMD_DETACH */
1481 SK_NO_INLINE_ATTRIBUTE
1482 static int
nx_netif_ctl_detach(struct kern_nexus * nx,struct nx_spec_req * nsr)1483 nx_netif_ctl_detach(struct kern_nexus *nx, struct nx_spec_req *nsr)
1484 {
1485 struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
1486 int err = 0;
1487
1488 SK_LOCK_ASSERT_HELD();
1489
1490 /*
1491 * nsr is NULL when we're called from the destructor, and it
1492 * implies that we'll detach whatever that is attached.
1493 */
1494 if (nsr != NULL && uuid_is_null(nsr->nsr_if_uuid)) {
1495 err = EINVAL;
1496 } else if (nsr != NULL && uuid_compare(nsr->nsr_if_uuid,
1497 nif->nif_uuid) != 0) {
1498 err = ESRCH;
1499 } else if (nx_port_get_na(nx, NEXUS_PORT_NET_IF_DEV) == NULL) {
1500 /* nx_netif_ctl_attach() not yet done or already detached */
1501 err = ENXIO;
1502 } else if (nx->nx_ch_count != 0) {
1503 /*
1504 * There's at least a channel opened; we can't
1505 * yank the interface from underneath the nexus
1506 * since our dlil input/output handler may be
1507 * running now. Bail out and come back here
1508 * again when the nexus detaches.
1509 */
1510 err = EBUSY;
1511 } else {
1512 err = nx_netif_clean(nif, TRUE);
1513 }
1514
1515 #if SK_LOG
1516 if (nsr != NULL) {
1517 uuid_string_t ifuuidstr;
1518 SK_DF(err ? SK_VERB_ERROR : SK_VERB_NETIF,
1519 "nexus 0x%llx (%s) if_uuid %s flags 0x%x err %d",
1520 SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name,
1521 sk_uuid_unparse(nsr->nsr_if_uuid, ifuuidstr),
1522 nsr->nsr_flags, err);
1523 } else {
1524 SK_DF(err ? SK_VERB_ERROR : SK_VERB_NETIF,
1525 "nexus 0x%llx (%s) err %d", SK_KVA(nx),
1526 NX_DOM_PROV(nx)->nxdom_prov_name, err);
1527 }
1528 #endif /* SK_LOG */
1529
1530 return err;
1531 }
1532
1533 /*
1534 * XXX
1535 * These checks are copied from fsw.c
1536 * There are no tests exercising this code. Do we still need this?
1537 */
1538 SK_NO_INLINE_ATTRIBUTE
1539 static int
nx_netif_ctl_flow_check(struct nx_netif * nif,nxcfg_cmd_t cmd,struct proc * p,struct nx_flow_req * req)1540 nx_netif_ctl_flow_check(struct nx_netif *nif, nxcfg_cmd_t cmd,
1541 struct proc *p, struct nx_flow_req *req)
1542 {
1543 #pragma unused(nif)
1544 boolean_t need_check;
1545 int error;
1546
1547 if (uuid_is_null(req->nfr_flow_uuid)) {
1548 return EINVAL;
1549 }
1550 req->nfr_flags &= NXFLOWREQF_MASK;
1551 req->nfr_flowadv_idx = FLOWADV_IDX_NONE;
1552
1553 if (cmd == NXCFG_CMD_FLOW_DEL) {
1554 return 0;
1555 }
1556 need_check = FALSE;
1557 if (req->nfr_epid != -1 && proc_pid(p) != req->nfr_epid) {
1558 need_check = TRUE;
1559 } else if (!uuid_is_null(req->nfr_euuid)) {
1560 uuid_t uuid;
1561
1562 /* get the UUID of the issuing process */
1563 proc_getexecutableuuid(p, uuid, sizeof(uuid));
1564
1565 /*
1566 * If this is not issued by a process for its own
1567 * executable UUID and if the process does not have
1568 * the necessary privilege, reject the request.
1569 * The logic is similar to so_set_effective_uuid().
1570 */
1571 if (uuid_compare(req->nfr_euuid, uuid) != 0) {
1572 need_check = TRUE;
1573 }
1574 }
1575 if (need_check) {
1576 kauth_cred_t cred = kauth_cred_proc_ref(p);
1577 error = priv_check_cred(cred,
1578 PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0);
1579 kauth_cred_unref(&cred);
1580 if (error != 0) {
1581 return error;
1582 }
1583 }
1584 return 0;
1585 }
1586
1587 SK_NO_INLINE_ATTRIBUTE
1588 static int
nx_netif_ctl_flow_add(struct nx_netif * nif,struct proc * p,struct nx_flow_req * req)1589 nx_netif_ctl_flow_add(struct nx_netif *nif, struct proc *p,
1590 struct nx_flow_req *req)
1591 {
1592 int err;
1593
1594 ASSERT(p != PROC_NULL);
1595 err = nx_netif_ctl_flow_check(nif, NXCFG_CMD_FLOW_ADD, p, req);
1596 if (err != 0) {
1597 return err;
1598 }
1599
1600 /* init kernel only fields */
1601 nx_flow_req_internalize(req);
1602 req->nfr_context = NULL;
1603 req->nfr_flow_stats = NULL;
1604 req->nfr_port_reservation = NULL;
1605 req->nfr_pid = proc_pid(p);
1606
1607 err = nx_netif_netagent_flow_add(nif, req);
1608 nx_flow_req_externalize(req);
1609 return err;
1610 }
1611
1612 SK_NO_INLINE_ATTRIBUTE
1613 static int
nx_netif_ctl_flow_del(struct nx_netif * nif,struct proc * p,struct nx_flow_req * req)1614 nx_netif_ctl_flow_del(struct nx_netif *nif, struct proc *p,
1615 struct nx_flow_req *req)
1616 {
1617 int err;
1618
1619 err = nx_netif_ctl_flow_check(nif, NXCFG_CMD_FLOW_DEL, p, req);
1620 if (err != 0) {
1621 return err;
1622 }
1623
1624 nx_flow_req_internalize(req);
1625 req->nfr_pid = proc_pid(p);
1626
1627 err = nx_netif_netagent_flow_del(nif, req);
1628 nx_flow_req_externalize(req);
1629 return err;
1630 }
1631
1632 SK_NO_INLINE_ATTRIBUTE
1633 static int
nx_netif_ctl(struct kern_nexus * nx,nxcfg_cmd_t nc_cmd,void * data,struct proc * p)1634 nx_netif_ctl(struct kern_nexus *nx, nxcfg_cmd_t nc_cmd, void *data,
1635 struct proc *p)
1636 {
1637 struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
1638 struct nx_spec_req *__single nsr = data;
1639 struct nx_flow_req *__single nfr = data;
1640 int error = 0;
1641
1642 SK_LOCK_ASSERT_HELD();
1643
1644 switch (nc_cmd) {
1645 case NXCFG_CMD_ATTACH:
1646 error = nx_netif_ctl_attach(nx, nsr, p);
1647 break;
1648
1649 case NXCFG_CMD_DETACH:
1650 error = nx_netif_ctl_detach(nx, nsr);
1651 break;
1652
1653 case NXCFG_CMD_FLOW_ADD:
1654 error = nx_netif_ctl_flow_add(nif, p, nfr);
1655 break;
1656
1657 case NXCFG_CMD_FLOW_DEL:
1658 error = nx_netif_ctl_flow_del(nif, p, nfr);
1659 break;
1660
1661 default:
1662 SK_ERR("invalid cmd %u", nc_cmd);
1663 error = EINVAL;
1664 break;
1665 }
1666 return error;
1667 }
1668
1669 static void
nx_netif_llink_notify(struct kern_nexus * nx,struct netif_llink * llink,uint32_t flags)1670 nx_netif_llink_notify(struct kern_nexus *nx, struct netif_llink *llink,
1671 uint32_t flags)
1672 {
1673 #pragma unused(flags)
1674 struct netif_qset *qset;
1675
1676 SLIST_FOREACH(qset, &llink->nll_qset_list, nqs_list) {
1677 (void) nx_tx_qset_notify(nx, qset->nqs_ctx);
1678 }
1679 }
1680
1681 static void
nx_netif_llink_notify_all(struct kern_nexus * nx,uint32_t flags)1682 nx_netif_llink_notify_all(struct kern_nexus *nx, uint32_t flags)
1683 {
1684 struct nx_netif *nif;
1685 struct netif_llink *llink;
1686
1687 nif = NX_NETIF_PRIVATE(nx);
1688
1689 lck_rw_lock_shared(&nif->nif_llink_lock);
1690 STAILQ_FOREACH(llink, &nif->nif_llink_list, nll_link) {
1691 nx_netif_llink_notify(nx, llink, flags);
1692 }
1693 lck_rw_unlock_shared(&nif->nif_llink_lock);
1694 }
1695
1696 /*
1697 * if_start() callback for native Skywalk interfaces, registered
1698 * at ifnet_allocate_extended() time, and invoked by the ifnet
1699 * starter thread.
1700 */
1701 static void
nx_netif_doorbell_internal(struct ifnet * ifp,uint32_t flags)1702 nx_netif_doorbell_internal(struct ifnet *ifp, uint32_t flags)
1703 {
1704 if (__improbable(ifp->if_na == NULL)) {
1705 return;
1706 }
1707
1708 /*
1709 * Do this only if the nexus adapter is active, i.e. a channel
1710 * has been opened to it by the module above (flowswitch, etc.)
1711 */
1712 struct nexus_adapter *hwna = &NA(ifp)->nifna_up;
1713 if (__probable(NA_IS_ACTIVE(hwna))) {
1714 struct kern_nexus *nx = hwna->na_nx;
1715
1716 /* update our work timestamp */
1717 hwna->na_work_ts = _net_uptime;
1718
1719 if (NX_LLINK_PROV(nx)) {
1720 nx_netif_llink_notify_all(nx, flags);
1721 } else {
1722 struct __kern_channel_ring *kring;
1723
1724 /* for doorbell purposes, use TX ring 0 */
1725 kring = &hwna->na_tx_rings[0];
1726
1727 /* Issue a synchronous TX doorbell on the netif device ring */
1728 kring->ckr_na_sync(kring, PROC_NULL,
1729 (NA_SYNCF_NETIF_DOORBELL | NA_SYNCF_NETIF_IFSTART));
1730 }
1731 } else {
1732 struct netif_stats *nifs =
1733 &NX_NETIF_PRIVATE(hwna->na_nx)->nif_stats;
1734 STATS_INC(nifs, NETIF_STATS_DROP_NA_INACTIVE);
1735 }
1736 }
1737
1738 static void
nx_netif_doorbell(struct ifnet * ifp)1739 nx_netif_doorbell(struct ifnet *ifp)
1740 {
1741 nx_netif_doorbell_internal(ifp, NETIF_XMIT_FLAG_HOST);
1742 }
1743
1744 /*
1745 * TX sync callback, called from nx_netif_doorbell() where we'd expect to
1746 * perform synchronous TX doorbell to the driver, by invoking the driver's
1747 * doorbell callback directly in the same thread context. It is also called
1748 * when the layer above performs a TX sync operation, where we might need
1749 * to do an asynchronous doorbell instead, by simply calling ifnet_start().
1750 */
1751 static int
nx_netif_na_txsync(struct __kern_channel_ring * kring,struct proc * p,uint32_t flags)1752 nx_netif_na_txsync(struct __kern_channel_ring *kring, struct proc *p,
1753 uint32_t flags)
1754 {
1755 #pragma unused(p)
1756 struct ifnet *ifp = KRNA(kring)->na_ifp;
1757 boolean_t sync_only;
1758 int ret = 0;
1759
1760 ASSERT(ifp != NULL);
1761
1762 SK_DF(SK_VERB_NETIF | SK_VERB_SYNC | SK_VERB_TX,
1763 "%s(%d) kr \"%s\" (0x%llx) krflags 0x%b ring %u flags 0%x",
1764 sk_proc_name_address(p), sk_proc_pid(p), kring->ckr_name,
1765 SK_KVA(kring), kring->ckr_flags, CKRF_BITS, kring->ckr_ring_id,
1766 flags);
1767
1768 if (__improbable(!IF_FULLY_ATTACHED(ifp))) {
1769 SK_ERR("kr 0x%llx ifp %s (0x%llx), interface not attached",
1770 SK_KVA(kring), if_name(ifp), SK_KVA(ifp));
1771 return ENXIO;
1772 }
1773
1774 if (__improbable((ifp->if_start_flags & IFSF_FLOW_CONTROLLED) != 0)) {
1775 SK_DF(SK_VERB_SYNC | SK_VERB_TX, "kr 0x%llx ifp %s (0x%llx), "
1776 "flow control ON", SK_KVA(kring), if_name(ifp),
1777 SK_KVA(ifp));
1778 return ENXIO;
1779 }
1780
1781 /* update our work timestamp */
1782 KRNA(kring)->na_work_ts = _net_uptime;
1783
1784 sync_only = ((flags & NA_SYNCF_SYNC_ONLY) != 0) ||
1785 !KR_KERNEL_ONLY(kring);
1786 /* regular sync (reclaim) */
1787 if ((flags & NA_SYNCF_NETIF) != 0 || __improbable(sync_only)) {
1788 ret = nx_sync_tx(kring, (flags & NA_SYNCF_FORCE_RECLAIM) ||
1789 kring->ckr_pending_intr != 0);
1790 kring->ckr_pending_intr = 0;
1791
1792 /* direct user channels do not need to use the doorbell */
1793 if (__improbable(sync_only)) {
1794 return ret;
1795 }
1796 }
1797
1798 /*
1799 * Doorbell call. Here we do doorbell explicitly if the flag is
1800 * set or implicitly if we're opened directly by a user channel.
1801 * Synchronous vs. asynchronous depending on the context.
1802 */
1803 if (__probable((flags & NA_SYNCF_NETIF_DOORBELL) != 0)) {
1804 if ((flags & NA_SYNCF_NETIF_IFSTART) != 0) {
1805 ASSERT(!(flags & NA_SYNCF_NETIF_IFSTART) ||
1806 !(flags & NA_SYNCF_NETIF_ASYNC));
1807 nx_tx_doorbell(kring, (flags & NA_SYNCF_NETIF_ASYNC));
1808 } else {
1809 ifnet_start(ifp);
1810 }
1811 }
1812
1813 return ret;
1814 }
1815
1816 static int
nx_netif_na_rxsync(struct __kern_channel_ring * kring,struct proc * p,uint32_t flags)1817 nx_netif_na_rxsync(struct __kern_channel_ring *kring, struct proc *p,
1818 uint32_t flags)
1819 {
1820 #pragma unused(p)
1821 int ret;
1822
1823 SK_DF(SK_VERB_NETIF | SK_VERB_SYNC | SK_VERB_RX,
1824 "%s(%d) kr \"%s\" (0x%llx) krflags 0x%b ring %u flags 0%x",
1825 sk_proc_name_address(p), sk_proc_pid(p), kring->ckr_name,
1826 SK_KVA(kring), kring->ckr_flags, CKRF_BITS, kring->ckr_ring_id,
1827 flags);
1828
1829 ASSERT(kring->ckr_rhead <= kring->ckr_lim);
1830
1831 /* update our work timestamp */
1832 KRNA(kring)->na_work_ts = _net_uptime;
1833
1834 ret = nx_sync_rx(kring, (flags & NA_SYNCF_FORCE_READ) ||
1835 kring->ckr_pending_intr != 0);
1836 kring->ckr_pending_intr = 0;
1837
1838 return ret;
1839 }
1840
1841 static void
nx_netif_na_dtor(struct nexus_adapter * na)1842 nx_netif_na_dtor(struct nexus_adapter *na)
1843 {
1844 struct ifnet *__single ifp;
1845 struct nexus_netif_adapter *nifna = NIFNA(na);
1846
1847 SK_LOCK_ASSERT_HELD();
1848 ASSERT(na->na_type == NA_NETIF_DEV || na->na_type == NA_NETIF_HOST);
1849
1850 SK_DF(SK_VERB_NETIF, "na \"%s\" (0x%llx)", na->na_name, SK_KVA(na));
1851
1852 /*
1853 * If the finalizer callback hasn't been called for whatever
1854 * reasons, pick up the embryonic ifnet stored in na_private.
1855 * Otherwise, release the I/O refcnt of a non-NULL na_ifp.
1856 */
1857 if ((ifp = na->na_ifp) == NULL) {
1858 ifp = na->na_private;
1859 na->na_private = NULL;
1860 } else {
1861 ifnet_decr_iorefcnt(ifp);
1862 na->na_ifp = NULL;
1863 }
1864
1865 if (nifna->nifna_netif != NULL) {
1866 nx_netif_release(nifna->nifna_netif);
1867 nifna->nifna_netif = NULL;
1868 }
1869 ASSERT(SKYWALK_NATIVE(ifp));
1870 }
1871
1872 /*
1873 * Dispatch rx/tx interrupts to the channel rings.
1874 *
1875 * The 'notify' routine depends on what the ring is attached to.
1876 * - for a channel file descriptor, do an event wakeup on the individual
1877 * waitqueue, plus one on the global one if needed (see na_notify)
1878 * - for a device port connected to a FlowSwitch, call the proper
1879 * forwarding routine; see nx_fsw_tx_hwna_notify()
1880 * or nx_fsw_rx_hwna_notify().
1881 */
1882 int
nx_netif_common_intr(struct __kern_channel_ring * kring,struct proc * p,uint32_t flags,uint32_t * work_done)1883 nx_netif_common_intr(struct __kern_channel_ring *kring, struct proc *p,
1884 uint32_t flags, uint32_t *work_done)
1885 {
1886 struct netif_stats *nifs =
1887 &NX_NETIF_PRIVATE(KRNA(kring)->na_nx)->nif_stats;
1888 int (*notify)(struct __kern_channel_ring *kring,
1889 struct proc *, uint32_t flags);
1890 int ret;
1891
1892 KDBG((SK_KTRACE_NETIF_COMMON_INTR | DBG_FUNC_START), SK_KVA(kring));
1893
1894 SK_DF(SK_VERB_NETIF | SK_VERB_INTR |
1895 ((kring->ckr_tx == NR_RX) ? SK_VERB_RX : SK_VERB_TX),
1896 "na \"%s\" (0x%llx) kr \"%s\" (0x%llx) krflags 0x%b",
1897 KRNA(kring)->na_name, SK_KVA(KRNA(kring)), kring->ckr_name,
1898 SK_KVA(kring), kring->ckr_flags, CKRF_BITS);
1899
1900 /* update our work timestamp */
1901 KRNA(kring)->na_work_ts = _net_uptime;
1902
1903 kring->ckr_pending_intr++;
1904 if (work_done != NULL) {
1905 *work_done = 1; /* do not fire again */
1906 }
1907 /*
1908 * We can't be calling ckr_na_notify here since we could already be
1909 * intercepting it, else we'd end up recursively calling ourselves.
1910 * Use the original na_notify callback saved during na_activate, or in
1911 * the case when the module above us is the flowswitch, the notify
1912 * routine that it has installed in place of our original one.
1913 */
1914 if (__probable(!KR_DROP(kring) &&
1915 (notify = kring->ckr_netif_notify) != NULL)) {
1916 ret = notify(kring, p, flags);
1917 } else {
1918 /*
1919 * If the ring is in drop mode, pretend as if it's busy.
1920 * This allows the mitigation thread to pause for a while
1921 * before attempting again.
1922 */
1923 ret = EBUSY;
1924 }
1925 if (__improbable(ret != 0)) {
1926 switch (kring->ckr_tx) {
1927 case NR_RX:
1928 if (ret == EBUSY) {
1929 STATS_INC(nifs, NETIF_STATS_RX_IRQ_BUSY);
1930 } else if (ret == EAGAIN) {
1931 STATS_INC(nifs, NETIF_STATS_RX_IRQ_AGAIN);
1932 } else {
1933 STATS_INC(nifs, NETIF_STATS_RX_IRQ_ERR);
1934 }
1935 break;
1936
1937 case NR_TX:
1938 if (ret == EBUSY) {
1939 STATS_INC(nifs, NETIF_STATS_TX_IRQ_BUSY);
1940 } else if (ret == EAGAIN) {
1941 STATS_INC(nifs, NETIF_STATS_TX_IRQ_AGAIN);
1942 } else {
1943 STATS_INC(nifs, NETIF_STATS_TX_IRQ_ERR);
1944 }
1945 break;
1946
1947 default:
1948 break;
1949 }
1950 }
1951
1952 KDBG((SK_KTRACE_NETIF_COMMON_INTR | DBG_FUNC_END), SK_KVA(kring), ret);
1953
1954 return ret;
1955 }
1956
/*
 * TX notify interceptor installed on the device TX rings by
 * nx_netif_na_activate().  Hands the event to the TX mitigation
 * interrupt entry point without asking for a work_done report.
 */
static int
nx_netif_na_notify_tx(struct __kern_channel_ring *kring, struct proc *p,
    uint32_t flags)
{
	int ret;

	ret = nx_netif_mit_tx_intr(kring, p, flags, NULL);
	return ret;
}
1963
1964 static int
nx_netif_na_notify_rx(struct __kern_channel_ring * kring,struct proc * p,uint32_t flags)1965 nx_netif_na_notify_rx(struct __kern_channel_ring *kring, struct proc *p,
1966 uint32_t flags)
1967 {
1968 int ret;
1969
1970 /*
1971 * In the event the mitigation thread is disabled, protect
1972 * against recursion by detecting if we're already in the
1973 * context of an RX notify. IOSkywalkFamily may invoke the
1974 * notify callback as part of its RX sync callback.
1975 */
1976 if (__probable(!sk_is_rx_notify_protected())) {
1977 sk_protect_t protect;
1978 uint32_t work_done;
1979
1980 protect = sk_rx_notify_protect();
1981 ret = nx_netif_mit_rx_intr(kring, p, flags, &work_done);
1982 sk_sync_unprotect(protect);
1983 } else {
1984 ret = EAGAIN;
1985 }
1986
1987 return ret;
1988 }
1989
1990 static int
nx_netif_na_notify_rx_redirect(struct __kern_channel_ring * kring,struct proc * p,uint32_t flags)1991 nx_netif_na_notify_rx_redirect(struct __kern_channel_ring *kring, struct proc *p,
1992 uint32_t flags)
1993 {
1994 struct netif_stats *nifs =
1995 &NX_NETIF_PRIVATE(KRNA(kring)->na_nx)->nif_stats;
1996 uint32_t work_done;
1997
1998 ASSERT(kring->ckr_tx == NR_RX);
1999 STATS_INC(nifs, NETIF_STATS_RX_IRQ);
2000 return nx_netif_common_intr(kring, p, flags, &work_done);
2001 }
2002
2003 void
nx_netif_mit_config(struct nexus_netif_adapter * nifna,boolean_t * tx_mit,boolean_t * tx_mit_simple,boolean_t * rx_mit,boolean_t * rx_mit_simple)2004 nx_netif_mit_config(struct nexus_netif_adapter *nifna,
2005 boolean_t *tx_mit, boolean_t *tx_mit_simple,
2006 boolean_t *rx_mit, boolean_t *rx_mit_simple)
2007 {
2008 struct nx_netif *nif = nifna->nifna_netif;
2009
2010 /*
2011 * TX mitigation is disabled by default, but can be
2012 * overridden via "sk_netif_tx_mit=N" boot-arg, where
2013 * N is one of SK_NETIF_MIT_FORCE_* values.
2014 */
2015 *tx_mit = *tx_mit_simple = FALSE;
2016 switch (sk_netif_tx_mit) {
2017 case SK_NETIF_MIT_FORCE_SIMPLE:
2018 *tx_mit_simple = TRUE;
2019 OS_FALLTHROUGH;
2020 case SK_NETIF_MIT_FORCE_ADVANCED:
2021 *tx_mit = TRUE;
2022 break;
2023 case SK_NETIF_MIT_FORCE_OFF:
2024 case SK_NETIF_MIT_AUTO:
2025 ASSERT(*tx_mit == FALSE);
2026 break;
2027 default:
2028 VERIFY(0);
2029 /* NOTREACHED */
2030 __builtin_unreachable();
2031 }
2032
2033 /*
2034 * RX mitigation is enabled by default only for BSD-style
2035 * virtual network interfaces, but can be overridden
2036 * via "sk_netif_rx_mit=N" boot-arg, where N is one of
2037 * SK_NETIF_MIT_FORCE_* values.
2038 */
2039 *rx_mit = *rx_mit_simple = FALSE;
2040 switch (sk_netif_rx_mit) {
2041 case SK_NETIF_MIT_FORCE_OFF:
2042 ASSERT(*rx_mit == FALSE);
2043 break;
2044 case SK_NETIF_MIT_FORCE_SIMPLE:
2045 *rx_mit_simple = TRUE;
2046 OS_FALLTHROUGH;
2047 case SK_NETIF_MIT_FORCE_ADVANCED:
2048 *rx_mit = TRUE;
2049 break;
2050 case SK_NETIF_MIT_AUTO:
2051 *rx_mit_simple = TRUE;
2052 /*
2053 * Enable RX mitigation thread only for BSD-style virtual (and
2054 * regular) interfaces, since otherwise we may run out of stack
2055 * when subjected to IPsec processing, etc.
2056 */
2057 *rx_mit = (NX_PROV(nifna->nifna_up.na_nx)->nxprov_flags &
2058 NXPROVF_VIRTUAL_DEVICE) && !NETIF_IS_LOW_LATENCY(nif);
2059 break;
2060 default:
2061 VERIFY(0);
2062 /* NOTREACHED */
2063 __builtin_unreachable();
2064 }
2065 }
2066
2067 static int
nx_netif_na_activate(struct nexus_adapter * na,na_activate_mode_t mode)2068 nx_netif_na_activate(struct nexus_adapter *na, na_activate_mode_t mode)
2069 {
2070 struct nexus_netif_adapter *nifna = (struct nexus_netif_adapter *)na;
2071 boolean_t tx_mit, rx_mit, tx_mit_simple, rx_mit_simple;
2072 struct nx_netif *nif = nifna->nifna_netif;
2073 struct ifnet *ifp = na->na_ifp;
2074 int error = 0;
2075 uint32_t r;
2076 /* TODO -fbounds-safety: Remove tmp and use __counted_by_or_null */
2077 struct nx_netif_mit *mit_tmp;
2078 uint32_t nrings;
2079
2080 ASSERT(na->na_type == NA_NETIF_DEV);
2081 ASSERT(!(na->na_flags & NAF_HOST_ONLY));
2082
2083 SK_DF(SK_VERB_NETIF, "na \"%s\" (0x%llx) %s [%s]", na->na_name,
2084 SK_KVA(na), ifp->if_xname, na_activate_mode2str(mode));
2085
2086 switch (mode) {
2087 case NA_ACTIVATE_MODE_ON:
2088 ASSERT(SKYWALK_CAPABLE(ifp));
2089
2090 nx_netif_mit_config(nifna, &tx_mit, &tx_mit_simple,
2091 &rx_mit, &rx_mit_simple);
2092
2093 /*
2094 * Init the mitigation support on all the dev TX rings.
2095 */
2096 if (tx_mit) {
2097 nrings = na_get_nrings(na, NR_TX);
2098 mit_tmp = skn_alloc_type_array(tx_on, struct nx_netif_mit,
2099 nrings, Z_WAITOK, skmem_tag_netif_mit);
2100 if (mit_tmp == NULL) {
2101 SK_ERR("TX mitigation allocation failed");
2102 error = ENOMEM;
2103 goto out;
2104 }
2105 nifna->nifna_tx_mit = mit_tmp;
2106 nifna->nifna_tx_mit_count = nrings;
2107 } else {
2108 ASSERT(nifna->nifna_tx_mit == NULL);
2109 }
2110
2111 /*
2112 * Init the mitigation support on all the dev RX rings.
2113 */
2114 if (rx_mit) {
2115 nrings = na_get_nrings(na, NR_RX);
2116 mit_tmp = skn_alloc_type_array(rx_on, struct nx_netif_mit,
2117 nrings, Z_WAITOK, skmem_tag_netif_mit);
2118 if (mit_tmp == NULL) {
2119 SK_ERR("RX mitigation allocation failed");
2120 if (nifna->nifna_tx_mit != NULL) {
2121 skn_free_type_array_counted_by(rx_fail,
2122 struct nx_netif_mit,
2123 nifna->nifna_tx_mit_count,
2124 nifna->nifna_tx_mit);
2125 }
2126 error = ENOMEM;
2127 goto out;
2128 }
2129 nifna->nifna_rx_mit = mit_tmp;
2130 nifna->nifna_rx_mit_count = nrings;
2131 } else {
2132 ASSERT(nifna->nifna_rx_mit == NULL);
2133 }
2134
2135 /* intercept na_notify callback on the TX rings */
2136 for (r = 0; r < na_get_nrings(na, NR_TX); r++) {
2137 na->na_tx_rings[r].ckr_netif_notify =
2138 na->na_tx_rings[r].ckr_na_notify;
2139 na->na_tx_rings[r].ckr_na_notify =
2140 nx_netif_na_notify_tx;
2141 if (nifna->nifna_tx_mit != NULL) {
2142 nx_netif_mit_init(nif, ifp,
2143 &nifna->nifna_tx_mit[r],
2144 &na->na_tx_rings[r], tx_mit_simple);
2145 }
2146 }
2147
2148 /* intercept na_notify callback on the RX rings */
2149 for (r = 0; r < na_get_nrings(na, NR_RX); r++) {
2150 na->na_rx_rings[r].ckr_netif_notify =
2151 na->na_rx_rings[r].ckr_na_notify;
2152 na->na_rx_rings[r].ckr_na_notify = IFNET_IS_REDIRECT(ifp) ?
2153 nx_netif_na_notify_rx_redirect : nx_netif_na_notify_rx;
2154 if (nifna->nifna_rx_mit != NULL) {
2155 nx_netif_mit_init(nif, ifp,
2156 &nifna->nifna_rx_mit[r],
2157 &na->na_rx_rings[r], rx_mit_simple);
2158 }
2159 }
2160 nx_netif_filter_enable(nif);
2161 nx_netif_flow_enable(nif);
2162 os_atomic_or(&na->na_flags, NAF_ACTIVE, relaxed);
2163
2164 /* steer all start requests to netif; this must not fail */
2165 lck_mtx_lock(&ifp->if_start_lock);
2166 error = ifnet_set_start_handler(ifp, nx_netif_doorbell);
2167 VERIFY(error == 0);
2168 lck_mtx_unlock(&ifp->if_start_lock);
2169 break;
2170
2171 case NA_ACTIVATE_MODE_DEFUNCT:
2172 ASSERT(SKYWALK_CAPABLE(ifp));
2173 break;
2174
2175 case NA_ACTIVATE_MODE_OFF:
2176 /*
2177 * Note that here we cannot assert SKYWALK_CAPABLE()
2178 * as we're called in the destructor path.
2179 */
2180 os_atomic_andnot(&na->na_flags, NAF_ACTIVE, relaxed);
2181 nx_netif_flow_disable(nif);
2182 nx_netif_filter_disable(nif);
2183
2184 /*
2185 * Here we may block while holding sk_lock, but because
2186 * we've cleared NAF_ACTIVE above, kern_channel_tx_refill()
2187 * should immediately return. A better approach would be
2188 * to drop sk_lock and add a monitor for this routine.
2189 */
2190 lck_mtx_lock(&ifp->if_start_lock);
2191 while (ifp->if_start_active != 0) {
2192 ++ifp->if_start_waiters;
2193 (void) msleep(&ifp->if_start_waiters,
2194 &ifp->if_start_lock, (PZERO - 1),
2195 na->na_name, NULL);
2196 }
2197 /* steer all start requests to default handler */
2198 ifnet_reset_start_handler(ifp);
2199 lck_mtx_unlock(&ifp->if_start_lock);
2200
2201 /* reset all TX notify callbacks */
2202 for (r = 0; r < na_get_nrings(na, NR_TX); r++) {
2203 na->na_tx_rings[r].ckr_na_notify =
2204 na->na_tx_rings[r].ckr_netif_notify;
2205 na->na_tx_rings[r].ckr_netif_notify = NULL;
2206 if (nifna->nifna_tx_mit != NULL) {
2207 na->na_tx_rings[r].ckr_netif_mit_stats = NULL;
2208 nx_netif_mit_cleanup(&nifna->nifna_tx_mit[r]);
2209 }
2210 }
2211
2212 if (nifna->nifna_tx_mit != NULL) {
2213 skn_free_type_array_counted_by(tx_off, struct nx_netif_mit,
2214 nifna->nifna_tx_mit_count, nifna->nifna_tx_mit);
2215 }
2216
2217 /* reset all RX notify callbacks */
2218 for (r = 0; r < na_get_nrings(na, NR_RX); r++) {
2219 na->na_rx_rings[r].ckr_na_notify =
2220 na->na_rx_rings[r].ckr_netif_notify;
2221 na->na_rx_rings[r].ckr_netif_notify = NULL;
2222 if (nifna->nifna_rx_mit != NULL) {
2223 na->na_rx_rings[r].ckr_netif_mit_stats = NULL;
2224 nx_netif_mit_cleanup(&nifna->nifna_rx_mit[r]);
2225 }
2226 }
2227 if (nifna->nifna_rx_mit != NULL) {
2228 skn_free_type_array_counted_by(rx_off, struct nx_netif_mit,
2229 nifna->nifna_rx_mit_count, nifna->nifna_rx_mit);
2230 }
2231 break;
2232
2233 default:
2234 VERIFY(0);
2235 /* NOTREACHED */
2236 __builtin_unreachable();
2237 }
2238 out:
2239 return error;
2240 }
2241
2242 SK_NO_INLINE_ATTRIBUTE
2243 static int
nx_netif_attach(struct kern_nexus * nx,struct ifnet * ifp)2244 nx_netif_attach(struct kern_nexus *nx, struct ifnet *ifp)
2245 {
2246 struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
2247 struct nxprov_params *nxp = NX_PROV(nx)->nxprov_params;
2248 struct nexus_netif_adapter *devnifna = NULL;
2249 struct nexus_netif_adapter *hostnifna = NULL;
2250 struct nexus_adapter *__single devna = NULL;
2251 struct nexus_adapter *__single hostna = NULL;
2252 boolean_t embryonic = FALSE;
2253 int retval = 0;
2254 uint32_t na_flags;
2255
2256 SK_LOCK_ASSERT_HELD();
2257 ASSERT(SKYWALK_NATIVE(ifp));
2258 ASSERT(!SKYWALK_CAPABLE(ifp));
2259 ASSERT(ifp->if_na == NULL);
2260 ASSERT(ifp->if_na_ops == NULL);
2261
2262 devnifna = na_netif_alloc(Z_WAITOK);
2263 hostnifna = na_netif_alloc(Z_WAITOK);
2264
2265 /*
2266 * We can be called for two different interface states:
2267 *
2268 * Fully attached: get an io ref count; upon success, this
2269 * holds a reference to the ifnet for the ifp pointer stored
2270 * in 'na_ifp' down below for both adapters.
2271 *
2272 * Embryonic: temporary hold the ifnet in na_private, which
2273 * upon a successful ifnet_attach(), will be moved over to
2274 * the 'na_ifp' with an io ref count held.
2275 *
2276 * The ifnet in 'na_ifp' will be released by na_release_locked().
2277 */
2278 if (!ifnet_is_attached(ifp, 1)) {
2279 if (!(ifp->if_refflags & IFRF_EMBRYONIC)) {
2280 ifp = NULL;
2281 retval = ENXIO;
2282 goto err;
2283 }
2284 embryonic = TRUE;
2285 }
2286
2287 /* initialize the device netif adapter */
2288 devnifna->nifna_netif = nif;
2289 nx_netif_retain(nif);
2290 devna = &devnifna->nifna_up;
2291 devna->na_type = NA_NETIF_DEV;
2292 devna->na_free = na_netif_free;
2293 (void) strlcpy(devna->na_name, ifp->if_xname, sizeof(devna->na_name));
2294 uuid_generate_random(devna->na_uuid);
2295 if (embryonic) {
2296 /*
2297 * We will move this over to na_ifp once
2298 * the interface is fully attached.
2299 */
2300 devna->na_private = ifp;
2301 ASSERT(devna->na_ifp == NULL);
2302 } else {
2303 ASSERT(devna->na_private == NULL);
2304 /* use I/O refcnt from ifnet_is_attached() */
2305 devna->na_ifp = ifp;
2306 }
2307 devna->na_activate = nx_netif_na_activate;
2308 devna->na_txsync = nx_netif_na_txsync;
2309 devna->na_rxsync = nx_netif_na_rxsync;
2310 devna->na_dtor = nx_netif_na_dtor;
2311 devna->na_krings_create = nx_netif_dev_krings_create;
2312 devna->na_krings_delete = nx_netif_dev_krings_delete;
2313 devna->na_special = nx_netif_na_special;
2314
2315 na_flags = NAF_NATIVE;
2316 if (NX_PROV(nx)->nxprov_flags & NXPROVF_VIRTUAL_DEVICE) {
2317 na_flags |= NAF_VIRTUAL_DEVICE;
2318 }
2319 if (NX_LLINK_PROV(nx)) {
2320 /*
2321 * while operating in logical link mode, we don't need to
2322 * create backing memory regions for the rings as they are
2323 * not used.
2324 */
2325 na_flags |= NAF_MEM_NO_INIT;
2326 }
2327 os_atomic_or(&devna->na_flags, na_flags, relaxed);
2328 *(nexus_stats_type_t *)(uintptr_t)&devna->na_stats_type =
2329 NEXUS_STATS_TYPE_INVALID;
2330
2331 na_set_nrings(devna, NR_TX, nxp->nxp_tx_rings);
2332 na_set_nrings(devna, NR_RX, nxp->nxp_rx_rings);
2333 na_set_nslots(devna, NR_TX, nxp->nxp_tx_slots);
2334 na_set_nslots(devna, NR_RX, nxp->nxp_rx_slots);
2335 /*
2336 * Verify upper bounds; the parameters must have already been
2337 * validated by nxdom_prov_params() by the time we get here.
2338 */
2339 ASSERT(na_get_nrings(devna, NR_TX) <= NX_DOM(nx)->nxdom_tx_rings.nb_max);
2340 ASSERT(na_get_nrings(devna, NR_RX) <= NX_DOM(nx)->nxdom_rx_rings.nb_max);
2341 ASSERT(na_get_nslots(devna, NR_TX) <= NX_DOM(nx)->nxdom_tx_slots.nb_max);
2342 ASSERT(na_get_nslots(devna, NR_RX) <= NX_DOM(nx)->nxdom_rx_slots.nb_max);
2343
2344 na_attach_common(devna, nx, &nx_netif_prov_s);
2345
2346 if ((retval = NX_DOM_PROV(nx)->nxdom_prov_mem_new(NX_DOM_PROV(nx),
2347 nx, devna)) != 0) {
2348 ASSERT(devna->na_arena == NULL);
2349 goto err;
2350 }
2351 ASSERT(devna->na_arena != NULL);
2352
2353 *(uint32_t *)(uintptr_t)&devna->na_flowadv_max = nxp->nxp_flowadv_max;
2354 ASSERT(devna->na_flowadv_max == 0 ||
2355 skmem_arena_nexus(devna->na_arena)->arn_flowadv_obj != NULL);
2356
2357 /* setup packet copy routines */
2358 if (skmem_arena_nexus(devna->na_arena)->arn_rx_pp->pp_max_frags > 1) {
2359 nif->nif_pkt_copy_from_mbuf = pkt_copy_multi_buflet_from_mbuf;
2360 nif->nif_pkt_copy_to_mbuf = pkt_copy_multi_buflet_to_mbuf;
2361 nif->nif_pkt_copy_from_pkt = pkt_copy_multi_buflet_from_pkt;
2362 } else {
2363 nif->nif_pkt_copy_from_mbuf = pkt_copy_from_mbuf;
2364 nif->nif_pkt_copy_to_mbuf = pkt_copy_to_mbuf;
2365 nif->nif_pkt_copy_from_pkt = pkt_copy_from_pkt;
2366 }
2367
2368 /* initialize the host netif adapter */
2369 hostnifna->nifna_netif = nif;
2370 nx_netif_retain(nif);
2371 hostna = &hostnifna->nifna_up;
2372 (void) snprintf(hostna->na_name, sizeof(hostna->na_name),
2373 "%s^", devna->na_name);
2374 uuid_generate_random(hostna->na_uuid);
2375 if (embryonic) {
2376 /*
2377 * We will move this over to na_ifp once
2378 * the interface is fully attached.
2379 */
2380 hostna->na_private = ifp;
2381 ASSERT(hostna->na_ifp == NULL);
2382 } else {
2383 ASSERT(hostna->na_private == NULL);
2384 hostna->na_ifp = devna->na_ifp;
2385 ifnet_incr_iorefcnt(hostna->na_ifp);
2386 }
2387 hostna->na_type = NA_NETIF_HOST;
2388 hostna->na_free = na_netif_free;
2389 hostna->na_activate = nx_netif_host_na_activate;
2390 hostna->na_txsync = nx_netif_host_na_txsync;
2391 hostna->na_rxsync = nx_netif_host_na_rxsync;
2392 hostna->na_dtor = nx_netif_na_dtor;
2393 hostna->na_krings_create = nx_netif_host_krings_create;
2394 hostna->na_krings_delete = nx_netif_host_krings_delete;
2395 hostna->na_special = nx_netif_host_na_special;
2396
2397 na_flags = NAF_HOST_ONLY | NAF_NATIVE;
2398 if (NX_LLINK_PROV(nx)) {
2399 /*
2400 * while operating in logical link mode, we don't need to
2401 * create backing memory regions for the rings as they are
2402 * not used.
2403 */
2404 na_flags |= NAF_MEM_NO_INIT;
2405 }
2406 os_atomic_or(&hostna->na_flags, na_flags, relaxed);
2407 *(nexus_stats_type_t *)(uintptr_t)&hostna->na_stats_type =
2408 NEXUS_STATS_TYPE_INVALID;
2409
2410 na_set_nrings(hostna, NR_TX, 1);
2411 na_set_nrings(hostna, NR_RX, 1);
2412 na_set_nslots(hostna, NR_TX, nxp->nxp_tx_slots);
2413 na_set_nslots(hostna, NR_RX, nxp->nxp_rx_slots);
2414
2415 na_attach_common(hostna, nx, &nx_netif_prov_s);
2416
2417 if ((retval = NX_DOM_PROV(nx)->nxdom_prov_mem_new(NX_DOM_PROV(nx),
2418 nx, hostna)) != 0) {
2419 ASSERT(hostna->na_arena == NULL);
2420 goto err;
2421 }
2422 ASSERT(hostna->na_arena != NULL);
2423
2424 *(uint32_t *)(uintptr_t)&hostna->na_flowadv_max = nxp->nxp_flowadv_max;
2425 ASSERT(hostna->na_flowadv_max == 0 ||
2426 skmem_arena_nexus(hostna->na_arena)->arn_flowadv_obj != NULL);
2427
2428 /* adjust the classq packet drop limit */
2429 if (embryonic) {
2430 uint32_t drop_lim;
2431 struct kern_pbufpool_memory_info pp_info;
2432
2433 retval = kern_pbufpool_get_memory_info(nx->nx_tx_pp, &pp_info);
2434 VERIFY(retval == 0);
2435
2436 /* set the drop limit as 75% of size of packet pool */
2437 drop_lim = (pp_info.kpm_packets * 3) / 4;
2438 VERIFY(drop_lim != 0);
2439 IFCQ_PKT_DROP_LIMIT(ifp->if_snd) = drop_lim;
2440 }
2441
2442 /* these will be undone by destructor */
2443 ifp->if_na_ops = &na_netif_ops;
2444 ifp->if_na = devnifna;
2445 na_retain_locked(devna);
2446 na_retain_locked(hostna);
2447
2448 SKYWALK_SET_CAPABLE(ifp);
2449
2450 NETIF_WLOCK(nif);
2451 nif->nif_ifp = ifp;
2452 nif->nif_netif_nxadv = nx->nx_adv.netif_nxv_adv;
2453 retval = nx_port_alloc(nx, NEXUS_PORT_NET_IF_DEV, NULL, &devna,
2454 kernproc);
2455 ASSERT(retval == 0);
2456 retval = nx_port_alloc(nx, NEXUS_PORT_NET_IF_HOST, NULL, &hostna,
2457 kernproc);
2458 ASSERT(retval == 0);
2459 NETIF_WUNLOCK(nif);
2460
2461 #if SK_LOG
2462 uuid_string_t uuidstr;
2463 SK_DF(SK_VERB_NETIF, "devna: \"%s\"", devna->na_name);
2464 SK_DF(SK_VERB_NETIF, " UUID: %s",
2465 sk_uuid_unparse(devna->na_uuid, uuidstr));
2466 SK_DF(SK_VERB_NETIF, " nx: 0x%llx (\"%s\":\"%s\")",
2467 SK_KVA(devna->na_nx), NX_DOM(devna->na_nx)->nxdom_name,
2468 NX_DOM_PROV(devna->na_nx)->nxdom_prov_name);
2469 SK_DF(SK_VERB_NETIF, " flags: 0x%b", devna->na_flags, NAF_BITS);
2470 SK_DF(SK_VERB_NETIF, " flowadv_max: %u", devna->na_flowadv_max);
2471 SK_DF(SK_VERB_NETIF, " rings: tx %u rx %u",
2472 na_get_nrings(devna, NR_TX), na_get_nrings(devna, NR_RX));
2473 SK_DF(SK_VERB_NETIF, " slots: tx %u rx %u",
2474 na_get_nslots(devna, NR_TX), na_get_nslots(devna, NR_RX));
2475 #if CONFIG_NEXUS_USER_PIPE
2476 SK_DF(SK_VERB_NETIF, " next_pipe: %u", devna->na_next_pipe);
2477 SK_DF(SK_VERB_NETIF, " max_pipes: %u", devna->na_max_pipes);
2478 #endif /* CONFIG_NEXUS_USER_PIPE */
2479 SK_DF(SK_VERB_NETIF, " ifp: 0x%llx %s [ioref %u]",
2480 SK_KVA(ifp), ifp->if_xname, ifp->if_refio);
2481 SK_DF(SK_VERB_NETIF, "hostna: \"%s\"", hostna->na_name);
2482 SK_DF(SK_VERB_NETIF, " UUID: %s",
2483 sk_uuid_unparse(hostna->na_uuid, uuidstr));
2484 SK_DF(SK_VERB_NETIF, " nx: 0x%llx (\"%s\":\"%s\")",
2485 SK_KVA(hostna->na_nx), NX_DOM(hostna->na_nx)->nxdom_name,
2486 NX_DOM_PROV(hostna->na_nx)->nxdom_prov_name);
2487 SK_DF(SK_VERB_NETIF, " flags: 0x%b",
2488 hostna->na_flags, NAF_BITS);
2489 SK_DF(SK_VERB_NETIF, " flowadv_max: %u", hostna->na_flowadv_max);
2490 SK_DF(SK_VERB_NETIF, " rings: tx %u rx %u",
2491 na_get_nrings(hostna, NR_TX), na_get_nrings(hostna, NR_RX));
2492 SK_DF(SK_VERB_NETIF, " slots: tx %u rx %u",
2493 na_get_nslots(hostna, NR_TX), na_get_nslots(hostna, NR_RX));
2494 #if CONFIG_NEXUS_USER_PIPE
2495 SK_DF(SK_VERB_NETIF, " next_pipe: %u", hostna->na_next_pipe);
2496 SK_DF(SK_VERB_NETIF, " max_pipes: %u", hostna->na_max_pipes);
2497 #endif /* CONFIG_NEXUS_USER_PIPE */
2498 SK_DF(SK_VERB_NETIF, " ifp: 0x%llx %s [ioref %u]",
2499 SK_KVA(ifp), ifp->if_xname, ifp->if_refio);
2500 #endif /* SK_LOG */
2501
2502 err:
2503 if (retval != 0) {
2504 if (ifp != NULL) {
2505 if (!embryonic) {
2506 ifnet_decr_iorefcnt(ifp);
2507 }
2508 ifp = NULL;
2509 }
2510 if (devna != NULL) {
2511 if (devna->na_arena != NULL) {
2512 skmem_arena_release(devna->na_arena);
2513 devna->na_arena = NULL;
2514 }
2515 if (devna->na_ifp != NULL) {
2516 ifnet_decr_iorefcnt(devna->na_ifp);
2517 devna->na_ifp = NULL;
2518 }
2519 devna->na_private = NULL;
2520 }
2521 if (hostna != NULL) {
2522 if (hostna->na_arena != NULL) {
2523 skmem_arena_release(hostna->na_arena);
2524 hostna->na_arena = NULL;
2525 }
2526 if (hostna->na_ifp != NULL) {
2527 ifnet_decr_iorefcnt(hostna->na_ifp);
2528 hostna->na_ifp = NULL;
2529 }
2530 hostna->na_private = NULL;
2531 }
2532 if (devnifna != NULL) {
2533 if (devnifna->nifna_netif != NULL) {
2534 nx_netif_release(devnifna->nifna_netif);
2535 devnifna->nifna_netif = NULL;
2536 }
2537 na_netif_free((struct nexus_adapter *)devnifna);
2538 }
2539 if (hostnifna != NULL) {
2540 if (hostnifna->nifna_netif != NULL) {
2541 nx_netif_release(hostnifna->nifna_netif);
2542 hostnifna->nifna_netif = NULL;
2543 }
2544 na_netif_free((struct nexus_adapter *)hostnifna);
2545 }
2546 }
2547 return retval;
2548 }
2549
2550 /*
2551 * Any per-netif state that can be discovered at attach time should be
2552 * initialized here.
2553 */
2554 static void
nx_netif_flags_init(struct nx_netif * nif)2555 nx_netif_flags_init(struct nx_netif *nif)
2556 {
2557 ifnet_t ifp = nif->nif_ifp;
2558 struct kern_nexus *nx = nif->nif_nx;
2559 struct nexus_adapter *devna = nx_port_get_na(nx, NEXUS_PORT_NET_IF_DEV);
2560
2561 switch (devna->na_type) {
2562 case NA_NETIF_DEV:
2563 if (strlcmp(sk_ll_prefix, ifp->if_name, sizeof(sk_ll_prefix)) == 0) {
2564 nif->nif_flags |= NETIF_FLAG_LOW_LATENCY;
2565 if_set_xflags(ifp, IFXF_LOW_LATENCY);
2566 }
2567 break;
2568 case NA_NETIF_COMPAT_DEV:
2569 nif->nif_flags |= NETIF_FLAG_COMPAT;
2570 break;
2571 default:
2572 break;
2573 }
2574 }
2575
2576 /*
2577 * This is also supposed to check for any inconsistent state at detach time.
2578 */
2579 static void
nx_netif_flags_fini(struct nx_netif * nif)2580 nx_netif_flags_fini(struct nx_netif *nif)
2581 {
2582 ifnet_t ifp = nif->nif_ifp;
2583
2584 if (ifp != NULL) {
2585 if_clear_xflags(ifp, IFXF_LOW_LATENCY);
2586 }
2587 nif->nif_flags = 0;
2588 }
2589
2590 SK_NO_INLINE_ATTRIBUTE
2591 static void
nx_netif_callbacks_init(struct nx_netif * nif)2592 nx_netif_callbacks_init(struct nx_netif *nif)
2593 {
2594 ifnet_t ifp = nif->nif_ifp;
2595
2596 /*
2597 * XXX
2598 * This function is meant to be called by na_netif_finalize(), which is
2599 * called by ifnet_attach() while holding if_lock exclusively.
2600 */
2601 ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_EXCLUSIVE);
2602 if (ifnet_is_low_latency(ifp)) {
2603 ifnet_set_detach_notify_locked(ifp,
2604 nx_netif_llw_detach_notify, ifp->if_na);
2605 }
2606 }
2607
2608 SK_NO_INLINE_ATTRIBUTE
2609 static void
nx_netif_callbacks_fini(struct nx_netif * nif)2610 nx_netif_callbacks_fini(struct nx_netif *nif)
2611 {
2612 ifnet_t ifp = nif->nif_ifp;
2613
2614 if (ifnet_is_low_latency(ifp)) {
2615 ifnet_set_detach_notify(ifp, NULL, NULL);
2616 }
2617 }
2618
2619 static void
configure_capab_interface_advisory(struct nx_netif * nif,nxprov_capab_config_fn_t capab_fn)2620 configure_capab_interface_advisory(struct nx_netif *nif,
2621 nxprov_capab_config_fn_t capab_fn)
2622 {
2623 struct kern_nexus_capab_interface_advisory capab;
2624 struct kern_nexus *nx = nif->nif_nx;
2625 uint32_t capab_len;
2626 int error;
2627
2628 /* check/configure interface advisory notifications */
2629 if ((nif->nif_ifp->if_eflags & IFEF_ADV_REPORT) == 0) {
2630 return;
2631 }
2632 bzero(&capab, sizeof(capab));
2633 capab.kncia_version =
2634 KERN_NEXUS_CAPAB_INTERFACE_ADVISORY_VERSION_1;
2635 *__DECONST(kern_nexus_capab_interface_advisory_notify_fn_t *,
2636 &(capab.kncia_notify)) = nx_netif_interface_advisory_notify;
2637 *__DECONST(void **, &(capab.kncia_kern_context)) = nx;
2638 capab_len = sizeof(capab);
2639 error = capab_fn(NX_PROV(nx), nx,
2640 KERN_NEXUS_CAPAB_INTERFACE_ADVISORY, &capab, &capab_len);
2641 if (error != 0) {
2642 DTRACE_SKYWALK2(interface__advisory__capab__error,
2643 struct nx_netif *, nif, int, error);
2644 return;
2645 }
2646 VERIFY(capab.kncia_config != NULL);
2647 VERIFY(capab.kncia_provider_context != NULL);
2648 nif->nif_intf_adv_config = capab.kncia_config;
2649 nif->nif_intf_adv_prov_ctx = capab.kncia_provider_context;
2650 nif->nif_extended_capabilities |= NETIF_CAPAB_INTERFACE_ADVISORY;
2651 }
2652
2653 static void
unconfigure_capab_interface_advisory(struct nx_netif * nif)2654 unconfigure_capab_interface_advisory(struct nx_netif *nif)
2655 {
2656 if ((nif->nif_extended_capabilities & NETIF_CAPAB_INTERFACE_ADVISORY) == 0) {
2657 return;
2658 }
2659 nif->nif_intf_adv_config = NULL;
2660 nif->nif_intf_adv_prov_ctx = NULL;
2661 nif->nif_extended_capabilities &= ~NETIF_CAPAB_INTERFACE_ADVISORY;
2662 }
2663
2664 static void
configure_capab_qset_extensions(struct nx_netif * nif,nxprov_capab_config_fn_t capab_fn)2665 configure_capab_qset_extensions(struct nx_netif *nif,
2666 nxprov_capab_config_fn_t capab_fn)
2667 {
2668 struct kern_nexus_capab_qset_extensions capab;
2669 struct kern_nexus *nx = nif->nif_nx;
2670 uint32_t capab_len;
2671 int error;
2672
2673 if (!NX_LLINK_PROV(nx)) {
2674 DTRACE_SKYWALK1(not__llink__prov, struct nx_netif *, nif);
2675 return;
2676 }
2677 bzero(&capab, sizeof(capab));
2678 capab.cqe_version = KERN_NEXUS_CAPAB_QSET_EXTENSIONS_VERSION_1;
2679 capab_len = sizeof(capab);
2680 error = capab_fn(NX_PROV(nx), nx,
2681 KERN_NEXUS_CAPAB_QSET_EXTENSIONS, &capab, &capab_len);
2682 if (error != 0) {
2683 DTRACE_SKYWALK2(qset__extensions__capab__error,
2684 struct nx_netif *, nif, int, error);
2685 return;
2686 }
2687 VERIFY(capab.cqe_notify_steering_info != NULL);
2688 VERIFY(capab.cqe_prov_ctx != NULL);
2689 nif->nif_qset_extensions.qe_notify_steering_info =
2690 capab.cqe_notify_steering_info;
2691 nif->nif_qset_extensions.qe_prov_ctx = capab.cqe_prov_ctx;
2692 nif->nif_extended_capabilities |= NETIF_CAPAB_QSET_EXTENSIONS;
2693 }
2694
2695 static void
unconfigure_capab_qset_extensions(struct nx_netif * nif)2696 unconfigure_capab_qset_extensions(struct nx_netif *nif)
2697 {
2698 if ((nif->nif_extended_capabilities & NETIF_CAPAB_QSET_EXTENSIONS) == 0) {
2699 return;
2700 }
2701 bzero(&nif->nif_qset_extensions, sizeof(nif->nif_qset_extensions));
2702 nif->nif_extended_capabilities &= ~NETIF_CAPAB_QSET_EXTENSIONS;
2703 }
2704
2705 int
nx_netif_notify_steering_info(struct nx_netif * nif,struct netif_qset * qset,struct ifnet_traffic_descriptor_common * td,bool add)2706 nx_netif_notify_steering_info(struct nx_netif *nif, struct netif_qset *qset,
2707 struct ifnet_traffic_descriptor_common *td, bool add)
2708 {
2709 struct netif_qset_extensions *qset_ext;
2710 int err;
2711
2712 if ((nif->nif_extended_capabilities & NETIF_CAPAB_QSET_EXTENSIONS) == 0) {
2713 return ENOTSUP;
2714 }
2715 qset_ext = &nif->nif_qset_extensions;
2716 VERIFY(qset_ext->qe_prov_ctx != NULL);
2717 VERIFY(qset_ext->qe_notify_steering_info != NULL);
2718 err = qset_ext->qe_notify_steering_info(qset_ext->qe_prov_ctx,
2719 qset->nqs_ctx, td, add);
2720 return err;
2721 }
2722
2723 static void
nx_netif_capabilities_init(struct nx_netif * nif)2724 nx_netif_capabilities_init(struct nx_netif *nif)
2725 {
2726 struct kern_nexus *nx = nif->nif_nx;
2727 nxprov_capab_config_fn_t capab_fn;
2728
2729 if ((NX_PROV(nx)->nxprov_netif_ext.nxnpi_version) ==
2730 KERN_NEXUS_PROVIDER_VERSION_NETIF) {
2731 capab_fn = NX_PROV(nx)->nxprov_netif_ext.nxnpi_config_capab;
2732 ASSERT(capab_fn != NULL);
2733 } else {
2734 capab_fn = NX_PROV(nx)->nxprov_ext.nxpi_config_capab;
2735 }
2736 if (capab_fn == NULL) {
2737 return;
2738 }
2739 configure_capab_interface_advisory(nif, capab_fn);
2740 configure_capab_qset_extensions(nif, capab_fn);
2741 }
2742
/*
 * Counterpart of nx_netif_capabilities_init(): clears the interface
 * advisory state first, then the qset extensions state.
 */
static void
nx_netif_capabilities_fini(struct nx_netif *nif)
{
	/* interface advisory */
	unconfigure_capab_interface_advisory(nif);

	/* qset extensions */
	unconfigure_capab_qset_extensions(nif);
}
2749
2750 static void
nx_netif_verify_tso_config(struct nx_netif * nif)2751 nx_netif_verify_tso_config(struct nx_netif *nif)
2752 {
2753 ifnet_t ifp = nif->nif_ifp;
2754 uint32_t tso_v4_mtu = 0;
2755 uint32_t tso_v6_mtu = 0;
2756
2757 /*
2758 * compat interfaces always use 128-byte buffers on the device packet
2759 * pool side (for holding headers for classification) so no need to check
2760 * the size here.
2761 */
2762 if (!SKYWALK_NATIVE(ifp)) {
2763 return;
2764 }
2765
2766 if ((ifp->if_hwassist & IFNET_TSO_IPV4) != 0) {
2767 tso_v4_mtu = ifp->if_tso_v4_mtu;
2768 }
2769 if ((ifp->if_hwassist & IFNET_TSO_IPV6) != 0) {
2770 tso_v6_mtu = ifp->if_tso_v6_mtu;
2771 }
2772 VERIFY(PP_BUF_SIZE_DEF(nif->nif_nx->nx_tx_pp) >=
2773 max(tso_v4_mtu, tso_v6_mtu));
2774 }
2775
2776 void
na_netif_finalize(struct nexus_netif_adapter * nifna,struct ifnet * ifp)2777 na_netif_finalize(struct nexus_netif_adapter *nifna, struct ifnet *ifp)
2778 {
2779 struct nx_netif *nif = nifna->nifna_netif;
2780 struct kern_nexus *nx = nif->nif_nx;
2781 struct nexus_adapter *devna = nx_port_get_na(nx, NEXUS_PORT_NET_IF_DEV);
2782 struct nexus_adapter *hostna = nx_port_get_na(nx,
2783 NEXUS_PORT_NET_IF_HOST);
2784
2785 ASSERT(devna != NULL);
2786 ASSERT(hostna != NULL);
2787
2788 if (!ifnet_is_attached(ifp, 1)) {
2789 VERIFY(0);
2790 /* NOTREACHED */
2791 __builtin_unreachable();
2792 }
2793
2794 ASSERT(devna->na_private == ifp);
2795 ASSERT(devna->na_ifp == NULL);
2796 /* use I/O refcnt held by ifnet_is_attached() above */
2797 devna->na_ifp = devna->na_private;
2798 devna->na_private = NULL;
2799
2800 ASSERT(hostna->na_private == ifp);
2801 ASSERT(hostna->na_ifp == NULL);
2802 hostna->na_ifp = hostna->na_private;
2803 hostna->na_private = NULL;
2804 ifnet_incr_iorefcnt(hostna->na_ifp);
2805
2806 nx_netif_flags_init(nif);
2807 nx_netif_llink_init(nif);
2808 nx_netif_filter_init(nif);
2809 nx_netif_flow_init(nif);
2810 nx_netif_capabilities_init(nif);
2811 nx_netif_agent_init(nif);
2812 (void) nxctl_inet_traffic_rule_get_count(ifp->if_xname,
2813 &ifp->if_traffic_rule_count);
2814 nx_netif_verify_tso_config(nif);
2815 nx_netif_callbacks_init(nif);
2816 }
2817
2818 void
nx_netif_reap(struct nexus_netif_adapter * nifna,struct ifnet * ifp,uint32_t thres,boolean_t low)2819 nx_netif_reap(struct nexus_netif_adapter *nifna, struct ifnet *ifp,
2820 uint32_t thres, boolean_t low)
2821 {
2822 #pragma unused(ifp)
2823 struct nx_netif *nif = nifna->nifna_netif;
2824 struct kern_nexus *nx = nif->nif_nx;
2825 struct nexus_adapter *devna = nx_port_get_na(nx, NEXUS_PORT_NET_IF_DEV);
2826 uint64_t now = _net_uptime;
2827 boolean_t purge;
2828
2829 ASSERT(thres != 0);
2830
2831 if (devna->na_work_ts == 0) {
2832 return;
2833 }
2834
2835 /*
2836 * Purge if it's has been inactive for some time (twice the drain
2837 * threshold), and clear the work timestamp to temporarily skip this
2838 * adapter until it's active again. Purging cached objects can be
2839 * expensive since we'd need to allocate and construct them again,
2840 * so we do it only when necessary.
2841 */
2842 if (low || (now - devna->na_work_ts) >= (thres << 1)) {
2843 devna->na_work_ts = 0;
2844 purge = TRUE;
2845 } else {
2846 purge = FALSE;
2847 }
2848
2849 SK_DF(SK_VERB_NETIF, "%s: %s na %s", ifp->if_xname,
2850 (purge ? "purging" : "pruning"), devna->na_name);
2851
2852 /*
2853 * Device and host adapters share the same packet buffer pool,
2854 * so just reap the arena belonging to the device instance.
2855 */
2856 skmem_arena_reap(devna->na_arena, purge);
2857 }
2858
2859 /*
2860 * The purpose of this callback is to forceably remove resources held by VPNAs
2861 * in event of an interface detach. Without this callback an application can
2862 * prevent the detach from completing indefinitely. Note that this is only needed
2863 * for low latency VPNAs. Userspace do get notified about interface detach events
2864 * for other NA types (custom ether and filter) and will do the necessary cleanup.
2865 * The cleanup is done in two phases:
2866 * 1) VPNAs channels are defuncted. This releases the resources held by VPNAs and
2867 * causes the device channel to be closed. All ifnet references held by VPNAs
2868 * are also released.
2869 * 2) This cleans up the netif nexus and releases the two remaining ifnet
2870 * references held by the device and host ports (nx_netif_clean()).
2871 */
2872 void
nx_netif_llw_detach_notify(void * arg)2873 nx_netif_llw_detach_notify(void *arg)
2874 {
2875 struct nexus_netif_adapter *__single nifna = arg;
2876 struct nx_netif *nif = nifna->nifna_netif;
2877 struct kern_nexus *nx = nif->nif_nx;
2878 struct kern_channel **ch_list = NULL;
2879 struct kern_channel *ch;
2880 int err, i, all_ch_cnt = 0, vp_ch_cnt = 0;
2881 struct proc *p;
2882
2883 ASSERT(NETIF_IS_LOW_LATENCY(nif));
2884 /*
2885 * kern_channel_defunct() requires sk_lock to be not held. We
2886 * will first find the list of channels we want to defunct and
2887 * then call kern_channel_defunct() on each of them. The number
2888 * of channels cannot increase after sk_lock is released since
2889 * this interface is being detached.
2890 */
2891 SK_LOCK();
2892 all_ch_cnt = nx->nx_ch_count;
2893 if (all_ch_cnt == 0) {
2894 DTRACE_SKYWALK1(no__channel, struct kern_nexus *, nx);
2895 SK_UNLOCK();
2896 return;
2897 }
2898 ch_list = sk_alloc_type_array(struct kern_channel *, all_ch_cnt,
2899 Z_WAITOK | Z_NOFAIL, skmem_tag_netif_temp);
2900
2901 STAILQ_FOREACH(ch, &nx->nx_ch_head, ch_link) {
2902 struct nexus_adapter *na = ch->ch_na;
2903
2904 if (na != NULL && na->na_type == NA_NETIF_VP) {
2905 ASSERT(vp_ch_cnt < all_ch_cnt);
2906
2907 /* retain channel to prevent it from being freed */
2908 ch_retain_locked(ch);
2909 ch_list[vp_ch_cnt] = ch;
2910 DTRACE_SKYWALK3(vp__ch__found, struct kern_nexus *, nx,
2911 struct kern_channel *, ch, struct nexus_adapter *, na);
2912 vp_ch_cnt++;
2913 }
2914 }
2915 if (vp_ch_cnt == 0) {
2916 DTRACE_SKYWALK1(vp__ch__not__found, struct kern_nexus *, nx);
2917 sk_free_type_array(struct kern_channel *, all_ch_cnt, ch_list);
2918 SK_UNLOCK();
2919 return;
2920 }
2921 /* prevents the netif from being freed */
2922 nx_netif_retain(nif);
2923 SK_UNLOCK();
2924
2925 for (i = 0; i < vp_ch_cnt; i++) {
2926 ch = ch_list[i];
2927 p = proc_find(ch->ch_pid);
2928 if (p == NULL) {
2929 SK_ERR("ch 0x%llx pid %d not found", SK_KVA(ch), ch->ch_pid);
2930 DTRACE_SKYWALK3(ch__pid__not__found, struct kern_nexus *, nx,
2931 struct kern_channel *, ch, pid_t, ch->ch_pid);
2932 ch_release(ch);
2933 continue;
2934 }
2935 /*
2936 * It is possible for the channel to be closed before defunct gets
2937 * called. We need to get the fd lock here to ensure that the check
2938 * for the closed state and the calling of channel defunct are done
2939 * atomically.
2940 */
2941 proc_fdlock(p);
2942 if ((ch->ch_flags & CHANF_ATTACHED) != 0) {
2943 kern_channel_defunct(p, ch);
2944 }
2945 proc_fdunlock(p);
2946 proc_rele(p);
2947 ch_release(ch);
2948 }
2949 sk_free_type_array(struct kern_channel *, all_ch_cnt, ch_list);
2950
2951 SK_LOCK();
2952 /*
2953 * Quiescing is not needed because:
2954 * The defuncting above ensures that no more tx syncs could enter.
2955 * The driver layer ensures that ifnet_detach() (this path) does not get
2956 * called until RX upcalls have returned.
2957 *
2958 * Before sk_lock is reacquired above, userspace could close its channels
2959 * and cause the nexus's destructor to be called. This is fine because we
2960 * have retained the nif so it can't disappear.
2961 */
2962 err = nx_netif_clean(nif, FALSE);
2963 if (err != 0) {
2964 SK_ERR("netif clean failed: err %d", err);
2965 DTRACE_SKYWALK2(nif__clean__failed, struct nx_netif *, nif, int, err);
2966 }
2967 nx_netif_release(nif);
2968 SK_UNLOCK();
2969 }
2970
2971 void
nx_netif_copy_stats(struct nexus_netif_adapter * nifna,struct if_netif_stats * if_ns)2972 nx_netif_copy_stats(struct nexus_netif_adapter *nifna,
2973 struct if_netif_stats *if_ns)
2974 {
2975 struct nx_netif_mit *mit;
2976 struct mit_cfg_tbl *mit_cfg;
2977
2978 if ((mit = nifna->nifna_rx_mit) == NULL) {
2979 return;
2980 }
2981
2982 if ((mit->mit_flags & NETIF_MITF_INITIALIZED) == 0) {
2983 return;
2984 }
2985
2986 if_ns->ifn_rx_mit_interval = mit->mit_interval;
2987 if_ns->ifn_rx_mit_mode = mit->mit_mode;
2988 if_ns->ifn_rx_mit_packets_avg = mit->mit_packets_avg;
2989 if_ns->ifn_rx_mit_packets_min = mit->mit_packets_min;
2990 if_ns->ifn_rx_mit_packets_max = mit->mit_packets_max;
2991 if_ns->ifn_rx_mit_bytes_avg = mit->mit_bytes_avg;
2992 if_ns->ifn_rx_mit_bytes_min = mit->mit_bytes_min;
2993 if_ns->ifn_rx_mit_bytes_max = mit->mit_bytes_max;
2994 if_ns->ifn_rx_mit_cfg_idx = mit->mit_cfg_idx;
2995
2996 VERIFY(if_ns->ifn_rx_mit_cfg_idx < mit->mit_cfg_idx_max);
2997 mit_cfg = &mit->mit_tbl[if_ns->ifn_rx_mit_cfg_idx];
2998 if_ns->ifn_rx_mit_cfg_packets_lowat = mit_cfg->cfg_plowat;
2999 if_ns->ifn_rx_mit_cfg_packets_hiwat = mit_cfg->cfg_phiwat;
3000 if_ns->ifn_rx_mit_cfg_bytes_lowat = mit_cfg->cfg_blowat;
3001 if_ns->ifn_rx_mit_cfg_bytes_hiwat = mit_cfg->cfg_bhiwat;
3002 if_ns->ifn_rx_mit_cfg_interval = mit_cfg->cfg_ival;
3003 }
3004
3005 int
nx_netif_na_special(struct nexus_adapter * na,struct kern_channel * ch,struct chreq * chr,nxspec_cmd_t spec_cmd)3006 nx_netif_na_special(struct nexus_adapter *na, struct kern_channel *ch,
3007 struct chreq *chr, nxspec_cmd_t spec_cmd)
3008 {
3009 ASSERT(na->na_type == NA_NETIF_DEV ||
3010 na->na_type == NA_NETIF_COMPAT_DEV);
3011 return nx_netif_na_special_common(na, ch, chr, spec_cmd);
3012 }
3013
3014 int
nx_netif_na_special_common(struct nexus_adapter * na,struct kern_channel * ch,struct chreq * chr,nxspec_cmd_t spec_cmd)3015 nx_netif_na_special_common(struct nexus_adapter *na, struct kern_channel *ch,
3016 struct chreq *chr, nxspec_cmd_t spec_cmd)
3017 {
3018 int error = 0;
3019
3020 ASSERT(na->na_type == NA_NETIF_DEV || na->na_type == NA_NETIF_HOST ||
3021 na->na_type == NA_NETIF_COMPAT_DEV ||
3022 na->na_type == NA_NETIF_COMPAT_HOST);
3023 SK_LOCK_ASSERT_HELD();
3024
3025 switch (spec_cmd) {
3026 case NXSPEC_CMD_CONNECT:
3027 /*
3028 * netif adapter isn't created exclusively for kernel.
3029 * We mark (and clear) NAF_KERNEL_ONLY flag upon a succesful
3030 * na_special() connect and disconnect.
3031 */
3032 if (NA_KERNEL_ONLY(na)) {
3033 error = EBUSY;
3034 goto done;
3035 }
3036 ASSERT(!(na->na_flags & NAF_SPEC_INIT));
3037
3038 os_atomic_or(&na->na_flags, NAF_KERNEL_ONLY, relaxed);
3039 error = na_bind_channel(na, ch, chr);
3040 if (error != 0) {
3041 os_atomic_andnot(&na->na_flags, NAF_KERNEL_ONLY, relaxed);
3042 goto done;
3043 }
3044 os_atomic_or(&na->na_flags, NAF_SPEC_INIT, relaxed);
3045 break;
3046
3047 case NXSPEC_CMD_DISCONNECT:
3048 ASSERT(NA_KERNEL_ONLY(na));
3049 ASSERT(na->na_channels > 0);
3050 ASSERT(na->na_flags & NAF_SPEC_INIT);
3051 na_unbind_channel(ch);
3052 os_atomic_andnot(&na->na_flags, (NAF_SPEC_INIT | NAF_KERNEL_ONLY), relaxed);
3053 break;
3054
3055 case NXSPEC_CMD_START:
3056 na_kr_drop(na, FALSE);
3057 break;
3058
3059 case NXSPEC_CMD_STOP:
3060 na_kr_drop(na, TRUE);
3061 LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_NOTOWNED);
3062 lck_mtx_lock(&ch->ch_lock);
3063 nxprov_advise_disconnect(na->na_nx, ch);
3064 lck_mtx_unlock(&ch->ch_lock);
3065 break;
3066
3067 default:
3068 error = EINVAL;
3069 break;
3070 }
3071
3072 done:
3073 SK_DF(error ? SK_VERB_ERROR : SK_VERB_NETIF,
3074 "ch 0x%llx from na \"%s\" (0x%llx) naflags %b nx 0x%llx "
3075 "spec_cmd %u (err %d)", SK_KVA(ch), na->na_name, SK_KVA(na),
3076 na->na_flags, NAF_BITS, SK_KVA(ch->ch_nexus), spec_cmd, error);
3077
3078 return error;
3079 }
3080
3081 /*
3082 * Get a skywalk netif adapter for the port.
3083 */
3084 int
nx_netif_na_find(struct kern_nexus * nx,struct kern_channel * ch,struct chreq * chr,struct nxbind * nxb,struct proc * p,struct nexus_adapter ** nap,boolean_t create)3085 nx_netif_na_find(struct kern_nexus *nx, struct kern_channel *ch,
3086 struct chreq *chr, struct nxbind *nxb, struct proc *p,
3087 struct nexus_adapter **nap, boolean_t create)
3088 {
3089 #pragma unused(ch)
3090 struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
3091 boolean_t anon = NX_ANONYMOUS_PROV(nx);
3092 ch_endpoint_t ep = chr->cr_endpoint;
3093 nexus_port_t nx_port = chr->cr_port;
3094 struct nexus_adapter *__single na = NULL;
3095 struct ifnet *ifp;
3096 int err = 0;
3097
3098 SK_LOCK_ASSERT_HELD();
3099 *nap = NULL; /* default */
3100
3101 #if SK_LOG
3102 uuid_string_t uuidstr;
3103 SK_D("name \"%s\" spec_uuid \"%s\" port %d mode 0x%b pipe_id %u "
3104 "ring_id %d ring_set %u ep_type %u:%u create %u%s",
3105 chr->cr_name, sk_uuid_unparse(chr->cr_spec_uuid, uuidstr),
3106 (int)chr->cr_port, chr->cr_mode, CHMODE_BITS,
3107 chr->cr_pipe_id, (int)chr->cr_ring_id, chr->cr_ring_set,
3108 chr->cr_real_endpoint, chr->cr_endpoint, create,
3109 (ep != CH_ENDPOINT_NET_IF) ? " (skipped)" : "");
3110 #endif /* SK_LOG */
3111
3112 if (!create || ep != CH_ENDPOINT_NET_IF) {
3113 err = ENODEV;
3114 goto done;
3115 }
3116
3117 ASSERT(NX_DOM(nx)->nxdom_type == NEXUS_TYPE_NET_IF);
3118 if (nx_port_get_na(nx, NEXUS_PORT_NET_IF_DEV) == NULL) {
3119 err = ENXIO;
3120 goto done;
3121 }
3122 ifp = nif->nif_ifp;
3123 if (!(SKYWALK_CAPABLE(ifp))) {
3124 SK_ERR("interface %s is no longer usable", if_name(ifp));
3125 err = ENOTSUP;
3126 goto done;
3127 }
3128
3129 if (chr->cr_mode & CHMODE_LOW_LATENCY) {
3130 SK_ERR("low latency is not supported for netif channel");
3131 err = ENOTSUP;
3132 goto done;
3133 }
3134
3135 switch (nx_port) {
3136 case NEXUS_PORT_NET_IF_DEV:
3137 /*
3138 * We have to reject direct user open that's not explicitly
3139 * allowed because netif nexuses do not by default have
3140 * user memory regions.
3141 */
3142 if (p != kernproc &&
3143 (!skywalk_netif_direct_allowed(ifp->if_xname) ||
3144 (kauth_cred_issuser(kauth_cred_get()) == 0 &&
3145 (anon || nif->nif_dev_nxb == NULL || nxb == NULL ||
3146 !nxb_is_equal(nif->nif_dev_nxb, nxb))))) {
3147 DTRACE_SKYWALK2(direct__not__allowed, struct ifnet *,
3148 ifp, struct chreq *, chr);
3149 err = ENOTSUP;
3150 goto done;
3151 }
3152 if (chr->cr_mode & CHMODE_EVENT_RING) {
3153 SK_ERR("event ring is not supported for netif dev port channel");
3154 err = ENOTSUP;
3155 goto done;
3156 }
3157 na = nx_port_get_na(nx, NEXUS_PORT_NET_IF_DEV);
3158 break;
3159
3160 case NEXUS_PORT_NET_IF_HOST:
3161 if (p != kernproc) {
3162 err = ENOTSUP;
3163 goto done;
3164 }
3165 if (chr->cr_mode & CHMODE_EVENT_RING) {
3166 SK_ERR("event ring is not supported for netif host port channel");
3167 err = ENOTSUP;
3168 goto done;
3169 }
3170 na = nx_port_get_na(nx, NEXUS_PORT_NET_IF_HOST);
3171 break;
3172
3173 default:
3174 ASSERT(!(chr->cr_mode & CHMODE_CONFIG));
3175
3176 NETIF_WLOCK(nif);
3177 err = nx_port_alloc(nx, nx_port, nxb, &na, p);
3178 if (err != 0) {
3179 NETIF_WUNLOCK(nif);
3180 goto done;
3181 }
3182
3183 if (na == NULL) {
3184 if (chr->cr_mode & CHMODE_FILTER) {
3185 err = netif_filter_na_create(nx, chr, &na);
3186 } else {
3187 err = netif_vp_na_create(nx, chr, &na);
3188 }
3189 if (err != 0) {
3190 NETIF_WUNLOCK(nif);
3191 goto done;
3192 }
3193 err = nx_port_alloc(nx, nx_port, nxb, &na, p);
3194 if (err != 0) {
3195 NETIF_WUNLOCK(nif);
3196 goto done;
3197 }
3198 }
3199 NETIF_WUNLOCK(nif);
3200
3201 break;
3202 }
3203
3204 ASSERT(err == 0);
3205 ASSERT(na != NULL);
3206
3207 #if CONFIG_NEXUS_USER_PIPE
3208 if (NA_OWNED_BY_ANY(na) || na->na_next_pipe > 0) {
3209 #else /* !CONFIG_NEXUS_USER_PIPE */
3210 if (NA_OWNED_BY_ANY(na)) {
3211 #endif /* !CONFIG_NEXUS_USER_PIPE */
3212 err = EBUSY;
3213 na = NULL;
3214 goto done;
3215 }
3216
3217 *nap = na;
3218 na_retain_locked(na);
3219
3220 done:
3221 ASSERT(err != 0 || na != NULL);
3222 if (err) {
3223 SK_ERR("na not found, err(%d)", err);
3224 } else {
3225 SK_DF(SK_VERB_NETIF, "found na 0x%llu", na);
3226 }
3227 return err;
3228 }
3229
3230 /* na_krings_create callback for all netif device adapters */
3231 int
3232 nx_netif_dev_krings_create(struct nexus_adapter *na, struct kern_channel *ch)
3233 {
3234 int ret;
3235
3236 ASSERT(na->na_type == NA_NETIF_DEV ||
3237 na->na_type == NA_NETIF_COMPAT_DEV);
3238 /*
3239 * Allocate context structures for native netif only, for
3240 * IOSkywalkFamily to store its object references.
3241 */
3242 ret = na_rings_mem_setup(na, (na->na_flags & NAF_NATIVE), ch);
3243
3244 /*
3245 * We mark CKRF_DROP for kernel-only rings (kernel channel
3246 * opened by the flowswitch, etc.) to prevent packets from
3247 * going thru until after the client of the kernel channel
3248 * has fully plumbed things on its side. For userland-facing
3249 * rings (regular channel opened to netif), this is not
3250 * required, and so don't mark CKRF_DROP there.
3251 */
3252 if (ret == 0 && NA_KERNEL_ONLY(na)) {
3253 na_kr_drop(na, TRUE);
3254 }
3255
3256 return ret;
3257 }
3258
3259 /* call with SK_LOCK held */
3260 void
3261 nx_netif_dev_krings_delete(struct nexus_adapter *na, struct kern_channel *ch,
3262 boolean_t defunct)
3263 {
3264 ASSERT(na->na_type == NA_NETIF_DEV ||
3265 na->na_type == NA_NETIF_COMPAT_DEV);
3266
3267 /* see comments in nx_netif_dev_krings_create() */
3268 if (NA_KERNEL_ONLY(na)) {
3269 na_kr_drop(na, TRUE);
3270 }
3271
3272 na_rings_mem_teardown(na, ch, defunct);
3273 }
3274
3275 struct nx_netif *
3276 nx_netif_alloc(zalloc_flags_t how)
3277 {
3278 struct nx_netif *n;
3279
3280 SK_LOCK_ASSERT_HELD();
3281
3282 n = zalloc_flags(nx_netif_zone, how | Z_ZERO);
3283 if (n == NULL) {
3284 return NULL;
3285 }
3286
3287 NETIF_RWINIT(n);
3288 os_ref_init(&n->nif_refcnt, NULL);
3289 SK_DF(SK_VERB_MEM, "netif 0x%llx", SK_KVA(n));
3290
3291 return n;
3292 }
3293
3294 static void
3295 nx_netif_destroy(struct nx_netif *n)
3296 {
3297 ASSERT(n->nif_dev_nxb == NULL);
3298 ASSERT(n->nif_host_nxb == NULL);
3299 ASSERT(os_ref_get_count(&n->nif_refcnt) == 0);
3300 nx_netif_llink_config_free(n);
3301 SK_DF(SK_VERB_MEM, "netif 0x%llx", SK_KVA(n));
3302 NETIF_RWDESTROY(n);
3303 zfree(nx_netif_zone, n);
3304 }
3305
3306 void
3307 nx_netif_release(struct nx_netif *n)
3308 {
3309 SK_LOCK_ASSERT_HELD();
3310
3311 SK_DF(SK_VERB_MEM, "netif 0x%llx, refcnt %d", SK_KVA(n),
3312 os_ref_get_count(&n->nif_refcnt));
3313 if (os_ref_release(&n->nif_refcnt) == 0) {
3314 nx_netif_destroy(n);
3315 }
3316 }
3317
3318 void
3319 nx_netif_retain(struct nx_netif *n)
3320 {
3321 SK_LOCK_ASSERT_HELD();
3322
3323 /* retaining an object with a zero refcount is not allowed */
3324 ASSERT(os_ref_get_count(&n->nif_refcnt) >= 1);
3325 os_ref_retain(&n->nif_refcnt);
3326 SK_DF(SK_VERB_MEM, "netif 0x%llx, refcnt %d", SK_KVA(n),
3327 os_ref_get_count(&n->nif_refcnt));
3328 }
3329
/*
 * Release a reference on `n'; this is just nx_netif_release() under
 * another name.
 */
void
nx_netif_free(struct nx_netif *n)
{
	nx_netif_release(n);
}
3335
3336 static int
3337 nx_netif_interface_advisory_report(struct kern_nexus *nx,
3338 const struct ifnet_interface_advisory *advisory)
3339 {
3340 struct kern_nexus *notify_nx;
3341 struct __kern_netif_intf_advisory *intf_adv;
3342 struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
3343 ifnet_t difp = nif->nif_ifp;
3344 ifnet_t __single parent = NULL;
3345
3346 /* If we are a delegate, notify the parent instead */
3347 if (ifnet_get_delegate_parent(difp, &parent) == 0) {
3348 nif = parent->if_na->nifna_netif;
3349 }
3350 if (nif->nif_fsw_nxadv != NULL) {
3351 ASSERT(nif->nif_fsw != NULL);
3352 intf_adv = &nif->nif_fsw_nxadv->_nxadv_intf_adv;
3353 notify_nx = nif->nif_fsw->fsw_nx;
3354 } else {
3355 intf_adv = &nif->nif_netif_nxadv->__kern_intf_adv;
3356 notify_nx = nif->nif_nx;
3357 }
3358 /*
3359 * copy the advisory report in shared memory
3360 */
3361 intf_adv->cksum = os_cpu_copy_in_cksum(advisory, &intf_adv->adv,
3362 sizeof(*advisory), 0);
3363 STATS_INC(&nif->nif_stats, NETIF_STATS_IF_ADV_UPD_RECV);
3364 /*
3365 * notify user channels on advisory report availability
3366 */
3367 nx_interface_advisory_notify(notify_nx);
3368 if (parent != NULL) {
3369 ifnet_release_delegate_parent(difp);
3370 }
3371 return 0;
3372 }
3373
3374 static errno_t
3375 nx_netif_interface_advisory_notify(void *kern_ctx,
3376 const struct ifnet_interface_advisory *advisory)
3377 {
3378 _CASSERT(offsetof(struct ifnet_interface_advisory, version) ==
3379 offsetof(struct ifnet_interface_advisory, header.version));
3380 _CASSERT(offsetof(struct ifnet_interface_advisory, direction) ==
3381 offsetof(struct ifnet_interface_advisory, header.direction));
3382 _CASSERT(offsetof(struct ifnet_interface_advisory, _reserved) ==
3383 offsetof(struct ifnet_interface_advisory, header.interface_type));
3384
3385 if (__improbable(kern_ctx == NULL || advisory == NULL)) {
3386 return EINVAL;
3387 }
3388 if (__improbable((advisory->header.version <
3389 IF_INTERFACE_ADVISORY_VERSION_MIN) ||
3390 (advisory->header.version > IF_INTERFACE_ADVISORY_VERSION_MAX))) {
3391 SK_ERR("Invalid advisory version %d", advisory->header.version);
3392 return EINVAL;
3393 }
3394 if (__improbable((advisory->header.direction !=
3395 IF_INTERFACE_ADVISORY_DIRECTION_TX) &&
3396 (advisory->header.direction !=
3397 IF_INTERFACE_ADVISORY_DIRECTION_RX))) {
3398 SK_ERR("Invalid advisory direction %d",
3399 advisory->header.direction);
3400 return EINVAL;
3401 }
3402 if (__improbable(((advisory->header.interface_type <
3403 IF_INTERFACE_ADVISORY_INTERFACE_TYPE_MIN) ||
3404 (advisory->header.interface_type >
3405 IF_INTERFACE_ADVISORY_INTERFACE_TYPE_MAX)) &&
3406 (advisory->header.version >= IF_INTERFACE_ADVISORY_VERSION_2))) {
3407 SK_ERR("Invalid advisory interface type %d",
3408 advisory->header.interface_type);
3409 return EINVAL;
3410 }
3411 return nx_netif_interface_advisory_report(kern_ctx, advisory);
3412 }
3413
3414 void
3415 nx_netif_config_interface_advisory(struct kern_nexus *nx, bool enable)
3416 {
3417 struct kern_nexus *nx_netif;
3418 struct nx_netif *nif;
3419
3420 if (NX_REJECT_ACT(nx) || (nx->nx_flags & NXF_CLOSED) != 0) {
3421 return;
3422 }
3423 if (NX_PROV(nx)->nxprov_params->nxp_type == NEXUS_TYPE_FLOW_SWITCH) {
3424 struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
3425 nx_netif = fsw->fsw_nifna->na_nx;
3426 } else {
3427 nx_netif = nx;
3428 }
3429 ASSERT(NX_PROV(nx_netif)->nxprov_params->nxp_type == NEXUS_TYPE_NET_IF);
3430 nif = NX_NETIF_PRIVATE(nx_netif);
3431 if (nif->nif_intf_adv_config != NULL) {
3432 nif->nif_intf_adv_config(nif->nif_intf_adv_prov_ctx, enable);
3433 }
3434 }
3435
3436 /*
3437 * This function has no use anymore since we are now passing truncated packets
3438 * to filters. We keep this logic just in case we need to prevent certain
3439 * packets from being passed to filters.
3440 */
3441 static boolean_t
3442 packet_is_filterable(struct nexus_netif_adapter *nifna,
3443 struct __kern_packet *pkt)
3444 {
3445 #pragma unused (nifna, pkt)
3446 return TRUE;
3447 }
3448
3449 /*
3450 * This function is only meant for supporting the RX path because the TX path
3451 * will not send packets > MTU size due to the disabling of TSO when filters
3452 * are enabled.
3453 */
3454 static void
3455 get_filterable_packets(struct nexus_netif_adapter *nifna,
3456 struct __kern_packet *pkt_chain, struct __kern_packet **fpkt_chain,
3457 struct __kern_packet **passthrough_chain)
3458 {
3459 struct nx_netif *nif = nifna->nifna_netif;
3460 struct netif_stats *nifs = &nif->nif_stats;
3461 struct __kern_packet *pkt = pkt_chain, *next, *fpkt;
3462 struct __kern_packet *__single fpkt_head = NULL;
3463 struct __kern_packet *__single passthrough_head = NULL;
3464 struct __kern_packet **fpkt_tailp = &fpkt_head;
3465 struct __kern_packet **passthrough_tailp = &passthrough_head;
3466 int fcnt = 0, pcnt = 0, dcnt = 0;
3467
3468 while (pkt != NULL) {
3469 next = pkt->pkt_nextpkt;
3470 pkt->pkt_nextpkt = NULL;
3471
3472 if (!packet_is_filterable(nifna, pkt)) {
3473 pcnt++;
3474 *passthrough_tailp = pkt;
3475 passthrough_tailp = &pkt->pkt_nextpkt;
3476 pkt = next;
3477 continue;
3478 }
3479 fpkt = nx_netif_pkt_to_filter_pkt(nifna, pkt, NETIF_CONVERT_RX);
3480 if (fpkt != NULL) {
3481 fcnt++;
3482 *fpkt_tailp = fpkt;
3483 fpkt_tailp = &fpkt->pkt_nextpkt;
3484 } else {
3485 dcnt++;
3486 }
3487 pkt = next;
3488 }
3489 *fpkt_chain = fpkt_head;
3490 *passthrough_chain = passthrough_head;
3491
3492 /*
3493 * No need to increment drop stats because that's already
3494 * done in nx_netif_pkt_to_filter_pkt.
3495 */
3496 STATS_ADD(nifs, NETIF_STATS_FILTER_RX_NOT_FILTERABLE, pcnt);
3497 DTRACE_SKYWALK6(filterable, struct nexus_netif_adapter *, nifna,
3498 int, fcnt, int, pcnt, int, dcnt, struct __kern_packet *,
3499 fpkt_head, struct __kern_packet *, passthrough_head);
3500 }
3501
3502 /*
3503 * This is only used by ring-based notify functions for now.
3504 * When a qset-based notify becomes available, this function can be used
3505 * unmodified.
3506 */
3507 void
3508 netif_receive(struct nexus_netif_adapter *nifna,
3509 struct __kern_packet *pkt_chain, struct nexus_pkt_stats *stats)
3510 {
3511 struct nx_netif *nif = nifna->nifna_netif;
3512 struct nexus_adapter *na = &nifna->nifna_up;
3513 struct netif_stats *nifs = &nif->nif_stats;
3514 int err, dropcnt, dropstat = -1;
3515
3516 if ((nif->nif_ifp->if_xflags & IFXF_DISABLE_INPUT) != 0) {
3517 uint64_t byte_cnt = 0;
3518 struct __kern_packet *pkt;
3519 struct ifnet *ifp = nif->nif_ifp;
3520
3521 dropcnt = 0;
3522 for (pkt = pkt_chain; pkt != NULL; pkt = pkt->pkt_nextpkt) {
3523 dropcnt++;
3524 byte_cnt += ((pkt->pkt_pflags & PKT_F_MBUF_DATA) != 0) ?
3525 m_pktlen(pkt->pkt_mbuf) : pkt->pkt_length;
3526 }
3527 os_atomic_add(&ifp->if_data.ifi_ipackets, dropcnt, relaxed);
3528 os_atomic_add(&ifp->if_data.ifi_ibytes, byte_cnt, relaxed);
3529
3530 dropstat = NETIF_STATS_DROP_INPUT_DISABLED;
3531 goto drop;
3532 }
3533
3534 /* update our work timestamp */
3535 na->na_work_ts = _net_uptime;
3536
3537 if (nif->nif_filter_cnt > 0) {
3538 struct __kern_packet *__single fpkt_chain = NULL;
3539 struct __kern_packet *__single passthrough_chain = NULL;
3540
3541 get_filterable_packets(nifna, pkt_chain, &fpkt_chain,
3542 &passthrough_chain);
3543 if (fpkt_chain != NULL) {
3544 (void) nx_netif_filter_inject(nifna, NULL, fpkt_chain,
3545 NETIF_FILTER_RX | NETIF_FILTER_SOURCE);
3546 }
3547 if (passthrough_chain != NULL) {
3548 pkt_chain = passthrough_chain;
3549 } else {
3550 return;
3551 }
3552 } else if (!NETIF_IS_LOW_LATENCY(nif) && nx_netif_filter_default_drop != 0) {
3553 /*
3554 * Default drop is meant for dropping packets on interfaces without
3555 * interface filters attached. It can be skipped for LLW because it
3556 * doesn't have a network stack path.
3557 */
3558 DTRACE_SKYWALK2(rx__default__drop, struct nx_netif *, nif,
3559 struct __kern_packet *, pkt_chain);
3560 dropstat = NETIF_STATS_FILTER_DROP_DEFAULT;
3561 goto drop;
3562 }
3563
3564 if (nif->nif_flow_cnt > 0) {
3565 struct __kern_packet *__single remain = NULL;
3566
3567 err = nx_netif_demux(nifna, pkt_chain, &remain, stats, NETIF_FLOW_SOURCE);
3568 if (remain == NULL) {
3569 return;
3570 }
3571 pkt_chain = remain;
3572 }
3573
3574 if (na->na_rx != NULL) {
3575 na->na_rx(na, pkt_chain, stats);
3576 } else {
3577 DTRACE_SKYWALK2(no__rx__cb, struct nx_netif *, nif,
3578 struct __kern_packet *, pkt_chain);
3579 dropstat = NETIF_STATS_DROP_NO_RX_CB;
3580 goto drop;
3581 }
3582
3583 return;
3584
3585 drop:
3586 dropcnt = 0;
3587 nx_netif_free_packet_chain(pkt_chain, &dropcnt);
3588 if (dropstat != -1) {
3589 STATS_ADD(nifs, dropstat, dropcnt);
3590 }
3591 STATS_ADD(nifs, NETIF_STATS_DROP, dropcnt);
3592 }
3593
3594 static slot_idx_t
3595 netif_rate_limit(struct __kern_channel_ring *r, uint64_t rate,
3596 slot_idx_t begin, slot_idx_t end, boolean_t *rate_limited)
3597 {
3598 uint64_t elapsed;
3599 uint64_t now;
3600 struct __kern_packet *pkt;
3601 clock_sec_t sec;
3602 clock_usec_t usec;
3603 slot_idx_t i;
3604
3605 if (__probable(rate == 0)) {
3606 return end;
3607 }
3608
3609 /* init tbr if not so */
3610 if (__improbable(r->ckr_tbr_token == CKR_TBR_TOKEN_INVALID)) {
3611 r->ckr_tbr_token = rate;
3612 r->ckr_tbr_depth = rate;
3613 r->ckr_tbr_last = mach_absolute_time();
3614 } else {
3615 now = mach_absolute_time();
3616 elapsed = now - r->ckr_tbr_last;
3617 absolutetime_to_microtime(elapsed, &sec, &usec);
3618 r->ckr_tbr_token +=
3619 ((sec * USEC_PER_SEC + usec) * rate / USEC_PER_SEC);
3620 if (__improbable(r->ckr_tbr_token > r->ckr_tbr_depth)) {
3621 r->ckr_tbr_token = r->ckr_tbr_depth;
3622 }
3623 r->ckr_tbr_last = now;
3624 }
3625
3626 *rate_limited = FALSE;
3627 for (i = begin; i != end; i = SLOT_NEXT(i, r->ckr_lim)) {
3628 pkt = KR_KSD(r, i)->sd_pkt;
3629 if (__improbable(pkt == NULL)) {
3630 continue;
3631 }
3632 if (__improbable(r->ckr_tbr_token <= 0)) {
3633 end = i;
3634 *rate_limited = TRUE;
3635 break;
3636 }
3637 r->ckr_tbr_token -= pkt->pkt_length * 8;
3638 }
3639
3640 SK_DF(SK_VERB_FSW | SK_VERB_RX, "ckr %p %s rate limited at %d",
3641 r, r->ckr_name, i);
3642
3643 return end;
3644 }
3645
3646 SK_NO_INLINE_ATTRIBUTE
3647 static struct __kern_packet *
3648 consume_pkts(struct __kern_channel_ring *ring, slot_idx_t end)
3649 {
3650 struct __kern_packet *__single pkt_chain = NULL;
3651 struct __kern_packet **tailp = &pkt_chain;
3652 slot_idx_t idx = ring->ckr_rhead;
3653
3654 while (idx != end) {
3655 struct __kern_slot_desc *ksd = KR_KSD(ring, idx);
3656 struct __kern_packet *pkt = ksd->sd_pkt;
3657
3658 ASSERT(pkt->pkt_nextpkt == NULL);
3659 KR_SLOT_DETACH_METADATA(ring, ksd);
3660 *tailp = pkt;
3661 tailp = &pkt->pkt_nextpkt;
3662 idx = SLOT_NEXT(idx, ring->ckr_lim);
3663 }
3664 ring->ckr_rhead = end;
3665 ring->ckr_rtail = ring->ckr_ktail;
3666 return pkt_chain;
3667 }
3668
3669 int
3670 netif_rx_notify_default(struct __kern_channel_ring *ring, struct proc *p,
3671 uint32_t flags)
3672 {
3673 struct nexus_adapter *hwna;
3674 struct nexus_netif_adapter *nifna;
3675 struct nx_netif *nif;
3676 struct __kern_packet *pkt_chain;
3677 struct nexus_pkt_stats stats = {0};
3678 sk_protect_t protect;
3679 slot_idx_t ktail;
3680 int err = 0;
3681
3682 KDBG((SK_KTRACE_NETIF_RX_NOTIFY_DEFAULT | DBG_FUNC_START),
3683 SK_KVA(ring));
3684
3685 ASSERT(ring->ckr_tx == NR_RX);
3686 ASSERT(!NA_KERNEL_ONLY(KRNA(ring)) || KR_KERNEL_ONLY(ring));
3687
3688 err = kr_enter(ring, ((flags & NA_NOTEF_CAN_SLEEP) != 0));
3689 if (err != 0) {
3690 /* not a serious error, so no need to be chatty here */
3691 SK_DF(SK_VERB_FSW,
3692 "hwna \"%s\" (0x%llx) kr \"%s\" (0x%llx) krflags 0x%b "
3693 "(%d)", KRNA(ring)->na_name, SK_KVA(KRNA(ring)),
3694 ring->ckr_name, SK_KVA(ring), ring->ckr_flags,
3695 CKRF_BITS, err);
3696 goto out;
3697 }
3698 if (__improbable(KR_DROP(ring))) {
3699 kr_exit(ring);
3700 err = ENODEV;
3701 goto out;
3702 }
3703 hwna = KRNA(ring);
3704 nifna = NIFNA(hwna);
3705 nif = nifna->nifna_netif;
3706 if (__improbable(hwna->na_ifp == NULL)) {
3707 kr_exit(ring);
3708 err = ENODEV;
3709 goto out;
3710 }
3711 protect = sk_sync_protect();
3712 err = ring->ckr_na_sync(ring, p, 0);
3713 if (err != 0 && err != EAGAIN) {
3714 goto put_out;
3715 }
3716
3717 /* read the tail pointer once */
3718 ktail = ring->ckr_ktail;
3719 if (__improbable(ring->ckr_khead == ktail)) {
3720 SK_DF(SK_VERB_FSW | SK_VERB_NOTIFY | SK_VERB_RX,
3721 "how strange, interrupt with no packets on hwna "
3722 "\"%s\" (0x%llx)", KRNA(ring)->na_name, SK_KVA(KRNA(ring)));
3723 goto put_out;
3724 }
3725 ktail = netif_rate_limit(ring, nif->nif_input_rate, ring->ckr_rhead,
3726 ktail, &ring->ckr_rate_limited);
3727
3728 pkt_chain = consume_pkts(ring, ktail);
3729 if (pkt_chain != NULL) {
3730 netif_receive(nifna, pkt_chain, &stats);
3731
3732 if (ring->ckr_netif_mit_stats != NULL &&
3733 stats.nps_pkts != 0 && stats.nps_bytes != 0) {
3734 ring->ckr_netif_mit_stats(ring, stats.nps_pkts,
3735 stats.nps_bytes);
3736 }
3737 }
3738
3739 put_out:
3740 sk_sync_unprotect(protect);
3741 kr_exit(ring);
3742
3743 out:
3744 KDBG((SK_KTRACE_NETIF_RX_NOTIFY_DEFAULT | DBG_FUNC_END),
3745 SK_KVA(ring), err);
3746 return err;
3747 }
3748
3749 int
3750 netif_rx_notify_fast(struct __kern_channel_ring *ring, struct proc *p,
3751 uint32_t flags)
3752 {
3753 #pragma unused(p, flags)
3754 sk_protect_t protect;
3755 struct nexus_adapter *hwna;
3756 struct nexus_pkt_stats stats = {0};
3757 uint32_t i, count;
3758 int err = 0;
3759
3760 KDBG((SK_KTRACE_NETIF_RX_NOTIFY_FAST | DBG_FUNC_START),
3761 SK_KVA(ring));
3762
3763 /* XXX
3764 * sk_sync_protect() is not needed for this case because
3765 * we are not using the dev ring. Unfortunately lots of
3766 * macros used by fsw still require this.
3767 */
3768 protect = sk_sync_protect();
3769 hwna = KRNA(ring);
3770 count = na_get_nslots(hwna, NR_RX);
3771 err = nx_rx_sync_packets(ring, ring->ckr_scratch, &count);
3772 if (__improbable(err != 0)) {
3773 SK_ERR("nx_rx_sync_packets failed: %d", err);
3774 DTRACE_SKYWALK2(rx__sync__packets__failed,
3775 struct __kern_channel_ring *, ring, int, err);
3776 goto out;
3777 }
3778 DTRACE_SKYWALK1(chain__count, uint32_t, count);
3779 for (i = 0; i < count; i++) {
3780 struct __kern_packet *pkt_chain;
3781
3782 pkt_chain = SK_PTR_ADDR_KPKT(ring->ckr_scratch[i]);
3783 ASSERT(pkt_chain != NULL);
3784 netif_receive(NIFNA(KRNA(ring)), pkt_chain, &stats);
3785
3786 if (ring->ckr_netif_mit_stats != NULL &&
3787 stats.nps_pkts != 0 && stats.nps_bytes != 0) {
3788 ring->ckr_netif_mit_stats(ring, stats.nps_pkts,
3789 stats.nps_bytes);
3790 }
3791 }
3792 out:
3793 sk_sync_unprotect(protect);
3794 KDBG((SK_KTRACE_NETIF_RX_NOTIFY_FAST | DBG_FUNC_END),
3795 SK_KVA(ring), err);
3796 return err;
3797 }
3798
3799
3800 /*
3801 * Configure the NA to operate in a particular mode.
3802 */
3803 static channel_ring_notify_t
3804 netif_hwna_get_notify(struct __kern_channel_ring *ring, netif_mode_t mode)
3805 {
3806 channel_ring_notify_t notify = NULL;
3807 boolean_t has_sync_pkts = (sk_rx_sync_packets != 0 &&
3808 nx_has_rx_sync_packets(ring));
3809
3810 if (mode == NETIF_MODE_FSW) {
3811 notify = (has_sync_pkts ? netif_rx_notify_fast :
3812 netif_rx_notify_default);
3813 } else if (mode == NETIF_MODE_LLW) {
3814 notify = (has_sync_pkts ? netif_llw_rx_notify_fast :
3815 netif_llw_rx_notify_default);
3816 }
3817 return notify;
3818 }
3819
3820
3821 static uint32_t
3822 netif_mode_to_flag(netif_mode_t mode)
3823 {
3824 uint32_t flag = 0;
3825
3826 if (mode == NETIF_MODE_FSW) {
3827 flag = NAF_MODE_FSW;
3828 } else if (mode == NETIF_MODE_LLW) {
3829 flag = NAF_MODE_LLW;
3830 }
3831 return flag;
3832 }
3833
3834 static void
3835 netif_hwna_config_mode(struct nexus_adapter *hwna, netif_mode_t mode,
3836 void (*rx)(struct nexus_adapter *, struct __kern_packet *,
3837 struct nexus_pkt_stats *), boolean_t set)
3838 {
3839 uint32_t i;
3840 uint32_t flag;
3841
3842 ASSERT(hwna->na_type == NA_NETIF_DEV ||
3843 hwna->na_type == NA_NETIF_COMPAT_DEV);
3844
3845 for (i = 0; i < na_get_nrings(hwna, NR_RX); i++) {
3846 struct __kern_channel_ring *kr = &NAKR(hwna, NR_RX)[i];
3847 channel_ring_notify_t notify = netif_hwna_get_notify(kr, mode);
3848
3849 if (set) {
3850 kr->ckr_save_notify = kr->ckr_netif_notify;
3851 kr->ckr_netif_notify = notify;
3852 } else {
3853 kr->ckr_netif_notify = kr->ckr_save_notify;
3854 kr->ckr_save_notify = NULL;
3855 }
3856 }
3857 if (set) {
3858 hwna->na_rx = rx;
3859 flag = netif_mode_to_flag(mode);
3860 os_atomic_or(&hwna->na_flags, flag, relaxed);
3861 } else {
3862 hwna->na_rx = NULL;
3863 os_atomic_andnot(&hwna->na_flags, (NAF_MODE_FSW | NAF_MODE_LLW), relaxed);
3864 }
3865 }
3866
3867 void
3868 netif_hwna_set_mode(struct nexus_adapter *hwna, netif_mode_t mode,
3869 void (*rx)(struct nexus_adapter *, struct __kern_packet *,
3870 struct nexus_pkt_stats *))
3871 {
3872 return netif_hwna_config_mode(hwna, mode, rx, TRUE);
3873 }
3874
3875 void
3876 netif_hwna_clear_mode(struct nexus_adapter *hwna)
3877 {
3878 return netif_hwna_config_mode(hwna, NETIF_MODE_NONE, NULL, FALSE);
3879 }
3880
3881 static void
3882 netif_inject_rx(struct nexus_adapter *na, struct __kern_packet *pkt_chain)
3883 {
3884 struct nexus_netif_adapter *nifna = NIFNA(na);
3885 struct nx_netif *nif = nifna->nifna_netif;
3886 struct netif_stats *nifs = &nif->nif_stats;
3887 struct __kern_channel_ring *r;
3888 struct nexus_pkt_stats stats;
3889 sk_protect_t protect;
3890 boolean_t ring_drop = FALSE;
3891 int err, dropcnt;
3892
3893 if (!NA_OWNED_BY_FSW(na)) {
3894 DTRACE_SKYWALK1(fsw__disabled, struct nexus_adapter *, na);
3895 goto fail;
3896 }
3897 ASSERT(na->na_rx != NULL);
3898
3899 /*
3900 * XXX
3901 * This function is called when a filter injects a packet back to the
3902 * regular RX path. We can assume the ring is 0 for now because RSS
3903 * is not supported. This needs to be revisited when we add support for
3904 * RSS.
3905 */
3906 r = &na->na_rx_rings[0];
3907 ASSERT(r->ckr_tx == NR_RX);
3908 err = kr_enter(r, TRUE);
3909 VERIFY(err == 0);
3910
3911 if (__improbable(KR_DROP(r))) {
3912 kr_exit(r);
3913 DTRACE_SKYWALK2(ring__drop, struct nexus_adapter *, na,
3914 struct __kern_channel_ring *, r);
3915 ring_drop = TRUE;
3916 goto fail;
3917 }
3918 protect = sk_sync_protect();
3919 na->na_rx(na, pkt_chain, &stats);
3920
3921 if (r->ckr_netif_mit_stats != NULL &&
3922 stats.nps_pkts != 0 && stats.nps_bytes != 0) {
3923 r->ckr_netif_mit_stats(r, stats.nps_pkts, stats.nps_bytes);
3924 }
3925 sk_sync_unprotect(protect);
3926
3927 kr_exit(r);
3928 return;
3929
3930 fail:
3931 dropcnt = 0;
3932 nx_netif_free_packet_chain(pkt_chain, &dropcnt);
3933 if (ring_drop) {
3934 STATS_ADD(nifs, NETIF_STATS_DROP_KRDROP_MODE, dropcnt);
3935 }
3936 STATS_ADD(nifs, NETIF_STATS_DROP, dropcnt);
3937 }
3938
3939 /*
3940 * This is called when an inbound packet has traversed all filters.
3941 */
3942 errno_t
3943 nx_netif_filter_rx_cb(struct nexus_netif_adapter *nifna,
3944 struct __kern_packet *fpkt_chain, uint32_t flags)
3945 {
3946 #pragma unused (flags)
3947 struct nx_netif *nif = nifna->nifna_netif;
3948 struct netif_stats *nifs = &nif->nif_stats;
3949 struct nexus_adapter *na = &nifna->nifna_up;
3950 struct __kern_packet *pkt_chain;
3951 int err;
3952
3953 pkt_chain = nx_netif_filter_pkt_to_pkt_chain(nifna,
3954 fpkt_chain, NETIF_CONVERT_RX);
3955 if (pkt_chain == NULL) {
3956 return ENOMEM;
3957 }
3958 if (nif->nif_flow_cnt > 0) {
3959 struct __kern_packet *__single remain = NULL;
3960
3961 err = nx_netif_demux(nifna, pkt_chain, &remain,
3962 NULL, NETIF_FLOW_INJECT);
3963 if (remain == NULL) {
3964 return err;
3965 }
3966 pkt_chain = remain;
3967 }
3968 if (na->na_rx != NULL) {
3969 netif_inject_rx(na, pkt_chain);
3970 } else {
3971 int dropcnt = 0;
3972 nx_netif_free_packet_chain(pkt_chain, &dropcnt);
3973 STATS_ADD(nifs,
3974 NETIF_STATS_FILTER_DROP_NO_RX_CB, dropcnt);
3975 STATS_ADD(nifs, NETIF_STATS_DROP, dropcnt);
3976 }
3977 return 0;
3978 }
3979
3980 /*
3981 * This is called when an outbound packet has traversed all filters.
3982 */
3983 errno_t
3984 nx_netif_filter_tx_cb(struct nexus_netif_adapter *nifna,
3985 struct __kern_packet *fpkt_chain, uint32_t flags)
3986 {
3987 #pragma unused (flags)
3988 struct nx_netif *nif = nifna->nifna_netif;
3989 struct nexus_adapter *na = &nifna->nifna_up;
3990 int err;
3991
3992 if (NETIF_IS_COMPAT(nif)) {
3993 struct mbuf *m_chain;
3994 mbuf_svc_class_t sc;
3995
3996 m_chain = nx_netif_filter_pkt_to_mbuf_chain(nifna,
3997 fpkt_chain, NETIF_CONVERT_TX);
3998 if (m_chain == NULL) {
3999 return ENOMEM;
4000 }
4001 /*
4002 * All packets in the chain have the same service class.
4003 * If the sc is missing or invalid, a valid value will be
4004 * returned.
4005 */
4006 sc = mbuf_get_service_class(m_chain);
4007 err = nx_netif_filter_tx_processed_mbuf_enqueue(nifna,
4008 sc, m_chain);
4009 } else {
4010 struct __kern_packet *pkt_chain;
4011 kern_packet_svc_class_t sc;
4012
4013 pkt_chain = nx_netif_filter_pkt_to_pkt_chain(nifna,
4014 fpkt_chain, NETIF_CONVERT_TX);
4015 if (pkt_chain == NULL) {
4016 return ENOMEM;
4017 }
4018 /*
4019 * All packets in the chain have the same service class.
4020 * If the sc is missing or invalid, a valid value will be
4021 * returned.
4022 */
4023 sc = kern_packet_get_service_class(SK_PKT2PH(pkt_chain));
4024 err = nx_netif_filter_tx_processed_pkt_enqueue(nifna,
4025 sc, pkt_chain);
4026 }
4027 /* Tell driver to resume dequeuing */
4028 ifnet_start(na->na_ifp);
4029 return err;
4030 }
4031
/*
 * Region parameter adjustment hook; intentionally a no-op here, both
 * arguments are unused.
 */
void
nx_netif_vp_region_params_adjust(struct nexus_adapter *na,
    struct skmem_region_params *srp)
{
#pragma unused(na, srp)
}
4039
4040 /* returns true, if starter thread is utilized */
4041 static bool
4042 netif_use_starter_thread(struct ifnet *ifp, uint32_t flags)
4043 {
4044 #if (DEVELOPMENT || DEBUG)
4045 if (__improbable(nx_netif_force_ifnet_start != 0)) {
4046 ifnet_start(ifp);
4047 return true;
4048 }
4049 #endif /* !DEVELOPMENT && !DEBUG */
4050 /*
4051 * use starter thread in following conditions:
4052 * - interface is not skywalk native
4053 * - interface attached to virtual driver (ipsec, utun)
4054 * - TBR is enabled
4055 * - delayed start mechanism is in use
4056 * - remaining stack space on the thread is not enough for driver
4057 * - caller is in rx workloop context
4058 * - caller is from the flowswitch path doing ARP resolving
4059 * - caller requires the use of starter thread (stack usage)
4060 * - caller requires starter thread for pacing
4061 */
4062 if (!SKYWALK_NATIVE(ifp) || NA(ifp) == NULL ||
4063 !NA_IS_ACTIVE(&NA(ifp)->nifna_up) ||
4064 ((NA(ifp)->nifna_up.na_flags & NAF_VIRTUAL_DEVICE) != 0) ||
4065 IFCQ_TBR_IS_ENABLED(ifp->if_snd) ||
4066 (ifp->if_eflags & IFEF_ENQUEUE_MULTI) ||
4067 (flags & NETIF_XMIT_FLAG_PACING) != 0 ||
4068 sk_is_rx_notify_protected() ||
4069 sk_is_async_transmit_protected() ||
4070 (sk_is_sync_protected() && (flags & NETIF_XMIT_FLAG_HOST) != 0)) {
4071 DTRACE_SKYWALK2(use__starter__thread, struct ifnet *, ifp,
4072 uint32_t, flags);
4073 ifnet_start(ifp);
4074 return true;
4075 }
4076 lck_mtx_lock_spin(&ifp->if_start_lock);
4077 /* interface is flow controlled */
4078 if (__improbable(ifp->if_start_flags & IFSF_FLOW_CONTROLLED)) {
4079 lck_mtx_unlock(&ifp->if_start_lock);
4080 return true;
4081 }
4082 /* if starter thread is active, utilize it */
4083 if (ifp->if_start_active) {
4084 ifp->if_start_req++;
4085 lck_mtx_unlock(&ifp->if_start_lock);
4086 return true;
4087 }
4088 lck_mtx_unlock(&ifp->if_start_lock);
4089 /* Check remaining stack space */
4090 if ((OSKernelStackRemaining() < NX_NETIF_MIN_DRIVER_STACK_SIZE)) {
4091 ifnet_start(ifp);
4092 return true;
4093 }
4094 return false;
4095 }
4096
4097 void
4098 netif_transmit(struct ifnet *ifp, uint32_t flags)
4099 {
4100 if (netif_use_starter_thread(ifp, flags)) {
4101 return;
4102 }
4103 nx_netif_doorbell_internal(ifp, flags);
4104 }
4105
4106 static struct ifclassq *
4107 netif_get_default_ifcq(struct nexus_adapter *hwna)
4108 {
4109 struct nx_netif *nif;
4110 struct ifclassq *ifcq;
4111
4112 nif = NX_NETIF_PRIVATE(hwna->na_nx);
4113 if (NETIF_LLINK_ENABLED(nif)) {
4114 struct netif_qset *qset;
4115
4116 /*
4117 * Use the default ifcq for now.
4118 * In the future this could be chosen by the caller.
4119 */
4120 qset = nx_netif_get_default_qset_noref(nif);
4121 ASSERT(qset != NULL);
4122 ifcq = qset->nqs_ifcq;
4123 } else {
4124 ifcq = nif->nif_ifp->if_snd;
4125 }
4126 return ifcq;
4127 }
4128
4129 static errno_t
4130 netif_deq_packets(struct nexus_adapter *hwna, struct ifclassq *ifcq,
4131 uint32_t pkt_limit, uint32_t byte_limit, struct __kern_packet **head,
4132 boolean_t *pkts_pending, kern_packet_svc_class_t sc,
4133 uint32_t *pkt_cnt, uint32_t *bytes, uint8_t qset_idx)
4134 {
4135 classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
4136 struct ifnet *ifp = hwna->na_ifp;
4137 uint32_t pkts_cnt;
4138 uint32_t bytes_cnt;
4139 errno_t rc;
4140
4141 ASSERT(ifp != NULL);
4142 ASSERT(ifp->if_output_sched_model < IFNET_SCHED_MODEL_MAX);
4143 ASSERT((pkt_limit != 0) && (byte_limit != 0));
4144
4145 if (ifcq == NULL) {
4146 ifcq = netif_get_default_ifcq(hwna);
4147 }
4148 if (ifp->if_output_sched_model == IFNET_SCHED_MODEL_DRIVER_MANAGED) {
4149 rc = ifclassq_dequeue_sc(ifcq, (mbuf_svc_class_t)sc,
4150 pkt_limit, byte_limit, &pkt_head, NULL, pkt_cnt, bytes, qset_idx);
4151 } else {
4152 rc = ifclassq_dequeue(ifcq, pkt_limit, byte_limit,
4153 &pkt_head, NULL, pkt_cnt, bytes, qset_idx);
4154 }
4155 ASSERT((rc == 0) || (rc == EAGAIN));
4156 ASSERT((pkt_head.cp_ptype == QP_PACKET) || (pkt_head.cp_kpkt == NULL));
4157
4158 ifclassq_get_len(ifcq, (mbuf_svc_class_t)sc, qset_idx,
4159 &pkts_cnt, &bytes_cnt);
4160 *pkts_pending = pkts_cnt > 0;
4161
4162 *head = pkt_head.cp_kpkt;
4163 return rc;
4164 }
4165
#if SK_LOG
/*
 * Emit a verbose (SK_VERB_SYNC | SK_VERB_TX) message with the adapter
 * name and the ring's id, name and khead/ktail/rhead/rtail indices.
 * Hoisted out of line to reduce kernel stack footprint.
 */
SK_LOG_ATTRIBUTE
static void
netif_no_ring_space_log(const struct nexus_adapter *na,
    const kern_channel_ring_t ring)
{
	SK_DF(SK_VERB_SYNC | SK_VERB_TX,
	    "no ring space: na \"%s\" [%u] "
	    "\"%s\"(kh %u kt %u | rh %u rt %u)",
	    na->na_name,
	    ring->ckr_ring_id,
	    ring->ckr_name,
	    ring->ckr_khead,
	    ring->ckr_ktail,
	    ring->ckr_rhead,
	    ring->ckr_rtail);
}
#endif /* SK_LOG */
4182
/*
 * netif refill function for rings
 *
 * Dequeues packets from the adapter's default classq (bounded by pkt_limit,
 * byte_limit and the free space on the ring), attaches them to consecutive
 * TX ring slots starting at ckr_rhead, and then invokes the ring's
 * ckr_na_sync callback.
 *
 * ring             TX ring of a NA_NETIF_DEV adapter
 * pkt_limit        maximum packets to dequeue; must be non-zero
 * byte_limit       maximum bytes to dequeue; must be non-zero
 * tx_doorbell_ctxt when TRUE and the adapter is not a virtual device,
 *                  pkt_limit is further capped by
 *                  nx_netif_doorbell_max_dequeue
 * pkts_pending     out: set to FALSE on entry, later updated to reflect
 *                  whether packets remain queued
 * canblock         passed to kr_enter()
 *
 * Returns:
 *   EINVAL  a limit was zero
 *   ENXIO   interface not fully attached, interface flow controlled, or
 *           ring in drop mode / adapter not active
 *   EAGAIN  no free slot on the ring (a sync is still attempted)
 *   0       ring busy (kr_enter() failed), or success
 *   other   the non-zero dequeue result, or -- when the dequeue itself
 *           succeeded -- a sync error other than EAGAIN
 */
errno_t
netif_ring_tx_refill(const kern_channel_ring_t ring, uint32_t pkt_limit,
    uint32_t byte_limit, boolean_t tx_doorbell_ctxt, boolean_t *pkts_pending,
    boolean_t canblock)
{
	struct nexus_adapter *hwna;
	struct ifnet *ifp;
	struct __kern_packet *__single head = NULL;
	sk_protect_t protect;
	errno_t rc = 0;
	errno_t sync_err = 0;
	uint32_t npkts = 0, consumed = 0;
	uint32_t flags;
	slot_idx_t idx, ktail;
	int ring_space = 0;

	KDBG((SK_KTRACE_NETIF_RING_TX_REFILL | DBG_FUNC_START), SK_KVA(ring));

	VERIFY(ring != NULL);
	hwna = KRNA(ring);
	ifp = hwna->na_ifp;

	ASSERT(hwna->na_type == NA_NETIF_DEV);
	ASSERT(ring->ckr_tx == NR_TX);
	*pkts_pending = FALSE;

	/*
	 * The checks below run before the ring is entered, so failures
	 * jump to "out" which skips sk_sync_unprotect()/kr_exit().
	 */
	if (__improbable(pkt_limit == 0 || byte_limit == 0)) {
		SK_ERR("invalid limits plim %d, blim %d",
		    pkt_limit, byte_limit);
		rc = EINVAL;
		goto out;
	}

	if (__improbable(!IF_FULLY_ATTACHED(ifp))) {
		SK_ERR("hwna 0x%llx ifp %s (0x%llx), interface not attached",
		    SK_KVA(hwna), if_name(ifp), SK_KVA(ifp));
		rc = ENXIO;
		goto out;
	}

	if (__improbable((ifp->if_start_flags & IFSF_FLOW_CONTROLLED) != 0)) {
		SK_DF(SK_VERB_SYNC | SK_VERB_TX, "hwna 0x%llx ifp %s (0x%llx), "
		    "flow control ON", SK_KVA(hwna), if_name(ifp), SK_KVA(ifp));
		rc = ENXIO;
		goto out;
	}

	/*
	 * if the ring is busy, it means another dequeue is in
	 * progress, so ignore this request and return success.
	 */
	if (kr_enter(ring, canblock) != 0) {
		rc = 0;
		goto out;
	}
	/* mark thread with sync-in-progress flag */
	protect = sk_sync_protect();

	/* from here on, exits must go through "done" to undo the above */
	if (__improbable(KR_DROP(ring) ||
	    !NA_IS_ACTIVE(ring->ckr_na))) {
		SK_ERR("hw-kr 0x%llx stopped", SK_KVA(ring));
		rc = ENXIO;
		goto done;
	}

	idx = ring->ckr_rhead;
	ktail = ring->ckr_ktail;
	/*
	 * calculate available space on tx ring: the distance from rhead
	 * to ktail, wrapped by ckr_num_slots when negative
	 */
	ring_space = ktail - idx;
	if (ring_space < 0) {
		ring_space += ring->ckr_num_slots;
	}
	if (ring_space == 0) {
		struct ifclassq *ifcq;

		/* no space in ring, driver should retry */
#if SK_LOG
		if (__improbable((sk_verbose &
		    (SK_VERB_SYNC | SK_VERB_TX)) != 0)) {
			netif_no_ring_space_log(hwna, ring);
		}
#endif /* SK_LOG */
		ifcq = netif_get_default_ifcq(hwna);
		if (IFCQ_LEN(ifcq) != 0) {
			*pkts_pending = TRUE;
		}
		/*
		 * We ran out of space in ring, most probably
		 * because the driver is slow to drain its TX queue.
		 * We want another doorbell to be generated as soon
		 * as the TX notify completion happens; mark this
		 * through ckr_pending_doorbell counter. Do this
		 * regardless of whether there's any pending packet.
		 */
		ring->ckr_pending_doorbell++;
		rc = EAGAIN;
		goto sync_ring;
	}

	/* never dequeue more than the ring can hold */
	if ((uint32_t)ring_space < pkt_limit) {
		pkt_limit = ring_space;
	}

	/* doorbell context on a non-virtual device: apply the dequeue cap */
	if (tx_doorbell_ctxt &&
	    ((hwna->na_flags & NAF_VIRTUAL_DEVICE) == 0)) {
		pkt_limit = MIN(pkt_limit,
		    nx_netif_doorbell_max_dequeue);
	}

	/* NULL ifcq selects the adapter's default classq */
	rc = netif_deq_packets(hwna, NULL, pkt_limit, byte_limit,
	    &head, pkts_pending, ring->ckr_svc, NULL, NULL, 0);

	/*
	 * There's room in ring; if we haven't dequeued everything,
	 * mark ckr_pending_doorbell for the next TX notify to issue
	 * a TX door bell; otherwise, clear it. The next packet that
	 * gets enqueued will trigger a door bell again.
	 */
	if (*pkts_pending) {
		ring->ckr_pending_doorbell++;
	} else if (ring->ckr_pending_doorbell != 0) {
		ring->ckr_pending_doorbell = 0;
	}

	if (rc != 0) {
		/*
		 * This is expected sometimes as the IOSkywalkFamily
		 * errs on the side of caution to perform an extra
		 * dequeue when multiple doorbells are pending;
		 * nothing to dequeue, do a sync if there are slots
		 * to reclaim else just return.
		 */
		SK_DF(SK_VERB_SYNC | SK_VERB_TX,
		    "nothing to dequeue, err %d", rc);

		/*
		 * NOTE(review): ring_space == ckr_lim presumably means no
		 * slot is outstanding with the driver -- confirm against
		 * the ring definition.
		 */
		if ((uint32_t)ring_space == ring->ckr_lim) {
			goto done;
		} else {
			goto sync_ring;
		}
	}
	/* move the dequeued packets to tx ring */
	while (head != NULL && idx != ktail) {
		ASSERT(npkts <= pkt_limit);
		struct __kern_packet *pkt = head;
		KR_SLOT_ATTACH_METADATA(ring, KR_KSD(ring, idx),
		    (struct __kern_quantum *)pkt);
		npkts++;
		if (__improbable(pkt->pkt_trace_id != 0)) {
			KDBG(SK_KTRACE_PKT_TX_AQM | DBG_FUNC_END, pkt->pkt_trace_id);
			KDBG(SK_KTRACE_PKT_TX_DRV | DBG_FUNC_START, pkt->pkt_trace_id);
		}
		idx = SLOT_NEXT(idx, ring->ckr_lim);
		/* unlink the packet from the chain now that a slot owns it */
		head = pkt->pkt_nextpkt;
		pkt->pkt_nextpkt = NULL;
	}

	/*
	 * We checked for ring space earlier so the ring should have enough
	 * space for the entire chain.
	 */
	ASSERT(head == NULL);
	ring->ckr_rhead = idx;

sync_ring:
	flags = NA_SYNCF_NETIF;
	if (ring->ckr_pending_doorbell != 0) {
		flags |= (NA_SYNCF_NETIF_DOORBELL | NA_SYNCF_NETIF_ASYNC);
	}

	/* remember khead so the slots consumed by the sync can be walked */
	ring->ckr_khead_pre = ring->ckr_khead;
	sync_err = ring->ckr_na_sync(ring, kernproc, flags);
	if (sync_err != 0 && sync_err != EAGAIN) {
		SK_ERR("unexpected sync err %d", sync_err);
		/* an earlier error (e.g. EAGAIN) takes precedence */
		if (rc == 0) {
			rc = sync_err;
		}
		goto done;
	}
	/*
	 * Verify that the driver has detached packets from the consumed slots.
	 * (`consumed' is only counted here; it is not read afterwards.)
	 */
	idx = ring->ckr_khead_pre;
	consumed = 0;
	while (idx != ring->ckr_khead) {
		struct __kern_slot_desc *ksd = KR_KSD(ring, idx);

		consumed++;
		VERIFY(!KSD_VALID_METADATA(ksd));
		idx = SLOT_NEXT(idx, ring->ckr_lim);
	}
	ring->ckr_khead_pre = ring->ckr_khead;

done:
	sk_sync_unprotect(protect);
	kr_exit(ring);
out:
	KDBG((SK_KTRACE_NETIF_RING_TX_REFILL | DBG_FUNC_END),
	    SK_KVA(ring), rc, 0, npkts);

	return rc;
}
4388
/*
 * Fold sample `new' into the running average `old' (an lvalue):
 *   old = (old * (2^decay - 1) + new) / 2^decay
 * computed with shifts in 64-bit arithmetic.  When `old' is 0 the
 * average is simply seeded with `new'.
 */
#define NQ_EWMA(old, new, decay) do {                                   \
	u_int64_t _avg = (old);                                         \
	if (__probable(_avg > 0)) {                                     \
	        _avg = (((_avg << (decay)) - _avg) + (new)) >> (decay); \
	} else {                                                        \
	        _avg = (new);                                           \
	}                                                               \
	(old) = _avg;                                                   \
} while (0)
4397
4398 void
4399 kern_netif_increment_queue_stats(kern_netif_queue_t queue,
4400 uint32_t pkt_count, uint32_t byte_count)
4401 {
4402 struct netif_llink *llink = queue->nq_qset->nqs_llink;
4403 struct ifnet *ifp = llink->nll_nif->nif_ifp;
4404 if ((queue->nq_flags & NETIF_QUEUE_IS_RX) == 0) {
4405 os_atomic_add(&ifp->if_data.ifi_opackets, pkt_count, relaxed);
4406 os_atomic_add(&ifp->if_data.ifi_obytes, byte_count, relaxed);
4407 } else {
4408 os_atomic_add(&ifp->if_data.ifi_ipackets, pkt_count, relaxed);
4409 os_atomic_add(&ifp->if_data.ifi_ibytes, byte_count, relaxed);
4410 }
4411
4412 if (ifp->if_data_threshold != 0) {
4413 ifnet_notify_data_threshold(ifp);
4414 }
4415
4416 uint64_t now;
4417 uint64_t diff_secs;
4418 struct netif_qstats *stats = &queue->nq_stats;
4419
4420 if (sk_netif_queue_stat_enable == 0) {
4421 return;
4422 }
4423
4424 if (__improbable(pkt_count == 0)) {
4425 return;
4426 }
4427
4428 stats->nq_num_xfers++;
4429 stats->nq_total_bytes += byte_count;
4430 stats->nq_total_pkts += pkt_count;
4431 if (pkt_count > stats->nq_max_pkts) {
4432 stats->nq_max_pkts = pkt_count;
4433 }
4434 if (stats->nq_min_pkts == 0 ||
4435 pkt_count < stats->nq_min_pkts) {
4436 stats->nq_min_pkts = pkt_count;
4437 }
4438
4439 now = net_uptime();
4440 if (__probable(queue->nq_accumulate_start != 0)) {
4441 diff_secs = now - queue->nq_accumulate_start;
4442 if (diff_secs >= nq_accumulate_interval) {
4443 uint64_t bps;
4444 uint64_t pps;
4445 uint64_t pps_ma;
4446
4447 /* bytes per second */
4448 bps = queue->nq_accumulated_bytes / diff_secs;
4449 NQ_EWMA(stats->nq_bytes_ps_ma,
4450 bps, nq_transfer_decay);
4451 stats->nq_bytes_ps = bps;
4452
4453 /* pkts per second */
4454 pps = queue->nq_accumulated_pkts / diff_secs;
4455 pps_ma = stats->nq_pkts_ps_ma;
4456 NQ_EWMA(pps_ma, pps, nq_transfer_decay);
4457 stats->nq_pkts_ps_ma = (uint32_t)pps_ma;
4458 stats->nq_pkts_ps = (uint32_t)pps;
4459
4460 /* start over */
4461 queue->nq_accumulate_start = now;
4462 queue->nq_accumulated_bytes = 0;
4463 queue->nq_accumulated_pkts = 0;
4464
4465 stats->nq_min_pkts = 0;
4466 stats->nq_max_pkts = 0;
4467 }
4468 } else {
4469 queue->nq_accumulate_start = now;
4470 }
4471 queue->nq_accumulated_bytes += byte_count;
4472 queue->nq_accumulated_pkts += pkt_count;
4473 }
4474
4475 void
4476 kern_netif_queue_rx_enqueue(kern_netif_queue_t queue, kern_packet_t ph_chain,
4477 uint32_t count, uint32_t flags)
4478 {
4479 #pragma unused (count)
4480 struct netif_queue *q = queue;
4481 struct netif_llink *llink = q->nq_qset->nqs_llink;
4482 struct __kern_packet *pkt_chain = SK_PTR_ADDR_KPKT(ph_chain);
4483 bool flush = ((flags & KERN_NETIF_QUEUE_RX_ENQUEUE_FLAG_FLUSH) != 0);
4484 struct pktq *pktq = &q->nq_pktq;
4485 struct netif_stats *nifs = &llink->nll_nif->nif_stats;
4486 struct nexus_pkt_stats stats = {0};
4487 sk_protect_t protect;
4488
4489 ASSERT((q->nq_flags & NETIF_QUEUE_IS_RX) != 0);
4490 if (llink->nll_state == NETIF_LLINK_STATE_DESTROYED) {
4491 int drop_cnt = 0;
4492
4493 pp_free_packet_chain(pkt_chain, &drop_cnt);
4494 STATS_ADD(nifs, NETIF_STATS_LLINK_RX_DROP_BAD_STATE, drop_cnt);
4495 return;
4496 }
4497 KPKTQ_ENQUEUE_LIST(pktq, pkt_chain);
4498 if (flush) {
4499 pkt_chain = KPKTQ_FIRST(pktq);
4500 KPKTQ_INIT(pktq);
4501
4502 protect = sk_sync_protect();
4503 netif_receive(NA(llink->nll_nif->nif_ifp), pkt_chain, &stats);
4504 sk_sync_unprotect(protect);
4505 kern_netif_increment_queue_stats(queue, (uint32_t)stats.nps_pkts,
4506 (uint32_t)stats.nps_bytes);
4507 }
4508 }
4509
4510 errno_t
4511 kern_netif_queue_tx_dequeue(kern_netif_queue_t queue, uint32_t pkt_limit,
4512 uint32_t byte_limit, boolean_t *pending, kern_packet_t *ph_chain)
4513 {
4514 struct netif_queue *q = queue;
4515 struct netif_llink *llink = q->nq_qset->nqs_llink;
4516 struct netif_stats *nifs = &llink->nll_nif->nif_stats;
4517 struct nexus_adapter *hwna;
4518 struct __kern_packet *__single pkt_chain = NULL;
4519 uint32_t bytes = 0, pkt_cnt = 0;
4520 errno_t rc;
4521
4522 ASSERT((q->nq_flags & NETIF_QUEUE_IS_RX) == 0);
4523 if (llink->nll_state == NETIF_LLINK_STATE_DESTROYED) {
4524 STATS_INC(nifs, NETIF_STATS_LLINK_AQM_DEQ_BAD_STATE);
4525 return ENXIO;
4526 }
4527 hwna = &NA(llink->nll_nif->nif_ifp)->nifna_up;
4528
4529 if (((hwna->na_flags & NAF_VIRTUAL_DEVICE) == 0) &&
4530 sk_is_tx_notify_protected()) {
4531 pkt_limit = MIN(pkt_limit, nx_netif_doorbell_max_dequeue);
4532 }
4533 rc = netif_deq_packets(hwna, q->nq_qset->nqs_ifcq, pkt_limit,
4534 byte_limit, &pkt_chain, pending, q->nq_svc, &pkt_cnt, &bytes,
4535 q->nq_qset->nqs_idx);
4536
4537 if (pkt_cnt > 0) {
4538 kern_netif_increment_queue_stats(queue, pkt_cnt, bytes);
4539 }
4540 if (pkt_chain != NULL) {
4541 *ph_chain = SK_PKT2PH(pkt_chain);
4542 }
4543 return rc;
4544 }
4545
4546 errno_t
4547 kern_netif_qset_tx_queue_len(kern_netif_qset_t qset, uint32_t svc,
4548 uint32_t * pkts_cnt, uint32_t * bytes_cnt)
4549 {
4550 VERIFY(qset != NULL);
4551 VERIFY(pkts_cnt != NULL);
4552 VERIFY(bytes_cnt != NULL);
4553
4554 return ifclassq_get_len(qset->nqs_ifcq, svc, qset->nqs_idx, pkts_cnt,
4555 bytes_cnt);
4556 }
4557
4558 void
4559 kern_netif_set_qset_combined(kern_netif_qset_t qset)
4560 {
4561 VERIFY(qset != NULL);
4562 VERIFY(qset->nqs_ifcq != NULL);
4563
4564 ifclassq_set_grp_combined(qset->nqs_ifcq, qset->nqs_idx);
4565 }
4566
4567 void
4568 kern_netif_set_qset_separate(kern_netif_qset_t qset)
4569 {
4570 VERIFY(qset != NULL);
4571 VERIFY(qset->nqs_ifcq != NULL);
4572
4573 ifclassq_set_grp_separated(qset->nqs_ifcq, qset->nqs_idx);
4574 }
4575
4576 errno_t
4577 kern_nexus_netif_llink_add(struct kern_nexus *nx,
4578 struct kern_nexus_netif_llink_init *llink_init)
4579 {
4580 errno_t err;
4581 struct nx_netif *nif;
4582 struct netif_llink *__single llink;
4583 struct netif_stats *nifs;
4584
4585 VERIFY(nx != NULL);
4586 VERIFY(llink_init != NULL);
4587 VERIFY((nx->nx_flags & NXF_ATTACHED) != 0);
4588
4589 nif = NX_NETIF_PRIVATE(nx);
4590 nifs = &nif->nif_stats;
4591
4592 err = nx_netif_validate_llink_config(llink_init, false);
4593 if (err != 0) {
4594 SK_ERR("Invalid llink init params");
4595 STATS_INC(nifs, NETIF_STATS_LLINK_ADD_BAD_PARAMS);
4596 return err;
4597 }
4598
4599 err = nx_netif_llink_add(nif, llink_init, &llink);
4600 return err;
4601 }
4602
4603 errno_t
4604 kern_nexus_netif_llink_remove(struct kern_nexus *nx,
4605 kern_nexus_netif_llink_id_t llink_id)
4606 {
4607 struct nx_netif *nif;
4608
4609 VERIFY(nx != NULL);
4610 VERIFY((nx->nx_flags & NXF_ATTACHED) != 0);
4611
4612 nif = NX_NETIF_PRIVATE(nx);
4613 return nx_netif_llink_remove(nif, llink_id);
4614 }
4615
4616 errno_t
4617 kern_netif_queue_get_service_class(kern_netif_queue_t queue,
4618 kern_packet_svc_class_t *svc)
4619 {
4620 *svc = queue->nq_svc;
4621 return 0;
4622 }
4623