1 /*
2 * Copyright (c) 2015-2022 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 /*
30 * The netif nexus domain has two domain providers: native and compat, with
31 * the latter being the default provider of this domain. The compat provider
32 * has special handlers for NXCFG_CMD_ATTACH and NXCFG_CMD_DETACH, etc.
33 *
34 * A netif nexus instance can be in a native or compat mode; in either case,
35 * it is associated with two instances of a nexus_adapter structure, and allows
 * at most two channels opened to the nexus. The two adapters correspond to
37 * host and device ports, respectively.
38 *
39 * By itself, a netif nexus isn't associated with a network interface. The
40 * association happens by attaching a network interface to the nexus instance.
41 * A channel can only be successfully opened to a netif nexus after it has an
42 * interface attached to it.
43 *
44 * During an attach, the interface is marked as Skywalk-capable, and its ifnet
45 * structure refers to the attached netif nexus adapter via its if_na field.
46 * The nexus also holds a reference to the interface on its na_ifp field. Note
47 * that attaching to a netif_compat nexus does not alter the input/output data
48 * path, nor does it remove any of the interface's hardware offload flags. It
49 * merely associates the interface and netif nexus together.
50 *
51 * During a detach, the above references are dropped and the fields are cleared;
52 * the interface is also marked as non-Skywalk-capable. This detach can happen
53 * explicitly via a command down the nexus, or implicitly when the nexus goes
54 * away (assuming there's no channel opened to it.)
55 *
56 * A userland channel can be opened to a netif nexus via the usual ch_open()
57 * way, assuming the nexus provider is setup to allow access for the userland
58 * process (either by binding the nexus port to PID, etc. or by creating the
59 * nexus in the anonymous mode.)
60 *
61 * Alternatively, a kernel channel can also be opened to it by some kernel
62 * subsystem, via ch_open_special(), e.g. by the flowswitch. Kernel channels
63 * don't have any task mapping created, and the flag CHANF_KERNEL is used to
64 * indicate that.
65 *
66 * Opening a channel to the host port of a native or compat netif causes the
67 * ifnet output path to be redirected to nx_netif_host_transmit(). We also,
68 * at present, disable any hardware offload features.
69 *
70 * Opening a channel to the device port of a compat netif causes the ifnet
71 * input path to be redirected to nx_netif_compat_receive(). This is specific
72 * to the compat variant, as the native variant's RX path already goes to
73 * the native netif.
74 *
75 * During channel close, we restore the original I/O callbacks, as well as the
76 * interface's offload flags.
77 */
78
79 #include <skywalk/os_skywalk_private.h>
80 #include <skywalk/nexus/netif/nx_netif.h>
81 #include <skywalk/nexus/upipe/nx_user_pipe.h>
82 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
83 #include <sys/kdebug.h>
84 #include <sys/sdt.h>
85 #include <os/refcnt.h>
86 #include <libkern/OSDebug.h>
87
#define NX_NETIF_MAXRINGS NX_MAX_NUM_RING_PAIR /* max # of ring pairs */
#define NX_NETIF_MINSLOTS 2 /* XXX same as above */
#define NX_NETIF_MAXSLOTS NX_MAX_NUM_SLOT_PER_RING /* max # of slots */
#define NX_NETIF_TXRINGSIZE 512 /* default TX ring size */
#define NX_NETIF_RXRINGSIZE 1024 /* default RX ring size */
#define NX_NETIF_BUFSIZE (2 * 1024) /* default buffer size */
#define NX_NETIF_MINBUFSIZE (128) /* min buffer size */
#define NX_NETIF_MAXBUFSIZE (32 * 1024) /* max buffer size */

/*
 * TODO: [email protected] -- minimum buflets for now; we will need to
 * have a way to adjust this based on the underlying interface's
 * parameters, e.g. jumbo MTU, large segment offload, etc.
 */
/* user- and kernel-visible packet metadata sizes, sized for BUFLETS_MIN */
#define NX_NETIF_UMD_SIZE _USER_PACKET_SIZE(BUFLETS_MIN)
#define NX_NETIF_KMD_SIZE _KERN_PACKET_SIZE(BUFLETS_MIN)
104
105 /*
106 * minimum stack space required for IOSkywalkFamily and Driver execution.
107 */
108 #if XNU_TARGET_OS_OSX
109 #define NX_NETIF_MIN_DRIVER_STACK_SIZE (kernel_stack_size >> 1)
110 #else /* !XNU_TARGET_OS_OSX */
111 #define NX_NETIF_MIN_DRIVER_STACK_SIZE (kernel_stack_size >> 2)
112 #endif /* XNU_TARGET_OS_OSX */
113
114 static void nx_netif_dom_init(struct nxdom *);
115 static void nx_netif_dom_terminate(struct nxdom *);
116 static void nx_netif_dom_fini(struct nxdom *);
117 static int nx_netif_prov_params_adjust(
118 const struct kern_nexus_domain_provider *, const struct nxprov_params *,
119 struct nxprov_adjusted_params *);
120
121 static int nx_netif_dom_bind_port(struct kern_nexus *, nexus_port_t *,
122 struct nxbind *, void *);
123 static int nx_netif_dom_unbind_port(struct kern_nexus *, nexus_port_t);
124 static int nx_netif_dom_connect(struct kern_nexus_domain_provider *,
125 struct kern_nexus *, struct kern_channel *, struct chreq *,
126 struct kern_channel *, struct nxbind *, struct proc *);
127 static void nx_netif_dom_disconnect(struct kern_nexus_domain_provider *,
128 struct kern_nexus *, struct kern_channel *);
129 static void nx_netif_dom_defunct(struct kern_nexus_domain_provider *,
130 struct kern_nexus *, struct kern_channel *, struct proc *);
131 static void nx_netif_dom_defunct_finalize(struct kern_nexus_domain_provider *,
132 struct kern_nexus *, struct kern_channel *, boolean_t);
133
134 static void nx_netif_doorbell(struct ifnet *);
135 static int nx_netif_na_txsync(struct __kern_channel_ring *, struct proc *,
136 uint32_t);
137 static int nx_netif_na_rxsync(struct __kern_channel_ring *, struct proc *,
138 uint32_t);
139 static void nx_netif_na_dtor(struct nexus_adapter *na);
140 static int nx_netif_na_notify_tx(struct __kern_channel_ring *, struct proc *,
141 uint32_t);
142 static int nx_netif_na_notify_rx(struct __kern_channel_ring *, struct proc *,
143 uint32_t);
144 static int nx_netif_na_activate(struct nexus_adapter *, na_activate_mode_t);
145
146 static int nx_netif_ctl(struct kern_nexus *, nxcfg_cmd_t, void *,
147 struct proc *);
148 static int nx_netif_ctl_attach(struct kern_nexus *, struct nx_spec_req *,
149 struct proc *);
150 static int nx_netif_ctl_detach(struct kern_nexus *, struct nx_spec_req *);
151 static int nx_netif_attach(struct kern_nexus *, struct ifnet *);
152 static void nx_netif_flags_init(struct nx_netif *);
153 static void nx_netif_flags_fini(struct nx_netif *);
154 static void nx_netif_capabilities_fini(struct nx_netif *);
155 static errno_t nx_netif_interface_advisory_notify(void *,
156 const struct ifnet_interface_advisory *);
157
/*
 * Domain descriptor for the netif nexus type: parameter ranges
 * (def/min/max triples) plus the domain-level callback vector.
 */
struct nxdom nx_netif_dom_s = {
	.nxdom_prov_head =
	    STAILQ_HEAD_INITIALIZER(nx_netif_dom_s.nxdom_prov_head),
	.nxdom_type = NEXUS_TYPE_NET_IF,
	.nxdom_md_type = NEXUS_META_TYPE_PACKET,
	.nxdom_md_subtype = NEXUS_META_SUBTYPE_RAW,
	.nxdom_name = "netif",
	/* at least two ports: dev and host (see NEXUS_PORT_NET_IF_*) */
	.nxdom_ports = {
		.nb_def = 2,
		.nb_min = 2,
		.nb_max = NX_NETIF_MAXPORTS,
	},
	.nxdom_tx_rings = {
		.nb_def = 1,
		.nb_min = 1,
		.nb_max = NX_NETIF_MAXRINGS,
	},
	.nxdom_rx_rings = {
		.nb_def = 1,
		.nb_min = 1,
		.nb_max = NX_NETIF_MAXRINGS,
	},
	.nxdom_tx_slots = {
		.nb_def = NX_NETIF_TXRINGSIZE,
		.nb_min = NX_NETIF_MINSLOTS,
		.nb_max = NX_NETIF_MAXSLOTS,
	},
	.nxdom_rx_slots = {
		.nb_def = NX_NETIF_RXRINGSIZE,
		.nb_min = NX_NETIF_MINSLOTS,
		.nb_max = NX_NETIF_MAXSLOTS,
	},
	.nxdom_buf_size = {
		.nb_def = NX_NETIF_BUFSIZE,
		.nb_min = NX_NETIF_MINBUFSIZE,
		.nb_max = NX_NETIF_MAXBUFSIZE,
	},
	/* large buffers not used by this domain */
	.nxdom_large_buf_size = {
		.nb_def = 0,
		.nb_min = 0,
		.nb_max = 0,
	},
	.nxdom_meta_size = {
		.nb_def = NX_NETIF_UMD_SIZE,
		.nb_min = NX_NETIF_UMD_SIZE,
		.nb_max = NX_METADATA_USR_MAX_SZ,
	},
	.nxdom_stats_size = {
		.nb_def = 0,
		.nb_min = 0,
		.nb_max = NX_STATS_MAX_SZ,
	},
	.nxdom_pipes = {
		.nb_def = 0,
		.nb_min = 0,
		.nb_max = NX_UPIPE_MAXPIPES,
	},
	/* flow advisory and nexus advisory regions are off by default */
	.nxdom_flowadv_max = {
		.nb_def = 0,
		.nb_min = 0,
		.nb_max = NX_FLOWADV_MAX,
	},
	.nxdom_nexusadv_size = {
		.nb_def = 0,
		.nb_min = 0,
		.nb_max = NX_NEXUSADV_MAX_SZ,
	},
	.nxdom_capabilities = {
		.nb_def = NXPCAP_USER_CHANNEL,
		.nb_min = 0,
		.nb_max = NXPCAP_USER_CHANNEL,
	},
	.nxdom_qmap = {
		.nb_def = NEXUS_QMAP_TYPE_DEFAULT,
		.nb_min = NEXUS_QMAP_TYPE_DEFAULT,
		.nb_max = NEXUS_QMAP_TYPE_WMM,
	},
	.nxdom_max_frags = {
		.nb_def = NX_PBUF_FRAGS_DEFAULT,
		.nb_min = NX_PBUF_FRAGS_MIN,
		.nb_max = NX_PBUF_FRAGS_MAX,
	},
	/* domain lifecycle and channel callbacks */
	.nxdom_init = nx_netif_dom_init,
	.nxdom_terminate = nx_netif_dom_terminate,
	.nxdom_fini = nx_netif_dom_fini,
	.nxdom_find_port = NULL,
	.nxdom_port_is_reserved = NULL,
	.nxdom_bind_port = nx_netif_dom_bind_port,
	.nxdom_unbind_port = nx_netif_dom_unbind_port,
	.nxdom_connect = nx_netif_dom_connect,
	.nxdom_disconnect = nx_netif_dom_disconnect,
	.nxdom_defunct = nx_netif_dom_defunct,
	.nxdom_defunct_finalize = nx_netif_dom_defunct_finalize,
};
252
/* native netif domain provider (compat variant registered elsewhere) */
struct kern_nexus_domain_provider nx_netif_prov_s = {
	.nxdom_prov_name = NEXUS_PROVIDER_NET_IF,
	/*
	 * Don't install this as the default domain provider, i.e.
	 * NXDOMPROVF_DEFAULT flag not set; we want netif_compat
	 * provider to be the one handling userland-issued requests
	 * coming down thru nxprov_create() instead.
	 */
	.nxdom_prov_flags = 0,
	.nxdom_prov_cb = {
		.dp_cb_init = nx_netif_prov_init,
		.dp_cb_fini = nx_netif_prov_fini,
		.dp_cb_params = nx_netif_prov_params,
		.dp_cb_mem_new = nx_netif_prov_mem_new,
		.dp_cb_config = nx_netif_prov_config,
		.dp_cb_nx_ctor = nx_netif_prov_nx_ctor,
		.dp_cb_nx_dtor = nx_netif_prov_nx_dtor,
		.dp_cb_nx_mem_info = nx_netif_prov_nx_mem_info,
		.dp_cb_nx_mib_get = nx_netif_prov_nx_mib_get,
		.dp_cb_nx_stop = nx_netif_prov_nx_stop,
	},
};
275
/* ifnet-facing operations for native netif adapters */
struct nexus_ifnet_ops na_netif_ops = {
	.ni_finalize = na_netif_finalize,
	.ni_reap = nx_netif_reap,
	.ni_dequeue = nx_netif_native_tx_dequeue,
	.ni_get_len = nx_netif_native_tx_get_len,
	.ni_detach_notify = nx_netif_detach_notify
};
283
/* cap on packets dequeued per doorbell invocation (sysctl-tunable) */
#define NX_NETIF_DOORBELL_MAX_DEQUEUE 64
uint32_t nx_netif_doorbell_max_dequeue = NX_NETIF_DOORBELL_MAX_DEQUEUE;

#define NQ_TRANSFER_DECAY 2 /* ilog2 of EWMA decay rate (4) */
static uint32_t nq_transfer_decay = NQ_TRANSFER_DECAY;

#define NQ_ACCUMULATE_INTERVAL 2 /* 2 seconds */
static uint32_t nq_accumulate_interval = NQ_ACCUMULATE_INTERVAL;

/* netif queue stats collection is off by default (sysctl-tunable) */
static uint32_t nq_stat_enable = 0;
294
SYSCTL_EXTENSIBLE_NODE(_kern_skywalk, OID_AUTO, netif,
    CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Skywalk network interface");
/* debug/development-only tunables */
#if (DEVELOPMENT || DEBUG)
SYSCTL_STRING(_kern_skywalk_netif, OID_AUTO, sk_ll_prefix,
    CTLFLAG_RW | CTLFLAG_LOCKED, sk_ll_prefix, sizeof(sk_ll_prefix),
    "ifname prefix for enabling low latency support");
static uint32_t nx_netif_force_ifnet_start = 0;
SYSCTL_UINT(_kern_skywalk_netif, OID_AUTO, force_ifnet_start,
    CTLFLAG_RW | CTLFLAG_LOCKED, &nx_netif_force_ifnet_start, 0,
    "always use ifnet starter thread");
SYSCTL_UINT(_kern_skywalk_netif, OID_AUTO, doorbell_max_dequeue,
    CTLFLAG_RW | CTLFLAG_LOCKED, &nx_netif_doorbell_max_dequeue,
    NX_NETIF_DOORBELL_MAX_DEQUEUE,
    "max packets to dequeue in doorbell context");
SYSCTL_UINT(_kern_skywalk_netif, OID_AUTO, netif_queue_transfer_decay,
    CTLFLAG_RW | CTLFLAG_LOCKED, &nq_transfer_decay,
    NQ_TRANSFER_DECAY, "ilog2 of EWMA decay rate of netif queue transfers");
SYSCTL_UINT(_kern_skywalk_netif, OID_AUTO, netif_queue_stat_accumulate_interval,
    CTLFLAG_RW | CTLFLAG_LOCKED, &nq_accumulate_interval,
    NQ_ACCUMULATE_INTERVAL, "accumulation interval for netif queue stats");
#endif /* DEVELOPMENT || DEBUG */

SYSCTL_UINT(_kern_skywalk_netif, OID_AUTO, netif_queue_stat_enable,
    CTLFLAG_RW | CTLFLAG_LOCKED, &nq_stat_enable,
    0, "enable/disable stats collection for netif queue");
320
/* zones for the netif adapter and netif nexus private structures */
static ZONE_DEFINE(na_netif_zone, SKMEM_ZONE_PREFIX ".na.netif",
    sizeof(struct nexus_netif_adapter), ZC_ZFREE_CLEARMEM);

static ZONE_DEFINE(nx_netif_zone, SKMEM_ZONE_PREFIX ".nx.netif",
    sizeof(struct nx_netif), ZC_ZFREE_CLEARMEM);

/* allocation tags for the various netif subsystems */
#define SKMEM_TAG_NETIF_MIT "com.apple.skywalk.netif.mit"
static SKMEM_TAG_DEFINE(skmem_tag_netif_mit, SKMEM_TAG_NETIF_MIT);

#define SKMEM_TAG_NETIF_FILTER "com.apple.skywalk.netif.filter"
SKMEM_TAG_DEFINE(skmem_tag_netif_filter, SKMEM_TAG_NETIF_FILTER);

#define SKMEM_TAG_NETIF_FLOW "com.apple.skywalk.netif.flow"
SKMEM_TAG_DEFINE(skmem_tag_netif_flow, SKMEM_TAG_NETIF_FLOW);

#define SKMEM_TAG_NETIF_AGENT_FLOW "com.apple.skywalk.netif.agent_flow"
SKMEM_TAG_DEFINE(skmem_tag_netif_agent_flow, SKMEM_TAG_NETIF_AGENT_FLOW);

#define SKMEM_TAG_NETIF_LLINK "com.apple.skywalk.netif.llink"
SKMEM_TAG_DEFINE(skmem_tag_netif_llink, SKMEM_TAG_NETIF_LLINK);

#define SKMEM_TAG_NETIF_QSET "com.apple.skywalk.netif.qset"
SKMEM_TAG_DEFINE(skmem_tag_netif_qset, SKMEM_TAG_NETIF_QSET);

#define SKMEM_TAG_NETIF_LLINK_INFO "com.apple.skywalk.netif.llink_info"
SKMEM_TAG_DEFINE(skmem_tag_netif_llink_info, SKMEM_TAG_NETIF_LLINK_INFO);

/* use this for any temporary allocations */
#define SKMEM_TAG_NETIF_TEMP "com.apple.skywalk.netif.temp"
static SKMEM_TAG_DEFINE(skmem_tag_netif_temp, SKMEM_TAG_NETIF_TEMP);
351
/*
 * Domain init callback: registers the native and compat providers with
 * the netif domain and initializes GSO support.  Called with SK lock held,
 * exactly once (asserted via NEXUSDOMF_INITIALIZED).
 */
static void
nx_netif_dom_init(struct nxdom *nxdom)
{
	SK_LOCK_ASSERT_HELD();
	ASSERT(!(nxdom->nxdom_flags & NEXUSDOMF_INITIALIZED));

	/*
	 * Compile-time sanity checks: port numbering assumed elsewhere,
	 * and ordering of the mitigation-mode constants.
	 */
	_CASSERT(NEXUS_PORT_NET_IF_DEV == 0);
	_CASSERT(NEXUS_PORT_NET_IF_HOST == 1);
	_CASSERT(NEXUS_PORT_NET_IF_CLIENT == 2);
	_CASSERT(SK_NETIF_MIT_FORCE_OFF < SK_NETIF_MIT_FORCE_SIMPLE);
	_CASSERT(SK_NETIF_MIT_FORCE_SIMPLE < SK_NETIF_MIT_FORCE_ADVANCED);
	_CASSERT(SK_NETIF_MIT_FORCE_ADVANCED < SK_NETIF_MIT_AUTO);
	_CASSERT(SK_NETIF_MIT_AUTO == SK_NETIF_MIT_MAX);

	/* register the native netif provider */
	(void) nxdom_prov_add(nxdom, &nx_netif_prov_s);

	/* register the compat provider; it becomes the domain default */
	nx_netif_compat_init(nxdom);

	ASSERT(nxdom_prov_default[nxdom->nxdom_type] != NULL &&
	    strcmp(nxdom_prov_default[nxdom->nxdom_type]->nxdom_prov_name,
	    NEXUS_PROVIDER_NET_IF_COMPAT) == 0);

	netif_gso_init();
}
376
/*
 * Domain terminate callback: undoes nx_netif_dom_init() in reverse
 * order and removes every provider still registered with the domain.
 * Called with SK lock held.
 */
static void
nx_netif_dom_terminate(struct nxdom *nxdom)
{
	struct kern_nexus_domain_provider *nxdom_prov, *tnxdp;

	SK_LOCK_ASSERT_HELD();

	netif_gso_fini();
	nx_netif_compat_fini();

	/* _SAFE variant since nxdom_prov_del() unlinks the element */
	STAILQ_FOREACH_SAFE(nxdom_prov, &nxdom->nxdom_prov_head,
	    nxdom_prov_link, tnxdp) {
		(void) nxdom_prov_del(nxdom_prov);
	}
}
392
/* Domain fini callback: nothing to do for netif. */
static void
nx_netif_dom_fini(struct nxdom *nxdom)
{
#pragma unused(nxdom)
}
398
/*
 * Provider init callback; currently log-only.  The pragma covers
 * release builds where SK_D() may compile to nothing, leaving
 * nxdom_prov otherwise unreferenced.
 */
int
nx_netif_prov_init(struct kern_nexus_domain_provider *nxdom_prov)
{
#pragma unused(nxdom_prov)
	SK_D("initializing %s", nxdom_prov->nxdom_prov_name);
	return 0;
}
406
/*
 * Ring notify callback installed while the nexus is being stopped;
 * rejects all notifications with ENXIO (see nx_netif_prov_nx_stop()).
 */
static int
nx_netif_na_notify_drop(struct __kern_channel_ring *kring, struct proc *p,
    uint32_t flags)
{
#pragma unused(kring, p, flags)
	return ENXIO;
}
414
/*
 * Quiesce the dev-port adapter of a stopping nexus: put all rings in
 * drop mode, atomically swap every ring's notify callback to the
 * ENXIO-returning nx_netif_na_notify_drop(), and tear down / free any
 * TX/RX mitigation state.  Called with SK lock held.
 */
int
nx_netif_prov_nx_stop(struct kern_nexus *nx)
{
	uint32_t r;
	struct nexus_adapter *na = nx_port_get_na(nx, NEXUS_PORT_NET_IF_DEV);
	struct nexus_netif_adapter *nifna = (struct nexus_netif_adapter *)na;

	SK_LOCK_ASSERT_HELD();
	ASSERT(nx != NULL);

	/* place all rings in drop mode */
	na_kr_drop(na, TRUE);

	/* ensure global visibility */
	membar_sync();

	/* reset all TX notify callbacks */
	for (r = 0; r < na_get_nrings(na, NR_TX); r++) {
		/*
		 * CAS loop: retry until the current callback pointer is
		 * successfully replaced with the drop callback, so a
		 * concurrent reader never observes a torn update.
		 */
		while (!atomic_test_set_ptr(&na->na_tx_rings[r].ckr_na_notify,
		    ptrauth_nop_cast(void *, na->na_tx_rings[r].ckr_na_notify),
		    ptrauth_nop_cast(void *, &nx_netif_na_notify_drop))) {
			;
		}
		membar_sync();
		if (nifna->nifna_tx_mit != NULL) {
			nx_netif_mit_cleanup(&nifna->nifna_tx_mit[r]);
		}
	}
	/* free the per-ring TX mitigation array, if any */
	if (nifna->nifna_tx_mit != NULL) {
		skn_free_type_array(tx, struct nx_netif_mit,
		    na_get_nrings(na, NR_TX), nifna->nifna_tx_mit);
		nifna->nifna_tx_mit = NULL;
	}

	/* reset all RX notify callbacks (same pattern as TX above) */
	for (r = 0; r < na_get_nrings(na, NR_RX); r++) {
		while (!atomic_test_set_ptr(&na->na_rx_rings[r].ckr_na_notify,
		    ptrauth_nop_cast(void *, na->na_rx_rings[r].ckr_na_notify),
		    ptrauth_nop_cast(void *, &nx_netif_na_notify_drop))) {
			;
		}
		membar_sync();
		if (nifna->nifna_rx_mit != NULL) {
			nx_netif_mit_cleanup(&nifna->nifna_rx_mit[r]);
		}
	}
	/* free the per-ring RX mitigation array, if any */
	if (nifna->nifna_rx_mit != NULL) {
		skn_free_type_array(rx, struct nx_netif_mit,
		    na_get_nrings(na, NR_RX), nifna->nifna_rx_mit);
		nifna->nifna_rx_mit = NULL;
	}
	return 0;
}
468
/*
 * Shrink the compat netif ring sizes based on interface class, to save
 * memory: auxiliary (non-unit-0) cellular, Wi-Fi AP / AWDL / infra, and
 * USB Ethernet each get their own tunable ring sizes.  Other interfaces
 * keep the defaults.
 */
static inline void
nx_netif_compat_adjust_ring_size(struct nxprov_adjusted_params *adj,
    ifnet_t ifp)
{
	if (IFNET_IS_CELLULAR(ifp) && (ifp->if_unit != 0)) {
		/* auxiliary cellular; unit 0 is assumed the data interface */
		*(adj->adj_rx_slots) = sk_netif_compat_aux_cell_rx_ring_sz;
		*(adj->adj_tx_slots) = sk_netif_compat_aux_cell_tx_ring_sz;
	} else if (IFNET_IS_WIFI(ifp)) {
		/* "ap" interface name denotes the access-point interface */
		if (ifp->if_name[0] == 'a' && ifp->if_name[1] == 'p' &&
		    ifp->if_name[2] == '\0') {
			/* Wi-Fi Access Point */
			*(adj->adj_rx_slots) = sk_netif_compat_wap_rx_ring_sz;
			*(adj->adj_tx_slots) = sk_netif_compat_wap_tx_ring_sz;
		} else if (ifp->if_eflags & IFEF_AWDL) {
			/* AWDL */
			*(adj->adj_rx_slots) = sk_netif_compat_awdl_rx_ring_sz;
			*(adj->adj_tx_slots) = sk_netif_compat_awdl_tx_ring_sz;
		} else {
			/* Wi-Fi infrastructure */
			*(adj->adj_rx_slots) = sk_netif_compat_wif_rx_ring_sz;
			*(adj->adj_tx_slots) = sk_netif_compat_wif_tx_ring_sz;
		}
	} else if (IFNET_IS_ETHERNET(ifp)) {
#if !XNU_TARGET_OS_OSX
		/*
		 * On non-macOS platforms, treat all compat Ethernet
		 * interfaces as USB Ethernet with reduced ring sizes.
		 */
		*(adj->adj_rx_slots) = sk_netif_compat_usb_eth_rx_ring_sz;
		*(adj->adj_tx_slots) = sk_netif_compat_usb_eth_tx_ring_sz;
#else /* XNU_TARGET_OS_OSX */
		if (ifp->if_subfamily == IFNET_SUBFAMILY_USB) {
			*(adj->adj_rx_slots) =
			    sk_netif_compat_usb_eth_rx_ring_sz;
			*(adj->adj_tx_slots) =
			    sk_netif_compat_usb_eth_tx_ring_sz;
		}
#endif /* XNU_TARGET_OS_OSX */
	}
}
509
/*
 * Provider parameter-adjust hook invoked via nxprov_params_adjust().
 * Compat providers get memory-optimized parameters (unless userspace
 * direct access may be needed); native providers get an extra ring
 * pair for the host port plus the nexus advisory region.
 * Returns 0, or EINVAL if the buffer size is too small for native.
 */
static int
nx_netif_prov_params_adjust(const struct kern_nexus_domain_provider *nxdom_prov,
    const struct nxprov_params *nxp, struct nxprov_adjusted_params *adj)
{
	/*
	 * for netif compat adjust the following parameters for memory
	 * optimization:
	 * - change the size of buffer object to 128 bytes.
	 * - don't allocate rx ring for host port and tx ring for dev port.
	 * - for cellular interfaces other than pdp_ip0 reduce the ring size.
	 *   Assumption here is that pdp_ip0 is always used as the data
	 *   interface.
	 * - reduce the ring size for AWDL interface.
	 * - reduce the ring size for USB ethernet interface.
	 */
	if (strcmp(nxdom_prov->nxdom_prov_name,
	    NEXUS_PROVIDER_NET_IF_COMPAT) == 0) {
		/*
		 * Leave the parameters default if userspace access may be
		 * needed. We can't use skywalk_direct_allowed() here because
		 * the drivers have not attached yet.
		 */
		if (skywalk_netif_direct_enabled()) {
			goto done;
		}

		*(adj->adj_buf_size) = NETIF_COMPAT_BUF_SIZE;
		*(adj->adj_tx_rings) = 1;
		/* per-interface ring sizing; see adjust_ring_size() above */
		if (IF_INDEX_IN_RANGE(nxp->nxp_ifindex)) {
			ifnet_t ifp;
			ifnet_head_lock_shared();
			ifp = ifindex2ifnet[nxp->nxp_ifindex];
			ifnet_head_done();
			VERIFY(ifp != NULL);
			nx_netif_compat_adjust_ring_size(adj, ifp);
		}
	} else { /* netif native */
		/* logical-link mode uses minimal per-ring slot counts */
		if (nxp->nxp_flags & NXPF_NETIF_LLINK) {
			*(adj->adj_tx_slots) = NX_NETIF_MINSLOTS;
			*(adj->adj_rx_slots) = NX_NETIF_MINSLOTS;
		}
		/*
		 * Add another extra ring for host port.  Note that if the
		 * nexus isn't configured to use the same pbufpool for all of
		 * its ports, we'd end up allocating extra here.
		 * Not a big deal since that case isn't the default.
		 */
		*(adj->adj_tx_rings) += 1;
		*(adj->adj_rx_rings) += 1;

		if ((*(adj->adj_buf_size) < PKT_MAX_PROTO_HEADER_SIZE)) {
			SK_ERR("buf size too small, min (%d)",
			    PKT_MAX_PROTO_HEADER_SIZE);
			return EINVAL;
		}
		_CASSERT(sizeof(struct __kern_netif_intf_advisory) ==
		    NX_INTF_ADV_SIZE);
		*(adj->adj_nexusadv_size) = sizeof(struct netif_nexus_advisory);
	}
done:
	return 0;
}
572
/*
 * Provider params callback: validates/adjusts the requested nexus
 * parameters by delegating to nxprov_params_adjust() with the netif
 * domain and the netif-specific adjust hook above.
 */
int
nx_netif_prov_params(struct kern_nexus_domain_provider *nxdom_prov,
    const uint32_t req, const struct nxprov_params *nxp0,
    struct nxprov_params *nxp, struct skmem_region_params srp[SKMEM_REGIONS],
    uint32_t pp_region_config_flags)
{
	struct nxdom *nxdom = nxdom_prov->nxdom_prov_dom;

	return nxprov_params_adjust(nxdom_prov, req, nxp0, nxp, srp,
	    nxdom, nxdom, nxdom, pp_region_config_flags,
	    nx_netif_prov_params_adjust);
}
585
/*
 * Provider memory callback: creates the skmem arena (and packet buffer
 * pools) for an adapter of this nexus.  Compat adapters use truncated
 * buffers; the pool is kernel-only unless this interface is allowed
 * direct userspace access AND the nexus provides user channels.
 * Returns 0 on success, or an skmem error with na_arena left NULL.
 */
int
nx_netif_prov_mem_new(struct kern_nexus_domain_provider *nxdom_prov,
    struct kern_nexus *nx, struct nexus_adapter *na)
{
#pragma unused(nxdom_prov)
	int err = 0;
	boolean_t allow_direct;
	uint32_t pp_flags = 0;

	SK_DF(SK_VERB_NETIF,
	    "nx 0x%llx (\"%s\":\"%s\") na \"%s\" (0x%llx)", SK_KVA(nx),
	    NX_DOM(nx)->nxdom_name, nxdom_prov->nxdom_prov_name, na->na_name,
	    SK_KVA(na));

	ASSERT(na->na_arena == NULL);
	if ((na->na_type == NA_NETIF_COMPAT_DEV) ||
	    (na->na_type == NA_NETIF_COMPAT_HOST)) {
		pp_flags |= SKMEM_PP_FLAG_TRUNCATED_BUF;
	}
	/*
	 * We do this check to determine whether to create the extra
	 * regions needed for userspace access. This is per interface.
	 * NX_USER_CHANNEL_PROV() is systemwide so it can't be used.
	 */
	allow_direct = skywalk_netif_direct_allowed(na->na_name);

	/*
	 * Both ports (host and dev) share the same packet buffer pool;
	 * the first time a port gets opened will allocate the pp that
	 * gets stored in the nexus, which will then be used by any
	 * subsequent opens.
	 */
	if (!allow_direct || !NX_USER_CHANNEL_PROV(nx)) {
		pp_flags |= SKMEM_PP_FLAG_KERNEL_ONLY;
	}
	na->na_arena = skmem_arena_create_for_nexus(na,
	    NX_PROV(nx)->nxprov_region_params, &nx->nx_tx_pp,
	    &nx->nx_rx_pp, pp_flags, &nx->nx_adv, &err);
	/* exactly one of (arena, err) indicates failure */
	ASSERT(na->na_arena != NULL || err != 0);
	/* the TX pool's metadata type/subtype must match the domain's */
	ASSERT(nx->nx_tx_pp == NULL || (nx->nx_tx_pp->pp_md_type ==
	    NX_DOM(nx)->nxdom_md_type && nx->nx_tx_pp->pp_md_subtype ==
	    NX_DOM(nx)->nxdom_md_subtype));

	return err;
}
631
/*
 * Handle NXCFG_CMD_GET_LLINK_INFO: copy out a snapshot of this netif's
 * logical links and their queue sets into the caller-supplied buffer
 * (a struct nx_llink_info_req followed by nlir_llink_cnt entries).
 * Takes nif_llink_lock shared for the duration of the snapshot.
 * Returns 0, or ENOTSUP/ENXIO/ENOBUFS/ENOMEM/copy errors.
 */
SK_NO_INLINE_ATTRIBUTE
static int
nx_netif_get_llink_info(struct sockopt *sopt, struct kern_nexus *nx)
{
	struct nx_llink_info_req *nlir = NULL;
	struct nx_netif *nif;
	struct netif_llink *llink;
	uint16_t llink_cnt;
	size_t len, user_len;
	int err, i;

	nif = NX_NETIF_PRIVATE(nx);
	if (!NETIF_LLINK_ENABLED(nif)) {
		SK_ERR("llink mode not enabled");
		return ENOTSUP;
	}
	lck_rw_lock_shared(&nif->nif_llink_lock);
	llink_cnt = nif->nif_llink_cnt;
	if (llink_cnt == 0) {
		SK_ERR("zero llink cnt");
		err = ENXIO;
		goto done;
	}
	/* header plus one nx_llink_info entry per logical link */
	len = sizeof(*nlir) + (sizeof(struct nx_llink_info) * llink_cnt);
	/* preserve sopt_valsize because it gets overwritten by copyin */
	user_len = sopt->sopt_valsize;
	if (user_len < len) {
		SK_ERR("buffer too small");
		err = ENOBUFS;
		goto done;
	}
	nlir = sk_alloc_data(len, Z_WAITOK, skmem_tag_netif_llink_info);
	if (nlir == NULL) {
		SK_ERR("failed to allocate nlir");
		err = ENOMEM;
		goto done;
	}
	/* copy in just the fixed-size header to check the version */
	err = sooptcopyin(sopt, nlir, sizeof(*nlir), sizeof(*nlir));
	if (err != 0) {
		SK_ERR("copyin failed: %d", err);
		goto done;
	}
	if (nlir->nlir_version != NETIF_LLINK_INFO_VERSION) {
		SK_ERR("nlir version mismatch: %d != %d",
		    nlir->nlir_version, NETIF_LLINK_INFO_VERSION);
		err = ENOTSUP;
		goto done;
	}
	/* fill one entry per llink, and per-qset info within each */
	nlir->nlir_llink_cnt = llink_cnt;
	i = 0;
	STAILQ_FOREACH(llink, &nif->nif_llink_list, nll_link) {
		struct nx_llink_info *nli;
		struct netif_qset *qset;
		uint16_t qset_cnt;
		int j;

		nli = &nlir->nlir_llink[i];
		nli->nli_link_id = llink->nll_link_id;
		nli->nli_link_id_internal = llink->nll_link_id_internal;
		nli->nli_state = llink->nll_state;
		nli->nli_flags = llink->nll_flags;

		qset_cnt = llink->nll_qset_cnt;
		ASSERT(qset_cnt <= NETIF_LLINK_MAX_QSETS);
		nli->nli_qset_cnt = qset_cnt;

		j = 0;
		SLIST_FOREACH(qset, &llink->nll_qset_list, nqs_list) {
			struct nx_qset_info *nqi;

			nqi = &nli->nli_qset[j];
			nqi->nqi_id = qset->nqs_id;
			nqi->nqi_flags = qset->nqs_flags;
			nqi->nqi_num_rx_queues = qset->nqs_num_rx_queues;
			nqi->nqi_num_tx_queues = qset->nqs_num_tx_queues;
			j++;
		}
		ASSERT(j == qset_cnt);
		i++;
	}
	ASSERT(i == llink_cnt);
	/* restore the user-supplied length clobbered by copyin above */
	sopt->sopt_valsize = user_len;
	err = sooptcopyout(sopt, nlir, len);
	if (err != 0) {
		SK_ERR("sooptcopyout failed: %d", err);
	}
done:
	lck_rw_unlock_shared(&nif->nif_llink_lock);
	if (nlir != NULL) {
		sk_free_data(nlir, len);
	}
	return err;
}
725
/*
 * Provider config callback: dispatches nexus configuration commands
 * (attach/detach, flow add/del, llink info) issued against this nexus.
 * Requires the caller to hold the PRIV_SKYWALK_REGISTER_NET_IF
 * entitlement.  Called with SK lock held.
 */
int
nx_netif_prov_config(struct kern_nexus_domain_provider *nxdom_prov,
    struct kern_nexus *nx, struct nx_cfg_req *ncr, int sopt_dir,
    struct proc *p, kauth_cred_t cred)
{
#pragma unused(nxdom_prov)
	struct sockopt sopt;
	int err = 0;

	SK_LOCK_ASSERT_HELD();

	/* proceed only if the client possesses netif entitlement */
	if ((err = skywalk_priv_check_cred(p, cred,
	    PRIV_SKYWALK_REGISTER_NET_IF)) != 0) {
		goto done;
	}

	if (ncr->nc_req == USER_ADDR_NULL) {
		err = EINVAL;
		goto done;
	}

	/* to make life easier for handling copies */
	bzero(&sopt, sizeof(sopt));
	sopt.sopt_dir = sopt_dir;
	sopt.sopt_val = ncr->nc_req;
	sopt.sopt_valsize = ncr->nc_req_len;
	sopt.sopt_p = p;

	switch (ncr->nc_cmd) {
	case NXCFG_CMD_ATTACH:
	case NXCFG_CMD_DETACH: {
		struct nx_spec_req nsr;

		bzero(&nsr, sizeof(nsr));
		err = sooptcopyin(&sopt, &nsr, sizeof(nsr), sizeof(nsr));
		if (err != 0) {
			goto done;
		}

		/*
		 * Null-terminate in case this has an interface name;
		 * the union is already large enough for uuid_t.
		 */
		nsr.nsr_name[sizeof(nsr.nsr_name) - 1] = '\0';
		/* userland callers may only pass sanctioned flag bits */
		if (p != kernproc) {
			nsr.nsr_flags &= NXSPECREQ_MASK;
		}

		err = nx_netif_ctl(nx, ncr->nc_cmd, &nsr, p);
		if (err != 0) {
			goto done;
		}

		/* XXX: [email protected] -- can this copyout fail? */
		(void) sooptcopyout(&sopt, &nsr, sizeof(nsr));
		break;
	}
	case NXCFG_CMD_FLOW_ADD:
	case NXCFG_CMD_FLOW_DEL: {
		/* kernel-only fields must not extend past the common area */
		_CASSERT(offsetof(struct nx_flow_req, _nfr_kernel_field_end) ==
		    offsetof(struct nx_flow_req, _nfr_common_field_end));
		struct nx_flow_req nfr;

		bzero(&nfr, sizeof(nfr));
		err = sooptcopyin(&sopt, &nfr, sizeof(nfr), sizeof(nfr));
		if (err != 0) {
			goto done;
		}

		err = nx_netif_ctl(nx, ncr->nc_cmd, &nfr, p);
		if (err != 0) {
			goto done;
		}

		/* XXX: [email protected] -- can this copyout fail? */
		(void) sooptcopyout(&sopt, &nfr, sizeof(nfr));
		break;
	}
	case NXCFG_CMD_GET_LLINK_INFO: {
		err = nx_netif_get_llink_info(&sopt, nx);
		break;
	}
	default:
		err = EINVAL;
		goto done;
	}
done:
	SK_DF(err ? SK_VERB_ERROR : SK_VERB_NETIF,
	    "nexus 0x%llx (%s) cmd %d err %d", SK_KVA(nx),
	    NX_DOM_PROV(nx)->nxdom_prov_name, ncr->nc_cmd, err);
	return err;
}
819
/*
 * Provider fini callback; log-only.  The pragma covers release builds
 * where SK_D() may compile to nothing.
 */
void
nx_netif_prov_fini(struct kern_nexus_domain_provider *nxdom_prov)
{
#pragma unused(nxdom_prov)
	SK_D("destroying %s", nxdom_prov->nxdom_prov_name);
}
826
827 int
nx_netif_prov_nx_ctor(struct kern_nexus * nx)828 nx_netif_prov_nx_ctor(struct kern_nexus *nx)
829 {
830 struct nx_netif *n;
831 char name[64];
832 int error;
833
834 SK_LOCK_ASSERT_HELD();
835 ASSERT(nx->nx_arg == NULL);
836
837 SK_D("nexus 0x%llx (%s)", SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name);
838
839 nx->nx_arg = nx_netif_alloc(Z_WAITOK);
840 n = NX_NETIF_PRIVATE(nx);
841 if (NX_USER_CHANNEL_PROV(nx) &&
842 NX_PROV(nx)->nxprov_params->nxp_nexusadv_size != 0) {
843 (void) snprintf(name, sizeof(name), "netif_%llu", nx->nx_id);
844 error = nx_advisory_alloc(nx, name,
845 &NX_PROV(nx)->nxprov_region_params[SKMEM_REGION_NEXUSADV],
846 NEXUS_ADVISORY_TYPE_NETIF);
847 if (error != 0) {
848 nx_netif_free(n);
849 return error;
850 }
851 }
852 n->nif_nx = nx;
853 SK_D("create new netif 0x%llx for nexus 0x%llx",
854 SK_KVA(NX_NETIF_PRIVATE(nx)), SK_KVA(nx));
855 return 0;
856 }
857
/*
 * Nexus destructor: frees the advisory region, detaches any interface
 * still attached to the dev port, releases the dev/host nexus bindings,
 * and finally frees the nx_netif private state.  Called with SK lock
 * held; mirrors nx_netif_prov_nx_ctor().
 */
void
nx_netif_prov_nx_dtor(struct kern_nexus *nx)
{
	struct nx_netif *n = NX_NETIF_PRIVATE(nx);

	SK_LOCK_ASSERT_HELD();

	SK_D("nexus 0x%llx (%s) netif 0x%llx", SK_KVA(nx),
	    NX_DOM_PROV(nx)->nxdom_prov_name, SK_KVA(n));

	/*
	 * XXX
	 * detach should be done separately to be symmetrical with attach.
	 */
	nx_advisory_free(nx);
	if (nx_port_get_na(nx, NEXUS_PORT_NET_IF_DEV) != NULL) {
		/* we're called by nx_detach(), so this cannot fail */
		int err = nx_netif_ctl_detach(nx, NULL);
		VERIFY(err == 0);
	}
	/* release dev/host port bindings, if any */
	if (n->nif_dev_nxb != NULL) {
		nxb_free(n->nif_dev_nxb);
		n->nif_dev_nxb = NULL;
	}
	if (n->nif_host_nxb != NULL) {
		nxb_free(n->nif_host_nxb);
		n->nif_host_nxb = NULL;
	}
	SK_DF(SK_VERB_NETIF, "marking netif 0x%llx as free", SK_KVA(n));
	nx_netif_free(n);
	nx->nx_arg = NULL;
}
890
891 int
nx_netif_prov_nx_mem_info(struct kern_nexus * nx,struct kern_pbufpool ** tpp,struct kern_pbufpool ** rpp)892 nx_netif_prov_nx_mem_info(struct kern_nexus *nx, struct kern_pbufpool **tpp,
893 struct kern_pbufpool **rpp)
894 {
895 ASSERT(nx->nx_tx_pp != NULL);
896 ASSERT(nx->nx_rx_pp != NULL);
897
898 if (tpp != NULL) {
899 *tpp = nx->nx_tx_pp;
900 }
901 if (rpp != NULL) {
902 *rpp = nx->nx_rx_pp;
903 }
904
905 return 0;
906 }
907
908 static size_t
__netif_mib_get_stats(struct kern_nexus * nx,void * out,size_t len)909 __netif_mib_get_stats(struct kern_nexus *nx, void *out, size_t len)
910 {
911 struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
912 struct ifnet *ifp = nif->nif_ifp;
913 struct sk_stats_net_if *sns = out;
914 size_t actual_space = sizeof(struct sk_stats_net_if);
915
916 if (out != NULL && actual_space <= len) {
917 uuid_copy(sns->sns_nx_uuid, nx->nx_uuid);
918 if (ifp != NULL) {
919 (void) strlcpy(sns->sns_if_name, if_name(ifp), IFNAMSIZ);
920 }
921 sns->sns_nifs = nif->nif_stats;
922 }
923
924 return actual_space;
925 }
926
/*
 * MIB handler for NXMIB_LLINK_LIST.  Returns the space needed for one
 * nx_llink_info record per logical link; fills the caller's buffer only
 * if it is large enough.  Returns 0 when logical links are disabled.
 * The llink list is walked under the shared nif_llink_lock.
 */
static size_t
__netif_mib_get_llinks(struct kern_nexus *nx, void *out, size_t len)
{
	struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
	struct nx_llink_info *nli_list = out;
	size_t actual_space = 0;
	if (NETIF_LLINK_ENABLED(nif)) {
		lck_rw_lock_shared(&nif->nif_llink_lock);
		actual_space += nif->nif_llink_cnt * sizeof(struct nx_llink_info);

		/* fill only when a sufficiently large buffer was supplied */
		if (out != NULL && actual_space <= len) {
			struct netif_llink *llink;
			int i = 0;
			STAILQ_FOREACH(llink, &nif->nif_llink_list, nll_link) {
				struct nx_llink_info *nli;
				struct netif_qset *qset;
				uint16_t qset_cnt;
				int j;

				/* one record per logical link */
				nli = &nli_list[i];
				uuid_copy(nli->nli_netif_uuid, nx->nx_uuid);
				nli->nli_link_id = llink->nll_link_id;
				nli->nli_link_id_internal = llink->nll_link_id_internal;
				nli->nli_state = llink->nll_state;
				nli->nli_flags = llink->nll_flags;

				qset_cnt = llink->nll_qset_cnt;
				ASSERT(qset_cnt <= NETIF_LLINK_MAX_QSETS);
				nli->nli_qset_cnt = qset_cnt;

				/* embed a summary of each qset of this llink */
				j = 0;
				SLIST_FOREACH(qset, &llink->nll_qset_list, nqs_list) {
					struct nx_qset_info *nqi;

					nqi = &nli->nli_qset[j];
					nqi->nqi_id = qset->nqs_id;
					nqi->nqi_flags = qset->nqs_flags;
					nqi->nqi_num_rx_queues = qset->nqs_num_rx_queues;
					nqi->nqi_num_tx_queues = qset->nqs_num_tx_queues;
					j++;
				}
				ASSERT(j == qset_cnt);
				i++;
			}
			ASSERT(i == nif->nif_llink_cnt);
		}
		lck_rw_unlock_shared(&nif->nif_llink_lock);
	}

	return actual_space;
}
978
/*
 * MIB handler for NXMIB_NETIF_QUEUE_STATS.  Two passes under the shared
 * nif_llink_lock: first compute the space needed (one netif_qstats_info
 * per RX/TX queue across all qsets of all llinks), then fill the output
 * buffer if it is large enough.  A NULL out acts as a size probe.
 */
static size_t
__netif_mib_get_queue_stats(struct kern_nexus *nx, void *out, size_t len)
{
	struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
	uint8_t *itr = out;
	size_t actual_space = 0;
	if (!NETIF_LLINK_ENABLED(nif)) {
		return actual_space;
	}

	lck_rw_lock_shared(&nif->nif_llink_lock);
	/* pass 1: size the output */
	struct netif_llink *llink;
	struct netif_qset *qset;
	STAILQ_FOREACH(llink, &nif->nif_llink_list, nll_link) {
		SLIST_FOREACH(qset, &llink->nll_qset_list, nqs_list) {
			actual_space += sizeof(struct netif_qstats_info) *
			    (qset->nqs_num_rx_queues + qset->nqs_num_tx_queues);
		}
	}
	/* size probe, or buffer too small: report size only */
	if (out == NULL || actual_space > len) {
		lck_rw_unlock_shared(&nif->nif_llink_lock);
		return actual_space;
	}

	/* pass 2: emit one record per driver queue */
	llink = NULL;
	qset = NULL;
	uint16_t i = 0, j = 0;
	STAILQ_FOREACH(llink, &nif->nif_llink_list, nll_link) {
		uint16_t qset_cnt;
		j = 0;
		qset_cnt = llink->nll_qset_cnt;
		ASSERT(qset_cnt <= NETIF_LLINK_MAX_QSETS);
		SLIST_FOREACH(qset, &llink->nll_qset_list, nqs_list) {
			/* RX and TX queues live in one contiguous array */
			int queue_cnt = qset->nqs_num_rx_queues +
			    qset->nqs_num_tx_queues;
			for (uint16_t k = 0; k < queue_cnt; k++) {
				struct netif_qstats_info *nqi =
				    (struct netif_qstats_info *)(void *)itr;
				struct netif_queue *nq = &qset->nqs_driver_queues[k];
				nqi->nqi_qset_id = qset->nqs_id;
				nqi->nqi_queue_idx = k;
				if (KPKT_VALID_SVC(nq->nq_svc)) {
					nqi->nqi_svc = nq->nq_svc;
				}
				if (nq->nq_flags & NETIF_QUEUE_IS_RX) {
					nqi->nqi_queue_flag = NQI_QUEUE_FLAG_IS_RX;
				}

				/* snapshot of the queue's counters */
				struct netif_qstats *nq_out = &nqi->nqi_stats;
				struct netif_qstats *nq_src = &nq->nq_stats;
				memcpy(nq_out, nq_src, sizeof(struct netif_qstats));

				itr += sizeof(struct netif_qstats_info);
			}
			j++;
		}
		ASSERT(j == qset_cnt);
		i++;
	}
	ASSERT(i == nif->nif_llink_cnt);

	lck_rw_unlock_shared(&nif->nif_llink_lock);
	return actual_space;
}
1043
1044 size_t
nx_netif_prov_nx_mib_get(struct kern_nexus * nx,struct nexus_mib_filter * filter,void * out,size_t len,struct proc * p)1045 nx_netif_prov_nx_mib_get(struct kern_nexus *nx, struct nexus_mib_filter *filter,
1046 void *out, size_t len, struct proc *p)
1047 {
1048 #pragma unused(p)
1049 size_t ret;
1050
1051 if ((filter->nmf_bitmap & NXMIB_FILTER_NX_UUID) &&
1052 (uuid_compare(filter->nmf_nx_uuid, nx->nx_uuid)) != 0) {
1053 return 0;
1054 }
1055
1056 switch (filter->nmf_type) {
1057 case NXMIB_NETIF_STATS:
1058 ret = __netif_mib_get_stats(nx, out, len);
1059 break;
1060 case NXMIB_LLINK_LIST:
1061 ret = __netif_mib_get_llinks(nx, out, len);
1062 break;
1063 case NXMIB_NETIF_QUEUE_STATS:
1064 ret = __netif_mib_get_queue_stats(nx, out, len);
1065 break;
1066 default:
1067 ret = 0;
1068 break;
1069 }
1070 return ret;
1071 }
1072
/*
 * Bind a client to a netif nexus port, either the specific port the
 * caller asked for or the first free one in the client-port range.
 * On success *nx_port holds the bound port.  Runs under the netif
 * writer lock to serialize against other bind/unbind operations.
 */
static int
nx_netif_dom_bind_port(struct kern_nexus *nx, nexus_port_t *nx_port,
    struct nxbind *nxb, void *info)
{
	struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
	nexus_port_t first, last, port;
	int error;

	ASSERT(nx_port != NULL);
	ASSERT(nxb != NULL);

	port = *nx_port;

	/*
	 * If port is:
	 * != NEXUS_PORT_ANY: attempt to bind to the specified port
	 * == NEXUS_PORT_ANY: find an available port, bind to it, and
	 * return back the assigned port.
	 */
	first = NEXUS_PORT_NET_IF_CLIENT;
	ASSERT(NXDOM_MAX(NX_DOM(nx), ports) <= NEXUS_PORT_MAX);
	last = (nexus_port_size_t)NXDOM_MAX(NX_DOM(nx), ports);
	ASSERT(first <= last);

	NETIF_WLOCK(nif);

	/* first == last means the domain exposes no client ports */
	if (__improbable(first == last)) {
		error = ENOMEM;
	} else if (port != NEXUS_PORT_ANY) {
		error = nx_port_bind_info(nx, port, nxb, info);
		SK_DF(SK_VERB_NETIF, "port %d, bind err %d", port, error);
	} else {
		/* search [first, last - 1] for an available port */
		error = nx_port_find(nx, first, last - 1, &port);
		ASSERT(error != 0 || (port >= first && port < last));
		if (error == 0) {
			error = nx_port_bind_info(nx, port, nxb, info);
			SK_DF(SK_VERB_NETIF, "found port %d, bind err %d",
			    port, error);
		}
	}
	NETIF_WUNLOCK(nif);

	ASSERT(*nx_port == NEXUS_PORT_ANY || *nx_port == port);
	if (error == 0) {
		*nx_port = port;
	}

	SK_DF(error ? SK_VERB_ERROR : SK_VERB_NETIF,
	    "+++ netif 0x%llx nx_port %d, total %u active %u (err %d)",
	    SK_KVA(nif), (int)*nx_port, NX_NETIF_MAXPORTS,
	    nx->nx_active_ports, error);

	return error;
}
1127
1128 static int
nx_netif_dom_unbind_port(struct kern_nexus * nx,nexus_port_t nx_port)1129 nx_netif_dom_unbind_port(struct kern_nexus *nx, nexus_port_t nx_port)
1130 {
1131 struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
1132 int error = 0;
1133
1134 ASSERT(nx_port != NEXUS_PORT_ANY);
1135
1136 NETIF_WLOCK(nif);
1137 error = nx_port_unbind(nx, nx_port);
1138 NETIF_WUNLOCK(nif);
1139
1140 return error;
1141 }
1142
/*
 * Connect a channel to a netif nexus port.  Validates the port/mode
 * combination (dev port may not be opened in host mode; host port is
 * forced into host mode for user channels), tags the channel flags,
 * then hands off to the kernel (na_connect_spec) or user (na_connect)
 * connect path.  On success, pins the kernel slot descriptor region.
 */
static int
nx_netif_dom_connect(struct kern_nexus_domain_provider *nxdom_prov,
    struct kern_nexus *nx, struct kern_channel *ch, struct chreq *chr,
    struct kern_channel *ch0, struct nxbind *nxb, struct proc *p)
{
#pragma unused(nxdom_prov)
	int err = 0;

	SK_LOCK_ASSERT_HELD();

	ASSERT(NX_DOM_PROV(nx) == nxdom_prov);
	ASSERT(nx->nx_prov->nxprov_params->nxp_type ==
	    nxdom_prov->nxdom_prov_dom->nxdom_type &&
	    nx->nx_prov->nxprov_params->nxp_type == NEXUS_TYPE_NET_IF);
	ASSERT(!(ch->ch_flags & CHANF_HOST));

	switch (chr->cr_port) {
	case NEXUS_PORT_NET_IF_DEV:
		/* the device port cannot be opened in host mode */
		if (chr->cr_mode & CHMODE_HOST) {
			err = EINVAL;
			goto done;
		}
		break;

	case NEXUS_PORT_NET_IF_HOST:
		if (!(chr->cr_mode & CHMODE_HOST)) {
			/* kernel channels must ask for host mode explicitly */
			if (ch->ch_flags & CHANF_KERNEL) {
				err = EINVAL;
				goto done;
			}
			chr->cr_mode |= CHMODE_HOST;
		}
		/*
		 * This channel is exclusively opened to the host
		 * rings; don't notify the external provider.
		 */
		atomic_bitset_32(&ch->ch_flags, CHANF_HOST | CHANF_EXT_SKIP);
		break;

	default:
		/*
		 * This channel is shared between netif and user process;
		 * don't notify the external provider.
		 */
		atomic_bitset_32(&ch->ch_flags, CHANF_EXT_SKIP);
		break;
	}

	chr->cr_ring_set = RING_SET_DEFAULT;
	chr->cr_real_endpoint = chr->cr_endpoint = CH_ENDPOINT_NET_IF;
	(void) snprintf(chr->cr_name, sizeof(chr->cr_name), "netif:%llu:%.*s",
	    nx->nx_id, (int)nx->nx_prov->nxprov_params->nxp_namelen,
	    nx->nx_prov->nxprov_params->nxp_name);

	/* kernel channels use the special (in-kernel) connect path */
	if (ch->ch_flags & CHANF_KERNEL) {
		err = na_connect_spec(nx, ch, chr, p);
	} else {
		err = na_connect(nx, ch, chr, ch0, nxb, p);
	}

	if (err == 0) {
		/*
		 * Mark the kernel slot descriptor region as busy; this
		 * prevents it from being torn-down at channel defunct
		 * time, as the (external) nexus owner may be calling
		 * KPIs that require accessing the slots.
		 */
		skmem_arena_nexus_sd_set_noidle(
			skmem_arena_nexus(ch->ch_na->na_arena), 1);
	}

done:
	return err;
}
1217
/*
 * Disconnect a channel from a netif nexus.  Drops the slot-descriptor
 * busy assertion taken at connect time, then routes to the kernel or
 * user disconnect path depending on how the channel was opened.
 */
static void
nx_netif_dom_disconnect(struct kern_nexus_domain_provider *nxdom_prov,
    struct kern_nexus *nx, struct kern_channel *ch)
{
#pragma unused(nxdom_prov)
	SK_LOCK_ASSERT_HELD();

	SK_D("channel 0x%llx -!- nexus 0x%llx (%s:\"%s\":%u:%d)", SK_KVA(ch),
	    SK_KVA(nx), nxdom_prov->nxdom_prov_name, ch->ch_na->na_name,
	    ch->ch_info->cinfo_nx_port, (int)ch->ch_info->cinfo_ch_ring_id);

	/*
	 * Release busy assertion held earlier in nx_netif_dom_connect();
	 * this allows for the final arena teardown to succeed.
	 */
	skmem_arena_nexus_sd_set_noidle(
		skmem_arena_nexus(ch->ch_na->na_arena), -1);

	/* mirror the split made at connect time */
	if (ch->ch_flags & CHANF_KERNEL) {
		na_disconnect_spec(nx, ch);
	} else {
		na_disconnect(nx, ch);
	}
}
1242
/*
 * First phase of channel defunct for netif: with the channel lock held,
 * defunct the channel's rings.  Only user channels (never CHANF_KERNEL)
 * go through defunct, and only on the netif adapter types listed below.
 */
static void
nx_netif_dom_defunct(struct kern_nexus_domain_provider *nxdom_prov,
    struct kern_nexus *nx, struct kern_channel *ch, struct proc *p)
{
#pragma unused(nxdom_prov, nx)
	LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
	ASSERT(!(ch->ch_flags & CHANF_KERNEL));
	ASSERT(ch->ch_na->na_type == NA_NETIF_DEV ||
	    ch->ch_na->na_type == NA_NETIF_HOST ||
	    ch->ch_na->na_type == NA_NETIF_COMPAT_DEV ||
	    ch->ch_na->na_type == NA_NETIF_COMPAT_HOST ||
	    ch->ch_na->na_type == NA_NETIF_VP);

	na_ch_rings_defunct(ch, p);
}
1258
/*
 * Second phase of channel defunct: finalize the adapter teardown.
 * Callable in two locking modes — when 'locked' the caller already
 * holds SK_LOCK and the channel lock; otherwise this routine takes
 * SK_LOCK itself (and the channel lock must NOT be held).
 */
static void
nx_netif_dom_defunct_finalize(struct kern_nexus_domain_provider *nxdom_prov,
    struct kern_nexus *nx, struct kern_channel *ch, boolean_t locked)
{
#pragma unused(nxdom_prov)
	struct ifnet *ifp;

	if (!locked) {
		SK_LOCK_ASSERT_NOTHELD();
		SK_LOCK();
		LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_NOTOWNED);
	} else {
		SK_LOCK_ASSERT_HELD();
		LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
	}

	ASSERT(ch->ch_na->na_type == NA_NETIF_DEV ||
	    ch->ch_na->na_type == NA_NETIF_HOST ||
	    ch->ch_na->na_type == NA_NETIF_COMPAT_DEV ||
	    ch->ch_na->na_type == NA_NETIF_COMPAT_HOST ||
	    ch->ch_na->na_type == NA_NETIF_VP);

	na_defunct(nx, ch, ch->ch_na, locked);
	ifp = ch->ch_na->na_ifp;
	if (ch->ch_na->na_type == NA_NETIF_VP && ifp != NULL &&
	    ifnet_is_low_latency(ifp)) {
		/*
		 * We release the VPNA's ifp here instead of waiting for the
		 * application to close the channel to trigger the release.
		 */
		DTRACE_SKYWALK2(release__vpna__ifp, struct nexus_adapter *,
		    ch->ch_na, struct ifnet *, ifp);
		ifnet_decr_iorefcnt(ifp);
		ch->ch_na->na_ifp = NULL;
	}
	SK_D("%s(%d): ch 0x%llx -/- nx 0x%llx (%s:\"%s\":%u:%d)",
	    ch->ch_name, ch->ch_pid, SK_KVA(ch), SK_KVA(nx),
	    nxdom_prov->nxdom_prov_name, ch->ch_na->na_name,
	    ch->ch_info->cinfo_nx_port, (int)ch->ch_info->cinfo_ch_ring_id);

	/* restore the locking state we were entered with */
	if (!locked) {
		LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_NOTOWNED);
		SK_UNLOCK();
	} else {
		LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
		SK_LOCK_ASSERT_HELD();
	}
}
1307
/*
 * Allocate a zero-filled netif nexus adapter from its dedicated zone.
 * The embedded nexus_adapter (nifna_up) must be the first member so
 * the two struct types can be cast into each other; the _CASSERT
 * enforces that layout at compile time.
 */
struct nexus_netif_adapter *
na_netif_alloc(zalloc_flags_t how)
{
	_CASSERT(offsetof(struct nexus_netif_adapter, nifna_up) == 0);

	return zalloc_flags(na_netif_zone, how | Z_ZERO);
}
1315
/*
 * Return a netif nexus adapter to its zone.  The adapter must be fully
 * quiesced: zero refcount and no TX/RX mitigation state remaining.
 * The structure is scrubbed before being freed.
 */
void
na_netif_free(struct nexus_adapter *na)
{
	struct nexus_netif_adapter *nifna = (struct nexus_netif_adapter *)na;

	SK_LOCK_ASSERT_HELD();
	SK_DF(SK_VERB_MEM, "nifna 0x%llx FREE", SK_KVA(nifna));

	ASSERT(na->na_refcount == 0);
	ASSERT(nifna->nifna_tx_mit == NULL);
	ASSERT(nifna->nifna_rx_mit == NULL);
	bzero(nifna, sizeof(*nifna));

	zfree(na_netif_zone, nifna);
}
1331
/*
 * Process NXCFG_CMD_ATTACH: attach a network interface to this netif
 * nexus.  The interface is identified by name or (kernel callers only)
 * by ifnet pointer — never by UUID.  A native interface must attach to
 * the native provider and a legacy one to the compat provider.  On
 * success a fresh adapter UUID is generated and returned via nsr.
 */
SK_NO_INLINE_ATTRIBUTE
static int
nx_netif_ctl_attach(struct kern_nexus *nx, struct nx_spec_req *nsr,
    struct proc *p)
{
	struct nx_netif *n = NX_NETIF_PRIVATE(nx);
	struct ifnet *ifp = NULL;
	boolean_t compat;
	int err = 0;

	SK_LOCK_ASSERT_HELD();

	ASSERT(NX_DOM(nx)->nxdom_type == NEXUS_TYPE_NET_IF);
	compat = (strcmp(NX_DOM_PROV(nx)->nxdom_prov_name,
	    NEXUS_PROVIDER_NET_IF_COMPAT) == 0);

	uuid_clear(nsr->nsr_if_uuid);
	/*
	 * The netif accepts either an interface name or a pointer to
	 * an ifnet, but never a UUID.
	 */
	if (nsr->nsr_flags & NXSPECREQ_UUID) {
		err = EINVAL;
		goto done;
	}
	if (nsr->nsr_flags & NXSPECREQ_IFP) {
		/* raw ifnet pointers are accepted from the kernel only */
		if (p != kernproc || (ifp = nsr->nsr_ifp) == NULL) {
			err = EINVAL;
			goto done;
		}
	} else if ((ifp = ifunit_ref(nsr->nsr_name)) == NULL) {
		err = ENXIO;
		goto done;
	}

	if ((compat && SKYWALK_NATIVE(ifp)) ||
	    (!compat && !SKYWALK_NATIVE(ifp))) {
		/* native driver for netif; non-native for netif_compat */
		err = ENODEV;
	} else if (ifp->if_na != NULL || !uuid_is_null(n->nif_uuid)) {
		/* interface or nexus already has an attachment */
		err = EBUSY;
	} else {
		ASSERT(uuid_is_null(n->nif_uuid));
		/*
		 * Upon success, callee will hold its own ifnet iorefcnt
		 * as well as a retain count on the nexus adapter.
		 */
		if (compat) {
			err = nx_netif_compat_attach(nx, ifp);
		} else {
			err = nx_netif_attach(nx, ifp);
		}

		if (err == 0) {
			/* return the adapter UUID */
			uuid_generate_random(n->nif_uuid);
			uuid_copy(nsr->nsr_if_uuid, n->nif_uuid);
#if (DEVELOPMENT || DEBUG)
			skoid_create(&n->nif_skoid,
			    SKOID_SNODE(_kern_skywalk_netif), if_name(ifp),
			    CTLFLAG_RW);
#endif /* DEVELOPMENT || DEBUG */
		}
	}
done:
	/* drop I/O refcnt from ifunit_ref() */
	if (ifp != NULL && !(nsr->nsr_flags & NXSPECREQ_IFP)) {
		ifnet_decr_iorefcnt(ifp);
	}

#if SK_LOG
	uuid_string_t uuidstr, ifuuidstr;
	const char *nustr;
	if (nsr->nsr_flags & NXSPECREQ_UUID) {
		nustr = sk_uuid_unparse(nsr->nsr_uuid, uuidstr);
	} else if (nsr->nsr_flags & NXSPECREQ_IFP) {
		(void) snprintf((char *)uuidstr, sizeof(uuidstr), "0x%llx",
		    SK_KVA(nsr->nsr_ifp));
		nustr = uuidstr;
	} else {
		nustr = nsr->nsr_name;
	}
	SK_DF(err ? SK_VERB_ERROR : SK_VERB_NETIF,
	    "nexus 0x%llx (%s) name/uuid \"%s\" if_uuid %s flags 0x%x err %d",
	    SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name, nustr,
	    sk_uuid_unparse(nsr->nsr_if_uuid, ifuuidstr), nsr->nsr_flags, err);
#endif /* SK_LOG */

	return err;
}
1423
/*
 * Tear down the interface attachment of a netif instance: quiesce
 * datamov if requested, run the per-subsystem fini routines, drop the
 * adapter references taken at attach time, and sever the ifnet <->
 * netif linkage.  Returns EALREADY if nothing is attached.
 */
SK_NO_INLINE_ATTRIBUTE
static int
nx_netif_clean(struct nx_netif *nif, boolean_t quiesce_needed)
{
	struct kern_nexus *nx = nif->nif_nx;
	struct ifnet *ifp;
	boolean_t suspended = FALSE;

	ifp = nif->nif_ifp;
	if (ifp == NULL) {
		return EALREADY;
	}
	/*
	 * For regular kernel-attached interfaces, quiescing is handled by
	 * the ifnet detach thread, which calls dlil_quiesce_and_detach_nexuses().
	 * For interfaces created by skywalk test cases, flowswitch/netif nexuses
	 * are constructed on the fly and can also be torn down on the fly.
	 * dlil_quiesce_and_detach_nexuses() won't help here because any nexus
	 * can be detached while the interface is still attached.
	 */
	if (quiesce_needed && ifnet_datamov_suspend_if_needed(ifp)) {
		/* drop SK_LOCK while waiting for in-flight datamov to drain */
		SK_UNLOCK();
		suspended = TRUE;
		ifnet_datamov_drain(ifp);
		SK_LOCK();
	}
	nx_netif_agent_fini(nif);
	nx_netif_capabilities_fini(nif);
	nx_netif_flow_fini(nif);
	nx_netif_filter_fini(nif);
	nx_netif_llink_fini(nif);
	nx_netif_flags_fini(nif);

	uuid_clear(nif->nif_uuid);
	/* nx_netif_{compat_}attach() held both references */
	na_release_locked(nx_port_get_na(nx, NEXUS_PORT_NET_IF_DEV));
	na_release_locked(nx_port_get_na(nx, NEXUS_PORT_NET_IF_HOST));
	nx_port_free(nx, NEXUS_PORT_NET_IF_DEV);
	nx_port_free(nx, NEXUS_PORT_NET_IF_HOST);

	/* sever the ifnet <-> netif linkage */
	ifp->if_na_ops = NULL;
	ifp->if_na = NULL;
	nif->nif_ifp = NULL;
	nif->nif_netif_nxadv = NULL;
	SKYWALK_CLEAR_CAPABLE(ifp);
	/* resume only if we suspended above */
	if (suspended) {
		ifnet_datamov_resume(ifp);
	}

#if (DEVELOPMENT || DEBUG)
	skoid_destroy(&nif->nif_skoid);
#endif /* DEVELOPMENT || DEBUG */
	return 0;
}
1478
/*
 * process NXCFG_CMD_DETACH: detach the interface from this netif
 * nexus.  With a non-NULL nsr the request must name the currently
 * attached interface UUID; with nsr == NULL (destructor path) whatever
 * is attached is detached unconditionally.  Fails with EBUSY while any
 * channel remains open.
 */
SK_NO_INLINE_ATTRIBUTE
static int
nx_netif_ctl_detach(struct kern_nexus *nx, struct nx_spec_req *nsr)
{
	struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
	int err = 0;

	SK_LOCK_ASSERT_HELD();

	/*
	 * nsr is NULL when we're called from the destructor, and it
	 * implies that we'll detach whatever that is attached.
	 */
	if (nsr != NULL && uuid_is_null(nsr->nsr_if_uuid)) {
		err = EINVAL;
	} else if (nsr != NULL && uuid_compare(nsr->nsr_if_uuid,
	    nif->nif_uuid) != 0) {
		/* request names an interface we are not attached to */
		err = ESRCH;
	} else if (nx_port_get_na(nx, NEXUS_PORT_NET_IF_DEV) == NULL) {
		/* nx_netif_ctl_attach() not yet done or already detached */
		err = ENXIO;
	} else if (nx->nx_ch_count != 0) {
		/*
		 * There's at least a channel opened; we can't
		 * yank the interface from underneath the nexus
		 * since our dlil input/output handler may be
		 * running now. Bail out and come back here
		 * again when the nexus detaches.
		 */
		err = EBUSY;
	} else {
		err = nx_netif_clean(nif, TRUE);
	}

#if SK_LOG
	if (nsr != NULL) {
		uuid_string_t ifuuidstr;
		SK_DF(err ? SK_VERB_ERROR : SK_VERB_NETIF,
		    "nexus 0x%llx (%s) if_uuid %s flags 0x%x err %d",
		    SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name,
		    sk_uuid_unparse(nsr->nsr_if_uuid, ifuuidstr),
		    nsr->nsr_flags, err);
	} else {
		SK_DF(err ? SK_VERB_ERROR : SK_VERB_NETIF,
		    "nexus 0x%llx (%s) err %d", SK_KVA(nx),
		    NX_DOM_PROV(nx)->nxdom_prov_name, err);
	}
#endif /* SK_LOG */

	return err;
}
1531
1532 /*
1533 * XXX
1534 * These checks are copied from fsw.c
1535 * There are no tests exercising this code. Do we still need this?
1536 */
1537 SK_NO_INLINE_ATTRIBUTE
1538 static int
nx_netif_ctl_flow_check(struct nx_netif * nif,nxcfg_cmd_t cmd,struct proc * p,struct nx_flow_req * req)1539 nx_netif_ctl_flow_check(struct nx_netif *nif, nxcfg_cmd_t cmd,
1540 struct proc *p, struct nx_flow_req *req)
1541 {
1542 #pragma unused(nif)
1543 boolean_t need_check;
1544 int error;
1545
1546 if (uuid_is_null(req->nfr_flow_uuid)) {
1547 return EINVAL;
1548 }
1549 req->nfr_flags &= NXFLOWREQF_MASK;
1550 req->nfr_flowadv_idx = FLOWADV_IDX_NONE;
1551
1552 if (cmd == NXCFG_CMD_FLOW_DEL) {
1553 return 0;
1554 }
1555 need_check = FALSE;
1556 if (req->nfr_epid != -1 && proc_pid(p) != req->nfr_epid) {
1557 need_check = TRUE;
1558 } else if (!uuid_is_null(req->nfr_euuid)) {
1559 uuid_t uuid;
1560
1561 /* get the UUID of the issuing process */
1562 proc_getexecutableuuid(p, uuid, sizeof(uuid));
1563
1564 /*
1565 * If this is not issued by a process for its own
1566 * executable UUID and if the process does not have
1567 * the necessary privilege, reject the request.
1568 * The logic is similar to so_set_effective_uuid().
1569 */
1570 if (uuid_compare(req->nfr_euuid, uuid) != 0) {
1571 need_check = TRUE;
1572 }
1573 }
1574 if (need_check) {
1575 kauth_cred_t cred = kauth_cred_proc_ref(p);
1576 error = priv_check_cred(cred,
1577 PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0);
1578 kauth_cred_unref(&cred);
1579 if (error != 0) {
1580 return error;
1581 }
1582 }
1583 return 0;
1584 }
1585
1586 SK_NO_INLINE_ATTRIBUTE
1587 static int
nx_netif_ctl_flow_add(struct nx_netif * nif,struct proc * p,struct nx_flow_req * req)1588 nx_netif_ctl_flow_add(struct nx_netif *nif, struct proc *p,
1589 struct nx_flow_req *req)
1590 {
1591 int err;
1592
1593 ASSERT(p != PROC_NULL);
1594 err = nx_netif_ctl_flow_check(nif, NXCFG_CMD_FLOW_ADD, p, req);
1595 if (err != 0) {
1596 return err;
1597 }
1598
1599 /* init kernel only fields */
1600 nx_flow_req_internalize(req);
1601 req->nfr_context = NULL;
1602 req->nfr_flow_stats = NULL;
1603 req->nfr_port_reservation = NULL;
1604 req->nfr_pid = proc_pid(p);
1605
1606 err = nx_netif_netagent_flow_add(nif, req);
1607 nx_flow_req_externalize(req);
1608 return err;
1609 }
1610
1611 SK_NO_INLINE_ATTRIBUTE
1612 static int
nx_netif_ctl_flow_del(struct nx_netif * nif,struct proc * p,struct nx_flow_req * req)1613 nx_netif_ctl_flow_del(struct nx_netif *nif, struct proc *p,
1614 struct nx_flow_req *req)
1615 {
1616 int err;
1617
1618 err = nx_netif_ctl_flow_check(nif, NXCFG_CMD_FLOW_DEL, p, req);
1619 if (err != 0) {
1620 return err;
1621 }
1622
1623 nx_flow_req_internalize(req);
1624 req->nfr_pid = proc_pid(p);
1625
1626 err = nx_netif_netagent_flow_del(nif, req);
1627 nx_flow_req_externalize(req);
1628 return err;
1629 }
1630
1631 SK_NO_INLINE_ATTRIBUTE
1632 static int
nx_netif_ctl(struct kern_nexus * nx,nxcfg_cmd_t nc_cmd,void * data,struct proc * p)1633 nx_netif_ctl(struct kern_nexus *nx, nxcfg_cmd_t nc_cmd, void *data,
1634 struct proc *p)
1635 {
1636 struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
1637 struct nx_spec_req *nsr = data;
1638 struct nx_flow_req *nfr = data;
1639 int error = 0;
1640
1641 SK_LOCK_ASSERT_HELD();
1642
1643 switch (nc_cmd) {
1644 case NXCFG_CMD_ATTACH:
1645 error = nx_netif_ctl_attach(nx, nsr, p);
1646 break;
1647
1648 case NXCFG_CMD_DETACH:
1649 error = nx_netif_ctl_detach(nx, nsr);
1650 break;
1651
1652 case NXCFG_CMD_FLOW_ADD:
1653 error = nx_netif_ctl_flow_add(nif, p, nfr);
1654 break;
1655
1656 case NXCFG_CMD_FLOW_DEL:
1657 error = nx_netif_ctl_flow_del(nif, p, nfr);
1658 break;
1659
1660 default:
1661 SK_ERR("invalid cmd %u", nc_cmd);
1662 error = EINVAL;
1663 break;
1664 }
1665 return error;
1666 }
1667
1668 static void
nx_netif_llink_notify(struct kern_nexus * nx,struct netif_llink * llink,uint32_t flags)1669 nx_netif_llink_notify(struct kern_nexus *nx, struct netif_llink *llink,
1670 uint32_t flags)
1671 {
1672 #pragma unused(flags)
1673 struct netif_qset *qset;
1674
1675 SLIST_FOREACH(qset, &llink->nll_qset_list, nqs_list) {
1676 (void) nx_tx_qset_notify(nx, qset->nqs_ctx);
1677 }
1678 }
1679
1680 static void
nx_netif_llink_notify_all(struct kern_nexus * nx,uint32_t flags)1681 nx_netif_llink_notify_all(struct kern_nexus *nx, uint32_t flags)
1682 {
1683 struct nx_netif *nif;
1684 struct netif_llink *llink;
1685
1686 nif = NX_NETIF_PRIVATE(nx);
1687
1688 lck_rw_lock_shared(&nif->nif_llink_lock);
1689 STAILQ_FOREACH(llink, &nif->nif_llink_list, nll_link) {
1690 nx_netif_llink_notify(nx, llink, flags);
1691 }
1692 lck_rw_unlock_shared(&nif->nif_llink_lock);
1693 }
1694
/*
 * if_start() callback for native Skywalk interfaces, registered
 * at ifnet_allocate_extended() time, and invoked by the ifnet
 * starter thread.
 */
static void
nx_netif_doorbell_internal(struct ifnet *ifp, uint32_t flags)
{
	/* no adapter yet (or already detached): nothing to notify */
	if (__improbable(ifp->if_na == NULL)) {
		return;
	}

	/*
	 * Do this only if the nexus adapter is active, i.e. a channel
	 * has been opened to it by the module above (flowswitch, etc.)
	 */
	struct nexus_adapter *hwna = &NA(ifp)->nifna_up;
	if (__probable(NA_IS_ACTIVE(hwna))) {
		struct kern_nexus *nx = hwna->na_nx;

		/* update our work timestamp */
		hwna->na_work_ts = _net_uptime;

		if (NX_LLINK_PROV(nx)) {
			/* logical-link provider: notify every llink/qset */
			nx_netif_llink_notify_all(nx, flags);
		} else {
			struct __kern_channel_ring *kring;

			/* for doorbell purposes, use TX ring 0 */
			kring = &hwna->na_tx_rings[0];

			/* Issue a synchronous TX doorbell on the netif device ring */
			kring->ckr_na_sync(kring, PROC_NULL,
			    (NA_SYNCF_NETIF_DOORBELL | NA_SYNCF_NETIF_IFSTART));
		}
	} else {
		/* count doorbells dropped due to an inactive adapter */
		struct netif_stats *nifs =
		    &NX_NETIF_PRIVATE(hwna->na_nx)->nif_stats;
		STATS_INC(nifs, NETIF_STATS_DROP_NA_INACTIVE);
	}
}
1736
static void
nx_netif_doorbell(struct ifnet *ifp)
{
	/* if_start() entry point: doorbell on behalf of the host stack */
	nx_netif_doorbell_internal(ifp, NETIF_XMIT_FLAG_HOST);
}
1742
/*
 * TX sync callback, called from nx_netif_doorbell() where we'd expect to
 * perform synchronous TX doorbell to the driver, by invoking the driver's
 * doorbell callback directly in the same thread context. It is also called
 * when the layer above performs a TX sync operation, where we might need
 * to do an asynchronous doorbell instead, by simply calling ifnet_start().
 */
static int
nx_netif_na_txsync(struct __kern_channel_ring *kring, struct proc *p,
    uint32_t flags)
{
#pragma unused(p)
	struct ifnet *ifp = KRNA(kring)->na_ifp;
	boolean_t sync_only;
	int ret = 0;

	ASSERT(ifp != NULL);

	SK_DF(SK_VERB_NETIF | SK_VERB_SYNC | SK_VERB_TX,
	    "%s(%d) kr \"%s\" (0x%llx) krflags 0x%b ring %u flags 0%x",
	    sk_proc_name_address(p), sk_proc_pid(p), kring->ckr_name,
	    SK_KVA(kring), kring->ckr_flags, CKRF_BITS, kring->ckr_ring_id,
	    flags);

	/* bail if the interface is going (or has gone) away */
	if (__improbable(!IF_FULLY_ATTACHED(ifp))) {
		SK_ERR("kr 0x%llx ifp %s (0x%llx), interface not attached",
		    SK_KVA(kring), if_name(ifp), SK_KVA(ifp));
		return ENXIO;
	}

	/* no TX while the interface is flow-controlled */
	if (__improbable((ifp->if_start_flags & IFSF_FLOW_CONTROLLED) != 0)) {
		SK_DF(SK_VERB_SYNC | SK_VERB_TX, "kr 0x%llx ifp %s (0x%llx), "
		    "flow control ON", SK_KVA(kring), if_name(ifp),
		    SK_KVA(ifp));
		return ENXIO;
	}

	/* update our work timestamp */
	KRNA(kring)->na_work_ts = _net_uptime;

	/* direct user channels always sync; kernel-only rings on request */
	sync_only = ((flags & NA_SYNCF_SYNC_ONLY) != 0) ||
	    !KR_KERNEL_ONLY(kring);
	/* regular sync (reclaim) */
	if ((flags & NA_SYNCF_NETIF) != 0 || __improbable(sync_only)) {
		ret = nx_sync_tx(kring, (flags & NA_SYNCF_FORCE_RECLAIM) ||
		    kring->ckr_pending_intr != 0);
		kring->ckr_pending_intr = 0;

		/* direct user channels do not need to use the doorbell */
		if (__improbable(sync_only)) {
			return ret;
		}
	}

	/*
	 * Doorbell call. Here we do doorbell explicitly if the flag is
	 * set or implicitly if we're opened directly by a user channel.
	 * Synchronous vs. asynchronous depending on the context.
	 */
	if (__probable((flags & NA_SYNCF_NETIF_DOORBELL) != 0)) {
		if ((flags & NA_SYNCF_NETIF_IFSTART) != 0) {
			/* IFSTART and ASYNC are mutually exclusive */
			ASSERT(!(flags & NA_SYNCF_NETIF_IFSTART) ||
			    !(flags & NA_SYNCF_NETIF_ASYNC));
			nx_tx_doorbell(kring, (flags & NA_SYNCF_NETIF_ASYNC));
		} else {
			/* defer to the ifnet starter thread */
			ifnet_start(ifp);
		}
	}

	return ret;
}
1814
1815 static int
nx_netif_na_rxsync(struct __kern_channel_ring * kring,struct proc * p,uint32_t flags)1816 nx_netif_na_rxsync(struct __kern_channel_ring *kring, struct proc *p,
1817 uint32_t flags)
1818 {
1819 #pragma unused(p)
1820 int ret;
1821
1822 SK_DF(SK_VERB_NETIF | SK_VERB_SYNC | SK_VERB_RX,
1823 "%s(%d) kr \"%s\" (0x%llx) krflags 0x%b ring %u flags 0%x",
1824 sk_proc_name_address(p), sk_proc_pid(p), kring->ckr_name,
1825 SK_KVA(kring), kring->ckr_flags, CKRF_BITS, kring->ckr_ring_id,
1826 flags);
1827
1828 ASSERT(kring->ckr_rhead <= kring->ckr_lim);
1829
1830 /* update our work timestamp */
1831 KRNA(kring)->na_work_ts = _net_uptime;
1832
1833 ret = nx_sync_rx(kring, (flags & NA_SYNCF_FORCE_READ) ||
1834 kring->ckr_pending_intr != 0);
1835 kring->ckr_pending_intr = 0;
1836
1837 return ret;
1838 }
1839
/*
 * Destructor for the netif dev/host adapters: releases the adapter's
 * ifnet reference (or reclaims the embryonic ifnet if the finalizer
 * never ran) and drops the netif retain taken at adapter creation.
 */
static void
nx_netif_na_dtor(struct nexus_adapter *na)
{
	struct ifnet *ifp;
	struct nexus_netif_adapter *nifna = NIFNA(na);

	SK_LOCK_ASSERT_HELD();
	ASSERT(na->na_type == NA_NETIF_DEV || na->na_type == NA_NETIF_HOST);

	SK_DF(SK_VERB_NETIF, "na \"%s\" (0x%llx)", na->na_name, SK_KVA(na));

	/*
	 * If the finalizer callback hasn't been called for whatever
	 * reasons, pick up the embryonic ifnet stored in na_private.
	 * Otherwise, release the I/O refcnt of a non-NULL na_ifp.
	 */
	if ((ifp = na->na_ifp) == NULL) {
		ifp = na->na_private;
		na->na_private = NULL;
	} else {
		ifnet_decr_iorefcnt(ifp);
		na->na_ifp = NULL;
	}

	/* drop the netif reference held by this adapter */
	if (nifna->nifna_netif != NULL) {
		nx_netif_release(nifna->nifna_netif);
		nifna->nifna_netif = NULL;
	}
	ASSERT(SKYWALK_NATIVE(ifp));
}
1870
1871 /*
1872 * Dispatch rx/tx interrupts to the channel rings.
1873 *
1874 * The 'notify' routine depends on what the ring is attached to.
1875 * - for a channel file descriptor, do an event wakeup on the individual
1876 * waitqueue, plus one on the global one if needed (see na_notify)
1877 * - for a device port connected to a FlowSwitch, call the proper
1878 * forwarding routine; see nx_fsw_tx_hwna_notify()
1879 * or nx_fsw_rx_hwna_notify().
1880 */
/*
 * Common rx/tx interrupt entry point for a netif channel ring.
 *
 * @param kring		ring the interrupt is destined for.
 * @param p		current proc context (may be kernproc).
 * @param flags		NA_SYNCF_* flags passed through to the notify routine.
 * @param work_done	if non-NULL, set to 1 to tell the caller not to
 *			re-fire (mitigation handshake).
 *
 * Returns 0 on success, EBUSY if the ring is in drop mode or busy,
 * or whatever error the saved notify callback reports.
 */
int
nx_netif_common_intr(struct __kern_channel_ring *kring, struct proc *p,
    uint32_t flags, uint32_t *work_done)
{
	struct netif_stats *nifs =
	    &NX_NETIF_PRIVATE(KRNA(kring)->na_nx)->nif_stats;
	int (*notify)(struct __kern_channel_ring *kring,
	    struct proc *, uint32_t flags);
	int ret;

	KDBG((SK_KTRACE_NETIF_COMMON_INTR | DBG_FUNC_START), SK_KVA(kring));

	SK_DF(SK_VERB_NETIF | SK_VERB_INTR |
	    ((kring->ckr_tx == NR_RX) ? SK_VERB_RX : SK_VERB_TX),
	    "na \"%s\" (0x%llx) kr \"%s\" (0x%llx) krflags 0x%b",
	    KRNA(kring)->na_name, SK_KVA(KRNA(kring)), kring->ckr_name,
	    SK_KVA(kring), kring->ckr_flags, CKRF_BITS);

	/* update our work timestamp */
	KRNA(kring)->na_work_ts = _net_uptime;

	/* record the interrupt so the next sync pass knows to do work */
	kring->ckr_pending_intr++;
	if (work_done != NULL) {
		*work_done = 1; /* do not fire again */
	}
	/*
	 * We can't be calling ckr_na_notify here since we could already be
	 * intercepting it, else we'd end up recursively calling ourselves.
	 * Use the original na_notify callback saved during na_activate, or in
	 * the case when the module above us is the flowswitch, the notify
	 * routine that it has installed in place of our original one.
	 */
	if (__probable(!KR_DROP(kring) &&
	    (notify = kring->ckr_netif_notify) != NULL)) {
		ret = notify(kring, p, flags);
	} else {
		/*
		 * If the ring is in drop mode, pretend as if it's busy.
		 * This allows the mitigation thread to pause for a while
		 * before attempting again.
		 */
		ret = EBUSY;
	}
	/* account the failure per direction and per errno class */
	if (__improbable(ret != 0)) {
		switch (kring->ckr_tx) {
		case NR_RX:
			if (ret == EBUSY) {
				STATS_INC(nifs, NETIF_STATS_RX_IRQ_BUSY);
			} else if (ret == EAGAIN) {
				STATS_INC(nifs, NETIF_STATS_RX_IRQ_AGAIN);
			} else {
				STATS_INC(nifs, NETIF_STATS_RX_IRQ_ERR);
			}
			break;

		case NR_TX:
			if (ret == EBUSY) {
				STATS_INC(nifs, NETIF_STATS_TX_IRQ_BUSY);
			} else if (ret == EAGAIN) {
				STATS_INC(nifs, NETIF_STATS_TX_IRQ_AGAIN);
			} else {
				STATS_INC(nifs, NETIF_STATS_TX_IRQ_ERR);
			}
			break;

		default:
			break;
		}
	}

	KDBG((SK_KTRACE_NETIF_COMMON_INTR | DBG_FUNC_END), SK_KVA(kring), ret);

	return ret;
}
1955
1956 static int
nx_netif_na_notify_tx(struct __kern_channel_ring * kring,struct proc * p,uint32_t flags)1957 nx_netif_na_notify_tx(struct __kern_channel_ring *kring, struct proc *p,
1958 uint32_t flags)
1959 {
1960 return nx_netif_mit_tx_intr(kring, p, flags, NULL);
1961 }
1962
1963 static int
nx_netif_na_notify_rx(struct __kern_channel_ring * kring,struct proc * p,uint32_t flags)1964 nx_netif_na_notify_rx(struct __kern_channel_ring *kring, struct proc *p,
1965 uint32_t flags)
1966 {
1967 int ret;
1968
1969 /*
1970 * In the event the mitigation thread is disabled, protect
1971 * against recursion by detecting if we're already in the
1972 * context of an RX notify. IOSkywalkFamily may invoke the
1973 * notify callback as part of its RX sync callback.
1974 */
1975 if (__probable(!sk_is_rx_notify_protected())) {
1976 sk_protect_t protect;
1977 uint32_t work_done;
1978
1979 protect = sk_rx_notify_protect();
1980 ret = nx_netif_mit_rx_intr(kring, p, flags, &work_done);
1981 sk_sync_unprotect(protect);
1982 } else {
1983 ret = EAGAIN;
1984 }
1985
1986 return ret;
1987 }
1988
/*
 * Compute the interrupt mitigation configuration for a netif adapter.
 *
 * @param nifna		the netif adapter being activated.
 * @param tx_mit	out: TRUE if TX mitigation threads should be created.
 * @param tx_mit_simple	out: TRUE for "simple" (no advanced) TX mitigation.
 * @param rx_mit	out: TRUE if RX mitigation threads should be created.
 * @param rx_mit_simple	out: TRUE for "simple" RX mitigation.
 *
 * Note the intentional switch fallthroughs: FORCE_SIMPLE implies the
 * corresponding *_mit flag as well.
 */
void
nx_netif_mit_config(struct nexus_netif_adapter *nifna,
    boolean_t *tx_mit, boolean_t *tx_mit_simple,
    boolean_t *rx_mit, boolean_t *rx_mit_simple)
{
	struct nx_netif *nif = nifna->nifna_netif;

	/*
	 * TX mitigation is disabled by default, but can be
	 * overridden via "sk_netif_tx_mit=N" boot-arg, where
	 * N is one of SK_NETIF_MIT_FORCE_* values.
	 */
	*tx_mit = *tx_mit_simple = FALSE;
	switch (sk_netif_tx_mit) {
	case SK_NETIF_MIT_FORCE_SIMPLE:
		*tx_mit_simple = TRUE;
		OS_FALLTHROUGH;
	case SK_NETIF_MIT_FORCE_ADVANCED:
		*tx_mit = TRUE;
		break;
	case SK_NETIF_MIT_FORCE_OFF:
	case SK_NETIF_MIT_AUTO:
		ASSERT(*tx_mit == FALSE);
		break;
	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	/*
	 * RX mitigation is enabled by default only for BSD-style
	 * virtual network interfaces, but can be overridden
	 * via "sk_netif_rx_mit=N" boot-arg, where N is one of
	 * SK_NETIF_MIT_FORCE_* values.
	 */
	*rx_mit = *rx_mit_simple = FALSE;
	switch (sk_netif_rx_mit) {
	case SK_NETIF_MIT_FORCE_OFF:
		ASSERT(*rx_mit == FALSE);
		break;
	case SK_NETIF_MIT_FORCE_SIMPLE:
		*rx_mit_simple = TRUE;
		OS_FALLTHROUGH;
	case SK_NETIF_MIT_FORCE_ADVANCED:
		*rx_mit = TRUE;
		break;
	case SK_NETIF_MIT_AUTO:
		*rx_mit_simple = TRUE;
		/*
		 * Enable RX mitigation thread only for BSD-style virtual (and
		 * regular) interfaces, since otherwise we may run out of stack
		 * when subjected to IPsec processing, etc.
		 */
		*rx_mit = (NX_PROV(nifna->nifna_up.na_nx)->nxprov_flags &
		    NXPROVF_VIRTUAL_DEVICE) && !NETIF_IS_LOW_LATENCY(nif);
		break;
	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}
}
2052
/*
 * Activate/deactivate the netif device adapter.
 *
 * MODE_ON: allocates per-ring mitigation state (as decided by
 * nx_netif_mit_config()), interposes the per-ring notify callbacks,
 * enables filters/flows, and steers ifnet start requests to netif.
 * MODE_OFF: undoes all of the above.  MODE_DEFUNCT: no-op beyond the
 * capability assertion.
 *
 * Returns 0 on success or ENOMEM if mitigation state can't be allocated.
 */
static int
nx_netif_na_activate(struct nexus_adapter *na, na_activate_mode_t mode)
{
	struct nexus_netif_adapter *nifna = (struct nexus_netif_adapter *)na;
	boolean_t tx_mit, rx_mit, tx_mit_simple, rx_mit_simple;
	struct nx_netif *nif = nifna->nifna_netif;
	struct ifnet *ifp = na->na_ifp;
	int error = 0;
	uint32_t r;

	ASSERT(na->na_type == NA_NETIF_DEV);
	ASSERT(!(na->na_flags & NAF_HOST_ONLY));

	SK_DF(SK_VERB_NETIF, "na \"%s\" (0x%llx) %s [%s]", na->na_name,
	    SK_KVA(na), ifp->if_xname, na_activate_mode2str(mode));

	switch (mode) {
	case NA_ACTIVATE_MODE_ON:
		ASSERT(SKYWALK_CAPABLE(ifp));

		nx_netif_mit_config(nifna, &tx_mit, &tx_mit_simple,
		    &rx_mit, &rx_mit_simple);

		/*
		 * Init the mitigation support on all the dev TX rings.
		 */
		if (tx_mit) {
			nifna->nifna_tx_mit =
			    skn_alloc_type_array(tx_on, struct nx_netif_mit,
			    na_get_nrings(na, NR_TX), Z_WAITOK,
			    skmem_tag_netif_mit);
			if (nifna->nifna_tx_mit == NULL) {
				SK_ERR("TX mitigation allocation failed");
				error = ENOMEM;
				goto out;
			}
		} else {
			ASSERT(nifna->nifna_tx_mit == NULL);
		}

		/*
		 * Init the mitigation support on all the dev RX rings.
		 */
		if (rx_mit) {
			nifna->nifna_rx_mit =
			    skn_alloc_type_array(rx_on, struct nx_netif_mit,
			    na_get_nrings(na, NR_RX), Z_WAITOK,
			    skmem_tag_netif_mit);
			if (nifna->nifna_rx_mit == NULL) {
				SK_ERR("RX mitigation allocation failed");
				/* unwind the TX allocation made above */
				if (nifna->nifna_tx_mit != NULL) {
					skn_free_type_array(rx_fail,
					    struct nx_netif_mit,
					    na_get_nrings(na, NR_TX),
					    nifna->nifna_tx_mit);
					nifna->nifna_tx_mit = NULL;
				}
				error = ENOMEM;
				goto out;
			}
		} else {
			ASSERT(nifna->nifna_rx_mit == NULL);
		}

		/* intercept na_notify callback on the TX rings */
		for (r = 0; r < na_get_nrings(na, NR_TX); r++) {
			na->na_tx_rings[r].ckr_netif_notify =
			    na->na_tx_rings[r].ckr_na_notify;
			na->na_tx_rings[r].ckr_na_notify =
			    nx_netif_na_notify_tx;
			if (nifna->nifna_tx_mit != NULL) {
				nx_netif_mit_init(nif, ifp,
				    &nifna->nifna_tx_mit[r],
				    &na->na_tx_rings[r], tx_mit_simple);
			}
		}

		/* intercept na_notify callback on the RX rings */
		for (r = 0; r < na_get_nrings(na, NR_RX); r++) {
			na->na_rx_rings[r].ckr_netif_notify =
			    na->na_rx_rings[r].ckr_na_notify;
			na->na_rx_rings[r].ckr_na_notify =
			    nx_netif_na_notify_rx;
			if (nifna->nifna_rx_mit != NULL) {
				nx_netif_mit_init(nif, ifp,
				    &nifna->nifna_rx_mit[r],
				    &na->na_rx_rings[r], rx_mit_simple);
			}
		}
		nx_netif_filter_enable(nif);
		nx_netif_flow_enable(nif);
		atomic_bitset_32(&na->na_flags, NAF_ACTIVE);

		/* steer all start requests to netif; this must not fail */
		lck_mtx_lock(&ifp->if_start_lock);
		error = ifnet_set_start_handler(ifp, nx_netif_doorbell);
		VERIFY(error == 0);
		lck_mtx_unlock(&ifp->if_start_lock);
		break;

	case NA_ACTIVATE_MODE_DEFUNCT:
		ASSERT(SKYWALK_CAPABLE(ifp));
		break;

	case NA_ACTIVATE_MODE_OFF:
		/*
		 * Note that here we cannot assert SKYWALK_CAPABLE()
		 * as we're called in the destructor path.
		 */
		atomic_bitclear_32(&na->na_flags, NAF_ACTIVE);
		nx_netif_flow_disable(nif);
		nx_netif_filter_disable(nif);

		/*
		 * Here we may block while holding sk_lock, but because
		 * we've cleared NAF_ACTIVE above, kern_channel_tx_refill()
		 * should immediately return.  A better approach would be
		 * to drop sk_lock and add a monitor for this routine.
		 */
		lck_mtx_lock(&ifp->if_start_lock);
		while (ifp->if_start_active != 0) {
			++ifp->if_start_waiters;
			(void) msleep(&ifp->if_start_waiters,
			    &ifp->if_start_lock, (PZERO - 1),
			    na->na_name, NULL);
		}
		/* steer all start requests to default handler */
		ifnet_reset_start_handler(ifp);
		lck_mtx_unlock(&ifp->if_start_lock);

		/* reset all TX notify callbacks */
		for (r = 0; r < na_get_nrings(na, NR_TX); r++) {
			na->na_tx_rings[r].ckr_na_notify =
			    na->na_tx_rings[r].ckr_netif_notify;
			na->na_tx_rings[r].ckr_netif_notify = NULL;
			if (nifna->nifna_tx_mit != NULL) {
				na->na_tx_rings[r].ckr_netif_mit_stats = NULL;
				nx_netif_mit_cleanup(&nifna->nifna_tx_mit[r]);
			}
		}

		if (nifna->nifna_tx_mit != NULL) {
			skn_free_type_array(tx_off, struct nx_netif_mit,
			    na_get_nrings(na, NR_TX), nifna->nifna_tx_mit);
			nifna->nifna_tx_mit = NULL;
		}

		/* reset all RX notify callbacks */
		for (r = 0; r < na_get_nrings(na, NR_RX); r++) {
			na->na_rx_rings[r].ckr_na_notify =
			    na->na_rx_rings[r].ckr_netif_notify;
			na->na_rx_rings[r].ckr_netif_notify = NULL;
			if (nifna->nifna_rx_mit != NULL) {
				na->na_rx_rings[r].ckr_netif_mit_stats = NULL;
				nx_netif_mit_cleanup(&nifna->nifna_rx_mit[r]);
			}
		}
		if (nifna->nifna_rx_mit != NULL) {
			skn_free_type_array(rx_off, struct nx_netif_mit,
			    na_get_nrings(na, NR_RX), nifna->nifna_rx_mit);
			nifna->nifna_rx_mit = NULL;
		}
		break;

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}
out:
	return error;
}
2225
/*
 * Attach a native netif nexus instance to an ifnet.
 *
 * Allocates and initializes the device and host adapters, creates their
 * backing memory arenas, binds them to the NET_IF_DEV/NET_IF_HOST nexus
 * ports, and marks the ifnet Skywalk-capable.  The interface may be
 * either fully attached or still embryonic (see comment below); in the
 * latter case the ifnet pointer is parked in na_private until
 * na_netif_finalize() moves it to na_ifp.
 *
 * Returns 0 on success; on failure all partially-initialized state is
 * unwound in the err block and a nonzero errno is returned.
 */
SK_NO_INLINE_ATTRIBUTE
static int
nx_netif_attach(struct kern_nexus *nx, struct ifnet *ifp)
__attribute__((optnone))
{
	struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
	struct nxprov_params *nxp = NX_PROV(nx)->nxprov_params;
	struct nexus_netif_adapter *devnifna = NULL;
	struct nexus_netif_adapter *hostnifna = NULL;
	struct nexus_adapter *devna = NULL;
	struct nexus_adapter *hostna = NULL;
	boolean_t embryonic = FALSE;
	int retval = 0;
	uint32_t na_flags;

	SK_LOCK_ASSERT_HELD();
	ASSERT(SKYWALK_NATIVE(ifp));
	ASSERT(!SKYWALK_CAPABLE(ifp));
	ASSERT(ifp->if_na == NULL);
	ASSERT(ifp->if_na_ops == NULL);

	devnifna = na_netif_alloc(Z_WAITOK);
	hostnifna = na_netif_alloc(Z_WAITOK);

	/*
	 * We can be called for two different interface states:
	 *
	 * Fully attached: get an io ref count; upon success, this
	 * holds a reference to the ifnet for the ifp pointer stored
	 * in 'na_ifp' down below for both adapters.
	 *
	 * Embryonic: temporary hold the ifnet in na_private, which
	 * upon a successful ifnet_attach(), will be moved over to
	 * the 'na_ifp' with an io ref count held.
	 *
	 * The ifnet in 'na_ifp' will be released by na_release_locked().
	 */
	if (!ifnet_is_attached(ifp, 1)) {
		if (!(ifp->if_refflags & IFRF_EMBRYONIC)) {
			/* neither attached nor embryonic: bail out */
			ifp = NULL;
			retval = ENXIO;
			goto err;
		}
		embryonic = TRUE;
	}

	/* initialize the device netif adapter */
	devnifna->nifna_netif = nif;
	nx_netif_retain(nif);
	devna = &devnifna->nifna_up;
	devna->na_type = NA_NETIF_DEV;
	devna->na_free = na_netif_free;
	(void) strncpy(devna->na_name, ifp->if_xname, sizeof(devna->na_name) - 1);
	devna->na_name[sizeof(devna->na_name) - 1] = '\0';
	uuid_generate_random(devna->na_uuid);
	if (embryonic) {
		/*
		 * We will move this over to na_ifp once
		 * the interface is fully attached.
		 */
		devna->na_private = ifp;
		ASSERT(devna->na_ifp == NULL);
	} else {
		ASSERT(devna->na_private == NULL);
		/* use I/O refcnt from ifnet_is_attached() */
		devna->na_ifp = ifp;
	}
	devna->na_activate = nx_netif_na_activate;
	devna->na_txsync = nx_netif_na_txsync;
	devna->na_rxsync = nx_netif_na_rxsync;
	devna->na_dtor = nx_netif_na_dtor;
	devna->na_krings_create = nx_netif_dev_krings_create;
	devna->na_krings_delete = nx_netif_dev_krings_delete;
	devna->na_special = nx_netif_na_special;

	na_flags = NAF_NATIVE;
	if (NX_PROV(nx)->nxprov_flags & NXPROVF_VIRTUAL_DEVICE) {
		na_flags |= NAF_VIRTUAL_DEVICE;
	}
	if (NX_LLINK_PROV(nx)) {
		/*
		 * while operating in logical link mode, we don't need to
		 * create backing memory regions for the rings as they are
		 * not used.
		 */
		na_flags |= NAF_MEM_NO_INIT;
	}
	atomic_bitset_32(&devna->na_flags, na_flags);
	*(nexus_stats_type_t *)(uintptr_t)&devna->na_stats_type =
	    NEXUS_STATS_TYPE_INVALID;

	na_set_nrings(devna, NR_TX, nxp->nxp_tx_rings);
	na_set_nrings(devna, NR_RX, nxp->nxp_rx_rings);
	na_set_nslots(devna, NR_TX, nxp->nxp_tx_slots);
	na_set_nslots(devna, NR_RX, nxp->nxp_rx_slots);
	/*
	 * Verify upper bounds; the parameters must have already been
	 * validated by nxdom_prov_params() by the time we get here.
	 */
	ASSERT(na_get_nrings(devna, NR_TX) <= NX_DOM(nx)->nxdom_tx_rings.nb_max);
	ASSERT(na_get_nrings(devna, NR_RX) <= NX_DOM(nx)->nxdom_rx_rings.nb_max);
	ASSERT(na_get_nslots(devna, NR_TX) <= NX_DOM(nx)->nxdom_tx_slots.nb_max);
	ASSERT(na_get_nslots(devna, NR_RX) <= NX_DOM(nx)->nxdom_rx_slots.nb_max);

	na_attach_common(devna, nx, &nx_netif_prov_s);

	/* create the memory arena for the device adapter */
	if ((retval = NX_DOM_PROV(nx)->nxdom_prov_mem_new(NX_DOM_PROV(nx),
	    nx, devna)) != 0) {
		ASSERT(devna->na_arena == NULL);
		goto err;
	}
	ASSERT(devna->na_arena != NULL);

	*(uint32_t *)(uintptr_t)&devna->na_flowadv_max = nxp->nxp_flowadv_max;
	ASSERT(devna->na_flowadv_max == 0 ||
	    skmem_arena_nexus(devna->na_arena)->arn_flowadv_obj != NULL);

	/* setup packet copy routines */
	if (skmem_arena_nexus(devna->na_arena)->arn_rx_pp->pp_max_frags > 1) {
		nif->nif_pkt_copy_from_mbuf = pkt_copy_multi_buflet_from_mbuf;
		nif->nif_pkt_copy_to_mbuf = pkt_copy_multi_buflet_to_mbuf;
		nif->nif_pkt_copy_from_pkt = pkt_copy_multi_buflet_from_pkt;
	} else {
		nif->nif_pkt_copy_from_mbuf = pkt_copy_from_mbuf;
		nif->nif_pkt_copy_to_mbuf = pkt_copy_to_mbuf;
		nif->nif_pkt_copy_from_pkt = pkt_copy_from_pkt;
	}

	/* initialize the host netif adapter */
	hostnifna->nifna_netif = nif;
	nx_netif_retain(nif);
	hostna = &hostnifna->nifna_up;
	(void) snprintf(hostna->na_name, sizeof(hostna->na_name),
	    "%s^", devna->na_name);
	uuid_generate_random(hostna->na_uuid);
	if (embryonic) {
		/*
		 * We will move this over to na_ifp once
		 * the interface is fully attached.
		 */
		hostna->na_private = ifp;
		ASSERT(hostna->na_ifp == NULL);
	} else {
		ASSERT(hostna->na_private == NULL);
		/* host adapter takes its own I/O refcnt on the ifnet */
		hostna->na_ifp = devna->na_ifp;
		ifnet_incr_iorefcnt(hostna->na_ifp);
	}
	hostna->na_type = NA_NETIF_HOST;
	hostna->na_free = na_netif_free;
	hostna->na_activate = nx_netif_host_na_activate;
	hostna->na_txsync = nx_netif_host_na_txsync;
	hostna->na_rxsync = nx_netif_host_na_rxsync;
	hostna->na_dtor = nx_netif_na_dtor;
	hostna->na_krings_create = nx_netif_host_krings_create;
	hostna->na_krings_delete = nx_netif_host_krings_delete;
	hostna->na_special = nx_netif_host_na_special;

	na_flags = NAF_HOST_ONLY | NAF_NATIVE;
	if (NX_LLINK_PROV(nx)) {
		/*
		 * while operating in logical link mode, we don't need to
		 * create backing memory regions for the rings as they are
		 * not used.
		 */
		na_flags |= NAF_MEM_NO_INIT;
	}
	atomic_bitset_32(&hostna->na_flags, na_flags);
	*(nexus_stats_type_t *)(uintptr_t)&hostna->na_stats_type =
	    NEXUS_STATS_TYPE_INVALID;

	/* host adapter always uses a single TX and a single RX ring */
	na_set_nrings(hostna, NR_TX, 1);
	na_set_nrings(hostna, NR_RX, 1);
	na_set_nslots(hostna, NR_TX, nxp->nxp_tx_slots);
	na_set_nslots(hostna, NR_RX, nxp->nxp_rx_slots);

	na_attach_common(hostna, nx, &nx_netif_prov_s);

	/* create the memory arena for the host adapter */
	if ((retval = NX_DOM_PROV(nx)->nxdom_prov_mem_new(NX_DOM_PROV(nx),
	    nx, hostna)) != 0) {
		ASSERT(hostna->na_arena == NULL);
		goto err;
	}
	ASSERT(hostna->na_arena != NULL);

	*(uint32_t *)(uintptr_t)&hostna->na_flowadv_max = nxp->nxp_flowadv_max;
	ASSERT(hostna->na_flowadv_max == 0 ||
	    skmem_arena_nexus(hostna->na_arena)->arn_flowadv_obj != NULL);

	/* adjust the classq packet drop limit */
	if (embryonic) {
		uint32_t drop_lim;
		struct kern_pbufpool_memory_info pp_info;

		retval = kern_pbufpool_get_memory_info(nx->nx_tx_pp, &pp_info);
		VERIFY(retval == 0);

		/* set the drop limit as 80% of size of packet pool */
		drop_lim = (pp_info.kpm_packets * 4) / 5;
		VERIFY(drop_lim != 0);
		IFCQ_PKT_DROP_LIMIT(ifp->if_snd) = drop_lim;
	}

	/* these will be undone by destructor */
	ifp->if_na_ops = &na_netif_ops;
	ifp->if_na = devnifna;
	na_retain_locked(devna);
	na_retain_locked(hostna);

	SKYWALK_SET_CAPABLE(ifp);

	/* bind the adapters to their nexus ports */
	NETIF_WLOCK(nif);
	nif->nif_ifp = ifp;
	nif->nif_netif_nxadv = nx->nx_adv.netif_nxv_adv;
	retval = nx_port_alloc(nx, NEXUS_PORT_NET_IF_DEV, NULL, &devna,
	    kernproc);
	ASSERT(retval == 0);
	retval = nx_port_alloc(nx, NEXUS_PORT_NET_IF_HOST, NULL, &hostna,
	    kernproc);
	ASSERT(retval == 0);
	NETIF_WUNLOCK(nif);

#if SK_LOG
	uuid_string_t uuidstr;
	SK_DF(SK_VERB_NETIF, "devna: \"%s\"", devna->na_name);
	SK_DF(SK_VERB_NETIF, "  UUID:        %s",
	    sk_uuid_unparse(devna->na_uuid, uuidstr));
	SK_DF(SK_VERB_NETIF, "  nx:          0x%llx (\"%s\":\"%s\")",
	    SK_KVA(devna->na_nx), NX_DOM(devna->na_nx)->nxdom_name,
	    NX_DOM_PROV(devna->na_nx)->nxdom_prov_name);
	SK_DF(SK_VERB_NETIF, "  flags:       0x%b", devna->na_flags, NAF_BITS);
	SK_DF(SK_VERB_NETIF, "  flowadv_max: %u", devna->na_flowadv_max);
	SK_DF(SK_VERB_NETIF, "  rings:       tx %u rx %u",
	    na_get_nrings(devna, NR_TX), na_get_nrings(devna, NR_RX));
	SK_DF(SK_VERB_NETIF, "  slots:       tx %u rx %u",
	    na_get_nslots(devna, NR_TX), na_get_nslots(devna, NR_RX));
#if CONFIG_NEXUS_USER_PIPE
	SK_DF(SK_VERB_NETIF, "  next_pipe:   %u", devna->na_next_pipe);
	SK_DF(SK_VERB_NETIF, "  max_pipes:   %u", devna->na_max_pipes);
#endif /* CONFIG_NEXUS_USER_PIPE */
	SK_DF(SK_VERB_NETIF, "  ifp:         0x%llx %s [ioref %u]",
	    SK_KVA(ifp), ifp->if_xname, ifp->if_refio);
	SK_DF(SK_VERB_NETIF, "hostna: \"%s\"", hostna->na_name);
	SK_DF(SK_VERB_NETIF, "  UUID:        %s",
	    sk_uuid_unparse(hostna->na_uuid, uuidstr));
	SK_DF(SK_VERB_NETIF, "  nx:          0x%llx (\"%s\":\"%s\")",
	    SK_KVA(hostna->na_nx), NX_DOM(hostna->na_nx)->nxdom_name,
	    NX_DOM_PROV(hostna->na_nx)->nxdom_prov_name);
	SK_DF(SK_VERB_NETIF, "  flags:       0x%b",
	    hostna->na_flags, NAF_BITS);
	SK_DF(SK_VERB_NETIF, "  flowadv_max: %u", hostna->na_flowadv_max);
	SK_DF(SK_VERB_NETIF, "  rings:       tx %u rx %u",
	    na_get_nrings(hostna, NR_TX), na_get_nrings(hostna, NR_RX));
	SK_DF(SK_VERB_NETIF, "  slots:       tx %u rx %u",
	    na_get_nslots(hostna, NR_TX), na_get_nslots(hostna, NR_RX));
#if CONFIG_NEXUS_USER_PIPE
	SK_DF(SK_VERB_NETIF, "  next_pipe:   %u", hostna->na_next_pipe);
	SK_DF(SK_VERB_NETIF, "  max_pipes:   %u", hostna->na_max_pipes);
#endif /* CONFIG_NEXUS_USER_PIPE */
	SK_DF(SK_VERB_NETIF, "  ifp:         0x%llx %s [ioref %u]",
	    SK_KVA(ifp), ifp->if_xname, ifp->if_refio);
#endif /* SK_LOG */

err:
	/* unwind everything acquired above on failure */
	if (retval != 0) {
		if (ifp != NULL) {
			if (!embryonic) {
				ifnet_decr_iorefcnt(ifp);
			}
			ifp = NULL;
		}
		if (devna != NULL) {
			if (devna->na_arena != NULL) {
				skmem_arena_release(devna->na_arena);
				devna->na_arena = NULL;
			}
			if (devna->na_ifp != NULL) {
				ifnet_decr_iorefcnt(devna->na_ifp);
				devna->na_ifp = NULL;
			}
			devna->na_private = NULL;
		}
		if (hostna != NULL) {
			if (hostna->na_arena != NULL) {
				skmem_arena_release(hostna->na_arena);
				hostna->na_arena = NULL;
			}
			if (hostna->na_ifp != NULL) {
				ifnet_decr_iorefcnt(hostna->na_ifp);
				hostna->na_ifp = NULL;
			}
			hostna->na_private = NULL;
		}
		if (devnifna != NULL) {
			if (devnifna->nifna_netif != NULL) {
				nx_netif_release(devnifna->nifna_netif);
				devnifna->nifna_netif = NULL;
			}
			na_netif_free((struct nexus_adapter *)devnifna);
		}
		if (hostnifna != NULL) {
			if (hostnifna->nifna_netif != NULL) {
				nx_netif_release(hostnifna->nifna_netif);
				hostnifna->nifna_netif = NULL;
			}
			na_netif_free((struct nexus_adapter *)hostnifna);
		}
	}
	return retval;
}
2535
2536 /*
2537 * Any per-netif state that can be discovered at attach time should be
2538 * initialized here.
2539 */
2540 static void
nx_netif_flags_init(struct nx_netif * nif)2541 nx_netif_flags_init(struct nx_netif *nif)
2542 {
2543 ifnet_t ifp = nif->nif_ifp;
2544 struct kern_nexus *nx = nif->nif_nx;
2545 struct nexus_adapter *devna = nx_port_get_na(nx, NEXUS_PORT_NET_IF_DEV);
2546
2547 switch (devna->na_type) {
2548 case NA_NETIF_DEV:
2549 if (strcmp(ifp->if_name, sk_ll_prefix) == 0) {
2550 nif->nif_flags |= NETIF_FLAG_LOW_LATENCY;
2551 if_set_xflags(ifp, IFXF_LOW_LATENCY);
2552 }
2553 break;
2554 case NA_NETIF_COMPAT_DEV:
2555 nif->nif_flags |= NETIF_FLAG_COMPAT;
2556 break;
2557 default:
2558 break;
2559 }
2560 }
2561
2562 /*
2563 * This is also supposed to check for any inconsistent state at detach time.
2564 */
2565 static void
nx_netif_flags_fini(struct nx_netif * nif)2566 nx_netif_flags_fini(struct nx_netif *nif)
2567 {
2568 ifnet_t ifp = nif->nif_ifp;
2569
2570 if (ifp != NULL) {
2571 if_clear_xflags(ifp, IFXF_LOW_LATENCY);
2572 }
2573 nif->nif_flags = 0;
2574 }
2575
/*
 * Query the provider for the interface-advisory capability and, if
 * supported, record the provider's config callback and context on the
 * netif.  A no-op unless the interface opted in via IFEF_ADV_REPORT.
 */
static void
configure_capab_interface_advisory(struct nx_netif *nif,
    nxprov_capab_config_fn_t capab_fn)
{
	struct kern_nexus_capab_interface_advisory capab;
	struct kern_nexus *nx = nif->nif_nx;
	uint32_t capab_len;
	int error;

	/* check/configure interface advisory notifications */
	if ((nif->nif_ifp->if_eflags & IFEF_ADV_REPORT) == 0) {
		return;
	}
	bzero(&capab, sizeof(capab));
	capab.kncia_version =
	    KERN_NEXUS_CAPAB_INTERFACE_ADVISORY_VERSION_1;
	/* fields are const-qualified; fill them in via __DECONST */
	*__DECONST(kern_nexus_capab_interface_advisory_notify_fn_t *,
	    &(capab.kncia_notify)) = nx_netif_interface_advisory_notify;
	*__DECONST(void **, &(capab.kncia_kern_context)) = nx;
	capab_len = sizeof(capab);
	error = capab_fn(NX_PROV(nx), nx,
	    KERN_NEXUS_CAPAB_INTERFACE_ADVISORY, &capab, &capab_len);
	if (error != 0) {
		/* provider doesn't support the capability; leave disabled */
		DTRACE_SKYWALK2(interface__advisory__capab__error,
		    struct nx_netif *, nif, int, error);
		return;
	}
	VERIFY(capab.kncia_config != NULL);
	VERIFY(capab.kncia_provider_context != NULL);
	nif->nif_intf_adv_config = capab.kncia_config;
	nif->nif_intf_adv_prov_ctx = capab.kncia_provider_context;
	nif->nif_extended_capabilities |= NETIF_CAPAB_INTERFACE_ADVISORY;
}
2609
2610 static void
unconfigure_capab_interface_advisory(struct nx_netif * nif)2611 unconfigure_capab_interface_advisory(struct nx_netif *nif)
2612 {
2613 if ((nif->nif_extended_capabilities & NETIF_CAPAB_INTERFACE_ADVISORY) == 0) {
2614 return;
2615 }
2616 nif->nif_intf_adv_config = NULL;
2617 nif->nif_intf_adv_prov_ctx = NULL;
2618 nif->nif_extended_capabilities &= ~NETIF_CAPAB_INTERFACE_ADVISORY;
2619 }
2620
/*
 * Query the provider for the queue-set extensions capability and, if
 * supported, record the steering-info callback and context on the netif.
 * Only meaningful for logical-link providers.
 */
static void
configure_capab_qset_extensions(struct nx_netif *nif,
    nxprov_capab_config_fn_t capab_fn)
{
	struct kern_nexus_capab_qset_extensions capab;
	struct kern_nexus *nx = nif->nif_nx;
	uint32_t capab_len;
	int error;

	if (!NX_LLINK_PROV(nx)) {
		DTRACE_SKYWALK1(not__llink__prov, struct nx_netif *, nif);
		return;
	}
	bzero(&capab, sizeof(capab));
	capab.cqe_version = KERN_NEXUS_CAPAB_QSET_EXTENSIONS_VERSION_1;
	capab_len = sizeof(capab);
	error = capab_fn(NX_PROV(nx), nx,
	    KERN_NEXUS_CAPAB_QSET_EXTENSIONS, &capab, &capab_len);
	if (error != 0) {
		/* provider doesn't support the capability; leave disabled */
		DTRACE_SKYWALK2(qset__extensions__capab__error,
		    struct nx_netif *, nif, int, error);
		return;
	}
	VERIFY(capab.cqe_notify_steering_info != NULL);
	VERIFY(capab.cqe_prov_ctx != NULL);
	nif->nif_qset_extensions.qe_notify_steering_info =
	    capab.cqe_notify_steering_info;
	nif->nif_qset_extensions.qe_prov_ctx = capab.cqe_prov_ctx;
	nif->nif_extended_capabilities |= NETIF_CAPAB_QSET_EXTENSIONS;
}
2651
2652 static void
unconfigure_capab_qset_extensions(struct nx_netif * nif)2653 unconfigure_capab_qset_extensions(struct nx_netif *nif)
2654 {
2655 if ((nif->nif_extended_capabilities & NETIF_CAPAB_QSET_EXTENSIONS) == 0) {
2656 return;
2657 }
2658 bzero(&nif->nif_qset_extensions, sizeof(nif->nif_qset_extensions));
2659 nif->nif_extended_capabilities &= ~NETIF_CAPAB_QSET_EXTENSIONS;
2660 }
2661
2662 int
nx_netif_notify_steering_info(struct nx_netif * nif,struct netif_qset * qset,struct ifnet_traffic_descriptor_common * td,bool add)2663 nx_netif_notify_steering_info(struct nx_netif *nif, struct netif_qset *qset,
2664 struct ifnet_traffic_descriptor_common *td, bool add)
2665 {
2666 struct netif_qset_extensions *qset_ext;
2667 int err;
2668
2669 if ((nif->nif_extended_capabilities & NETIF_CAPAB_QSET_EXTENSIONS) == 0) {
2670 return ENOTSUP;
2671 }
2672 qset_ext = &nif->nif_qset_extensions;
2673 VERIFY(qset_ext->qe_prov_ctx != NULL);
2674 VERIFY(qset_ext->qe_notify_steering_info != NULL);
2675 err = qset_ext->qe_notify_steering_info(qset_ext->qe_prov_ctx,
2676 qset->nqs_ctx, td, add);
2677 return err;
2678 }
2679
2680 static void
nx_netif_capabilities_init(struct nx_netif * nif)2681 nx_netif_capabilities_init(struct nx_netif *nif)
2682 {
2683 struct kern_nexus *nx = nif->nif_nx;
2684 nxprov_capab_config_fn_t capab_fn;
2685
2686 if ((NX_PROV(nx)->nxprov_netif_ext.nxnpi_version) ==
2687 KERN_NEXUS_PROVIDER_VERSION_NETIF) {
2688 capab_fn = NX_PROV(nx)->nxprov_netif_ext.nxnpi_config_capab;
2689 ASSERT(capab_fn != NULL);
2690 } else {
2691 capab_fn = NX_PROV(nx)->nxprov_ext.nxpi_config_capab;
2692 }
2693 if (capab_fn == NULL) {
2694 return;
2695 }
2696 configure_capab_interface_advisory(nif, capab_fn);
2697 configure_capab_qset_extensions(nif, capab_fn);
2698 }
2699
/*
 * Undo nx_netif_capabilities_init(); tears down both optional
 * capabilities (each unconfigure call is a no-op if never configured).
 */
static void
nx_netif_capabilities_fini(struct nx_netif *nif)
{
	unconfigure_capab_interface_advisory(nif);
	unconfigure_capab_qset_extensions(nif);
}
2706
2707 static void
nx_netif_verify_tso_config(struct nx_netif * nif,struct ifnet * ifp)2708 nx_netif_verify_tso_config(struct nx_netif *nif, struct ifnet *ifp)
2709 {
2710 uint32_t tso_v4_mtu = 0;
2711 uint32_t tso_v6_mtu = 0;
2712
2713 if ((nif->nif_hwassist & IFNET_TSO_IPV4) != 0) {
2714 tso_v4_mtu = ifp->if_tso_v4_mtu;
2715 }
2716 if ((nif->nif_hwassist & IFNET_TSO_IPV6) != 0) {
2717 tso_v6_mtu = ifp->if_tso_v6_mtu;
2718 }
2719 VERIFY(PP_BUF_SIZE_DEF(nif->nif_nx->nx_tx_pp) >=
2720 max(tso_v4_mtu, tso_v6_mtu));
2721 }
2722
/*
 * Finalizer invoked once the ifnet is fully attached.  Moves the
 * embryonic ifnet pointers parked in na_private (by nx_netif_attach())
 * over to na_ifp with I/O refcounts held, then initializes the
 * remaining per-netif subsystems.
 */
void
na_netif_finalize(struct nexus_netif_adapter *nifna, struct ifnet *ifp)
{
	struct nx_netif *nif = nifna->nifna_netif;
	struct kern_nexus *nx = nif->nif_nx;
	struct nexus_adapter *devna = nx_port_get_na(nx, NEXUS_PORT_NET_IF_DEV);
	struct nexus_adapter *hostna = nx_port_get_na(nx,
	    NEXUS_PORT_NET_IF_HOST);

	ASSERT(devna != NULL);
	ASSERT(hostna != NULL);

	/* the interface must be attached by the time we're called */
	if (!ifnet_is_attached(ifp, 1)) {
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	ASSERT(devna->na_private == ifp);
	ASSERT(devna->na_ifp == NULL);
	/* use I/O refcnt held by ifnet_is_attached() above */
	devna->na_ifp = devna->na_private;
	devna->na_private = NULL;

	ASSERT(hostna->na_private == ifp);
	ASSERT(hostna->na_ifp == NULL);
	/* host adapter needs its own I/O refcnt on the ifnet */
	hostna->na_ifp = hostna->na_private;
	hostna->na_private = NULL;
	ifnet_incr_iorefcnt(hostna->na_ifp);

	nx_netif_flags_init(nif);
	nx_netif_llink_init(nif);
	nx_netif_filter_init(nif);
	nx_netif_flow_init(nif);
	nx_netif_capabilities_init(nif);
	nx_netif_agent_init(nif);
	(void) nxctl_inet_traffic_rule_get_count(ifp->if_xname,
	    &ifp->if_traffic_rule_count);
	nx_netif_verify_tso_config(nif, ifp);
}
2763
/*
 * Reclaim cached memory belonging to this netif's adapters.
 *
 * @param nifna	the netif adapter.
 * @param ifp	interface (used only for logging; see #pragma below).
 * @param thres	inactivity threshold in seconds; must be nonzero.
 * @param low	TRUE under memory pressure — forces a full purge.
 */
void
nx_netif_reap(struct nexus_netif_adapter *nifna, struct ifnet *ifp,
    uint32_t thres, boolean_t low)
{
#pragma unused(ifp)
	struct nx_netif *nif = nifna->nifna_netif;
	struct kern_nexus *nx = nif->nif_nx;
	struct nexus_adapter *devna = nx_port_get_na(nx, NEXUS_PORT_NET_IF_DEV);
	uint64_t now = _net_uptime;
	boolean_t purge;

	ASSERT(thres != 0);

	/* adapter has never done any work; nothing to reap */
	if (devna->na_work_ts == 0) {
		return;
	}

	/*
	 * Purge if it has been inactive for some time (twice the drain
	 * threshold), and clear the work timestamp to temporarily skip this
	 * adapter until it's active again.  Purging cached objects can be
	 * expensive since we'd need to allocate and construct them again,
	 * so we do it only when necessary.
	 */
	if (low || (now - devna->na_work_ts) >= (thres << 1)) {
		devna->na_work_ts = 0;
		purge = TRUE;
	} else {
		purge = FALSE;
	}

	SK_DF(SK_VERB_NETIF, "%s: %s na %s", ifp->if_xname,
	    (purge ? "purging" : "pruning"), devna->na_name);

	/*
	 * Device and host adapters share the same packet buffer pool,
	 * so just reap the arena belonging to the device instance.
	 */
	skmem_arena_reap(devna->na_arena, purge);

	/*
	 * Reap any caches configured for classq.
	 */
	ifclassq_reap_caches(purge);
}
2809
2810 /*
2811 * The purpose of this callback is to forceably remove resources held by VPNAs
2812 * in event of an interface detach. Without this callback an application can
2813 * prevent the detach from completing indefinitely. Note that this is only needed
 * for low latency VPNAs. Userspace does get notified about interface detach events
2815 * for other NA types (custom ether and filter) and will do the necessary cleanup.
2816 * The cleanup is done in two phases:
2817 * 1) VPNAs channels are defuncted. This releases the resources held by VPNAs and
2818 * causes the device channel to be closed. All ifnet references held by VPNAs
2819 * are also released.
2820 * 2) This cleans up the netif nexus and releases the two remaining ifnet
2821 * references held by the device and host ports (nx_netif_clean()).
2822 */
void
nx_netif_detach_notify(struct nexus_netif_adapter *nifna)
{
	struct nx_netif *nif = nifna->nifna_netif;
	struct kern_nexus *nx = nif->nif_nx;
	struct kern_channel **ch_list = NULL;
	struct kern_channel *ch;
	int err, i, all_ch_cnt = 0, vp_ch_cnt = 0;
	struct proc *p;

	/* this path is only taken for low-latency netif instances */
	ASSERT(NETIF_IS_LOW_LATENCY(nif));
	/*
	 * kern_channel_defunct() requires sk_lock to be not held. We
	 * will first find the list of channels we want to defunct and
	 * then call kern_channel_defunct() on each of them. The number
	 * of channels cannot increase after sk_lock is released since
	 * this interface is being detached.
	 */
	SK_LOCK();
	all_ch_cnt = nx->nx_ch_count;
	if (all_ch_cnt == 0) {
		/* no channels at all: nothing to defunct */
		DTRACE_SKYWALK1(no__channel, struct kern_nexus *, nx);
		SK_UNLOCK();
		return;
	}
	/* Z_NOFAIL: allocation cannot return NULL, no need to check */
	ch_list = sk_alloc_type_array(struct kern_channel *, all_ch_cnt,
	    Z_WAITOK | Z_NOFAIL, skmem_tag_netif_temp);

	/* collect only the VPNA channels; other NA types clean up themselves */
	STAILQ_FOREACH(ch, &nx->nx_ch_head, ch_link) {
		struct nexus_adapter *na = ch->ch_na;

		if (na != NULL && na->na_type == NA_NETIF_VP) {
			ASSERT(vp_ch_cnt < all_ch_cnt);

			/* retain channel to prevent it from being freed */
			ch_retain_locked(ch);
			ch_list[vp_ch_cnt] = ch;
			DTRACE_SKYWALK3(vp__ch__found, struct kern_nexus *, nx,
			    struct kern_channel *, ch, struct nexus_adapter *, na);
			vp_ch_cnt++;
		}
	}
	if (vp_ch_cnt == 0) {
		DTRACE_SKYWALK1(vp__ch__not__found, struct kern_nexus *, nx);
		sk_free_type_array(struct kern_channel *, all_ch_cnt, ch_list);
		SK_UNLOCK();
		return;
	}
	/* prevents the netif from being freed */
	nx_netif_retain(nif);
	SK_UNLOCK();

	/* phase 1: defunct each retained VPNA channel without sk_lock held */
	for (i = 0; i < vp_ch_cnt; i++) {
		ch = ch_list[i];
		p = proc_find(ch->ch_pid);
		if (p == NULL) {
			/* owning process already gone; just drop our channel ref */
			SK_ERR("ch 0x%llx pid %d not found", SK_KVA(ch), ch->ch_pid);
			DTRACE_SKYWALK3(ch__pid__not__found, struct kern_nexus *, nx,
			    struct kern_channel *, ch, pid_t, ch->ch_pid);
			ch_release(ch);
			continue;
		}
		/*
		 * It is possible for the channel to be closed before defunct gets
		 * called. We need to get the fd lock here to ensure that the check
		 * for the closed state and the calling of channel defunct are done
		 * atomically.
		 */
		proc_fdlock(p);
		if ((ch->ch_flags & CHANF_ATTACHED) != 0) {
			kern_channel_defunct(p, ch);
		}
		proc_fdunlock(p);
		proc_rele(p);
		ch_release(ch);
	}
	sk_free_type_array(struct kern_channel *, all_ch_cnt, ch_list);

	/* phase 2: tear down the nexus state under sk_lock */
	SK_LOCK();
	/*
	 * Quiescing is not needed because:
	 * The defuncting above ensures that no more tx syncs could enter.
	 * The driver layer ensures that ifnet_detach() (this path) does not get
	 * called until RX upcalls have returned.
	 *
	 * Before sk_lock is reacquired above, userspace could close its channels
	 * and cause the nexus's destructor to be called. This is fine because we
	 * have retained the nif so it can't disappear.
	 */
	err = nx_netif_clean(nif, FALSE);
	if (err != 0) {
		SK_ERR("netif clean failed: err %d", err);
		DTRACE_SKYWALK2(nif__clean__failed, struct nx_netif *, nif, int, err);
	}
	nx_netif_release(nif);
	SK_UNLOCK();
}
2920
2921 void
nx_netif_copy_stats(struct nexus_netif_adapter * nifna,struct if_netif_stats * if_ns)2922 nx_netif_copy_stats(struct nexus_netif_adapter *nifna,
2923 struct if_netif_stats *if_ns)
2924 {
2925 struct nx_netif_mit *mit;
2926 struct mit_cfg_tbl *mit_cfg;
2927
2928 if ((mit = nifna->nifna_rx_mit) == NULL) {
2929 return;
2930 }
2931
2932 if ((mit->mit_flags & NETIF_MITF_INITIALIZED) == 0) {
2933 return;
2934 }
2935
2936 if_ns->ifn_rx_mit_interval = mit->mit_interval;
2937 if_ns->ifn_rx_mit_mode = mit->mit_mode;
2938 if_ns->ifn_rx_mit_packets_avg = mit->mit_packets_avg;
2939 if_ns->ifn_rx_mit_packets_min = mit->mit_packets_min;
2940 if_ns->ifn_rx_mit_packets_max = mit->mit_packets_max;
2941 if_ns->ifn_rx_mit_bytes_avg = mit->mit_bytes_avg;
2942 if_ns->ifn_rx_mit_bytes_min = mit->mit_bytes_min;
2943 if_ns->ifn_rx_mit_bytes_max = mit->mit_bytes_max;
2944 if_ns->ifn_rx_mit_cfg_idx = mit->mit_cfg_idx;
2945
2946 VERIFY(if_ns->ifn_rx_mit_cfg_idx < mit->mit_cfg_idx_max);
2947 mit_cfg = &mit->mit_tbl[if_ns->ifn_rx_mit_cfg_idx];
2948 if_ns->ifn_rx_mit_cfg_packets_lowat = mit_cfg->cfg_plowat;
2949 if_ns->ifn_rx_mit_cfg_packets_hiwat = mit_cfg->cfg_phiwat;
2950 if_ns->ifn_rx_mit_cfg_bytes_lowat = mit_cfg->cfg_blowat;
2951 if_ns->ifn_rx_mit_cfg_bytes_hiwat = mit_cfg->cfg_bhiwat;
2952 if_ns->ifn_rx_mit_cfg_interval = mit_cfg->cfg_ival;
2953 }
2954
2955 int
nx_netif_na_special(struct nexus_adapter * na,struct kern_channel * ch,struct chreq * chr,nxspec_cmd_t spec_cmd)2956 nx_netif_na_special(struct nexus_adapter *na, struct kern_channel *ch,
2957 struct chreq *chr, nxspec_cmd_t spec_cmd)
2958 {
2959 ASSERT(na->na_type == NA_NETIF_DEV ||
2960 na->na_type == NA_NETIF_COMPAT_DEV);
2961 return nx_netif_na_special_common(na, ch, chr, spec_cmd);
2962 }
2963
/*
 * Shared na_special implementation for netif device and host adapters
 * (native and compat).  Handles kernel-only connect/disconnect binding
 * as well as start/stop (drop-mode toggling) commands.
 * Returns 0 on success or an errno value.
 */
int
nx_netif_na_special_common(struct nexus_adapter *na, struct kern_channel *ch,
    struct chreq *chr, nxspec_cmd_t spec_cmd)
{
	int error = 0;

	ASSERT(na->na_type == NA_NETIF_DEV || na->na_type == NA_NETIF_HOST ||
	    na->na_type == NA_NETIF_COMPAT_DEV ||
	    na->na_type == NA_NETIF_COMPAT_HOST);
	SK_LOCK_ASSERT_HELD();

	switch (spec_cmd) {
	case NXSPEC_CMD_CONNECT:
		/*
		 * netif adapter isn't created exclusively for kernel.
		 * We mark (and clear) NAF_KERNEL_ONLY flag upon a successful
		 * na_special() connect and disconnect.
		 */
		if (NA_KERNEL_ONLY(na)) {
			/* already connected for kernel use */
			error = EBUSY;
			goto done;
		}
		ASSERT(!(na->na_flags & NAF_SPEC_INIT));

		/* set KERNEL_ONLY before binding; roll back on failure */
		atomic_bitset_32(&na->na_flags, NAF_KERNEL_ONLY);
		error = na_bind_channel(na, ch, chr);
		if (error != 0) {
			atomic_bitclear_32(&na->na_flags, NAF_KERNEL_ONLY);
			goto done;
		}
		atomic_bitset_32(&na->na_flags, NAF_SPEC_INIT);
		break;

	case NXSPEC_CMD_DISCONNECT:
		ASSERT(NA_KERNEL_ONLY(na));
		ASSERT(na->na_channels > 0);
		ASSERT(na->na_flags & NAF_SPEC_INIT);
		na_unbind_channel(ch);
		/* clear both flags set during CONNECT */
		atomic_bitclear_32(&na->na_flags,
		    (NAF_SPEC_INIT | NAF_KERNEL_ONLY));
		break;

	case NXSPEC_CMD_START:
		/* leave drop mode: allow packets to flow on the rings */
		na_kr_drop(na, FALSE);
		break;

	case NXSPEC_CMD_STOP:
		/* enter drop mode, then advise the provider of disconnect */
		na_kr_drop(na, TRUE);
		LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_NOTOWNED);
		lck_mtx_lock(&ch->ch_lock);
		nxprov_advise_disconnect(na->na_nx, ch);
		lck_mtx_unlock(&ch->ch_lock);
		break;

	default:
		error = EINVAL;
		break;
	}

done:
	SK_DF(error ? SK_VERB_ERROR : SK_VERB_NETIF,
	    "ch 0x%llx from na \"%s\" (0x%llx) naflags %b nx 0x%llx "
	    "spec_cmd %u (err %d)", SK_KVA(ch), na->na_name, SK_KVA(na),
	    na->na_flags, NAF_BITS, SK_KVA(ch->ch_nexus), spec_cmd, error);

	return error;
}
3031
3032 /*
3033 * Get a skywalk netif adapter for the port.
3034 */
3035 int
nx_netif_na_find(struct kern_nexus * nx,struct kern_channel * ch,struct chreq * chr,struct nxbind * nxb,struct proc * p,struct nexus_adapter ** nap,boolean_t create)3036 nx_netif_na_find(struct kern_nexus *nx, struct kern_channel *ch,
3037 struct chreq *chr, struct nxbind *nxb, struct proc *p,
3038 struct nexus_adapter **nap, boolean_t create)
3039 {
3040 #pragma unused(ch)
3041 struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
3042 boolean_t anon = NX_ANONYMOUS_PROV(nx);
3043 ch_endpoint_t ep = chr->cr_endpoint;
3044 nexus_port_t nx_port = chr->cr_port;
3045 struct nexus_adapter *na = NULL;
3046 struct ifnet *ifp;
3047 int err = 0;
3048
3049 SK_LOCK_ASSERT_HELD();
3050 *nap = NULL; /* default */
3051
3052 #if SK_LOG
3053 uuid_string_t uuidstr;
3054 SK_D("name \"%s\" spec_uuid \"%s\" port %d mode 0x%b pipe_id %u "
3055 "ring_id %d ring_set %u ep_type %u:%u create %u%s",
3056 chr->cr_name, sk_uuid_unparse(chr->cr_spec_uuid, uuidstr),
3057 (int)chr->cr_port, chr->cr_mode, CHMODE_BITS,
3058 chr->cr_pipe_id, (int)chr->cr_ring_id, chr->cr_ring_set,
3059 chr->cr_real_endpoint, chr->cr_endpoint, create,
3060 (ep != CH_ENDPOINT_NET_IF) ? " (skipped)" : "");
3061 #endif /* SK_LOG */
3062
3063 if (!create || ep != CH_ENDPOINT_NET_IF) {
3064 err = ENODEV;
3065 goto done;
3066 }
3067
3068 ASSERT(NX_DOM(nx)->nxdom_type == NEXUS_TYPE_NET_IF);
3069 if (nx_port_get_na(nx, NEXUS_PORT_NET_IF_DEV) == NULL) {
3070 err = ENXIO;
3071 goto done;
3072 }
3073 ifp = nif->nif_ifp;
3074 if (!(SKYWALK_CAPABLE(ifp))) {
3075 SK_ERR("interface %s is no longer usable", if_name(ifp));
3076 err = ENOTSUP;
3077 goto done;
3078 }
3079
3080 if (chr->cr_mode & CHMODE_LOW_LATENCY) {
3081 SK_ERR("low latency is not supported for netif channel");
3082 err = ENOTSUP;
3083 goto done;
3084 }
3085
3086 switch (nx_port) {
3087 case NEXUS_PORT_NET_IF_DEV:
3088 /*
3089 * We have to reject direct user open that's not explicitly
3090 * allowed because netif nexuses do not by default have
3091 * user memory regions.
3092 */
3093 if (p != kernproc &&
3094 (!skywalk_netif_direct_allowed(ifp->if_xname) ||
3095 (kauth_cred_issuser(kauth_cred_get()) == 0 &&
3096 (anon || nif->nif_dev_nxb == NULL || nxb == NULL ||
3097 !nxb_is_equal(nif->nif_dev_nxb, nxb))))) {
3098 DTRACE_SKYWALK2(direct__not__allowed, struct ifnet *,
3099 ifp, struct chreq *, chr);
3100 err = ENOTSUP;
3101 goto done;
3102 }
3103 if (chr->cr_mode & CHMODE_EVENT_RING) {
3104 SK_ERR("event ring is not supported for netif dev port channel");
3105 err = ENOTSUP;
3106 goto done;
3107 }
3108 na = nx_port_get_na(nx, NEXUS_PORT_NET_IF_DEV);
3109 break;
3110
3111 case NEXUS_PORT_NET_IF_HOST:
3112 if (p != kernproc) {
3113 err = ENOTSUP;
3114 goto done;
3115 }
3116 if (chr->cr_mode & CHMODE_EVENT_RING) {
3117 SK_ERR("event ring is not supported for netif host port channel");
3118 err = ENOTSUP;
3119 goto done;
3120 }
3121 na = nx_port_get_na(nx, NEXUS_PORT_NET_IF_HOST);
3122 break;
3123
3124 default:
3125 ASSERT(!(chr->cr_mode & CHMODE_CONFIG));
3126
3127 NETIF_WLOCK(nif);
3128 err = nx_port_alloc(nx, nx_port, nxb, &na, p);
3129 if (err != 0) {
3130 NETIF_WUNLOCK(nif);
3131 goto done;
3132 }
3133
3134 if (na == NULL) {
3135 if (chr->cr_mode & CHMODE_FILTER) {
3136 err = netif_filter_na_create(nx, chr, &na);
3137 } else {
3138 err = netif_vp_na_create(nx, chr, &na);
3139 }
3140 if (err != 0) {
3141 NETIF_WUNLOCK(nif);
3142 goto done;
3143 }
3144 err = nx_port_alloc(nx, nx_port, nxb, &na, p);
3145 if (err != 0) {
3146 NETIF_WUNLOCK(nif);
3147 goto done;
3148 }
3149 }
3150 NETIF_WUNLOCK(nif);
3151
3152 break;
3153 }
3154
3155 ASSERT(err == 0);
3156 ASSERT(na != NULL);
3157
3158 #if CONFIG_NEXUS_USER_PIPE
3159 if (NA_OWNED_BY_ANY(na) || na->na_next_pipe > 0) {
3160 #else /* !CONFIG_NEXUS_USER_PIPE */
3161 if (NA_OWNED_BY_ANY(na)) {
3162 #endif /* !CONFIG_NEXUS_USER_PIPE */
3163 err = EBUSY;
3164 na = NULL;
3165 goto done;
3166 }
3167
3168 *nap = na;
3169 na_retain_locked(na);
3170
3171 done:
3172 ASSERT(err != 0 || na != NULL);
3173 if (err) {
3174 SK_ERR("na not found, err(%d)", err);
3175 } else {
3176 SK_DF(SK_VERB_NETIF, "found na 0x%llu", na);
3177 }
3178 return err;
3179 }
3180
3181 /* na_krings_create callback for all netif device adapters */
3182 int
3183 nx_netif_dev_krings_create(struct nexus_adapter *na, struct kern_channel *ch)
3184 {
3185 int ret;
3186
3187 ASSERT(na->na_type == NA_NETIF_DEV ||
3188 na->na_type == NA_NETIF_COMPAT_DEV);
3189 /*
3190 * Allocate context structures for native netif only, for
3191 * IOSkywalkFamily to store its object references.
3192 */
3193 ret = na_rings_mem_setup(na, 0, (na->na_flags & NAF_NATIVE), ch);
3194
3195 /*
3196 * We mark CKRF_DROP for kernel-only rings (kernel channel
3197 * opened by the flowswitch, etc.) to prevent packets from
3198 * going thru until after the client of the kernel channel
3199 * has fully plumbed things on its side. For userland-facing
3200 * rings (regular channel opened to netif), this is not
3201 * required, and so don't mark CKRF_DROP there.
3202 */
3203 if (ret == 0 && NA_KERNEL_ONLY(na)) {
3204 na_kr_drop(na, TRUE);
3205 }
3206
3207 return ret;
3208 }
3209
/* na_krings_delete callback for netif device adapters; call with SK_LOCK held */
void
nx_netif_dev_krings_delete(struct nexus_adapter *na, struct kern_channel *ch,
    boolean_t defunct)
{
	ASSERT(na->na_type == NA_NETIF_DEV ||
	    na->na_type == NA_NETIF_COMPAT_DEV);

	/*
	 * Re-enter drop mode before tearing down ring memory;
	 * see comments in nx_netif_dev_krings_create().
	 */
	if (NA_KERNEL_ONLY(na)) {
		na_kr_drop(na, TRUE);
	}

	na_rings_mem_teardown(na, ch, defunct);
}
3225
3226 struct nx_netif *
3227 nx_netif_alloc(zalloc_flags_t how)
3228 {
3229 struct nx_netif *n;
3230
3231 SK_LOCK_ASSERT_HELD();
3232
3233 n = zalloc_flags(nx_netif_zone, how | Z_ZERO);
3234 if (n == NULL) {
3235 return NULL;
3236 }
3237
3238 NETIF_RWINIT(n);
3239 os_ref_init(&n->nif_refcnt, NULL);
3240 SK_DF(SK_VERB_MEM, "netif 0x%llx", SK_KVA(n));
3241
3242 return n;
3243 }
3244
3245 static void
3246 nx_netif_destroy(struct nx_netif *n)
3247 {
3248 ASSERT(n->nif_dev_nxb == NULL);
3249 ASSERT(n->nif_host_nxb == NULL);
3250 ASSERT(os_ref_get_count(&n->nif_refcnt) == 0);
3251 nx_netif_llink_config_free(n);
3252 SK_DF(SK_VERB_MEM, "netif 0x%llx", SK_KVA(n));
3253 NETIF_RWDESTROY(n);
3254 zfree(nx_netif_zone, n);
3255 }
3256
3257 void
3258 nx_netif_release(struct nx_netif *n)
3259 {
3260 SK_LOCK_ASSERT_HELD();
3261
3262 SK_DF(SK_VERB_MEM, "netif 0x%llx, refcnt %d", SK_KVA(n),
3263 os_ref_get_count(&n->nif_refcnt));
3264 if (os_ref_release(&n->nif_refcnt) == 0) {
3265 nx_netif_destroy(n);
3266 }
3267 }
3268
3269 void
3270 nx_netif_retain(struct nx_netif *n)
3271 {
3272 SK_LOCK_ASSERT_HELD();
3273
3274 /* retaining an object with a zero refcount is not allowed */
3275 ASSERT(os_ref_get_count(&n->nif_refcnt) >= 1);
3276 os_ref_retain(&n->nif_refcnt);
3277 SK_DF(SK_VERB_MEM, "netif 0x%llx, refcnt %d", SK_KVA(n),
3278 os_ref_get_count(&n->nif_refcnt));
3279 }
3280
/* Free entry point; simply drops the caller's reference. */
void
nx_netif_free(struct nx_netif *nif)
{
	nx_netif_release(nif);
}
3286
/*
 * Publish an interface advisory report into the shared-memory advisory
 * region and notify interested user channels.  If a flowswitch is attached
 * to this netif, the flowswitch's advisory region and nexus are used;
 * otherwise the netif's own.  Always returns 0.
 */
static int
nx_netif_interface_advisory_report(struct kern_nexus *nx,
    const struct ifnet_interface_advisory *advisory)
{
	struct kern_nexus *notify_nx;
	struct __kern_netif_intf_advisory *intf_adv;
	struct nx_netif *nif = NX_NETIF_PRIVATE(nx);

	if (nif->nif_fsw_nxadv != NULL) {
		/* flowswitch attached: route the advisory through it */
		ASSERT(nif->nif_fsw != NULL);
		intf_adv = &nif->nif_fsw_nxadv->_nxadv_intf_adv;
		notify_nx = nif->nif_fsw->fsw_nx;
	} else {
		intf_adv = &nif->nif_netif_nxadv->__kern_intf_adv;
		notify_nx = nif->nif_nx;
	}
	/*
	 * copy the advisory report in shared memory; the copy routine
	 * also computes a checksum readers can use to detect torn reads
	 */
	intf_adv->cksum = os_cpu_copy_in_cksum(advisory, &intf_adv->adv,
	    sizeof(*advisory), 0);
	STATS_INC(&nif->nif_stats, NETIF_STATS_IF_ADV_UPD_RECV);
	/*
	 * notify user channels on advisory report availability
	 */
	nx_interface_advisory_notify(notify_nx);
	return 0;
}
3315
3316 static errno_t
3317 nx_netif_interface_advisory_notify(void *kern_ctx,
3318 const struct ifnet_interface_advisory *advisory)
3319 {
3320 _CASSERT(offsetof(struct ifnet_interface_advisory, version) ==
3321 offsetof(struct ifnet_interface_advisory, header.version));
3322 _CASSERT(offsetof(struct ifnet_interface_advisory, direction) ==
3323 offsetof(struct ifnet_interface_advisory, header.direction));
3324 _CASSERT(offsetof(struct ifnet_interface_advisory, _reserved) ==
3325 offsetof(struct ifnet_interface_advisory, header.interface_type));
3326
3327 if (__improbable(kern_ctx == NULL || advisory == NULL)) {
3328 return EINVAL;
3329 }
3330 if (__improbable((advisory->header.version <
3331 IF_INTERFACE_ADVISORY_VERSION_MIN) ||
3332 (advisory->header.version > IF_INTERFACE_ADVISORY_VERSION_MAX))) {
3333 SK_ERR("Invalid advisory version %d", advisory->header.version);
3334 return EINVAL;
3335 }
3336 if (__improbable((advisory->header.direction !=
3337 IF_INTERFACE_ADVISORY_DIRECTION_TX) &&
3338 (advisory->header.direction !=
3339 IF_INTERFACE_ADVISORY_DIRECTION_RX))) {
3340 SK_ERR("Invalid advisory direction %d",
3341 advisory->header.direction);
3342 return EINVAL;
3343 }
3344 if (__improbable(((advisory->header.interface_type <
3345 IF_INTERFACE_ADVISORY_INTERFACE_TYPE_MIN) ||
3346 (advisory->header.interface_type >
3347 IF_INTERFACE_ADVISORY_INTERFACE_TYPE_MAX)) &&
3348 (advisory->header.version >= IF_INTERFACE_ADVISORY_VERSION_2))) {
3349 SK_ERR("Invalid advisory interface type %d",
3350 advisory->header.interface_type);
3351 return EINVAL;
3352 }
3353 return nx_netif_interface_advisory_report(kern_ctx, advisory);
3354 }
3355
3356 void
3357 nx_netif_config_interface_advisory(struct kern_nexus *nx, bool enable)
3358 {
3359 struct kern_nexus *nx_netif;
3360 struct nx_netif *nif;
3361
3362 if (NX_REJECT_ACT(nx) || (nx->nx_flags & NXF_CLOSED) != 0) {
3363 return;
3364 }
3365 if (NX_PROV(nx)->nxprov_params->nxp_type == NEXUS_TYPE_FLOW_SWITCH) {
3366 struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
3367 nx_netif = fsw->fsw_nifna->na_nx;
3368 } else {
3369 nx_netif = nx;
3370 }
3371 ASSERT(NX_PROV(nx_netif)->nxprov_params->nxp_type == NEXUS_TYPE_NET_IF);
3372 nif = NX_NETIF_PRIVATE(nx_netif);
3373 if (nif->nif_intf_adv_config != NULL) {
3374 nif->nif_intf_adv_config(nif->nif_intf_adv_prov_ctx, enable);
3375 }
3376 }
3377
3378 void
3379 nx_netif_get_interface_tso_capabilities(struct ifnet *ifp, uint32_t *tso_v4_mtu,
3380 uint32_t *tso_v6_mtu)
3381 {
3382 #pragma unused (ifp)
3383 *tso_v4_mtu = 0;
3384 *tso_v6_mtu = 0;
3385
3386 #ifdef XNU_TARGET_OS_OSX
3387 if (SKYWALK_CAPABLE(ifp) && SKYWALK_NATIVE(ifp)) {
3388 struct nx_netif *nif = NA(ifp)->nifna_netif;
3389
3390 if ((nif->nif_hwassist & IFNET_TSO_IPV4) != 0) {
3391 *tso_v4_mtu = ifp->if_tso_v4_mtu;
3392 }
3393 if ((nif->nif_hwassist & IFNET_TSO_IPV6) != 0) {
3394 *tso_v6_mtu = ifp->if_tso_v6_mtu;
3395 }
3396 }
3397 #endif /* XNU_TARGET_OS_OSX */
3398 }
3399
3400 /*
3401 * This function has no use anymore since we are now passing truncated packets
3402 * to filters. We keep this logic just in case we need to prevent certain
3403 * packets from being passed to filters.
3404 */
3405 static boolean_t
3406 packet_is_filterable(struct nexus_netif_adapter *nifna,
3407 struct __kern_packet *pkt)
3408 {
3409 #pragma unused (nifna, pkt)
3410 return TRUE;
3411 }
3412
/*
 * Split an RX packet chain into a chain of filter packets (converted via
 * nx_netif_pkt_to_filter_pkt) and a passthrough chain of packets that
 * bypass filtering.  Packets that fail conversion are dropped (stats for
 * those are updated inside nx_netif_pkt_to_filter_pkt).
 *
 * This function is only meant for supporting the RX path because the TX path
 * will not send packets > MTU size due to the disabling of TSO when filters
 * are enabled.
 */
static void
get_filterable_packets(struct nexus_netif_adapter *nifna,
    struct __kern_packet *pkt_chain, struct __kern_packet **fpkt_chain,
    struct __kern_packet **passthrough_chain)
{
	struct nx_netif *nif = nifna->nifna_netif;
	struct netif_stats *nifs = &nif->nif_stats;
	struct __kern_packet *pkt = pkt_chain, *next, *fpkt;
	struct __kern_packet *fpkt_head = NULL, *passthrough_head = NULL;
	/* tail pointers for O(1) append onto each output chain */
	struct __kern_packet **fpkt_tailp = &fpkt_head;
	struct __kern_packet **passthrough_tailp = &passthrough_head;
	int fcnt = 0, pcnt = 0, dcnt = 0;	/* filtered/passthrough/dropped */

	while (pkt != NULL) {
		/* unlink the packet before routing it to a chain */
		next = pkt->pkt_nextpkt;
		pkt->pkt_nextpkt = NULL;

		if (!packet_is_filterable(nifna, pkt)) {
			pcnt++;
			*passthrough_tailp = pkt;
			passthrough_tailp = &pkt->pkt_nextpkt;
			pkt = next;
			continue;
		}
		fpkt = nx_netif_pkt_to_filter_pkt(nifna, pkt, NETIF_CONVERT_RX);
		if (fpkt != NULL) {
			fcnt++;
			*fpkt_tailp = fpkt;
			fpkt_tailp = &fpkt->pkt_nextpkt;
		} else {
			/* conversion failed; original pkt already freed */
			dcnt++;
		}
		pkt = next;
	}
	*fpkt_chain = fpkt_head;
	*passthrough_chain = passthrough_head;

	/*
	 * No need to increment drop stats because that's already
	 * done in nx_netif_pkt_to_filter_pkt.
	 */
	STATS_ADD(nifs, NETIF_STATS_FILTER_RX_NOT_FILTERABLE, pcnt);
	DTRACE_SKYWALK6(filterable, struct nexus_netif_adapter *, nifna,
	    int, fcnt, int, pcnt, int, dcnt, struct __kern_packet *,
	    fpkt_head, struct __kern_packet *, passthrough_head);
}
3464
/*
 * RX dispatch for a netif adapter: routes an inbound packet chain through
 * (in order) the attached filters, the netif flow demux, and finally the
 * adapter's na_rx callback.  Packets consumed by a stage do not proceed
 * to the next; chains with no consumer are dropped with stats accounting.
 *
 * This is only used by ring-based notify functions for now.
 * When a qset-based notify becomes available, this function can be used
 * unmodified.
 */
void
netif_receive(struct nexus_netif_adapter *nifna,
    struct __kern_packet *pkt_chain, struct nexus_pkt_stats *stats)
{
	struct nx_netif *nif = nifna->nifna_netif;
	struct nexus_adapter *na = &nifna->nifna_up;
	struct netif_stats *nifs = &nif->nif_stats;
	int err, dropcnt, dropstat = -1;	/* -1: no specific drop stat */

	/* update our work timestamp */
	na->na_work_ts = _net_uptime;

	if (nif->nif_filter_cnt > 0) {
		/* filters attached: split the chain and inject filterables */
		struct __kern_packet *fpkt_chain = NULL;
		struct __kern_packet *passthrough_chain = NULL;

		get_filterable_packets(nifna, pkt_chain, &fpkt_chain,
		    &passthrough_chain);
		if (fpkt_chain != NULL) {
			(void) nx_netif_filter_inject(nifna, NULL, fpkt_chain,
			    NETIF_FILTER_RX | NETIF_FILTER_SOURCE);
		}
		if (passthrough_chain != NULL) {
			/* continue below with only the passthrough packets */
			pkt_chain = passthrough_chain;
		} else {
			return;
		}
	} else if (nx_netif_filter_default_drop != 0) {
		/* policy: drop all traffic when no filters are attached */
		DTRACE_SKYWALK2(rx__default__drop, struct nx_netif *, nif,
		    struct __kern_packet *, pkt_chain);
		dropstat = NETIF_STATS_FILTER_DROP_DEFAULT;
		goto drop;
	}
	if (nif->nif_flow_cnt > 0) {
		/* netif flows present: demux may consume some or all packets */
		struct __kern_packet *remain = NULL;

		err = nx_netif_demux(nifna, pkt_chain, &remain,
		    NETIF_FLOW_SOURCE);
		if (remain == NULL) {
			return;
		}
		pkt_chain = remain;
	}
	if (na->na_rx != NULL) {
		na->na_rx(na, pkt_chain, stats);
	} else {
		/* no upper-layer RX callback installed; drop the chain */
		DTRACE_SKYWALK2(no__rx__cb, struct nx_netif *, nif,
		    struct __kern_packet *, pkt_chain);
		dropstat = NETIF_STATS_DROP_NO_RX_CB;
		goto drop;
	}
	return;
drop:
	dropcnt = 0;
	nx_netif_free_packet_chain(pkt_chain, &dropcnt);
	if (dropstat != -1) {
		STATS_ADD(nifs, dropstat, dropcnt);
	}
	STATS_ADD(nifs, NETIF_STATS_DROP, dropcnt);
}
3530
/*
 * Token-bucket rate limiter over the ring slots in [begin, end).
 * Returns the (possibly reduced) end index up to which packets may be
 * consumed, and sets *rate_limited when the token bucket ran dry.
 * A rate of 0 disables limiting and returns end unchanged.
 *
 * NOTE(review): tokens are debited as pkt_length * 8, so `rate` appears
 * to be in bits per second — confirm against callers.
 */
static slot_idx_t
netif_rate_limit(struct __kern_channel_ring *r, uint64_t rate,
    slot_idx_t begin, slot_idx_t end, boolean_t *rate_limited)
{
	uint64_t elapsed;
	uint64_t now;
	struct __kern_packet *pkt;
	clock_sec_t sec;
	clock_usec_t usec;
	slot_idx_t i;

	if (__probable(rate == 0)) {
		return end;
	}

	/* init tbr if not so */
	if (__improbable(r->ckr_tbr_token == CKR_TBR_TOKEN_INVALID)) {
		/* bucket depth and initial fill are both `rate` */
		r->ckr_tbr_token = rate;
		r->ckr_tbr_depth = rate;
		r->ckr_tbr_last = mach_absolute_time();
	} else {
		/* refill tokens proportional to elapsed time, capped at depth */
		now = mach_absolute_time();
		elapsed = now - r->ckr_tbr_last;
		absolutetime_to_microtime(elapsed, &sec, &usec);
		r->ckr_tbr_token +=
		    ((sec * USEC_PER_SEC + usec) * rate / USEC_PER_SEC);
		if (__improbable(r->ckr_tbr_token > r->ckr_tbr_depth)) {
			r->ckr_tbr_token = r->ckr_tbr_depth;
		}
		r->ckr_tbr_last = now;
	}

	*rate_limited = FALSE;
	for (i = begin; i != end; i = SLOT_NEXT(i, r->ckr_lim)) {
		pkt = KR_KSD(r, i)->sd_pkt;
		if (__improbable(pkt == NULL)) {
			continue;
		}
		if (__improbable(r->ckr_tbr_token <= 0)) {
			/* out of tokens: truncate the consumable range here */
			end = i;
			*rate_limited = TRUE;
			break;
		}
		r->ckr_tbr_token -= pkt->pkt_length * 8;
	}

	SK_DF(SK_VERB_FSW | SK_VERB_RX, "ckr %p %s rate limited at %d",
	    r, r->ckr_name, i);

	return end;
}
3582
/*
 * Detach all packets from the ring slots between ckr_rhead and `end`
 * (exclusive) and return them as a singly-linked chain, advancing
 * ckr_rhead to `end` and ckr_rtail to the current ckr_ktail.
 */
SK_NO_INLINE_ATTRIBUTE
static struct __kern_packet *
consume_pkts(struct __kern_channel_ring *ring, slot_idx_t end)
{
	struct __kern_packet *pkt_chain = NULL, **tailp = &pkt_chain;
	slot_idx_t idx = ring->ckr_rhead;

	while (idx != end) {
		struct __kern_slot_desc *ksd = KR_KSD(ring, idx);
		struct __kern_packet *pkt = ksd->sd_pkt;

		/* slot packets must not already be chained */
		ASSERT(pkt->pkt_nextpkt == NULL);
		KR_SLOT_DETACH_METADATA(ring, ksd);
		/* append via tail pointer for O(1) chaining */
		*tailp = pkt;
		tailp = &pkt->pkt_nextpkt;
		idx = SLOT_NEXT(idx, ring->ckr_lim);
	}
	ring->ckr_rhead = end;
	ring->ckr_rtail = ring->ckr_ktail;
	return pkt_chain;
}
3604
/*
 * Default RX notify callback for a netif ring: syncs the ring, consumes
 * newly arrived packets (subject to rate limiting), and hands the chain
 * to netif_receive().  Returns 0 or an errno value.
 */
int
netif_rx_notify_default(struct __kern_channel_ring *ring, struct proc *p,
    uint32_t flags)
{
	struct nexus_adapter *hwna;
	struct nexus_netif_adapter *nifna;
	struct nx_netif *nif;
	struct __kern_packet *pkt_chain;
	struct nexus_pkt_stats stats;
	sk_protect_t protect;
	slot_idx_t ktail;
	int err = 0;

	KDBG((SK_KTRACE_NETIF_RX_NOTIFY_DEFAULT | DBG_FUNC_START),
	    SK_KVA(ring));

	ASSERT(ring->ckr_tx == NR_RX);
	ASSERT(!NA_KERNEL_ONLY(KRNA(ring)) || KR_KERNEL_ONLY(ring));

	/* serialize against other users of this ring */
	err = kr_enter(ring, ((flags & NA_NOTEF_CAN_SLEEP) != 0));
	if (err != 0) {
		/* not a serious error, so no need to be chatty here */
		SK_DF(SK_VERB_FSW,
		    "hwna \"%s\" (0x%llx) kr \"%s\" (0x%llx) krflags 0x%b "
		    "(%d)", KRNA(ring)->na_name, SK_KVA(KRNA(ring)),
		    ring->ckr_name, SK_KVA(ring), ring->ckr_flags,
		    CKRF_BITS, err);
		goto out;
	}
	if (__improbable(KR_DROP(ring))) {
		/* ring is in drop mode; nothing to deliver */
		kr_exit(ring);
		err = ENODEV;
		goto out;
	}
	hwna = KRNA(ring);
	nifna = NIFNA(hwna);
	nif = nifna->nifna_netif;
	if (__improbable(hwna->na_ifp == NULL)) {
		/* interface has gone away */
		kr_exit(ring);
		err = ENODEV;
		goto out;
	}
	protect = sk_sync_protect();
	err = ring->ckr_na_sync(ring, p, 0);
	if (err != 0 && err != EAGAIN) {
		goto put_out;
	}

	/* read the tail pointer once */
	ktail = ring->ckr_ktail;
	if (__improbable(ring->ckr_khead == ktail)) {
		SK_DF(SK_VERB_FSW | SK_VERB_NOTIFY | SK_VERB_RX,
		    "how strange, interrupt with no packets on hwna "
		    "\"%s\" (0x%llx)", KRNA(ring)->na_name, SK_KVA(KRNA(ring)));
		goto put_out;
	}
	/* optionally truncate the consumable range via the token bucket */
	ktail = netif_rate_limit(ring, nif->nif_input_rate, ring->ckr_rhead,
	    ktail, &ring->ckr_rate_limited);

	pkt_chain = consume_pkts(ring, ktail);
	if (pkt_chain != NULL) {
		netif_receive(nifna, pkt_chain, &stats);

		/* feed delivery stats into RX mitigation, if configured */
		if (ring->ckr_netif_mit_stats != NULL &&
		    stats.nps_pkts != 0 && stats.nps_bytes != 0) {
			ring->ckr_netif_mit_stats(ring, stats.nps_pkts,
			    stats.nps_bytes);
		}
	}

put_out:
	sk_sync_unprotect(protect);
	kr_exit(ring);

out:
	KDBG((SK_KTRACE_NETIF_RX_NOTIFY_DEFAULT | DBG_FUNC_END),
	    SK_KVA(ring), err);
	return err;
}
3684
/*
 * Fast-path RX notify: pulls packet chains directly from the driver via
 * nx_rx_sync_packets() (bypassing the dev ring slots) and delivers each
 * chain through netif_receive().  Returns 0 or an errno value.
 */
int
netif_rx_notify_fast(struct __kern_channel_ring *ring, struct proc *p,
    uint32_t flags)
{
#pragma unused(p, flags)
	sk_protect_t protect;
	struct nexus_adapter *hwna;
	struct nexus_pkt_stats stats = {};
	uint32_t i, count;
	int err = 0;

	KDBG((SK_KTRACE_NETIF_RX_NOTIFY_FAST | DBG_FUNC_START),
	    SK_KVA(ring));

	/* XXX
	 * sk_sync_protect() is not needed for this case because
	 * we are not using the dev ring. Unfortunately lots of
	 * macros used by fsw still require this.
	 */
	protect = sk_sync_protect();
	hwna = KRNA(ring);
	/* bound the scratch array by the RX ring's slot count */
	count = na_get_nslots(hwna, NR_RX);
	err = nx_rx_sync_packets(ring, ring->ckr_scratch, &count);
	if (__improbable(err != 0)) {
		SK_ERR("nx_rx_sync_packets failed: %d", err);
		DTRACE_SKYWALK2(rx__sync__packets__failed,
		    struct __kern_channel_ring *, ring, int, err);
		goto out;
	}
	DTRACE_SKYWALK1(chain__count, uint32_t, count);
	/* each scratch entry is an encoded pointer to a packet chain */
	for (i = 0; i < count; i++) {
		struct __kern_packet *pkt_chain;

		pkt_chain = SK_PTR_ADDR_KPKT(ring->ckr_scratch[i]);
		ASSERT(pkt_chain != NULL);
		netif_receive(NIFNA(KRNA(ring)), pkt_chain, &stats);

		/* feed delivery stats into RX mitigation, if configured */
		if (ring->ckr_netif_mit_stats != NULL &&
		    stats.nps_pkts != 0 && stats.nps_bytes != 0) {
			ring->ckr_netif_mit_stats(ring, stats.nps_pkts,
			    stats.nps_bytes);
		}
	}
out:
	sk_sync_unprotect(protect);
	KDBG((SK_KTRACE_NETIF_RX_NOTIFY_FAST | DBG_FUNC_END),
	    SK_KVA(ring), err);
	return err;
}
3734
3735
3736 /*
3737 * Configure the NA to operate in a particular mode.
3738 */
3739 static channel_ring_notify_t
3740 netif_hwna_get_notify(struct __kern_channel_ring *ring, netif_mode_t mode)
3741 {
3742 channel_ring_notify_t notify = NULL;
3743 boolean_t has_sync_pkts = (sk_rx_sync_packets != 0 &&
3744 nx_has_rx_sync_packets(ring));
3745
3746 if (mode == NETIF_MODE_FSW) {
3747 notify = (has_sync_pkts ? netif_rx_notify_fast :
3748 netif_rx_notify_default);
3749 } else if (mode == NETIF_MODE_LLW) {
3750 notify = (has_sync_pkts ? netif_llw_rx_notify_fast :
3751 netif_llw_rx_notify_default);
3752 }
3753 return notify;
3754 }
3755
3756
3757 static uint32_t
3758 netif_mode_to_flag(netif_mode_t mode)
3759 {
3760 uint32_t flag = 0;
3761
3762 if (mode == NETIF_MODE_FSW) {
3763 flag = NAF_MODE_FSW;
3764 } else if (mode == NETIF_MODE_LLW) {
3765 flag = NAF_MODE_LLW;
3766 }
3767 return flag;
3768 }
3769
/*
 * Install (set == TRUE) or remove (set == FALSE) the mode-specific RX
 * notify callbacks on every RX ring of the hardware adapter, and set or
 * clear the adapter's RX handler and NAF_MODE_* flag accordingly.  The
 * previous notify pointer is saved per-ring so it can be restored.
 */
static void
netif_hwna_config_mode(struct nexus_adapter *hwna, netif_mode_t mode,
    void (*rx)(struct nexus_adapter *, struct __kern_packet *,
    struct nexus_pkt_stats *), boolean_t set)
{
	uint32_t i;
	uint32_t flag;

	ASSERT(hwna->na_type == NA_NETIF_DEV ||
	    hwna->na_type == NA_NETIF_COMPAT_DEV);

	for (i = 0; i < na_get_nrings(hwna, NR_RX); i++) {
		struct __kern_channel_ring *kr = &NAKR(hwna, NR_RX)[i];
		channel_ring_notify_t notify = netif_hwna_get_notify(kr, mode);

		if (set) {
			/* save the current notify so clear can restore it */
			kr->ckr_save_notify = kr->ckr_netif_notify;
			kr->ckr_netif_notify = notify;
		} else {
			kr->ckr_netif_notify = kr->ckr_save_notify;
			kr->ckr_save_notify = NULL;
		}
	}
	if (set) {
		hwna->na_rx = rx;
		flag = netif_mode_to_flag(mode);
		atomic_bitset_32(&hwna->na_flags, flag);
	} else {
		/* clearing removes both possible mode flags */
		hwna->na_rx = NULL;
		atomic_bitclear_32(&hwna->na_flags,
		    (NAF_MODE_FSW | NAF_MODE_LLW));
	}
}
3803
3804 void
3805 netif_hwna_set_mode(struct nexus_adapter *hwna, netif_mode_t mode,
3806 void (*rx)(struct nexus_adapter *, struct __kern_packet *,
3807 struct nexus_pkt_stats *))
3808 {
3809 return netif_hwna_config_mode(hwna, mode, rx, TRUE);
3810 }
3811
3812 void
3813 netif_hwna_clear_mode(struct nexus_adapter *hwna)
3814 {
3815 return netif_hwna_config_mode(hwna, NETIF_MODE_NONE, NULL, FALSE);
3816 }
3817
/*
 * Deliver a packet chain (re-injected by a filter) into the adapter's
 * regular RX path via na_rx.  Drops the chain, with stats accounting,
 * when the adapter is not owned by a flowswitch or the ring is in
 * drop mode.
 */
static void
netif_inject_rx(struct nexus_adapter *na, struct __kern_packet *pkt_chain)
{
	struct nexus_netif_adapter *nifna = NIFNA(na);
	struct nx_netif *nif = nifna->nifna_netif;
	struct netif_stats *nifs = &nif->nif_stats;
	struct __kern_channel_ring *r;
	struct nexus_pkt_stats stats;
	sk_protect_t protect;
	boolean_t ring_drop = FALSE;
	int err, dropcnt;

	if (!NA_OWNED_BY_FSW(na)) {
		/* no flowswitch above us; nothing to deliver into */
		DTRACE_SKYWALK1(fsw__disabled, struct nexus_adapter *, na);
		goto fail;
	}
	ASSERT(na->na_rx != NULL);

	/*
	 * XXX
	 * This function is called when a filter injects a packet back to the
	 * regular RX path. We can assume the ring is 0 for now because RSS
	 * is not supported. This needs to be revisited when we add support for
	 * RSS.
	 */
	r = &na->na_rx_rings[0];
	ASSERT(r->ckr_tx == NR_RX);
	err = kr_enter(r, TRUE);	/* TRUE: may sleep waiting for the ring */
	VERIFY(err == 0);

	if (__improbable(KR_DROP(r))) {
		kr_exit(r);
		DTRACE_SKYWALK2(ring__drop, struct nexus_adapter *, na,
		    struct __kern_channel_ring *, r);
		ring_drop = TRUE;
		goto fail;
	}
	protect = sk_sync_protect();
	na->na_rx(na, pkt_chain, &stats);

	/* feed delivery stats into RX mitigation, if configured */
	if (r->ckr_netif_mit_stats != NULL &&
	    stats.nps_pkts != 0 && stats.nps_bytes != 0) {
		r->ckr_netif_mit_stats(r, stats.nps_pkts, stats.nps_bytes);
	}
	sk_sync_unprotect(protect);

	kr_exit(r);
	return;

fail:
	dropcnt = 0;
	nx_netif_free_packet_chain(pkt_chain, &dropcnt);
	if (ring_drop) {
		STATS_ADD(nifs, NETIF_STATS_DROP_KRDROP_MODE, dropcnt);
	}
	STATS_ADD(nifs, NETIF_STATS_DROP, dropcnt);
}
3875
3876 /*
3877 * This is called when an inbound packet has traversed all filters.
3878 */
3879 errno_t
3880 nx_netif_filter_rx_cb(struct nexus_netif_adapter *nifna,
3881 struct __kern_packet *fpkt_chain, uint32_t flags)
3882 {
3883 #pragma unused (flags)
3884 struct nx_netif *nif = nifna->nifna_netif;
3885 struct netif_stats *nifs = &nif->nif_stats;
3886 struct nexus_adapter *na = &nifna->nifna_up;
3887 struct __kern_packet *pkt_chain;
3888 int err;
3889
3890 pkt_chain = nx_netif_filter_pkt_to_pkt_chain(nifna,
3891 fpkt_chain, NETIF_CONVERT_RX);
3892 if (pkt_chain == NULL) {
3893 return ENOMEM;
3894 }
3895 if (nif->nif_flow_cnt > 0) {
3896 struct __kern_packet *remain = NULL;
3897
3898 err = nx_netif_demux(nifna, pkt_chain, &remain,
3899 NETIF_FLOW_INJECT);
3900 if (remain == NULL) {
3901 return err;
3902 }
3903 pkt_chain = remain;
3904 }
3905 if (na->na_rx != NULL) {
3906 netif_inject_rx(na, pkt_chain);
3907 } else {
3908 int dropcnt = 0;
3909 nx_netif_free_packet_chain(pkt_chain, &dropcnt);
3910 STATS_ADD(nifs,
3911 NETIF_STATS_FILTER_DROP_NO_RX_CB, dropcnt);
3912 STATS_ADD(nifs, NETIF_STATS_DROP, dropcnt);
3913 }
3914 return 0;
3915 }
3916
3917 /*
3918 * This is called when an outbound packet has traversed all filters.
3919 */
3920 errno_t
3921 nx_netif_filter_tx_cb(struct nexus_netif_adapter *nifna,
3922 struct __kern_packet *fpkt_chain, uint32_t flags)
3923 {
3924 #pragma unused (flags)
3925 struct nx_netif *nif = nifna->nifna_netif;
3926 struct nexus_adapter *na = &nifna->nifna_up;
3927 int err;
3928
3929 if (NETIF_IS_COMPAT(nif)) {
3930 struct mbuf *m_chain;
3931 mbuf_svc_class_t sc;
3932
3933 m_chain = nx_netif_filter_pkt_to_mbuf_chain(nifna,
3934 fpkt_chain, NETIF_CONVERT_TX);
3935 if (m_chain == NULL) {
3936 return ENOMEM;
3937 }
3938 /*
3939 * All packets in the chain have the same service class.
3940 * If the sc is missing or invalid, a valid value will be
3941 * returned.
3942 */
3943 sc = mbuf_get_service_class(m_chain);
3944 err = nx_netif_filter_tx_processed_mbuf_enqueue(nifna,
3945 sc, m_chain);
3946 } else {
3947 struct __kern_packet *pkt_chain;
3948 kern_packet_svc_class_t sc;
3949
3950 pkt_chain = nx_netif_filter_pkt_to_pkt_chain(nifna,
3951 fpkt_chain, NETIF_CONVERT_TX);
3952 if (pkt_chain == NULL) {
3953 return ENOMEM;
3954 }
3955 /*
3956 * All packets in the chain have the same service class.
3957 * If the sc is missing or invalid, a valid value will be
3958 * returned.
3959 */
3960 sc = kern_packet_get_service_class(SK_PKT2PH(pkt_chain));
3961 err = nx_netif_filter_tx_processed_pkt_enqueue(nifna,
3962 sc, pkt_chain);
3963 }
3964 /* Tell driver to resume dequeuing */
3965 ifnet_start(na->na_ifp);
3966 return err;
3967 }
3968
/*
 * No memory-region parameter adjustments are needed for netif virtual
 * port adapters; this stub exists to satisfy the adapter interface.
 */
void
nx_netif_vp_region_params_adjust(struct nexus_adapter *na,
    struct skmem_region_params *srp)
{
#pragma unused(na, srp)
}
3976
/*
 * Decide whether TX should go through the ifnet starter thread instead
 * of issuing a doorbell directly from the calling context.
 *
 * Returns true if the starter thread was (or will be) utilized, in
 * which case the caller must not issue a doorbell itself; returns
 * false when the caller may drive the doorbell inline.
 */
static bool
netif_use_starter_thread(struct ifnet *ifp, uint32_t flags)
{
#if (DEVELOPMENT || DEBUG)
	/* test knob: force every transmit through the starter thread */
	if (__improbable(nx_netif_force_ifnet_start != 0)) {
		ifnet_start(ifp);
		return true;
	}
#endif /* DEVELOPMENT || DEBUG */
	/*
	 * use starter thread in following conditions:
	 * - interface is not skywalk native
	 * - interface attached to virtual driver (ipsec, utun)
	 * - TBR is enabled
	 * - delayed start mechanism is in use
	 * - remaining stack space on the thread is not enough for driver
	 * - caller is in rx workloop context
	 * - caller is from the flowswitch path doing ARP resolving
	 * - caller requires the use of starter thread (stack usage)
	 */
	if (!SKYWALK_NATIVE(ifp) || NA(ifp) == NULL ||
	    !NA_IS_ACTIVE(&NA(ifp)->nifna_up) ||
	    ((NA(ifp)->nifna_up.na_flags & NAF_VIRTUAL_DEVICE) != 0) ||
	    IFCQ_TBR_IS_ENABLED(ifp->if_snd) ||
	    (ifp->if_eflags & IFEF_ENQUEUE_MULTI) ||
	    sk_is_rx_notify_protected() ||
	    sk_is_async_transmit_protected() ||
	    (sk_is_sync_protected() && (flags & NETIF_XMIT_FLAG_HOST) != 0)) {
		DTRACE_SKYWALK2(use__starter__thread, struct ifnet *, ifp,
		    uint32_t, flags);
		ifnet_start(ifp);
		return true;
	}
	lck_mtx_lock_spin(&ifp->if_start_lock);
	/* interface is flow controlled */
	if (__improbable(ifp->if_start_flags & IFSF_FLOW_CONTROLLED)) {
		lck_mtx_unlock(&ifp->if_start_lock);
		return true;
	}
	/* if starter thread is active, utilize it */
	if (ifp->if_start_active) {
		/* ask the already-running starter thread for another pass */
		ifp->if_start_req++;
		lck_mtx_unlock(&ifp->if_start_lock);
		return true;
	}
	lck_mtx_unlock(&ifp->if_start_lock);
	/* Check remaining stack space */
	if ((OSKernelStackRemaining() < NX_NETIF_MIN_DRIVER_STACK_SIZE)) {
		ifnet_start(ifp);
		return true;
	}
	return false;
}
4031
4032 void
4033 netif_transmit(struct ifnet *ifp, uint32_t flags)
4034 {
4035 if (netif_use_starter_thread(ifp, flags)) {
4036 return;
4037 }
4038 /*
4039 * If no longer attached, don't issue doorbell as ifp
4040 * is being destroyed; else hold an IO refcnt to
4041 * prevent the interface from being detached.
4042 */
4043 if (!ifnet_datamov_begin(ifp)) {
4044 return;
4045 }
4046 nx_netif_doorbell_internal(ifp, flags);
4047 /*
4048 * Release the IO refcnt taken above.
4049 */
4050 ifnet_datamov_end(ifp);
4051 }
4052
4053 static struct ifclassq *
4054 netif_get_default_ifcq(struct nexus_adapter *hwna)
4055 {
4056 struct nx_netif *nif;
4057 struct ifclassq *ifcq;
4058
4059 nif = NX_NETIF_PRIVATE(hwna->na_nx);
4060 if (NETIF_LLINK_ENABLED(nif)) {
4061 struct netif_qset *qset;
4062
4063 /*
4064 * Use the default ifcq for now.
4065 * In the future this could be chosen by the caller.
4066 */
4067 qset = nx_netif_get_default_qset_noref(nif);
4068 ASSERT(qset != NULL);
4069 ifcq = qset->nqs_ifcq;
4070 } else {
4071 ifcq = nif->nif_ifp->if_snd;
4072 }
4073 return ifcq;
4074 }
4075
/*
 * Dequeue up to pkt_limit/byte_limit worth of packets from the AQM
 * queues into *head.  A NULL ifcq means "use the default for hwna".
 * For driver-managed scheduling the dequeue is restricted to the given
 * service class.  *pkts_pending reports whether packets remain queued
 * after the dequeue; pkt_cnt/bytes (optional) receive the dequeued
 * totals.  Returns 0 or EAGAIN (asserted; no other errors expected).
 */
static errno_t
netif_deq_packets(struct nexus_adapter *hwna, struct ifclassq *ifcq,
    uint32_t pkt_limit, uint32_t byte_limit, struct __kern_packet **head,
    boolean_t *pkts_pending, kern_packet_svc_class_t sc,
    uint32_t *pkt_cnt, uint32_t *bytes, uint8_t qset_idx)
{
	classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
	struct ifnet *ifp = hwna->na_ifp;
	uint32_t pkts_cnt;
	uint32_t bytes_cnt;
	errno_t rc;

	ASSERT(ifp != NULL);
	ASSERT(ifp->if_output_sched_model < IFNET_SCHED_MODEL_MAX);
	ASSERT((pkt_limit != 0) && (byte_limit != 0));

	if (ifcq == NULL) {
		ifcq = netif_get_default_ifcq(hwna);
	}
	/* driver-managed model dequeues per service class */
	if (ifp->if_output_sched_model == IFNET_SCHED_MODEL_DRIVER_MANAGED) {
		rc = ifclassq_dequeue_sc(ifcq, (mbuf_svc_class_t)sc,
		    pkt_limit, byte_limit, &pkt_head, NULL, pkt_cnt, bytes, qset_idx);
	} else {
		rc = ifclassq_dequeue(ifcq, pkt_limit, byte_limit,
		    &pkt_head, NULL, pkt_cnt, bytes, qset_idx);
	}
	ASSERT((rc == 0) || (rc == EAGAIN));
	/* the chain must be kernel packets (or empty) */
	ASSERT((pkt_head.cp_ptype == QP_PACKET) || (pkt_head.cp_kpkt == NULL));

	/* re-check queue depth after the dequeue to report pending work */
	ifclassq_get_len(ifcq, (mbuf_svc_class_t)sc, qset_idx,
	    &pkts_cnt, &bytes_cnt);
	*pkts_pending = pkts_cnt > 0;

	*head = pkt_head.cp_kpkt;
	return rc;
}
4112
#if SK_LOG
/*
 * Hoisted out of line to reduce kernel stack footprint.
 *
 * Fix: the format string contained a duplicated
 * "\"%s\"(kh %u kt %u kl %u | rh %u rt %u)" group, leaving six
 * conversion specifiers with no matching arguments — undefined
 * behavior per C11 7.21.6.1.  The duplicate group is removed so the
 * eight specifiers match the eight arguments.
 */
SK_LOG_ATTRIBUTE
static void
netif_no_ring_space_log(const struct nexus_adapter *na,
    const kern_channel_ring_t ring)
{
	SK_DF(SK_VERB_SYNC | SK_VERB_TX,
	    "no ring space: na \"%s\" [%u] "
	    "\"%s\"(kh %u kt %u kl %u | rh %u rt %u)",
	    na->na_name, ring->ckr_ring_id,
	    ring->ckr_name, ring->ckr_khead,
	    ring->ckr_ktail, ring->ckr_klease,
	    ring->ckr_rhead, ring->ckr_rtail);
}
#endif /* SK_LOG */
4130
4131 /*
4132 * netif refill function for rings
4133 */
4134 errno_t
4135 netif_ring_tx_refill(const kern_channel_ring_t ring, uint32_t pkt_limit,
4136 uint32_t byte_limit, boolean_t tx_doorbell_ctxt, boolean_t *pkts_pending,
4137 boolean_t canblock)
4138 {
4139 struct nexus_adapter *hwna;
4140 struct ifnet *ifp;
4141 struct __kern_packet *head = NULL;
4142 sk_protect_t protect;
4143 errno_t rc = 0;
4144 errno_t sync_err = 0;
4145 uint32_t npkts = 0, consumed = 0;
4146 uint32_t flags;
4147 slot_idx_t idx, ktail;
4148 int ring_space = 0;
4149
4150 KDBG((SK_KTRACE_NETIF_RING_TX_REFILL | DBG_FUNC_START), SK_KVA(ring));
4151
4152 VERIFY(ring != NULL);
4153 hwna = KRNA(ring);
4154 ifp = hwna->na_ifp;
4155
4156 ASSERT(hwna->na_type == NA_NETIF_DEV);
4157 ASSERT(ring->ckr_tx == NR_TX);
4158 *pkts_pending = FALSE;
4159
4160 if (__improbable(pkt_limit == 0 || byte_limit == 0)) {
4161 SK_ERR("invalid limits plim %d, blim %d",
4162 pkt_limit, byte_limit);
4163 rc = EINVAL;
4164 goto out;
4165 }
4166
4167 if (__improbable(!IF_FULLY_ATTACHED(ifp))) {
4168 SK_ERR("hwna 0x%llx ifp %s (0x%llx), interface not attached",
4169 SK_KVA(hwna), if_name(ifp), SK_KVA(ifp));
4170 rc = ENXIO;
4171 goto out;
4172 }
4173
4174 if (__improbable((ifp->if_start_flags & IFSF_FLOW_CONTROLLED) != 0)) {
4175 SK_DF(SK_VERB_SYNC | SK_VERB_TX, "hwna 0x%llx ifp %s (0x%llx), "
4176 "flow control ON", SK_KVA(hwna), if_name(ifp), SK_KVA(ifp));
4177 rc = ENXIO;
4178 goto out;
4179 }
4180
4181 /*
4182 * if the ring is busy, it means another dequeue is in
4183 * progress, so ignore this request and return success.
4184 */
4185 if (kr_enter(ring, canblock) != 0) {
4186 rc = 0;
4187 goto out;
4188 }
4189 /* mark thread with sync-in-progress flag */
4190 protect = sk_sync_protect();
4191
4192 if (__improbable(KR_DROP(ring) ||
4193 !NA_IS_ACTIVE(ring->ckr_na))) {
4194 SK_ERR("hw-kr 0x%llx stopped", SK_KVA(ring));
4195 rc = ENXIO;
4196 goto done;
4197 }
4198
4199 idx = ring->ckr_rhead;
4200 ktail = ring->ckr_ktail;
4201 /* calculate available space on tx ring */
4202 ring_space = ktail - idx;
4203 if (ring_space < 0) {
4204 ring_space += ring->ckr_num_slots;
4205 }
4206 if (ring_space == 0) {
4207 struct ifclassq *ifcq;
4208
4209 /* no space in ring, driver should retry */
4210 #if SK_LOG
4211 if (__improbable((sk_verbose &
4212 (SK_VERB_SYNC | SK_VERB_TX)) != 0)) {
4213 netif_no_ring_space_log(hwna, ring);
4214 }
4215 #endif /* SK_LOG */
4216 ifcq = netif_get_default_ifcq(hwna);
4217 if (IFCQ_LEN(ifcq) != 0) {
4218 *pkts_pending = TRUE;
4219 }
4220 /*
4221 * We ran out of space in ring, most probably
4222 * because the driver is slow to drain its TX queue.
4223 * We want another doorbell to be generated as soon
4224 * as the TX notify completion happens; mark this
4225 * through ckr_pending_doorbell counter. Do this
4226 * regardless of whether there's any pending packet.
4227 */
4228 ring->ckr_pending_doorbell++;
4229 rc = EAGAIN;
4230 goto sync_ring;
4231 }
4232
4233 if ((uint32_t)ring_space < pkt_limit) {
4234 pkt_limit = ring_space;
4235 }
4236
4237 if (tx_doorbell_ctxt &&
4238 ((hwna->na_flags & NAF_VIRTUAL_DEVICE) == 0)) {
4239 pkt_limit = MIN(pkt_limit,
4240 nx_netif_doorbell_max_dequeue);
4241 }
4242
4243 rc = netif_deq_packets(hwna, NULL, pkt_limit, byte_limit,
4244 &head, pkts_pending, ring->ckr_svc, NULL, NULL, 0);
4245
4246 /*
4247 * There's room in ring; if we haven't dequeued everything,
4248 * mark ckr_pending_doorbell for the next TX notify to issue
4249 * a TX door bell; otherwise, clear it. The next packet that
4250 * gets enqueued will trigger a door bell again.
4251 */
4252 if (*pkts_pending) {
4253 ring->ckr_pending_doorbell++;
4254 } else if (ring->ckr_pending_doorbell != 0) {
4255 ring->ckr_pending_doorbell = 0;
4256 }
4257
4258 if (rc != 0) {
4259 /*
4260 * This is expected sometimes as the IOSkywalkFamily
4261 * errs on the side of caution to perform an extra
4262 * dequeue when multiple doorbells are pending;
4263 * nothing to dequeue, do a sync if there are slots
4264 * to reclaim else just return.
4265 */
4266 SK_DF(SK_VERB_SYNC | SK_VERB_TX,
4267 "nothing to dequeue, err %d", rc);
4268
4269 if ((uint32_t)ring_space == ring->ckr_lim) {
4270 goto done;
4271 } else {
4272 goto sync_ring;
4273 }
4274 }
4275 /* move the dequeued packets to tx ring */
4276 while (head != NULL && idx != ktail) {
4277 ASSERT(npkts <= pkt_limit);
4278 struct __kern_packet *pkt = head;
4279 KR_SLOT_ATTACH_METADATA(ring, KR_KSD(ring, idx),
4280 (struct __kern_quantum *)pkt);
4281 npkts++;
4282 if (__improbable(pkt->pkt_trace_id != 0)) {
4283 KDBG(SK_KTRACE_PKT_TX_AQM | DBG_FUNC_END, pkt->pkt_trace_id);
4284 KDBG(SK_KTRACE_PKT_TX_DRV | DBG_FUNC_START, pkt->pkt_trace_id);
4285 }
4286 idx = SLOT_NEXT(idx, ring->ckr_lim);
4287 head = pkt->pkt_nextpkt;
4288 pkt->pkt_nextpkt = NULL;
4289 }
4290
4291 /*
4292 * We checked for ring space earlier so the ring should have enough
4293 * space for the entire chain.
4294 */
4295 ASSERT(head == NULL);
4296 ring->ckr_rhead = idx;
4297
4298 sync_ring:
4299 flags = NA_SYNCF_NETIF;
4300 if (ring->ckr_pending_doorbell != 0) {
4301 flags |= (NA_SYNCF_NETIF_DOORBELL | NA_SYNCF_NETIF_ASYNC);
4302 }
4303
4304 ring->ckr_khead_pre = ring->ckr_khead;
4305 sync_err = ring->ckr_na_sync(ring, kernproc, flags);
4306 if (sync_err != 0 && sync_err != EAGAIN) {
4307 SK_ERR("unexpected sync err %d", sync_err);
4308 if (rc == 0) {
4309 rc = sync_err;
4310 }
4311 goto done;
4312 }
4313 /*
4314 * Verify that the driver has detached packets from the consumed slots.
4315 */
4316 idx = ring->ckr_khead_pre;
4317 consumed = 0;
4318 while (idx != ring->ckr_khead) {
4319 struct __kern_slot_desc *ksd = KR_KSD(ring, idx);
4320
4321 consumed++;
4322 VERIFY(!KSD_VALID_METADATA(ksd));
4323 idx = SLOT_NEXT(idx, ring->ckr_lim);
4324 }
4325 ring->ckr_khead_pre = ring->ckr_khead;
4326
4327 done:
4328 sk_sync_unprotect(protect);
4329 kr_exit(ring);
4330 out:
4331 KDBG((SK_KTRACE_NETIF_RING_TX_REFILL | DBG_FUNC_END),
4332 SK_KVA(ring), rc, 0, npkts);
4333
4334 return rc;
4335 }
4336
/*
 * Exponentially-weighted moving average, in integer arithmetic:
 * avg = ((avg << decay) - avg + new) >> decay, i.e. the old average
 * weighted by (2^decay - 1)/2^decay plus the new sample weighted by
 * 1/2^decay.  When no prior average exists (old == 0), seed with the
 * new sample directly.
 */
#define NQ_EWMA(old, new, decay) do { \
	u_int64_t _avg; \
	if (__probable((_avg = (old)) > 0)) \
	        _avg = (((_avg << (decay)) - _avg) + (new)) >> (decay); \
	else \
	        _avg = (new); \
	(old) = _avg; \
} while (0)
4345
/*
 * Account a transfer of pkt_count/byte_count on a netif queue:
 * always updates the interface-level packet/byte counters, and — when
 * the nq_stat_enable knob is set — also maintains per-queue transfer
 * statistics including EWMA rates over nq_accumulate_interval windows.
 */
static void
kern_netif_increment_queue_stats(kern_netif_queue_t queue,
    uint32_t pkt_count, uint32_t byte_count)
{
	struct netif_llink *llink = queue->nq_qset->nqs_llink;
	struct ifnet *ifp = llink->nll_nif->nif_ifp;
	/* direction determines which ifnet counters to bump */
	if ((queue->nq_flags & NETIF_QUEUE_IS_RX) == 0) {
		atomic_add_64(&ifp->if_data.ifi_opackets, pkt_count);
		atomic_add_64(&ifp->if_data.ifi_obytes, byte_count);
	} else {
		atomic_add_64(&ifp->if_data.ifi_ipackets, pkt_count);
		atomic_add_64(&ifp->if_data.ifi_ibytes, byte_count);
	}

	/* wake any data-threshold watchers */
	if (ifp->if_data_threshold != 0) {
		ifnet_notify_data_threshold(ifp);
	}

	uint64_t now;
	uint64_t diff_secs;
	struct netif_qstats *stats = &queue->nq_stats;

	/* detailed per-queue stats are opt-in via sysctl */
	if (nq_stat_enable == 0) {
		return;
	}

	if (__improbable(pkt_count == 0)) {
		return;
	}

	stats->nq_num_xfers++;
	stats->nq_total_bytes += byte_count;
	stats->nq_total_pkts += pkt_count;
	if (pkt_count > stats->nq_max_pkts) {
		stats->nq_max_pkts = pkt_count;
	}
	/* nq_min_pkts == 0 means "not yet sampled this window" */
	if (stats->nq_min_pkts == 0 ||
	    pkt_count < stats->nq_min_pkts) {
		stats->nq_min_pkts = pkt_count;
	}

	now = net_uptime();
	if (__probable(queue->nq_accumulate_start != 0)) {
		diff_secs = now - queue->nq_accumulate_start;
		/* window elapsed: fold the window into the rate averages */
		if (diff_secs >= nq_accumulate_interval) {
			uint64_t bps;
			uint64_t pps;
			uint64_t pps_ma;

			/* bytes per second */
			bps = queue->nq_accumulated_bytes / diff_secs;
			NQ_EWMA(stats->nq_bytes_ps_ma,
			    bps, nq_transfer_decay);
			stats->nq_bytes_ps = bps;

			/* pkts per second */
			pps = queue->nq_accumulated_pkts / diff_secs;
			pps_ma = stats->nq_pkts_ps_ma;
			NQ_EWMA(pps_ma, pps, nq_transfer_decay);
			stats->nq_pkts_ps_ma = (uint32_t)pps_ma;
			stats->nq_pkts_ps = (uint32_t)pps;

			/* start over */
			queue->nq_accumulate_start = now;
			queue->nq_accumulated_bytes = 0;
			queue->nq_accumulated_pkts = 0;

			stats->nq_min_pkts = 0;
			stats->nq_max_pkts = 0;
		}
	} else {
		/* first sample: open the accumulation window */
		queue->nq_accumulate_start = now;
	}
	queue->nq_accumulated_bytes += byte_count;
	queue->nq_accumulated_pkts += pkt_count;
}
4422
/*
 * Driver-facing RX enqueue for a logical-link netif queue: stages the
 * packet chain on the queue's pktq and, when the FLUSH flag is set,
 * delivers everything staged so far up through netif_receive().
 * Packets are dropped (and counted) if the llink is being destroyed.
 */
void
kern_netif_queue_rx_enqueue(kern_netif_queue_t queue, kern_packet_t ph_chain,
    uint32_t count, uint32_t flags)
{
#pragma unused (count)
	struct netif_queue *q = queue;
	struct netif_llink *llink = q->nq_qset->nqs_llink;
	struct __kern_packet *pkt_chain = SK_PTR_ADDR_KPKT(ph_chain);
	bool flush = ((flags & KERN_NETIF_QUEUE_RX_ENQUEUE_FLAG_FLUSH) != 0);
	struct pktq *pktq = &q->nq_pktq;
	struct netif_stats *nifs = &llink->nll_nif->nif_stats;
	struct nexus_pkt_stats stats;
	sk_protect_t protect;

	ASSERT((q->nq_flags & NETIF_QUEUE_IS_RX) != 0);
	/* llink torn down: drop the whole chain and account it */
	if (llink->nll_state == NETIF_LLINK_STATE_DESTROYED) {
		int drop_cnt = 0;

		pp_free_packet_chain(pkt_chain, &drop_cnt);
		STATS_ADD(nifs, NETIF_STATS_LLINK_RX_DROP_BAD_STATE, drop_cnt);
		return;
	}
	/* stage on the queue; delivered on the next flush */
	KPKTQ_ENQUEUE_LIST(pktq, pkt_chain);
	if (flush) {
		/* detach everything staged and deliver it upstream */
		pkt_chain = KPKTQ_FIRST(pktq);
		KPKTQ_INIT(pktq);

		/* mark thread as sync-in-progress around delivery */
		protect = sk_sync_protect();
		/* NOTE(review): assumes netif_receive fills stats — confirm */
		netif_receive(NA(llink->nll_nif->nif_ifp), pkt_chain, &stats);
		sk_sync_unprotect(protect);
		kern_netif_increment_queue_stats(queue, (uint32_t)stats.nps_pkts,
		    (uint32_t)stats.nps_bytes);
	}
}
4457
/*
 * Driver-facing TX dequeue for a logical-link netif queue: pulls up to
 * pkt_limit/byte_limit packets from the qset's ifcq for this queue's
 * service class.  *pending reports whether more packets remain queued;
 * *ph_chain receives the dequeued chain (only written when non-empty).
 * Returns ENXIO if the llink is being destroyed, otherwise the
 * netif_deq_packets() result (0 or EAGAIN).
 */
errno_t
kern_netif_queue_tx_dequeue(kern_netif_queue_t queue, uint32_t pkt_limit,
    uint32_t byte_limit, boolean_t *pending, kern_packet_t *ph_chain)
{
	struct netif_queue *q = queue;
	struct netif_llink *llink = q->nq_qset->nqs_llink;
	struct netif_stats *nifs = &llink->nll_nif->nif_stats;
	struct nexus_adapter *hwna;
	struct __kern_packet *pkt_chain = NULL;
	uint32_t bytes = 0, pkt_cnt = 0;
	errno_t rc;

	ASSERT((q->nq_flags & NETIF_QUEUE_IS_RX) == 0);
	if (llink->nll_state == NETIF_LLINK_STATE_DESTROYED) {
		STATS_INC(nifs, NETIF_STATS_LLINK_AQM_DEQ_BAD_STATE);
		return ENXIO;
	}
	hwna = &NA(llink->nll_nif->nif_ifp)->nifna_up;

	/* cap doorbell-context dequeues on real hardware */
	if (((hwna->na_flags & NAF_VIRTUAL_DEVICE) == 0) &&
	    sk_is_tx_notify_protected()) {
		pkt_limit = MIN(pkt_limit, nx_netif_doorbell_max_dequeue);
	}
	rc = netif_deq_packets(hwna, q->nq_qset->nqs_ifcq, pkt_limit,
	    byte_limit, &pkt_chain, pending, q->nq_svc, &pkt_cnt, &bytes,
	    q->nq_qset->nqs_idx);

	if (pkt_cnt > 0) {
		kern_netif_increment_queue_stats(queue, pkt_cnt, bytes);
	}
	if (pkt_chain != NULL) {
		*ph_chain = SK_PKT2PH(pkt_chain);
	}
	return rc;
}
4493
4494 errno_t
4495 kern_netif_qset_tx_queue_len(kern_netif_qset_t qset, uint32_t svc,
4496 uint32_t * pkts_cnt, uint32_t * bytes_cnt)
4497 {
4498 VERIFY(qset != NULL);
4499 VERIFY(pkts_cnt != NULL);
4500 VERIFY(bytes_cnt != NULL);
4501
4502 return ifclassq_get_len(qset->nqs_ifcq, svc, qset->nqs_idx, pkts_cnt,
4503 bytes_cnt);
4504 }
4505
4506 void
4507 kern_netif_set_qset_combined(kern_netif_qset_t qset)
4508 {
4509 VERIFY(qset != NULL);
4510 VERIFY(qset->nqs_ifcq != NULL);
4511
4512 ifclassq_set_grp_combined(qset->nqs_ifcq, qset->nqs_idx);
4513 }
4514
4515 void
4516 kern_netif_set_qset_separate(kern_netif_qset_t qset)
4517 {
4518 VERIFY(qset != NULL);
4519 VERIFY(qset->nqs_ifcq != NULL);
4520
4521 ifclassq_set_grp_separated(qset->nqs_ifcq, qset->nqs_idx);
4522 }
4523
4524 errno_t
4525 kern_nexus_netif_llink_add(struct kern_nexus *nx,
4526 struct kern_nexus_netif_llink_init *llink_init)
4527 {
4528 errno_t err;
4529 struct nx_netif *nif;
4530 struct netif_llink *llink;
4531 struct netif_stats *nifs;
4532
4533 VERIFY(nx != NULL);
4534 VERIFY(llink_init != NULL);
4535 VERIFY((nx->nx_flags & NXF_ATTACHED) != 0);
4536
4537 nif = NX_NETIF_PRIVATE(nx);
4538 nifs = &nif->nif_stats;
4539
4540 err = nx_netif_validate_llink_config(llink_init, false);
4541 if (err != 0) {
4542 SK_ERR("Invalid llink init params");
4543 STATS_INC(nifs, NETIF_STATS_LLINK_ADD_BAD_PARAMS);
4544 return err;
4545 }
4546
4547 err = nx_netif_llink_add(nif, llink_init, &llink);
4548 return err;
4549 }
4550
4551 errno_t
4552 kern_nexus_netif_llink_remove(struct kern_nexus *nx,
4553 kern_nexus_netif_llink_id_t llink_id)
4554 {
4555 struct nx_netif *nif;
4556
4557 VERIFY(nx != NULL);
4558 VERIFY((nx->nx_flags & NXF_ATTACHED) != 0);
4559
4560 nif = NX_NETIF_PRIVATE(nx);
4561 return nx_netif_llink_remove(nif, llink_id);
4562 }
4563
4564 errno_t
4565 kern_netif_queue_get_service_class(kern_netif_queue_t queue,
4566 kern_packet_svc_class_t *svc)
4567 {
4568 *svc = queue->nq_svc;
4569 return 0;
4570 }
4571