xref: /xnu-10002.1.13/bsd/skywalk/nexus/netif/nx_netif_compat.c (revision 1031c584a5e37aff177559b9f69dbd3c8c3fd30a)
1 /*
2  * Copyright (c) 2015-2022 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 /*
30  * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
31  *
32  * Redistribution and use in source and binary forms, with or without
33  * modification, are permitted provided that the following conditions
34  * are met:
35  *   1. Redistributions of source code must retain the above copyright
36  *      notice, this list of conditions and the following disclaimer.
37  *   2. Redistributions in binary form must reproduce the above copyright
38  *      notice, this list of conditions and the following disclaimer in the
39  *      documentation and/or other materials provided with the distribution.
40  *
41  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
42  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51  * SUCH DAMAGE.
52  */
53 
54 #include <skywalk/os_skywalk_private.h>
55 #include <skywalk/nexus/netif/nx_netif.h>
56 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
57 #include <mach/thread_act.h>
58 #include <kern/thread.h>
59 #include <kern/sched_prim.h>
60 
61 static void na_netif_compat_finalize(struct nexus_netif_adapter *,
62     struct ifnet *);
63 static errno_t nx_netif_compat_receive(struct ifnet *ifp, struct mbuf *m_head,
64     struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
65     boolean_t poll, struct thread *tp);
66 static int nx_netif_compat_catch_rx(struct nexus_netif_compat_adapter *na,
67     boolean_t enable);
68 static int nx_netif_compat_xmit_frame(struct nexus_adapter *, struct mbuf *,
69     struct __kern_packet *);
70 
71 static int nx_netif_compat_na_notify_tx(struct __kern_channel_ring *,
72     struct proc *, uint32_t);
73 static int nx_netif_compat_na_notify_rx(struct __kern_channel_ring *,
74     struct proc *, uint32_t);
75 static int nx_netif_compat_na_activate(struct nexus_adapter *,
76     na_activate_mode_t);
77 static int nx_netif_compat_na_txsync(struct __kern_channel_ring *,
78     struct proc *, uint32_t);
79 static int nx_netif_compat_na_rxsync(struct __kern_channel_ring *,
80     struct proc *, uint32_t);
81 static void nx_netif_compat_na_dtor(struct nexus_adapter *na);
82 
83 static void nx_netif_compat_tx_intr(struct ifnet *, enum txrx, uint32_t,
84     uint32_t *);
85 static inline struct mbuf *nx_netif_compat_ring_alloc(int, int, uint16_t);
86 static inline void nx_netif_compat_ring_free(struct mbuf *m);
87 static void nx_netif_compat_ringcb(caddr_t cl, uint32_t size, caddr_t arg);
88 
89 static uint32_t nx_netif_compat_tx_clean(struct netif_stats *nifs,
90     struct __kern_channel_ring *kring);
91 static void nx_netif_compat_set_tx_event(struct __kern_channel_ring *kring,
92     slot_idx_t khead);
93 
94 static struct nexus_netif_compat_adapter *na_netif_compat_alloc(zalloc_flags_t);
95 static void na_netif_compat_free(struct nexus_adapter *);
96 #if DEBUG || DEVELOPMENT
97 static struct mbuf *nx_netif_rx_split(struct mbuf *, uint32_t);
98 #endif /* DEBUG || DEVELOPMENT */
99 
100 #define MBUF_TXQ(m)     ((m)->m_pkthdr.pkt_flowid)
101 #define MBUF_RXQ(m)     ((m)->m_pkthdr.pkt_flowid)
102 
103 #define NMB_PROPF_TX_NOTIFY     0x1     /* generate transmit event */
104 #define NMB_FLAGS_MASK          0x0000ffff
105 #define NMB_INDEX_MASK          0xffff0000
106 #define NMB_GET_FLAGS(p)        (((uint32_t)(p) & NMB_FLAGS_MASK))
107 #define NMB_SET_FLAGS(p, f)     (((uint32_t)(p) & ~NMB_FLAGS_MASK) | (f))
108 #define NMB_GET_INDEX(p)        (((uint32_t)(p) & NMB_INDEX_MASK) >> 16)
109 #define NMB_SET_INDEX(p, i)     (((uint32_t)(p) & ~NMB_INDEX_MASK) | (i << 16))
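/*
 * Illustrative example (not part of the original source; values are
 * assumed): a cluster property word packs a 16-bit slot index in the
 * upper half and 16 bits of flags in the lower half.  Starting from
 * p = 0, NMB_SET_INDEX(p, 5) yields 0x00050000, and
 * NMB_SET_FLAGS(0x00050000, NMB_PROPF_TX_NOTIFY) yields 0x00050001;
 * NMB_GET_INDEX() and NMB_GET_FLAGS() then recover 5 and 0x1.
 */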
110 
111 static SKMEM_TYPE_DEFINE(na_netif_compat_zone, struct nexus_netif_compat_adapter);
112 
113 static int netif_tx_event_mode = 0;
114 
115 #if (DEVELOPMENT || DEBUG)
116 SYSCTL_EXTENSIBLE_NODE(_kern_skywalk_netif, OID_AUTO, compat,
117     CTLFLAG_RW | CTLFLAG_LOCKED,
118     0, "Skywalk netif Nexus legacy compatibility support");
119 SYSCTL_INT(_kern_skywalk_netif_compat, OID_AUTO, tx_event_mode,
120     CTLFLAG_RW | CTLFLAG_LOCKED, &netif_tx_event_mode, 0, "");
121 static uint32_t netif_rx_split = 0;
122 SYSCTL_UINT(_kern_skywalk_netif_compat, OID_AUTO, rx_split,
123     CTLFLAG_RW | CTLFLAG_LOCKED, &netif_rx_split, 0, "");
124 #endif /* !DEVELOPMENT && !DEBUG */
125 
126 struct kern_nexus_domain_provider nx_netif_compat_prov_s = {
127 	.nxdom_prov_name =              NEXUS_PROVIDER_NET_IF_COMPAT,
128 	.nxdom_prov_flags =             NXDOMPROVF_DEFAULT,
129 	.nxdom_prov_cb = {
130 		.dp_cb_init =           nx_netif_prov_init,
131 		.dp_cb_fini =           nx_netif_prov_fini,
132 		.dp_cb_params =         nx_netif_prov_params,
133 		/*
134 		 * We must be using the native netif handlers below,
135 		 * since we act as the default domain provider; see
136 		 * kern_nexus_register_domain_provider().
137 		 */
138 		.dp_cb_mem_new =        nx_netif_prov_mem_new,
139 		.dp_cb_config =         nx_netif_prov_config,
140 		.dp_cb_nx_ctor =        nx_netif_prov_nx_ctor,
141 		.dp_cb_nx_dtor =        nx_netif_prov_nx_dtor,
142 		.dp_cb_nx_mem_info =    nx_netif_prov_nx_mem_info,
143 		.dp_cb_nx_mib_get =     nx_netif_prov_nx_mib_get,
144 		.dp_cb_nx_stop =        nx_netif_prov_nx_stop,
145 	},
146 };
147 
148 struct nexus_ifnet_ops na_netif_compat_ops = {
149 	.ni_finalize = na_netif_compat_finalize,
150 	.ni_reap = nx_netif_reap,
151 	.ni_dequeue = nx_netif_compat_tx_dequeue,
152 	.ni_get_len = nx_netif_compat_tx_get_len,
153 };
154 
155 #define SKMEM_TAG_NETIF_COMPAT_MIT      "com.apple.skywalk.netif.compat.mit"
156 static SKMEM_TAG_DEFINE(skmem_tag_netif_compat_mit, SKMEM_TAG_NETIF_COMPAT_MIT);
157 
158 #define SKMEM_TAG_NETIF_COMPAT_POOL     "com.apple.skywalk.netif.compat.pool"
159 static SKMEM_TAG_DEFINE(skmem_tag_netif_compat_pool, SKMEM_TAG_NETIF_COMPAT_POOL);
160 
161 void
162 nx_netif_compat_init(struct nxdom *nxdom)
163 {
164 	_CASSERT(NETIF_COMPAT_MAX_MBUF_DATA_COPY <= NETIF_COMPAT_BUF_SIZE);
165 
166 	/*
167 	 * We want nxprov_create() coming from userland to use the
168 	 * netif_compat domain provider, so install it as default.
169 	 * This is verified by the caller.
170 	 */
171 	(void) nxdom_prov_add(nxdom, &nx_netif_compat_prov_s);
172 }
173 
174 void
175 nx_netif_compat_fini(void)
176 {
177 	(void) nxdom_prov_del(&nx_netif_compat_prov_s);
178 }
179 
180 static struct nexus_netif_compat_adapter *
181 na_netif_compat_alloc(zalloc_flags_t how)
182 {
183 	struct nexus_netif_compat_adapter *nca;
184 
185 	_CASSERT(offsetof(struct nexus_netif_compat_adapter, nca_up) == 0);
186 
187 	nca = zalloc_flags(na_netif_compat_zone, how | Z_ZERO);
188 	if (nca) {
189 		SK_DF(SK_VERB_MEM, "nca %p ALLOC", SK_KVA(nca));
190 	}
191 	return nca;
192 }
193 
194 static void
195 na_netif_compat_free(struct nexus_adapter *na)
196 {
197 	struct nexus_netif_compat_adapter *nca =
198 	    (struct nexus_netif_compat_adapter *)na;
199 
200 	SK_LOCK_ASSERT_HELD();
201 	ASSERT(na->na_refcount == 0);
202 
203 	SK_DF(SK_VERB_MEM, "nca [dev+host] %p FREE", SK_KVA(nca));
204 	bzero(nca, sizeof(*nca));
205 	zfree(na_netif_compat_zone, nca);
206 }
207 
208 /*
209  * Callback invoked when the device driver frees an mbuf used
210  * by skywalk to transmit a packet. This usually happens when
211  * the NIC notifies the driver that transmission is completed.
212  */
213 static void
214 nx_netif_compat_ringcb(caddr_t cl, uint32_t size, caddr_t arg)
215 {
216 #pragma unused(cl, size)
217 	struct mbuf *m = (void *)arg;
218 	struct ifnet *ifp = NULL;
219 	struct netif_stats *nifs = NULL;
220 	uintptr_t data; /* not used */
221 	uint32_t txq;
222 	errno_t err;
223 
224 	err = mbuf_get_tx_compl_data(m, (uintptr_t *)&ifp, &data);
225 	ASSERT(err == 0);
226 
227 	nifs = &NX_NETIF_PRIVATE(NA(ifp)->nifna_up.na_nx)->nif_stats;
228 	txq = MBUF_TXQ(m);
229 
230 	for (;;) {
231 		uint32_t p = 0, i, f;
232 
233 		(void) mbuf_cluster_get_prop(m, &p);
234 		f = NMB_GET_FLAGS(p);
235 		i = NMB_GET_INDEX(p);
236 
237 		SK_DF(SK_VERB_NETIF, "%s m 0x%llx txq %u i %u f 0x%x",
238 		    if_name(ifp), SK_KVA(m), MBUF_TXQ(m), i, f);
239 
240 		if (f & NMB_PROPF_TX_NOTIFY) {
241 			uint32_t pn;
242 
243 			f &= ~NMB_PROPF_TX_NOTIFY;
244 			pn = NMB_SET_FLAGS(p, f);
245 
246 			err = mbuf_cluster_set_prop(m, p, pn);
247 			if (err != 0) {
248 				if (err == EBUSY) {     /* try again */
249 					continue;
250 				}
251 				/* TODO: [email protected] -- what to do? */
252 				SK_ERR("Failed to clear TX_NOTIFY "
253 				    "m 0x%llx i %u err %d", SK_KVA(m), i, err);
254 			} else {
255 				nx_netif_compat_tx_intr(ifp, NR_TX, txq, NULL);
256 				SK_DF(SK_VERB_NETIF | SK_VERB_INTR | SK_VERB_TX,
257 				    "%s TX irq m 0x%llx txq %u i %u f 0x%x",
258 				    if_name(ifp), SK_KVA(m), MBUF_TXQ(m), i, f);
259 				STATS_INC(nifs, NETIF_STATS_TX_IRQ);
260 			}
261 		}
262 		break;
263 	}
264 }
265 
266 /* Hoisted out of line to reduce kernel stack footprint */
267 SK_NO_INLINE_ATTRIBUTE
268 static struct mbuf *
269 nx_netif_compat_ring_alloc(int how, int len, uint16_t idx)
270 {
271 	struct mbuf *m = NULL;
272 	size_t size = len;
273 	uint32_t i;
274 
275 	if (mbuf_ring_cluster_alloc(how, MBUF_TYPE_HEADER, &m,
276 	    nx_netif_compat_ringcb, &size) != 0) {
277 		return NULL;
278 	}
279 
280 	for (;;) {
281 		uint32_t p = 0, pn;
282 		int err;
283 
284 		(void) mbuf_cluster_get_prop(m, &p);
285 		pn = NMB_SET_FLAGS(p, 0);
286 		pn = NMB_SET_INDEX(pn, idx);
287 
288 		err = mbuf_cluster_set_prop(m, p, pn);
289 		if (err != 0) {
290 			if (err == EBUSY) {     /* try again */
291 				continue;
292 			}
293 			SK_ERR("Failed to initialize properties m 0x%llx "
294 			    "err %d", SK_KVA(m), err);
295 			m_freem(m);
296 			return NULL;
297 		}
298 		(void) mbuf_cluster_get_prop(m, &p);
299 		i = NMB_GET_INDEX(p);
300 		ASSERT(i == idx);
301 		break;
302 	}
303 
304 	SK_DF(SK_VERB_MEM, "alloc m 0x%llx size %u i %u",
305 	    SK_KVA(m), (uint32_t)size, i);
306 
307 	return m;
308 }
309 
310 /* Hoisted out of line to reduce kernel stack footprint */
311 SK_NO_INLINE_ATTRIBUTE
312 static void
313 nx_netif_compat_ring_free(struct mbuf *m)
314 {
315 	if (m == NULL) {
316 		return;
317 	}
318 
319 	for (;;) {
320 		uint32_t p = 0;
321 		int err;
322 
323 		(void) mbuf_cluster_get_prop(m, &p);
324 		err = mbuf_cluster_set_prop(m, p, 0);
325 		if (err != 0) {
326 			if (err == EBUSY) {     /* try again */
327 				continue;
328 			}
329 			/* TODO: [email protected] -- what to do? */
330 			SK_ERR("Failed to clear properties m 0x%llx err %d",
331 			    SK_KVA(m), err);
332 		}
333 		break;
334 	}
335 	m_freem(m);
336 }
337 
338 static void
339 nx_netif_compat_tx_intr(struct ifnet *ifp, enum txrx t, uint32_t q,
340     uint32_t *work_done)
341 {
342 	struct nexus_adapter *na = &NA(ifp)->nifna_up;
343 
344 	if (__improbable(!NA_IS_ACTIVE(na) || q >= na_get_nrings(na, t))) {
345 		if (q >= na_get_nrings(na, t)) {
346 			SK_ERR("na \"%s\" (0x%llx) invalid q %u >= %u",
347 			    na->na_name, SK_KVA(na), q, na_get_nrings(na, t));
348 		}
349 	} else {
350 		(void) nx_netif_mit_tx_intr((NAKR(na, t) + q), kernproc,
351 		    0, work_done);
352 	}
353 }
354 
355 static int
356 nx_netif_compat_na_notify_tx(struct __kern_channel_ring *kring,
357     struct proc *p, uint32_t flags)
358 {
359 	/*
360 	 * This should never get executed, as nothing should be invoking
361 	 * the TX ring notify callback.  The compat adapter directly
362 	 * calls nx_netif_compat_tx_intr() for TX completion from within
363 	 * nx_netif_compat_ringcb().
364 	 *
365 	 * If we ever get here, use the original na_notify callback
366 	 * saved during na_activate().
367 	 */
368 	return kring->ckr_netif_notify(kring, p, flags);
369 }
370 
371 static int
372 nx_netif_compat_na_notify_rx(struct __kern_channel_ring *kring,
373     struct proc *p, uint32_t flags)
374 {
375 	/*
376 	 * This should never get executed, as nothing should be invoking
377 	 * the RX ring notify callback.  The compat adapter directly
378 	 * calls nx_netif_mit_rx_intr() for RX completion from within
379 	 * nx_netif_compat_receive().
380 	 *
381 	 * If we ever get here, use the original na_notify callback
382 	 * saved during na_activate().
383 	 */
384 	return kring->ckr_netif_notify(kring, p, flags);
385 }
386 
387 /* Enable/disable skywalk mode for a compat network interface. */
388 static int
389 nx_netif_compat_na_activate(struct nexus_adapter *na, na_activate_mode_t mode)
390 {
391 	struct nexus_netif_adapter *nifna = (struct nexus_netif_adapter *)na;
392 	boolean_t tx_mit, rx_mit, tx_mit_simple, rx_mit_simple, rxpoll;
393 	uint32_t limit = (uint32_t)sk_netif_compat_rx_mbq_limit;
394 	struct nx_netif *nif = nifna->nifna_netif;
395 	struct nexus_netif_compat_adapter *nca;
396 	ifnet_t ifp = na->na_ifp;
397 	uint32_t i, r;
398 	int error;
399 
400 	ASSERT(na->na_type == NA_NETIF_COMPAT_DEV);
401 	ASSERT(!(na->na_flags & NAF_HOST_ONLY));
402 
403 	SK_DF(SK_VERB_NETIF, "na \"%s\" (0x%llx) %s", na->na_name,
404 	    SK_KVA(na), na_activate_mode2str(mode));
405 
406 	nca = (struct nexus_netif_compat_adapter *)nifna;
407 
408 	switch (mode) {
409 	case NA_ACTIVATE_MODE_ON:
410 		ASSERT(SKYWALK_CAPABLE(na->na_ifp));
411 
412 		nx_netif_mit_config(nifna, &tx_mit, &tx_mit_simple,
413 		    &rx_mit, &rx_mit_simple);
414 
415 		/*
416 		 * Init the mitigation support on all the dev TX rings.
417 		 */
418 		if (na_get_nrings(na, NR_TX) != 0 && tx_mit) {
419 			nifna->nifna_tx_mit =
420 			    skn_alloc_type_array(tx_on, struct nx_netif_mit,
421 			    na_get_nrings(na, NR_TX), Z_WAITOK,
422 			    skmem_tag_netif_compat_mit);
423 			if (nifna->nifna_tx_mit == NULL) {
424 				SK_ERR("TX mitigation allocation failed");
425 				error = ENOMEM;
426 				goto out;
427 			}
428 		} else {
429 			ASSERT(nifna->nifna_tx_mit == NULL);
430 		}
431 
432 		/*
433 		 * Init either poller or mitigation support on all the
434 		 * dev RX rings; they're mutually exclusive and poller
435 		 * takes precedence.
436 		 */
437 		rxpoll = (net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL));
438 		if (rxpoll) {
439 			int err;
440 			__unused kern_return_t kret;
441 			thread_precedence_policy_data_t info;
442 
443 			ASSERT((ifp->if_xflags & IFXF_LEGACY) == 0);
444 			ASSERT(ifp->if_input_poll != NULL);
445 			ASSERT(ifp->if_input_ctl != NULL);
446 			if ((err =
447 			    kernel_thread_start(netif_rxpoll_compat_thread_func,
448 			    ifp, &ifp->if_poll_thread)) != KERN_SUCCESS) {
449 				panic_plain("%s: ifp=%p couldn't get a poll "
450 				    " thread; err=%d", __func__, ifp, err);
451 				/* NOTREACHED */
452 				__builtin_unreachable();
453 			}
454 			VERIFY(ifp->if_poll_thread != NULL);
455 
456 			/* wait until thread is ready */
457 			lck_mtx_lock(&ifp->if_poll_lock);
458 			while (!(ifp->if_poll_flags & IF_POLLF_READY)) {
459 				(void) assert_wait(&ifp->if_poll_flags,
460 				    THREAD_UNINT);
461 				lck_mtx_unlock(&ifp->if_poll_lock);
462 				(void) thread_block(THREAD_CONTINUE_NULL);
463 				lck_mtx_lock(&ifp->if_poll_lock);
464 			}
465 			lck_mtx_unlock(&ifp->if_poll_lock);
466 
467 			bzero(&info, sizeof(info));
468 			info.importance = 1;
469 			kret = thread_policy_set(ifp->if_poll_thread,
470 			    THREAD_PRECEDENCE_POLICY, (thread_policy_t)&info,
471 			    THREAD_PRECEDENCE_POLICY_COUNT);
472 			ASSERT(kret == KERN_SUCCESS);
473 			limit = if_rcvq_maxlen;
474 			(void) netif_rxpoll_set_params(ifp, NULL, FALSE);
475 			ASSERT(nifna->nifna_rx_mit == NULL);
476 		} else if (rx_mit) {
477 			nifna->nifna_rx_mit =
478 			    skn_alloc_type_array(rx_on, struct nx_netif_mit,
479 			    na_get_nrings(na, NR_RX), Z_WAITOK,
480 			    skmem_tag_netif_compat_mit);
481 			if (nifna->nifna_rx_mit == NULL) {
482 				SK_ERR("RX mitigation allocation failed");
483 				if (nifna->nifna_tx_mit != NULL) {
484 					skn_free_type_array(rx_fail,
485 					    struct nx_netif_mit,
486 					    na_get_nrings(na, NR_TX),
487 					    nifna->nifna_tx_mit);
488 					nifna->nifna_tx_mit = NULL;
489 				}
490 				error = ENOMEM;
491 				goto out;
492 			}
493 		}
494 
495 		/* intercept na_notify callback on the TX rings */
496 		for (r = 0; r < na_get_nrings(na, NR_TX); r++) {
497 			na->na_tx_rings[r].ckr_netif_notify =
498 			    na->na_tx_rings[r].ckr_na_notify;
499 			na->na_tx_rings[r].ckr_na_notify =
500 			    nx_netif_compat_na_notify_tx;
501 			if (nifna->nifna_tx_mit != NULL) {
502 				nx_netif_mit_init(nif, na->na_ifp,
503 				    &nifna->nifna_tx_mit[r],
504 				    &na->na_tx_rings[r], tx_mit_simple);
505 			}
506 		}
507 
508 		/* intercept na_notify callback on the RX rings */
509 		for (r = 0; r < na_get_nrings(na, NR_RX); r++) {
510 			na->na_rx_rings[r].ckr_netif_notify =
511 			    na->na_rx_rings[r].ckr_na_notify;
512 			na->na_rx_rings[r].ckr_na_notify =
513 			    nx_netif_compat_na_notify_rx;
514 			if (nifna->nifna_rx_mit != NULL) {
515 				nx_netif_mit_init(nif, na->na_ifp,
516 				    &nifna->nifna_rx_mit[r],
517 				    &na->na_rx_rings[r], rx_mit_simple);
518 			}
519 		}
520 		/*
521 		 * Initialize the rx queue, as nx_netif_compat_receive() can
522 		 * be called as soon as nx_netif_compat_catch_rx() returns.
523 		 */
524 		for (r = 0; r < na_get_nrings(na, NR_RX); r++) {
525 			struct __kern_channel_ring *kr = &na->na_rx_rings[r];
526 
527 			nx_mbq_safe_init(kr, &kr->ckr_rx_queue, limit,
528 			    &nexus_mbq_lock_group, &nexus_lock_attr);
529 			SK_DF(SK_VERB_NETIF,
530 			    "na \"%s\" (0x%llx) initialized kr \"%s\" "
531 			    "(0x%llx) krflags 0x%b", na->na_name, SK_KVA(na),
532 			    kr->ckr_name, SK_KVA(kr), kr->ckr_flags, CKRF_BITS);
533 		}
534 
535 		/*
536 		 * Prepare packet buffers for the tx rings; don't preallocate
537 		 * the mbufs here, leave this to nx_netif_compat_na_txsync().
538 		 */
539 		for (r = 0; r < na_get_nrings(na, NR_TX); r++) {
540 			na->na_tx_rings[r].ckr_tx_pool = NULL;
541 		}
542 
543 		for (r = 0; r < na_get_nrings(na, NR_TX); r++) {
544 			na->na_tx_rings[r].ckr_tx_pool =
545 			    skn_alloc_type_array(tx_pool_on, struct mbuf *,
546 			    na_get_nslots(na, NR_TX), Z_WAITOK,
547 			    skmem_tag_netif_compat_pool);
548 			if (na->na_tx_rings[r].ckr_tx_pool == NULL) {
549 				SK_ERR("ckr_tx_pool allocation failed");
550 				error = ENOMEM;
551 				goto free_tx_pools;
552 			}
553 		}
554 
555 		/* Prepare to intercept incoming traffic. */
556 		error = nx_netif_compat_catch_rx(nca, TRUE);
557 		if (error != 0) {
558 			SK_ERR("RX intercept failed (%d)", error);
559 			goto uncatch;
560 		}
561 		nx_netif_filter_enable(nifna->nifna_netif);
562 		nx_netif_flow_enable(nifna->nifna_netif);
563 		os_atomic_or(&na->na_flags, NAF_ACTIVE, relaxed);
564 		break;
565 
566 	case NA_ACTIVATE_MODE_DEFUNCT:
567 		ASSERT(SKYWALK_CAPABLE(na->na_ifp));
568 		break;
569 
570 	case NA_ACTIVATE_MODE_OFF:
571 		/*
572 		 * Note that here we cannot assert SKYWALK_CAPABLE()
573 		 * as we're called in the destructor path.
574 		 */
575 		os_atomic_andnot(&na->na_flags, NAF_ACTIVE, relaxed);
576 		nx_netif_flow_disable(nifna->nifna_netif);
577 		nx_netif_filter_disable(nifna->nifna_netif);
578 
579 		/*
580 		 * Signal the poller thread to terminate itself, and
581 		 * wait for it to exit.
582 		 */
583 		if (ifp->if_poll_thread != THREAD_NULL) {
584 			ASSERT(net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL));
585 			ASSERT((ifp->if_xflags & IFXF_LEGACY) == 0);
586 			lck_mtx_lock_spin(&ifp->if_poll_lock);
587 			ifp->if_poll_flags |= IF_POLLF_TERMINATING;
588 			wakeup_one((caddr_t)&ifp->if_poll_thread);
589 			lck_mtx_unlock(&ifp->if_poll_lock);
590 
591 			/* wait for poller thread to terminate */
592 			lck_mtx_lock(&ifp->if_poll_lock);
593 			while (ifp->if_poll_thread != THREAD_NULL) {
594 				SK_DF(SK_VERB_NETIF_POLL,
595 				    "%s: waiting for poller thread to terminate",
596 				    if_name(ifp));
597 				(void) msleep(&ifp->if_poll_thread,
598 				    &ifp->if_poll_lock, (PZERO - 1),
599 				    "netif_poll_thread_exit", NULL);
600 			}
601 			lck_mtx_unlock(&ifp->if_poll_lock);
602 			SK_DF(SK_VERB_NETIF_POLL,
603 			    "%s: poller thread termination complete",
604 			    if_name(ifp));
605 		}
606 
607 		/* Do not intercept packets on the rx path. */
608 		(void) nx_netif_compat_catch_rx(nca, FALSE);
609 
610 		/* Free the mbufs going to the channel rings */
611 		for (r = 0; r < na_get_nrings(na, NR_RX); r++) {
612 			nx_mbq_safe_purge(&na->na_rx_rings[r].ckr_rx_queue);
613 			nx_mbq_safe_destroy(&na->na_rx_rings[r].ckr_rx_queue);
614 		}
615 
616 		/* reset all TX notify callbacks */
617 		for (r = 0; r < na_get_nrings(na, NR_TX); r++) {
618 			na->na_tx_rings[r].ckr_na_notify =
619 			    na->na_tx_rings[r].ckr_netif_notify;
620 			na->na_tx_rings[r].ckr_netif_notify = NULL;
621 			if (nifna->nifna_tx_mit != NULL) {
622 				na->na_tx_rings[r].ckr_netif_mit_stats = NULL;
623 				nx_netif_mit_cleanup(&nifna->nifna_tx_mit[r]);
624 			}
625 		}
626 
627 		if (nifna->nifna_tx_mit != NULL) {
628 			skn_free_type_array(tx_off, struct nx_netif_mit,
629 			    na_get_nrings(na, NR_TX), nifna->nifna_tx_mit);
630 			nifna->nifna_tx_mit = NULL;
631 		}
632 
633 		/* reset all RX notify callbacks */
634 		for (r = 0; r < na_get_nrings(na, NR_RX); r++) {
635 			na->na_rx_rings[r].ckr_na_notify =
636 			    na->na_rx_rings[r].ckr_netif_notify;
637 			na->na_rx_rings[r].ckr_netif_notify = NULL;
638 			if (nifna->nifna_rx_mit != NULL) {
639 				na->na_rx_rings[r].ckr_netif_mit_stats = NULL;
640 				nx_netif_mit_cleanup(&nifna->nifna_rx_mit[r]);
641 			}
642 		}
643 		if (nifna->nifna_rx_mit != NULL) {
644 			skn_free_type_array(rx_off, struct nx_netif_mit,
645 			    na_get_nrings(na, NR_RX), nifna->nifna_rx_mit);
646 			nifna->nifna_rx_mit = NULL;
647 		}
648 
649 		for (r = 0; r < na_get_nrings(na, NR_TX); r++) {
650 			for (i = 0; i < na_get_nslots(na, NR_TX); i++) {
651 				nx_netif_compat_ring_free(na->
652 				    na_tx_rings[r].ckr_tx_pool[i]);
653 				na->na_tx_rings[r].ckr_tx_pool[i] = NULL;
654 			}
655 			skn_free_type_array(tx_pool_off,
656 			    struct mbuf *, na_get_nslots(na, NR_TX),
657 			    na->na_tx_rings[r].ckr_tx_pool);
658 		}
659 		break;
660 
661 	default:
662 		VERIFY(0);
663 		/* NOTREACHED */
664 		__builtin_unreachable();
665 	}
666 
667 	return 0;
668 
669 uncatch:
670 	(void) nx_netif_compat_catch_rx(nca, FALSE);
671 
672 free_tx_pools:
673 	for (r = 0; r < na_get_nrings(na, NR_TX); r++) {
674 		if (na->na_tx_rings[r].ckr_tx_pool == NULL) {
675 			continue;
676 		}
677 		for (i = 0; i < na_get_nslots(na, NR_TX); i++) {
678 			nx_netif_compat_ring_free(
679 				na->na_tx_rings[r].ckr_tx_pool[i]);
680 			na->na_tx_rings[r].ckr_tx_pool[i] = NULL;
681 		}
682 		skn_free_type_array(tx_pool, struct mbuf *,
683 		    na_get_nslots(na, NR_TX), na->na_tx_rings[r].ckr_tx_pool);
684 		na->na_tx_rings[r].ckr_tx_pool = NULL;
685 	}
686 	if (nifna->nifna_tx_mit != NULL) {
687 		for (r = 0; r < na_get_nrings(na, NR_TX); r++) {
688 			nx_netif_mit_cleanup(&nifna->nifna_tx_mit[r]);
689 		}
690 		skn_free_type_array(tx, struct nx_netif_mit,
691 		    na_get_nrings(na, NR_TX), nifna->nifna_tx_mit);
692 		nifna->nifna_tx_mit = NULL;
693 	}
694 	if (nifna->nifna_rx_mit != NULL) {
695 		for (r = 0; r < na_get_nrings(na, NR_RX); r++) {
696 			nx_netif_mit_cleanup(&nifna->nifna_rx_mit[r]);
697 		}
698 		skn_free_type_array(rx, struct nx_netif_mit,
699 		    na_get_nrings(na, NR_RX), nifna->nifna_rx_mit);
700 		nifna->nifna_rx_mit = NULL;
701 	}
702 	for (r = 0; r < na_get_nrings(na, NR_RX); r++) {
703 		nx_mbq_safe_destroy(&na->na_rx_rings[r].ckr_rx_queue);
704 	}
705 out:
706 
707 	return error;
708 }
709 
710 /*
711  * Record completed transmissions and update ktail.
712  *
713  * The oldest tx buffer not yet completed is at ckr_ktail + 1,
714  * and ckr_khead is the first unsent buffer.
715  */
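/*
 * Illustrative example (assumed numbers, not from the original source):
 * with ckr_lim = 63, ckr_ktail = 10 and ckr_khead = 20, slots 11..19
 * hold mbufs already handed to the driver; the loop below walks from
 * slot 11 towards ckr_khead, counts each mbuf whose ring cluster is no
 * longer active as completed, and leaves ckr_ktail just before the
 * first still-busy slot.
 */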
716 /* Hoisted out of line to reduce kernel stack footprint */
717 SK_NO_INLINE_ATTRIBUTE
718 static uint32_t
719 nx_netif_compat_tx_clean(struct netif_stats *nifs,
720     struct __kern_channel_ring *kring)
721 {
722 	const slot_idx_t lim = kring->ckr_lim;
723 	slot_idx_t nm_i = SLOT_NEXT(kring->ckr_ktail, lim);
724 	slot_idx_t khead = kring->ckr_khead;
725 	uint32_t n = 0;
726 	struct mbuf **ckr_tx_pool = kring->ckr_tx_pool;
727 
728 	while (nm_i != khead) { /* buffers not completed */
729 		struct mbuf *m = ckr_tx_pool[nm_i];
730 
731 		if (__improbable(m == NULL)) {
732 			/* this is done, try to replenish the entry */
733 			VERIFY(nm_i <= UINT16_MAX);
734 			ckr_tx_pool[nm_i] = m =
735 			    nx_netif_compat_ring_alloc(M_WAITOK,
736 			    kring->ckr_max_pkt_len, (uint16_t)nm_i);
737 			if (__improbable(m == NULL)) {
738 				STATS_INC(nifs, NETIF_STATS_DROP_NOMEM_MBUF);
739 				STATS_INC(nifs, NETIF_STATS_DROP);
740 				SK_DF(SK_VERB_MEM,
741 				    "mbuf allocation failed (slot %u)", nm_i);
742 				/* XXX how do we proceed ? break ? */
743 				return -ENOMEM;
744 			}
745 		} else if (mbuf_ring_cluster_is_active(m)) {
746 			break; /* This mbuf is still busy */
747 		}
748 		n++;
749 		nm_i = SLOT_NEXT(nm_i, lim);
750 	}
751 	kring->ckr_ktail = SLOT_PREV(nm_i, lim);
752 
753 	SK_RDF(SK_VERB_NETIF, 10, "kr \"%s\" (0x%llx) tx completed [%u] -> "
754 	    "kh %u kt %u | rh %u rt %u", kring->ckr_name, SK_KVA(kring),
755 	    n, kring->ckr_khead, kring->ckr_ktail,
756 	    kring->ckr_rhead, kring->ckr_rtail);
757 
758 	return n;
759 }
760 
761 /* Hoisted out of line to reduce kernel stack footprint */
762 SK_NO_INLINE_ATTRIBUTE
763 static void
764 nx_netif_compat_set_tx_event(struct __kern_channel_ring *kring,
765     slot_idx_t khead)
766 {
767 	const slot_idx_t lim = kring->ckr_lim;
768 	slot_idx_t ntc = SLOT_NEXT(kring->ckr_ktail, lim); /* next to clean */
769 	struct mbuf *m;
770 	slot_idx_t e;
771 
772 	if (ntc == khead) {
773 		return; /* all buffers are free */
774 	}
775 	/*
776 	 * We have pending packets in the driver between ckr_ktail+1 and
777 	 * ckr_khead, and we have to choose one of these slots to generate
778 	 * a TX notification.  There is a race, but this is only called
779 	 * within TX sync which does a double check.
780 	 */
781 	if (__probable(netif_tx_event_mode == 0)) {
782 		/*
783 		 * Choose the first pending slot, to be safe against drivers
784 		 * reordering mbuf transmissions.
785 		 */
786 		e = ntc;
787 	} else {
788 		/*
789 		 * Choose a slot in the middle, so that we don't risk ending
790 		 * up in a situation where the client continuously wake up,
791 		 * fills one or a few TX slots and go to sleep again.
792 		 */
793 		slot_idx_t n = lim + 1;
794 
795 		if (khead >= ntc) {
796 			e = (khead + ntc) >> 1;
797 		} else { /* wrap around */
798 			e = (khead + n + ntc) >> 1;
799 			if (e >= n) {
800 				e -= n;
801 			}
802 		}
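		/*
		 * Illustrative example (assumed numbers): with lim = 63
		 * (n = 64), ntc = 60 and khead = 10, the pending slots are
		 * 60..63 and 0..9; e = (10 + 64 + 60) >> 1 = 67, which wraps
		 * to slot 3, roughly the middle of the pending range.
		 */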
803 
804 		if (__improbable(e >= n)) {
805 			SK_ERR("This cannot happen");
806 			e = 0;
807 		}
808 	}
809 	m = kring->ckr_tx_pool[e];
810 
811 	for (;;) {
812 		uint32_t p = 0, pn, i, f;
813 		int err;
814 
815 		(void) mbuf_cluster_get_prop(m, &p);
816 		f = NMB_GET_FLAGS(p);
817 		i = NMB_GET_INDEX(p);
818 
819 		if (f & NMB_PROPF_TX_NOTIFY) {
820 			/*
821 			 * This can happen if there is already an event
822 			 * on the ring slot 'e': There is nothing to do.
823 			 */
824 			SK_DF(SK_VERB_NETIF | SK_VERB_NOTIFY | SK_VERB_TX,
825 			    "TX_NOTIFY already set at %u m 0x%llx kc %u ntc %u",
826 			    e, SK_KVA(m), khead, ntc);
827 			return;
828 		}
829 
830 		f |= NMB_PROPF_TX_NOTIFY;
831 		pn = NMB_SET_FLAGS(p, f);
832 
833 		err = mbuf_cluster_set_prop(m, p, pn);
834 		if (err != 0) {
835 			if (err == EBUSY) {     /* try again */
836 				continue;
837 			}
838 			/* TODO: [email protected] -- what to do? */
839 			SK_ERR("Failed to set TX_NOTIFY at %u m 0x%llx kh %u "
840 			    "ntc %u, err %d", e, SK_KVA(m), khead, ntc, err);
841 		} else {
842 			SK_DF(SK_VERB_NETIF | SK_VERB_NOTIFY | SK_VERB_TX,
843 			    "Request TX_NOTIFY at %u m 0x%llx kh %u ntc %u",
844 			    e, SK_KVA(m), khead, ntc);
845 		}
846 		break;
847 	}
848 }
849 
850 #if SK_LOG
851 /* Hoisted out of line to reduce kernel stack footprint */
852 SK_LOG_ATTRIBUTE
853 static void
854 nx_netif_compat_na_txsync_log(struct __kern_channel_ring *kring,
855     struct proc *p, uint32_t flags, slot_idx_t nm_i)
856 {
857 	SK_DF(SK_VERB_NETIF | SK_VERB_SYNC | SK_VERB_TX,
858 	    "%s(%d) kr \"%s\" (0x%llx) krflags 0x%b ring %u flags 0x%x "
859 	    "nm_i %u, kh %u kt %u | rh %u rt %u",
860 	    sk_proc_name_address(p), sk_proc_pid(p), kring->ckr_name,
861 	    SK_KVA(kring), kring->ckr_flags, CKRF_BITS, kring->ckr_ring_id,
862 	    flags, nm_i, kring->ckr_khead, kring->ckr_ktail,
863 	    kring->ckr_rhead, kring->ckr_rtail);
864 }
865 #endif /* SK_LOG */
866 
867 /*
868  * nx_netif_compat_na_txsync() transforms packets into mbufs and passes
869  * them to the device driver.
870  */
871 static int
872 nx_netif_compat_na_txsync(struct __kern_channel_ring *kring, struct proc *p,
873     uint32_t flags)
874 {
875 #pragma unused(p)
876 	struct nexus_adapter *na = KRNA(kring);
877 	struct netif_stats *nifs = &NX_NETIF_PRIVATE(na->na_nx)->nif_stats;
878 	slot_idx_t nm_i; /* index into the channel ring */        // j
879 	const slot_idx_t head = kring->ckr_rhead;
880 	uint32_t slot_count = 0;
881 	uint32_t byte_count = 0;
882 
883 	STATS_INC(nifs, NETIF_STATS_TX_SYNC);
884 
885 	/* update our work timestamp */
886 	na->na_work_ts = _net_uptime;
887 
888 	/*
889 	 * First part: process new packets to send.
890 	 */
891 	nm_i = kring->ckr_khead;
892 	if (nm_i != head) {     /* we have new packets to send */
893 		while (nm_i != head) {
894 			struct __kern_slot_desc *sd = KR_KSD(kring, nm_i);
895 
896 			/* device-specific */
897 			struct mbuf *m;
898 			int tx_ret;
899 			/*
900 			 * Take a mbuf from the tx pool (replenishing the pool
901 			 * entry if necessary) and copy in the user packet.
902 			 */
903 			VERIFY(nm_i <= UINT16_MAX);
904 			m = kring->ckr_tx_pool[nm_i];
905 			if (__improbable(m == NULL)) {
906 				kring->ckr_tx_pool[nm_i] = m =
907 				    nx_netif_compat_ring_alloc(M_WAITOK,
908 				    kring->ckr_max_pkt_len, (uint16_t)nm_i);
909 				if (__improbable(m == NULL)) {
910 					STATS_INC(nifs, NETIF_STATS_DROP);
911 					STATS_INC(nifs,
912 					    NETIF_STATS_DROP_NOMEM_MBUF);
913 					SK_DF(SK_VERB_MEM,
914 					    "%s(%d) kr \"%s\" (0x%llx) "
915 					    "krflags 0x%b ckr_tx_pool[%u] "
916 					    "allocation failed",
917 					    sk_proc_name_address(p),
918 					    sk_proc_pid(p), kring->ckr_name,
919 					    SK_KVA(kring), kring->ckr_flags,
920 					    CKRF_BITS, nm_i);
921 					/*
922 					 * Here we could schedule a timer
923 					 * which retries to replenish after
924 					 * a while, and notifies the client
925 					 * when it manages to replenish some
926 					 * slot.  In any case we break early
927 					 * to avoid crashes.
928 					 */
929 					break;
930 				}
931 				STATS_INC(nifs, NETIF_STATS_TX_REPL);
932 			}
933 
934 			byte_count += sd->sd_pkt->pkt_length;
935 			slot_count++;
936 
937 			/*
938 			 * We should ask for notifications when CS_REPORT is set,
939 			 * or roughly every half ring.  To optimize this,
940 			 * we set a notification event when the client runs
941 			 * out of TX ring space, or when transmission fails.
942 			 * In the latter case we also break early.
943 			 */
944 			tx_ret = nx_netif_compat_xmit_frame(na, m, sd->sd_pkt);
945 			if (__improbable(tx_ret)) {
946 				SK_RD(5, "start_xmit failed: err %d "
947 				    "[nm_i %u, h %u, kt %u]",
948 				    tx_ret, nm_i, head, kring->ckr_ktail);
949 				/*
950 				 * No room for this mbuf in the device driver.
951 				 * Request a notification FOR A PREVIOUS MBUF,
952 				 * then call nx_netif_compat_tx_clean(kring) to
953 				 * do the double check and see if we can free
954 				 * more buffers.  If there is space continue,
955 				 * else break; NOTE: the double check is
956 				 * necessary if the problem occurs in the
957 				 * txsync call after selrecord().  Also, we
958 				 * need some way to tell the caller that not
959 				 * all buffers were queued onto the device
960 				 * (this was not a problem with native skywalk
961 				 * driver where space is preallocated). The
962 				 * bridge has a similar problem and we solve
963 				 * it there by dropping the excess packets.
964 				 */
965 				nx_netif_compat_set_tx_event(kring, nm_i);
966 				if (nx_netif_compat_tx_clean(nifs, kring)) {
967 					/* space now available */
968 					continue;
969 				} else {
970 					break;
971 				}
972 			}
973 			nm_i = SLOT_NEXT(nm_i, kring->ckr_lim);
974 			STATS_INC(nifs, NETIF_STATS_TX_PACKETS);
975 		}
976 
977 		/*
978 		 * Update khead to the next slot to transmit; Here nm_i
979 		 * is not necesarrily head, we could break early.
980 		 */
981 		kring->ckr_khead = nm_i;
982 
983 		kr_update_stats(kring, slot_count, byte_count);
984 	}
985 
986 	/*
987 	 * Second, reclaim completed buffers
988 	 */
989 	if ((flags & NA_SYNCF_FORCE_RECLAIM) || kr_txempty(kring)) {
990 		/*
991 		 * No more available slots? Set a notification event on a
992 		 * channel slot that will be cleaned in the future.  No
993 		 * doublecheck is performed, since nx_netif_compat_na_txsync()
994 		 * will be called twice by ch_event().
995 		 */
996 		nx_netif_compat_set_tx_event(kring, nm_i);
997 	}
998 	kring->ckr_pending_intr = 0;
999 
1000 #if SK_LOG
1001 	if (__improbable((sk_verbose & SK_VERB_NETIF) != 0)) {
1002 		nx_netif_compat_na_txsync_log(kring, p, flags, nm_i);
1003 	}
1004 #endif /* SK_LOG */
1005 
1006 	(void) nx_netif_compat_tx_clean(nifs, kring);
1007 
1008 	return 0;
1009 }
1010 
1011 #if SK_LOG
1012 /* Hoisted out of line to reduce kernel stack footprint */
1013 SK_LOG_ATTRIBUTE
1014 static void
1015 nx_netif_compat_receive_log1(const struct __kern_channel_ring *kring,
1016     struct nx_mbq *q)
1017 {
1018 	SK_RD(10, "kr \"%s\" (0x%llx) krflags 0x%b FULL "
1019 	    "(qlen %u qsize %llu), kc %u kt %u", kring->ckr_name,
1020 	    SK_KVA(kring), kring->ckr_flags, CKRF_BITS, nx_mbq_len(q),
1021 	    nx_mbq_size(q), kring->ckr_khead, kring->ckr_ktail);
1022 }
1023 
1024 /* Hoisted out of line to reduce kernel stack footprint */
1025 SK_LOG_ATTRIBUTE
1026 static void
1027 nx_netif_compat_receive_log2(const struct __kern_channel_ring *kring,
1028     struct nx_mbq *q, const struct ifnet_stat_increment_param *s)
1029 {
1030 	SK_RDF(SK_VERB_RX, 10, "kr \"%s\" (0x%llx) krflags 0x%b OK, "
1031 	    "added %u packets %u bytes, now qlen %u qsize %llu",
1032 	    kring->ckr_name, SK_KVA(kring), kring->ckr_flags, CKRF_BITS,
1033 	    s->packets_in, s->bytes_in, nx_mbq_len(q), nx_mbq_size(q));
1034 }
1035 #endif /* SK_LOG */
1036 
1037 /*
1038  * This is the default RX path for the compat netif nexus. Packets
1039  * are enqueued and later extracted by nx_netif_compat_na_rxsync().
1040  */
1041 /* TODO: [email protected] -- implement chaining */
1042 static errno_t
1043 nx_netif_compat_receive(struct ifnet *ifp, struct mbuf *m_head,
1044     struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
1045     boolean_t poll, struct thread *tp)
1046 {
1047 #pragma unused(tp)
1048 	boolean_t ifp_rxpoll = ((ifp->if_eflags & IFEF_RXPOLL) && net_rxpoll);
1049 	struct nexus_adapter *na = &NA(ifp)->nifna_up;
1050 	struct __kern_channel_ring *kring;
1051 	struct netif_stats *nifs;
1052 	uint32_t r, work_done;
1053 	unsigned int qlimit;
1054 	struct nx_mbq *q;
1055 	errno_t err = 0;
1056 
1057 	/* update our work timestamp */
1058 	na->na_work_ts = _net_uptime;
1059 
1060 	if (__improbable(m_head == NULL)) {
1061 		ASSERT(m_tail == NULL);
1062 		ASSERT(poll);
1063 		ASSERT(s->bytes_in == 0);
1064 		ASSERT(s->packets_in == 0);
1065 	}
1066 
1067 	/* BEGIN CSTYLED */
1068 	/*
1069 	 * TODO: [email protected] -- this needs to be revisited once we
1070 	 * have a clear definition of how multiple RX rings are mapped
1071 	 * to flows; this would involve the hardware/driver doing some
1072 	 * kind of classification and RSS-like demuxing.
1073 	 *
1074 	 * When we enable that, we'll need to consider sifting thru the
1075 	 * mbuf chain we get from the caller, and enqueue them across
1076 	 * per-ring temporary mbuf queue (along with marking the ring
1077 	 * indicating pending packets.)  During second stage processing,
1078 	 * we'll issue nx_netif_mit_rx_intr() on each marked ring to
1079 	 * dispatch the packets upstream.
1080 	 *
1081 	 * r = MBUF_RXQ(m);
1082 	 *
1083 	 * if (r >= na->na_num_rx_rings)
1084 	 *     r = r % na->na_num_rx_rings;
1085 	 *
1086 	 * kring = &na->na_rx_rings[r];
1087 	 * q = &kring->ckr_rx_queue;
1088 	 *
1089 	 * For now, target only the first RX ring (ring 0).
1090 	 */
1091 	/* END CSTYLED */
1092 	r = 0;  /* receive ring number */
1093 	kring = &na->na_rx_rings[r];
1094 
1095 	ASSERT(na->na_type == NA_NETIF_COMPAT_DEV);
1096 	nifs = &NX_NETIF_PRIVATE(na->na_nx)->nif_stats;
1097 
1098 	if (__improbable((!NA_IS_ACTIVE(na)) || KR_DROP(kring))) {
1099 		/* BEGIN CSTYLED */
1100 		/*
1101 		 * If we deal with multiple rings, change above to:
1102 		 *
1103 		 * if (!NA_IS_ACTIVE(na) || r >= na_get_nrings(na, NR_RX)))
1104 		 *
1105 		 * then here do:
1106 		 *
1107 		 * if (r >= na_get_nrings(na, NR_RX)) {
1108 		 *      SK_ERR("na \"%s\" (0x%llx) invalid r %u >= %u",
1109 		 *          na->na_name, SK_KVA(na), r,
1110 		 *          na_get_nrings(na, NR_RX));
1111 		 * }
1112 		 */
1113 		/* END CSTYLED */
1114 		m_freem_list(m_head);
1115 		if (!NA_IS_ACTIVE(na)) {
1116 			STATS_ADD(nifs, NETIF_STATS_DROP_NA_INACTIVE,
1117 			    s->packets_in);
1118 		} else if (KR_DROP(kring)) {
1119 			STATS_ADD(nifs, NETIF_STATS_DROP_KRDROP_MODE,
1120 			    s->packets_in);
1121 		}
1122 		STATS_ADD(nifs, NETIF_STATS_DROP, s->packets_in);
1123 		err = ENXIO;
1124 		goto done;
1125 	}
1126 	if (__improbable(m_head == NULL)) {
1127 		goto send_packets;
1128 	}
1129 
1130 	q = &kring->ckr_rx_queue;
1131 	nx_mbq_lock_spin(q);
1132 	qlimit = nx_mbq_limit(q);
1133 	if (ifp_rxpoll) {
1134 		/*
1135 		 * qlimit of the receive queue is much smaller when the
1136 		 * interface is in oppurtunistic polling mode. In this case
1137 		 * when the interface is operating in interrupt mode,
1138 		 * a sudden burst of input packets can cause the receive queue
1139 		 * to quickly buildup due to scheduling latency in waking up
1140 		 * the poller thread. To avoid drops here due to this latency
1141 		 * we provide a leeway on the qlimit.
1142 		 */
1143 		qlimit <<= 5;
1144 	}
1145 	if (__improbable(nx_mbq_len(q) > qlimit)) {
1146 #if SK_LOG
1147 		if (__improbable(sk_verbose != 0)) {
1148 			nx_netif_compat_receive_log1(kring, q);
1149 		}
1150 #endif /* SK_LOG */
1151 		nx_mbq_unlock(q);
1152 		m_freem_list(m_head);
1153 		STATS_ADD(nifs, NETIF_STATS_DROP_RXQ_OVFL, s->packets_in);
1154 		STATS_ADD(nifs, NETIF_STATS_DROP, s->packets_in);
1155 		goto send_packets;
1156 	}
1157 	nx_mbq_enq_multi(q, m_head, m_tail, s->packets_in, s->bytes_in);
1158 
1159 #if SK_LOG
1160 	if (__improbable((sk_verbose & SK_VERB_NETIF) != 0)) {
1161 		nx_netif_compat_receive_log2(kring, q, s);
1162 	}
1163 #endif /* SK_LOG */
1164 
1165 	nx_mbq_unlock(q);
1166 
1167 	(void) ifnet_stat_increment_in(ifp, s->packets_in, s->bytes_in,
1168 	    s->errors_in);
1169 
1170 	if (poll) {
1171 		/* update incremental poll stats */
1172 		PKTCNTR_ADD(&ifp->if_poll_tstats, s->packets_in, s->bytes_in);
1173 	}
1174 
1175 send_packets:
1176 	/*
1177 	 * If the interface supports opportunistic input polling, then the
1178 	 * input packet processing is performed in the context of the poller thread.
1179 	 */
1180 	if (!poll && ifp_rxpoll) {
1181 		/* wakeup the poller thread */
1182 		ifnet_poll(ifp);
1183 	} else {
1184 		/*
1185 		 * Wake up the mitigation thread if needed to perform input
1186 		 * packet processing.
1187 		 * If the interface supports opportunistic input polling, then the
1188 		 * mitigation thread is not created and the input packet
1189 		 * processing happens in the context of the poller thread.
1190 		 */
1191 		err = nx_netif_mit_rx_intr((NAKR(na, NR_RX) + r), kernproc, 0,
1192 		    &work_done);
1193 	}
1194 done:
1195 	return err;
1196 }
1197 
1198 #if SK_LOG
1199 /* Hoisted out of line to reduce kernel stack footprint */
1200 SK_LOG_ATTRIBUTE
1201 static void
1202 nx_netif_compat_na_rxsync_log(const struct __kern_channel_ring *kring,
1203     struct proc *p, uint32_t flags, slot_idx_t nm_i)
1204 {
1205 	SK_DF(SK_VERB_NETIF | SK_VERB_SYNC | SK_VERB_RX,
1206 	    "%s(%d) kr \"%s\" (0x%llx) krflags 0x%b "
1207 	    "ring %u flags 0x%x nm_i %u kt %u", sk_proc_name_address(p),
1208 	    sk_proc_pid(p), kring->ckr_name, SK_KVA(kring), kring->ckr_flags,
1209 	    CKRF_BITS, kring->ckr_ring_id, flags, nm_i, kring->ckr_ktail);
1210 }
1211 #endif /* SK_LOG */
1212 
1213 #if DEBUG || DEVELOPMENT
1214 /*
1215  * Split an mbuf chain at offset "split", such that the first mbuf
1216  * is a zero-length M_PKTHDR, followed by the rest of the mbufs.
1217  * Typically, the "split" value is equal to the size of the link
1218  * layer header, e.g. Ethernet header.
1219  */
1220 static struct mbuf *
1221 nx_netif_rx_split(struct mbuf *m0, uint32_t split)
1222 {
1223 	struct mbuf *m = m0;
1224 
1225 	if (split == 0) {
1226 		split = MHLEN;
1227 		M_PREPEND(m, split, M_DONTWAIT, 0);
1228 	} else {
1229 		m->m_data -= split;
1230 		m->m_len += split;
1231 		m_pktlen(m) += split;
1232 
1233 		ASSERT((uintptr_t)m->m_data >= (uintptr_t)mbuf_datastart(m));
1234 		ASSERT((uintptr_t)m->m_data < ((uintptr_t)mbuf_datastart(m) +
1235 		    mbuf_maxlen(m)));
1236 	}
1237 	if (m != NULL) {
1238 		struct mbuf *n = m_split(m, split, M_DONTWAIT);
1239 		if (n == NULL) {
1240 			m_freem(m);
1241 			return NULL;
1242 		}
1243 		m0 = m;
1244 		ASSERT((uint32_t)m->m_len == split);
1245 		m->m_data += split;
1246 		m->m_len -= split;
1247 		while (m->m_next != NULL) {
1248 			m = m->m_next;
1249 		}
1250 		m->m_next = n;
1251 		m = m0;
1252 		m_pktlen(m) = m_length2(m, NULL);
1253 	}
1254 
1255 	return m;
1256 }
1257 #endif /* DEBUG || DEVELOPMENT */
1258 
1259 /*
1260  * nx_netif_compat_na_rxsync() extracts mbufs from the queue filled by
1261  * nx_netif_compat_receive() and puts their content in the channel
1262  * receive ring.
1263  *
1264  * Accesses to kring are serialized via kring->ckr_rx_queue lock, because
1265  * the rx handler is asynchronous.
1266  */
1267 static int
1268 nx_netif_compat_na_rxsync(struct __kern_channel_ring *kring, struct proc *p,
1269     uint32_t flags)
1270 {
1271 #pragma unused(p)
1272 	struct nexus_adapter *na = KRNA(kring);
1273 	struct nexus_netif_adapter *nifna = (struct nexus_netif_adapter *)na;
1274 	struct nx_netif *nif = nifna->nifna_netif;
1275 	slot_idx_t nm_i;        /* index into the channel ring */
1276 	struct netif_stats *nifs = &NX_NETIF_PRIVATE(na->na_nx)->nif_stats;
1277 	uint32_t npkts = 0;
1278 	uint32_t byte_count = 0;
1279 	const slot_idx_t lim = kring->ckr_lim;
1280 	const slot_idx_t head = kring->ckr_rhead;
1281 	boolean_t force_update = ((flags & NA_SYNCF_FORCE_READ) ||
1282 	    kring->ckr_pending_intr != 0);
1283 	struct mbuf *m;
1284 	uint32_t n;
1285 	uint32_t avail; /* in slots */
1286 	int err, mlen;
1287 	boolean_t attach_mbuf = FALSE;
1288 	struct nx_mbq *q, tmpq;
1289 	struct kern_pbufpool *pp = kring->ckr_pp;
1290 	uint32_t ph_cnt, i = 0;
1291 
1292 	ASSERT(pp->pp_max_frags == 1);
1293 	ASSERT(head <= lim);
1294 
1295 	/*
1296 	 * First part: skip past packets that userspace has released.
1297 	 * This can possibly make room for the second part.
1298 	 * This is equivalent to kr_reclaim().
1299 	 */
1300 	if (kring->ckr_khead != head) {
1301 		kring->ckr_khead = head;
1302 		/* ensure global visibility */
1303 		os_atomic_thread_fence(seq_cst);
1304 	}
1305 
1306 	STATS_INC(nifs, NETIF_STATS_RX_SYNC);
1307 
1308 	/*
1309 	 * Second part: import newly received packets.
1310 	 */
1311 	if (!force_update) {
1312 		return 0;
1313 	}
1314 
1315 	/* update our work timestamp */
1316 	na->na_work_ts = _net_uptime;
1317 
1318 	/* first empty slot in the receive ring */
1319 	nm_i = kring->ckr_ktail;
1320 
1321 	/*
1322 	 * Compute the available space (in slots) in this ring.
1323 	 * The first slot that is not considered is the one
1324 	 * before ckr_khead.
1325 	 */
1326 	avail = kr_available_slots_rxring(kring);
1327 	if (__improbable(avail == 0)) {
1328 		return 0;
1329 	}
1330 
1331 	if (NA_KERNEL_ONLY(na)) {
1332 		ASSERT(na->na_ifp != NULL &&
1333 		    fsw_ifp_to_fsw(na->na_ifp) != NULL);
1334 		/*
1335 		 * We are not supporting attachment to bridge flowswitch
1336 		 * for now, until we support PKT_F_MBUF_DATA packets
1337 		 * in bridge flowswitch.
1338 		 */
1339 		attach_mbuf = TRUE;
1340 	}
1341 
1342 	/*
1343 	 * Quickly move all of ckr_rx_queue to a temporary queue to dequeue
1344 	 * from.  For each mbuf, attach or copy it to the packet attached
1345 	 * to the slot.  Release the lock while we're doing that, to allow
1346 	 * for the input thread to enqueue.
1347 	 */
1348 	q = &kring->ckr_rx_queue;
1349 	nx_mbq_init(&tmpq, NX_MBQ_NO_LIMIT);
1350 	nx_mbq_lock_spin(q);
1351 	nx_mbq_concat(&tmpq, q);
1352 	nx_mbq_unlock(q);
1353 
1354 	if (__improbable(nx_mbq_len(&tmpq) == 0)) {
1355 		return 0;
1356 	}
1357 
1358 	ph_cnt = MIN(avail, nx_mbq_len(&tmpq));
1359 	err = kern_pbufpool_alloc_batch_nosleep(pp, 1, kring->ckr_scratch,
1360 	    &ph_cnt);
1361 	if (err == ENOMEM) {
1362 		SK_DF(SK_VERB_MEM, "%s(%p) failed to alloc %d pkts for kr "
1363 		    "0x%llu", sk_proc_name_address(p), sk_proc_pid(p), ph_cnt,
1364 		    SK_KVA(kring));
1365 		goto done;
1366 	}
1367 	ASSERT(ph_cnt != 0);
1368 
1369 	for (n = 0; (n < ph_cnt) &&
1370 	    ((m = nx_mbq_deq(&tmpq)) != NULL); n++) {
1371 		struct __kern_slot_desc *ksd = KR_KSD(kring, nm_i);
1372 		struct __kern_packet *pkt;
1373 		kern_packet_t ph;
1374 		uint8_t hlen;
1375 		uint16_t tag;
1376 		char *h;
1377 
1378 		ASSERT(m->m_flags & M_PKTHDR);
1379 		mlen = m_pktlen(m);
1380 		h = m->m_pkthdr.pkt_hdr;
1381 		if (__improbable(mlen == 0 || h == NULL ||
1382 		    h < (char *)mbuf_datastart(m) || h > (char *)m->m_data)) {
1383 			STATS_INC(nifs, NETIF_STATS_DROP_BADLEN);
1384 			SK_RD(5, "kr \"%s\" (0x%llx) m 0x%llx len %d"
1385 			    "bad pkt_hdr", kring->ckr_name,
1386 			    SK_KVA(kring), SK_KVA(m), mlen);
1387 			m_freem(m);
1388 			m = NULL;
1389 			continue;
1390 		}
1391 
1392 		hlen = (uint8_t)(m->m_data - h);
1393 		mlen += hlen;
1394 
1395 #if DEBUG || DEVELOPMENT
1396 		if (__improbable(netif_rx_split != 0)) {
1397 			/* callee frees mbuf upon failure */
1398 			if ((m = nx_netif_rx_split(m, hlen)) == NULL) {
1399 				continue;
1400 			}
1401 
1402 			ASSERT((uintptr_t)m->m_data >=
1403 			    (uintptr_t)mbuf_datastart(m));
1404 			ASSERT((uintptr_t)m->m_data <
1405 			    ((uintptr_t)mbuf_datastart(m) +
1406 			    mbuf_maxlen(m)));
1407 		}
1408 #endif /* DEBUG || DEVELOPMENT */
1409 
1410 		ph = kring->ckr_scratch[i];
1411 		ASSERT(ph != 0);
1412 		kring->ckr_scratch[i] = 0;
1413 		pkt = SK_PTR_ADDR_KPKT(ph);
1414 		++i;
1415 
1416 		/*
1417 		 * Wind back the data pointer to include any frame headers
1418 		 * as part of the copy below.  The header length is then
1419 		 * stored in the corresponding metadata area of the buffer.
1420 		 */
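		/*
		 * Illustrative example (assumed values): if the driver left
		 * pkt_hdr pointing at a 14-byte Ethernet header with m_data
		 * at the payload, hlen is 14 and the copy below starts at the
		 * link-layer header, covering the full frame.
		 */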
1421 		m->m_data -= hlen;
1422 		m->m_len += hlen;
1423 		m->m_pkthdr.len += hlen;
1424 		ASSERT(mlen == m->m_pkthdr.len);
1425 
1426 		pkt->pkt_link_flags = 0;
1427 		if (m->m_flags & M_HASFCS) {
1428 			pkt->pkt_link_flags |= PKT_LINKF_ETHFCS;
1429 		}
1430 		if (mbuf_get_vlan_tag(m, &tag) == 0) {
1431 			(void) kern_packet_set_vlan_tag(SK_PKT2PH(pkt), tag,
1432 			    FALSE);
1433 		}
1434 		SK_DF(SK_VERB_NETIF | SK_VERB_SYNC | SK_VERB_RX,
1435 		    "kr \"%s\" (0x%llx) m 0x%llx idx %u slot_len %d",
1436 		    kring->ckr_name, SK_KVA(kring), SK_KVA(m), nm_i, mlen);
1437 
1438 		if (__probable(attach_mbuf)) {
1439 			STATS_INC(nifs, NETIF_STATS_RX_COPY_ATTACH);
1440 			err = __packet_initialize_with_mbuf(pkt, m, 0, hlen);
1441 			VERIFY(err == 0);
1442 		} else if (__probable(mlen <= (int)PP_BUF_SIZE_DEF(pp))) {
1443 			STATS_INC(nifs, NETIF_STATS_RX_COPY_DIRECT);
1444 			/*
1445 			 * We're sending this up to a user channel opened
1446 			 * directly to the netif; copy everything.
1447 			 */
1448 			err = __packet_set_headroom(ph, 0);
1449 			VERIFY(err == 0);
1450 			err = __packet_set_link_header_length(ph, hlen);
1451 			VERIFY(err == 0);
1452 			nif->nif_pkt_copy_from_mbuf(NR_RX, ph, 0, m, 0,
1453 			    mlen, FALSE, 0);
1454 			/* finalize and attach the packet */
1455 			err = __packet_finalize(ph);
1456 			VERIFY(err == 0);
1457 			m_freem(m);
1458 			m = NULL;
1459 		} else {
1460 			STATS_INC(nifs, NETIF_STATS_DROP_BADLEN);
1461 			STATS_INC(nifs, NETIF_STATS_DROP);
1462 			m_freem(m);
1463 			m = NULL;
1464 			kern_pbufpool_free(pp, ph);
1465 			ph = 0;
1466 			pkt = NULL;
1467 			continue;
1468 		}
1469 
1470 		err = KR_SLOT_ATTACH_METADATA(kring, ksd,
1471 		    (struct __kern_quantum *)pkt);
1472 		ASSERT(err == 0);
1473 
1474 		byte_count += mlen;
1475 		++npkts;
1476 		ASSERT(npkts < kring->ckr_num_slots);
1477 		nm_i = SLOT_NEXT(nm_i, lim);
1478 	}
1479 
1480 	if (__improbable(i < ph_cnt)) {
1481 		kern_pbufpool_free_batch(pp, &kring->ckr_scratch[i],
1482 		    (ph_cnt - i));
1483 	}
1484 
1485 	ASSERT(npkts <= ph_cnt);
1486 	kr_update_stats(kring, npkts, byte_count);
1487 
1488 	if (npkts != 0) {
1489 		kring->ckr_ktail = nm_i;
1490 		STATS_ADD(nifs, NETIF_STATS_RX_PACKETS, npkts);
1491 	}
1492 	kring->ckr_pending_intr = 0;
1493 
1494 #if SK_LOG
1495 	if (__improbable((sk_verbose & SK_VERB_NETIF) != 0)) {
1496 		nx_netif_compat_na_rxsync_log(kring, p, flags, nm_i);
1497 	}
1498 #endif /* SK_LOG */
1499 
1500 done:
1501 	/*
1502 	 * If we didn't process all packets in the temporary queue,
1503 	 * move them back to the head of ckr_rx_queue.
1504 	 */
1505 	if (!nx_mbq_empty(&tmpq)) {
1506 		nx_mbq_lock_spin(q);
1507 		nx_mbq_concat(&tmpq, q);
1508 		ASSERT(nx_mbq_empty(q));
1509 		nx_mbq_concat(q, &tmpq);
1510 		nx_mbq_unlock(q);
1511 	}
1512 	ASSERT(nx_mbq_empty(&tmpq));
1513 
1514 	return 0;
1515 }
1516 
1517 static void
1518 nx_netif_compat_na_dtor(struct nexus_adapter *na)
1519 {
1520 	struct ifnet *ifp;
1521 	struct nexus_netif_compat_adapter *nca =
1522 	    (struct nexus_netif_compat_adapter *)na;
1523 
1524 	SK_LOCK_ASSERT_HELD();
1525 
1526 	SK_DF(SK_VERB_NETIF, "na \"%s\" (0x%llx)", na->na_name, SK_KVA(na));
1527 
1528 	/*
1529 	 * If the finalizer callback hasn't been called for whatever
1530 	 * reason, pick up the embryonic ifnet stored in na_private.
1531 	 * Otherwise, release the I/O refcnt of a non-NULL na_ifp.
1532 	 */
1533 	if ((ifp = na->na_ifp) == NULL) {
1534 		ifp = na->na_private;
1535 		na->na_private = NULL;
1536 	} else {
1537 		ifnet_decr_iorefcnt(ifp);
1538 		na->na_ifp = NULL;
1539 	}
1540 
1541 	if (nca->nca_up.nifna_netif != NULL) {
1542 		nx_netif_release(nca->nca_up.nifna_netif);
1543 		nca->nca_up.nifna_netif = NULL;
1544 	}
1545 	ASSERT(!SKYWALK_NATIVE(ifp));
1546 }
1547 
1548 /*
1549  * nx_netif_compat_attach() makes it possible to use skywalk on
1550  * a device without native skywalk support.
1551  * This is less performant than native support but potentially
1552  * faster than raw sockets or similar schemes.
1553  */
1554 int
1555 nx_netif_compat_attach(struct kern_nexus *nx, struct ifnet *ifp)
1556 {
1557 	struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
1558 	struct nxprov_params *nxp = NX_PROV(nx)->nxprov_params;
1559 	struct nexus_netif_compat_adapter *devnca = NULL;
1560 	struct nexus_netif_compat_adapter *hostnca = NULL;
1561 	struct nexus_adapter *devna = NULL;
1562 	struct nexus_adapter *hostna = NULL;
1563 	boolean_t embryonic = FALSE;
1564 	uint32_t tx_rings, tx_slots;
1565 	int retval = 0;
1566 
1567 	SK_LOCK_ASSERT_HELD();
1568 	ASSERT(!SKYWALK_NATIVE(ifp));
1569 	ASSERT(!SKYWALK_CAPABLE(ifp));
1570 	ASSERT(ifp->if_na == NULL);
1571 	ASSERT(ifp->if_na_ops == NULL);
1572 
1573 	devnca = na_netif_compat_alloc(Z_WAITOK);
1574 	hostnca = na_netif_compat_alloc(Z_WAITOK);
1575 
1576 	/*
1577 	 * We can be called for two different interface states:
1578 	 *
1579 	 * Fully attached: get an io ref count; upon success, this
1580 	 * holds a reference to the ifnet for the ifp pointer stored
1581 	 * in 'na_ifp' down below for both adapters.
1582 	 *
1583 	 * Embryonic: temporary hold the ifnet in na_private, which
1584 	 * upon a successful ifnet_attach(), will be moved over to
1585 	 * the 'na_ifp' with an io ref count held.
1586 	 *
1587 	 * The ifnet in 'na_ifp' will be released by na_release_locked().
1588 	 */
1589 	if (!ifnet_is_attached(ifp, 1)) {
1590 		if (!(ifp->if_refflags & IFRF_EMBRYONIC)) {
1591 			ifp = NULL;
1592 			retval = ENXIO;
1593 			goto err;
1594 		}
1595 		embryonic = TRUE;
1596 	}
1597 
1598 	/* initialize the (compat) device netif adapter */
1599 	devnca->nca_up.nifna_netif = nif;
1600 	nx_netif_retain(nif);
1601 	devna = &devnca->nca_up.nifna_up;
1602 	(void) strncpy(devna->na_name, ifp->if_xname, sizeof(devna->na_name) - 1);
1603 	devna->na_name[sizeof(devna->na_name) - 1] = '\0';
1604 	uuid_generate_random(devna->na_uuid);
1605 	if (embryonic) {
1606 		/*
1607 		 * We will move this over to na_ifp once
1608 		 * the interface is fully attached.
1609 		 */
1610 		devna->na_private = ifp;
1611 		ASSERT(devna->na_ifp == NULL);
1612 	} else {
1613 		ASSERT(devna->na_private == NULL);
1614 		/* use I/O refcnt from ifnet_is_attached() */
1615 		devna->na_ifp = ifp;
1616 	}
1617 
1618 	devna->na_type = NA_NETIF_COMPAT_DEV;
1619 	devna->na_free = na_netif_compat_free;
1620 	devna->na_activate = nx_netif_compat_na_activate;
1621 	devna->na_txsync = nx_netif_compat_na_txsync;
1622 	devna->na_rxsync = nx_netif_compat_na_rxsync;
1623 	devna->na_dtor = nx_netif_compat_na_dtor;
1624 	devna->na_krings_create = nx_netif_dev_krings_create;
1625 	devna->na_krings_delete = nx_netif_dev_krings_delete;
1626 	devna->na_special = nx_netif_na_special;
1627 
1628 	*(nexus_stats_type_t *)(uintptr_t)&devna->na_stats_type =
1629 	    NEXUS_STATS_TYPE_INVALID;
1630 
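	/*
	 * When direct netif access isn't allowed for this interface
	 * (skywalk_netif_direct_allowed()), the device adapter is created
	 * with no TX rings or slots; the RX ring/slot counts are taken
	 * from the provider parameters either way.
	 */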
1631 	if (skywalk_netif_direct_allowed(ifp->if_xname)) {
1632 		tx_rings = nxp->nxp_tx_rings;
1633 		tx_slots = nxp->nxp_tx_slots;
1634 	} else {
1635 		tx_rings = 0;
1636 		tx_slots = 0;
1637 	}
1638 	na_set_nrings(devna, NR_TX, tx_rings);
1639 	na_set_nrings(devna, NR_RX, nxp->nxp_rx_rings);
1640 	na_set_nslots(devna, NR_TX, tx_slots);
1641 	na_set_nslots(devna, NR_RX, nxp->nxp_rx_slots);
1642 	/*
1643 	 * Verify upper bounds; the parameters must have already been
1644 	 * validated by nxdom_prov_params() by the time we get here.
1645 	 */
1646 	ASSERT(na_get_nrings(devna, NR_TX) <= NX_DOM(nx)->nxdom_tx_rings.nb_max);
1647 	ASSERT(na_get_nrings(devna, NR_RX) <= NX_DOM(nx)->nxdom_rx_rings.nb_max);
1648 	ASSERT(na_get_nslots(devna, NR_TX) <= NX_DOM(nx)->nxdom_tx_slots.nb_max);
1649 	ASSERT(na_get_nslots(devna, NR_RX) <= NX_DOM(nx)->nxdom_rx_slots.nb_max);
1650 
1651 	na_attach_common(devna, nx, &nx_netif_compat_prov_s);
1652 
1653 	if ((retval = NX_DOM_PROV(nx)->nxdom_prov_mem_new(NX_DOM_PROV(nx),
1654 	    nx, devna)) != 0) {
1655 		ASSERT(devna->na_arena == NULL);
1656 		/* we've transferred the refcnt to na_ifp above */
1657 		ifp = NULL;
1658 		goto err;
1659 	}
1660 	ASSERT(devna->na_arena != NULL);
1661 
1662 	*(uint32_t *)(uintptr_t)&devna->na_flowadv_max = nxp->nxp_flowadv_max;
1663 	ASSERT(devna->na_flowadv_max == 0 ||
1664 	    skmem_arena_nexus(devna->na_arena)->arn_flowadv_obj != NULL);
1665 
1666 	/* setup packet copy routines */
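	/*
	 * Packet pools whose packets may span multiple buflets
	 * (pp_max_frags > 1) need the multi-buflet copy variants;
	 * single-buflet pools can use the simpler copy routines.
	 */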
1667 	if (skmem_arena_nexus(devna->na_arena)->arn_rx_pp->pp_max_frags > 1) {
1668 		nif->nif_pkt_copy_from_mbuf =
1669 		    pkt_copy_multi_buflet_from_mbuf;
1670 		nif->nif_pkt_copy_to_mbuf =
1671 		    pkt_copy_multi_buflet_to_mbuf;
1672 	} else {
1673 		nif->nif_pkt_copy_from_mbuf = pkt_copy_from_mbuf;
1674 		nif->nif_pkt_copy_to_mbuf = pkt_copy_to_mbuf;
1675 	}
1676 
1677 	/* initialize the host netif adapter */
1678 	hostnca->nca_up.nifna_netif = nif;
1679 	nx_netif_retain(nif);
1680 	hostna = &hostnca->nca_up.nifna_up;
1681 	(void) snprintf(hostna->na_name, sizeof(hostna->na_name),
1682 	    "%s^", devna->na_name);
1683 	uuid_generate_random(hostna->na_uuid);
1684 	if (embryonic) {
1685 		/*
1686 		 * We will move this over to na_ifp once
1687 		 * the interface is fully attached.
1688 		 */
1689 		hostna->na_private = ifp;
1690 		ASSERT(hostna->na_ifp == NULL);
1691 	} else {
1692 		ASSERT(hostna->na_private == NULL);
1693 		hostna->na_ifp = devna->na_ifp;
1694 		ifnet_incr_iorefcnt(hostna->na_ifp);
1695 	}
1696 	hostna->na_type = NA_NETIF_COMPAT_HOST;
1697 	hostna->na_free = na_netif_compat_free;
1698 	hostna->na_activate = nx_netif_host_na_activate;
1699 	hostna->na_txsync = nx_netif_host_na_txsync;
1700 	hostna->na_rxsync = nx_netif_host_na_rxsync;
1701 	hostna->na_dtor = nx_netif_compat_na_dtor;
1702 	hostna->na_krings_create = nx_netif_host_krings_create;
1703 	hostna->na_krings_delete = nx_netif_host_krings_delete;
1704 	hostna->na_special = nx_netif_host_na_special;
1705 
1706 	os_atomic_or(&hostna->na_flags, NAF_HOST_ONLY, relaxed);
1707 	*(nexus_stats_type_t *)(uintptr_t)&hostna->na_stats_type =
1708 	    NEXUS_STATS_TYPE_INVALID;
1709 
1710 	na_set_nrings(hostna, NR_TX, 1);
1711 	na_set_nrings(hostna, NR_RX, 0);
1712 	na_set_nslots(hostna, NR_TX, nxp->nxp_tx_slots);
1713 	na_set_nslots(hostna, NR_RX, 0);
1714 
1715 	na_attach_common(hostna, nx, &nx_netif_prov_s);
1716 
1717 	if ((retval = NX_DOM_PROV(nx)->nxdom_prov_mem_new(NX_DOM_PROV(nx),
1718 	    nx, hostna)) != 0) {
1719 		ASSERT(hostna->na_arena == NULL);
1720 		/* we've transferred the refcnt to na_ifp above */
1721 		ifp = NULL;
1722 		goto err;
1723 	}
1724 	ASSERT(hostna->na_arena != NULL);
1725 
1726 	*(uint32_t *)(uintptr_t)&hostna->na_flowadv_max = nxp->nxp_flowadv_max;
1727 	ASSERT(hostna->na_flowadv_max == 0 ||
1728 	    skmem_arena_nexus(hostna->na_arena)->arn_flowadv_obj != NULL);
1729 
1730 	/* these will be undone by the destructor */
1731 	ifp->if_na_ops = &na_netif_compat_ops;
1732 	ifp->if_na = &devnca->nca_up;
1733 	na_retain_locked(devna);
1734 	na_retain_locked(hostna);
1735 
1736 	SKYWALK_SET_CAPABLE(ifp);
1737 
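	/* bind the device and host adapters to their reserved netif ports */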
1738 	NETIF_WLOCK(nif);
1739 	nif->nif_ifp = ifp;
1740 	retval = nx_port_alloc(nx, NEXUS_PORT_NET_IF_DEV, NULL, &devna, kernproc);
1741 	ASSERT(retval == 0);
1742 	retval = nx_port_alloc(nx, NEXUS_PORT_NET_IF_HOST, NULL, &hostna, kernproc);
1743 	ASSERT(retval == 0);
1744 	NETIF_WUNLOCK(nif);
1745 
1746 #if SK_LOG
1747 	uuid_string_t uuidstr;
1748 	SK_DF(SK_VERB_NETIF, "na_name: \"%s\"", devna->na_name);
1749 	SK_DF(SK_VERB_NETIF, "  UUID:        %s",
1750 	    sk_uuid_unparse(devna->na_uuid, uuidstr));
1751 	SK_DF(SK_VERB_NETIF, "  nx:          0x%llx (\"%s\":\"%s\")",
1752 	    SK_KVA(devna->na_nx), NX_DOM(devna->na_nx)->nxdom_name,
1753 	    NX_DOM_PROV(devna->na_nx)->nxdom_prov_name);
1754 	SK_DF(SK_VERB_NETIF, "  flags:       0x%b", devna->na_flags, NAF_BITS);
1755 	SK_DF(SK_VERB_NETIF, "  flowadv_max: %u", devna->na_flowadv_max);
1756 	SK_DF(SK_VERB_NETIF, "  rings:       tx %u rx %u",
1757 	    na_get_nrings(devna, NR_TX), na_get_nrings(devna, NR_RX));
1758 	SK_DF(SK_VERB_NETIF, "  slots:       tx %u rx %u",
1759 	    na_get_nslots(devna, NR_TX), na_get_nslots(devna, NR_RX));
1760 #if CONFIG_NEXUS_USER_PIPE
1761 	SK_DF(SK_VERB_NETIF, "  next_pipe:   %u", devna->na_next_pipe);
1762 	SK_DF(SK_VERB_NETIF, "  max_pipes:   %u", devna->na_max_pipes);
1763 #endif /* CONFIG_NEXUS_USER_PIPE */
1764 	SK_DF(SK_VERB_NETIF, "  ifp:         0x%llx %s [ioref %u]",
1765 	    SK_KVA(ifp), ifp->if_xname, ifp->if_refio);
1766 	SK_DF(SK_VERB_NETIF, "hostna: \"%s\"", hostna->na_name);
1767 	SK_DF(SK_VERB_NETIF, "  UUID:        %s",
1768 	    sk_uuid_unparse(hostna->na_uuid, uuidstr));
1769 	SK_DF(SK_VERB_NETIF, "  nx:          0x%llx (\"%s\":\"%s\")",
1770 	    SK_KVA(hostna->na_nx), NX_DOM(hostna->na_nx)->nxdom_name,
1771 	    NX_DOM_PROV(hostna->na_nx)->nxdom_prov_name);
1772 	SK_DF(SK_VERB_NETIF, "  flags:       0x%b",
1773 	    hostna->na_flags, NAF_BITS);
1774 	SK_DF(SK_VERB_NETIF, "  flowadv_max: %u", hostna->na_flowadv_max);
1775 	SK_DF(SK_VERB_NETIF, "  rings:       tx %u rx %u",
1776 	    na_get_nrings(hostna, NR_TX), na_get_nrings(hostna, NR_RX));
1777 	SK_DF(SK_VERB_NETIF, "  slots:       tx %u rx %u",
1778 	    na_get_nslots(hostna, NR_TX), na_get_nslots(hostna, NR_RX));
1779 #if CONFIG_NEXUS_USER_PIPE
1780 	SK_DF(SK_VERB_NETIF, "  next_pipe:   %u", hostna->na_next_pipe);
1781 	SK_DF(SK_VERB_NETIF, "  max_pipes:   %u", hostna->na_max_pipes);
1782 #endif /* CONFIG_NEXUS_USER_PIPE */
1783 	SK_DF(SK_VERB_NETIF, "  ifp:       0x%llx %s [ioref %u]", SK_KVA(ifp),
1784 	    ifp->if_xname, ifp->if_refio);
1785 #endif /* SK_LOG */
1786 
1787 err:
1788 	if (retval != 0) {
1789 		ASSERT(ifp == NULL);
1790 		if (devna != NULL) {
1791 			if (devna->na_arena != NULL) {
1792 				skmem_arena_release(devna->na_arena);
1793 				devna->na_arena = NULL;
1794 			}
1795 			if (devna->na_ifp != NULL) {
1796 				ifnet_decr_iorefcnt(devna->na_ifp);
1797 				devna->na_ifp = NULL;
1798 			}
1799 			devna->na_private = NULL;
1800 		}
1801 		if (hostna != NULL) {
1802 			if (hostna->na_arena != NULL) {
1803 				skmem_arena_release(hostna->na_arena);
1804 				hostna->na_arena = NULL;
1805 			}
1806 			if (hostna->na_ifp != NULL) {
1807 				ifnet_decr_iorefcnt(hostna->na_ifp);
1808 				hostna->na_ifp = NULL;
1809 			}
1810 			hostna->na_private = NULL;
1811 		}
1812 		if (devnca != NULL) {
1813 			if (devnca->nca_up.nifna_netif != NULL) {
1814 				nx_netif_release(devnca->nca_up.nifna_netif);
1815 				devnca->nca_up.nifna_netif = NULL;
1816 			}
1817 			na_netif_compat_free((struct nexus_adapter *)devnca);
1818 		}
1819 		if (hostnca != NULL) {
1820 			if (hostnca->nca_up.nifna_netif != NULL) {
1821 				nx_netif_release(hostnca->nca_up.nifna_netif);
1822 				hostnca->nca_up.nifna_netif = NULL;
1823 			}
1824 			na_netif_compat_free((struct nexus_adapter *)hostnca);
1825 		}
1826 	}
1827 	return retval;
1828 }
1829 
1830 static void
1831 na_netif_compat_finalize(struct nexus_netif_adapter *nifna, struct ifnet *ifp)
1832 {
1833 	na_netif_finalize(nifna, ifp);
1834 }
1835 
1836 /*
1837  * Intercept the rx routine in the standard device driver.
1838  * Second argument is non-zero to intercept, 0 to restore.
1839  */
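/*
 * When enabled, mbuf chains coming up from the driver are diverted to
 * nx_netif_compat_receive() instead of the regular DLIL input path;
 * disabling restores the interface's default input handler.
 */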
1840 static int
1841 nx_netif_compat_catch_rx(struct nexus_netif_compat_adapter *nca,
1842     boolean_t enable)
1843 {
1844 	struct ifnet *ifp = nca->nca_up.nifna_up.na_ifp;
1845 	int err = 0;
1846 
1847 	ASSERT(!(nca->nca_up.nifna_up.na_flags & NAF_HOST_ONLY));
1848 
1849 	if (enable) {
1850 		err = dlil_set_input_handler(ifp, nx_netif_compat_receive);
1851 	} else {
1852 		dlil_reset_input_handler(ifp);
1853 	}
1854 	return err;
1855 }
1856 
1857 /*
1858  * Transmit routine used by nx_netif_compat_na_txsync(). Returns 0 on success
1859  * and non-zero on error (which may be packet drops or other errors).
1860  * len identifies the channel buffer, m is the (preallocated) mbuf to use
1861  * pkt identifies the channel packet to transmit, m is the (preallocated)
1862  * mbuf to use for the transmission.
1863  * We should add a reference to the mbuf so the m_freem() at the end
1864  * of the transmission does not consume resources.
1865  *
1866  * On FreeBSD, and on multiqueue cards, we can force the queue using
1867  *      if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE)
1868  *              i = m->m_pkthdr.flowid % adapter->num_queues;
1869  *      else
1870  *              i = curcpu % adapter->num_queues;
1871  *
1872  */
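/*
 * In this implementation the flow is: activate the preallocated ring
 * cluster mbuf, copy the channel packet into it (truncating to the
 * cluster size if needed), stamp it with tx completion data, and hand
 * it to dlil_output_handler() for transmission by the driver.
 */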
1873 static int
1874 nx_netif_compat_xmit_frame(struct nexus_adapter *na, struct mbuf *m,
1875     struct __kern_packet *pkt)
1876 {
1877 	struct nexus_netif_adapter *nifna = (struct nexus_netif_adapter *)na;
1878 	struct nx_netif *nif = nifna->nifna_netif;
1879 	struct netif_stats *nifs = &NX_NETIF_PRIVATE(na->na_nx)->nif_stats;
1880 	struct ifnet *ifp = na->na_ifp;
1881 	kern_packet_t ph = SK_PTR_ENCODE(pkt, METADATA_TYPE(pkt),
1882 	    METADATA_SUBTYPE(pkt));
1883 	uint32_t len;
1884 	int ret = 0;
1885 
1886 	if ((ret = mbuf_ring_cluster_activate(m)) != 0) {
1887 		panic("Failed to activate mbuf ring cluster 0x%llx (%d)",
1888 		    SK_KVA(m), ret);
1889 		/* NOTREACHED */
1890 		__builtin_unreachable();
1891 	}
1892 
1893 	len = pkt->pkt_length;
1894 
1895 	/*
1896 	 * The mbuf should be a cluster from our special pool,
1897 	 * so we do not need to do an m_copyback but just copy.
1898 	 */
1899 	if (m->m_ext.ext_size < len) {
1900 		SK_RD(5, "size %u < len %u", m->m_ext.ext_size, len);
1901 		len = m->m_ext.ext_size;
1902 	}
1903 
1904 	STATS_INC(nifs, NETIF_STATS_TX_COPY_MBUF);
1905 	if (PACKET_HAS_PARTIAL_CHECKSUM(pkt)) {
1906 		STATS_INC(nifs, NETIF_STATS_TX_COPY_SUM);
1907 	}
1908 
1909 	nif->nif_pkt_copy_to_mbuf(NR_TX, ph, pkt->pkt_headroom, m, 0, len,
1910 	    PACKET_HAS_PARTIAL_CHECKSUM(pkt), pkt->pkt_csum_tx_start_off);
1911 
1912 	/* used for tx notification */
1913 	ret = mbuf_set_tx_compl_data(m, (uintptr_t)ifp, (uintptr_t)NULL);
1914 	ASSERT(ret == 0);
1915 
1916 	ret = dlil_output_handler(ifp, m);
1917 	return ret;
1918 }
1919