xref: /xnu-12377.1.9/bsd/skywalk/nexus/netif/nx_netif_compat.c (revision f6217f891ac0bb64f3d375211650a4c1ff8ca1ea)
1 /*
2  * Copyright (c) 2015-2024 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 /*
30  * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
31  *
32  * Redistribution and use in source and binary forms, with or without
33  * modification, are permitted provided that the following conditions
34  * are met:
35  *   1. Redistributions of source code must retain the above copyright
36  *      notice, this list of conditions and the following disclaimer.
37  *   2. Redistributions in binary form must reproduce the above copyright
38  *      notice, this list of conditions and the following disclaimer in the
39  *      documentation and/or other materials provided with the distribution.
40  *
41  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
42  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51  * SUCH DAMAGE.
52  */
53 
54 #include <skywalk/os_skywalk_private.h>
55 #include <skywalk/nexus/netif/nx_netif.h>
56 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
57 #include <mach/thread_act.h>
58 #include <kern/sched_prim.h>
59 #include <kern/thread.h>
60 #include <kern/uipc_domain.h>
61 
62 static void na_netif_compat_finalize(struct nexus_netif_adapter *,
63     struct ifnet *);
64 static errno_t nx_netif_compat_receive(struct ifnet *ifp, struct mbuf *m_head,
65     struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
66     boolean_t poll, struct thread *tp);
67 static int nx_netif_compat_catch_rx(struct nexus_netif_compat_adapter *na,
68     boolean_t enable);
69 static int nx_netif_compat_xmit_frame(struct nexus_adapter *, struct mbuf *,
70     struct __kern_packet *);
71 
72 static int nx_netif_compat_na_notify_tx(struct __kern_channel_ring *,
73     struct proc *, uint32_t);
74 static int nx_netif_compat_na_notify_rx(struct __kern_channel_ring *,
75     struct proc *, uint32_t);
76 static int nx_netif_compat_na_activate(struct nexus_adapter *,
77     na_activate_mode_t);
78 static int nx_netif_compat_na_txsync(struct __kern_channel_ring *,
79     struct proc *, uint32_t);
80 static int nx_netif_compat_na_rxsync(struct __kern_channel_ring *,
81     struct proc *, uint32_t);
82 static void nx_netif_compat_na_dtor(struct nexus_adapter *na);
83 
84 static void nx_netif_compat_tx_intr(struct ifnet *, enum txrx, uint32_t,
85     uint32_t *);
86 static inline struct mbuf *nx_netif_compat_ring_alloc(int, int, uint16_t);
87 static inline void nx_netif_compat_ring_free(struct mbuf *m);
88 static void nx_netif_compat_ringcb(caddr_t cl, uint32_t size, caddr_t arg);
89 
90 static uint32_t nx_netif_compat_tx_clean(struct netif_stats *nifs,
91     struct __kern_channel_ring *kring);
92 static void nx_netif_compat_set_tx_event(struct __kern_channel_ring *kring,
93     slot_idx_t khead);
94 
95 static struct nexus_netif_compat_adapter *na_netif_compat_alloc(zalloc_flags_t);
96 static void na_netif_compat_free(struct nexus_adapter *);
97 #if DEBUG || DEVELOPMENT
98 static struct mbuf *nx_netif_rx_split(struct mbuf *, uint32_t);
99 #endif /* DEBUG || DEVELOPMENT */
100 
101 #define MBUF_TXQ(m)     ((m)->m_pkthdr.pkt_flowid)
102 #define MBUF_RXQ(m)     ((m)->m_pkthdr.pkt_flowid)
103 
104 #define NMB_PROPF_TX_NOTIFY     0x1     /* generate transmit event */
105 #define NMB_FLAGS_MASK          0x0000ffff
106 #define NMB_INDEX_MASK          0xffff0000
107 #define NMB_GET_FLAGS(p)        (((uint32_t)(p) & NMB_FLAGS_MASK))
108 #define NMB_SET_FLAGS(p, f)     (((uint32_t)(p) & ~NMB_FLAGS_MASK) | (f))
109 #define NMB_GET_INDEX(p)        (((uint32_t)(p) & NMB_INDEX_MASK) >> 16)
110 #define NMB_SET_INDEX(p, i)     (((uint32_t)(p) & ~NMB_INDEX_MASK) | (i << 16))
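/*
 * Illustrative sketch (not part of the driver path): the cluster property
 * word packs the per-slot flags into its low 16 bits and the slot index
 * into its high 16 bits.  With a hypothetical index of 5:
 *
 *	uint32_t p = 0;
 *	p = NMB_SET_FLAGS(p, NMB_PROPF_TX_NOTIFY);	-> p == 0x00000001
 *	p = NMB_SET_INDEX(p, 5);			-> p == 0x00050001
 *	ASSERT(NMB_GET_FLAGS(p) == NMB_PROPF_TX_NOTIFY);
 *	ASSERT(NMB_GET_INDEX(p) == 5);
 */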
111 
112 static SKMEM_TYPE_DEFINE(na_netif_compat_zone, struct nexus_netif_compat_adapter);
113 
114 static int netif_tx_event_mode = 0;
115 
116 #if (DEVELOPMENT || DEBUG)
117 SYSCTL_EXTENSIBLE_NODE(_kern_skywalk_netif, OID_AUTO, compat,
118     CTLFLAG_RW | CTLFLAG_LOCKED,
119     0, "Skywalk netif Nexus legacy compatibility support");
120 SYSCTL_INT(_kern_skywalk_netif_compat, OID_AUTO, tx_event_mode,
121     CTLFLAG_RW | CTLFLAG_LOCKED, &netif_tx_event_mode, 0, "");
122 static uint32_t netif_rx_split = 0;
123 SYSCTL_UINT(_kern_skywalk_netif_compat, OID_AUTO, rx_split,
124     CTLFLAG_RW | CTLFLAG_LOCKED, &netif_rx_split, 0, "");
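/*
 * These tunables exist only on DEVELOPMENT/DEBUG kernels.  A hypothetical
 * tuning session from the shell (illustrative only; the node names follow
 * the sysctl declarations above):
 *
 *	sysctl kern.skywalk.netif.compat.tx_event_mode=1  # mid-ring TX events
 *	sysctl kern.skywalk.netif.compat.rx_split=1       # exercise split RX chains
 */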
125 #endif /* DEVELOPMENT || DEBUG */
126 
127 struct kern_nexus_domain_provider nx_netif_compat_prov_s = {
128 	.nxdom_prov_name =              NEXUS_PROVIDER_NET_IF_COMPAT,
129 	.nxdom_prov_flags =             NXDOMPROVF_DEFAULT,
130 	.nxdom_prov_cb = {
131 		.dp_cb_init =           nx_netif_prov_init,
132 		.dp_cb_fini =           nx_netif_prov_fini,
133 		.dp_cb_params =         nx_netif_prov_params,
134 		/*
135 		 * We must be using the native netif handlers below,
136 		 * since we act as the default domain provider; see
137 		 * kern_nexus_register_domain_provider().
138 		 */
139 		.dp_cb_mem_new =        nx_netif_prov_mem_new,
140 		.dp_cb_config =         nx_netif_prov_config,
141 		.dp_cb_nx_ctor =        nx_netif_prov_nx_ctor,
142 		.dp_cb_nx_dtor =        nx_netif_prov_nx_dtor,
143 		.dp_cb_nx_mem_info =    nx_netif_prov_nx_mem_info,
144 		.dp_cb_nx_mib_get =     nx_netif_prov_nx_mib_get,
145 		.dp_cb_nx_stop =        nx_netif_prov_nx_stop,
146 	},
147 };
148 
149 struct nexus_ifnet_ops na_netif_compat_ops = {
150 	.ni_finalize = na_netif_compat_finalize,
151 	.ni_reap = nx_netif_reap,
152 	.ni_dequeue = nx_netif_compat_tx_dequeue,
153 	.ni_get_len = nx_netif_compat_tx_get_len,
154 };
155 
156 #define SKMEM_TAG_NETIF_COMPAT_MIT      "com.apple.skywalk.netif.compat.mit"
157 static SKMEM_TAG_DEFINE(skmem_tag_netif_compat_mit, SKMEM_TAG_NETIF_COMPAT_MIT);
158 
159 #define SKMEM_TAG_NETIF_COMPAT_POOL     "com.apple.skywalk.netif.compat.pool"
160 static SKMEM_TAG_DEFINE(skmem_tag_netif_compat_pool, SKMEM_TAG_NETIF_COMPAT_POOL);
161 
162 void
163 nx_netif_compat_init(struct nxdom *nxdom)
164 {
165 	static_assert(NETIF_COMPAT_MAX_MBUF_DATA_COPY <= NETIF_COMPAT_BUF_SIZE);
166 
167 	/*
168 	 * We want nxprov_create() coming from userland to use the
169 	 * netif_compat domain provider, so install it as default.
170 	 * This is verified by the caller.
171 	 */
172 	(void) nxdom_prov_add(nxdom, &nx_netif_compat_prov_s);
173 }
174 
175 void
176 nx_netif_compat_fini(void)
177 {
178 	(void) nxdom_prov_del(&nx_netif_compat_prov_s);
179 }
180 
181 static struct nexus_netif_compat_adapter *
182 na_netif_compat_alloc(zalloc_flags_t how)
183 {
184 	struct nexus_netif_compat_adapter *nca;
185 
186 	static_assert(offsetof(struct nexus_netif_compat_adapter, nca_up) == 0);
187 
188 	nca = zalloc_flags(na_netif_compat_zone, how | Z_ZERO);
189 	if (nca) {
190 		SK_DF(SK_VERB_MEM, "nca %p ALLOC", SK_KVA(nca));
191 	}
192 	return nca;
193 }
194 
195 static void
196 na_netif_compat_free(struct nexus_adapter *na)
197 {
198 	struct nexus_netif_compat_adapter *nca =
199 	    (struct nexus_netif_compat_adapter *)na;
200 
201 	SK_LOCK_ASSERT_HELD();
202 	ASSERT(na->na_refcount == 0);
203 
204 	SK_DF(SK_VERB_MEM, "nca [dev+host] %p FREE", SK_KVA(nca));
205 	bzero(nca, sizeof(*nca));
206 	zfree(na_netif_compat_zone, nca);
207 }
208 
209 /*
210  * Callback invoked when the device driver frees an mbuf used
211  * by skywalk to transmit a packet. This usually happens when
212  * the NIC notifies the driver that transmission is completed.
213  */
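/*
 * The resulting TX completion round trip, as wired up in this file: the
 * txsync path marks one of the pool mbufs with NMB_PROPF_TX_NOTIFY via
 * nx_netif_compat_set_tx_event(); when the driver later frees that ring
 * cluster, this callback runs, clears the flag and calls
 * nx_netif_compat_tx_intr(), which notifies the TX ring through
 * nx_netif_mit_tx_intr().
 */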
214 static void
215 nx_netif_compat_ringcb(caddr_t cl, uint32_t size, caddr_t arg)
216 {
217 #pragma unused(cl, size)
218 	struct mbuf *__single m = (void *)arg;
219 	struct ifnet *ifp = NULL;
220 	struct netif_stats *nifs = NULL;
221 	uintptr_t data; /* not used */
222 	uint32_t txq;
223 	errno_t err;
224 
225 	err = mbuf_get_tx_compl_data(m, (uintptr_t *)&ifp, &data);
226 	ASSERT(err == 0);
227 
228 	nifs = &NX_NETIF_PRIVATE(NA(ifp)->nifna_up.na_nx)->nif_stats;
229 	txq = MBUF_TXQ(m);
230 
231 	for (;;) {
232 		uint32_t p = 0, i, f;
233 
234 		(void) mbuf_cluster_get_prop(m, &p);
235 		f = NMB_GET_FLAGS(p);
236 		i = NMB_GET_INDEX(p);
237 
238 		SK_DF(SK_VERB_NETIF, "%s m %p txq %u i %u f 0x%x",
239 		    if_name(ifp), SK_KVA(m), MBUF_TXQ(m), i, f);
240 
241 		if (f & NMB_PROPF_TX_NOTIFY) {
242 			uint32_t pn;
243 
244 			f &= ~NMB_PROPF_TX_NOTIFY;
245 			pn = NMB_SET_FLAGS(p, f);
246 
247 			err = mbuf_cluster_set_prop(m, p, pn);
248 			if (err != 0) {
249 				if (err == EBUSY) {     /* try again */
250 					continue;
251 				}
252 				/* TODO: [email protected] -- what to do? */
253 				SK_ERR("Failed to clear TX_NOTIFY "
254 				    "m %p i %u err %d", SK_KVA(m), i, err);
255 			} else {
256 				nx_netif_compat_tx_intr(ifp, NR_TX, txq, NULL);
257 				SK_DF(SK_VERB_NETIF | SK_VERB_INTR | SK_VERB_TX,
258 				    "%s TX irq m %p txq %u i %u f 0x%x",
259 				    if_name(ifp), SK_KVA(m), MBUF_TXQ(m), i, f);
260 				STATS_INC(nifs, NETIF_STATS_TX_IRQ);
261 			}
262 		}
263 		break;
264 	}
265 }
266 
267 /* Hoisted out of line to reduce kernel stack footprint */
268 SK_NO_INLINE_ATTRIBUTE
269 static struct mbuf *
270 nx_netif_compat_ring_alloc(int how, int len, uint16_t idx)
271 {
272 	struct mbuf *__single m = NULL;
273 	size_t size = len;
274 	uint32_t i;
275 
276 	if (mbuf_ring_cluster_alloc(how, MBUF_TYPE_HEADER, &m,
277 	    nx_netif_compat_ringcb, &size) != 0) {
278 		return NULL;
279 	}
280 
281 	for (;;) {
282 		uint32_t p = 0, pn;
283 		int err;
284 
285 		(void) mbuf_cluster_get_prop(m, &p);
286 		pn = NMB_SET_FLAGS(p, 0);
287 		pn = NMB_SET_INDEX(pn, idx);
288 
289 		err = mbuf_cluster_set_prop(m, p, pn);
290 		if (err != 0) {
291 			if (err == EBUSY) {     /* try again */
292 				continue;
293 			}
294 			SK_ERR("Failed to initialize properties m %p "
295 			    "err %d", SK_KVA(m), err);
296 			m_freem(m);
297 			return NULL;
298 		}
299 		(void) mbuf_cluster_get_prop(m, &p);
300 		i = NMB_GET_INDEX(p);
301 		ASSERT(i == idx);
302 		break;
303 	}
304 
305 	SK_DF(SK_VERB_MEM, "alloc m %p size %u i %u",
306 	    SK_KVA(m), (uint32_t)size, i);
307 
308 	return m;
309 }
310 
311 /* Hoisted out of line to reduce kernel stack footprint */
312 SK_NO_INLINE_ATTRIBUTE
313 static void
314 nx_netif_compat_ring_free(struct mbuf *m)
315 {
316 	if (m == NULL) {
317 		return;
318 	}
319 
320 	for (;;) {
321 		uint32_t p = 0;
322 		int err;
323 
324 		(void) mbuf_cluster_get_prop(m, &p);
325 		err = mbuf_cluster_set_prop(m, p, 0);
326 		if (err != 0) {
327 			if (err == EBUSY) {     /* try again */
328 				continue;
329 			}
330 			/* TODO: [email protected] -- what to do? */
331 			SK_ERR("Failed to clear properties m %p err %d",
332 			    SK_KVA(m), err);
333 		}
334 		break;
335 	}
336 	m_freem(m);
337 }
338 
339 static void
340 nx_netif_compat_tx_intr(struct ifnet *ifp, enum txrx t, uint32_t q,
341     uint32_t *work_done)
342 {
343 	struct nexus_adapter *na = &NA(ifp)->nifna_up;
344 
345 	if (__improbable(!NA_IS_ACTIVE(na) || q >= na_get_nrings(na, t))) {
346 		if (q >= na_get_nrings(na, t)) {
347 			SK_ERR("na \"%s\" (%p) invalid q %u >= %u",
348 			    na->na_name, SK_KVA(na), q, na_get_nrings(na, t));
349 		}
350 	} else {
351 		(void) nx_netif_mit_tx_intr((NAKR(na, t) + q), kernproc,
352 		    0, work_done);
353 	}
354 }
355 
356 static int
357 nx_netif_compat_na_notify_tx(struct __kern_channel_ring *kring,
358     struct proc *p, uint32_t flags)
359 {
360 	/*
361 	 * This should never get executed, as nothing should be invoking
362 	 * the TX ring notify callback.  The compat adapter directly
363 	 * calls nx_netif_compat_tx_intr() for TX completion from within
364 	 * nx_netif_compat_ringcb().
365 	 *
366 	 * If we ever get here, use the original na_notify callback
367 	 * saved during na_activate().
368 	 */
369 	return kring->ckr_netif_notify(kring, p, flags);
370 }
371 
372 static int
373 nx_netif_compat_na_notify_rx(struct __kern_channel_ring *kring,
374     struct proc *p, uint32_t flags)
375 {
376 	/*
377 	 * This should never get executed, as nothing should be invoking
378 	 * the RX ring notify callback.  The compat adapter directly
379 	 * calls nx_netif_mit_rx_intr() for RX completion from within
380 	 * nx_netif_compat_receive().
381 	 *
382 	 * If we ever get here, use the original na_notify callback
383 	 * saved during na_activate().
384 	 */
385 	return kring->ckr_netif_notify(kring, p, flags);
386 }
387 
388 /* Enable/disable skywalk mode for a compat network interface. */
389 static int
390 nx_netif_compat_na_activate(struct nexus_adapter *na, na_activate_mode_t mode)
391 {
392 	struct nexus_netif_adapter *nifna = NIFNA(na);
393 	boolean_t tx_mit, rx_mit, tx_mit_simple, rx_mit_simple, rxpoll;
394 	uint32_t limit = (uint32_t)sk_netif_compat_rx_mbq_limit;
395 	struct nx_netif *nif = nifna->nifna_netif;
396 	struct nexus_netif_compat_adapter *nca;
397 	ifnet_t ifp = na->na_ifp;
398 	uint32_t i, r;
399 	int error;
400 	/* TODO -fbounds-safety: Remove tmp and use __counted_by_or_null */
401 	struct nx_netif_mit *mit_tmp;
402 	uint32_t nrings;
403 	struct mbuf **ckr_tx_pool_tmp;
404 
405 	ASSERT(na->na_type == NA_NETIF_COMPAT_DEV);
406 	ASSERT(!(na->na_flags & NAF_HOST_ONLY));
407 
408 	SK_DF(SK_VERB_NETIF, "na \"%s\" (%p) %s", na->na_name,
409 	    SK_KVA(na), na_activate_mode2str(mode));
410 
411 	nca = (struct nexus_netif_compat_adapter *)nifna;
412 
413 	switch (mode) {
414 	case NA_ACTIVATE_MODE_ON:
415 		ASSERT(SKYWALK_CAPABLE(na->na_ifp));
416 
417 		nx_netif_mit_config(nifna, &tx_mit, &tx_mit_simple,
418 		    &rx_mit, &rx_mit_simple);
419 
420 		/*
421 		 * Init the mitigation support on all the dev TX rings.
422 		 */
423 		if (na_get_nrings(na, NR_TX) != 0 && tx_mit) {
424 			nrings = na_get_nrings(na, NR_TX);
425 			mit_tmp = skn_alloc_type_array(tx_on, struct nx_netif_mit,
426 			    nrings, Z_WAITOK, skmem_tag_netif_compat_mit);
427 			if (mit_tmp == NULL) {
428 				SK_ERR("TX mitigation allocation failed");
429 				error = ENOMEM;
430 				goto out;
431 			}
432 			nifna->nifna_tx_mit = mit_tmp;
433 			nifna->nifna_tx_mit_count = nrings;
434 		} else {
435 			ASSERT(nifna->nifna_tx_mit == NULL);
436 		}
437 
438 		/*
439 		 * Init either poller or mitigation support on all the
440 		 * dev RX rings; they're mutually exclusive and poller
441 		 * takes precedence.
442 		 */
443 		rxpoll = (net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL));
444 		if (rxpoll) {
445 			int err;
446 			__unused kern_return_t kret;
447 			thread_precedence_policy_data_t info;
448 
449 			ASSERT((ifp->if_xflags & IFXF_LEGACY) == 0);
450 			ASSERT(ifp->if_input_poll != NULL);
451 			ASSERT(ifp->if_input_ctl != NULL);
452 			if ((err =
453 			    kernel_thread_start(netif_rxpoll_compat_thread_func,
454 			    ifp, &ifp->if_poll_thread)) != KERN_SUCCESS) {
455 				panic_plain("%s: ifp=%p couldn't get a poll "
456 				    " thread; err=%d", __func__, ifp, err);
457 				/* NOTREACHED */
458 				__builtin_unreachable();
459 			}
460 			VERIFY(ifp->if_poll_thread != NULL);
461 
462 			/* wait until thread is ready */
463 			lck_mtx_lock(&ifp->if_poll_lock);
464 			while (!(ifp->if_poll_flags & IF_POLLF_READY)) {
465 				(void) assert_wait(&ifp->if_poll_flags,
466 				    THREAD_UNINT);
467 				lck_mtx_unlock(&ifp->if_poll_lock);
468 				(void) thread_block(THREAD_CONTINUE_NULL);
469 				lck_mtx_lock(&ifp->if_poll_lock);
470 			}
471 			lck_mtx_unlock(&ifp->if_poll_lock);
472 
473 			bzero(&info, sizeof(info));
474 			info.importance = 1;
475 			kret = thread_policy_set(ifp->if_poll_thread,
476 			    THREAD_PRECEDENCE_POLICY, (thread_policy_t)&info,
477 			    THREAD_PRECEDENCE_POLICY_COUNT);
478 			ASSERT(kret == KERN_SUCCESS);
479 			limit = if_rcvq_maxlen;
480 			(void) netif_rxpoll_set_params(ifp, NULL, FALSE);
481 			ASSERT(nifna->nifna_rx_mit == NULL);
482 		} else if (rx_mit) {
483 			nrings = na_get_nrings(na, NR_RX);
484 			mit_tmp = skn_alloc_type_array(rx_on, struct nx_netif_mit,
485 			    nrings, Z_WAITOK, skmem_tag_netif_compat_mit);
486 			if (mit_tmp == NULL) {
487 				SK_ERR("RX mitigation allocation failed");
488 				if (nifna->nifna_tx_mit != NULL) {
489 					skn_free_type_array_counted_by(rx_fail,
490 					    struct nx_netif_mit,
491 					    nifna->nifna_tx_mit_count,
492 					    nifna->nifna_tx_mit);
493 				}
494 				error = ENOMEM;
495 				goto out;
496 			}
497 			nifna->nifna_rx_mit = mit_tmp;
498 			nifna->nifna_rx_mit_count = nrings;
499 		}
500 
501 		/* intercept na_notify callback on the TX rings */
502 		for (r = 0; r < na_get_nrings(na, NR_TX); r++) {
503 			na->na_tx_rings[r].ckr_netif_notify =
504 			    na->na_tx_rings[r].ckr_na_notify;
505 			na->na_tx_rings[r].ckr_na_notify =
506 			    nx_netif_compat_na_notify_tx;
507 			if (nifna->nifna_tx_mit != NULL) {
508 				nx_netif_mit_init(nif, na->na_ifp,
509 				    &nifna->nifna_tx_mit[r],
510 				    &na->na_tx_rings[r], tx_mit_simple);
511 			}
512 		}
513 
514 		/* intercept na_notify callback on the RX rings */
515 		for (r = 0; r < na_get_nrings(na, NR_RX); r++) {
516 			na->na_rx_rings[r].ckr_netif_notify =
517 			    na->na_rx_rings[r].ckr_na_notify;
518 			na->na_rx_rings[r].ckr_na_notify =
519 			    nx_netif_compat_na_notify_rx;
520 			if (nifna->nifna_rx_mit != NULL) {
521 				nx_netif_mit_init(nif, na->na_ifp,
522 				    &nifna->nifna_rx_mit[r],
523 				    &na->na_rx_rings[r], rx_mit_simple);
524 			}
525 		}
526 		/*
527 		 * Initialize the rx queue, as nx_netif_compat_receive() can
528 		 * be called as soon as nx_netif_compat_catch_rx() returns.
529 		 */
530 		for (r = 0; r < na_get_nrings(na, NR_RX); r++) {
531 			struct __kern_channel_ring *kr = &na->na_rx_rings[r];
532 
533 			nx_mbq_safe_init(kr, &kr->ckr_rx_queue, limit,
534 			    &nexus_mbq_lock_group, &nexus_lock_attr);
535 			SK_DF(SK_VERB_NETIF,
536 			    "na \"%s\" (%p) initialized kr \"%s\" "
537 			    "(%p) krflags 0x%x", na->na_name, SK_KVA(na),
538 			    kr->ckr_name, SK_KVA(kr), kr->ckr_flags);
539 		}
540 
541 		/*
542 		 * Prepare packet buffers for the tx rings; don't preallocate
543 		 * the mbufs here, leave this to nx_netif_compat_na_txsync().
544 		 */
545 		for (r = 0; r < na_get_nrings(na, NR_TX); r++) {
546 			na->na_tx_rings[r].ckr_tx_pool = NULL;
547 			na->na_tx_rings[r].ckr_tx_pool_count = 0;
548 		}
549 
550 		for (r = 0; r < na_get_nrings(na, NR_TX); r++) {
551 			nrings = na_get_nslots(na, NR_TX);
552 			ckr_tx_pool_tmp =
553 			    skn_alloc_type_array(tx_pool_on, struct mbuf *,
554 			    nrings, Z_WAITOK,
555 			    skmem_tag_netif_compat_pool);
556 			if (ckr_tx_pool_tmp == NULL) {
557 				SK_ERR("ckr_tx_pool allocation failed");
558 				error = ENOMEM;
559 				goto free_tx_pools;
560 			}
561 			na->na_tx_rings[r].ckr_tx_pool = ckr_tx_pool_tmp;
562 			na->na_tx_rings[r].ckr_tx_pool_count = nrings;
563 		}
564 
565 		/* Prepare to intercept incoming traffic. */
566 		error = nx_netif_compat_catch_rx(nca, TRUE);
567 		if (error != 0) {
568 			SK_ERR("RX intercept failed (%d)", error);
569 			goto uncatch;
570 		}
571 		nx_netif_filter_enable(nifna->nifna_netif);
572 		nx_netif_flow_enable(nifna->nifna_netif);
573 		os_atomic_or(&na->na_flags, NAF_ACTIVE, relaxed);
574 		break;
575 
576 	case NA_ACTIVATE_MODE_DEFUNCT:
577 		ASSERT(SKYWALK_CAPABLE(na->na_ifp));
578 		break;
579 
580 	case NA_ACTIVATE_MODE_OFF:
581 		/*
582 		 * Note that here we cannot assert SKYWALK_CAPABLE()
583 		 * as we're called in the destructor path.
584 		 */
585 		os_atomic_andnot(&na->na_flags, NAF_ACTIVE, relaxed);
586 		nx_netif_flow_disable(nifna->nifna_netif);
587 		nx_netif_filter_disable(nifna->nifna_netif);
588 
589 		/*
590 		 * Signal the poller thread to terminate itself, and
591 		 * wait for it to exit.
592 		 */
593 		if (ifp->if_poll_thread != THREAD_NULL) {
594 			ASSERT(net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL));
595 			ASSERT((ifp->if_xflags & IFXF_LEGACY) == 0);
596 			lck_mtx_lock_spin(&ifp->if_poll_lock);
597 			ifp->if_poll_flags |= IF_POLLF_TERMINATING;
598 			wakeup_one((caddr_t)&ifp->if_poll_thread);
599 			lck_mtx_unlock(&ifp->if_poll_lock);
600 
601 			/* wait for poller thread to terminate */
602 			lck_mtx_lock(&ifp->if_poll_lock);
603 			while (ifp->if_poll_thread != THREAD_NULL) {
604 				SK_DF(SK_VERB_NETIF_POLL,
605 				    "%s: waiting for poller thread to terminate",
606 				    if_name(ifp));
607 				(void) msleep(&ifp->if_poll_thread,
608 				    &ifp->if_poll_lock, (PZERO - 1),
609 				    "netif_poll_thread_exit", NULL);
610 			}
611 			lck_mtx_unlock(&ifp->if_poll_lock);
612 			SK_DF(SK_VERB_NETIF_POLL,
613 			    "%s: poller thread termination complete",
614 			    if_name(ifp));
615 		}
616 
617 		/* Do not intercept packets on the rx path. */
618 		(void) nx_netif_compat_catch_rx(nca, FALSE);
619 
620 		/* Free the mbufs going to the channel rings */
621 		for (r = 0; r < na_get_nrings(na, NR_RX); r++) {
622 			nx_mbq_safe_purge(&na->na_rx_rings[r].ckr_rx_queue);
623 			nx_mbq_safe_destroy(&na->na_rx_rings[r].ckr_rx_queue);
624 		}
625 
626 		/* reset all TX notify callbacks */
627 		for (r = 0; r < na_get_nrings(na, NR_TX); r++) {
628 			na->na_tx_rings[r].ckr_na_notify =
629 			    na->na_tx_rings[r].ckr_netif_notify;
630 			na->na_tx_rings[r].ckr_netif_notify = NULL;
631 			if (nifna->nifna_tx_mit != NULL) {
632 				na->na_tx_rings[r].ckr_netif_mit_stats = NULL;
633 				nx_netif_mit_cleanup(&nifna->nifna_tx_mit[r]);
634 			}
635 		}
636 
637 		if (nifna->nifna_tx_mit != NULL) {
638 			skn_free_type_array_counted_by(tx_off, struct nx_netif_mit,
639 			    nifna->nifna_tx_mit_count, nifna->nifna_tx_mit);
640 		}
641 
642 		/* reset all RX notify callbacks */
643 		for (r = 0; r < na_get_nrings(na, NR_RX); r++) {
644 			na->na_rx_rings[r].ckr_na_notify =
645 			    na->na_rx_rings[r].ckr_netif_notify;
646 			na->na_rx_rings[r].ckr_netif_notify = NULL;
647 			if (nifna->nifna_rx_mit != NULL) {
648 				na->na_rx_rings[r].ckr_netif_mit_stats = NULL;
649 				nx_netif_mit_cleanup(&nifna->nifna_rx_mit[r]);
650 			}
651 		}
652 		if (nifna->nifna_rx_mit != NULL) {
653 			skn_free_type_array_counted_by(rx_off, struct nx_netif_mit,
654 			    nifna->nifna_rx_mit_count, nifna->nifna_rx_mit);
655 		}
656 
657 		for (r = 0; r < na_get_nrings(na, NR_TX); r++) {
658 			for (i = 0; i < na_get_nslots(na, NR_TX); i++) {
659 				nx_netif_compat_ring_free(na->
660 				    na_tx_rings[r].ckr_tx_pool[i]);
661 				na->na_tx_rings[r].ckr_tx_pool[i] = NULL;
662 			}
663 			skn_free_type_array_counted_by(tx_pool_off,
664 			    struct mbuf *, na->na_tx_rings[r].ckr_tx_pool_count,
665 			    na->na_tx_rings[r].ckr_tx_pool);
666 		}
667 		break;
668 
669 	default:
670 		VERIFY(0);
671 		/* NOTREACHED */
672 		__builtin_unreachable();
673 	}
674 
675 	return 0;
676 
677 uncatch:
678 	(void) nx_netif_compat_catch_rx(nca, FALSE);
679 
680 free_tx_pools:
681 	for (r = 0; r < na_get_nrings(na, NR_TX); r++) {
682 		if (na->na_tx_rings[r].ckr_tx_pool == NULL) {
683 			continue;
684 		}
685 		for (i = 0; i < na_get_nslots(na, NR_TX); i++) {
686 			nx_netif_compat_ring_free(
687 				na->na_tx_rings[r].ckr_tx_pool[i]);
688 			na->na_tx_rings[r].ckr_tx_pool[i] = NULL;
689 		}
690 		skn_free_type_array_counted_by(tx_pool, struct mbuf *,
691 		    na->na_tx_rings[r].ckr_tx_pool_count,
692 		    na->na_tx_rings[r].ckr_tx_pool);
693 	}
694 	if (nifna->nifna_tx_mit != NULL) {
695 		for (r = 0; r < na_get_nrings(na, NR_TX); r++) {
696 			nx_netif_mit_cleanup(&nifna->nifna_tx_mit[r]);
697 		}
698 		skn_free_type_array_counted_by(tx, struct nx_netif_mit,
699 		    nifna->nifna_tx_mit_count, nifna->nifna_tx_mit);
700 	}
701 	if (nifna->nifna_rx_mit != NULL) {
702 		for (r = 0; r < na_get_nrings(na, NR_RX); r++) {
703 			nx_netif_mit_cleanup(&nifna->nifna_rx_mit[r]);
704 		}
705 		skn_free_type_array_counted_by(rx, struct nx_netif_mit,
706 		    nifna->nifna_rx_mit_count, nifna->nifna_rx_mit);
707 	}
708 	for (r = 0; r < na_get_nrings(na, NR_RX); r++) {
709 		nx_mbq_safe_destroy(&na->na_rx_rings[r].ckr_rx_queue);
710 	}
711 out:
712 
713 	return error;
714 }
715 
716 /*
717  * Record completed transmissions and update ktail.
718  *
719  * The oldest tx buffer not yet completed is at ckr_ktail + 1;
720  * ckr_khead is the first unsent buffer.
721  */
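/*
 * For example (illustrative indices), with ckr_lim == 7, ckr_khead == 5
 * and ckr_ktail == 1: slots 2, 3 and 4 hold mbufs handed to the driver.
 * Cleaning scans from SLOT_NEXT(ckr_ktail) == 2 towards ckr_khead and
 * stops at the first mbuf whose ring cluster is still in use; if slots
 * 2 and 3 have completed, ckr_ktail advances to 3.
 */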
722 /* Hoisted out of line to reduce kernel stack footprint */
723 SK_NO_INLINE_ATTRIBUTE
724 static uint32_t
725 nx_netif_compat_tx_clean(struct netif_stats *nifs,
726     struct __kern_channel_ring *kring)
727 {
728 	const slot_idx_t lim = kring->ckr_lim;
729 	slot_idx_t nm_i = SLOT_NEXT(kring->ckr_ktail, lim);
730 	slot_idx_t khead = kring->ckr_khead;
731 	uint32_t n = 0;
732 	struct mbuf **ckr_tx_pool = kring->ckr_tx_pool;
733 
734 	while (nm_i != khead) { /* buffers not completed */
735 		struct mbuf *m = ckr_tx_pool[nm_i];
736 
737 		if (__improbable(m == NULL)) {
738 			/* this is done, try to replenish the entry */
739 			VERIFY(nm_i <= UINT16_MAX);
740 			ckr_tx_pool[nm_i] = m =
741 			    nx_netif_compat_ring_alloc(M_WAITOK,
742 			    kring->ckr_max_pkt_len, (uint16_t)nm_i);
743 			if (__improbable(m == NULL)) {
744 				STATS_INC(nifs, NETIF_STATS_DROP_NOMEM_MBUF);
745 				STATS_INC(nifs, NETIF_STATS_DROP);
746 				SK_DF(SK_VERB_MEM,
747 				    "mbuf allocation failed (slot %u)", nm_i);
748 				/* XXX how do we proceed ? break ? */
749 				return -ENOMEM;
750 			}
751 		} else if (mbuf_ring_cluster_is_active(m)) {
752 			break; /* This mbuf is still busy */
753 		}
754 		n++;
755 		nm_i = SLOT_NEXT(nm_i, lim);
756 	}
757 	kring->ckr_ktail = SLOT_PREV(nm_i, lim);
758 
759 	SK_RDF(SK_VERB_NETIF, 10, "kr \"%s\" (%p) tx completed [%u] -> "
760 	    "kh %u kt %u | rh %u rt %u", kring->ckr_name, SK_KVA(kring),
761 	    n, kring->ckr_khead, kring->ckr_ktail,
762 	    kring->ckr_rhead, kring->ckr_rtail);
763 
764 	return n;
765 }
766 
767 /* Hoisted out of line to reduce kernel stack footprint */
768 SK_NO_INLINE_ATTRIBUTE
769 static void
770 nx_netif_compat_set_tx_event(struct __kern_channel_ring *kring,
771     slot_idx_t khead)
772 {
773 	const slot_idx_t lim = kring->ckr_lim;
774 	slot_idx_t ntc = SLOT_NEXT(kring->ckr_ktail, lim); /* next to clean */
775 	struct mbuf *m;
776 	slot_idx_t e;
777 
778 	if (ntc == khead) {
779 		return; /* all buffers are free */
780 	}
781 	/*
782 	 * We have pending packets in the driver between ckr_ktail+1 and
783 	 * ckr_khead, and we have to choose one of these slots to generate
784 	 * a TX notification.  There is a race, but this is only called
785 	 * within TX sync which does a double check.
786 	 */
787 	if (__probable(netif_tx_event_mode == 0)) {
788 		/*
789 		 * Choose the first pending slot, to be safe against drivers
790 		 * reordering mbuf transmissions.
791 		 */
792 		e = ntc;
793 	} else {
794 		/*
795 		 * Choose a slot in the middle, so that we don't risk ending
796 		 * up in a situation where the client continuously wakes up,
797 		 * fills one or a few TX slots, and goes to sleep again.
798 		 */
799 		slot_idx_t n = lim + 1;
800 
801 		if (khead >= ntc) {
802 			e = (khead + ntc) >> 1;
803 		} else { /* wrap around */
804 			e = (khead + n + ntc) >> 1;
805 			if (e >= n) {
806 				e -= n;
807 			}
808 		}
809 
810 		if (__improbable(e >= n)) {
811 			SK_ERR("This cannot happen");
812 			e = 0;
813 		}
814 	}
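	/*
	 * Illustrative wrap-around case for the midpoint computation above:
	 * with lim == 7 (n == 8), khead == 2 and ntc == 5, the pending slots
	 * are 5, 6, 7, 0, 1 and e == (2 + 8 + 5) >> 1 == 7, i.e. the slot in
	 * the middle of the pending region.
	 */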
815 	m = kring->ckr_tx_pool[e];
816 
817 	for (;;) {
818 		uint32_t p = 0, pn, i, f;
819 		int err;
820 
821 		(void) mbuf_cluster_get_prop(m, &p);
822 		f = NMB_GET_FLAGS(p);
823 		i = NMB_GET_INDEX(p);
824 
825 		if (f & NMB_PROPF_TX_NOTIFY) {
826 			/*
827 			 * This can happen if there is already an event
828 			 * on the ring slot 'e': There is nothing to do.
829 			 */
830 			SK_DF(SK_VERB_NETIF | SK_VERB_NOTIFY | SK_VERB_TX,
831 			    "TX_NOTIFY already set at %u m %p kc %u ntc %u",
832 			    e, SK_KVA(m), khead, ntc);
833 			return;
834 		}
835 
836 		f |= NMB_PROPF_TX_NOTIFY;
837 		pn = NMB_SET_FLAGS(p, f);
838 
839 		err = mbuf_cluster_set_prop(m, p, pn);
840 		if (err != 0) {
841 			if (err == EBUSY) {     /* try again */
842 				continue;
843 			}
844 			/* TODO: [email protected] -- what to do? */
845 			SK_ERR("Failed to set TX_NOTIFY at %u m %p kh %u "
846 			    "ntc %u, err %d", e, SK_KVA(m), khead, ntc, err);
847 		} else {
848 			SK_DF(SK_VERB_NETIF | SK_VERB_NOTIFY | SK_VERB_TX,
849 			    "Request TX_NOTIFY at %u m %p kh %u ntc %u",
850 			    e, SK_KVA(m), khead, ntc);
851 		}
852 		break;
853 	}
854 }
855 
856 #if SK_LOG
857 /* Hoisted out of line to reduce kernel stack footprint */
858 SK_LOG_ATTRIBUTE
859 static void
860 nx_netif_compat_na_txsync_log(struct __kern_channel_ring *kring,
861     struct proc *p, uint32_t flags, slot_idx_t nm_i)
862 {
863 	SK_DF(SK_VERB_NETIF | SK_VERB_SYNC | SK_VERB_TX,
864 	    "%s(%d) kr \"%s\" (%p) krflags 0x%x ring %u flags 0x%x "
865 	    "nm_i %u, kh %u kt %u | rh %u rt %u",
866 	    sk_proc_name(p), sk_proc_pid(p), kring->ckr_name,
867 	    SK_KVA(kring), kring->ckr_flags, kring->ckr_ring_id,
868 	    flags, nm_i, kring->ckr_khead, kring->ckr_ktail,
869 	    kring->ckr_rhead, kring->ckr_rtail);
870 }
871 #endif /* SK_LOG */
872 
873 /*
874  * nx_netif_compat_na_txsync() transforms packets into mbufs and passes
875  * them to the device driver.
876  */
877 static int
878 nx_netif_compat_na_txsync(struct __kern_channel_ring *kring, struct proc *p,
879     uint32_t flags)
880 {
881 #pragma unused(p)
882 	struct nexus_adapter *na = KRNA(kring);
883 	struct netif_stats *nifs = &NX_NETIF_PRIVATE(na->na_nx)->nif_stats;
884 	slot_idx_t nm_i; /* index into the channel ring */        // j
885 	const slot_idx_t head = kring->ckr_rhead;
886 	uint32_t slot_count = 0;
887 	uint32_t byte_count = 0;
888 
889 	STATS_INC(nifs, NETIF_STATS_TX_SYNC);
890 
891 	/* update our work timestamp */
892 	na->na_work_ts = net_uptime();
893 
894 	/*
895 	 * First part: process new packets to send.
896 	 */
897 	nm_i = kring->ckr_khead;
898 	if (nm_i != head) {     /* we have new packets to send */
899 		while (nm_i != head) {
900 			struct __kern_slot_desc *sd = KR_KSD(kring, nm_i);
901 
902 			/* device-specific */
903 			struct mbuf *m;
904 			int tx_ret;
905 			/*
906 			 * Take a mbuf from the tx pool (replenishing the pool
907 			 * entry if necessary) and copy in the user packet.
908 			 */
909 			VERIFY(nm_i <= UINT16_MAX);
910 			m = kring->ckr_tx_pool[nm_i];
911 			if (__improbable(m == NULL)) {
912 				kring->ckr_tx_pool[nm_i] = m =
913 				    nx_netif_compat_ring_alloc(M_WAITOK,
914 				    kring->ckr_max_pkt_len, (uint16_t)nm_i);
915 				if (__improbable(m == NULL)) {
916 					STATS_INC(nifs, NETIF_STATS_DROP);
917 					STATS_INC(nifs,
918 					    NETIF_STATS_DROP_NOMEM_MBUF);
919 					SK_DF(SK_VERB_MEM,
920 					    "%s(%d) kr \"%s\" (%p) "
921 					    "krflags 0x%x ckr_tx_pool[%u] "
922 					    "allocation failed",
923 					    sk_proc_name(p),
924 					    sk_proc_pid(p), kring->ckr_name,
925 					    SK_KVA(kring), kring->ckr_flags,
926 					    nm_i);
927 					/*
928 					 * Here we could schedule a timer
929 					 * which retries to replenish after
930 					 * a while, and notifies the client
931 					 * when it manages to replenish some
932 					 * slot.  In any case we break early
933 					 * to avoid crashes.
934 					 */
935 					break;
936 				}
937 				STATS_INC(nifs, NETIF_STATS_TX_REPL);
938 			}
939 
940 			byte_count += sd->sd_pkt->pkt_length;
941 			slot_count++;
942 
943 			/*
944 			 * We should ask for notifications when CS_REPORT is set,
945 			 * or roughly every half ring.  To optimize this,
946 			 * we set a notification event when the client runs
947 			 * out of TX ring space, or when transmission fails.
948 			 * In the latter case we also break early.
949 			 */
950 			tx_ret = nx_netif_compat_xmit_frame(na, m, sd->sd_pkt);
951 			if (__improbable(tx_ret)) {
952 				SK_RD(5, "start_xmit failed: err %d "
953 				    "[nm_i %u, h %u, kt %u]",
954 				    tx_ret, nm_i, head, kring->ckr_ktail);
955 				/*
956 				 * No room for this mbuf in the device driver.
957 				 * Request a notification FOR A PREVIOUS MBUF,
958 				 * then call nx_netif_compat_tx_clean(kring) to
959 				 * do the double check and see if we can free
960 				 * more buffers.  If there is space continue,
961 				 * else break; NOTE: the double check is
962 				 * necessary if the problem occurs in the
963 				 * txsync call after selrecord().  Also, we
964 				 * need some way to tell the caller that not
965 				 * all buffers were queued onto the device
966 				 * (this was not a problem with native skywalk
967 				 * driver where space is preallocated). The
968 				 * bridge has a similar problem and we solve
969 				 * it there by dropping the excess packets.
970 				 */
971 				nx_netif_compat_set_tx_event(kring, nm_i);
972 				if (nx_netif_compat_tx_clean(nifs, kring)) {
973 					/* space now available */
974 					continue;
975 				} else {
976 					break;
977 				}
978 			}
979 			nm_i = SLOT_NEXT(nm_i, kring->ckr_lim);
980 			STATS_INC(nifs, NETIF_STATS_TX_PACKETS);
981 		}
982 
983 		/*
984 		 * Update khead to the next slot to transmit; here nm_i
985 		 * is not necessarily head, as we may have broken early.
986 		 */
987 		kring->ckr_khead = nm_i;
988 
989 		kr_update_stats(kring, slot_count, byte_count);
990 	}
991 
992 	/*
993 	 * Second, reclaim completed buffers
994 	 */
995 	if ((flags & NA_SYNCF_FORCE_RECLAIM) || kr_txempty(kring)) {
996 		/*
997 		 * No more available slots? Set a notification event on a
998 		 * channel slot that will be cleaned in the future.  No
999 		 * doublecheck is performed, since nx_netif_compat_na_txsync()
1000 		 * will be called twice by ch_event().
1001 		 */
1002 		nx_netif_compat_set_tx_event(kring, nm_i);
1003 	}
1004 	kring->ckr_pending_intr = 0;
1005 
1006 #if SK_LOG
1007 	if (__improbable((sk_verbose & SK_VERB_NETIF) != 0)) {
1008 		nx_netif_compat_na_txsync_log(kring, p, flags, nm_i);
1009 	}
1010 #endif /* SK_LOG */
1011 
1012 	(void) nx_netif_compat_tx_clean(nifs, kring);
1013 
1014 	return 0;
1015 }
1016 
1017 #if SK_LOG
1018 /* Hoisted out of line to reduce kernel stack footprint */
1019 SK_LOG_ATTRIBUTE
1020 static void
1021 nx_netif_compat_receive_log1(const struct __kern_channel_ring *kring,
1022     struct nx_mbq *q)
1023 {
1024 	SK_RD(10, "kr \"%s\" (%p) krflags 0x%x FULL "
1025 	    "(qlen %u qsize %zu), kc %u kt %u", kring->ckr_name,
1026 	    SK_KVA(kring), kring->ckr_flags, nx_mbq_len(q),
1027 	    nx_mbq_size(q), kring->ckr_khead, kring->ckr_ktail);
1028 }
1029 
1030 /* Hoisted out of line to reduce kernel stack footprint */
1031 SK_LOG_ATTRIBUTE
1032 static void
1033 nx_netif_compat_receive_log2(const struct __kern_channel_ring *kring,
1034     struct nx_mbq *q, const struct ifnet_stat_increment_param *s)
1035 {
1036 	SK_RDF(SK_VERB_RX, 10, "kr \"%s\" (%p) krflags 0x%x OK, "
1037 	    "added %u packets %u bytes, now qlen %u qsize %zu",
1038 	    kring->ckr_name, SK_KVA(kring), kring->ckr_flags, s->packets_in,
1039 	    s->bytes_in, nx_mbq_len(q), nx_mbq_size(q));
1040 }
1041 #endif /* SK_LOG */
1042 
1043 /*
1044  * This is the default RX path for the compat netif nexus. Packets
1045  * are enqueued and later extracted by nx_netif_compat_na_rxsync().
1046  */
1047 /* TODO: [email protected] -- implement chaining */
1048 static errno_t
1049 nx_netif_compat_receive(struct ifnet *ifp, struct mbuf *m_head,
1050     struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
1051     boolean_t poll, struct thread *tp)
1052 {
1053 #pragma unused(tp)
1054 	boolean_t ifp_rxpoll = ((ifp->if_eflags & IFEF_RXPOLL) && net_rxpoll);
1055 	struct nexus_adapter *na = &NA(ifp)->nifna_up;
1056 	struct __kern_channel_ring *kring;
1057 	struct netif_stats *nifs;
1058 	uint32_t r, work_done;
1059 	unsigned int qlimit;
1060 	struct nx_mbq *q;
1061 	errno_t err = 0;
1062 
1063 	/* update our work timestamp */
1064 	na->na_work_ts = net_uptime();
1065 
1066 	if (__improbable(m_head == NULL)) {
1067 		ASSERT(m_tail == NULL);
1068 		ASSERT(poll);
1069 		ASSERT(s->bytes_in == 0);
1070 		ASSERT(s->packets_in == 0);
1071 	}
1072 
1073 	/* BEGIN CSTYLED */
1074 	/*
1075 	 * TODO: [email protected] -- this needs to be revisited once we
1076 	 * have a clear definition of how multiple RX rings are mapped
1077 	 * to flows; this would involve the hardware/driver doing some
1078 	 * kind of classification and RSS-like demuxing.
1079 	 *
1080 	 * When we enable that, we'll need to consider sifting through the
1081 	 * mbuf chain we get from the caller, enqueuing the mbufs onto
1082 	 * per-ring temporary mbuf queues (along with marking each ring
1083 	 * that has pending packets).  During second stage processing,
1084 	 * we'll issue nx_netif_mit_rx_intr() on each marked ring to
1085 	 * dispatch the packets upstream.
1086 	 *
1087 	 * r = MBUF_RXQ(m);
1088 	 *
1089 	 * if (r >= na->na_num_rx_rings)
1090 	 *     r = r % na->na_num_rx_rings;
1091 	 *
1092 	 * kring = &na->na_rx_rings[r];
1093 	 * q = &kring->ckr_rx_queue;
1094 	 *
1095 	 * For now, target only the first RX ring (ring 0).
1096 	 */
1097 	/* END CSTYLED */
1098 	r = 0;  /* receive ring number */
1099 	kring = &na->na_rx_rings[r];
1100 
1101 	ASSERT(na->na_type == NA_NETIF_COMPAT_DEV);
1102 	nifs = &NX_NETIF_PRIVATE(na->na_nx)->nif_stats;
1103 
1104 	if (__improbable((!NA_IS_ACTIVE(na)) || KR_DROP(kring))) {
1105 		/* BEGIN CSTYLED */
1106 		/*
1107 		 * If we deal with multiple rings, change above to:
1108 		 *
1109 		 * if (!NA_IS_ACTIVE(na) || r >= na_get_nrings(na, NR_RX)))
1110 		 *
1111 		 * then here do:
1112 		 *
1113 		 * if (r >= na_get_nrings(na, NR_RX)) {
1114 		 *      SK_ERR("na \"%s\" (%p) invalid r %u >= %u",
1115 		 *          na->na_name, SK_KVA(na), r,
1116 		 *          na_get_nrings(na, NR_RX));
1117 		 * }
1118 		 */
1119 		/* END CSTYLED */
1120 		m_freem_list(m_head);
1121 		if (!NA_IS_ACTIVE(na)) {
1122 			STATS_ADD(nifs, NETIF_STATS_DROP_NA_INACTIVE,
1123 			    s->packets_in);
1124 		} else if (KR_DROP(kring)) {
1125 			STATS_ADD(nifs, NETIF_STATS_DROP_KRDROP_MODE,
1126 			    s->packets_in);
1127 		}
1128 		STATS_ADD(nifs, NETIF_STATS_DROP, s->packets_in);
1129 		err = ENXIO;
1130 		goto done;
1131 	}
1132 	if (__improbable(m_head == NULL)) {
1133 		goto send_packets;
1134 	}
1135 
1136 	q = &kring->ckr_rx_queue;
1137 	nx_mbq_lock_spin(q);
1138 	qlimit = nx_mbq_limit(q);
1139 	if (ifp_rxpoll) {
1140 		/*
1141 		 * The qlimit of the receive queue is much smaller when the
1142 		 * interface is in opportunistic polling mode.  While such an
1143 		 * interface is operating in interrupt mode, a sudden burst of
1144 		 * input packets can cause the receive queue to build up quickly
1145 		 * due to the scheduling latency of waking up the poller thread.
1146 		 * To avoid drops due to this latency, we allow a 32x leeway on
1147 		 * the qlimit (e.g. a limit of 128 temporarily becomes 4096).
1148 		 */
1149 		qlimit <<= 5;
1150 	}
1151 	if (__improbable(nx_mbq_len(q) > qlimit)) {
1152 #if SK_LOG
1153 		if (__improbable(sk_verbose != 0)) {
1154 			nx_netif_compat_receive_log1(kring, q);
1155 		}
1156 #endif /* SK_LOG */
1157 		nx_mbq_unlock(q);
1158 		m_freem_list(m_head);
1159 		STATS_ADD(nifs, NETIF_STATS_DROP_RXQ_OVFL, s->packets_in);
1160 		STATS_ADD(nifs, NETIF_STATS_DROP, s->packets_in);
1161 		goto send_packets;
1162 	}
1163 	nx_mbq_enq_multi(q, m_head, m_tail, s->packets_in, s->bytes_in);
1164 
1165 #if SK_LOG
1166 	if (__improbable((sk_verbose & SK_VERB_NETIF) != 0)) {
1167 		nx_netif_compat_receive_log2(kring, q, s);
1168 	}
1169 #endif /* SK_LOG */
1170 
1171 	nx_mbq_unlock(q);
1172 
1173 	(void) ifnet_stat_increment_in(ifp, s->packets_in, s->bytes_in,
1174 	    s->errors_in);
1175 
1176 	if (poll) {
1177 		/* update incremental poll stats */
1178 		PKTCNTR_ADD(&ifp->if_poll_tstats, s->packets_in, s->bytes_in);
1179 	}
1180 
1181 send_packets:
1182 	/*
1183 	 * If the interface supports opportunistic input polling, then
1184 	 * input packet processing is performed in the context of the poller thread.
1185 	 */
1186 	if (!poll && ifp_rxpoll) {
1187 		/* wakeup the poller thread */
1188 		ifnet_poll(ifp);
1189 	} else {
1190 		/*
1191 		 * Wake up the mitigation thread, if needed, to perform input
1192 		 * packet processing.  If the interface supports opportunistic
1193 		 * input polling, the mitigation thread is not created and
1194 		 * input packet processing happens in the context of the
1195 		 * poller thread.
1196 		 */
1197 		err = nx_netif_mit_rx_intr((NAKR(na, NR_RX) + r), kernproc, 0,
1198 		    &work_done);
1199 	}
1200 done:
1201 	return err;
1202 }
1203 
1204 #if SK_LOG
1205 /* Hoisted out of line to reduce kernel stack footprint */
1206 SK_LOG_ATTRIBUTE
1207 static void
1208 nx_netif_compat_na_rxsync_log(const struct __kern_channel_ring *kring,
1209     struct proc *p, uint32_t flags, slot_idx_t nm_i)
1210 {
1211 	SK_DF(SK_VERB_NETIF | SK_VERB_SYNC | SK_VERB_RX,
1212 	    "%s(%d) kr \"%s\" (%p) krflags 0x%x "
1213 	    "ring %u flags 0x%x nm_i %u kt %u", sk_proc_name(p),
1214 	    sk_proc_pid(p), kring->ckr_name, SK_KVA(kring), kring->ckr_flags,
1215 	    kring->ckr_ring_id, flags, nm_i, kring->ckr_ktail);
1216 }
1217 #endif /* SK_LOG */
1218 
1219 #if DEBUG || DEVELOPMENT
1220 /*
1221  * Split an mbuf chain at offset "split", such that the first mbuf
1222  * is a zero-length M_PKTHDR, followed by the rest of the mbufs.
1223  * Typically, the "split" value is equal to the size of the link
1224  * layer header, e.g. Ethernet header.
1225  */
1226 static struct mbuf *
1227 nx_netif_rx_split(struct mbuf *m0, uint32_t split)
1228 {
1229 	struct mbuf *m = m0;
1230 
1231 	if (split == 0) {
1232 		split = MHLEN;
1233 		M_PREPEND(m, split, M_DONTWAIT, 0);
1234 	} else {
1235 		m->m_data -= split;
1236 		m->m_len += split;
1237 		m_pktlen(m) += split;
1238 
1239 		ASSERT((uintptr_t)m->m_data >= (uintptr_t)mbuf_datastart(m));
1240 		ASSERT((uintptr_t)m->m_data < ((uintptr_t)mbuf_datastart(m) +
1241 		    mbuf_maxlen(m)));
1242 	}
1243 	if (m != NULL) {
1244 		struct mbuf *n = m_split(m, split, M_DONTWAIT);
1245 		if (n == NULL) {
1246 			m_freem(m);
1247 			return NULL;
1248 		}
1249 		m0 = m;
1250 		ASSERT((uint32_t)m->m_len == split);
1251 		m->m_data += split;
1252 		m->m_len -= split;
1253 		while (m->m_next != NULL) {
1254 			m = m->m_next;
1255 		}
1256 		m->m_next = n;
1257 		m = m0;
1258 		m_pktlen(m) = m_length2(m, NULL);
1259 	}
1260 
1261 	return m;
1262 }
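/*
 * Illustrative effect of the debug-only split above: for an Ethernet frame
 * with split == 14, the returned chain starts with a zero-length M_PKTHDR
 * mbuf (its 14 header bytes sit just before m_data), followed by the
 * payload in a separate chain, so the mbuf handed back to
 * nx_netif_compat_na_rxsync() spans more than one mbuf.
 */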
1263 #endif /* DEBUG || DEVELOPMENT */
1264 
1265 /*
1266  * nx_netif_compat_na_rxsync() extracts mbufs from the queue filled by
1267  * nx_netif_compat_receive() and puts their content in the channel
1268  * receive ring.
1269  *
1270  * Accesses to kring are serialized via kring->ckr_rx_queue lock, because
1271  * the rx handler is asynchronous.
1272  */
1273 static int
1274 nx_netif_compat_na_rxsync(struct __kern_channel_ring *kring, struct proc *p,
1275     uint32_t flags)
1276 {
1277 #pragma unused(p)
1278 	struct nexus_adapter *na = KRNA(kring);
1279 	struct nexus_netif_adapter *nifna = NIFNA(na);
1280 	struct nx_netif *nif = nifna->nifna_netif;
1281 	slot_idx_t nm_i;        /* index into the channel ring */
1282 	struct netif_stats *nifs = &NX_NETIF_PRIVATE(na->na_nx)->nif_stats;
1283 	uint32_t npkts = 0;
1284 	uint32_t byte_count = 0;
1285 	const slot_idx_t lim = kring->ckr_lim;
1286 	const slot_idx_t head = kring->ckr_rhead;
1287 	boolean_t force_update = ((flags & NA_SYNCF_FORCE_READ) ||
1288 	    kring->ckr_pending_intr != 0);
1289 	struct mbuf *m;
1290 	uint32_t n;
1291 	uint32_t avail; /* in slots */
1292 	int err, mlen;
1293 	boolean_t attach_mbuf = FALSE;
1294 	struct nx_mbq *q, tmpq;
1295 	struct kern_pbufpool *pp = kring->ckr_pp;
1296 	uint32_t ph_cnt, i = 0;
1297 
1298 	ASSERT(pp->pp_max_frags == 1);
1299 	ASSERT(head <= lim);
1300 
1301 	/*
1302 	 * First part: skip past packets that userspace has released.
1303 	 * This can possibly make room for the second part.
1304 	 * This is equivalent to kr_reclaim().
1305 	 */
1306 	if (kring->ckr_khead != head) {
1307 		kring->ckr_khead = head;
1308 		/* ensure global visibility */
1309 		os_atomic_thread_fence(seq_cst);
1310 	}
1311 
1312 	STATS_INC(nifs, NETIF_STATS_RX_SYNC);
1313 
1314 	/*
1315 	 * Second part: import newly received packets.
1316 	 */
1317 	if (!force_update) {
1318 		return 0;
1319 	}
1320 
1321 	/* update our work timestamp */
1322 	na->na_work_ts = net_uptime();
1323 
1324 	/* first empty slot in the receive ring */
1325 	nm_i = kring->ckr_ktail;
1326 
1327 	/*
1328 	 * Compute the available space (in slots) in this ring.
1329 	 * The first slot that is not considered is the one
1330 	 * before ckr_khead.
1331 	 */
1332 	avail = kr_available_slots_rxring(kring);
1333 	if (__improbable(avail == 0)) {
1334 		return 0;
1335 	}
1336 
1337 	if (NA_KERNEL_ONLY(na)) {
1338 		ASSERT(na->na_ifp != NULL &&
1339 		    fsw_ifp_to_fsw(na->na_ifp) != NULL);
1340 		/*
1341 		 * We are not supporting attachment to bridge flowswitch
1342 		 * for now, until we support PKT_F_MBUF_DATA packets
1343 		 * in bridge flowswitch.
1344 		 */
1345 		attach_mbuf = TRUE;
1346 	}
1347 
1348 	/*
1349 	 * Quickly move all of ckr_rx_queue to a temporary queue to dequeue
1350 	 * from.  For each mbuf, attach or copy it to the packet attached
1351 	 * to the slot.  Release the lock while we're doing that, to allow
1352 	 * for the input thread to enqueue.
1353 	 */
1354 	q = &kring->ckr_rx_queue;
1355 	nx_mbq_init(&tmpq, NX_MBQ_NO_LIMIT);
1356 	nx_mbq_lock_spin(q);
1357 	nx_mbq_concat(&tmpq, q);
1358 	nx_mbq_unlock(q);
1359 
1360 	if (__improbable(nx_mbq_len(&tmpq) == 0)) {
1361 		return 0;
1362 	}
1363 
1364 	ph_cnt = MIN(avail, nx_mbq_len(&tmpq));
1365 	err = kern_pbufpool_alloc_batch_nosleep(pp, 1, kring->ckr_scratch,
1366 	    &ph_cnt);
1367 	if (err == ENOMEM) {
1368 		SK_DF(SK_VERB_MEM, "%s(%d) failed to alloc %d pkts for kr %p",
1369 		    sk_proc_name(p), sk_proc_pid(p), ph_cnt,
1370 		    SK_KVA(kring));
1371 		goto done;
1372 	}
1373 	ASSERT(ph_cnt != 0);
1374 
1375 	for (n = 0; (n < ph_cnt) &&
1376 	    ((m = nx_mbq_deq(&tmpq)) != NULL); n++) {
1377 		struct __kern_slot_desc *ksd = KR_KSD(kring, nm_i);
1378 		struct __kern_packet *pkt;
1379 		kern_packet_t ph;
1380 		uint8_t hlen;
1381 		uint16_t tag;
1382 		char *__single h;
1383 
1384 		ASSERT(m->m_flags & M_PKTHDR);
1385 		mlen = m_pktlen(m);
1386 		h = m->m_pkthdr.pkt_hdr;
1387 		if (__improbable(mlen == 0 || h == NULL ||
1388 		    h < (char *)mbuf_datastart(m) || h > (char *)m->m_data)) {
1389 			STATS_INC(nifs, NETIF_STATS_DROP_BADLEN);
1390 			SK_RD(5, "kr \"%s\" (%p) m %p len %d "
1391 			    "bad pkt_hdr", kring->ckr_name,
1392 			    SK_KVA(kring), SK_KVA(m), mlen);
1393 			m_freem(m);
1394 			m = NULL;
1395 			continue;
1396 		}
1397 
1398 		hlen = (uint8_t)(m->m_data - (uintptr_t)h);
1399 		mlen += hlen;
1400 
1401 #if DEBUG || DEVELOPMENT
1402 		if (__improbable(netif_rx_split != 0)) {
1403 			/* callee frees mbuf upon failure */
1404 			if ((m = nx_netif_rx_split(m, hlen)) == NULL) {
1405 				continue;
1406 			}
1407 
1408 			ASSERT((uintptr_t)m->m_data >=
1409 			    (uintptr_t)mbuf_datastart(m));
1410 			ASSERT((uintptr_t)m->m_data <
1411 			    ((uintptr_t)mbuf_datastart(m) +
1412 			    mbuf_maxlen(m)));
1413 		}
1414 #endif /* DEBUG || DEVELOPMENT */
1415 
1416 		ph = kring->ckr_scratch[i];
1417 		ASSERT(ph != 0);
1418 		kring->ckr_scratch[i] = 0;
1419 		pkt = SK_PTR_ADDR_KPKT(ph);
1420 		++i;
1421 
1422 		/*
1423 		 * Wind back the data pointer to include any frame headers
1424 		 * as part of the copy below.  The header length is then
1425 		 * stored in the corresponding metadata area of the buffer.
1426 		 */
1427 		m->m_data -= hlen;
1428 		m->m_len += hlen;
1429 		m->m_pkthdr.len += hlen;
1430 		ASSERT(mlen == m->m_pkthdr.len);
1431 
1432 		pkt->pkt_link_flags = 0;
1433 		if (m->m_flags & M_HASFCS) {
1434 			pkt->pkt_link_flags |= PKT_LINKF_ETHFCS;
1435 		}
1436 		if (mbuf_get_vlan_tag(m, &tag) == 0) {
1437 			(void) kern_packet_set_vlan_tag(SK_PKT2PH(pkt), tag);
1438 		}
1439 		SK_DF(SK_VERB_NETIF | SK_VERB_SYNC | SK_VERB_RX,
1440 		    "kr \"%s\" (%p) m %p idx %u slot_len %d",
1441 		    kring->ckr_name, SK_KVA(kring), SK_KVA(m), nm_i, mlen);
1442 
1443 		if (__probable(attach_mbuf)) {
1444 			STATS_INC(nifs, NETIF_STATS_RX_COPY_ATTACH);
1445 			err = __packet_initialize_with_mbuf(pkt, m, 0, hlen);
1446 			VERIFY(err == 0);
1447 		} else if (__probable(mlen <= (int)PP_BUF_SIZE_DEF(pp))) {
1448 			STATS_INC(nifs, NETIF_STATS_RX_COPY_DIRECT);
1449 			/*
1450 			 * We're sending this up to a user channel opened
1451 			 * directly to the netif; copy everything.
1452 			 */
1453 			err = __packet_set_headroom(ph, 0);
1454 			VERIFY(err == 0);
1455 			err = __packet_set_link_header_length(ph, hlen);
1456 			VERIFY(err == 0);
1457 			nif->nif_pkt_copy_from_mbuf(NR_RX, ph, 0, m, 0,
1458 			    mlen, FALSE, 0);
1459 			/* finalize and attach the packet */
1460 			err = __packet_finalize(ph);
1461 			VERIFY(err == 0);
1462 			m_freem(m);
1463 			m = NULL;
1464 		} else {
1465 			STATS_INC(nifs, NETIF_STATS_DROP_BADLEN);
1466 			STATS_INC(nifs, NETIF_STATS_DROP);
1467 			m_freem(m);
1468 			m = NULL;
1469 			kern_pbufpool_free(pp, ph);
1470 			ph = 0;
1471 			pkt = NULL;
1472 			continue;
1473 		}
1474 
1475 		err = KR_SLOT_ATTACH_METADATA(kring, ksd,
1476 		    (struct __kern_quantum *)pkt);
1477 		ASSERT(err == 0);
1478 
1479 		byte_count += mlen;
1480 		++npkts;
1481 		ASSERT(npkts < kring->ckr_num_slots);
1482 		nm_i = SLOT_NEXT(nm_i, lim);
1483 	}
1484 
1485 	if (__improbable(i < ph_cnt)) {
1486 		kern_pbufpool_free_batch(pp, &kring->ckr_scratch[i],
1487 		    (ph_cnt - i));
1488 	}
1489 
1490 	ASSERT(npkts <= ph_cnt);
1491 	kr_update_stats(kring, npkts, byte_count);
1492 
1493 	if (npkts != 0) {
1494 		kring->ckr_ktail = nm_i;
1495 		STATS_ADD(nifs, NETIF_STATS_RX_PACKETS, npkts);
1496 	}
1497 	kring->ckr_pending_intr = 0;
1498 
1499 #if SK_LOG
1500 	if (__improbable((sk_verbose & SK_VERB_NETIF) != 0)) {
1501 		nx_netif_compat_na_rxsync_log(kring, p, flags, nm_i);
1502 	}
1503 #endif /* SK_LOG */
1504 
1505 done:
1506 	/*
1507 	 * If we didn't process all packets in temporary queue,
1508 	 * move them back to the head of ckr_rx_queue.
1509 	 */
1510 	if (!nx_mbq_empty(&tmpq)) {
1511 		nx_mbq_lock_spin(q);
1512 		nx_mbq_concat(&tmpq, q);
1513 		ASSERT(nx_mbq_empty(q));
1514 		nx_mbq_concat(q, &tmpq);
1515 		nx_mbq_unlock(q);
1516 	}
1517 	ASSERT(nx_mbq_empty(&tmpq));
1518 
1519 	return 0;
1520 }
1521 
1522 static void
1523 nx_netif_compat_na_dtor(struct nexus_adapter *na)
1524 {
1525 	struct ifnet *__single ifp;
1526 	struct nexus_netif_compat_adapter *nca =
1527 	    (struct nexus_netif_compat_adapter *)na;
1528 
1529 	SK_LOCK_ASSERT_HELD();
1530 
1531 	SK_DF(SK_VERB_NETIF, "na \"%s\" (%p)", na->na_name, SK_KVA(na));
1532 
1533 	/*
1534 	 * If the finalizer callback hasn't been called for whatever
1535 	 * reason, pick up the embryonic ifnet stored in na_private.
1536 	 * Otherwise, release the I/O refcnt of a non-NULL na_ifp.
1537 	 */
1538 	if ((ifp = na->na_ifp) == NULL) {
1539 		ifp = na->na_private;
1540 		na->na_private = NULL;
1541 	} else {
1542 		ifnet_decr_iorefcnt(ifp);
1543 		na->na_ifp = NULL;
1544 	}
1545 
1546 	if (nca->nca_up.nifna_netif != NULL) {
1547 		nx_netif_release(nca->nca_up.nifna_netif);
1548 		nca->nca_up.nifna_netif = NULL;
1549 	}
1550 	ASSERT(!SKYWALK_NATIVE(ifp));
1551 }
1552 
1553 /*
1554  * nx_netif_compat_attach() makes it possible to use skywalk on
1555  * a device without native skywalk support.
1556  * This is less performant than native support but potentially
1557  * faster than raw sockets or similar schemes.
1558  */
1559 int
1560 nx_netif_compat_attach(struct kern_nexus *nx, struct ifnet *ifp)
1561 {
1562 	struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
1563 	struct nxprov_params *nxp = NX_PROV(nx)->nxprov_params;
1564 	struct nexus_netif_compat_adapter *devnca = NULL;
1565 	struct nexus_netif_compat_adapter *hostnca = NULL;
1566 	struct nexus_adapter *__single devna = NULL;
1567 	struct nexus_adapter *__single hostna = NULL;
1568 	boolean_t embryonic = FALSE;
1569 	uint32_t tx_rings, tx_slots;
1570 	int retval = 0;
1571 
1572 	SK_LOCK_ASSERT_HELD();
1573 	ASSERT(!SKYWALK_NATIVE(ifp));
1574 	ASSERT(!SKYWALK_CAPABLE(ifp));
1575 	ASSERT(ifp->if_na == NULL);
1576 	ASSERT(ifp->if_na_ops == NULL);
1577 
1578 	devnca = na_netif_compat_alloc(Z_WAITOK);
1579 	hostnca = na_netif_compat_alloc(Z_WAITOK);
1580 
1581 	/*
1582 	 * We can be called for two different interface states:
1583 	 *
1584 	 * Fully attached: get an io ref count; upon success, this
1585 	 * holds a reference to the ifnet for the ifp pointer stored
1586 	 * in 'na_ifp' down below for both adapters.
1587 	 *
1588 	 * Embryonic: temporarily hold the ifnet in na_private, which
1589 	 * upon a successful ifnet_attach(), will be moved over to
1590 	 * the 'na_ifp' with an io ref count held.
1591 	 *
1592 	 * The ifnet in 'na_ifp' will be released by na_release_locked().
1593 	 */
1594 	if (!ifnet_get_ioref(ifp)) {
1595 		if (!(ifp->if_refflags & IFRF_EMBRYONIC)) {
1596 			ifp = NULL;
1597 			retval = ENXIO;
1598 			goto err;
1599 		}
1600 		embryonic = TRUE;
1601 	}
1602 
1603 	/* initialize the (compat) device netif adapter */
1604 	devnca->nca_up.nifna_netif = nif;
1605 	nx_netif_retain(nif);
1606 	devna = &devnca->nca_up.nifna_up;
1607 	strlcpy(devna->na_name, ifp->if_xname, sizeof(devna->na_name));
1608 	uuid_generate_random(devna->na_uuid);
1609 	if (embryonic) {
1610 		/*
1611 		 * We will move this over to na_ifp once
1612 		 * the interface is fully attached.
1613 		 */
1614 		devna->na_private = ifp;
1615 		ASSERT(devna->na_ifp == NULL);
1616 	} else {
1617 		ASSERT(devna->na_private == NULL);
1618 		/* use I/O refcnt from ifnet_get_ioref() */
1619 		devna->na_ifp = ifp;
1620 	}
1621 
1622 	devna->na_type = NA_NETIF_COMPAT_DEV;
1623 	devna->na_free = na_netif_compat_free;
1624 	devna->na_activate = nx_netif_compat_na_activate;
1625 	devna->na_txsync = nx_netif_compat_na_txsync;
1626 	devna->na_rxsync = nx_netif_compat_na_rxsync;
1627 	devna->na_dtor = nx_netif_compat_na_dtor;
1628 	devna->na_krings_create = nx_netif_dev_krings_create;
1629 	devna->na_krings_delete = nx_netif_dev_krings_delete;
1630 	devna->na_special = nx_netif_na_special;
1631 
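	/*
	 * Cast through uintptr_t to initialize what is presumably a
	 * const-qualified field (same idiom as na_flowadv_max below).
	 */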
1632 	*(nexus_stats_type_t *)(uintptr_t)&devna->na_stats_type =
1633 	    NEXUS_STATS_TYPE_INVALID;
1634 
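	/*
	 * Interfaces not on the direct-access allow list get zero device
	 * TX rings/slots; their transmit traffic presumably stays on the
	 * regular host/compat path.
	 */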
1635 	if (skywalk_netif_direct_allowed(ifp->if_xname)) {
1636 		tx_rings = nxp->nxp_tx_rings;
1637 		tx_slots = nxp->nxp_tx_slots;
1638 	} else {
1639 		tx_rings = 0;
1640 		tx_slots = 0;
1641 	}
1642 	na_set_nrings(devna, NR_TX, tx_rings);
1643 	na_set_nrings(devna, NR_RX, nxp->nxp_rx_rings);
1644 	na_set_nslots(devna, NR_TX, tx_slots);
1645 	na_set_nslots(devna, NR_RX, nxp->nxp_rx_slots);
1646 	/*
1647 	 * Verify upper bounds; the parameters must have already been
1648 	 * validated by nxdom_prov_params() by the time we get here.
1649 	 */
1650 	ASSERT(na_get_nrings(devna, NR_TX) <= NX_DOM(nx)->nxdom_tx_rings.nb_max);
1651 	ASSERT(na_get_nrings(devna, NR_RX) <= NX_DOM(nx)->nxdom_rx_rings.nb_max);
1652 	ASSERT(na_get_nslots(devna, NR_TX) <= NX_DOM(nx)->nxdom_tx_slots.nb_max);
1653 	ASSERT(na_get_nslots(devna, NR_RX) <= NX_DOM(nx)->nxdom_rx_slots.nb_max);
1654 
1655 	na_attach_common(devna, nx, &nx_netif_compat_prov_s);
1656 
1657 	if ((retval = NX_DOM_PROV(nx)->nxdom_prov_mem_new(NX_DOM_PROV(nx),
1658 	    nx, devna)) != 0) {
1659 		ASSERT(devna->na_arena == NULL);
1660 		/* we've transferred the refcnt to na_ifp above */
1661 		ifp = NULL;
1662 		goto err;
1663 	}
1664 	ASSERT(devna->na_arena != NULL);
1665 
1666 	*(uint32_t *)(uintptr_t)&devna->na_flowadv_max = nxp->nxp_flowadv_max;
1667 	ASSERT(devna->na_flowadv_max == 0 ||
1668 	    skmem_arena_nexus(devna->na_arena)->arn_flowadv_obj != NULL);
1669 
1670 	/* setup packet copy routines */
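	/*
	 * Multi-buflet copy routines are needed only when the RX packet
	 * pool allows more than one buflet per packet.
	 */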
1671 	if (skmem_arena_nexus(devna->na_arena)->arn_rx_pp->pp_max_frags > 1) {
1672 		nif->nif_pkt_copy_from_mbuf =
1673 		    pkt_copy_multi_buflet_from_mbuf;
1674 		nif->nif_pkt_copy_to_mbuf =
1675 		    pkt_copy_multi_buflet_to_mbuf;
1676 	} else {
1677 		nif->nif_pkt_copy_from_mbuf = pkt_copy_from_mbuf;
1678 		nif->nif_pkt_copy_to_mbuf = pkt_copy_to_mbuf;
1679 	}
1680 
1681 	/* initialize the host netif adapter */
1682 	hostnca->nca_up.nifna_netif = nif;
1683 	nx_netif_retain(nif);
1684 	hostna = &hostnca->nca_up.nifna_up;
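	/* the host adapter reuses the device name with a "^" suffix */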
1685 	(void) snprintf(hostna->na_name, sizeof(hostna->na_name),
1686 	    "%s^", devna->na_name);
1687 	uuid_generate_random(hostna->na_uuid);
1688 	if (embryonic) {
1689 		/*
1690 		 * We will move this over to na_ifp once
1691 		 * the interface is fully attached.
1692 		 */
1693 		hostna->na_private = ifp;
1694 		ASSERT(hostna->na_ifp == NULL);
1695 	} else {
1696 		ASSERT(hostna->na_private == NULL);
1697 		hostna->na_ifp = devna->na_ifp;
1698 		ifnet_incr_iorefcnt(hostna->na_ifp);
1699 	}
1700 	hostna->na_type = NA_NETIF_COMPAT_HOST;
1701 	hostna->na_free = na_netif_compat_free;
1702 	hostna->na_activate = nx_netif_host_na_activate;
1703 	hostna->na_txsync = nx_netif_host_na_txsync;
1704 	hostna->na_rxsync = nx_netif_host_na_rxsync;
1705 	hostna->na_dtor = nx_netif_compat_na_dtor;
1706 	hostna->na_krings_create = nx_netif_host_krings_create;
1707 	hostna->na_krings_delete = nx_netif_host_krings_delete;
1708 	hostna->na_special = nx_netif_host_na_special;
1709 
1710 	os_atomic_or(&hostna->na_flags, NAF_HOST_ONLY, relaxed);
1711 	*(nexus_stats_type_t *)(uintptr_t)&hostna->na_stats_type =
1712 	    NEXUS_STATS_TYPE_INVALID;
1713 
1714 	na_set_nrings(hostna, NR_TX, 1);
1715 	na_set_nrings(hostna, NR_RX, 0);
1716 	na_set_nslots(hostna, NR_TX, nxp->nxp_tx_slots);
1717 	na_set_nslots(hostna, NR_RX, 0);
1718 
1719 	na_attach_common(hostna, nx, &nx_netif_prov_s);
1720 
1721 	if ((retval = NX_DOM_PROV(nx)->nxdom_prov_mem_new(NX_DOM_PROV(nx),
1722 	    nx, hostna)) != 0) {
1723 		ASSERT(hostna->na_arena == NULL);
1724 		/* we've transferred the refcnt to na_ifp above */
1725 		ifp = NULL;
1726 		goto err;
1727 	}
1728 	ASSERT(hostna->na_arena != NULL);
1729 
1730 	*(uint32_t *)(uintptr_t)&hostna->na_flowadv_max = nxp->nxp_flowadv_max;
1731 	ASSERT(hostna->na_flowadv_max == 0 ||
1732 	    skmem_arena_nexus(hostna->na_arena)->arn_flowadv_obj != NULL);
1733 
1734 	/* these will be undone by the destructor */
1735 	ifp->if_na_ops = &na_netif_compat_ops;
1736 	ifp->if_na = &devnca->nca_up;
1737 	na_retain_locked(devna);
1738 	na_retain_locked(hostna);
1739 
1740 	SKYWALK_SET_CAPABLE(ifp);
1741 
1742 	NETIF_WLOCK(nif);
1743 	nif->nif_ifp = ifp;
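	/* bind the device and host adapters to their reserved nexus ports */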
1744 	retval = nx_port_alloc(nx, NEXUS_PORT_NET_IF_DEV, NULL, &devna, kernproc);
1745 	ASSERT(retval == 0);
1746 	retval = nx_port_alloc(nx, NEXUS_PORT_NET_IF_HOST, NULL, &hostna, kernproc);
1747 	ASSERT(retval == 0);
1748 	NETIF_WUNLOCK(nif);
1749 
1750 #if SK_LOG
1751 	uuid_string_t uuidstr;
1752 	SK_DF(SK_VERB_NETIF, "na_name: \"%s\"", devna->na_name);
1753 	SK_DF(SK_VERB_NETIF, "  UUID:        %s",
1754 	    sk_uuid_unparse(devna->na_uuid, uuidstr));
1755 	SK_DF(SK_VERB_NETIF, "  nx:          %p (\"%s\":\"%s\")",
1756 	    SK_KVA(devna->na_nx), NX_DOM(devna->na_nx)->nxdom_name,
1757 	    NX_DOM_PROV(devna->na_nx)->nxdom_prov_name);
1758 	SK_DF(SK_VERB_NETIF, "  flags:       0x%x", devna->na_flags);
1759 	SK_DF(SK_VERB_NETIF, "  flowadv_max: %u", devna->na_flowadv_max);
1760 	SK_DF(SK_VERB_NETIF, "  rings:       tx %u rx %u",
1761 	    na_get_nrings(devna, NR_TX), na_get_nrings(devna, NR_RX));
1762 	SK_DF(SK_VERB_NETIF, "  slots:       tx %u rx %u",
1763 	    na_get_nslots(devna, NR_TX), na_get_nslots(devna, NR_RX));
1764 #if CONFIG_NEXUS_USER_PIPE
1765 	SK_DF(SK_VERB_NETIF, "  next_pipe:   %u", devna->na_next_pipe);
1766 	SK_DF(SK_VERB_NETIF, "  max_pipes:   %u", devna->na_max_pipes);
1767 #endif /* CONFIG_NEXUS_USER_PIPE */
1768 	SK_DF(SK_VERB_NETIF, "  ifp:         %p %s [ioref %u]",
1769 	    SK_KVA(ifp), ifp->if_xname, os_ref_get_count(&ifp->if_refio));
1770 	SK_DF(SK_VERB_NETIF, "hostna: \"%s\"", hostna->na_name);
1771 	SK_DF(SK_VERB_NETIF, "  UUID:        %s",
1772 	    sk_uuid_unparse(hostna->na_uuid, uuidstr));
1773 	SK_DF(SK_VERB_NETIF, "  nx:          %p (\"%s\":\"%s\")",
1774 	    SK_KVA(hostna->na_nx), NX_DOM(hostna->na_nx)->nxdom_name,
1775 	    NX_DOM_PROV(hostna->na_nx)->nxdom_prov_name);
1776 	SK_DF(SK_VERB_NETIF, "  flags:       0x%x", hostna->na_flags);
1777 	SK_DF(SK_VERB_NETIF, "  flowadv_max: %u", hostna->na_flowadv_max);
1778 	SK_DF(SK_VERB_NETIF, "  rings:       tx %u rx %u",
1779 	    na_get_nrings(hostna, NR_TX), na_get_nrings(hostna, NR_RX));
1780 	SK_DF(SK_VERB_NETIF, "  slots:       tx %u rx %u",
1781 	    na_get_nslots(hostna, NR_TX), na_get_nslots(hostna, NR_RX));
1782 #if CONFIG_NEXUS_USER_PIPE
1783 	SK_DF(SK_VERB_NETIF, "  next_pipe:   %u", hostna->na_next_pipe);
1784 	SK_DF(SK_VERB_NETIF, "  max_pipes:   %u", hostna->na_max_pipes);
1785 #endif /* CONFIG_NEXUS_USER_PIPE */
1786 	SK_DF(SK_VERB_NETIF, "  ifp:         %p %s [ioref %u]", SK_KVA(ifp),
1787 	    ifp->if_xname, os_ref_get_count(&ifp->if_refio));
1788 #endif /* SK_LOG */
1789 
1790 err:
1791 	if (retval != 0) {
1792 		ASSERT(ifp == NULL);
1793 		if (devna != NULL) {
1794 			if (devna->na_arena != NULL) {
1795 				skmem_arena_release(devna->na_arena);
1796 				devna->na_arena = NULL;
1797 			}
1798 			if (devna->na_ifp != NULL) {
1799 				ifnet_decr_iorefcnt(devna->na_ifp);
1800 				devna->na_ifp = NULL;
1801 			}
1802 			devna->na_private = NULL;
1803 		}
1804 		if (hostna != NULL) {
1805 			if (hostna->na_arena != NULL) {
1806 				skmem_arena_release(hostna->na_arena);
1807 				hostna->na_arena = NULL;
1808 			}
1809 			if (hostna->na_ifp != NULL) {
1810 				ifnet_decr_iorefcnt(hostna->na_ifp);
1811 				hostna->na_ifp = NULL;
1812 			}
1813 			hostna->na_private = NULL;
1814 		}
1815 		if (devnca != NULL) {
1816 			if (devnca->nca_up.nifna_netif != NULL) {
1817 				nx_netif_release(devnca->nca_up.nifna_netif);
1818 				devnca->nca_up.nifna_netif = NULL;
1819 			}
1820 			na_netif_compat_free((struct nexus_adapter *)devnca);
1821 		}
1822 		if (hostnca != NULL) {
1823 			if (hostnca->nca_up.nifna_netif != NULL) {
1824 				nx_netif_release(hostnca->nca_up.nifna_netif);
1825 				hostnca->nca_up.nifna_netif = NULL;
1826 			}
1827 			na_netif_compat_free((struct nexus_adapter *)hostnca);
1828 		}
1829 	}
1830 	return retval;
1831 }
1832 
1833 static void
1834 na_netif_compat_finalize(struct nexus_netif_adapter *nifna, struct ifnet *ifp)
1835 {
1836 	na_netif_finalize(nifna, ifp);
1837 }
1838 
1839 /*
1840  * Intercept the rx routine in the standard device driver.
1841  * Second argument is non-zero to intercept, 0 to restore the original handler.
1842  */
1843 static int
1844 nx_netif_compat_catch_rx(struct nexus_netif_compat_adapter *nca,
1845     boolean_t enable)
1846 {
1847 	struct ifnet *ifp = nca->nca_up.nifna_up.na_ifp;
1848 	int err = 0;
1849 
1850 	ASSERT(!(nca->nca_up.nifna_up.na_flags & NAF_HOST_ONLY));
1851 
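	/*
	 * Point the interface's DLIL input path at
	 * nx_netif_compat_receive(); disabling restores the default
	 * input handler.
	 */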
1852 	if (enable) {
1853 		err = dlil_set_input_handler(ifp, nx_netif_compat_receive);
1854 	} else {
1855 		dlil_reset_input_handler(ifp);
1856 	}
1857 	return err;
1858 }
1859 
1860 /*
1861  * Transmit routine used by nx_netif_compat_na_txsync(). Returns 0 on success
1862  * and non-zero on error (which may be packet drops or other errors).
1863  * pkt identifies the channel packet to transmit, m is the (preallocated)
1864  * mbuf to use for the transmission.
1865  *
1866  * We should add a reference to the mbuf so the m_freem() at the end
1867  * of the transmission does not consume resources.
1868  *
1869  * On FreeBSD, and on multiqueue cards, we can force the queue using
1870  *      if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE)
1871  *              i = m->m_pkthdr.flowid % adapter->num_queues;
1872  *      else
1873  *              i = curcpu % adapter->num_queues;
1874  *
1875  */
1876 static int
1877 nx_netif_compat_xmit_frame(struct nexus_adapter *na, struct mbuf *m,
1878     struct __kern_packet *pkt)
1879 {
1880 	struct nexus_netif_adapter *nifna = (struct nexus_netif_adapter *)na;
1881 	struct nx_netif *nif = nifna->nifna_netif;
1882 	struct netif_stats *nifs = &NX_NETIF_PRIVATE(na->na_nx)->nif_stats;
1883 	struct ifnet *ifp = na->na_ifp;
1884 	kern_packet_t ph = SK_PTR_ENCODE(pkt, METADATA_TYPE(pkt),
1885 	    METADATA_SUBTYPE(pkt));
1886 	uint32_t len;
1887 	int ret = 0;
1888 
1889 	if ((ret = mbuf_ring_cluster_activate(m)) != 0) {
1890 		panic("Failed to activate mbuf ring cluster %p (%d)",
1891 		    SK_KVA(m), ret);
1892 		/* NOTREACHED */
1893 		__builtin_unreachable();
1894 	}
1895 
1896 	len = pkt->pkt_length;
1897 
1898 	/*
1899 	 * The mbuf should be a cluster from our special pool,
1900 	 * so we do not need to do an m_copyback but just copy.
1901 	 */
1902 	if (m->m_ext.ext_size < len) {
1903 		SK_RD(5, "size %u < len %u", m->m_ext.ext_size, len);
1904 		len = m->m_ext.ext_size;
1905 	}
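	/*
	 * If the cluster is smaller than the packet, the copy below is
	 * silently truncated to ext_size.
	 */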
1906 
1907 	STATS_INC(nifs, NETIF_STATS_TX_COPY_MBUF);
1908 	if (PACKET_HAS_PARTIAL_CHECKSUM(pkt)) {
1909 		STATS_INC(nifs, NETIF_STATS_TX_COPY_SUM);
1910 	}
1911 
1912 	nif->nif_pkt_copy_to_mbuf(NR_TX, ph, pkt->pkt_headroom, m, 0, len,
1913 	    PACKET_HAS_PARTIAL_CHECKSUM(pkt), pkt->pkt_csum_tx_start_off);
1914 
1915 	/* used for tx notification */
1916 	ret = mbuf_set_tx_compl_data(m, (uintptr_t)ifp, (uintptr_t)NULL);
1917 	ASSERT(ret == 0);
1918 
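	/* hand the copied mbuf to the driver via the DLIL output path */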
1919 	ret = dlil_output_handler(ifp, m);
1920 	return ret;
1921 }
1922