/*
 * Copyright (c) 2015-2022 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/*
 * Copyright (C) 2013-2014 Università di Pisa. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <skywalk/os_skywalk_private.h>
#include <skywalk/nexus/netif/nx_netif.h>
#include <skywalk/nexus/flowswitch/nx_flowswitch.h>
#include <mach/thread_act.h>
#include <kern/thread.h>
#include <kern/sched_prim.h>

static void na_netif_compat_finalize(struct nexus_netif_adapter *,
    struct ifnet *);
static errno_t nx_netif_compat_receive(struct ifnet *ifp, struct mbuf *m_head,
    struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
    boolean_t poll, struct thread *tp);
static int nx_netif_compat_catch_rx(struct nexus_netif_compat_adapter *na,
    boolean_t enable);
static int nx_netif_compat_xmit_frame(struct nexus_adapter *, struct mbuf *,
    struct __kern_packet *);

static int nx_netif_compat_na_notify_tx(struct __kern_channel_ring *,
    struct proc *, uint32_t);
static int nx_netif_compat_na_notify_rx(struct __kern_channel_ring *,
    struct proc *, uint32_t);
static int nx_netif_compat_na_activate(struct nexus_adapter *,
    na_activate_mode_t);
static int nx_netif_compat_na_txsync(struct __kern_channel_ring *,
    struct proc *, uint32_t);
static int nx_netif_compat_na_rxsync(struct __kern_channel_ring *,
    struct proc *, uint32_t);
static void nx_netif_compat_na_dtor(struct nexus_adapter *na);

static void nx_netif_compat_tx_intr(struct ifnet *, enum txrx, uint32_t,
    uint32_t *);
static inline struct mbuf *nx_netif_compat_ring_alloc(int, int, uint16_t);
static inline void nx_netif_compat_ring_free(struct mbuf *m);
static void nx_netif_compat_ringcb(caddr_t cl, uint32_t size, caddr_t arg);

static uint32_t nx_netif_compat_tx_clean(struct netif_stats *nifs,
    struct __kern_channel_ring *kring);
static void nx_netif_compat_set_tx_event(struct __kern_channel_ring *kring,
    slot_idx_t khead);

static struct nexus_netif_compat_adapter *na_netif_compat_alloc(zalloc_flags_t);
static void na_netif_compat_free(struct nexus_adapter *);
#if DEBUG || DEVELOPMENT
static struct mbuf *nx_netif_rx_split(struct mbuf *, uint32_t);
#endif /* DEBUG || DEVELOPMENT */

#define MBUF_TXQ(m)	((m)->m_pkthdr.pkt_flowid)
#define MBUF_RXQ(m)	((m)->m_pkthdr.pkt_flowid)

#define NMB_PROPF_TX_NOTIFY	0x1	/* generate transmit event */
#define NMB_FLAGS_MASK		0x0000ffff
#define NMB_INDEX_MASK		0xffff0000
#define NMB_GET_FLAGS(p)	(((uint32_t)(p) & NMB_FLAGS_MASK))
#define NMB_SET_FLAGS(p, f)	(((uint32_t)(p) & ~NMB_FLAGS_MASK) | (f))
#define NMB_GET_INDEX(p)	(((uint32_t)(p) & NMB_INDEX_MASK) >> 16)
#define NMB_SET_INDEX(p, i)	(((uint32_t)(p) & ~NMB_INDEX_MASK) | (i << 16))
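
/*
 * Illustrative note (added): the 32-bit mbuf cluster property word packs
 * the slot index into its upper 16 bits and the flags into its lower 16.
 * For example, NMB_SET_INDEX(NMB_SET_FLAGS(0, NMB_PROPF_TX_NOTIFY), 5)
 * yields 0x00050001, from which NMB_GET_INDEX() recovers 5 and
 * NMB_GET_FLAGS() recovers NMB_PROPF_TX_NOTIFY.
 */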

static SKMEM_TYPE_DEFINE(na_netif_compat_zone, struct nexus_netif_compat_adapter);

static int netif_tx_event_mode = 0;

#if (DEVELOPMENT || DEBUG)
SYSCTL_EXTENSIBLE_NODE(_kern_skywalk_netif, OID_AUTO, compat,
    CTLFLAG_RW | CTLFLAG_LOCKED,
    0, "Skywalk netif Nexus legacy compatibility support");
SYSCTL_INT(_kern_skywalk_netif_compat, OID_AUTO, tx_event_mode,
    CTLFLAG_RW | CTLFLAG_LOCKED, &netif_tx_event_mode, 0, "");
static uint32_t netif_rx_split = 0;
SYSCTL_UINT(_kern_skywalk_netif_compat, OID_AUTO, rx_split,
    CTLFLAG_RW | CTLFLAG_LOCKED, &netif_rx_split, 0, "");
#endif /* DEVELOPMENT || DEBUG */

struct kern_nexus_domain_provider nx_netif_compat_prov_s = {
	.nxdom_prov_name = NEXUS_PROVIDER_NET_IF_COMPAT,
	.nxdom_prov_flags = NXDOMPROVF_DEFAULT,
	.nxdom_prov_cb = {
		.dp_cb_init = nx_netif_prov_init,
		.dp_cb_fini = nx_netif_prov_fini,
		.dp_cb_params = nx_netif_prov_params,
		/*
		 * We must be using the native netif handlers below,
		 * since we act as the default domain provider; see
		 * kern_nexus_register_domain_provider().
		 */
		.dp_cb_mem_new = nx_netif_prov_mem_new,
		.dp_cb_config = nx_netif_prov_config,
		.dp_cb_nx_ctor = nx_netif_prov_nx_ctor,
		.dp_cb_nx_dtor = nx_netif_prov_nx_dtor,
		.dp_cb_nx_mem_info = nx_netif_prov_nx_mem_info,
		.dp_cb_nx_mib_get = nx_netif_prov_nx_mib_get,
		.dp_cb_nx_stop = nx_netif_prov_nx_stop,
	},
};

struct nexus_ifnet_ops na_netif_compat_ops = {
	.ni_finalize = na_netif_compat_finalize,
	.ni_reap = nx_netif_reap,
	.ni_dequeue = nx_netif_compat_tx_dequeue,
	.ni_get_len = nx_netif_compat_tx_get_len,
};

#define SKMEM_TAG_NETIF_COMPAT_MIT "com.apple.skywalk.netif.compat.mit"
static SKMEM_TAG_DEFINE(skmem_tag_netif_compat_mit, SKMEM_TAG_NETIF_COMPAT_MIT);

#define SKMEM_TAG_NETIF_COMPAT_POOL "com.apple.skywalk.netif.compat.pool"
static SKMEM_TAG_DEFINE(skmem_tag_netif_compat_pool, SKMEM_TAG_NETIF_COMPAT_POOL);

void
nx_netif_compat_init(struct nxdom *nxdom)
{
	_CASSERT(NETIF_COMPAT_MAX_MBUF_DATA_COPY <= NETIF_COMPAT_BUF_SIZE);

	/*
	 * We want nxprov_create() coming from userland to use the
	 * netif_compat domain provider, so install it as default.
	 * This is verified by the caller.
	 */
	(void) nxdom_prov_add(nxdom, &nx_netif_compat_prov_s);
}

void
nx_netif_compat_fini(void)
{
	(void) nxdom_prov_del(&nx_netif_compat_prov_s);
}

static struct nexus_netif_compat_adapter *
na_netif_compat_alloc(zalloc_flags_t how)
{
	struct nexus_netif_compat_adapter *nca;

	_CASSERT(offsetof(struct nexus_netif_compat_adapter, nca_up) == 0);

	nca = zalloc_flags(na_netif_compat_zone, how | Z_ZERO);
	if (nca) {
		SK_DF(SK_VERB_MEM, "nca %p ALLOC", SK_KVA(nca));
	}
	return nca;
}

static void
na_netif_compat_free(struct nexus_adapter *na)
{
	struct nexus_netif_compat_adapter *nca =
	    (struct nexus_netif_compat_adapter *)na;

	SK_LOCK_ASSERT_HELD();
	ASSERT(na->na_refcount == 0);

	SK_DF(SK_VERB_MEM, "nca [dev+host] %p FREE", SK_KVA(nca));
	bzero(nca, sizeof(*nca));
	zfree(na_netif_compat_zone, nca);
}

/*
 * Callback invoked when the device driver frees an mbuf used
 * by skywalk to transmit a packet. This usually happens when
 * the NIC notifies the driver that transmission is completed.
 */
static void
nx_netif_compat_ringcb(caddr_t cl, uint32_t size, caddr_t arg)
{
#pragma unused(cl, size)
	struct mbuf *__single m = (void *)arg;
	struct ifnet *ifp = NULL;
	struct netif_stats *nifs = NULL;
	uintptr_t data;         /* not used */
	uint32_t txq;
	errno_t err;

	err = mbuf_get_tx_compl_data(m, (uintptr_t *)&ifp, &data);
	ASSERT(err == 0);

	nifs = &NX_NETIF_PRIVATE(NA(ifp)->nifna_up.na_nx)->nif_stats;
	txq = MBUF_TXQ(m);

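	/*
	 * Descriptive note (added): mbuf_cluster_set_prop() is used here
	 * in a compare-and-swap fashion against the previously read value;
	 * EBUSY indicates the property word changed underneath us, in
	 * which case the loop below re-reads it and retries.
	 */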
	for (;;) {
		uint32_t p = 0, i, f;

		(void) mbuf_cluster_get_prop(m, &p);
		f = NMB_GET_FLAGS(p);
		i = NMB_GET_INDEX(p);

		SK_DF(SK_VERB_NETIF, "%s m 0x%llx txq %u i %u f 0x%x",
		    if_name(ifp), SK_KVA(m), MBUF_TXQ(m), i, f);

		if (f & NMB_PROPF_TX_NOTIFY) {
			uint32_t pn;

			f &= ~NMB_PROPF_TX_NOTIFY;
			pn = NMB_SET_FLAGS(p, f);

			err = mbuf_cluster_set_prop(m, p, pn);
			if (err != 0) {
				if (err == EBUSY) {     /* try again */
					continue;
				}
				/* TODO: [email protected] -- what to do? */
				SK_ERR("Failed to clear TX_NOTIFY "
				    "m 0x%llx i %u err %d", SK_KVA(m), i, err);
			} else {
				nx_netif_compat_tx_intr(ifp, NR_TX, txq, NULL);
				SK_DF(SK_VERB_NETIF | SK_VERB_INTR | SK_VERB_TX,
				    "%s TX irq m 0x%llx txq %u i %u f 0x%x",
				    if_name(ifp), SK_KVA(m), MBUF_TXQ(m), i, f);
				STATS_INC(nifs, NETIF_STATS_TX_IRQ);
			}
		}
		break;
	}
}

/* Hoisted out of line to reduce kernel stack footprint */
SK_NO_INLINE_ATTRIBUTE
static struct mbuf *
nx_netif_compat_ring_alloc(int how, int len, uint16_t idx)
{
	struct mbuf *__single m = NULL;
	size_t size = len;
	uint32_t i;

	if (mbuf_ring_cluster_alloc(how, MBUF_TYPE_HEADER, &m,
	    nx_netif_compat_ringcb, &size) != 0) {
		return NULL;
	}

	for (;;) {
		uint32_t p = 0, pn;
		int err;

		(void) mbuf_cluster_get_prop(m, &p);
		pn = NMB_SET_FLAGS(p, 0);
		pn = NMB_SET_INDEX(pn, idx);

		err = mbuf_cluster_set_prop(m, p, pn);
		if (err != 0) {
			if (err == EBUSY) {     /* try again */
				continue;
			}
			SK_ERR("Failed to initialize properties m 0x%llx "
			    "err %d", SK_KVA(m), err);
			m_freem(m);
			return NULL;
		}
		(void) mbuf_cluster_get_prop(m, &p);
		i = NMB_GET_INDEX(p);
		ASSERT(i == idx);
		break;
	}

	SK_DF(SK_VERB_MEM, "alloc m 0x%llx size %u i %u",
	    SK_KVA(m), (uint32_t)size, i);

	return m;
}

/* Hoisted out of line to reduce kernel stack footprint */
SK_NO_INLINE_ATTRIBUTE
static void
nx_netif_compat_ring_free(struct mbuf *m)
{
	if (m == NULL) {
		return;
	}

	for (;;) {
		uint32_t p = 0;
		int err;

		(void) mbuf_cluster_get_prop(m, &p);
		err = mbuf_cluster_set_prop(m, p, 0);
		if (err != 0) {
			if (err == EBUSY) {     /* try again */
				continue;
			}
			/* TODO: [email protected] -- what to do? */
			SK_ERR("Failed to clear properties m 0x%llx err %d",
			    SK_KVA(m), err);
		}
		break;
	}
	m_freem(m);
}

static void
nx_netif_compat_tx_intr(struct ifnet *ifp, enum txrx t, uint32_t q,
    uint32_t *work_done)
{
	struct nexus_adapter *na = &NA(ifp)->nifna_up;

	if (__improbable(!NA_IS_ACTIVE(na) || q >= na_get_nrings(na, t))) {
		if (q >= na_get_nrings(na, t)) {
			SK_ERR("na \"%s\" (0x%llx) invalid q %u >= %u",
			    na->na_name, SK_KVA(na), q, na_get_nrings(na, t));
		}
	} else {
		(void) nx_netif_mit_tx_intr((NAKR(na, t) + q), kernproc,
		    0, work_done);
	}
}

static int
nx_netif_compat_na_notify_tx(struct __kern_channel_ring *kring,
    struct proc *p, uint32_t flags)
{
	/*
	 * This should never get executed, as nothing should be invoking
	 * the TX ring notify callback. The compat adapter directly
	 * calls nx_netif_compat_tx_intr() for TX completion from within
	 * nx_netif_compat_ringcb().
	 *
	 * If we ever get here, use the original na_notify callback
	 * saved during na_activate().
	 */
	return kring->ckr_netif_notify(kring, p, flags);
}

static int
nx_netif_compat_na_notify_rx(struct __kern_channel_ring *kring,
    struct proc *p, uint32_t flags)
{
	/*
	 * This should never get executed, as nothing should be invoking
	 * the RX ring notify callback. The compat adapter directly
	 * calls nx_netif_mit_rx_intr() for RX completion from within
	 * nx_netif_compat_receive().
	 *
	 * If we ever get here, use the original na_notify callback
	 * saved during na_activate().
	 */
	return kring->ckr_netif_notify(kring, p, flags);
}

/* Enable/disable skywalk mode for a compat network interface. */
static int
nx_netif_compat_na_activate(struct nexus_adapter *na, na_activate_mode_t mode)
{
	struct nexus_netif_adapter *nifna = NIFNA(na);
	boolean_t tx_mit, rx_mit, tx_mit_simple, rx_mit_simple, rxpoll;
	uint32_t limit = (uint32_t)sk_netif_compat_rx_mbq_limit;
	struct nx_netif *nif = nifna->nifna_netif;
	struct nexus_netif_compat_adapter *nca;
	ifnet_t ifp = na->na_ifp;
	uint32_t i, r;
	int error;
	/* TODO -fbounds-safety: Remove tmp and use __counted_by_or_null */
	struct nx_netif_mit *mit_tmp;
	uint32_t nrings;
	struct mbuf **ckr_tx_pool_tmp;

	ASSERT(na->na_type == NA_NETIF_COMPAT_DEV);
	ASSERT(!(na->na_flags & NAF_HOST_ONLY));

	SK_DF(SK_VERB_NETIF, "na \"%s\" (0x%llx) %s", na->na_name,
	    SK_KVA(na), na_activate_mode2str(mode));

	nca = (struct nexus_netif_compat_adapter *)nifna;

	switch (mode) {
	case NA_ACTIVATE_MODE_ON:
		ASSERT(SKYWALK_CAPABLE(na->na_ifp));

		nx_netif_mit_config(nifna, &tx_mit, &tx_mit_simple,
		    &rx_mit, &rx_mit_simple);

		/*
		 * Init the mitigation support on all the dev TX rings.
		 */
		if (na_get_nrings(na, NR_TX) != 0 && tx_mit) {
			nrings = na_get_nrings(na, NR_TX);
			mit_tmp = skn_alloc_type_array(tx_on, struct nx_netif_mit,
			    nrings, Z_WAITOK, skmem_tag_netif_compat_mit);
			if (mit_tmp == NULL) {
				SK_ERR("TX mitigation allocation failed");
				error = ENOMEM;
				goto out;
			}
			nifna->nifna_tx_mit = mit_tmp;
			nifna->nifna_tx_mit_count = nrings;
		} else {
			ASSERT(nifna->nifna_tx_mit == NULL);
		}

		/*
		 * Init either poller or mitigation support on all the
		 * dev RX rings; they're mutually exclusive and poller
		 * takes precedence.
		 */
		rxpoll = (net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL));
		if (rxpoll) {
			int err;
			__unused kern_return_t kret;
			thread_precedence_policy_data_t info;

			ASSERT((ifp->if_xflags & IFXF_LEGACY) == 0);
			ASSERT(ifp->if_input_poll != NULL);
			ASSERT(ifp->if_input_ctl != NULL);
			if ((err =
			    kernel_thread_start(netif_rxpoll_compat_thread_func,
			    ifp, &ifp->if_poll_thread)) != KERN_SUCCESS) {
				panic_plain("%s: ifp=%p couldn't get a poll "
				    "thread; err=%d", __func__, ifp, err);
				/* NOTREACHED */
				__builtin_unreachable();
			}
			VERIFY(ifp->if_poll_thread != NULL);

			/* wait until thread is ready */
			lck_mtx_lock(&ifp->if_poll_lock);
			while (!(ifp->if_poll_flags & IF_POLLF_READY)) {
				(void) assert_wait(&ifp->if_poll_flags,
				    THREAD_UNINT);
				lck_mtx_unlock(&ifp->if_poll_lock);
				(void) thread_block(THREAD_CONTINUE_NULL);
				lck_mtx_lock(&ifp->if_poll_lock);
			}
			lck_mtx_unlock(&ifp->if_poll_lock);

			bzero(&info, sizeof(info));
			info.importance = 1;
			kret = thread_policy_set(ifp->if_poll_thread,
			    THREAD_PRECEDENCE_POLICY, (thread_policy_t)&info,
			    THREAD_PRECEDENCE_POLICY_COUNT);
			ASSERT(kret == KERN_SUCCESS);
			limit = if_rcvq_maxlen;
			(void) netif_rxpoll_set_params(ifp, NULL, FALSE);
			ASSERT(nifna->nifna_rx_mit == NULL);
		} else if (rx_mit) {
			nrings = na_get_nrings(na, NR_RX);
			mit_tmp = skn_alloc_type_array(rx_on, struct nx_netif_mit,
			    nrings, Z_WAITOK, skmem_tag_netif_compat_mit);
			if (mit_tmp == NULL) {
				SK_ERR("RX mitigation allocation failed");
				if (nifna->nifna_tx_mit != NULL) {
					skn_free_type_array_counted_by(rx_fail,
					    struct nx_netif_mit,
					    nifna->nifna_tx_mit_count,
					    nifna->nifna_tx_mit);
				}
				error = ENOMEM;
				goto out;
			}
			nifna->nifna_rx_mit = mit_tmp;
			nifna->nifna_rx_mit_count = nrings;
		}

		/* intercept na_notify callback on the TX rings */
		for (r = 0; r < na_get_nrings(na, NR_TX); r++) {
			na->na_tx_rings[r].ckr_netif_notify =
			    na->na_tx_rings[r].ckr_na_notify;
			na->na_tx_rings[r].ckr_na_notify =
			    nx_netif_compat_na_notify_tx;
			if (nifna->nifna_tx_mit != NULL) {
				nx_netif_mit_init(nif, na->na_ifp,
				    &nifna->nifna_tx_mit[r],
				    &na->na_tx_rings[r], tx_mit_simple);
			}
		}

		/* intercept na_notify callback on the RX rings */
		for (r = 0; r < na_get_nrings(na, NR_RX); r++) {
			na->na_rx_rings[r].ckr_netif_notify =
			    na->na_rx_rings[r].ckr_na_notify;
			na->na_rx_rings[r].ckr_na_notify =
			    nx_netif_compat_na_notify_rx;
			if (nifna->nifna_rx_mit != NULL) {
				nx_netif_mit_init(nif, na->na_ifp,
				    &nifna->nifna_rx_mit[r],
				    &na->na_rx_rings[r], rx_mit_simple);
			}
		}
		/*
		 * Initialize the rx queue, as nx_netif_compat_receive() can
		 * be called as soon as nx_netif_compat_catch_rx() returns.
		 */
		for (r = 0; r < na_get_nrings(na, NR_RX); r++) {
			struct __kern_channel_ring *kr = &na->na_rx_rings[r];

			nx_mbq_safe_init(kr, &kr->ckr_rx_queue, limit,
			    &nexus_mbq_lock_group, &nexus_lock_attr);
			SK_DF(SK_VERB_NETIF,
			    "na \"%s\" (0x%llx) initialized kr \"%s\" "
			    "(0x%llx) krflags 0x%b", na->na_name, SK_KVA(na),
			    kr->ckr_name, SK_KVA(kr), kr->ckr_flags, CKRF_BITS);
		}

		/*
		 * Prepare packet buffers for the tx rings; don't preallocate
		 * the mbufs here, leave this to nx_netif_compat_na_txsync().
		 */
		for (r = 0; r < na_get_nrings(na, NR_TX); r++) {
			na->na_tx_rings[r].ckr_tx_pool = NULL;
			na->na_tx_rings[r].ckr_tx_pool_count = 0;
		}

		for (r = 0; r < na_get_nrings(na, NR_TX); r++) {
			nrings = na_get_nslots(na, NR_TX);
			ckr_tx_pool_tmp =
			    skn_alloc_type_array(tx_pool_on, struct mbuf *,
			    nrings, Z_WAITOK,
			    skmem_tag_netif_compat_pool);
			if (ckr_tx_pool_tmp == NULL) {
				SK_ERR("ckr_tx_pool allocation failed");
				error = ENOMEM;
				goto free_tx_pools;
			}
			na->na_tx_rings[r].ckr_tx_pool = ckr_tx_pool_tmp;
			na->na_tx_rings[r].ckr_tx_pool_count = nrings;
		}

		/* Prepare to intercept incoming traffic. */
		error = nx_netif_compat_catch_rx(nca, TRUE);
		if (error != 0) {
			SK_ERR("RX intercept failed (%d)", error);
			goto uncatch;
		}
		nx_netif_filter_enable(nifna->nifna_netif);
		nx_netif_flow_enable(nifna->nifna_netif);
		os_atomic_or(&na->na_flags, NAF_ACTIVE, relaxed);
		break;

	case NA_ACTIVATE_MODE_DEFUNCT:
		ASSERT(SKYWALK_CAPABLE(na->na_ifp));
		break;

	case NA_ACTIVATE_MODE_OFF:
		/*
		 * Note that here we cannot assert SKYWALK_CAPABLE()
		 * as we're called in the destructor path.
		 */
		os_atomic_andnot(&na->na_flags, NAF_ACTIVE, relaxed);
		nx_netif_flow_disable(nifna->nifna_netif);
		nx_netif_filter_disable(nifna->nifna_netif);

		/*
		 * Signal the poller thread to terminate itself, and
		 * wait for it to exit.
		 */
		if (ifp->if_poll_thread != THREAD_NULL) {
			ASSERT(net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL));
			ASSERT((ifp->if_xflags & IFXF_LEGACY) == 0);
			lck_mtx_lock_spin(&ifp->if_poll_lock);
			ifp->if_poll_flags |= IF_POLLF_TERMINATING;
			wakeup_one((caddr_t)&ifp->if_poll_thread);
			lck_mtx_unlock(&ifp->if_poll_lock);

			/* wait for poller thread to terminate */
			lck_mtx_lock(&ifp->if_poll_lock);
			while (ifp->if_poll_thread != THREAD_NULL) {
				SK_DF(SK_VERB_NETIF_POLL,
				    "%s: waiting for poller thread to terminate",
				    if_name(ifp));
				(void) msleep(&ifp->if_poll_thread,
				    &ifp->if_poll_lock, (PZERO - 1),
				    "netif_poll_thread_exit", NULL);
			}
			lck_mtx_unlock(&ifp->if_poll_lock);
			SK_DF(SK_VERB_NETIF_POLL,
			    "%s: poller thread termination complete",
			    if_name(ifp));
		}

		/* Do not intercept packets on the rx path. */
		(void) nx_netif_compat_catch_rx(nca, FALSE);

		/* Free the mbufs going to the channel rings */
		for (r = 0; r < na_get_nrings(na, NR_RX); r++) {
			nx_mbq_safe_purge(&na->na_rx_rings[r].ckr_rx_queue);
			nx_mbq_safe_destroy(&na->na_rx_rings[r].ckr_rx_queue);
		}

		/* reset all TX notify callbacks */
		for (r = 0; r < na_get_nrings(na, NR_TX); r++) {
			na->na_tx_rings[r].ckr_na_notify =
			    na->na_tx_rings[r].ckr_netif_notify;
			na->na_tx_rings[r].ckr_netif_notify = NULL;
			if (nifna->nifna_tx_mit != NULL) {
				na->na_tx_rings[r].ckr_netif_mit_stats = NULL;
				nx_netif_mit_cleanup(&nifna->nifna_tx_mit[r]);
			}
		}

		if (nifna->nifna_tx_mit != NULL) {
			skn_free_type_array_counted_by(tx_off, struct nx_netif_mit,
			    nifna->nifna_tx_mit_count, nifna->nifna_tx_mit);
		}

		/* reset all RX notify callbacks */
		for (r = 0; r < na_get_nrings(na, NR_RX); r++) {
			na->na_rx_rings[r].ckr_na_notify =
			    na->na_rx_rings[r].ckr_netif_notify;
			na->na_rx_rings[r].ckr_netif_notify = NULL;
			if (nifna->nifna_rx_mit != NULL) {
				na->na_rx_rings[r].ckr_netif_mit_stats = NULL;
				nx_netif_mit_cleanup(&nifna->nifna_rx_mit[r]);
			}
		}
		if (nifna->nifna_rx_mit != NULL) {
			skn_free_type_array_counted_by(rx_off, struct nx_netif_mit,
			    nifna->nifna_rx_mit_count, nifna->nifna_rx_mit);
		}

		for (r = 0; r < na_get_nrings(na, NR_TX); r++) {
			for (i = 0; i < na_get_nslots(na, NR_TX); i++) {
				nx_netif_compat_ring_free(
					na->na_tx_rings[r].ckr_tx_pool[i]);
				na->na_tx_rings[r].ckr_tx_pool[i] = NULL;
			}
			skn_free_type_array_counted_by(tx_pool_off,
			    struct mbuf *, na->na_tx_rings[r].ckr_tx_pool_count,
			    na->na_tx_rings[r].ckr_tx_pool);
		}
		break;

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	return 0;

uncatch:
	(void) nx_netif_compat_catch_rx(nca, FALSE);

free_tx_pools:
	for (r = 0; r < na_get_nrings(na, NR_TX); r++) {
		if (na->na_tx_rings[r].ckr_tx_pool == NULL) {
			continue;
		}
		for (i = 0; i < na_get_nslots(na, NR_TX); i++) {
			nx_netif_compat_ring_free(
				na->na_tx_rings[r].ckr_tx_pool[i]);
			na->na_tx_rings[r].ckr_tx_pool[i] = NULL;
		}
		skn_free_type_array_counted_by(tx_pool, struct mbuf *,
		    na->na_tx_rings[r].ckr_tx_pool_count,
		    na->na_tx_rings[r].ckr_tx_pool);
	}
	if (nifna->nifna_tx_mit != NULL) {
		for (r = 0; r < na_get_nrings(na, NR_TX); r++) {
			nx_netif_mit_cleanup(&nifna->nifna_tx_mit[r]);
		}
		skn_free_type_array_counted_by(tx, struct nx_netif_mit,
		    nifna->nifna_tx_mit_count, nifna->nifna_tx_mit);
	}
	if (nifna->nifna_rx_mit != NULL) {
		for (r = 0; r < na_get_nrings(na, NR_RX); r++) {
			nx_netif_mit_cleanup(&nifna->nifna_rx_mit[r]);
		}
		skn_free_type_array_counted_by(rx, struct nx_netif_mit,
		    nifna->nifna_rx_mit_count, nifna->nifna_rx_mit);
	}
	for (r = 0; r < na_get_nrings(na, NR_RX); r++) {
		nx_mbq_safe_destroy(&na->na_rx_rings[r].ckr_rx_queue);
	}
out:

	return error;
}

/*
 * Record completed transmissions and update ktail.
 *
 * The oldest tx buffer not yet completed is at ckr_ktail + 1;
 * ckr_khead is the first unsent buffer.
 */
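/*
 * Worked example (illustrative, added): with ckr_lim == 7, ckr_ktail == 2
 * and ckr_khead == 6, slots 3, 4 and 5 hold mbufs already handed to the
 * driver; the cleanup loop stops at the first mbuf still in flight and
 * advances ckr_ktail to just before that slot.
 */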
/* Hoisted out of line to reduce kernel stack footprint */
SK_NO_INLINE_ATTRIBUTE
static uint32_t
nx_netif_compat_tx_clean(struct netif_stats *nifs,
    struct __kern_channel_ring *kring)
{
	const slot_idx_t lim = kring->ckr_lim;
	slot_idx_t nm_i = SLOT_NEXT(kring->ckr_ktail, lim);
	slot_idx_t khead = kring->ckr_khead;
	uint32_t n = 0;
	struct mbuf **ckr_tx_pool = kring->ckr_tx_pool;

	while (nm_i != khead) { /* buffers not completed */
		struct mbuf *m = ckr_tx_pool[nm_i];

		if (__improbable(m == NULL)) {
			/* this is done, try to replenish the entry */
			VERIFY(nm_i <= UINT16_MAX);
			ckr_tx_pool[nm_i] = m =
			    nx_netif_compat_ring_alloc(M_WAITOK,
			    kring->ckr_max_pkt_len, (uint16_t)nm_i);
			if (__improbable(m == NULL)) {
				STATS_INC(nifs, NETIF_STATS_DROP_NOMEM_MBUF);
				STATS_INC(nifs, NETIF_STATS_DROP);
				SK_DF(SK_VERB_MEM,
				    "mbuf allocation failed (slot %u)", nm_i);
				/* XXX how do we proceed ? break ? */
				return -ENOMEM;
			}
		} else if (mbuf_ring_cluster_is_active(m)) {
			break;  /* This mbuf is still busy */
		}
		n++;
		nm_i = SLOT_NEXT(nm_i, lim);
	}
	kring->ckr_ktail = SLOT_PREV(nm_i, lim);

	SK_RDF(SK_VERB_NETIF, 10, "kr \"%s\" (0x%llx) tx completed [%u] -> "
	    "kh %u kt %u | rh %u rt %u", kring->ckr_name, SK_KVA(kring),
	    n, kring->ckr_khead, kring->ckr_ktail,
	    kring->ckr_rhead, kring->ckr_rtail);

	return n;
}

/* Hoisted out of line to reduce kernel stack footprint */
SK_NO_INLINE_ATTRIBUTE
static void
nx_netif_compat_set_tx_event(struct __kern_channel_ring *kring,
    slot_idx_t khead)
{
	const slot_idx_t lim = kring->ckr_lim;
	slot_idx_t ntc = SLOT_NEXT(kring->ckr_ktail, lim); /* next to clean */
	struct mbuf *m;
	slot_idx_t e;

	if (ntc == khead) {
		return;         /* all buffers are free */
	}
	/*
	 * We have pending packets in the driver between ckr_ktail+1 and
	 * ckr_khead, and we have to choose one of these slots to generate
	 * a TX notification. There is a race, but this is only called
	 * within TX sync which does a double check.
	 */
	if (__probable(netif_tx_event_mode == 0)) {
		/*
		 * Choose the first pending slot, to be safe against drivers
		 * reordering mbuf transmissions.
		 */
		e = ntc;
	} else {
		/*
		 * Choose a slot in the middle, so that we don't risk ending
		 * up in a situation where the client continuously wakes up,
		 * fills one or a few TX slots, and goes to sleep again.
		 */
		slot_idx_t n = lim + 1;

		if (khead >= ntc) {
			e = (khead + ntc) >> 1;
		} else {        /* wrap around */
			e = (khead + n + ntc) >> 1;
			if (e >= n) {
				e -= n;
			}
		}
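		/*
		 * Example (illustrative, added): with lim == 511 (so
		 * n == 512), khead == 10 and ntc == 502, we get
		 * (10 + 512 + 502) >> 1 == 512, which the subtraction
		 * above folds back to slot 0 -- the midpoint of the 20
		 * pending slots 502..511, 0..9.
		 */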

		if (__improbable(e >= n)) {
			SK_ERR("This cannot happen");
			e = 0;
		}
	}
	m = kring->ckr_tx_pool[e];

	for (;;) {
		uint32_t p = 0, pn, i, f;
		int err;

		(void) mbuf_cluster_get_prop(m, &p);
		f = NMB_GET_FLAGS(p);
		i = NMB_GET_INDEX(p);

		if (f & NMB_PROPF_TX_NOTIFY) {
			/*
			 * This can happen if there is already an event
			 * on the ring slot 'e': there is nothing to do.
			 */
			SK_DF(SK_VERB_NETIF | SK_VERB_NOTIFY | SK_VERB_TX,
			    "TX_NOTIFY already set at %u m 0x%llx kc %u ntc %u",
			    e, SK_KVA(m), khead, ntc);
			return;
		}

		f |= NMB_PROPF_TX_NOTIFY;
		pn = NMB_SET_FLAGS(p, f);

		err = mbuf_cluster_set_prop(m, p, pn);
		if (err != 0) {
			if (err == EBUSY) {     /* try again */
				continue;
			}
			/* TODO: [email protected] -- what to do? */
			SK_ERR("Failed to set TX_NOTIFY at %u m 0x%llx kh %u "
			    "ntc %u, err %d", e, SK_KVA(m), khead, ntc, err);
		} else {
			SK_DF(SK_VERB_NETIF | SK_VERB_NOTIFY | SK_VERB_TX,
			    "Request TX_NOTIFY at %u m 0x%llx kh %u ntc %u",
			    e, SK_KVA(m), khead, ntc);
		}
		break;
	}
}

#if SK_LOG
/* Hoisted out of line to reduce kernel stack footprint */
SK_LOG_ATTRIBUTE
static void
nx_netif_compat_na_txsync_log(struct __kern_channel_ring *kring,
    struct proc *p, uint32_t flags, slot_idx_t nm_i)
{
	SK_DF(SK_VERB_NETIF | SK_VERB_SYNC | SK_VERB_TX,
	    "%s(%d) kr \"%s\" (0x%llx) krflags 0x%b ring %u flags 0x%x "
	    "nm_i %u, kh %u kt %u | rh %u rt %u",
	    sk_proc_name_address(p), sk_proc_pid(p), kring->ckr_name,
	    SK_KVA(kring), kring->ckr_flags, CKRF_BITS, kring->ckr_ring_id,
	    flags, nm_i, kring->ckr_khead, kring->ckr_ktail,
	    kring->ckr_rhead, kring->ckr_rtail);
}
#endif /* SK_LOG */

/*
 * nx_netif_compat_na_txsync() transforms packets into mbufs and passes
 * them to the device driver.
 */
static int
nx_netif_compat_na_txsync(struct __kern_channel_ring *kring, struct proc *p,
    uint32_t flags)
{
#pragma unused(p)
	struct nexus_adapter *na = KRNA(kring);
	struct netif_stats *nifs = &NX_NETIF_PRIVATE(na->na_nx)->nif_stats;
	slot_idx_t nm_i;        /* index into the channel ring */ // j
	const slot_idx_t head = kring->ckr_rhead;
	uint32_t slot_count = 0;
	uint32_t byte_count = 0;

	STATS_INC(nifs, NETIF_STATS_TX_SYNC);

	/* update our work timestamp */
	na->na_work_ts = _net_uptime;

	/*
	 * First part: process new packets to send.
	 */
	nm_i = kring->ckr_khead;
	if (nm_i != head) {     /* we have new packets to send */
		while (nm_i != head) {
			struct __kern_slot_desc *sd = KR_KSD(kring, nm_i);

			/* device-specific */
			struct mbuf *m;
			int tx_ret;
			/*
			 * Take an mbuf from the tx pool (replenishing the pool
			 * entry if necessary) and copy in the user packet.
			 */
			VERIFY(nm_i <= UINT16_MAX);
			m = kring->ckr_tx_pool[nm_i];
			if (__improbable(m == NULL)) {
				kring->ckr_tx_pool[nm_i] = m =
				    nx_netif_compat_ring_alloc(M_WAITOK,
				    kring->ckr_max_pkt_len, (uint16_t)nm_i);
				if (__improbable(m == NULL)) {
					STATS_INC(nifs, NETIF_STATS_DROP);
					STATS_INC(nifs,
					    NETIF_STATS_DROP_NOMEM_MBUF);
					SK_DF(SK_VERB_MEM,
					    "%s(%d) kr \"%s\" (0x%llx) "
					    "krflags 0x%b ckr_tx_pool[%u] "
					    "allocation failed",
					    sk_proc_name_address(p),
					    sk_proc_pid(p), kring->ckr_name,
					    SK_KVA(kring), kring->ckr_flags,
					    CKRF_BITS, nm_i);
					/*
					 * Here we could schedule a timer
					 * which retries to replenish after
					 * a while, and notifies the client
					 * when it manages to replenish some
					 * slot. In any case we break early
					 * to avoid crashes.
					 */
					break;
				}
				STATS_INC(nifs, NETIF_STATS_TX_REPL);
			}

			byte_count += sd->sd_pkt->pkt_length;
			slot_count++;

			/*
			 * We should ask for notifications when CS_REPORT is
			 * set, or roughly every half ring. To optimize this,
			 * we set a notification event when the client runs
			 * out of TX ring space, or when transmission fails.
			 * In the latter case we also break early.
			 */
			tx_ret = nx_netif_compat_xmit_frame(na, m, sd->sd_pkt);
			if (__improbable(tx_ret)) {
				SK_RD(5, "start_xmit failed: err %d "
				    "[nm_i %u, h %u, kt %u]",
				    tx_ret, nm_i, head, kring->ckr_ktail);
				/*
				 * No room for this mbuf in the device driver.
				 * Request a notification FOR A PREVIOUS MBUF,
				 * then call nx_netif_compat_tx_clean(kring) to
				 * do the double check and see if we can free
				 * more buffers. If there is space continue,
				 * else break; NOTE: the double check is
				 * necessary if the problem occurs in the
				 * txsync call after selrecord(). Also, we
				 * need some way to tell the caller that not
				 * all buffers were queued onto the device
				 * (this was not a problem with native skywalk
				 * driver where space is preallocated). The
				 * bridge has a similar problem and we solve
				 * it there by dropping the excess packets.
				 */
				nx_netif_compat_set_tx_event(kring, nm_i);
				if (nx_netif_compat_tx_clean(nifs, kring)) {
					/* space now available */
					continue;
				} else {
					break;
				}
			}
			nm_i = SLOT_NEXT(nm_i, kring->ckr_lim);
			STATS_INC(nifs, NETIF_STATS_TX_PACKETS);
		}

		/*
		 * Update khead to the next slot to transmit; here nm_i
		 * is not necessarily head, as we could break early.
		 */
		kring->ckr_khead = nm_i;

		kr_update_stats(kring, slot_count, byte_count);
	}

	/*
	 * Second, reclaim completed buffers
	 */
	if ((flags & NA_SYNCF_FORCE_RECLAIM) || kr_txempty(kring)) {
		/*
		 * No more available slots? Set a notification event on a
		 * channel slot that will be cleaned in the future. No
		 * doublecheck is performed, since nx_netif_compat_na_txsync()
		 * will be called twice by ch_event().
		 */
		nx_netif_compat_set_tx_event(kring, nm_i);
	}
	kring->ckr_pending_intr = 0;

#if SK_LOG
	if (__improbable((sk_verbose & SK_VERB_NETIF) != 0)) {
		nx_netif_compat_na_txsync_log(kring, p, flags, nm_i);
	}
#endif /* SK_LOG */

	(void) nx_netif_compat_tx_clean(nifs, kring);

	return 0;
}

#if SK_LOG
/* Hoisted out of line to reduce kernel stack footprint */
SK_LOG_ATTRIBUTE
static void
nx_netif_compat_receive_log1(const struct __kern_channel_ring *kring,
    struct nx_mbq *q)
{
	SK_RD(10, "kr \"%s\" (0x%llx) krflags 0x%b FULL "
	    "(qlen %u qsize %llu), kc %u kt %u", kring->ckr_name,
	    SK_KVA(kring), kring->ckr_flags, CKRF_BITS, nx_mbq_len(q),
	    nx_mbq_size(q), kring->ckr_khead, kring->ckr_ktail);
}

/* Hoisted out of line to reduce kernel stack footprint */
SK_LOG_ATTRIBUTE
static void
nx_netif_compat_receive_log2(const struct __kern_channel_ring *kring,
    struct nx_mbq *q, const struct ifnet_stat_increment_param *s)
{
	SK_RDF(SK_VERB_RX, 10, "kr \"%s\" (0x%llx) krflags 0x%b OK, "
	    "added %u packets %u bytes, now qlen %u qsize %llu",
	    kring->ckr_name, SK_KVA(kring), kring->ckr_flags, CKRF_BITS,
	    s->packets_in, s->bytes_in, nx_mbq_len(q), nx_mbq_size(q));
}
#endif /* SK_LOG */

/*
 * This is the default RX path for the compat netif nexus. Packets
 * are enqueued and later extracted by nx_netif_compat_na_rxsync().
 */
/* TODO: [email protected] -- implement chaining */
static errno_t
nx_netif_compat_receive(struct ifnet *ifp, struct mbuf *m_head,
    struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
    boolean_t poll, struct thread *tp)
{
#pragma unused(tp)
	boolean_t ifp_rxpoll = ((ifp->if_eflags & IFEF_RXPOLL) && net_rxpoll);
	struct nexus_adapter *na = &NA(ifp)->nifna_up;
	struct __kern_channel_ring *kring;
	struct netif_stats *nifs;
	uint32_t r, work_done;
	unsigned int qlimit;
	struct nx_mbq *q;
	errno_t err = 0;

	/* update our work timestamp */
	na->na_work_ts = _net_uptime;

	if (__improbable(m_head == NULL)) {
		ASSERT(m_tail == NULL);
		ASSERT(poll);
		ASSERT(s->bytes_in == 0);
		ASSERT(s->packets_in == 0);
	}

	/* BEGIN CSTYLED */
	/*
	 * TODO: [email protected] -- this needs to be revisited once we
	 * have a clear definition of how multiple RX rings are mapped
	 * to flows; this would involve the hardware/driver doing some
	 * kind of classification and RSS-like demuxing.
	 *
	 * When we enable that, we'll need to consider sifting thru the
	 * mbuf chain we get from the caller, and enqueue them across
	 * per-ring temporary mbuf queue (along with marking the ring
	 * indicating pending packets.)  During second stage processing,
	 * we'll issue nx_netif_mit_rx_intr() on each marked ring to
	 * dispatch the packets upstream.
	 *
	 *	r = MBUF_RXQ(m);
	 *
	 *	if (r >= na->na_num_rx_rings)
	 *		r = r % na->na_num_rx_rings;
	 *
	 *	kring = &na->na_rx_rings[r];
	 *	q = &kring->ckr_rx_queue;
	 *
	 * For now, target only the first RX ring (ring 0).
	 */
	/* END CSTYLED */
	r = 0;  /* receive ring number */
	kring = &na->na_rx_rings[r];

	ASSERT(na->na_type == NA_NETIF_COMPAT_DEV);
	nifs = &NX_NETIF_PRIVATE(na->na_nx)->nif_stats;

	if (__improbable((!NA_IS_ACTIVE(na)) || KR_DROP(kring))) {
		/* BEGIN CSTYLED */
		/*
		 * If we deal with multiple rings, change above to:
		 *
		 *	if (!NA_IS_ACTIVE(na) || r >= na_get_nrings(na, NR_RX)))
		 *
		 * then here do:
		 *
		 *	if (r >= na_get_nrings(na, NR_RX)) {
		 *		SK_ERR("na \"%s\" (0x%llx) invalid r %u >= %u",
		 *		    na->na_name, SK_KVA(na), r,
		 *		    na_get_nrings(na, NR_RX));
		 *	}
		 */
		/* END CSTYLED */
		m_freem_list(m_head);
		if (!NA_IS_ACTIVE(na)) {
			STATS_ADD(nifs, NETIF_STATS_DROP_NA_INACTIVE,
			    s->packets_in);
		} else if (KR_DROP(kring)) {
			STATS_ADD(nifs, NETIF_STATS_DROP_KRDROP_MODE,
			    s->packets_in);
		}
		STATS_ADD(nifs, NETIF_STATS_DROP, s->packets_in);
		err = ENXIO;
		goto done;
	}
	if (__improbable(m_head == NULL)) {
		goto send_packets;
	}

	q = &kring->ckr_rx_queue;
	nx_mbq_lock_spin(q);
	qlimit = nx_mbq_limit(q);
	if (ifp_rxpoll) {
		/*
		 * The qlimit of the receive queue is much smaller when the
		 * interface is in opportunistic polling mode. While such an
		 * interface is temporarily operating in interrupt mode, a
		 * sudden burst of input packets can cause the receive queue
		 * to quickly build up, due to the scheduling latency in
		 * waking up the poller thread. To avoid drops caused by
		 * this latency, we provide a leeway on the qlimit.
		 */
		qlimit <<= 5;
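		/*
		 * Illustrative (added): with a base qlimit of, say, 128
		 * mbufs, the shift above raises the effective limit to
		 * 128 << 5 == 4096 while in interrupt mode.
		 */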
	}
	if (__improbable(nx_mbq_len(q) > qlimit)) {
#if SK_LOG
		if (__improbable(sk_verbose != 0)) {
			nx_netif_compat_receive_log1(kring, q);
		}
#endif /* SK_LOG */
		nx_mbq_unlock(q);
		m_freem_list(m_head);
		STATS_ADD(nifs, NETIF_STATS_DROP_RXQ_OVFL, s->packets_in);
		STATS_ADD(nifs, NETIF_STATS_DROP, s->packets_in);
		goto send_packets;
	}
	nx_mbq_enq_multi(q, m_head, m_tail, s->packets_in, s->bytes_in);

#if SK_LOG
	if (__improbable((sk_verbose & SK_VERB_NETIF) != 0)) {
		nx_netif_compat_receive_log2(kring, q, s);
	}
#endif /* SK_LOG */

	nx_mbq_unlock(q);

	(void) ifnet_stat_increment_in(ifp, s->packets_in, s->bytes_in,
	    s->errors_in);

	if (poll) {
		/* update incremental poll stats */
		PKTCNTR_ADD(&ifp->if_poll_tstats, s->packets_in, s->bytes_in);
	}

send_packets:
	/*
	 * If the interface supports opportunistic input polling, input
	 * packet processing is performed in the context of the poller
	 * thread.
	 */
	if (!poll && ifp_rxpoll) {
		/* wake up the poller thread */
		ifnet_poll(ifp);
	} else {
		/*
		 * Wake up the mitigation thread if needed to perform input
		 * packet processing.
		 * If the interface supports opportunistic input polling,
		 * the mitigation thread is not created, and input packet
		 * processing happens in the context of the poller thread.
		 */
		err = nx_netif_mit_rx_intr((NAKR(na, NR_RX) + r), kernproc, 0,
		    &work_done);
	}
done:
	return err;
}

#if SK_LOG
/* Hoisted out of line to reduce kernel stack footprint */
SK_LOG_ATTRIBUTE
static void
nx_netif_compat_na_rxsync_log(const struct __kern_channel_ring *kring,
    struct proc *p, uint32_t flags, slot_idx_t nm_i)
{
	SK_DF(SK_VERB_NETIF | SK_VERB_SYNC | SK_VERB_RX,
	    "%s(%d) kr \"%s\" (0x%llx) krflags 0x%b "
	    "ring %u flags 0x%x nm_i %u kt %u", sk_proc_name_address(p),
	    sk_proc_pid(p), kring->ckr_name, SK_KVA(kring), kring->ckr_flags,
	    CKRF_BITS, kring->ckr_ring_id, flags, nm_i, kring->ckr_ktail);
}
#endif /* SK_LOG */

#if DEBUG || DEVELOPMENT
/*
 * Split an mbuf chain at offset "split", such that the first mbuf
 * is a zero-length M_PKTHDR, followed by the rest of the mbufs.
 * Typically, the "split" value is equal to the size of the link
 * layer header, e.g. Ethernet header.
 */
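/*
 * Illustrative example (added): for an Ethernet frame with split == 14,
 * the head of the resulting chain is a zero-length M_PKTHDR mbuf whose
 * buffer still holds the 14-byte header just behind m_data, while the
 * payload continues in the mbufs appended after it; the rxsync caller
 * then winds m_data back by the header length before copying.
 */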
static struct mbuf *
nx_netif_rx_split(struct mbuf *m0, uint32_t split)
{
	struct mbuf *m = m0;

	if (split == 0) {
		split = MHLEN;
		M_PREPEND(m, split, M_DONTWAIT, 0);
	} else {
		m->m_data -= split;
		m->m_len += split;
		m_pktlen(m) += split;

		ASSERT((uintptr_t)m->m_data >= (uintptr_t)mbuf_datastart(m));
		ASSERT((uintptr_t)m->m_data < ((uintptr_t)mbuf_datastart(m) +
		    mbuf_maxlen(m)));
	}
	if (m != NULL) {
		struct mbuf *n = m_split(m, split, M_DONTWAIT);
		if (n == NULL) {
			m_freem(m);
			return NULL;
		}
		m0 = m;
		ASSERT((uint32_t)m->m_len == split);
		m->m_data += split;
		m->m_len -= split;
		while (m->m_next != NULL) {
			m = m->m_next;
		}
		m->m_next = n;
		m = m0;
		m_pktlen(m) = m_length2(m, NULL);
	}

	return m;
}
#endif /* DEBUG || DEVELOPMENT */

/*
 * nx_netif_compat_na_rxsync() extracts mbufs from the queue filled by
 * nx_netif_compat_receive() and puts their content in the channel
 * receive ring.
 *
 * Accesses to kring are serialized via the kring->ckr_rx_queue lock,
 * because the rx handler is asynchronous.
 */
static int
nx_netif_compat_na_rxsync(struct __kern_channel_ring *kring, struct proc *p,
    uint32_t flags)
{
#pragma unused(p)
	struct nexus_adapter *na = KRNA(kring);
	struct nexus_netif_adapter *nifna = NIFNA(na);
	struct nx_netif *nif = nifna->nifna_netif;
	slot_idx_t nm_i;        /* index into the channel ring */
	struct netif_stats *nifs = &NX_NETIF_PRIVATE(na->na_nx)->nif_stats;
	uint32_t npkts = 0;
	uint32_t byte_count = 0;
	const slot_idx_t lim = kring->ckr_lim;
	const slot_idx_t head = kring->ckr_rhead;
	boolean_t force_update = ((flags & NA_SYNCF_FORCE_READ) ||
	    kring->ckr_pending_intr != 0);
	struct mbuf *m;
	uint32_t n;
	uint32_t avail;         /* in slots */
	int err, mlen;
	boolean_t attach_mbuf = FALSE;
	struct nx_mbq *q, tmpq;
	struct kern_pbufpool *pp = kring->ckr_pp;
	uint32_t ph_cnt, i = 0;

	ASSERT(pp->pp_max_frags == 1);
	ASSERT(head <= lim);

	/*
	 * First part: skip past packets that userspace has released.
	 * This can possibly make room for the second part.
	 * equivalent to kr_reclaim()
	 */
	if (kring->ckr_khead != head) {
		kring->ckr_khead = head;
		/* ensure global visibility */
		os_atomic_thread_fence(seq_cst);
	}

	STATS_INC(nifs, NETIF_STATS_RX_SYNC);

	/*
	 * Second part: import newly received packets.
	 */
	if (!force_update) {
		return 0;
	}

	/* update our work timestamp */
	na->na_work_ts = _net_uptime;

	/* first empty slot in the receive ring */
	nm_i = kring->ckr_ktail;

	/*
	 * Compute the available space (in slots) in this ring.
	 * The first slot not considered is the one before ckr_khead.
	 */
	avail = kr_available_slots_rxring(kring);
	if (__improbable(avail == 0)) {
		return 0;
	}

	if (NA_KERNEL_ONLY(na)) {
		ASSERT(na->na_ifp != NULL &&
		    fsw_ifp_to_fsw(na->na_ifp) != NULL);
		/*
		 * We are not supporting attachment to bridge flowswitch
		 * for now, until we support PKT_F_MBUF_DATA packets
		 * in bridge flowswitch.
		 */
		attach_mbuf = TRUE;
	}

	/*
	 * Quickly move all of ckr_rx_queue to a temporary queue to dequeue
	 * from. For each mbuf, attach or copy it to the packet attached
	 * to the slot. Release the lock while we're doing that, to allow
	 * for the input thread to enqueue.
	 */
	q = &kring->ckr_rx_queue;
	nx_mbq_init(&tmpq, NX_MBQ_NO_LIMIT);
	nx_mbq_lock_spin(q);
	nx_mbq_concat(&tmpq, q);
	nx_mbq_unlock(q);

	if (__improbable(nx_mbq_len(&tmpq) == 0)) {
		return 0;
	}

	ph_cnt = MIN(avail, nx_mbq_len(&tmpq));
	err = kern_pbufpool_alloc_batch_nosleep(pp, 1, kring->ckr_scratch,
	    &ph_cnt);
	if (err == ENOMEM) {
		SK_DF(SK_VERB_MEM, "%s(%d) failed to alloc %d pkts for kr "
		    "0x%llx", sk_proc_name_address(p), sk_proc_pid(p), ph_cnt,
		    SK_KVA(kring));
		goto done;
	}
	ASSERT(ph_cnt != 0);
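
	/*
	 * Descriptive note (added, assumed behavior): on partial success
	 * the batch allocator is expected to have updated ph_cnt to the
	 * number of packets actually obtained, so the loop below is
	 * bounded by what was really allocated; any excess handles are
	 * freed in a batch afterwards.
	 */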
1373
1374 for (n = 0; (n < ph_cnt) &&
1375 ((m = nx_mbq_deq(&tmpq)) != NULL); n++) {
1376 struct __kern_slot_desc *ksd = KR_KSD(kring, nm_i);
1377 struct __kern_packet *pkt;
1378 kern_packet_t ph;
1379 uint8_t hlen;
1380 uint16_t tag;
1381 char *__single h;
1382
1383 ASSERT(m->m_flags & M_PKTHDR);
1384 mlen = m_pktlen(m);
1385 h = m->m_pkthdr.pkt_hdr;
1386 if (__improbable(mlen == 0 || h == NULL ||
1387 h < (char *)mbuf_datastart(m) || h > (char *)m->m_data)) {
1388 STATS_INC(nifs, NETIF_STATS_DROP_BADLEN);
1389 SK_RD(5, "kr \"%s\" (0x%llx) m 0x%llx len %d"
1390 "bad pkt_hdr", kring->ckr_name,
1391 SK_KVA(kring), SK_KVA(m), mlen);
1392 m_freem(m);
1393 m = NULL;
1394 continue;
1395 }
1396
1397 hlen = (uint8_t)(m->m_data - (uintptr_t)h);
1398 mlen += hlen;
1399
1400 #if DEBUG || DEVELOPMENT
1401 if (__improbable(netif_rx_split != 0)) {
1402 /* callee frees mbuf upon failure */
1403 if ((m = nx_netif_rx_split(m, hlen)) == NULL) {
1404 continue;
1405 }
1406
1407 ASSERT((uintptr_t)m->m_data >=
1408 (uintptr_t)mbuf_datastart(m));
1409 ASSERT((uintptr_t)m->m_data <
1410 ((uintptr_t)mbuf_datastart(m) +
1411 mbuf_maxlen(m)));
1412 }
1413 #endif /* DEBUG || DEVELOPMENT */
1414
1415 ph = kring->ckr_scratch[i];
1416 ASSERT(ph != 0);
1417 kring->ckr_scratch[i] = 0;
1418 pkt = SK_PTR_ADDR_KPKT(ph);
1419 ++i;
1420
1421 /*
1422 * Wind back the data pointer to include any frame headers
1423 * as part of the copy below. The header length is then
1424 * stored in the corresponding metadata area of the buffer.
1425 */
1426 m->m_data -= hlen;
1427 m->m_len += hlen;
1428 m->m_pkthdr.len += hlen;
1429 ASSERT(mlen == m->m_pkthdr.len);
1430
1431 pkt->pkt_link_flags = 0;
1432 if (m->m_flags & M_HASFCS) {
1433 pkt->pkt_link_flags |= PKT_LINKF_ETHFCS;
1434 }
1435 if (mbuf_get_vlan_tag(m, &tag) == 0) {
1436 (void) kern_packet_set_vlan_tag(SK_PKT2PH(pkt), tag,
1437 FALSE);
1438 }
1439 SK_DF(SK_VERB_NETIF | SK_VERB_SYNC | SK_VERB_RX,
1440 "kr \"%s\" (0x%llx) m 0x%llx idx %u slot_len %d",
1441 kring->ckr_name, SK_KVA(kring), SK_KVA(m), nm_i, mlen);
1442
1443 if (__probable(attach_mbuf)) {
1444 STATS_INC(nifs, NETIF_STATS_RX_COPY_ATTACH);
1445 err = __packet_initialize_with_mbuf(pkt, m, 0, hlen);
1446 VERIFY(err == 0);
1447 } else if (__probable(mlen <= (int)PP_BUF_SIZE_DEF(pp))) {
1448 STATS_INC(nifs, NETIF_STATS_RX_COPY_DIRECT);
1449 /*
1450 * We're sending this up to a user channel opened
1451 * directly to the netif; copy everything.
1452 */
1453 err = __packet_set_headroom(ph, 0);
1454 VERIFY(err == 0);
1455 err = __packet_set_link_header_length(ph, hlen);
1456 VERIFY(err == 0);
1457 nif->nif_pkt_copy_from_mbuf(NR_RX, ph, 0, m, 0,
1458 mlen, FALSE, 0);
1459 /* finalize and attach the packet */
1460 err = __packet_finalize(ph);
1461 VERIFY(err == 0);
1462 m_freem(m);
1463 m = NULL;
1464 } else {
1465 STATS_INC(nifs, NETIF_STATS_DROP_BADLEN);
1466 STATS_INC(nifs, NETIF_STATS_DROP);
1467 m_freem(m);
1468 m = NULL;
1469 kern_pbufpool_free(pp, ph);
1470 ph = 0;
1471 pkt = NULL;
1472 continue;
1473 }
1474
1475 err = KR_SLOT_ATTACH_METADATA(kring, ksd,
1476 (struct __kern_quantum *)pkt);
1477 ASSERT(err == 0);
1478
1479 byte_count += mlen;
1480 ++npkts;
1481 ASSERT(npkts < kring->ckr_num_slots);
1482 nm_i = SLOT_NEXT(nm_i, lim);
1483 }
1484
1485 if (__improbable(i < ph_cnt)) {
1486 kern_pbufpool_free_batch(pp, &kring->ckr_scratch[i],
1487 (ph_cnt - i));
1488 }
1489
1490 ASSERT(npkts <= ph_cnt);
1491 kr_update_stats(kring, npkts, byte_count);
1492
1493 if (npkts != 0) {
1494 kring->ckr_ktail = nm_i;
1495 STATS_ADD(nifs, NETIF_STATS_RX_PACKETS, npkts);
1496 }
1497 kring->ckr_pending_intr = 0;
1498
1499 #if SK_LOG
1500 if (__improbable((sk_verbose & SK_VERB_NETIF) != 0)) {
1501 nx_netif_compat_na_rxsync_log(kring, p, flags, nm_i);
1502 }
1503 #endif /* SK_LOG */
1504
1505 done:
1506 /*
1507 * If we didn't process all packets in temporary queue,
1508 * move them back to the head of ckr_rx_queue.
1509 */
1510 if (!nx_mbq_empty(&tmpq)) {
1511 nx_mbq_lock_spin(q);
1512 nx_mbq_concat(&tmpq, q);
1513 ASSERT(nx_mbq_empty(q));
1514 nx_mbq_concat(q, &tmpq);
1515 nx_mbq_unlock(q);
1516 }
1517 ASSERT(nx_mbq_empty(&tmpq));
1518
1519 return 0;
1520 }
1521
1522 static void
nx_netif_compat_na_dtor(struct nexus_adapter * na)1523 nx_netif_compat_na_dtor(struct nexus_adapter *na)
1524 {
1525 struct ifnet *__single ifp;
1526 struct nexus_netif_compat_adapter *nca =
1527 (struct nexus_netif_compat_adapter *)na;
1528
1529 SK_LOCK_ASSERT_HELD();
1530
1531 SK_DF(SK_VERB_NETIF, "na \"%s\" (0x%llx)", na->na_name, SK_KVA(na));
1532
1533 /*
1534 * If the finalizer callback hasn't been called for whatever
1535 * reasons, pick up the embryonic ifnet stored in na_private.
1536 * Otherwise, release the I/O refcnt of a non-NULL na_ifp.
1537 */
1538 if ((ifp = na->na_ifp) == NULL) {
1539 ifp = na->na_private;
1540 na->na_private = NULL;
1541 } else {
1542 ifnet_decr_iorefcnt(ifp);
1543 na->na_ifp = NULL;
1544 }
1545
1546 if (nca->nca_up.nifna_netif != NULL) {
1547 nx_netif_release(nca->nca_up.nifna_netif);
1548 nca->nca_up.nifna_netif = NULL;
1549 }
1550 ASSERT(!SKYWALK_NATIVE(ifp));
1551 }
1552
1553 /*
1554 * nx_netif_compat_attach() makes it possible to use skywalk on
1555 * a device without native skywalk support.
1556 * This is less performant than native support but potentially
1557 * faster than raw sockets or similar schemes.
1558 */
1559 int
nx_netif_compat_attach(struct kern_nexus * nx,struct ifnet * ifp)1560 nx_netif_compat_attach(struct kern_nexus *nx, struct ifnet *ifp)
1561 {
1562 struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
1563 struct nxprov_params *nxp = NX_PROV(nx)->nxprov_params;
1564 struct nexus_netif_compat_adapter *devnca = NULL;
1565 struct nexus_netif_compat_adapter *hostnca = NULL;
1566 struct nexus_adapter *__single devna = NULL;
1567 struct nexus_adapter *__single hostna = NULL;
1568 boolean_t embryonic = FALSE;
1569 uint32_t tx_rings, tx_slots;
1570 int retval = 0;
1571
1572 SK_LOCK_ASSERT_HELD();
1573 ASSERT(!SKYWALK_NATIVE(ifp));
1574 ASSERT(!SKYWALK_CAPABLE(ifp));
1575 ASSERT(ifp->if_na == NULL);
1576 ASSERT(ifp->if_na_ops == NULL);
1577
1578 devnca = na_netif_compat_alloc(Z_WAITOK);
1579 hostnca = na_netif_compat_alloc(Z_WAITOK);
1580
1581 /*
1582 * We can be called for two different interface states:
1583 *
1584 * Fully attached: get an io ref count; upon success, this
1585 * holds a reference to the ifnet for the ifp pointer stored
1586 * in 'na_ifp' down below for both adapters.
1587 *
1588 * Embryonic: temporary hold the ifnet in na_private, which
1589 * upon a successful ifnet_attach(), will be moved over to
1590 * the 'na_ifp' with an io ref count held.
1591 *
1592 * The ifnet in 'na_ifp' will be released by na_release_locked().
1593 */
1594 if (!ifnet_is_attached(ifp, 1)) {
1595 if (!(ifp->if_refflags & IFRF_EMBRYONIC)) {
1596 ifp = NULL;
1597 retval = ENXIO;
1598 goto err;
1599 }
1600 embryonic = TRUE;
1601 }
1602
1603 /* initialize the (compat) device netif adapter */
1604 devnca->nca_up.nifna_netif = nif;
1605 nx_netif_retain(nif);
1606 devna = &devnca->nca_up.nifna_up;
1607 strlcpy(devna->na_name, ifp->if_xname, sizeof(devna->na_name));
1608 uuid_generate_random(devna->na_uuid);
1609 if (embryonic) {
1610 /*
1611 * We will move this over to na_ifp once
1612 * the interface is fully attached.
1613 */
1614 devna->na_private = ifp;
1615 ASSERT(devna->na_ifp == NULL);
1616 } else {
1617 ASSERT(devna->na_private == NULL);
1618 /* use I/O refcnt from ifnet_is_attached() */
1619 devna->na_ifp = ifp;
1620 }
1621
1622 devna->na_type = NA_NETIF_COMPAT_DEV;
1623 devna->na_free = na_netif_compat_free;
1624 devna->na_activate = nx_netif_compat_na_activate;
1625 devna->na_txsync = nx_netif_compat_na_txsync;
1626 devna->na_rxsync = nx_netif_compat_na_rxsync;
1627 devna->na_dtor = nx_netif_compat_na_dtor;
1628 devna->na_krings_create = nx_netif_dev_krings_create;
1629 devna->na_krings_delete = nx_netif_dev_krings_delete;
1630 devna->na_special = nx_netif_na_special;
1631
1632 *(nexus_stats_type_t *)(uintptr_t)&devna->na_stats_type =
1633 NEXUS_STATS_TYPE_INVALID;
1634
1635 if (skywalk_netif_direct_allowed(ifp->if_xname)) {
1636 tx_rings = nxp->nxp_tx_rings;
1637 tx_slots = nxp->nxp_tx_slots;
1638 } else {
1639 tx_rings = 0;
1640 tx_slots = 0;
1641 }
1642 na_set_nrings(devna, NR_TX, tx_rings);
1643 na_set_nrings(devna, NR_RX, nxp->nxp_rx_rings);
1644 na_set_nslots(devna, NR_TX, tx_slots);
1645 na_set_nslots(devna, NR_RX, nxp->nxp_rx_slots);
1646 /*
1647 * Verify upper bounds; the parameters must have already been
1648 * validated by nxdom_prov_params() by the time we get here.
1649 */
1650 ASSERT(na_get_nrings(devna, NR_TX) <= NX_DOM(nx)->nxdom_tx_rings.nb_max);
1651 ASSERT(na_get_nrings(devna, NR_RX) <= NX_DOM(nx)->nxdom_rx_rings.nb_max);
1652 ASSERT(na_get_nslots(devna, NR_TX) <= NX_DOM(nx)->nxdom_tx_slots.nb_max);
1653 ASSERT(na_get_nslots(devna, NR_RX) <= NX_DOM(nx)->nxdom_rx_slots.nb_max);
1654
1655 na_attach_common(devna, nx, &nx_netif_compat_prov_s);
1656
1657 if ((retval = NX_DOM_PROV(nx)->nxdom_prov_mem_new(NX_DOM_PROV(nx),
1658 nx, devna)) != 0) {
1659 ASSERT(devna->na_arena == NULL);
1660 /* we've transferred the refcnt to na_ifp above */
1661 ifp = NULL;
1662 goto err;
1663 }
1664 ASSERT(devna->na_arena != NULL);
1665
1666 *(uint32_t *)(uintptr_t)&devna->na_flowadv_max = nxp->nxp_flowadv_max;
1667 ASSERT(devna->na_flowadv_max == 0 ||
1668 skmem_arena_nexus(devna->na_arena)->arn_flowadv_obj != NULL);
1669
1670 /* setup packet copy routines */
1671 if (skmem_arena_nexus(devna->na_arena)->arn_rx_pp->pp_max_frags > 1) {
1672 nif->nif_pkt_copy_from_mbuf =
1673 pkt_copy_multi_buflet_from_mbuf;
1674 nif->nif_pkt_copy_to_mbuf =
1675 pkt_copy_multi_buflet_to_mbuf;
1676 } else {
1677 nif->nif_pkt_copy_from_mbuf = pkt_copy_from_mbuf;
1678 nif->nif_pkt_copy_to_mbuf = pkt_copy_to_mbuf;
1679 }

/* initialize the host netif adapter */
hostnca->nca_up.nifna_netif = nif;
nx_netif_retain(nif);
hostna = &hostnca->nca_up.nifna_up;
(void) snprintf(hostna->na_name, sizeof(hostna->na_name),
    "%s^", devna->na_name);
uuid_generate_random(hostna->na_uuid);
if (embryonic) {
    /*
     * We will move this over to na_ifp once
     * the interface is fully attached.
     */
    hostna->na_private = ifp;
    ASSERT(hostna->na_ifp == NULL);
} else {
    ASSERT(hostna->na_private == NULL);
    hostna->na_ifp = devna->na_ifp;
    ifnet_incr_iorefcnt(hostna->na_ifp);
}
hostna->na_type = NA_NETIF_COMPAT_HOST;
hostna->na_free = na_netif_compat_free;
hostna->na_activate = nx_netif_host_na_activate;
hostna->na_txsync = nx_netif_host_na_txsync;
hostna->na_rxsync = nx_netif_host_na_rxsync;
hostna->na_dtor = nx_netif_compat_na_dtor;
hostna->na_krings_create = nx_netif_host_krings_create;
hostna->na_krings_delete = nx_netif_host_krings_delete;
hostna->na_special = nx_netif_host_na_special;

os_atomic_or(&hostna->na_flags, NAF_HOST_ONLY, relaxed);
*(nexus_stats_type_t *)(uintptr_t)&hostna->na_stats_type =
    NEXUS_STATS_TYPE_INVALID;

na_set_nrings(hostna, NR_TX, 1);
na_set_nrings(hostna, NR_RX, 0);
na_set_nslots(hostna, NR_TX, nxp->nxp_tx_slots);
na_set_nslots(hostna, NR_RX, 0);

na_attach_common(hostna, nx, &nx_netif_prov_s);

if ((retval = NX_DOM_PROV(nx)->nxdom_prov_mem_new(NX_DOM_PROV(nx),
    nx, hostna)) != 0) {
    ASSERT(hostna->na_arena == NULL);
    /* we've transferred the refcnt to na_ifp above */
    ifp = NULL;
    goto err;
}
ASSERT(hostna->na_arena != NULL);

*(uint32_t *)(uintptr_t)&hostna->na_flowadv_max = nxp->nxp_flowadv_max;
ASSERT(hostna->na_flowadv_max == 0 ||
    skmem_arena_nexus(hostna->na_arena)->arn_flowadv_obj != NULL);

/* these will be undone by destructor */
ifp->if_na_ops = &na_netif_compat_ops;
ifp->if_na = &devnca->nca_up;
na_retain_locked(devna);
na_retain_locked(hostna);

SKYWALK_SET_CAPABLE(ifp);

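/*
 * Bind the two adapters to the netif nexus ports reserved for them
 * (NEXUS_PORT_NET_IF_DEV and NEXUS_PORT_NET_IF_HOST).  These
 * allocations are expected not to fail for the reserved ports, hence
 * the ASSERTs rather than error handling below.
 */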
NETIF_WLOCK(nif);
nif->nif_ifp = ifp;
retval = nx_port_alloc(nx, NEXUS_PORT_NET_IF_DEV, NULL, &devna, kernproc);
ASSERT(retval == 0);
retval = nx_port_alloc(nx, NEXUS_PORT_NET_IF_HOST, NULL, &hostna, kernproc);
ASSERT(retval == 0);
NETIF_WUNLOCK(nif);

#if SK_LOG
uuid_string_t uuidstr;
SK_DF(SK_VERB_NETIF, "na_name: \"%s\"", devna->na_name);
SK_DF(SK_VERB_NETIF, " UUID: %s",
    sk_uuid_unparse(devna->na_uuid, uuidstr));
SK_DF(SK_VERB_NETIF, " nx: 0x%llx (\"%s\":\"%s\")",
    SK_KVA(devna->na_nx), NX_DOM(devna->na_nx)->nxdom_name,
    NX_DOM_PROV(devna->na_nx)->nxdom_prov_name);
SK_DF(SK_VERB_NETIF, " flags: 0x%b", devna->na_flags, NAF_BITS);
SK_DF(SK_VERB_NETIF, " flowadv_max: %u", devna->na_flowadv_max);
SK_DF(SK_VERB_NETIF, " rings: tx %u rx %u",
    na_get_nrings(devna, NR_TX), na_get_nrings(devna, NR_RX));
SK_DF(SK_VERB_NETIF, " slots: tx %u rx %u",
    na_get_nslots(devna, NR_TX), na_get_nslots(devna, NR_RX));
#if CONFIG_NEXUS_USER_PIPE
SK_DF(SK_VERB_NETIF, " next_pipe: %u", devna->na_next_pipe);
SK_DF(SK_VERB_NETIF, " max_pipes: %u", devna->na_max_pipes);
#endif /* CONFIG_NEXUS_USER_PIPE */
SK_DF(SK_VERB_NETIF, " ifp: 0x%llx %s [ioref %u]",
    SK_KVA(ifp), ifp->if_xname, ifp->if_refio);
SK_DF(SK_VERB_NETIF, "hostna: \"%s\"", hostna->na_name);
SK_DF(SK_VERB_NETIF, " UUID: %s",
    sk_uuid_unparse(hostna->na_uuid, uuidstr));
SK_DF(SK_VERB_NETIF, " nx: 0x%llx (\"%s\":\"%s\")",
    SK_KVA(hostna->na_nx), NX_DOM(hostna->na_nx)->nxdom_name,
    NX_DOM_PROV(hostna->na_nx)->nxdom_prov_name);
SK_DF(SK_VERB_NETIF, " flags: 0x%b",
    hostna->na_flags, NAF_BITS);
SK_DF(SK_VERB_NETIF, " flowadv_max: %u", hostna->na_flowadv_max);
SK_DF(SK_VERB_NETIF, " rings: tx %u rx %u",
    na_get_nrings(hostna, NR_TX), na_get_nrings(hostna, NR_RX));
SK_DF(SK_VERB_NETIF, " slots: tx %u rx %u",
    na_get_nslots(hostna, NR_TX), na_get_nslots(hostna, NR_RX));
#if CONFIG_NEXUS_USER_PIPE
SK_DF(SK_VERB_NETIF, " next_pipe: %u", hostna->na_next_pipe);
SK_DF(SK_VERB_NETIF, " max_pipes: %u", hostna->na_max_pipes);
#endif /* CONFIG_NEXUS_USER_PIPE */
SK_DF(SK_VERB_NETIF, " ifp: 0x%llx %s [ioref %u]", SK_KVA(ifp),
    ifp->if_xname, ifp->if_refio);
#endif /* SK_LOG */

err:
if (retval != 0) {
    ASSERT(ifp == NULL);
    if (devna != NULL) {
        if (devna->na_arena != NULL) {
            skmem_arena_release(devna->na_arena);
            devna->na_arena = NULL;
        }
        if (devna->na_ifp != NULL) {
            ifnet_decr_iorefcnt(devna->na_ifp);
            devna->na_ifp = NULL;
        }
        devna->na_private = NULL;
    }
    if (hostna != NULL) {
        if (hostna->na_arena != NULL) {
            skmem_arena_release(hostna->na_arena);
            hostna->na_arena = NULL;
        }
        if (hostna->na_ifp != NULL) {
            ifnet_decr_iorefcnt(hostna->na_ifp);
            hostna->na_ifp = NULL;
        }
        hostna->na_private = NULL;
    }
    if (devnca != NULL) {
        if (devnca->nca_up.nifna_netif != NULL) {
            nx_netif_release(devnca->nca_up.nifna_netif);
            devnca->nca_up.nifna_netif = NULL;
        }
        na_netif_compat_free((struct nexus_adapter *)devnca);
    }
    if (hostnca != NULL) {
        if (hostnca->nca_up.nifna_netif != NULL) {
            nx_netif_release(hostnca->nca_up.nifna_netif);
            hostnca->nca_up.nifna_netif = NULL;
        }
        na_netif_compat_free((struct nexus_adapter *)hostnca);
    }
}
return retval;
}

static void
na_netif_compat_finalize(struct nexus_netif_adapter *nifna, struct ifnet *ifp)
{
    na_netif_finalize(nifna, ifp);
}

/*
 * Intercept the RX routine in the standard device driver.
 * The second argument is TRUE to intercept, FALSE to restore
 * the original input handler.
 */
static int
nx_netif_compat_catch_rx(struct nexus_netif_compat_adapter *nca,
    boolean_t enable)
{
    struct ifnet *ifp = nca->nca_up.nifna_up.na_ifp;
    int err = 0;

    ASSERT(!(nca->nca_up.nifna_up.na_flags & NAF_HOST_ONLY));

    if (enable) {
        err = dlil_set_input_handler(ifp, nx_netif_compat_receive);
    } else {
        dlil_reset_input_handler(ifp);
    }
    return err;
}

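/*
 * Hedged usage sketch: the call site is expected to live in the
 * activate path (e.g. nx_netif_compat_na_activate(), not shown in
 * this excerpt), roughly:
 *
 *	if (nx_netif_compat_catch_rx(nca, TRUE) == 0) {
 *		// inbound mbuf chains from the driver now arrive at
 *		// nx_netif_compat_receive() instead of the default
 *		// DLIL input handler
 *	}
 */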
/*
 * Transmit routine used by nx_netif_compat_na_txsync().  Returns 0 on
 * success and non-zero on error (which covers both packet drops and
 * other failures).  len is the number of bytes in the channel buffer;
 * m is the (preallocated) mbuf to use for the transmission.
 *
 * We should add a reference to the mbuf so that the m_freem() at the
 * end of the transmission does not consume resources.
 *
 * On FreeBSD, and on multiqueue cards, we can force the queue using:
 *	if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE)
 *		i = m->m_pkthdr.flowid % adapter->num_queues;
 *	else
 *		i = curcpu % adapter->num_queues;
 */
static int
nx_netif_compat_xmit_frame(struct nexus_adapter *na, struct mbuf *m,
    struct __kern_packet *pkt)
{
    struct nexus_netif_adapter *nifna = (struct nexus_netif_adapter *)na;
    struct nx_netif *nif = nifna->nifna_netif;
    struct netif_stats *nifs = &NX_NETIF_PRIVATE(na->na_nx)->nif_stats;
    struct ifnet *ifp = na->na_ifp;
    kern_packet_t ph = SK_PTR_ENCODE(pkt, METADATA_TYPE(pkt),
        METADATA_SUBTYPE(pkt));
    uint32_t len;
    int ret = 0;

    if ((ret = mbuf_ring_cluster_activate(m)) != 0) {
        panic("Failed to activate mbuf ring cluster 0x%llx (%d)",
            SK_KVA(m), ret);
        /* NOTREACHED */
        __builtin_unreachable();
    }

    len = pkt->pkt_length;

    /*
     * The mbuf should be a cluster from our special pool,
     * so we do not need an m_copyback; a direct copy suffices.
     */
    if (m->m_ext.ext_size < len) {
        SK_RD(5, "size %u < len %u", m->m_ext.ext_size, len);
        len = m->m_ext.ext_size;
    }

    STATS_INC(nifs, NETIF_STATS_TX_COPY_MBUF);
    if (PACKET_HAS_PARTIAL_CHECKSUM(pkt)) {
        STATS_INC(nifs, NETIF_STATS_TX_COPY_SUM);
    }

    nif->nif_pkt_copy_to_mbuf(NR_TX, ph, pkt->pkt_headroom, m, 0, len,
        PACKET_HAS_PARTIAL_CHECKSUM(pkt), pkt->pkt_csum_tx_start_off);

    /* stash the ifp with the mbuf; used for tx completion notification */
    ret = mbuf_set_tx_compl_data(m, (uintptr_t)ifp, (uintptr_t)NULL);
    ASSERT(ret == 0);

    ret = dlil_output_handler(ifp, m);
    return ret;
}
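/*
 * Hedged usage sketch: nx_netif_compat_na_txsync() (not shown in this
 * excerpt) is expected to walk the TX kring and hand each slot's packet
 * to this routine together with the slot's ring-cluster mbuf, roughly:
 *
 *	// pkt and m are illustrative names for the slot's packet
 *	// and its preallocated mbuf
 *	if (nx_netif_compat_xmit_frame(na, m, pkt) != 0) {
 *		// non-zero may mean a drop or a transient driver
 *		// failure; the caller decides how to account for it
 *	}
 */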