/*
 * Copyright (c) 2015-2023 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/*
 * Copyright (C) 2012-2014 Matteo Landi, Luigi Rizzo, Giuseppe Lettieri.
 * All rights reserved.
 * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   1. Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *   2. Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in the
 *      documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include <sys/systm.h>
#include <skywalk/os_skywalk_private.h>
#include <skywalk/nexus/monitor/nx_monitor.h>
#include <skywalk/nexus/flowswitch/nx_flowswitch.h>
#include <skywalk/nexus/netif/nx_netif.h>
#include <skywalk/nexus/upipe/nx_user_pipe.h>
#include <skywalk/nexus/kpipe/nx_kernel_pipe.h>
#include <kern/thread.h>

static int na_krings_use(struct kern_channel *);
static void na_krings_unuse(struct kern_channel *);
static void na_krings_verify(struct nexus_adapter *);
static int na_notify(struct __kern_channel_ring *, struct proc *, uint32_t);
static void na_set_ring(struct nexus_adapter *, uint32_t, enum txrx, uint32_t);
static void na_set_all_rings(struct nexus_adapter *, uint32_t);
static int na_set_ringid(struct kern_channel *, ring_set_t, ring_id_t);
static void na_unset_ringid(struct kern_channel *);
static void na_teardown(struct nexus_adapter *, struct kern_channel *,
    boolean_t);

static int na_kr_create(struct nexus_adapter *, boolean_t);
static void na_kr_delete(struct nexus_adapter *);
static int na_kr_setup(struct nexus_adapter *, struct kern_channel *);
static void na_kr_teardown_all(struct nexus_adapter *, struct kern_channel *,
    boolean_t);
static void na_kr_teardown_txrx(struct nexus_adapter *, struct kern_channel *,
    boolean_t, struct proc *);
static int na_kr_populate_slots(struct __kern_channel_ring *);
static void na_kr_depopulate_slots(struct __kern_channel_ring *,
    struct kern_channel *, boolean_t defunct);

static int na_schema_alloc(struct kern_channel *);

static struct nexus_adapter *na_pseudo_alloc(zalloc_flags_t);
static void na_pseudo_free(struct nexus_adapter *);
static int na_pseudo_txsync(struct __kern_channel_ring *, struct proc *,
    uint32_t);
static int na_pseudo_rxsync(struct __kern_channel_ring *, struct proc *,
    uint32_t);
static int na_pseudo_activate(struct nexus_adapter *, na_activate_mode_t);
static void na_pseudo_dtor(struct nexus_adapter *);
static int na_pseudo_krings_create(struct nexus_adapter *,
    struct kern_channel *);
static void na_pseudo_krings_delete(struct nexus_adapter *,
    struct kern_channel *, boolean_t);
static int na_packet_pool_alloc_sync(struct __kern_channel_ring *,
    struct proc *, uint32_t);
static int na_packet_pool_alloc_large_sync(struct __kern_channel_ring *,
    struct proc *, uint32_t);
static int na_packet_pool_free_sync(struct __kern_channel_ring *,
    struct proc *, uint32_t);
static int na_packet_pool_alloc_buf_sync(struct __kern_channel_ring *,
    struct proc *, uint32_t);
static int na_packet_pool_free_buf_sync(struct __kern_channel_ring *,
    struct proc *, uint32_t);

#define NA_KRING_IDLE_TIMEOUT   (NSEC_PER_SEC * 30) /* 30 seconds */

static SKMEM_TYPE_DEFINE(na_pseudo_zone, struct nexus_adapter);

static int __na_inited = 0;

#define NA_NUM_WMM_CLASSES      4
#define NAKR_WMM_SC2RINGID(_s)  PKT_SC2TC(_s)
#define NAKR_SET_SVC_LUT(_n, _s)                                        \
    (_n)->na_kring_svc_lut[MBUF_SCIDX(_s)] = NAKR_WMM_SC2RINGID(_s)
#define NAKR_SET_KR_SVC(_n, _s)                                         \
    NAKR((_n), NR_TX)[NAKR_WMM_SC2RINGID(_s)].ckr_svc = (_s)

#define NA_UPP_ALLOC_LOWAT      8
static uint32_t na_upp_alloc_lowat = NA_UPP_ALLOC_LOWAT;

#define NA_UPP_REAP_INTERVAL    10 /* seconds */
static uint32_t na_upp_reap_interval = NA_UPP_REAP_INTERVAL;

#define NA_UPP_WS_HOLD_TIME     2 /* seconds */
static uint32_t na_upp_ws_hold_time = NA_UPP_WS_HOLD_TIME;

#define NA_UPP_REAP_MIN_PKTS    0
static uint32_t na_upp_reap_min_pkts = NA_UPP_REAP_MIN_PKTS;

#define NA_UPP_ALLOC_BUF_LOWAT     64
static uint32_t na_upp_alloc_buf_lowat = NA_UPP_ALLOC_BUF_LOWAT;

#if (DEVELOPMENT || DEBUG)
static uint64_t _na_inject_error = 0;
#define _NA_INJECT_ERROR(_en, _ev, _ec, _f, ...) \
    _SK_INJECT_ERROR(_na_inject_error, _en, _ev, _ec, NULL, _f, __VA_ARGS__)

SYSCTL_UINT(_kern_skywalk, OID_AUTO, na_upp_ws_hold_time,
    CTLFLAG_RW | CTLFLAG_LOCKED, &na_upp_ws_hold_time,
    NA_UPP_WS_HOLD_TIME, "");
SYSCTL_UINT(_kern_skywalk, OID_AUTO, na_upp_reap_interval,
    CTLFLAG_RW | CTLFLAG_LOCKED, &na_upp_reap_interval,
    NA_UPP_REAP_INTERVAL, "");
SYSCTL_UINT(_kern_skywalk, OID_AUTO, na_upp_reap_min_pkts,
    CTLFLAG_RW | CTLFLAG_LOCKED, &na_upp_reap_min_pkts,
    NA_UPP_REAP_MIN_PKTS, "");
SYSCTL_UINT(_kern_skywalk, OID_AUTO, na_upp_alloc_lowat,
    CTLFLAG_RW | CTLFLAG_LOCKED, &na_upp_alloc_lowat,
    NA_UPP_ALLOC_LOWAT, "");
SYSCTL_UINT(_kern_skywalk, OID_AUTO, na_upp_alloc_buf_lowat,
    CTLFLAG_RW | CTLFLAG_LOCKED, &na_upp_alloc_buf_lowat,
    NA_UPP_ALLOC_BUF_LOWAT, "");
SYSCTL_QUAD(_kern_skywalk, OID_AUTO, na_inject_error,
    CTLFLAG_RW | CTLFLAG_LOCKED, &_na_inject_error, "");
#else
#define _NA_INJECT_ERROR(_en, _ev, _ec, _f, ...) do { } while (0)
#endif /* !DEVELOPMENT && !DEBUG */

#define SKMEM_TAG_NX_RINGS      "com.apple.skywalk.nexus.rings"
static SKMEM_TAG_DEFINE(skmem_tag_nx_rings, SKMEM_TAG_NX_RINGS);

#define SKMEM_TAG_NX_CONTEXTS   "com.apple.skywalk.nexus.contexts"
static SKMEM_TAG_DEFINE(skmem_tag_nx_contexts, SKMEM_TAG_NX_CONTEXTS);

#define SKMEM_TAG_NX_SCRATCH    "com.apple.skywalk.nexus.scratch"
static SKMEM_TAG_DEFINE(skmem_tag_nx_scratch, SKMEM_TAG_NX_SCRATCH);

#if !XNU_TARGET_OS_OSX
/* see KLDBootstrap::readPrelinkedExtensions() for details */
extern uuid_t kernelcache_uuid;
#else /* XNU_TARGET_OS_OSX */
/* see panic_init() for details */
extern unsigned char *kernel_uuid;
#endif /* XNU_TARGET_OS_OSX */

void
na_init(void)
{
    /*
     * Changing the size of the nexus_mdata structure won't break the
     * ABI, but we need to be mindful of memory consumption; thus we
     * add a compile-time check here to make sure the size is within
     * the expected limit and that it's properly aligned.  This check
     * may be adjusted in the future as needed.
     */
    _CASSERT(sizeof(struct nexus_mdata) <= 32 &&
        IS_P2ALIGNED(sizeof(struct nexus_mdata), 8));
    _CASSERT(sizeof(struct nexus_mdata) <= sizeof(struct __user_quantum));

    /* see comments on nexus_meta_type_t */
    _CASSERT(NEXUS_META_TYPE_MAX == 3);
    _CASSERT(NEXUS_META_SUBTYPE_MAX == 3);

    ASSERT(!__na_inited);

    __na_inited = 1;
}

void
na_fini(void)
{
    if (__na_inited) {
        __na_inited = 0;
    }
}

/*
 * Interpret the ringid of a chreq, by translating it into a pair
 * of intervals of ring indices:
 *
 * [txfirst, txlast) and [rxfirst, rxlast)
 */
int
na_interp_ringid(struct nexus_adapter *na, ring_id_t ring_id,
    ring_set_t ring_set, uint32_t first[NR_TXRX], uint32_t last[NR_TXRX])
{
    enum txrx t;

    switch (ring_set) {
    case RING_SET_ALL:
        /*
         * Ring pair eligibility: all ring(s).
         */
        if (ring_id != CHANNEL_RING_ID_ANY &&
            ring_id >= na_get_nrings(na, NR_TX) &&
            ring_id >= na_get_nrings(na, NR_RX)) {
            SK_ERR("\"%s\": invalid ring_id %d for ring_set %u",
                na->na_name, (int)ring_id, ring_set);
            return EINVAL;
        }
        for_rx_tx(t) {
            if (ring_id == CHANNEL_RING_ID_ANY) {
                first[t] = 0;
                last[t] = na_get_nrings(na, t);
            } else {
                first[t] = ring_id;
                last[t] = ring_id + 1;
            }
        }
        break;

    default:
        SK_ERR("\"%s\": invalid ring_set %u", na->na_name, ring_set);
        return EINVAL;
    }

    SK_DF(SK_VERB_NA | SK_VERB_RING,
        "\"%s\": ring_id %d, ring_set %u tx [%u,%u) rx [%u,%u)",
        na->na_name, (int)ring_id, ring_set, first[NR_TX], last[NR_TX],
        first[NR_RX], last[NR_RX]);

    return 0;
}
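
/*
 * Worked example (editorial note, not part of the original source):
 * for an adapter with 4 TX and 4 RX rings, na_interp_ringid() yields:
 *
 *     ring_id CHANNEL_RING_ID_ANY:  tx [0,4)  rx [0,4)  (all rings)
 *     ring_id 2:                    tx [2,3)  rx [2,3)  (single pair)
 *
 * A specific ring_id is rejected with EINVAL only when it is out of
 * range for both the TX and the RX direction.
 */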

/*
 * Set the ring ID. For devices with a single queue, a request
 * for all rings is the same as a single ring.
 */
static int
na_set_ringid(struct kern_channel *ch, ring_set_t ring_set, ring_id_t ring_id)
{
    struct nexus_adapter *na = ch->ch_na;
    int error;
    enum txrx t;
    uint32_t n_alloc_rings;

    if ((error = na_interp_ringid(na, ring_id, ring_set,
        ch->ch_first, ch->ch_last)) != 0) {
        return error;
    }

    n_alloc_rings = na_get_nrings(na, NR_A);
    if (n_alloc_rings != 0) {
        uint32_t n_large_alloc_rings;

        ch->ch_first[NR_A] = ch->ch_first[NR_F] = 0;
        ch->ch_last[NR_A] = ch->ch_last[NR_F] =
            ch->ch_first[NR_A] + n_alloc_rings;

        n_large_alloc_rings = na_get_nrings(na, NR_LBA);
        ch->ch_first[NR_LBA] = 0;
        ch->ch_last[NR_LBA] = ch->ch_first[NR_LBA] + n_large_alloc_rings;
    } else {
        ch->ch_first[NR_A] = ch->ch_last[NR_A] = 0;
        ch->ch_first[NR_F] = ch->ch_last[NR_F] = 0;
        ch->ch_first[NR_LBA] = ch->ch_last[NR_LBA] = 0;
    }
    ch->ch_first[NR_EV] = 0;
    ch->ch_last[NR_EV] = ch->ch_first[NR_EV] + na_get_nrings(na, NR_EV);

    /* XXX: should we initialize na_si_users for the event ring? */

    /*
     * Optimization: count the users registered for more than
     * one ring, which are the ones sleeping on the global queue.
     * The default na_notify() callback will then avoid signaling
     * the global queue if nobody is using it.
     */
    for_rx_tx(t) {
        if (ch_is_multiplex(ch, t)) {
            na->na_si_users[t]++;
            ASSERT(na->na_si_users[t] != 0);
        }
    }
    return 0;
}
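
/*
 * Worked example (editorial note): unlike TX/RX, the allocator, free,
 * large-buffer-allocator and event ring intervals are never narrowed
 * by the requested ring_id; a channel always sees all of them.  With
 * two allocator ring pairs and one event ring, the code above yields:
 *
 *     ch_first[NR_A] = 0,  ch_last[NR_A] = 2
 *     ch_first[NR_F] = 0,  ch_last[NR_F] = 2
 *     ch_first[NR_EV] = 0, ch_last[NR_EV] = 1
 */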

static void
na_unset_ringid(struct kern_channel *ch)
{
    struct nexus_adapter *na = ch->ch_na;
    enum txrx t;

    for_rx_tx(t) {
        if (ch_is_multiplex(ch, t)) {
            ASSERT(na->na_si_users[t] != 0);
            na->na_si_users[t]--;
        }
        ch->ch_first[t] = ch->ch_last[t] = 0;
    }
}

/*
 * Check that the rings we want to bind are not exclusively owned by a
 * previous bind.  If exclusive ownership has been requested, we also
 * mark the rings.
 */
/* Hoisted out of line to reduce kernel stack footprint */
SK_NO_INLINE_ATTRIBUTE
static int
na_krings_use(struct kern_channel *ch)
{
    struct nexus_adapter *na = ch->ch_na;
    struct __kern_channel_ring *kring;
    boolean_t excl = !!(ch->ch_flags & CHANF_EXCLUSIVE);
    enum txrx t;
    uint32_t i;

    SK_DF(SK_VERB_NA | SK_VERB_RING, "na \"%s\" (0x%llx) grabbing tx [%u,%u) rx [%u,%u)",
        na->na_name, SK_KVA(na), ch->ch_first[NR_TX], ch->ch_last[NR_TX],
        ch->ch_first[NR_RX], ch->ch_last[NR_RX]);

    /*
     * First round: check that none of the requested rings is
     * already exclusively owned, and that none is already in
     * use when we want exclusive ownership.
     */
    for_all_rings(t) {
        for (i = ch->ch_first[t]; i < ch->ch_last[t]; i++) {
            kring = &NAKR(na, t)[i];
            if ((kring->ckr_flags & CKRF_EXCLUSIVE) ||
                (kring->ckr_users && excl)) {
                SK_DF(SK_VERB_NA | SK_VERB_RING,
                    "kr \"%s\" (0x%llx) krflags 0x%b is busy",
                    kring->ckr_name, SK_KVA(kring),
                    kring->ckr_flags, CKRF_BITS);
                return EBUSY;
            }
        }
    }

    /*
     * Second round: increment usage count and possibly
     * mark as exclusive.
     */
    for_all_rings(t) {
        for (i = ch->ch_first[t]; i < ch->ch_last[t]; i++) {
            kring = &NAKR(na, t)[i];
            kring->ckr_users++;
            if (excl) {
                kring->ckr_flags |= CKRF_EXCLUSIVE;
            }
        }
    }

    return 0;
}
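
/*
 * Example (editorial note): if channel A has already bound ring 0
 * without CHANF_EXCLUSIVE (ckr_users == 1) and channel B now requests
 * exclusive ownership of the same ring, the first pass above fails
 * with EBUSY before any counter has been touched; the two-pass
 * structure means a rejected bind never has to undo partial state.
 */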

/* Hoisted out of line to reduce kernel stack footprint */
SK_NO_INLINE_ATTRIBUTE
static void
na_krings_unuse(struct kern_channel *ch)
{
    struct nexus_adapter *na = ch->ch_na;
    struct __kern_channel_ring *kring;
    boolean_t excl = !!(ch->ch_flags & CHANF_EXCLUSIVE);
    enum txrx t;
    uint32_t i;

    SK_DF(SK_VERB_NA | SK_VERB_RING,
        "na \"%s\" (0x%llx) releasing tx [%u, %u) rx [%u, %u)",
        na->na_name, SK_KVA(na), ch->ch_first[NR_TX], ch->ch_last[NR_TX],
        ch->ch_first[NR_RX], ch->ch_last[NR_RX]);

    for_all_rings(t) {
        for (i = ch->ch_first[t]; i < ch->ch_last[t]; i++) {
            kring = &NAKR(na, t)[i];
            if (excl) {
                kring->ckr_flags &= ~CKRF_EXCLUSIVE;
            }
            kring->ckr_users--;
        }
    }
}

/* Hoisted out of line to reduce kernel stack footprint */
SK_NO_INLINE_ATTRIBUTE
static void
na_krings_verify(struct nexus_adapter *na)
{
    struct __kern_channel_ring *kring;
    enum txrx t;
    uint32_t i;

    for_all_rings(t) {
        for (i = 0; i < na_get_nrings(na, t); i++) {
            kring = &NAKR(na, t)[i];
            /* na_kr_create() validations */
            ASSERT(kring->ckr_num_slots > 0);
            ASSERT(kring->ckr_lim == (kring->ckr_num_slots - 1));
            ASSERT(kring->ckr_pp != NULL);

            if (!(kring->ckr_flags & CKRF_MEM_RING_INITED)) {
                continue;
            }
            /* na_kr_setup() validations */
            if (KR_KERNEL_ONLY(kring)) {
                ASSERT(kring->ckr_ring == NULL);
            } else {
                ASSERT(kring->ckr_ring != NULL);
            }
            ASSERT(kring->ckr_ksds_last ==
                &kring->ckr_ksds[kring->ckr_lim]);
        }
    }
}

int
na_bind_channel(struct nexus_adapter *na, struct kern_channel *ch,
    struct chreq *chr)
{
    struct kern_pbufpool *rx_pp = skmem_arena_nexus(na->na_arena)->arn_rx_pp;
    struct kern_pbufpool *tx_pp = skmem_arena_nexus(na->na_arena)->arn_tx_pp;
    uint32_t ch_mode = chr->cr_mode;
    int err = 0;

    SK_LOCK_ASSERT_HELD();
    ASSERT(ch->ch_schema == NULL);
    ASSERT(ch->ch_na == NULL);

    /* ring configuration may have changed, fetch from the card */
    na_update_config(na);
    ch->ch_na = na; /* store the reference */
    err = na_set_ringid(ch, chr->cr_ring_set, chr->cr_ring_id);
    if (err != 0) {
        goto err;
    }

    os_atomic_andnot(&ch->ch_flags, (CHANF_RXONLY | CHANF_EXCLUSIVE |
        CHANF_USER_PACKET_POOL | CHANF_EVENT_RING), relaxed);
    if (ch_mode & CHMODE_EXCLUSIVE) {
        os_atomic_or(&ch->ch_flags, CHANF_EXCLUSIVE, relaxed);
    }
    /*
     * Disallow automatic sync for monitor mode, since the TX
     * direction is disabled.
     */
    if (ch_mode & CHMODE_MONITOR) {
        os_atomic_or(&ch->ch_flags, CHANF_RXONLY, relaxed);
    }

    if (!!(na->na_flags & NAF_USER_PKT_POOL) ^
        !!(ch_mode & CHMODE_USER_PACKET_POOL)) {
        SK_ERR("incompatible channel mode (0x%b), na_flags (0x%b)",
            ch_mode, CHMODE_BITS, na->na_flags, NAF_BITS);
        err = EINVAL;
        goto err;
    }

    if (na->na_arena->ar_flags & ARF_DEFUNCT) {
        err = ENXIO;
        goto err;
    }

    if (ch_mode & CHMODE_USER_PACKET_POOL) {
        ASSERT(na->na_flags & NAF_USER_PKT_POOL);
        ASSERT(ch->ch_first[NR_A] != ch->ch_last[NR_A]);
        ASSERT(ch->ch_first[NR_F] != ch->ch_last[NR_F]);
        os_atomic_or(&ch->ch_flags, CHANF_USER_PACKET_POOL, relaxed);
    }

    if (ch_mode & CHMODE_EVENT_RING) {
        ASSERT(na->na_flags & NAF_USER_PKT_POOL);
        ASSERT(na->na_flags & NAF_EVENT_RING);
        ASSERT(ch->ch_first[NR_EV] != ch->ch_last[NR_EV]);
        os_atomic_or(&ch->ch_flags, CHANF_EVENT_RING, relaxed);
    }

    /*
     * If this is the first channel of the adapter, create
     * the rings and their in-kernel view, the krings.
     */
    if (na->na_channels == 0) {
        err = na->na_krings_create(na, ch);
        if (err != 0) {
            goto err;
        }

        /*
         * Sanity check; this is already done in na_kr_create(),
         * but we do it here as well to validate na_kr_setup().
         */
        na_krings_verify(na);
        *(nexus_meta_type_t *)(uintptr_t)&na->na_md_type =
            skmem_arena_nexus(na->na_arena)->arn_rx_pp->pp_md_type;
        *(nexus_meta_subtype_t *)(uintptr_t)&na->na_md_subtype =
            skmem_arena_nexus(na->na_arena)->arn_rx_pp->pp_md_subtype;
    }

    /*
     * Validate ownership and usability of the krings; take into account
     * whether some previous bind has exclusive ownership on them.
     */
    err = na_krings_use(ch);
    if (err != 0) {
        goto err_del_rings;
    }

    /* for a user-facing channel, create a new channel schema */
    if (!(ch->ch_flags & CHANF_KERNEL)) {
        err = na_schema_alloc(ch);
        if (err != 0) {
            goto err_rel_excl;
        }

        ASSERT(ch->ch_schema != NULL);
        ASSERT(ch->ch_schema_offset != (mach_vm_offset_t)-1);
    } else {
        ASSERT(ch->ch_schema == NULL);
        ch->ch_schema_offset = (mach_vm_offset_t)-1;
    }

    /* update our work timestamp */
    na->na_work_ts = net_uptime();

    na->na_channels++;

    /*
     * If a user packet pool is desired, initialize the allocated
     * object hash table in the pool, if not already done.  This also
     * retains a refcnt on the pool which the caller must release.
     */
    ASSERT(ch->ch_pp == NULL);
    if (ch_mode & CHMODE_USER_PACKET_POOL) {
#pragma unused(tx_pp)
        ASSERT(rx_pp == tx_pp);
        err = pp_init_upp(rx_pp, TRUE);
        if (err != 0) {
            goto err_free_schema;
        }
        ch->ch_pp = rx_pp;
    }

    if (!NA_IS_ACTIVE(na)) {
        err = na->na_activate(na, NA_ACTIVATE_MODE_ON);
        if (err != 0) {
            goto err_release_pp;
        }

        SK_D("activated \"%s\" adapter 0x%llx", na->na_name,
            SK_KVA(na));
        SK_D("  na_md_type:    %u", na->na_md_type);
        SK_D("  na_md_subtype: %u", na->na_md_subtype);
    }

    SK_D("ch 0x%llx", SK_KVA(ch));
    SK_D("  ch_flags:     0x%b", ch->ch_flags, CHANF_BITS);
    if (ch->ch_schema != NULL) {
        SK_D("  ch_schema:    0x%llx", SK_KVA(ch->ch_schema));
    }
    SK_D("  ch_na:        0x%llx (chcnt %u)", SK_KVA(ch->ch_na),
        ch->ch_na->na_channels);
    SK_D("  ch_tx_rings:  [%u,%u)", ch->ch_first[NR_TX],
        ch->ch_last[NR_TX]);
    SK_D("  ch_rx_rings:  [%u,%u)", ch->ch_first[NR_RX],
        ch->ch_last[NR_RX]);
    SK_D("  ch_alloc_rings:  [%u,%u)", ch->ch_first[NR_A],
        ch->ch_last[NR_A]);
    SK_D("  ch_free_rings:  [%u,%u)", ch->ch_first[NR_F],
        ch->ch_last[NR_F]);
    SK_D("  ch_ev_rings:  [%u,%u)", ch->ch_first[NR_EV],
        ch->ch_last[NR_EV]);

    return 0;

err_release_pp:
    if (ch_mode & CHMODE_USER_PACKET_POOL) {
        ASSERT(ch->ch_pp != NULL);
        pp_release(rx_pp);
        ch->ch_pp = NULL;
    }
err_free_schema:
    *(nexus_meta_type_t *)(uintptr_t)&na->na_md_type =
        NEXUS_META_TYPE_INVALID;
    *(nexus_meta_subtype_t *)(uintptr_t)&na->na_md_subtype =
        NEXUS_META_SUBTYPE_INVALID;
    ASSERT(na->na_channels != 0);
    na->na_channels--;
    if (ch->ch_schema != NULL) {
        skmem_cache_free(
            skmem_arena_nexus(na->na_arena)->arn_schema_cache,
            ch->ch_schema);
        ch->ch_schema = NULL;
        ch->ch_schema_offset = (mach_vm_offset_t)-1;
    }
err_rel_excl:
    na_krings_unuse(ch);
err_del_rings:
    if (na->na_channels == 0) {
        na->na_krings_delete(na, ch, FALSE);
    }
err:
    ch->ch_na = NULL;
    ASSERT(err != 0);

    return err;
}
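
/*
 * Editorial note: the error labels above unwind in the reverse order
 * of the setup performed by na_bind_channel(), falling through from
 * one label to the next:
 *
 *     na->na_krings_create() fails  -> err
 *     na_krings_use() fails         -> err_del_rings
 *     na_schema_alloc() fails       -> err_rel_excl
 *     pp_init_upp() fails           -> err_free_schema
 *     na->na_activate() fails       -> err_release_pp
 */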

/*
 * Undo everything that was done in na_bind_channel().
 */
/* call with SK_LOCK held */
void
na_unbind_channel(struct kern_channel *ch)
{
    struct nexus_adapter *na = ch->ch_na;

    SK_LOCK_ASSERT_HELD();

    ASSERT(na->na_channels != 0);
    na->na_channels--;

    /* release exclusive use if it was requested at bind time */
    na_krings_unuse(ch);

    if (na->na_channels == 0) {     /* last instance */
        SK_D("%s(%d): deleting last channel instance for %s",
            ch->ch_name, ch->ch_pid, na->na_name);

        /*
         * Free any remaining allocated packets attached to
         * the slots, followed by a teardown of the arena.
         */
        na_teardown(na, ch, FALSE);

        *(nexus_meta_type_t *)(uintptr_t)&na->na_md_type =
            NEXUS_META_TYPE_INVALID;
        *(nexus_meta_subtype_t *)(uintptr_t)&na->na_md_subtype =
            NEXUS_META_SUBTYPE_INVALID;
    } else {
        SK_D("%s(%d): %s has %u remaining channel instance(s)",
            ch->ch_name, ch->ch_pid, na->na_name, na->na_channels);
    }

    /*
     * Free any allocated packets (for the process) attached to the slots;
     * note that na_teardown() could have done this there as well.
     */
    if (ch->ch_pp != NULL) {
        ASSERT(ch->ch_flags & CHANF_USER_PACKET_POOL);
        pp_purge_upp(ch->ch_pp, ch->ch_pid);
        pp_release(ch->ch_pp);
        ch->ch_pp = NULL;
    }

    /* possibly decrement the count of tx_si/rx_si users */
    na_unset_ringid(ch);

    /* reap the caches now (purge if the adapter is idle) */
    skmem_arena_reap(na->na_arena, (na->na_channels == 0));

    /* delete the csm */
    if (ch->ch_schema != NULL) {
        skmem_cache_free(
            skmem_arena_nexus(na->na_arena)->arn_schema_cache,
            ch->ch_schema);
        ch->ch_schema = NULL;
        ch->ch_schema_offset = (mach_vm_offset_t)-1;
    }

    /* destroy the memory map */
    skmem_arena_munmap_channel(na->na_arena, ch);

    /* mark the channel as unbound */
    os_atomic_andnot(&ch->ch_flags, (CHANF_RXONLY | CHANF_EXCLUSIVE), relaxed);
    ch->ch_na = NULL;

    /* and finally release the nexus adapter; this might free it */
    (void) na_release_locked(na);
}

static void
na_teardown(struct nexus_adapter *na, struct kern_channel *ch,
    boolean_t defunct)
{
    SK_LOCK_ASSERT_HELD();
    LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);

#if CONFIG_NEXUS_MONITOR
    /*
     * Walk through all the rings and tell any monitor
     * that the port is going to exit Skywalk mode.
     */
    nx_mon_stop(na);
#endif /* CONFIG_NEXUS_MONITOR */

    /*
     * Deactivate the adapter.
     */
    (void) na->na_activate(na,
        (defunct ? NA_ACTIVATE_MODE_DEFUNCT : NA_ACTIVATE_MODE_OFF));

    /*
     * Free any remaining allocated packets for this process.
     */
    if (ch->ch_pp != NULL) {
        ASSERT(ch->ch_flags & CHANF_USER_PACKET_POOL);
        pp_purge_upp(ch->ch_pp, ch->ch_pid);
        if (!defunct) {
            pp_release(ch->ch_pp);
            ch->ch_pp = NULL;
        }
    }

    /*
     * Delete rings and buffers.
     */
    na->na_krings_delete(na, ch, defunct);
}

/* call with SK_LOCK held */
/*
 * Allocate the per-fd structure __user_channel_schema.
 */
static int
na_schema_alloc(struct kern_channel *ch)
{
    struct nexus_adapter *na = ch->ch_na;
    struct skmem_arena *ar = na->na_arena;
    struct skmem_arena_nexus *arn;
    mach_vm_offset_t roff[SKMEM_REGIONS];
    struct __kern_channel_ring *kr;
    struct __user_channel_schema *csm;
    struct skmem_obj_info csm_oi, ring_oi, ksd_oi, usd_oi;
    mach_vm_offset_t base;
    uint32_t i, j, k, n[NR_ALL];
    enum txrx t;

    /* see comments for struct __user_channel_schema */
    _CASSERT(offsetof(struct __user_channel_schema, csm_ver) == 0);
    _CASSERT(offsetof(struct __user_channel_schema, csm_flags) ==
        sizeof(csm->csm_ver));
    _CASSERT(offsetof(struct __user_channel_schema, csm_kern_name) ==
        sizeof(csm->csm_ver) + sizeof(csm->csm_flags));
    _CASSERT(offsetof(struct __user_channel_schema, csm_kern_uuid) ==
        sizeof(csm->csm_ver) + sizeof(csm->csm_flags) +
        sizeof(csm->csm_kern_name));

    SK_LOCK_ASSERT_HELD();

    ASSERT(!(ch->ch_flags & CHANF_KERNEL));
    ASSERT(ar->ar_type == SKMEM_ARENA_TYPE_NEXUS);
    arn = skmem_arena_nexus(ar);
    ASSERT(arn != NULL);
    for_all_rings(t) {
        n[t] = 0;
    }

    csm = skmem_cache_alloc(arn->arn_schema_cache, SKMEM_NOSLEEP);
    if (csm == NULL) {
        return ENOMEM;
    }

    skmem_cache_get_obj_info(arn->arn_schema_cache, csm, &csm_oi, NULL);
    bzero(csm, SKMEM_OBJ_SIZE(&csm_oi));

    *(uint32_t *)(uintptr_t)&csm->csm_ver = CSM_CURRENT_VERSION;

    /* kernel version and executable UUID */
    _CASSERT(sizeof(csm->csm_kern_name) == _SYS_NAMELEN);
    (void) strncpy((char *)(uintptr_t)csm->csm_kern_name,
        version, sizeof(csm->csm_kern_name) - 1);
#if !XNU_TARGET_OS_OSX
    (void) memcpy((void *)(uintptr_t)csm->csm_kern_uuid,
        kernelcache_uuid, sizeof(csm->csm_kern_uuid));
#else /* XNU_TARGET_OS_OSX */
    if (kernel_uuid != NULL) {
        (void) memcpy((void *)(uintptr_t)csm->csm_kern_uuid,
            kernel_uuid, sizeof(csm->csm_kern_uuid));
    }
#endif /* XNU_TARGET_OS_OSX */

    for_rx_tx(t) {
        ASSERT((ch->ch_last[t] > 0) || (ch->ch_first[t] == 0));
        n[t] = ch->ch_last[t] - ch->ch_first[t];
        ASSERT(n[t] == 0 || n[t] <= na_get_nrings(na, t));
    }

    /* return total number of tx and rx rings for this channel */
    *(uint32_t *)(uintptr_t)&csm->csm_tx_rings = n[NR_TX];
    *(uint32_t *)(uintptr_t)&csm->csm_rx_rings = n[NR_RX];

    if (ch->ch_flags & CHANF_USER_PACKET_POOL) {
        *(uint32_t *)(uintptr_t)&csm->csm_allocator_ring_pairs =
            na->na_num_allocator_ring_pairs;
        n[NR_A] = n[NR_F] = na->na_num_allocator_ring_pairs;
        ASSERT(n[NR_A] != 0 && n[NR_A] <= na_get_nrings(na, NR_A));
        ASSERT(n[NR_A] == (ch->ch_last[NR_A] - ch->ch_first[NR_A]));
        ASSERT(n[NR_F] == (ch->ch_last[NR_F] - ch->ch_first[NR_F]));

        n[NR_LBA] = na->na_num_large_buf_alloc_rings;
        if (n[NR_LBA] != 0) {
            *(uint32_t *)(uintptr_t)&csm->csm_large_buf_alloc_rings = n[NR_LBA];
            ASSERT(n[NR_LBA] == (ch->ch_last[NR_LBA] - ch->ch_first[NR_LBA]));
        }
    }

    if (ch->ch_flags & CHANF_EVENT_RING) {
        n[NR_EV] = ch->ch_last[NR_EV] - ch->ch_first[NR_EV];
        ASSERT(n[NR_EV] != 0 && n[NR_EV] <= na_get_nrings(na, NR_EV));
        *(uint32_t *)(uintptr_t)&csm->csm_num_event_rings = n[NR_EV];
    }

    bzero(&roff, sizeof(roff));
    for (i = 0; i < SKMEM_REGIONS; i++) {
        if (ar->ar_regions[i] == NULL) {
            ASSERT(i == SKMEM_REGION_GUARD_HEAD ||
                i == SKMEM_REGION_SCHEMA ||
                i == SKMEM_REGION_BUF_LARGE ||
                i == SKMEM_REGION_RXBUF_DEF ||
                i == SKMEM_REGION_RXBUF_LARGE ||
                i == SKMEM_REGION_TXBUF_DEF ||
                i == SKMEM_REGION_TXBUF_LARGE ||
                i == SKMEM_REGION_RXKMD ||
                i == SKMEM_REGION_TXKMD ||
                i == SKMEM_REGION_UMD ||
                i == SKMEM_REGION_UBFT ||
                i == SKMEM_REGION_KBFT ||
                i == SKMEM_REGION_RXKBFT ||
                i == SKMEM_REGION_TXKBFT ||
                i == SKMEM_REGION_TXAUSD ||
                i == SKMEM_REGION_RXFUSD ||
                i == SKMEM_REGION_USTATS ||
                i == SKMEM_REGION_KSTATS ||
                i == SKMEM_REGION_INTRINSIC ||
                i == SKMEM_REGION_FLOWADV ||
                i == SKMEM_REGION_NEXUSADV ||
                i == SKMEM_REGION_SYSCTLS ||
                i == SKMEM_REGION_GUARD_TAIL);
            continue;
        }

        /* not for nexus */
        ASSERT(i != SKMEM_REGION_SYSCTLS);

        /*
         * Get region offsets from base of mmap span; the arena
         * doesn't need to be mmap'd at this point, since we
         * simply compute the relative offset.
         */
        roff[i] = skmem_arena_get_region_offset(ar, i);
    }

    /*
     * The schema is made up of the descriptor followed inline by an array
     * of offsets to the tx, rx, allocator and event rings in the mmap span.
     * They contain the offset between the ring and schema, so the
     * information is usable in userspace to reach the ring from
     * the schema.
     */
    base = roff[SKMEM_REGION_SCHEMA] + SKMEM_OBJ_ROFF(&csm_oi);

    /* initialize schema with tx ring info */
    for (i = 0, j = ch->ch_first[NR_TX]; i < n[NR_TX]; i++, j++) {
        kr = &na->na_tx_rings[j];
        if (KR_KERNEL_ONLY(kr)) { /* skip kernel-only rings */
            continue;
        }

        ASSERT(kr->ckr_flags & CKRF_MEM_RING_INITED);
        skmem_cache_get_obj_info(arn->arn_ring_cache,
            kr->ckr_ring, &ring_oi, NULL);
        *(mach_vm_offset_t *)(uintptr_t)&csm->csm_ring_ofs[i].ring_off =
            (roff[SKMEM_REGION_RING] + SKMEM_OBJ_ROFF(&ring_oi)) - base;

        ASSERT(kr->ckr_flags & CKRF_MEM_SD_INITED);
        skmem_cache_get_obj_info(kr->ckr_ksds_cache,
            kr->ckr_ksds, &ksd_oi, &usd_oi);

        *(mach_vm_offset_t *)(uintptr_t)&csm->csm_ring_ofs[i].sd_off =
            (roff[SKMEM_REGION_TXAUSD] + SKMEM_OBJ_ROFF(&usd_oi)) -
            base;
    }
    /* initialize schema with rx ring info */
    for (i = 0, j = ch->ch_first[NR_RX]; i < n[NR_RX]; i++, j++) {
        kr = &na->na_rx_rings[j];
        if (KR_KERNEL_ONLY(kr)) { /* skip kernel-only rings */
            continue;
        }

        ASSERT(kr->ckr_flags & CKRF_MEM_RING_INITED);
        skmem_cache_get_obj_info(arn->arn_ring_cache,
            kr->ckr_ring, &ring_oi, NULL);
        *(mach_vm_offset_t *)
        (uintptr_t)&csm->csm_ring_ofs[i + n[NR_TX]].ring_off =
            (roff[SKMEM_REGION_RING] + SKMEM_OBJ_ROFF(&ring_oi)) - base;

        ASSERT(kr->ckr_flags & CKRF_MEM_SD_INITED);
        skmem_cache_get_obj_info(kr->ckr_ksds_cache,
            kr->ckr_ksds, &ksd_oi, &usd_oi);

        *(mach_vm_offset_t *)
        (uintptr_t)&csm->csm_ring_ofs[i + n[NR_TX]].sd_off =
            (roff[SKMEM_REGION_RXFUSD] + SKMEM_OBJ_ROFF(&usd_oi)) -
            base;
    }
    /* initialize schema with allocator ring info */
    for (i = 0, j = ch->ch_first[NR_A], k = n[NR_TX] + n[NR_RX];
        i < n[NR_A]; i++, j++) {
        mach_vm_offset_t usd_roff;

        usd_roff = roff[SKMEM_REGION_TXAUSD];
        kr = &na->na_alloc_rings[j];
        ASSERT(kr->ckr_flags & CKRF_MEM_RING_INITED);
        ASSERT(kr->ckr_flags & CKRF_MEM_SD_INITED);

        skmem_cache_get_obj_info(arn->arn_ring_cache, kr->ckr_ring,
            &ring_oi, NULL);
        *(mach_vm_offset_t *)
        (uintptr_t)&csm->csm_ring_ofs[i + k].ring_off =
            (roff[SKMEM_REGION_RING] + SKMEM_OBJ_ROFF(&ring_oi)) - base;

        skmem_cache_get_obj_info(kr->ckr_ksds_cache, kr->ckr_ksds,
            &ksd_oi, &usd_oi);
        *(mach_vm_offset_t *)
        (uintptr_t)&csm->csm_ring_ofs[i + k].sd_off =
            (usd_roff + SKMEM_OBJ_ROFF(&usd_oi)) - base;
    }
    /* initialize schema with free ring info */
    for (i = 0, j = ch->ch_first[NR_F], k = n[NR_TX] + n[NR_RX] + n[NR_A];
        i < n[NR_F]; i++, j++) {
        mach_vm_offset_t usd_roff;

        usd_roff = roff[SKMEM_REGION_RXFUSD];
        kr = &na->na_free_rings[j];
        ASSERT(kr->ckr_flags & CKRF_MEM_RING_INITED);
        ASSERT(kr->ckr_flags & CKRF_MEM_SD_INITED);

        skmem_cache_get_obj_info(arn->arn_ring_cache, kr->ckr_ring,
            &ring_oi, NULL);
        *(mach_vm_offset_t *)
        (uintptr_t)&csm->csm_ring_ofs[i + k].ring_off =
            (roff[SKMEM_REGION_RING] + SKMEM_OBJ_ROFF(&ring_oi)) - base;

        skmem_cache_get_obj_info(kr->ckr_ksds_cache, kr->ckr_ksds,
            &ksd_oi, &usd_oi);
        *(mach_vm_offset_t *)
        (uintptr_t)&csm->csm_ring_ofs[i + k].sd_off =
            (usd_roff + SKMEM_OBJ_ROFF(&usd_oi)) - base;
    }
    /* initialize schema with event ring info */
    for (i = 0, j = ch->ch_first[NR_EV], k = n[NR_TX] + n[NR_RX] +
        n[NR_A] + n[NR_F]; i < n[NR_EV]; i++, j++) {
        ASSERT(csm->csm_num_event_rings != 0);
        kr = &na->na_event_rings[j];
        ASSERT(!KR_KERNEL_ONLY(kr));
        ASSERT(kr->ckr_flags & CKRF_MEM_RING_INITED);
        skmem_cache_get_obj_info(arn->arn_ring_cache,
            kr->ckr_ring, &ring_oi, NULL);
        *(mach_vm_offset_t *)
        (uintptr_t)&csm->csm_ring_ofs[i + k].ring_off =
            (roff[SKMEM_REGION_RING] + SKMEM_OBJ_ROFF(&ring_oi)) - base;

        ASSERT(kr->ckr_flags & CKRF_MEM_SD_INITED);
        skmem_cache_get_obj_info(kr->ckr_ksds_cache,
            kr->ckr_ksds, &ksd_oi, &usd_oi);

        *(mach_vm_offset_t *)
        (uintptr_t)&csm->csm_ring_ofs[i + k].sd_off =
            (roff[SKMEM_REGION_TXAUSD] + SKMEM_OBJ_ROFF(&usd_oi)) -
            base;
    }
    /* initialize schema with large buf alloc ring info */
    for (i = 0, j = ch->ch_first[NR_LBA], k = n[NR_TX] + n[NR_RX] +
        n[NR_A] + n[NR_F] + n[NR_EV]; i < n[NR_LBA]; i++, j++) {
        ASSERT(csm->csm_large_buf_alloc_rings != 0);
        kr = &na->na_large_buf_alloc_rings[j];
        ASSERT(!KR_KERNEL_ONLY(kr));
        ASSERT(kr->ckr_flags & CKRF_MEM_RING_INITED);
        skmem_cache_get_obj_info(arn->arn_ring_cache,
            kr->ckr_ring, &ring_oi, NULL);
        *(mach_vm_offset_t *)
        (uintptr_t)&csm->csm_ring_ofs[i + k].ring_off =
            (roff[SKMEM_REGION_RING] + SKMEM_OBJ_ROFF(&ring_oi)) - base;

        ASSERT(kr->ckr_flags & CKRF_MEM_SD_INITED);
        skmem_cache_get_obj_info(kr->ckr_ksds_cache,
            kr->ckr_ksds, &ksd_oi, &usd_oi);

        *(mach_vm_offset_t *)
        (uintptr_t)&csm->csm_ring_ofs[i + k].sd_off =
            (roff[SKMEM_REGION_TXAUSD] + SKMEM_OBJ_ROFF(&usd_oi)) -
            base;
    }

    *(uint64_t *)(uintptr_t)&csm->csm_md_redzone_cookie =
        __ch_umd_redzone_cookie;
    *(nexus_meta_type_t *)(uintptr_t)&csm->csm_md_type = na->na_md_type;
    *(nexus_meta_subtype_t *)(uintptr_t)&csm->csm_md_subtype =
        na->na_md_subtype;

    if (arn->arn_stats_obj != NULL) {
        ASSERT(ar->ar_regions[SKMEM_REGION_USTATS] != NULL);
        ASSERT(roff[SKMEM_REGION_USTATS] != 0);
        *(mach_vm_offset_t *)(uintptr_t)&csm->csm_stats_ofs =
            roff[SKMEM_REGION_USTATS];
        *(nexus_stats_type_t *)(uintptr_t)&csm->csm_stats_type =
            na->na_stats_type;
    } else {
        ASSERT(ar->ar_regions[SKMEM_REGION_USTATS] == NULL);
        *(mach_vm_offset_t *)(uintptr_t)&csm->csm_stats_ofs = 0;
        *(nexus_stats_type_t *)(uintptr_t)&csm->csm_stats_type =
            NEXUS_STATS_TYPE_INVALID;
    }

    if (arn->arn_flowadv_obj != NULL) {
        ASSERT(ar->ar_regions[SKMEM_REGION_FLOWADV] != NULL);
        ASSERT(roff[SKMEM_REGION_FLOWADV] != 0);
        *(mach_vm_offset_t *)(uintptr_t)&csm->csm_flowadv_ofs =
            roff[SKMEM_REGION_FLOWADV];
        *(uint32_t *)(uintptr_t)&csm->csm_flowadv_max =
            na->na_flowadv_max;
    } else {
        ASSERT(ar->ar_regions[SKMEM_REGION_FLOWADV] == NULL);
        *(mach_vm_offset_t *)(uintptr_t)&csm->csm_flowadv_ofs = 0;
        *(uint32_t *)(uintptr_t)&csm->csm_flowadv_max = 0;
    }

    if (arn->arn_nexusadv_obj != NULL) {
        struct __kern_nexus_adv_metadata *adv_md;

        adv_md = arn->arn_nexusadv_obj;
        ASSERT(adv_md->knam_version == NX_ADVISORY_MD_CURRENT_VERSION);
        ASSERT(ar->ar_regions[SKMEM_REGION_NEXUSADV] != NULL);
        ASSERT(roff[SKMEM_REGION_NEXUSADV] != 0);
        *(mach_vm_offset_t *)(uintptr_t)&csm->csm_nexusadv_ofs =
            roff[SKMEM_REGION_NEXUSADV];
    } else {
        ASSERT(ar->ar_regions[SKMEM_REGION_NEXUSADV] == NULL);
        *(mach_vm_offset_t *)(uintptr_t)&csm->csm_nexusadv_ofs = 0;
    }

    ch->ch_schema = csm;
    ch->ch_schema_offset = base;

    return 0;
}
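
/*
 * Illustrative sketch (editorial, not part of the original source):
 * because each csm_ring_ofs[] entry is stored relative to the schema
 * object itself, userspace can reach a ring without knowing where the
 * arena got mapped in its address space:
 *
 *     struct __user_channel_schema *csm = ...; // mapped channel schema
 *     struct __user_channel_ring *ring = (void *)((uintptr_t)csm +
 *         csm->csm_ring_ofs[i].ring_off);
 *
 * The slot descriptor array for the same ring is reached analogously
 * via csm_ring_ofs[i].sd_off.
 */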

/*
 * Called by all routines that create nexus_adapters.
 * Attach na to the ifp (if any) and provide defaults
 * for optional callbacks.  Defaults assume that we
 * are creating a hardware nexus_adapter.
 */
void
na_attach_common(struct nexus_adapter *na, struct kern_nexus *nx,
    struct kern_nexus_domain_provider *nxdom_prov)
{
    SK_LOCK_ASSERT_HELD();

    ASSERT(nx != NULL);
    ASSERT(nxdom_prov != NULL);
    ASSERT(na->na_krings_create != NULL);
    ASSERT(na->na_krings_delete != NULL);
    if (na->na_type != NA_NETIF_COMPAT_DEV) {
        ASSERT(na_get_nrings(na, NR_TX) != 0);
    }
    if (na->na_type != NA_NETIF_COMPAT_HOST) {
        ASSERT(na_get_nrings(na, NR_RX) != 0);
    }
    ASSERT(na->na_channels == 0);

    if (na->na_notify == NULL) {
        na->na_notify = na_notify;
    }

    na->na_nx = nx;
    na->na_nxdom_prov = nxdom_prov;

    SK_D("na 0x%llx nx 0x%llx nxtype %u ar 0x%llx",
        SK_KVA(na), SK_KVA(nx), nxdom_prov->nxdom_prov_dom->nxdom_type,
        SK_KVA(na->na_arena));
}

void
na_post_event(struct __kern_channel_ring *kring, boolean_t nodelay,
    boolean_t within_kevent, boolean_t selwake, uint32_t hint)
{
    struct nexus_adapter *na = KRNA(kring);
    enum txrx t = kring->ckr_tx;

    SK_DF(SK_VERB_EVENTS,
        "%s(%d) na \"%s\" (0x%llx) kr 0x%llx kev %u sel %u hint 0x%b",
        sk_proc_name_address(current_proc()), sk_proc_pid(current_proc()),
        na->na_name, SK_KVA(na), SK_KVA(kring), within_kevent, selwake,
        hint, CHAN_FILT_HINT_BITS);

    csi_selwakeup_one(kring, nodelay, within_kevent, selwake, hint);
    /*
     * optimization: avoid a wake up on the global
     * queue if nobody has registered for more
     * than one ring
     */
    if (na->na_si_users[t] > 0) {
        csi_selwakeup_all(na, t, nodelay, within_kevent, selwake, hint);
    }
}
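
/*
 * Example (editorial note): a channel bound to a single TX ring never
 * increments na_si_users[NR_TX] (see na_set_ringid()), so for adapters
 * whose channels each own one ring, the csi_selwakeup_all() call above
 * is skipped and only the per-ring wait queue is signaled.
 */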

/* default notify callback */
static int
na_notify(struct __kern_channel_ring *kring, struct proc *p, uint32_t flags)
{
#pragma unused(p)
    SK_DF(SK_VERB_NOTIFY | ((kring->ckr_tx == NR_TX) ?
        SK_VERB_TX : SK_VERB_RX),
        "%s(%d) [%s] na \"%s\" (0x%llx) kr \"%s\" (0x%llx) krflags 0x%b "
        "flags 0x%x, kh %u kt %u | h %u t %u",
        sk_proc_name_address(p), sk_proc_pid(p),
        (kring->ckr_tx == NR_TX) ? "W" : "R", KRNA(kring)->na_name,
        SK_KVA(KRNA(kring)), kring->ckr_name, SK_KVA(kring),
        kring->ckr_flags, CKRF_BITS, flags, kring->ckr_khead,
        kring->ckr_ktail, kring->ckr_rhead, kring->ckr_rtail);

    na_post_event(kring, (flags & NA_NOTEF_PUSH),
        (flags & NA_NOTEF_IN_KEVENT), TRUE, 0);

    return 0;
}

/*
 * Fetch configuration from the device, to cope with dynamic
 * reconfigurations after loading the module.
 */
/* call with SK_LOCK held */
int
na_update_config(struct nexus_adapter *na)
{
    uint32_t txr, txd, rxr, rxd;

    SK_LOCK_ASSERT_HELD();

    txr = txd = rxr = rxd = 0;
    if (na->na_config == NULL ||
        na->na_config(na, &txr, &txd, &rxr, &rxd)) {
        /* take whatever we had at init time */
        txr = na_get_nrings(na, NR_TX);
        txd = na_get_nslots(na, NR_TX);
        rxr = na_get_nrings(na, NR_RX);
        rxd = na_get_nslots(na, NR_RX);
    }

    if (na_get_nrings(na, NR_TX) == txr &&
        na_get_nslots(na, NR_TX) == txd &&
        na_get_nrings(na, NR_RX) == rxr &&
        na_get_nslots(na, NR_RX) == rxd) {
        return 0; /* nothing changed */
    }
    SK_D("stored config %s: txring %u x %u, rxring %u x %u",
        na->na_name, na_get_nrings(na, NR_TX), na_get_nslots(na, NR_TX),
        na_get_nrings(na, NR_RX), na_get_nslots(na, NR_RX));
    SK_D("new config %s: txring %u x %u, rxring %u x %u",
        na->na_name, txr, txd, rxr, rxd);

    if (na->na_channels == 0) {
        SK_D("configuration changed (but fine)");
        na_set_nrings(na, NR_TX, txr);
        na_set_nslots(na, NR_TX, txd);
        na_set_nrings(na, NR_RX, rxr);
        na_set_nslots(na, NR_RX, rxd);
        return 0;
    }
    SK_ERR("configuration changed while active, this is bad...");
    return 1;
}
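
/*
 * Minimal sketch of an na_config callback (hypothetical, inferred from
 * the call site above; drv_na_config and the ring/slot counts are
 * illustrative only):
 *
 *     static int
 *     drv_na_config(struct nexus_adapter *na, uint32_t *txr,
 *         uint32_t *txd, uint32_t *rxr, uint32_t *rxd)
 *     {
 *         *txr = *rxr = 2;      // rings per direction
 *         *txd = *rxd = 1024;   // slots per ring
 *         return 0;             // nonzero: keep init-time values
 *     }
 */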

static void
na_kr_setup_netif_svc_map(struct nexus_adapter *na)
{
    uint32_t i;
    uint32_t num_tx_rings;

    ASSERT(na->na_type == NA_NETIF_DEV);
    num_tx_rings = na_get_nrings(na, NR_TX);

    _CASSERT(NAKR_WMM_SC2RINGID(KPKT_SC_BK_SYS) ==
        NAKR_WMM_SC2RINGID(KPKT_SC_BK));
    _CASSERT(NAKR_WMM_SC2RINGID(KPKT_SC_BE) ==
        NAKR_WMM_SC2RINGID(KPKT_SC_RD));
    _CASSERT(NAKR_WMM_SC2RINGID(KPKT_SC_BE) ==
        NAKR_WMM_SC2RINGID(KPKT_SC_OAM));
    _CASSERT(NAKR_WMM_SC2RINGID(KPKT_SC_AV) ==
        NAKR_WMM_SC2RINGID(KPKT_SC_RV));
    _CASSERT(NAKR_WMM_SC2RINGID(KPKT_SC_AV) ==
        NAKR_WMM_SC2RINGID(KPKT_SC_VI));
    _CASSERT(NAKR_WMM_SC2RINGID(KPKT_SC_VO) ==
        NAKR_WMM_SC2RINGID(KPKT_SC_CTL));

    _CASSERT(NAKR_WMM_SC2RINGID(KPKT_SC_BK) < NA_NUM_WMM_CLASSES);
    _CASSERT(NAKR_WMM_SC2RINGID(KPKT_SC_BE) < NA_NUM_WMM_CLASSES);
    _CASSERT(NAKR_WMM_SC2RINGID(KPKT_SC_VI) < NA_NUM_WMM_CLASSES);
    _CASSERT(NAKR_WMM_SC2RINGID(KPKT_SC_VO) < NA_NUM_WMM_CLASSES);

    _CASSERT(MBUF_SCIDX(KPKT_SC_BK_SYS) < KPKT_SC_MAX_CLASSES);
    _CASSERT(MBUF_SCIDX(KPKT_SC_BK) < KPKT_SC_MAX_CLASSES);
    _CASSERT(MBUF_SCIDX(KPKT_SC_BE) < KPKT_SC_MAX_CLASSES);
    _CASSERT(MBUF_SCIDX(KPKT_SC_RD) < KPKT_SC_MAX_CLASSES);
    _CASSERT(MBUF_SCIDX(KPKT_SC_OAM) < KPKT_SC_MAX_CLASSES);
    _CASSERT(MBUF_SCIDX(KPKT_SC_AV) < KPKT_SC_MAX_CLASSES);
    _CASSERT(MBUF_SCIDX(KPKT_SC_RV) < KPKT_SC_MAX_CLASSES);
    _CASSERT(MBUF_SCIDX(KPKT_SC_VI) < KPKT_SC_MAX_CLASSES);
    _CASSERT(MBUF_SCIDX(KPKT_SC_SIG) < KPKT_SC_MAX_CLASSES);
    _CASSERT(MBUF_SCIDX(KPKT_SC_VO) < KPKT_SC_MAX_CLASSES);
    _CASSERT(MBUF_SCIDX(KPKT_SC_CTL) < KPKT_SC_MAX_CLASSES);

    /*
     * We support the following 2 configurations:
     * 1. packets from all 10 service classes map to one ring.
     * 2. a 10:4 mapping between the service classes and the rings;
     *    these 4 rings map to the 4 WMM access categories.
     */
    if (na->na_nx->nx_prov->nxprov_params->nxp_qmap == NEXUS_QMAP_TYPE_WMM) {
        ASSERT(num_tx_rings == NEXUS_NUM_WMM_QUEUES);
        /* set up the adapter's service class LUT */
        NAKR_SET_SVC_LUT(na, KPKT_SC_BK_SYS);
        NAKR_SET_SVC_LUT(na, KPKT_SC_BK);
        NAKR_SET_SVC_LUT(na, KPKT_SC_BE);
        NAKR_SET_SVC_LUT(na, KPKT_SC_RD);
        NAKR_SET_SVC_LUT(na, KPKT_SC_OAM);
        NAKR_SET_SVC_LUT(na, KPKT_SC_AV);
        NAKR_SET_SVC_LUT(na, KPKT_SC_RV);
        NAKR_SET_SVC_LUT(na, KPKT_SC_VI);
        NAKR_SET_SVC_LUT(na, KPKT_SC_SIG);
        NAKR_SET_SVC_LUT(na, KPKT_SC_VO);
        NAKR_SET_SVC_LUT(na, KPKT_SC_CTL);

        /* initialize the service class for each of the 4 rings */
        NAKR_SET_KR_SVC(na, KPKT_SC_BK);
        NAKR_SET_KR_SVC(na, KPKT_SC_BE);
        NAKR_SET_KR_SVC(na, KPKT_SC_VI);
        NAKR_SET_KR_SVC(na, KPKT_SC_VO);
    } else {
        ASSERT(na->na_nx->nx_prov->nxprov_params->nxp_qmap ==
            NEXUS_QMAP_TYPE_DEFAULT);
        /* 10:1 mapping */
        for (i = 0; i < KPKT_SC_MAX_CLASSES; i++) {
            na->na_kring_svc_lut[i] = 0;
        }
        for (i = 0; i < num_tx_rings; i++) {
            NAKR(na, NR_TX)[i].ckr_svc = KPKT_SC_UNSPEC;
        }
    }
}
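
/*
 * Example (editorial note): with NEXUS_QMAP_TYPE_WMM and 4 TX rings,
 * the _CASSERTs above pin the service-class groupings, e.g. KPKT_SC_VI
 * shares its ring with KPKT_SC_AV and KPKT_SC_RV, while KPKT_SC_VO and
 * KPKT_SC_CTL share another; a packet's TX ring index is looked up
 * through na_kring_svc_lut[MBUF_SCIDX(svc)].
 */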

static LCK_GRP_DECLARE(channel_txq_lock_group, "sk_ch_txq_lock");
static LCK_GRP_DECLARE(channel_rxq_lock_group, "sk_ch_rxq_lock");
static LCK_GRP_DECLARE(channel_txs_lock_group, "sk_ch_txs_lock");
static LCK_GRP_DECLARE(channel_rxs_lock_group, "sk_ch_rxs_lock");
static LCK_GRP_DECLARE(channel_alloc_lock_group, "sk_ch_alloc_lock");
static LCK_GRP_DECLARE(channel_evq_lock_group, "sk_ch_evq_lock");
static LCK_GRP_DECLARE(channel_evs_lock_group, "sk_ch_evs_lock");

static lck_grp_t *
na_kr_q_lck_grp(enum txrx t)
{
    switch (t) {
    case NR_TX:
        return &channel_txq_lock_group;
    case NR_RX:
        return &channel_rxq_lock_group;
    case NR_A:
    case NR_F:
    case NR_LBA:
        return &channel_alloc_lock_group;
    case NR_EV:
        return &channel_evq_lock_group;
    default:
        VERIFY(0);
        /* NOTREACHED */
        __builtin_unreachable();
    }
}

static lck_grp_t *
na_kr_s_lck_grp(enum txrx t)
{
    switch (t) {
    case NR_TX:
        return &channel_txs_lock_group;
    case NR_RX:
        return &channel_rxs_lock_group;
    case NR_A:
    case NR_F:
    case NR_LBA:
        return &channel_alloc_lock_group;
    case NR_EV:
        return &channel_evs_lock_group;
    default:
        VERIFY(0);
        /* NOTREACHED */
        __builtin_unreachable();
    }
}

static void
kr_init_tbr(struct __kern_channel_ring *r)
{
    r->ckr_tbr_depth = CKR_TBR_TOKEN_INVALID;
    r->ckr_tbr_token = CKR_TBR_TOKEN_INVALID;
    r->ckr_tbr_last = 0;
}

struct kern_pbufpool *
na_kr_get_pp(struct nexus_adapter *na, enum txrx t)
{
    struct kern_pbufpool *pp = NULL;
    switch (t) {
    case NR_RX:
    case NR_F:
    case NR_EV:
        pp = skmem_arena_nexus(na->na_arena)->arn_rx_pp;
        break;
    case NR_TX:
    case NR_A:
    case NR_LBA:
        pp = skmem_arena_nexus(na->na_arena)->arn_tx_pp;
        break;
    default:
        VERIFY(0);
        /* NOTREACHED */
        __builtin_unreachable();
    }

    return pp;
}

/*
 * Create the krings array and initialize the fields common to all adapters.
 * The array layout is this:
 *
 *                                 +----------+
 * na->na_tx_rings ----->          |          | \
 *                                 |          |  } na->na_num_tx_rings
 *                                 |          | /
 * na->na_rx_rings ---->           +----------+
 *                                 |          | \
 *                                 |          |  } na->na_num_rx_rings
 *                                 |          | /
 * na->na_alloc_rings ->           +----------+
 *                                 |          | \
 * na->na_free_rings -->           +----------+  } na->na_num_allocator_ring_pairs
 *                                 |          | /
 * na->na_event_rings ->           +----------+
 *                                 |          | \
 *                                 |          |  } na->na_num_event_rings
 *                                 |          | /
 * na->na_large_buf_alloc_rings -> +----------+
 *                                 |          | \
 *                                 |          |  } na->na_num_large_buf_alloc_rings
 *                                 |          | /
 * na->na_tail ----->              +----------+
 */
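
/*
 * Editorial note: since all rings live in one contiguous allocation,
 * the sub-array pointers set up below are pure pointer arithmetic, e.g.:
 *
 *     na->na_rx_rings == na->na_tx_rings + na_get_nrings(na, NR_TX)
 *     na->na_alloc_rings == na->na_rx_rings + na_get_nrings(na, NR_RX)
 */
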
1389 static int
na_kr_create(struct nexus_adapter * na,boolean_t alloc_ctx)1390 na_kr_create(struct nexus_adapter *na, boolean_t alloc_ctx)
1391 {
1392 	lck_grp_t *q_lck_grp, *s_lck_grp;
1393 	uint32_t i, count, ndesc;
1394 	struct kern_pbufpool *pp = NULL;
1395 	struct __kern_channel_ring *kring;
1396 	uint32_t n[NR_ALL];
1397 	int c, tot_slots, err = 0;
1398 	enum txrx t;
1399 
1400 	SK_LOCK_ASSERT_HELD();
1401 
1402 	n[NR_TX] = na_get_nrings(na, NR_TX);
1403 	n[NR_RX] = na_get_nrings(na, NR_RX);
1404 	n[NR_A] = na_get_nrings(na, NR_A);
1405 	n[NR_F] = na_get_nrings(na, NR_F);
1406 	n[NR_EV] = na_get_nrings(na, NR_EV);
1407 	n[NR_LBA] = na_get_nrings(na, NR_LBA);
1408 
1409 	count = n[NR_TX] + n[NR_RX] + n[NR_A] + n[NR_F] + n[NR_EV] + n[NR_LBA];
1410 
1411 	na->na_tx_rings = sk_alloc_type_array(struct __kern_channel_ring, count,
1412 	    Z_WAITOK, skmem_tag_nx_rings);
1413 	if (__improbable(na->na_tx_rings == NULL)) {
1414 		SK_ERR("Cannot allocate krings");
1415 		err = ENOMEM;
1416 		goto error;
1417 	}
1418 
1419 	na->na_rx_rings = na->na_tx_rings + n[NR_TX];
1420 	if (n[NR_A] != 0) {
1421 		na->na_alloc_rings = na->na_rx_rings + n[NR_RX];
1422 		na->na_free_rings = na->na_alloc_rings + n[NR_A];
1423 	} else {
1424 		na->na_alloc_rings = na->na_free_rings = NULL;
1425 	}
1426 	if (n[NR_EV] != 0) {
1427 		if (na->na_free_rings != NULL) {
1428 			na->na_event_rings = na->na_free_rings + n[NR_F];
1429 		} else {
1430 			na->na_event_rings = na->na_rx_rings + n[NR_RX];
1431 		}
1432 	}
1433 	if (n[NR_LBA] != 0) {
1434 		ASSERT(n[NR_A] != 0);
1435 		if (na->na_event_rings != NULL) {
1436 			na->na_large_buf_alloc_rings = na->na_event_rings + n[NR_EV];
1437 		} else {
1438 			/* alloc/free rings must also be present */
1439 			ASSERT(na->na_free_rings != NULL);
1440 			na->na_large_buf_alloc_rings = na->na_free_rings + n[NR_F];
1441 		}
1442 	}
1443 
1444 	/* total number of slots for TX/RX adapter rings */
1445 	c = tot_slots = (n[NR_TX] * na_get_nslots(na, NR_TX)) +
1446 	    (n[NR_RX] * na_get_nslots(na, NR_RX));
1447 
1448 	/* for scratch space on alloc and free rings */
1449 	if (n[NR_A] != 0) {
1450 		tot_slots += n[NR_A] * na_get_nslots(na, NR_A);
1451 		tot_slots += n[NR_F] * na_get_nslots(na, NR_F);
1452 		tot_slots += n[NR_LBA] * na_get_nslots(na, NR_LBA);
1453 		c = tot_slots;
1454 	}
1455 	na->na_total_slots = tot_slots;
1456 
1457 	/* slot context (optional) for all TX/RX ring slots of this adapter */
1458 	if (alloc_ctx) {
1459 		na->na_slot_ctxs =
1460 		    skn_alloc_type_array(slot_ctxs, struct slot_ctx,
1461 		    na->na_total_slots, Z_WAITOK, skmem_tag_nx_contexts);
1462 		if (na->na_slot_ctxs == NULL) {
1463 			SK_ERR("Cannot allocate slot contexts");
1464 			err = ENOMEM;
1465 			goto error;
1466 		}
1467 		os_atomic_or(&na->na_flags, NAF_SLOT_CONTEXT, relaxed);
1468 	}
1469 
1470 	/*
1471 	 * packet handle array storage for all TX/RX ring slots of this
1472 	 * adapter.
1473 	 */
1474 	na->na_scratch = skn_alloc_type_array(scratch, kern_packet_t,
1475 	    na->na_total_slots, Z_WAITOK, skmem_tag_nx_scratch);
1476 	if (na->na_scratch == NULL) {
1477 		SK_ERR("Cannot allocate slot contexts");
1478 		err = ENOMEM;
1479 		goto error;
1480 	}
1481 
1482 	/*
1483 	 * All fields in krings are 0 except the one initialized below.
1484 	 * but better be explicit on important kring fields.
1485 	 */
1486 	for_all_rings(t) {
1487 		ndesc = na_get_nslots(na, t);
1488 		pp = na_kr_get_pp(na, t);
1489 		for (i = 0; i < n[t]; i++) {
1490 			kring = &NAKR(na, t)[i];
1491 			bzero(kring, sizeof(*kring));
1492 			kring->ckr_na = na;
1493 			kring->ckr_pp = pp;
1494 			kring->ckr_max_pkt_len =
1495 			    (t == NR_LBA ? PP_BUF_SIZE_LARGE(pp) :
1496 			    PP_BUF_SIZE_DEF(pp)) *
1497 			    pp->pp_max_frags;
1498 			kring->ckr_ring_id = i;
1499 			kring->ckr_tx = t;
1500 			kr_init_to_mhints(kring, ndesc);
1501 			kr_init_tbr(kring);
1502 			if (NA_KERNEL_ONLY(na)) {
1503 				kring->ckr_flags |= CKRF_KERNEL_ONLY;
1504 			}
1505 			if (na->na_flags & NAF_HOST_ONLY) {
1506 				kring->ckr_flags |= CKRF_HOST;
1507 			}
1508 			ASSERT((t >= NR_TXRX) || (c > 0));
1509 			if ((t < NR_TXRX) &&
1510 			    (na->na_flags & NAF_SLOT_CONTEXT)) {
1511 				ASSERT(na->na_slot_ctxs != NULL);
1512 				kring->ckr_flags |= CKRF_SLOT_CONTEXT;
1513 				kring->ckr_slot_ctxs =
1514 				    na->na_slot_ctxs + (tot_slots - c);
1515 			}
1516 			ASSERT(na->na_scratch != NULL);
1517 			if (t < NR_TXRXAF || t == NR_LBA) {
1518 				kring->ckr_scratch =
1519 				    na->na_scratch + (tot_slots - c);
1520 			}
1521 			if (t < NR_TXRXAF || t == NR_LBA) {
1522 				c -= ndesc;
1523 			}
1524 			switch (t) {
1525 			case NR_A:
1526 				if (i == 0) {
1527 					kring->ckr_na_sync =
1528 					    na_packet_pool_alloc_sync;
1529 					kring->ckr_alloc_ws =
1530 					    na_upp_alloc_lowat;
1531 				} else {
1532 					ASSERT(i == 1);
1533 					kring->ckr_na_sync =
1534 					    na_packet_pool_alloc_buf_sync;
1535 					kring->ckr_alloc_ws =
1536 					    na_upp_alloc_buf_lowat;
1537 				}
1538 				break;
1539 			case NR_F:
1540 				if (i == 0) {
1541 					kring->ckr_na_sync =
1542 					    na_packet_pool_free_sync;
1543 				} else {
1544 					ASSERT(i == 1);
1545 					kring->ckr_na_sync =
1546 					    na_packet_pool_free_buf_sync;
1547 				}
1548 				break;
1549 			case NR_TX:
1550 				kring->ckr_na_sync = na->na_txsync;
1551 				if (na->na_flags & NAF_TX_MITIGATION) {
1552 					kring->ckr_flags |= CKRF_MITIGATION;
1553 				}
1554 				switch (na->na_type) {
1555 #if CONFIG_NEXUS_USER_PIPE
1556 				case NA_USER_PIPE:
1557 					ASSERT(!(na->na_flags &
1558 					    NAF_USER_PKT_POOL));
1559 					kring->ckr_prologue = kr_txprologue;
1560 					kring->ckr_finalize = NULL;
1561 					break;
1562 #endif /* CONFIG_NEXUS_USER_PIPE */
1563 #if CONFIG_NEXUS_MONITOR
1564 				case NA_MONITOR:
1565 					ASSERT(!(na->na_flags &
1566 					    NAF_USER_PKT_POOL));
1567 					kring->ckr_prologue = kr_txprologue;
1568 					kring->ckr_finalize = NULL;
1569 					break;
1570 #endif /* CONFIG_NEXUS_MONITOR */
1571 				default:
1572 					if (na->na_flags & NAF_USER_PKT_POOL) {
1573 						kring->ckr_prologue =
1574 						    kr_txprologue_upp;
1575 						kring->ckr_finalize =
1576 						    kr_txfinalize_upp;
1577 					} else {
1578 						kring->ckr_prologue =
1579 						    kr_txprologue;
1580 						kring->ckr_finalize =
1581 						    kr_txfinalize;
1582 					}
1583 					break;
1584 				}
1585 				break;
1586 			case NR_RX:
1587 				kring->ckr_na_sync = na->na_rxsync;
1588 				if (na->na_flags & NAF_RX_MITIGATION) {
1589 					kring->ckr_flags |= CKRF_MITIGATION;
1590 				}
1591 				switch (na->na_type) {
1592 #if CONFIG_NEXUS_USER_PIPE
1593 				case NA_USER_PIPE:
1594 					ASSERT(!(na->na_flags &
1595 					    NAF_USER_PKT_POOL));
1596 					kring->ckr_prologue =
1597 					    kr_rxprologue_nodetach;
1598 					kring->ckr_finalize = kr_rxfinalize;
1599 					break;
1600 #endif /* CONFIG_NEXUS_USER_PIPE */
1601 #if CONFIG_NEXUS_MONITOR
1602 				case NA_MONITOR:
1603 					ASSERT(!(na->na_flags &
1604 					    NAF_USER_PKT_POOL));
1605 					kring->ckr_prologue =
1606 					    kr_rxprologue_nodetach;
1607 					kring->ckr_finalize = kr_rxfinalize;
1608 					break;
1609 #endif /* CONFIG_NEXUS_MONITOR */
1610 				default:
1611 					if (na->na_flags & NAF_USER_PKT_POOL) {
1612 						kring->ckr_prologue =
1613 						    kr_rxprologue_upp;
1614 						kring->ckr_finalize =
1615 						    kr_rxfinalize_upp;
1616 					} else {
1617 						kring->ckr_prologue =
1618 						    kr_rxprologue;
1619 						kring->ckr_finalize =
1620 						    kr_rxfinalize;
1621 					}
1622 					break;
1623 				}
1624 				break;
1625 			case NR_EV:
1626 				kring->ckr_na_sync = kern_channel_event_sync;
1627 				break;
1628 			case NR_LBA:
1629 				kring->ckr_na_sync = na_packet_pool_alloc_large_sync;
1630 				kring->ckr_alloc_ws = na_upp_alloc_lowat;
1631 				break;
1632 			default:
1633 				VERIFY(0);
1634 				/* NOTREACHED */
1635 				__builtin_unreachable();
1636 			}
1637 			if (t != NR_EV) {
1638 				kring->ckr_na_notify = na->na_notify;
1639 			} else {
1640 				kring->ckr_na_notify = NULL;
1641 			}
1642 			(void) snprintf(kring->ckr_name,
1643 			    sizeof(kring->ckr_name) - 1,
1644 			    "%s %s%u%s", na->na_name, sk_ring2str(t), i,
1645 			    ((kring->ckr_flags & CKRF_HOST) ? "^" : ""));
1646 			SK_DF(SK_VERB_NA | SK_VERB_RING,
1647 			    "kr \"%s\" (0x%llx) krflags 0x%b rh %u rt %u",
1648 			    kring->ckr_name, SK_KVA(kring), kring->ckr_flags,
1649 			    CKRF_BITS, kring->ckr_rhead, kring->ckr_rtail);
1650 			kring->ckr_state = KR_READY;
1651 			q_lck_grp = na_kr_q_lck_grp(t);
1652 			s_lck_grp = na_kr_s_lck_grp(t);
1653 			kring->ckr_qlock_group = q_lck_grp;
1654 			lck_mtx_init(&kring->ckr_qlock, kring->ckr_qlock_group,
1655 			    &channel_lock_attr);
1656 			kring->ckr_slock_group = s_lck_grp;
1657 			lck_spin_init(&kring->ckr_slock, kring->ckr_slock_group,
1658 			    &channel_lock_attr);
1659 			csi_init(&kring->ckr_si,
1660 			    (kring->ckr_flags & CKRF_MITIGATION),
1661 			    na->na_ch_mit_ival);
1662 		}
1663 		csi_init(&na->na_si[t],
1664 		    (na->na_flags & (NAF_TX_MITIGATION | NAF_RX_MITIGATION)),
1665 		    na->na_ch_mit_ival);
1666 	}
1667 	ASSERT(c == 0);
1668 	na->na_tail = na->na_rx_rings + n[NR_RX] + n[NR_A] + n[NR_F] +
1669 	    n[NR_EV] + n[NR_LBA];
1670 
1671 	if (na->na_type == NA_NETIF_DEV) {
1672 		na_kr_setup_netif_svc_map(na);
1673 	}
1674 
1675 	/* validate now for cases where we create only krings */
1676 	na_krings_verify(na);
1677 	return 0;
1678 
1679 error:
1680 	ASSERT(err != 0);
1681 	if (na->na_tx_rings != NULL) {
1682 		sk_free_type_array(struct __kern_channel_ring,
1683 		    na->na_tail - na->na_tx_rings, na->na_tx_rings);
1684 	}
1685 	if (na->na_slot_ctxs != NULL) {
1686 		ASSERT(na->na_flags & NAF_SLOT_CONTEXT);
1687 		skn_free_type_array(slot_ctxs,
1688 		    struct slot_ctx, na->na_total_slots,
1689 		    na->na_slot_ctxs);
1690 		na->na_slot_ctxs = NULL;
1691 	}
1692 	if (na->na_scratch != NULL) {
1693 		skn_free_type_array(scratch,
1694 		    kern_packet_t, na->na_total_slots,
1695 		    na->na_scratch);
1696 		na->na_scratch = NULL;
1697 	}
1698 	return err;
1699 }
1700 
1701 /* undo the actions performed by na_kr_create() */
1702 /* call with SK_LOCK held */
1703 static void
1704 na_kr_delete(struct nexus_adapter *na)
1705 {
1706 	struct __kern_channel_ring *kring = na->na_tx_rings;
1707 	enum txrx t;
1708 
1709 	ASSERT((kring != NULL) && (na->na_tail != NULL));
1710 	SK_LOCK_ASSERT_HELD();
1711 
1712 	for_all_rings(t) {
1713 		csi_destroy(&na->na_si[t]);
1714 	}
1715 	/* we rely on the krings layout described above */
1716 	for (; kring != na->na_tail; kring++) {
1717 		lck_mtx_destroy(&kring->ckr_qlock, kring->ckr_qlock_group);
1718 		lck_spin_destroy(&kring->ckr_slock, kring->ckr_slock_group);
1719 		csi_destroy(&kring->ckr_si);
1720 		if (kring->ckr_flags & CKRF_SLOT_CONTEXT) {
1721 			kring->ckr_flags &= ~CKRF_SLOT_CONTEXT;
1722 			ASSERT(kring->ckr_slot_ctxs != NULL);
1723 			kring->ckr_slot_ctxs = NULL;
1724 		}
1725 	}
1726 	if (na->na_slot_ctxs != NULL) {
1727 		ASSERT(na->na_flags & NAF_SLOT_CONTEXT);
1728 		os_atomic_andnot(&na->na_flags, NAF_SLOT_CONTEXT, relaxed);
1729 		skn_free_type_array(slot_ctxs,
1730 		    struct slot_ctx, na->na_total_slots,
1731 		    na->na_slot_ctxs);
1732 		na->na_slot_ctxs = NULL;
1733 	}
1734 	if (na->na_scratch != NULL) {
1735 		skn_free_type_array(scratch,
1736 		    kern_packet_t, na->na_total_slots,
1737 		    na->na_scratch);
1738 		na->na_scratch = NULL;
1739 	}
1740 	ASSERT(!(na->na_flags & NAF_SLOT_CONTEXT));
1741 	sk_free_type_array(struct __kern_channel_ring,
1742 	    na->na_tail - na->na_tx_rings, na->na_tx_rings);
1743 	na->na_tx_rings = na->na_rx_rings = na->na_alloc_rings =
1744 	    na->na_free_rings = na->na_event_rings = na->na_tail = NULL;
1745 }
1746 
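/*
 * Zero and initialize the kernel slot descriptors of a ring, along
 * with the user slot descriptors for rings visible to user space.
 */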
1747 static void
1748 na_kr_slot_desc_init(struct __slot_desc *ksds,
1749     boolean_t kernel_only, struct __slot_desc *usds, size_t ndesc)
1750 {
1751 	size_t i;
1752 
1753 	bzero(ksds, ndesc * SLOT_DESC_SZ);
1754 	if (usds != NULL) {
1755 		ASSERT(!kernel_only);
1756 		bzero(usds, ndesc * SLOT_DESC_SZ);
1757 	} else {
1758 		ASSERT(kernel_only);
1759 	}
1760 
1761 	for (i = 0; i < ndesc; i++) {
1762 		KSD_INIT(SLOT_DESC_KSD(&ksds[i]));
1763 		if (!kernel_only) {
1764 			USD_INIT(SLOT_DESC_USD(&usds[i]));
1765 		}
1766 	}
1767 }
1768 
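/*
 * Allocate the user rings and kernel/user slot descriptors backing
 * each kring of the adapter; TX/RX slots are pre-populated with
 * packets unless the nexus uses a user packet pool.
 */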
1769 /* call with SK_LOCK held */
1770 static int
1771 na_kr_setup(struct nexus_adapter *na, struct kern_channel *ch)
1772 {
1773 	struct skmem_arena *ar = na->na_arena;
1774 	struct skmem_arena_nexus *arn;
1775 	mach_vm_offset_t roff[SKMEM_REGIONS];
1776 	enum txrx t;
1777 	uint32_t i;
1778 
1779 	SK_LOCK_ASSERT_HELD();
1780 	ASSERT(!(na->na_flags & NAF_MEM_NO_INIT));
1781 	ASSERT(ar->ar_type == SKMEM_ARENA_TYPE_NEXUS);
1782 	arn = skmem_arena_nexus(ar);
1783 	ASSERT(arn != NULL);
1784 
1785 	bzero(&roff, sizeof(roff));
1786 	for (i = 0; i < SKMEM_REGIONS; i++) {
1787 		if (ar->ar_regions[i] == NULL) {
1788 			continue;
1789 		}
1790 
1791 		/* not for nexus */
1792 		ASSERT(i != SKMEM_REGION_SYSCTLS);
1793 
1794 		/*
1795 		 * Get region offsets from base of mmap span; the arena
1796 		 * doesn't need to be mmap'd at this point, since we
1797 		 * simply compute the relative offset.
1798 		 */
1799 		roff[i] = skmem_arena_get_region_offset(ar, i);
1800 	}
1801 
1802 	for_all_rings(t) {
1803 		for (i = 0; i < na_get_nrings(na, t); i++) {
1804 			struct __kern_channel_ring *kring = &NAKR(na, t)[i];
1805 			struct __user_channel_ring *ring = kring->ckr_ring;
1806 			mach_vm_offset_t ring_off, usd_roff;
1807 			struct skmem_obj_info oi, oim;
1808 			uint32_t ndesc;
1809 
1810 			if (ring != NULL) {
1811 				SK_DF(SK_VERB_NA | SK_VERB_RING,
1812 				    "kr 0x%llx (\"%s\") is already "
1813 				    "initialized", SK_KVA(kring),
1814 				    kring->ckr_name);
1815 				continue; /* already created by somebody else */
1816 			}
1817 
1818 			if (!KR_KERNEL_ONLY(kring) &&
1819 			    (ring = skmem_cache_alloc(arn->arn_ring_cache,
1820 			    SKMEM_NOSLEEP)) == NULL) {
1821 				SK_ERR("Cannot allocate %s_ring for kr "
1822 				    "0x%llx (\"%s\")", sk_ring2str(t),
1823 				    SK_KVA(kring), kring->ckr_name);
1824 				goto cleanup;
1825 			}
1826 			kring->ckr_flags |= CKRF_MEM_RING_INITED;
1827 			kring->ckr_ring = ring;
1828 			ndesc = kring->ckr_num_slots;
1829 
1830 			if (ring == NULL) {
1831 				goto skip_user_ring_setup;
1832 			}
1833 
1834 			*(uint32_t *)(uintptr_t)&ring->ring_num_slots = ndesc;
1835 
1836 			/* offset of current ring in mmap span */
1837 			skmem_cache_get_obj_info(arn->arn_ring_cache,
1838 			    ring, &oi, NULL);
1839 			ring_off = (roff[SKMEM_REGION_RING] +
1840 			    SKMEM_OBJ_ROFF(&oi));
1841 
1842 			/*
1843 			 * ring_{buf,md,sd}_ofs offsets are relative to the
1844 			 * current ring, and not to the base of mmap span.
1845 			 */
1846 			*(mach_vm_offset_t *)(uintptr_t)
1847 			&ring->ring_def_buf_base =
1848 			    (roff[SKMEM_REGION_BUF_DEF] - ring_off);
1849 			*(mach_vm_offset_t *)(uintptr_t)
1850 			&ring->ring_large_buf_base =
1851 			    (roff[SKMEM_REGION_BUF_LARGE] - ring_off);
1852 			*(mach_vm_offset_t *)(uintptr_t)&ring->ring_md_base =
1853 			    (roff[SKMEM_REGION_UMD] - ring_off);
1854 			_CASSERT(sizeof(uint16_t) ==
1855 			    sizeof(ring->ring_bft_size));
1856 			if (roff[SKMEM_REGION_UBFT] != 0) {
1857 				ASSERT(ar->ar_regions[SKMEM_REGION_UBFT] !=
1858 				    NULL);
1859 				*(mach_vm_offset_t *)(uintptr_t)
1860 				&ring->ring_bft_base =
1861 				    (roff[SKMEM_REGION_UBFT] - ring_off);
1862 				*(uint16_t *)(uintptr_t)&ring->ring_bft_size =
1863 				    (uint16_t)ar->ar_regions[SKMEM_REGION_UBFT]->
1864 				    skr_c_obj_size;
1865 				ASSERT(ring->ring_bft_size ==
1866 				    ar->ar_regions[SKMEM_REGION_KBFT]->
1867 				    skr_c_obj_size);
1868 			} else {
1869 				*(mach_vm_offset_t *)(uintptr_t)
1870 				&ring->ring_bft_base = 0;
1871 				*(uint16_t *)(uintptr_t)&ring->ring_bft_size = 0;
1872 			}
1873 
1874 			if (t == NR_TX || t == NR_A || t == NR_EV || t == NR_LBA) {
1875 				usd_roff = roff[SKMEM_REGION_TXAUSD];
1876 			} else {
1877 				ASSERT(t == NR_RX || t == NR_F);
1878 				usd_roff = roff[SKMEM_REGION_RXFUSD];
1879 			}
1880 			*(mach_vm_offset_t *)(uintptr_t)&ring->ring_sd_base =
1881 			    (usd_roff - ring_off);
1882 
1883 			/* copy values from kring */
1884 			ring->ring_head = kring->ckr_rhead;
1885 			*(slot_idx_t *)(uintptr_t)&ring->ring_khead =
1886 			    kring->ckr_khead;
1887 			*(slot_idx_t *)(uintptr_t)&ring->ring_tail =
1888 			    kring->ckr_rtail;
1889 
1890 			_CASSERT(sizeof(uint32_t) ==
1891 			    sizeof(ring->ring_def_buf_size));
1892 			_CASSERT(sizeof(uint32_t) ==
1893 			    sizeof(ring->ring_large_buf_size));
1894 			_CASSERT(sizeof(uint16_t) ==
1895 			    sizeof(ring->ring_md_size));
1896 			*(uint32_t *)(uintptr_t)&ring->ring_def_buf_size =
1897 			    ar->ar_regions[SKMEM_REGION_BUF_DEF]->skr_c_obj_size;
1898 			if (ar->ar_regions[SKMEM_REGION_BUF_LARGE] != NULL) {
1899 				*(uint32_t *)(uintptr_t)&ring->ring_large_buf_size =
1900 				    ar->ar_regions[SKMEM_REGION_BUF_LARGE]->skr_c_obj_size;
1901 			} else {
1902 				*(uint32_t *)(uintptr_t)&ring->ring_large_buf_size = 0;
1903 			}
1904 			if (ar->ar_regions[SKMEM_REGION_UMD] != NULL) {
1905 				*(uint16_t *)(uintptr_t)&ring->ring_md_size =
1906 				    (uint16_t)ar->ar_regions[SKMEM_REGION_UMD]->
1907 				    skr_c_obj_size;
1908 				ASSERT(ring->ring_md_size ==
1909 				    ar->ar_regions[SKMEM_REGION_KMD]->
1910 				    skr_c_obj_size);
1911 			} else {
1912 				*(uint16_t *)(uintptr_t)&ring->ring_md_size = 0;
1913 				ASSERT(PP_KERNEL_ONLY(arn->arn_rx_pp));
1914 				ASSERT(PP_KERNEL_ONLY(arn->arn_tx_pp));
1915 			}
1916 
1917 			/* ring info */
1918 			_CASSERT(sizeof(uint16_t) == sizeof(ring->ring_id));
1919 			_CASSERT(sizeof(uint16_t) == sizeof(ring->ring_kind));
1920 			*(uint16_t *)(uintptr_t)&ring->ring_id =
1921 			    (uint16_t)kring->ckr_ring_id;
1922 			*(uint16_t *)(uintptr_t)&ring->ring_kind =
1923 			    (uint16_t)kring->ckr_tx;
1924 
1925 			SK_DF(SK_VERB_NA | SK_VERB_RING,
1926 			    "%s_ring at 0x%llx kr 0x%llx (\"%s\")",
1927 			    sk_ring2str(t), SK_KVA(ring), SK_KVA(kring),
1928 			    kring->ckr_name);
1929 			SK_DF(SK_VERB_NA | SK_VERB_RING,
1930 			    "  num_slots:  %u", ring->ring_num_slots);
1931 			SK_DF(SK_VERB_NA | SK_VERB_RING,
1932 			    "  def_buf_base:   0x%llx",
1933 			    (uint64_t)ring->ring_def_buf_base);
1934 			SK_DF(SK_VERB_NA | SK_VERB_RING,
1935 			    "  large_buf_base:   0x%llx",
1936 			    (uint64_t)ring->ring_large_buf_base);
1937 			SK_DF(SK_VERB_NA | SK_VERB_RING,
1938 			    "  md_base:    0x%llx",
1939 			    (uint64_t)ring->ring_md_base);
1940 			SK_DF(SK_VERB_NA | SK_VERB_RING,
1941 			    "  sd_base:    0x%llx",
1942 			    (uint64_t)ring->ring_sd_base);
1943 			SK_DF(SK_VERB_NA | SK_VERB_RING,
1944 			    "  h, t:    %u, %u, %u", ring->ring_head,
1945 			    ring->ring_tail);
1946 			SK_DF(SK_VERB_NA | SK_VERB_RING,
1947 			    "  md_size:    %d",
1948 			    (uint64_t)ring->ring_md_size);
1949 
1950 			/* make sure they're in synch */
1951 			_CASSERT(NR_RX == CR_KIND_RX);
1952 			_CASSERT(NR_TX == CR_KIND_TX);
1953 			_CASSERT(NR_A == CR_KIND_ALLOC);
1954 			_CASSERT(NR_F == CR_KIND_FREE);
1955 			_CASSERT(NR_EV == CR_KIND_EVENT);
1956 			_CASSERT(NR_LBA == CR_KIND_LARGE_BUF_ALLOC);
1957 
1958 skip_user_ring_setup:
1959 			/*
1960 			 * This flag tells na_kr_teardown_all() that it should
1961 			 * go thru the checks to free up the slot maps.
1962 			 */
1963 			kring->ckr_flags |= CKRF_MEM_SD_INITED;
1964 			if (t == NR_TX || t == NR_A || t == NR_EV || t == NR_LBA) {
1965 				kring->ckr_ksds_cache = arn->arn_txaksd_cache;
1966 			} else {
1967 				ASSERT(t == NR_RX || t == NR_F);
1968 				kring->ckr_ksds_cache = arn->arn_rxfksd_cache;
1969 			}
1970 			kring->ckr_ksds =
1971 			    skmem_cache_alloc(kring->ckr_ksds_cache,
1972 			    SKMEM_NOSLEEP);
1973 			if (kring->ckr_ksds == NULL) {
1974 				SK_ERR("Cannot allocate %s_ksds for kr "
1975 				    "0x%llx (\"%s\")", sk_ring2str(t),
1976 				    SK_KVA(kring), kring->ckr_name);
1977 				goto cleanup;
1978 			}
1979 			if (!KR_KERNEL_ONLY(kring)) {
1980 				skmem_cache_get_obj_info(kring->ckr_ksds_cache,
1981 				    kring->ckr_ksds, &oi, &oim);
1982 				kring->ckr_usds = SKMEM_OBJ_ADDR(&oim);
1983 			}
1984 			na_kr_slot_desc_init(kring->ckr_ksds,
1985 			    KR_KERNEL_ONLY(kring), kring->ckr_usds, ndesc);
1986 
1987 			/* cache last slot descriptor address */
1988 			ASSERT(kring->ckr_lim == (ndesc - 1));
1989 			kring->ckr_ksds_last = &kring->ckr_ksds[kring->ckr_lim];
1990 
1991 			if ((t < NR_TXRX) &&
1992 			    !(na->na_flags & NAF_USER_PKT_POOL) &&
1993 			    na_kr_populate_slots(kring) != 0) {
1994 				SK_ERR("Cannot allocate buffers for kr "
1995 				    "0x%llx (\"%s\")", SK_KVA(kring),
1996 				    kring->ckr_name);
1997 				goto cleanup;
1998 			}
1999 		}
2000 	}
2001 
2002 	return 0;
2003 
2004 cleanup:
2005 	na_kr_teardown_all(na, ch, FALSE);
2006 
2007 	return ENOMEM;
2008 }
2009 
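/*
 * Teardown common to a single kring: depopulate its slots, free the
 * slot descriptors and user ring when safe to do so, and mark the
 * kring defunct if requested.
 */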
2010 static void
2011 na_kr_teardown_common(struct nexus_adapter *na,
2012     struct __kern_channel_ring *kring, enum txrx t, struct kern_channel *ch,
2013     boolean_t defunct)
2014 {
2015 	struct skmem_arena_nexus *arn = skmem_arena_nexus(na->na_arena);
2016 	struct __user_channel_ring *ckr_ring;
2017 	boolean_t sd_idle, sd_inited;
2018 
2019 	ASSERT(arn != NULL);
2020 	kr_enter(kring, TRUE);
2021 	/*
2022 	 * Check for CKRF_MEM_SD_INITED and CKRF_MEM_RING_INITED
2023 	 * to make sure that the freeing needs to happen (else just
2024 	 * nullify the values).
2025 	 * If this adapter owns the memory for the slot descriptors,
2026 	 * check if the region is marked as busy (sd_idle is false)
2027 	 * and leave the kring's slot descriptor fields alone if so,
2028 	 * at defunct time.  At final teardown time, sd_idle must be
2029 	 * true else we assert; this indicates a missing call to
2030 	 * skmem_arena_nexus_sd_set_noidle().
2031 	 */
2032 	sd_inited = ((kring->ckr_flags & CKRF_MEM_SD_INITED) != 0);
2033 	if (sd_inited) {
2034 		/* callee will do KR_KSD(), so check */
2035 		if (((t < NR_TXRX) || (t == NR_EV)) &&
2036 		    (kring->ckr_ksds != NULL)) {
2037 			na_kr_depopulate_slots(kring, ch, defunct);
2038 		}
2039 		/* leave CKRF_MEM_SD_INITED flag alone until idle */
2040 		sd_idle = skmem_arena_nexus_sd_idle(arn);
2041 		VERIFY(sd_idle || defunct);
2042 	} else {
2043 		sd_idle = TRUE;
2044 	}
2045 
2046 	if (sd_idle) {
2047 		kring->ckr_flags &= ~CKRF_MEM_SD_INITED;
2048 		if (kring->ckr_ksds != NULL) {
2049 			if (sd_inited) {
2050 				skmem_cache_free(kring->ckr_ksds_cache,
2051 				    kring->ckr_ksds);
2052 			}
2053 			kring->ckr_ksds = NULL;
2054 			kring->ckr_ksds_last = NULL;
2055 			kring->ckr_usds = NULL;
2056 		}
2057 		ASSERT(kring->ckr_ksds_last == NULL);
2058 		ASSERT(kring->ckr_usds == NULL);
2059 	}
2060 
2061 	if ((ckr_ring = kring->ckr_ring) != NULL) {
2062 		kring->ckr_ring = NULL;
2063 	}
2064 
2065 	if (kring->ckr_flags & CKRF_MEM_RING_INITED) {
2066 		ASSERT(ckr_ring != NULL || KR_KERNEL_ONLY(kring));
2067 		if (ckr_ring != NULL) {
2068 			skmem_cache_free(arn->arn_ring_cache, ckr_ring);
2069 		}
2070 		kring->ckr_flags &= ~CKRF_MEM_RING_INITED;
2071 	}
2072 
2073 	if (defunct) {
2074 		/* if defunct, drop everything; see KR_DROP() */
2075 		kring->ckr_flags |= CKRF_DEFUNCT;
2076 	}
2077 	kr_exit(kring);
2078 }
2079 
2080 /*
2081  * Teardown ALL rings of a nexus adapter; this includes {tx,rx,alloc,free,event}
2082  */
2083 static void
2084 na_kr_teardown_all(struct nexus_adapter *na, struct kern_channel *ch,
2085     boolean_t defunct)
2086 {
2087 	enum txrx t;
2088 
2089 	ASSERT(na->na_arena->ar_type == SKMEM_ARENA_TYPE_NEXUS);
2090 
2091 	/* skip if this adapter has no allocated rings */
2092 	if (na->na_tx_rings == NULL) {
2093 		return;
2094 	}
2095 
2096 	for_all_rings(t) {
2097 		for (uint32_t i = 0; i < na_get_nrings(na, t); i++) {
2098 			na_kr_teardown_common(na, &NAKR(na, t)[i],
2099 			    t, ch, defunct);
2100 		}
2101 	}
2102 }
2103 
2104 /*
2105  * Teardown only {tx,rx} rings assigned to the channel.
2106  */
2107 static void
2108 na_kr_teardown_txrx(struct nexus_adapter *na, struct kern_channel *ch,
2109     boolean_t defunct, struct proc *p)
2110 {
2111 	enum txrx t;
2112 
2113 	ASSERT(na->na_arena->ar_type == SKMEM_ARENA_TYPE_NEXUS);
2114 
2115 	for_rx_tx(t) {
2116 		ring_id_t qfirst = ch->ch_first[t];
2117 		ring_id_t qlast = ch->ch_last[t];
2118 		uint32_t i;
2119 
2120 		for (i = qfirst; i < qlast; i++) {
2121 			struct __kern_channel_ring *kring = &NAKR(na, t)[i];
2122 			na_kr_teardown_common(na, kring, t, ch, defunct);
2123 
2124 			/*
2125 			 * Issue a notify to wake up anyone sleeping in kqueue
2126 			 * so that they notice the newly defuncted channels and
2127 			 * return an error
2128 			 */
2129 			kring->ckr_na_notify(kring, p, 0);
2130 		}
2131 	}
2132 }
2133 
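/*
 * Attach a packet from the ring's packet pool to each slot; packets
 * on user-visible rings also have their metadata externalized.  On
 * failure, detach and free whatever has been attached so far.
 */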
2134 static int
2135 na_kr_populate_slots(struct __kern_channel_ring *kring)
2136 {
2137 	const boolean_t kernel_only = KR_KERNEL_ONLY(kring);
2138 	struct nexus_adapter *na = KRNA(kring);
2139 	kern_pbufpool_t pp = kring->ckr_pp;
2140 	uint32_t nslots = kring->ckr_num_slots;
2141 	uint32_t start_idx, i;
2142 	uint32_t sidx = 0;      /* slot counter */
2143 	struct __kern_slot_desc *ksd;
2144 	struct __user_slot_desc *usd;
2145 	struct __kern_quantum *kqum;
2146 	nexus_type_t nexus_type;
2147 	int err = 0;
2148 
2149 	ASSERT(kring->ckr_tx < NR_TXRX);
2150 	ASSERT(!(KRNA(kring)->na_flags & NAF_USER_PKT_POOL));
2151 	ASSERT(na->na_arena->ar_type == SKMEM_ARENA_TYPE_NEXUS);
2152 	ASSERT(pp != NULL);
2153 
2154 	/*
2155 	 * xxx_ppool: remove this special case
2156 	 */
2157 	nexus_type = na->na_nxdom_prov->nxdom_prov_dom->nxdom_type;
2158 
2159 	switch (nexus_type) {
2160 	case NEXUS_TYPE_FLOW_SWITCH:
2161 	case NEXUS_TYPE_KERNEL_PIPE:
2162 		/*
2163 		 * xxx_ppool: This is temporary code until we come up with a
2164 		 * scheme for user space to alloc & attach packets to tx ring.
2165 		 */
2166 		if (kernel_only || kring->ckr_tx == NR_RX) {
2167 			return 0;
2168 		}
2169 		break;
2170 
2171 	case NEXUS_TYPE_NET_IF:
2172 		if (((na->na_type == NA_NETIF_DEV) ||
2173 		    (na->na_type == NA_NETIF_HOST)) &&
2174 		    (kernel_only || (kring->ckr_tx == NR_RX))) {
2175 			return 0;
2176 		}
2177 
2178 		ASSERT((na->na_type == NA_NETIF_COMPAT_DEV) ||
2179 		    (na->na_type == NA_NETIF_COMPAT_HOST) ||
2180 		    (na->na_type == NA_NETIF_DEV) ||
2181 		    (na->na_type == NA_NETIF_VP));
2182 
2183 		if (!kernel_only) {
2184 			if (kring->ckr_tx == NR_RX) {
2185 				return 0;
2186 			} else {
2187 				break;
2188 			}
2189 		}
2190 
2191 		ASSERT(kernel_only);
2192 
2193 		if ((na->na_type == NA_NETIF_COMPAT_DEV) ||
2194 		    (na->na_type == NA_NETIF_COMPAT_HOST)) {
2195 			return 0;
2196 		}
2197 		VERIFY(0);
2198 		/* NOTREACHED */
2199 		__builtin_unreachable();
2200 
2201 	case NEXUS_TYPE_USER_PIPE:
2202 	case NEXUS_TYPE_MONITOR:
2203 		break;
2204 
2205 	default:
2206 		VERIFY(0);
2207 		/* NOTREACHED */
2208 		__builtin_unreachable();
2209 	}
2210 
2211 	/* Fill the ring with packets */
2212 	sidx = start_idx = 0;
2213 	for (i = 0; i < nslots; i++) {
2214 		kqum = SK_PTR_ADDR_KQUM(pp_alloc_packet(pp, pp->pp_max_frags,
2215 		    SKMEM_NOSLEEP));
2216 		if (kqum == NULL) {
2217 			err = ENOMEM;
2218 			SK_ERR("ar 0x%llx (\"%s\") no more buffers "
2219 			    "after %u of %u, err %d", SK_KVA(na->na_arena),
2220 			    na->na_arena->ar_name, i, nslots, err);
2221 			goto cleanup;
2222 		}
2223 		ksd = KR_KSD(kring, i);
2224 		usd = (kernel_only ? NULL : KR_USD(kring, i));
2225 
2226 		/* attach packet to slot */
2227 		kqum->qum_ksd = ksd;
2228 		ASSERT(!KSD_VALID_METADATA(ksd));
2229 		KSD_ATTACH_METADATA(ksd, kqum);
2230 		if (usd != NULL) {
2231 			USD_ATTACH_METADATA(usd, METADATA_IDX(kqum));
2232 			kr_externalize_metadata(kring, pp->pp_max_frags,
2233 			    kqum, current_proc());
2234 		}
2235 
2236 		SK_DF(SK_VERB_MEM, " C ksd [%-3d, 0x%llx] kqum [%-3u, 0x%llx] "
2237 		    " kbuf[%-3u, 0x%llx]", i, SK_KVA(ksd), METADATA_IDX(kqum),
2238 		    SK_KVA(kqum), kqum->qum_buf[0].buf_idx,
2239 		    SK_KVA(&kqum->qum_buf[0]));
2240 		if (!(kqum->qum_qflags & QUM_F_KERNEL_ONLY)) {
2241 			SK_DF(SK_VERB_MEM, " C usd [%-3d, 0x%llx] "
2242 			    "uqum [%-3u, 0x%llx]  ubuf[%-3u, 0x%llx]",
2243 			    (int)(usd ? usd->sd_md_idx : OBJ_IDX_NONE),
2244 			    SK_KVA(usd), METADATA_IDX(kqum),
2245 			    SK_KVA(kqum->qum_user),
2246 			    kqum->qum_user->qum_buf[0].buf_idx,
2247 			    SK_KVA(&kqum->qum_user->qum_buf[0]));
2248 		}
2249 
2250 		sidx = SLOT_NEXT(sidx, kring->ckr_lim);
2251 	}
2252 
2253 	SK_DF(SK_VERB_NA | SK_VERB_RING, "ar 0x%llx (\"%s\") populated %u slots from idx %u",
2254 	    SK_KVA(na->na_arena), na->na_arena->ar_name, nslots, start_idx);
2255 
2256 cleanup:
2257 	if (err != 0) {
2258 		sidx = start_idx;
2259 		while (i-- > 0) {
2260 			ksd = KR_KSD(kring, i);
2261 			usd = (kernel_only ? NULL : KR_USD(kring, i));
2262 			kqum = ksd->sd_qum;
2263 
2264 			ASSERT(ksd == kqum->qum_ksd);
2265 			KSD_RESET(ksd);
2266 			if (usd != NULL) {
2267 				USD_RESET(usd);
2268 			}
2269 			/* detach packet from slot */
2270 			kqum->qum_ksd = NULL;
2271 			pp_free_packet(pp, SK_PTR_ADDR(kqum));
2272 
2273 			sidx = SLOT_NEXT(sidx, kring->ckr_lim);
2274 		}
2275 	}
2276 	return err;
2277 }
2278 
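/*
 * Detach and free the packets attached to the ring's slots.  With a
 * user packet pool, packets loaned to user space are first removed
 * from the pool's hash table of outstanding packets.
 */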
2279 static void
2280 na_kr_depopulate_slots(struct __kern_channel_ring *kring,
2281     struct kern_channel *ch, boolean_t defunct)
2282 {
2283 #pragma unused(ch)
2284 	const boolean_t kernel_only = KR_KERNEL_ONLY(kring);
2285 	uint32_t i, j, n = kring->ckr_num_slots;
2286 	struct nexus_adapter *na = KRNA(kring);
2287 	struct kern_pbufpool *pp = kring->ckr_pp;
2288 	boolean_t upp = FALSE;
2289 	obj_idx_t midx;
2290 
2291 	ASSERT((kring->ckr_tx < NR_TXRX) || (kring->ckr_tx == NR_EV));
2292 	LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
2293 
2294 	ASSERT(na->na_arena->ar_type == SKMEM_ARENA_TYPE_NEXUS);
2295 
2296 	if (((na->na_flags & NAF_USER_PKT_POOL) != 0) &&
2297 	    (kring->ckr_tx != NR_EV)) {
2298 		upp = TRUE;
2299 	}
2300 	for (i = 0, j = 0; i < n; i++) {
2301 		struct __kern_slot_desc *ksd = KR_KSD(kring, i);
2302 		struct __user_slot_desc *usd;
2303 		struct __kern_quantum *qum, *kqum;
2304 		boolean_t free_packet = FALSE;
2305 		int err;
2306 
2307 		if (!KSD_VALID_METADATA(ksd)) {
2308 			continue;
2309 		}
2310 
2311 		kqum = ksd->sd_qum;
2312 		usd = (kernel_only ? NULL : KR_USD(kring, i));
2313 		midx = METADATA_IDX(kqum);
2314 
2315 		/*
2316 		 * if the packet is internalized it should not be in the
2317 		 * hash table of packets loaned to user space.
2318 		 */
2319 		if (upp && (kqum->qum_qflags & QUM_F_INTERNALIZED)) {
2320 			if ((qum = pp_find_upp(pp, midx)) != NULL) {
2321 				panic("internalized packet 0x%llx in htbl",
2322 				    SK_KVA(qum));
2323 				/* NOTREACHED */
2324 				__builtin_unreachable();
2325 			}
2326 			free_packet = TRUE;
2327 		} else if (upp) {
2328 			/*
2329 			 * if the packet is not internalized check if it is
2330 			 * in the list of packets loaned to user-space.
2331 			 * Remove from the list before freeing.
2332 			 */
2333 			ASSERT(!(kqum->qum_qflags & QUM_F_INTERNALIZED));
2334 			qum = pp_remove_upp(pp, midx, &err);
2335 			if (err != 0) {
2336 				SK_ERR("un-allocated packet or buflet %d %p",
2337 				    midx, SK_KVA(qum));
2338 				if (qum != NULL) {
2339 					free_packet = TRUE;
2340 				}
2341 			}
2342 		} else {
2343 			free_packet = TRUE;
2344 		}
2345 
2346 		/*
2347 		 * Clear the user and kernel slot descriptors.  Note that
2348 		 * if we are depopulating the slots due to defunct (and not
2349 		 * due to normal deallocation/teardown), we leave the user
2350 		 * slot descriptor alone.  At that point the process may
2351 		 * be suspended, and later when it resumes it would just
2352 		 * pick up the original contents and move forward with
2353 		 * whatever it was doing.
2354 		 */
2355 		KSD_RESET(ksd);
2356 		if (usd != NULL && !defunct) {
2357 			USD_RESET(usd);
2358 		}
2359 
2360 		/* detach packet from slot */
2361 		kqum->qum_ksd = NULL;
2362 
2363 		SK_DF(SK_VERB_MEM, " D ksd [%-3d, 0x%llx] kqum [%-3u, 0x%llx] "
2364 		    " kbuf[%-3u, 0x%llx]", i, SK_KVA(ksd),
2365 		    METADATA_IDX(kqum), SK_KVA(kqum), kqum->qum_buf[0].buf_idx,
2366 		    SK_KVA(&kqum->qum_buf[0]));
2367 		if (!(kqum->qum_qflags & QUM_F_KERNEL_ONLY)) {
2368 			SK_DF(SK_VERB_MEM, " D usd [%-3u, 0x%llx] "
2369 			    "uqum [%-3u, 0x%llx]  ubuf[%-3u, 0x%llx]",
2370 			    (int)(usd ? usd->sd_md_idx : OBJ_IDX_NONE),
2371 			    SK_KVA(usd), METADATA_IDX(kqum),
2372 			    SK_KVA(kqum->qum_user),
2373 			    kqum->qum_user->qum_buf[0].buf_idx,
2374 			    SK_KVA(&kqum->qum_user->qum_buf[0]));
2375 		}
2376 
2377 		if (free_packet) {
2378 			pp_free_packet(pp, SK_PTR_ADDR(kqum)); ++j;
2379 		}
2380 	}
2381 
2382 	SK_DF(SK_VERB_NA | SK_VERB_RING, "ar 0x%llx (\"%s\") depopulated %u of %u slots",
2383 	    SK_KVA(KRNA(kring)->na_arena), KRNA(kring)->na_arena->ar_name,
2384 	    j, n);
2385 }
2386 
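/*
 * Create the krings array and, unless the adapter is krings-only
 * (NAF_MEM_NO_INIT), the memory backing the rings as well.
 * Call with SK_LOCK held.
 */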
2387 int
2388 na_rings_mem_setup(struct nexus_adapter *na,
2389     boolean_t alloc_ctx, struct kern_channel *ch)
2390 {
2391 	boolean_t kronly;
2392 	int err;
2393 
2394 	SK_LOCK_ASSERT_HELD();
2395 	ASSERT(na->na_channels == 0);
2396 	/*
2397 	 * If NAF_MEM_NO_INIT is set, then only create the krings and not
2398 	 * the backing memory regions for the adapter.
2399 	 */
2400 	kronly = (na->na_flags & NAF_MEM_NO_INIT);
2401 	ASSERT(!kronly || NA_KERNEL_ONLY(na));
2402 
2403 	/*
2404 	 * Create and initialize the common fields of the krings array,
2405 	 * using the information that must already be available in the na.
2406 	 */
2407 	if ((err = na_kr_create(na, alloc_ctx)) == 0 && !kronly) {
2408 		err = na_kr_setup(na, ch);
2409 		if (err != 0) {
2410 			na_kr_delete(na);
2411 		}
2412 	}
2413 
2414 	return err;
2415 }
2416 
2417 void
2418 na_rings_mem_teardown(struct nexus_adapter *na, struct kern_channel *ch,
2419     boolean_t defunct)
2420 {
2421 	SK_LOCK_ASSERT_HELD();
2422 	ASSERT(na->na_channels == 0 || (na->na_flags & NAF_DEFUNCT));
2423 
2424 	/*
2425 	 * Deletes the kring and ring array of the adapter. They
2426 	 * must have been created using na_rings_mem_setup().
2427 	 *
2428 	 * XXX: [email protected] -- the parameter "ch" should not be
2429 	 * needed here; however na_kr_depopulate_slots() needs to
2430 	 * go thru the channel's user packet pool hash, and so for
2431 	 * now we leave it here.
2432 	 */
2433 	na_kr_teardown_all(na, ch, defunct);
2434 	if (!defunct) {
2435 		na_kr_delete(na);
2436 	}
2437 }
2438 
2439 void
2440 na_ch_rings_defunct(struct kern_channel *ch, struct proc *p)
2441 {
2442 	LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
2443 
2444 	/*
2445 	 * Depopulate slots on the TX and RX rings of this channel,
2446 	 * but don't touch other rings owned by other channels if
2447 	 * this adapter is being shared.
2448 	 */
2449 	na_kr_teardown_txrx(ch->ch_na, ch, TRUE, p);
2450 }
2451 
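/*
 * Set or clear CKRF_DROP on all TX and RX rings, serializing against
 * any thread currently inside a ring via kr_enter()/kr_exit().
 */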
2452 void
2453 na_kr_drop(struct nexus_adapter *na, boolean_t drop)
2454 {
2455 	enum txrx t;
2456 	uint32_t i;
2457 
2458 	for_rx_tx(t) {
2459 		for (i = 0; i < na_get_nrings(na, t); i++) {
2460 			struct __kern_channel_ring *kring = &NAKR(na, t)[i];
2461 			int error;
2462 			error = kr_enter(kring, TRUE);
2463 			if (drop) {
2464 				kring->ckr_flags |= CKRF_DROP;
2465 			} else {
2466 				kring->ckr_flags &= ~CKRF_DROP;
2467 			}
2468 
2469 			if (error != 0) {
2470 				SK_ERR("na \"%s\" (0x%llx) kr \"%s\" (0x%llx) "
2471 				    "kr_enter failed %d",
2472 				    na->na_name, SK_KVA(na),
2473 				    kring->ckr_name, SK_KVA(kring),
2474 				    error);
2475 			} else {
2476 				kr_exit(kring);
2477 			}
2478 			SK_D("na \"%s\" (0x%llx) kr \"%s\" (0x%llx) "
2479 			    "krflags 0x%b", na->na_name, SK_KVA(na),
2480 			    kring->ckr_name, SK_KVA(kring), kring->ckr_flags,
2481 			    CKRF_BITS);
2482 		}
2483 	}
2484 }
2485 
2486 /*
2487  * Set the stopped/enabled status of a ring.  When stopping, we also wait
2488  * for all current activity on the ring to terminate.  The status change
2489  * is then notified via the adapter's na_notify callback.
2490  */
2491 static void
2492 na_set_ring(struct nexus_adapter *na, uint32_t ring_id, enum txrx t,
2493     uint32_t state)
2494 {
2495 	struct __kern_channel_ring *kr = &NAKR(na, t)[ring_id];
2496 
2497 	/*
2498 	 * Mark the ring as stopped/enabled, and run through the
2499 	 * locks to make sure other users get to see it.
2500 	 */
2501 	if (state == KR_READY) {
2502 		kr_start(kr);
2503 	} else {
2504 		kr_stop(kr, state);
2505 	}
2506 }
2507 
2508 
2509 /* stop or enable all the rings of na */
2510 static void
2511 na_set_all_rings(struct nexus_adapter *na, uint32_t state)
2512 {
2513 	uint32_t i;
2514 	enum txrx t;
2515 
2516 	SK_LOCK_ASSERT_HELD();
2517 
2518 	if (!NA_IS_ACTIVE(na)) {
2519 		return;
2520 	}
2521 
2522 	for_rx_tx(t) {
2523 		for (i = 0; i < na_get_nrings(na, t); i++) {
2524 			na_set_ring(na, i, t, state);
2525 		}
2526 	}
2527 }
2528 
2529 /*
2530  * Convenience function used in drivers.  Waits for current txsync()s/rxsync()s
2531  * to finish and prevents any new one from starting.  Call this before turning
2532  * Skywalk mode off, or before removing the harware rings (e.g., on module
2533  * onload).  As a rule of thumb for linux drivers, this should be placed near
2534  * each napi_disable().
2535  */
2536 void
2537 na_disable_all_rings(struct nexus_adapter *na)
2538 {
2539 	na_set_all_rings(na, KR_STOPPED);
2540 }
2541 
2542 /*
2543  * Convenience function used in drivers.  Re-enables rxsync and txsync on the
2544  * adapter's rings.  In Linux drivers, this should be placed near each
2545  * napi_enable().
2546  */
2547 void
2548 na_enable_all_rings(struct nexus_adapter *na)
2549 {
2550 	na_set_all_rings(na, KR_READY /* enabled */);
2551 }
2552 
2553 void
2554 na_lock_all_rings(struct nexus_adapter *na)
2555 {
2556 	na_set_all_rings(na, KR_LOCKED);
2557 }
2558 
2559 void
2560 na_unlock_all_rings(struct nexus_adapter *na)
2561 {
2562 	na_enable_all_rings(na);
2563 }
2564 
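/*
 * Connect a user channel to the nexus adapter: find (or create) the
 * adapter, bind the channel to it, and mmap the adapter's arena into
 * the process.  Any partially-established state is undone on failure.
 */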
2565 int
2566 na_connect(struct kern_nexus *nx, struct kern_channel *ch, struct chreq *chr,
2567     struct kern_channel *ch0, struct nxbind *nxb, struct proc *p)
2568 {
2569 	struct nexus_adapter *na = NULL;
2570 	mach_vm_size_t memsize = 0;
2571 	int err = 0;
2572 	enum txrx t;
2573 
2574 	ASSERT(!(chr->cr_mode & CHMODE_KERNEL));
2575 	ASSERT(!(ch->ch_flags & CHANF_KERNEL));
2576 
2577 	SK_LOCK_ASSERT_HELD();
2578 
2579 	/* find the nexus adapter and return the reference */
2580 	err = na_find(ch, nx, chr, ch0, nxb, p, &na, TRUE /* create */);
2581 	if (err != 0) {
2582 		ASSERT(na == NULL);
2583 		goto done;
2584 	}
2585 
2586 	if (NA_KERNEL_ONLY(na)) {
2587 		err = EBUSY;
2588 		goto done;
2589 	}
2590 
2591 	/* reject if the adapter is defunct or non-permissive */
2592 	if ((na->na_flags & NAF_DEFUNCT) || na_reject_channel(ch, na)) {
2593 		err = ENXIO;
2594 		goto done;
2595 	}
2596 
2597 	err = na_bind_channel(na, ch, chr);
2598 	if (err != 0) {
2599 		goto done;
2600 	}
2601 
2602 	ASSERT(ch->ch_schema != NULL);
2603 	ASSERT(na == ch->ch_na);
2604 
2605 	for_all_rings(t) {
2606 		if (na_get_nrings(na, t) == 0) {
2607 			ch->ch_si[t] = NULL;
2608 			continue;
2609 		}
2610 		ch->ch_si[t] = ch_is_multiplex(ch, t) ? &na->na_si[t] :
2611 		    &NAKR(na, t)[ch->ch_first[t]].ckr_si;
2612 	}
2613 
2614 	skmem_arena_get_stats(na->na_arena, &memsize, NULL);
2615 
2616 	if (!(skmem_arena_nexus(na->na_arena)->arn_mode &
2617 	    AR_NEXUS_MODE_EXTERNAL_PPOOL)) {
2618 		os_atomic_or(__DECONST(uint32_t *, &ch->ch_schema->csm_flags), CSM_PRIV_MEM, relaxed);
2619 	}
2620 
2621 	err = skmem_arena_mmap(na->na_arena, p, &ch->ch_mmap);
2622 	if (err != 0) {
2623 		goto done;
2624 	}
2625 
2626 	os_atomic_or(__DECONST(uint32_t *, &ch->ch_schema->csm_flags), CSM_ACTIVE, relaxed);
2627 	chr->cr_memsize = memsize;
2628 	chr->cr_memoffset = ch->ch_schema_offset;
2629 
2630 	SK_D("%s(%d) ch 0x%llx <-> nx 0x%llx (%s:\"%s\":%d:%d) na 0x%llx "
2631 	    "naflags %b", sk_proc_name_address(p), sk_proc_pid(p),
2632 	    SK_KVA(ch), SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name,
2633 	    na->na_name, (int)chr->cr_port, (int)chr->cr_ring_id, SK_KVA(na),
2634 	    na->na_flags, NAF_BITS);
2635 
2636 done:
2637 	if (err != 0) {
2638 		if (ch->ch_schema != NULL || na != NULL) {
2639 			if (ch->ch_schema != NULL) {
2640 				ASSERT(na == ch->ch_na);
2641 				/*
2642 				 * Callee will unmap memory region if needed,
2643 				 * as well as release reference held on 'na'.
2644 				 */
2645 				na_disconnect(nx, ch);
2646 				na = NULL;
2647 			}
2648 			if (na != NULL) {
2649 				(void) na_release_locked(na);
2650 				na = NULL;
2651 			}
2652 		}
2653 	}
2654 
2655 	return err;
2656 }
2657 
2658 void
2659 na_disconnect(struct kern_nexus *nx, struct kern_channel *ch)
2660 {
2661 #pragma unused(nx)
2662 	enum txrx t;
2663 
2664 	SK_LOCK_ASSERT_HELD();
2665 
2666 	SK_D("ch 0x%llx -!- nx 0x%llx (%s:\"%s\":%u:%d) na 0x%llx naflags %b",
2667 	    SK_KVA(ch), SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name,
2668 	    ch->ch_na->na_name, ch->ch_info->cinfo_nx_port,
2669 	    (int)ch->ch_info->cinfo_ch_ring_id, SK_KVA(ch->ch_na),
2670 	    ch->ch_na->na_flags, NAF_BITS);
2671 
2672 	/* destroy mapping and release references */
2673 	na_unbind_channel(ch);
2674 	ASSERT(ch->ch_na == NULL);
2675 	ASSERT(ch->ch_schema == NULL);
2676 	for_all_rings(t) {
2677 		ch->ch_si[t] = NULL;
2678 	}
2679 }
2680 
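/*
 * Mark the adapter as defunct, depopulate its ring slots, and defunct
 * the backing memory arena if this adapter owns it (i.e. NAF_MEM_LOANED
 * is not set).  The caller indicates via 'locked' whether it already
 * holds the channel lock.
 */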
2681 void
2682 na_defunct(struct kern_nexus *nx, struct kern_channel *ch,
2683     struct nexus_adapter *na, boolean_t locked)
2684 {
2685 #pragma unused(nx)
2686 	SK_LOCK_ASSERT_HELD();
2687 	if (!locked) {
2688 		lck_mtx_lock(&ch->ch_lock);
2689 	}
2690 
2691 	LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
2692 
2693 	if (!(na->na_flags & NAF_DEFUNCT)) {
2694 		/*
2695 		 * Mark this adapter as defunct to inform nexus-specific
2696 		 * teardown handler called by na_teardown() below.
2697 		 */
2698 		os_atomic_or(&na->na_flags, NAF_DEFUNCT, relaxed);
2699 
2700 		/*
2701 		 * Depopulate slots.
2702 		 */
2703 		na_teardown(na, ch, TRUE);
2704 
2705 		/*
2706 		 * And finally destroy any already-defunct memory regions.
2707 		 * Do this only if the nexus adapter owns the arena, i.e.
2708 		 * NAF_MEM_LOANED is not set.  Otherwise, we'd expect
2709 		 * that this routine be called again for the real owner.
2710 		 */
2711 		if (!(na->na_flags & NAF_MEM_LOANED)) {
2712 			skmem_arena_defunct(na->na_arena);
2713 		}
2714 	}
2715 
2716 	SK_D("%s(%d): ch 0x%llx -/- nx 0x%llx (%s:\"%s\":%u:%d) "
2717 	    "na 0x%llx naflags %b", ch->ch_name, ch->ch_pid,
2718 	    SK_KVA(ch), SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name,
2719 	    na->na_name, ch->ch_info->cinfo_nx_port,
2720 	    (int)ch->ch_info->cinfo_ch_ring_id, SK_KVA(na),
2721 	    na->na_flags, NAF_BITS);
2722 
2723 	if (!locked) {
2724 		lck_mtx_unlock(&ch->ch_lock);
2725 	}
2726 }
2727 
2728 /*
2729  * TODO: [email protected] -- merge this into na_connect()
2730  */
2731 int
2732 na_connect_spec(struct kern_nexus *nx, struct kern_channel *ch,
2733     struct chreq *chr, struct proc *p)
2734 {
2735 #pragma unused(p)
2736 	struct nexus_adapter *na = NULL;
2737 	mach_vm_size_t memsize = 0;
2738 	int error = 0;
2739 	enum txrx t;
2740 
2741 	ASSERT(chr->cr_mode & CHMODE_KERNEL);
2742 	ASSERT(ch->ch_flags & CHANF_KERNEL);
2743 	ASSERT(ch->ch_na == NULL);
2744 	ASSERT(ch->ch_schema == NULL);
2745 
2746 	SK_LOCK_ASSERT_HELD();
2747 
2748 	error = na_find(ch, nx, chr, NULL, NULL, kernproc, &na, TRUE);
2749 	if (error != 0) {
2750 		goto done;
2751 	}
2752 
2753 	if (na == NULL) {
2754 		error = EINVAL;
2755 		goto done;
2756 	}
2757 
2758 	if (na->na_channels > 0) {
2759 		error = EBUSY;
2760 		goto done;
2761 	}
2762 
2763 	if (na->na_flags & NAF_DEFUNCT) {
2764 		error = ENXIO;
2765 		goto done;
2766 	}
2767 
2768 	/*
2769 	 * Special connect requires the nexus adapter to handle its
2770 	 * own channel binding and unbinding via na_special(); bail
2771 	 * if this adapter doesn't support it.
2772 	 */
2773 	if (na->na_special == NULL) {
2774 		error = ENOTSUP;
2775 		goto done;
2776 	}
2777 
2778 	/* upon success, "ch->ch_na" will point to "na" */
2779 	error = na->na_special(na, ch, chr, NXSPEC_CMD_CONNECT);
2780 	if (error != 0) {
2781 		ASSERT(ch->ch_na == NULL);
2782 		goto done;
2783 	}
2784 
2785 	ASSERT(na->na_flags & NAF_SPEC_INIT);
2786 	ASSERT(na == ch->ch_na);
2787 	/* make sure this is still the case */
2788 	ASSERT(ch->ch_schema == NULL);
2789 
2790 	for_rx_tx(t) {
2791 		ch->ch_si[t] = ch_is_multiplex(ch, t) ? &na->na_si[t] :
2792 		    &NAKR(na, t)[ch->ch_first[t]].ckr_si;
2793 	}
2794 
2795 	skmem_arena_get_stats(na->na_arena, &memsize, NULL);
2796 	chr->cr_memsize = memsize;
2797 
2798 	SK_D("%s(%d) ch 0x%llx <-> nx 0x%llx (%s:\"%s\":%d:%d) na 0x%llx "
2799 	    "naflags %b", sk_proc_name_address(p), sk_proc_pid(p),
2800 	    SK_KVA(ch), SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name,
2801 	    na->na_name, (int)chr->cr_port, (int)chr->cr_ring_id, SK_KVA(na),
2802 	    na->na_flags, NAF_BITS);
2803 
2804 done:
2805 	if (error != 0) {
2806 		if (ch->ch_na != NULL || na != NULL) {
2807 			if (ch->ch_na != NULL) {
2808 				ASSERT(na == ch->ch_na);
2809 				/* callee will release reference on 'na' */
2810 				na_disconnect_spec(nx, ch);
2811 				na = NULL;
2812 			}
2813 			if (na != NULL) {
2814 				(void) na_release_locked(na);
2815 				na = NULL;
2816 			}
2817 		}
2818 	}
2819 
2820 	return error;
2821 }
2822 
2823 /*
2824  * TODO: [email protected] -- merge this into na_disconnect()
2825  */
2826 void
2827 na_disconnect_spec(struct kern_nexus *nx, struct kern_channel *ch)
2828 {
2829 #pragma unused(nx)
2830 	struct nexus_adapter *na = ch->ch_na;
2831 	enum txrx t;
2832 	int error;
2833 
2834 	SK_LOCK_ASSERT_HELD();
2835 	ASSERT(na != NULL);
2836 	ASSERT(na->na_flags & NAF_SPEC_INIT);   /* has been bound */
2837 
2838 	SK_D("ch 0x%llx -!- nx 0x%llx (%s:\"%s\":%u:%d) na 0x%llx naflags %b",
2839 	    SK_KVA(ch), SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name,
2840 	    na->na_name, ch->ch_info->cinfo_nx_port,
2841 	    (int)ch->ch_info->cinfo_ch_ring_id, SK_KVA(na),
2842 	    na->na_flags, NAF_BITS);
2843 
2844 	/* take a reference for this routine */
2845 	na_retain_locked(na);
2846 
2847 	ASSERT(ch->ch_flags & CHANF_KERNEL);
2848 	ASSERT(ch->ch_schema == NULL);
2849 	ASSERT(na->na_special != NULL);
2850 	/* unbind this channel */
2851 	error = na->na_special(na, ch, NULL, NXSPEC_CMD_DISCONNECT);
2852 	ASSERT(error == 0);
2853 	ASSERT(!(na->na_flags & NAF_SPEC_INIT));
2854 
2855 	/* now release our reference; this may be the last */
2856 	na_release_locked(na);
2857 	na = NULL;
2858 
2859 	ASSERT(ch->ch_na == NULL);
2860 	for_rx_tx(t) {
2861 		ch->ch_si[t] = NULL;
2862 	}
2863 }
2864 
2865 void
2866 na_start_spec(struct kern_nexus *nx, struct kern_channel *ch)
2867 {
2868 #pragma unused(nx)
2869 	struct nexus_adapter *na = ch->ch_na;
2870 
2871 	SK_LOCK_ASSERT_HELD();
2872 
2873 	ASSERT(ch->ch_flags & CHANF_KERNEL);
2874 	ASSERT(NA_KERNEL_ONLY(na));
2875 	ASSERT(na->na_special != NULL);
2876 
2877 	na->na_special(na, ch, NULL, NXSPEC_CMD_START);
2878 }
2879 
2880 void
2881 na_stop_spec(struct kern_nexus *nx, struct kern_channel *ch)
2882 {
2883 #pragma unused(nx)
2884 	struct nexus_adapter *na = ch->ch_na;
2885 
2886 	SK_LOCK_ASSERT_HELD();
2887 
2888 	ASSERT(ch->ch_flags & CHANF_KERNEL);
2889 	ASSERT(NA_KERNEL_ONLY(na));
2890 	ASSERT(na->na_special != NULL);
2891 
2892 	na->na_special(na, ch, NULL, NXSPEC_CMD_STOP);
2893 }
2894 
2895 /*
2896  * MUST BE CALLED UNDER SK_LOCK()
2897  *
2898  * Get a refcounted reference to a nexus adapter attached
2899  * to the interface specified by chr.
2900  * This is always called in the execution of an ioctl().
2901  *
2902  * Return ENXIO if the interface specified by the request does
2903  * not exist, ENOTSUP if Skywalk is not supported by the interface,
2904  * EINVAL if parameters are invalid, ENOMEM if needed resources
2905  * could not be allocated.
2906  * If successful, hold a reference to the nexus adapter.
2907  *
2908  * No reference is kept on the real interface, which may then
2909  * disappear at any time.
2910  */
2911 int
2912 na_find(struct kern_channel *ch, struct kern_nexus *nx, struct chreq *chr,
2913     struct kern_channel *ch0, struct nxbind *nxb, struct proc *p,
2914     struct nexus_adapter **na, boolean_t create)
2915 {
2916 	int error = 0;
2917 
2918 	_CASSERT(sizeof(chr->cr_name) == sizeof((*na)->na_name));
2919 
2920 	*na = NULL;     /* default return value */
2921 
2922 	SK_LOCK_ASSERT_HELD();
2923 
2924 	/*
2925 	 * We cascade through all possible types of nexus adapter.
2926 	 * All nx_*_na_find() functions return an error and an na,
2927 	 * with the following combinations:
2928 	 *
2929 	 * error    na
2930 	 *   0	   NULL		type doesn't match
2931 	 *  !0	   NULL		type matches, but na creation/lookup failed
2932 	 *   0	  !NULL		type matches and na created/found
2933 	 *  !0    !NULL		impossible
2934 	 */
2935 
2936 #if CONFIG_NEXUS_MONITOR
2937 	/* try to see if this is a monitor port */
2938 	error = nx_monitor_na_find(nx, ch, chr, ch0, nxb, p, na, create);
2939 	if (error != 0 || *na != NULL) {
2940 		return error;
2941 	}
2942 #endif /* CONFIG_NEXUS_MONITOR */
2943 #if CONFIG_NEXUS_USER_PIPE
2944 	/* try to see if this is a pipe port */
2945 	error = nx_upipe_na_find(nx, ch, chr, nxb, p, na, create);
2946 	if (error != 0 || *na != NULL) {
2947 		return error;
2948 	}
2949 #endif /* CONFIG_NEXUS_USER_PIPE */
2950 #if CONFIG_NEXUS_KERNEL_PIPE
2951 	/* try to see if this is a kernel pipe port */
2952 	error = nx_kpipe_na_find(nx, ch, chr, nxb, p, na, create);
2953 	if (error != 0 || *na != NULL) {
2954 		return error;
2955 	}
2956 #endif /* CONFIG_NEXUS_KERNEL_PIPE */
2957 #if CONFIG_NEXUS_FLOWSWITCH
2958 	/* try to see if this is a flowswitch port */
2959 	error = nx_fsw_na_find(nx, ch, chr, nxb, p, na, create);
2960 	if (error != 0 || *na != NULL) {
2961 		return error;
2962 	}
2963 #endif /* CONFIG_NEXUS_FLOWSWITCH */
2964 #if CONFIG_NEXUS_NETIF
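	/* try to see if this is a netif port */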
2965 	error = nx_netif_na_find(nx, ch, chr, nxb, p, na, create);
2966 	if (error != 0 || *na != NULL) {
2967 		return error;
2968 	}
2969 #endif /* CONFIG_NEXUS_NETIF */
2970 
2971 	ASSERT(*na == NULL);
2972 	return ENXIO;
2973 }
2974 
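/* take an extra reference on the adapter; call with SK_LOCK held */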
2975 void
2976 na_retain_locked(struct nexus_adapter *na)
2977 {
2978 	SK_LOCK_ASSERT_HELD();
2979 
2980 	if (na != NULL) {
2981 #if SK_LOG
2982 		uint32_t oref = os_atomic_inc_orig(&na->na_refcount, relaxed);
2983 		SK_DF(SK_VERB_REFCNT, "na \"%s\" (0x%llx) refcnt %u chcnt %u",
2984 		    na->na_name, SK_KVA(na), oref + 1, na->na_channels);
2985 #else /* !SK_LOG */
2986 		os_atomic_inc(&na->na_refcount, relaxed);
2987 #endif /* !SK_LOG */
2988 	}
2989 }
2990 
2991 /* returns 1 iff the nexus_adapter is destroyed */
2992 int
2993 na_release_locked(struct nexus_adapter *na)
2994 {
2995 	uint32_t oref;
2996 
2997 	SK_LOCK_ASSERT_HELD();
2998 
2999 	ASSERT(na->na_refcount > 0);
3000 	oref = os_atomic_dec_orig(&na->na_refcount, relaxed);
3001 	if (oref > 1) {
3002 		SK_DF(SK_VERB_REFCNT, "na \"%s\" (0x%llx) refcnt %u chcnt %u",
3003 		    na->na_name, SK_KVA(na), oref - 1, na->na_channels);
3004 		return 0;
3005 	}
3006 	ASSERT(na->na_channels == 0);
3007 
3008 	if (na->na_dtor != NULL) {
3009 		na->na_dtor(na);
3010 	}
3011 
3012 	ASSERT(na->na_tx_rings == NULL && na->na_rx_rings == NULL);
3013 	ASSERT(na->na_slot_ctxs == NULL);
3014 	ASSERT(na->na_scratch == NULL);
3015 
3016 #if CONFIG_NEXUS_USER_PIPE
3017 	nx_upipe_na_dealloc(na);
3018 #endif /* CONFIG_NEXUS_USER_PIPE */
3019 	if (na->na_arena != NULL) {
3020 		skmem_arena_release(na->na_arena);
3021 		na->na_arena = NULL;
3022 	}
3023 
3024 	SK_DF(SK_VERB_MEM, "na \"%s\" (0x%llx) being freed",
3025 	    na->na_name, SK_KVA(na));
3026 
3027 	NA_FREE(na);
3028 	return 1;
3029 }
3030 
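/* allocate a zero-filled pseudo nexus adapter */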
3031 static struct nexus_adapter *
3032 na_pseudo_alloc(zalloc_flags_t how)
3033 {
3034 	struct nexus_adapter *na;
3035 
3036 	na = zalloc_flags(na_pseudo_zone, how | Z_ZERO);
3037 	if (na) {
3038 		na->na_type = NA_PSEUDO;
3039 		na->na_free = na_pseudo_free;
3040 	}
3041 	return na;
3042 }
3043 
3044 static void
3045 na_pseudo_free(struct nexus_adapter *na)
3046 {
3047 	ASSERT(na->na_refcount == 0);
3048 	SK_DF(SK_VERB_MEM, "na 0x%llx FREE", SK_KVA(na));
3049 	bzero(na, sizeof(*na));
3050 	zfree(na_pseudo_zone, na);
3051 }
3052 
3053 static int
3054 na_pseudo_txsync(struct __kern_channel_ring *kring, struct proc *p,
3055     uint32_t flags)
3056 {
3057 #pragma unused(kring, p, flags)
3058 	SK_DF(SK_VERB_SYNC | SK_VERB_TX,
3059 	    "%s(%d) kr \"%s\" (0x%llx) krflags 0x%b ring %u flags 0%x",
3060 	    sk_proc_name_address(p), sk_proc_pid(p), kring->ckr_name,
3061 	    SK_KVA(kring), kring->ckr_flags, CKRF_BITS, kring->ckr_ring_id,
3062 	    flags);
3063 
3064 	return 0;
3065 }
3066 
3067 static int
3068 na_pseudo_rxsync(struct __kern_channel_ring *kring, struct proc *p,
3069     uint32_t flags)
3070 {
3071 #pragma unused(kring, p, flags)
3072 	SK_DF(SK_VERB_SYNC | SK_VERB_RX,
3073 	    "%s(%d) kr \"%s\" (0x%llx) krflags 0x%b ring %u flags 0%x",
3074 	    sk_proc_name_address(p), sk_proc_pid(p), kring->ckr_name,
3075 	    SK_KVA(kring), kring->ckr_flags, CKRF_BITS, kring->ckr_ring_id,
3076 	    flags);
3077 
3078 	ASSERT(kring->ckr_rhead <= kring->ckr_lim);
3079 
3080 	return 0;
3081 }
3082 
3083 static int
3084 na_pseudo_activate(struct nexus_adapter *na, na_activate_mode_t mode)
3085 {
3086 	SK_D("na \"%s\" (0x%llx) %s", na->na_name,
3087 	    SK_KVA(na), na_activate_mode2str(mode));
3088 
3089 	switch (mode) {
3090 	case NA_ACTIVATE_MODE_ON:
3091 		os_atomic_or(&na->na_flags, NAF_ACTIVE, relaxed);
3092 		break;
3093 
3094 	case NA_ACTIVATE_MODE_DEFUNCT:
3095 		break;
3096 
3097 	case NA_ACTIVATE_MODE_OFF:
3098 		os_atomic_andnot(&na->na_flags, NAF_ACTIVE, relaxed);
3099 		break;
3100 
3101 	default:
3102 		VERIFY(0);
3103 		/* NOTREACHED */
3104 		__builtin_unreachable();
3105 	}
3106 
3107 	return 0;
3108 }
3109 
3110 static void
3111 na_pseudo_dtor(struct nexus_adapter *na)
3112 {
3113 #pragma unused(na)
3114 }
3115 
3116 static int
3117 na_pseudo_krings_create(struct nexus_adapter *na, struct kern_channel *ch)
3118 {
3119 	return na_rings_mem_setup(na, FALSE, ch);
3120 }
3121 
3122 static void
3123 na_pseudo_krings_delete(struct nexus_adapter *na, struct kern_channel *ch,
3124     boolean_t defunct)
3125 {
3126 	na_rings_mem_teardown(na, ch, defunct);
3127 }
3128 
3129 /*
3130  * Pseudo nexus adapter; typically used as a generic parent adapter.
3131  */
3132 int
3133 na_pseudo_create(struct kern_nexus *nx, struct chreq *chr,
3134     struct nexus_adapter **ret)
3135 {
3136 	struct nxprov_params *nxp = NX_PROV(nx)->nxprov_params;
3137 	struct nexus_adapter *na;
3138 	int error;
3139 
3140 	SK_LOCK_ASSERT_HELD();
3141 	*ret = NULL;
3142 
3143 	na = na_pseudo_alloc(Z_WAITOK);
3144 
3145 	ASSERT(na->na_type == NA_PSEUDO);
3146 	ASSERT(na->na_free == na_pseudo_free);
3147 
3148 	(void) strncpy(na->na_name, chr->cr_name, sizeof(na->na_name) - 1);
3149 	na->na_name[sizeof(na->na_name) - 1] = '\0';
3150 	uuid_generate_random(na->na_uuid);
3151 
3152 	/*
3153 	 * Verify upper bounds; for all cases including user pipe nexus,
3154 	 * the parameters must have already been validated by corresponding
3155 	 * nxdom_prov_params() function defined by each domain.
3156 	 */
3157 	na_set_nrings(na, NR_TX, nxp->nxp_tx_rings);
3158 	na_set_nrings(na, NR_RX, nxp->nxp_rx_rings);
3159 	na_set_nslots(na, NR_TX, nxp->nxp_tx_slots);
3160 	na_set_nslots(na, NR_RX, nxp->nxp_rx_slots);
3161 	ASSERT(na_get_nrings(na, NR_TX) <= NX_DOM(nx)->nxdom_tx_rings.nb_max);
3162 	ASSERT(na_get_nrings(na, NR_RX) <= NX_DOM(nx)->nxdom_rx_rings.nb_max);
3163 	ASSERT(na_get_nslots(na, NR_TX) <= NX_DOM(nx)->nxdom_tx_slots.nb_max);
3164 	ASSERT(na_get_nslots(na, NR_RX) <= NX_DOM(nx)->nxdom_rx_slots.nb_max);
3165 
3166 	na->na_txsync = na_pseudo_txsync;
3167 	na->na_rxsync = na_pseudo_rxsync;
3168 	na->na_activate = na_pseudo_activate;
3169 	na->na_dtor = na_pseudo_dtor;
3170 	na->na_krings_create = na_pseudo_krings_create;
3171 	na->na_krings_delete = na_pseudo_krings_delete;
3172 
3173 	*(nexus_stats_type_t *)(uintptr_t)&na->na_stats_type =
3174 	    NEXUS_STATS_TYPE_INVALID;
3175 
3176 	/* other fields are set in the common routine */
3177 	na_attach_common(na, nx, NX_DOM_PROV(nx));
3178 
3179 	if ((error = NX_DOM_PROV(nx)->nxdom_prov_mem_new(NX_DOM_PROV(nx),
3180 	    nx, na)) != 0) {
3181 		ASSERT(na->na_arena == NULL);
3182 		goto err;
3183 	}
3184 	ASSERT(na->na_arena != NULL);
3185 
3186 	*(uint32_t *)(uintptr_t)&na->na_flowadv_max = nxp->nxp_flowadv_max;
3187 	ASSERT(na->na_flowadv_max == 0 ||
3188 	    skmem_arena_nexus(na->na_arena)->arn_flowadv_obj != NULL);
3189 
3190 #if SK_LOG
3191 	uuid_string_t uuidstr;
3192 	SK_D("na_name: \"%s\"", na->na_name);
3193 	SK_D("  UUID:        %s", sk_uuid_unparse(na->na_uuid, uuidstr));
3194 	SK_D("  nx:          0x%llx (\"%s\":\"%s\")",
3195 	    SK_KVA(na->na_nx), NX_DOM(na->na_nx)->nxdom_name,
3196 	    NX_DOM_PROV(na->na_nx)->nxdom_prov_name);
3197 	SK_D("  flags:       %b", na->na_flags, NAF_BITS);
3198 	SK_D("  flowadv_max: %u", na->na_flowadv_max);
3199 	SK_D("  rings:       tx %u rx %u",
3200 	    na_get_nrings(na, NR_TX), na_get_nrings(na, NR_RX));
3201 	SK_D("  slots:       tx %u rx %u",
3202 	    na_get_nslots(na, NR_TX), na_get_nslots(na, NR_RX));
3203 #if CONFIG_NEXUS_USER_PIPE
3204 	SK_D("  next_pipe:   %u", na->na_next_pipe);
3205 	SK_D("  max_pipes:   %u", na->na_max_pipes);
3206 #endif /* CONFIG_NEXUS_USER_PIPE */
3207 #endif /* SK_LOG */
3208 
3209 	*ret = na;
3210 	na_retain_locked(na);
3211 
3212 	return 0;
3213 
3214 err:
3215 	if (na != NULL) {
3216 		if (na->na_arena != NULL) {
3217 			skmem_arena_release(na->na_arena);
3218 			na->na_arena = NULL;
3219 		}
3220 		NA_FREE(na);
3221 	}
3222 	return error;
3223 }
3224 
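/*
 * Mark the flow advisory entry at fe_idx as valid, recording the
 * flow's UUID and flow ID.
 */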
3225 void
3226 na_flowadv_entry_alloc(const struct nexus_adapter *na, uuid_t fae_id,
3227     const flowadv_idx_t fe_idx, const uint32_t flowid)
3228 {
3229 	struct skmem_arena *ar = na->na_arena;
3230 	struct skmem_arena_nexus *arn = skmem_arena_nexus(na->na_arena);
3231 	struct __flowadv_entry *fae;
3232 
3233 	ASSERT(NA_IS_ACTIVE(na) && na->na_flowadv_max != 0);
3234 	ASSERT(ar->ar_type == SKMEM_ARENA_TYPE_NEXUS);
3235 
3236 	AR_LOCK(ar);
3237 
3238 	/* we must not get here if arena is defunct; this must be valid */
3239 	ASSERT(arn->arn_flowadv_obj != NULL);
3240 
3241 	VERIFY(fe_idx < na->na_flowadv_max);
3242 	fae = &arn->arn_flowadv_obj[fe_idx];
3243 	uuid_copy(fae->fae_id, fae_id);
3244 	fae->fae_flowid = flowid;
3245 	fae->fae_flags = FLOWADVF_VALID;
3246 
3247 	AR_UNLOCK(ar);
3248 }
3249 
3250 void
3251 na_flowadv_entry_free(const struct nexus_adapter *na, uuid_t fae_id,
3252     const flowadv_idx_t fe_idx, const uint32_t flowid)
3253 {
3254 #pragma unused(fae_id)
3255 	struct skmem_arena *ar = na->na_arena;
3256 	struct skmem_arena_nexus *arn = skmem_arena_nexus(ar);
3257 
3258 	ASSERT(NA_IS_ACTIVE(na) && (na->na_flowadv_max != 0));
3259 	ASSERT(ar->ar_type == SKMEM_ARENA_TYPE_NEXUS);
3260 
3261 	AR_LOCK(ar);
3262 
3263 	ASSERT(arn->arn_flowadv_obj != NULL || (ar->ar_flags & ARF_DEFUNCT));
3264 	if (arn->arn_flowadv_obj != NULL) {
3265 		struct __flowadv_entry *fae;
3266 
3267 		VERIFY(fe_idx < na->na_flowadv_max);
3268 		fae = &arn->arn_flowadv_obj[fe_idx];
3269 		ASSERT(uuid_compare(fae->fae_id, fae_id) == 0);
3270 		uuid_clear(fae->fae_id);
3271 		VERIFY(fae->fae_flowid == flowid);
3272 		fae->fae_flowid = 0;
3273 		fae->fae_flags = 0;
3274 	}
3275 
3276 	AR_UNLOCK(ar);
3277 }

bool
na_flowadv_set(const struct nexus_adapter *na, const flowadv_idx_t fe_idx,
    const flowadv_token_t flow_token)
{
	struct skmem_arena *ar = na->na_arena;
	struct skmem_arena_nexus *arn = skmem_arena_nexus(ar);
	bool suspend;

	ASSERT(NA_IS_ACTIVE(na) && (na->na_flowadv_max != 0));
	ASSERT(fe_idx < na->na_flowadv_max);
	ASSERT(ar->ar_type == SKMEM_ARENA_TYPE_NEXUS);

	AR_LOCK(ar);

	ASSERT(arn->arn_flowadv_obj != NULL || (ar->ar_flags & ARF_DEFUNCT));

	if (arn->arn_flowadv_obj != NULL) {
		struct __flowadv_entry *fae = &arn->arn_flowadv_obj[fe_idx];

		_CASSERT(sizeof(fae->fae_token) == sizeof(flow_token));
		/*
		 * We cannot guarantee that the flow is still around by now,
		 * so check if that's the case and let the caller know.
		 */
		if ((suspend = (fae->fae_token == flow_token))) {
			ASSERT(fae->fae_flags & FLOWADVF_VALID);
			fae->fae_flags |= FLOWADVF_SUSPENDED;
		}
	} else {
		suspend = false;
	}
	if (suspend) {
		SK_DF(SK_VERB_FLOW_ADVISORY, "%s(%d) flow token 0x%x fidx %u "
		    "SUSPEND", sk_proc_name_address(current_proc()),
		    sk_proc_pid(current_proc()), flow_token, fe_idx);
	} else {
		SK_ERR("%s(%d) flow token 0x%x fidx %u no longer around",
		    sk_proc_name_address(current_proc()),
		    sk_proc_pid(current_proc()), flow_token, fe_idx);
	}

	AR_UNLOCK(ar);

	return suspend;
}

int
na_flowadv_clear(const struct kern_channel *ch, const flowadv_idx_t fe_idx,
    const flowadv_token_t flow_token)
{
	struct nexus_adapter *na = ch->ch_na;
	struct skmem_arena *ar = na->na_arena;
	struct skmem_arena_nexus *arn = skmem_arena_nexus(ar);
	boolean_t resume;

	ASSERT(NA_IS_ACTIVE(na) && (na->na_flowadv_max != 0));
	ASSERT(fe_idx < na->na_flowadv_max);
	ASSERT(ar->ar_type == SKMEM_ARENA_TYPE_NEXUS);

	AR_LOCK(ar);

	ASSERT(arn->arn_flowadv_obj != NULL || (ar->ar_flags & ARF_DEFUNCT));

	if (arn->arn_flowadv_obj != NULL) {
		struct __flowadv_entry *fae = &arn->arn_flowadv_obj[fe_idx];

		_CASSERT(sizeof(fae->fae_token) == sizeof(flow_token));
		/*
		 * We cannot guarantee that the flow is still around by now,
		 * so check if that's the case and let the caller know.
		 */
		if ((resume = (fae->fae_token == flow_token))) {
			ASSERT(fae->fae_flags & FLOWADVF_VALID);
			fae->fae_flags &= ~FLOWADVF_SUSPENDED;
		}
	} else {
		resume = FALSE;
	}
	if (resume) {
		SK_DF(SK_VERB_FLOW_ADVISORY, "%s(%d): flow token 0x%x "
		    "fidx %u RESUME", ch->ch_name, ch->ch_pid, flow_token,
		    fe_idx);
	} else {
		SK_ERR("%s(%d): flow token 0x%x fidx %u no longer around",
		    ch->ch_name, ch->ch_pid, flow_token, fe_idx);
	}

	AR_UNLOCK(ar);

	return resume;
}
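
/*
 * Editor's note: a standalone model (not from the original source) of
 * the token check shared by na_flowadv_set() and na_flowadv_clear()
 * above. A flow advisory entry may be reused after the flow goes away,
 * so both routines mutate the entry only if the caller's token still
 * matches, and report the outcome:
 */
#if 0 /* illustrative sketch; not compiled */
static bool
flowadv_apply_sketch(struct __flowadv_entry *fae,
    const flowadv_token_t token, bool suspend)
{
	if (fae->fae_token != token) {
		return false;           /* flow no longer around */
	}
	if (suspend) {
		fae->fae_flags |= FLOWADVF_SUSPENDED;
	} else {
		fae->fae_flags &= ~FLOWADVF_SUSPENDED;
	}
	return true;
}
#endif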

int
na_flowadv_report_ce_event(const struct kern_channel *ch, const flowadv_idx_t fe_idx,
    const flowadv_token_t flow_token, uint32_t ce_cnt, uint32_t total_pkt_cnt)
{
	struct nexus_adapter *na = ch->ch_na;
	struct skmem_arena *ar = na->na_arena;
	struct skmem_arena_nexus *arn = skmem_arena_nexus(ar);
	boolean_t added;

	ASSERT(NA_IS_ACTIVE(na) && (na->na_flowadv_max != 0));
	ASSERT(fe_idx < na->na_flowadv_max);
	ASSERT(ar->ar_type == SKMEM_ARENA_TYPE_NEXUS);

	AR_LOCK(ar);

	ASSERT(arn->arn_flowadv_obj != NULL || (ar->ar_flags & ARF_DEFUNCT));

	if (arn->arn_flowadv_obj != NULL) {
		struct __flowadv_entry *fae = &arn->arn_flowadv_obj[fe_idx];

		_CASSERT(sizeof(fae->fae_token) == sizeof(flow_token));
		/*
		 * We cannot guarantee that the flow is still around by now,
		 * so check if that's the case and let the caller know.
		 */
		if ((added = (fae->fae_token == flow_token))) {
			ASSERT(fae->fae_flags & FLOWADVF_VALID);
			fae->fae_ce_cnt += ce_cnt;
			fae->fae_pkt_cnt += total_pkt_cnt;
		}
	} else {
		added = FALSE;
	}
	if (added) {
		SK_DF(SK_VERB_FLOW_ADVISORY, "%s(%d): flow token 0x%x "
		    "fidx %u ce cnt incremented", ch->ch_name,
		    ch->ch_pid, flow_token, fe_idx);
	} else {
		SK_ERR("%s(%d): flow token 0x%x fidx %u no longer around",
		    ch->ch_name, ch->ch_pid, flow_token, fe_idx);
	}

	AR_UNLOCK(ar);

	return added;
}
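
/*
 * Editor's note (not from the original source): the counters updated in
 * na_flowadv_report_ce_event() let a consumer derive the CE-marked
 * share of a flow's packets over an interval, e.g. as parts per
 * thousand; the helper below is an illustrative sketch only.
 */
#if 0 /* illustrative sketch; not compiled */
static uint32_t
flowadv_ce_permille_sketch(const struct __flowadv_entry *fae)
{
	if (fae->fae_pkt_cnt == 0) {
		return 0;
	}
	return (uint32_t)((fae->fae_ce_cnt * 1000ULL) / fae->fae_pkt_cnt);
}
#endif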

void
na_flowadv_event(struct __kern_channel_ring *kring)
{
	ASSERT(kring->ckr_tx == NR_TX);

	SK_DF(SK_VERB_EVENTS, "%s(%d) na \"%s\" (0x%llx) kr 0x%llx",
	    sk_proc_name_address(current_proc()), sk_proc_pid(current_proc()),
	    KRNA(kring)->na_name, SK_KVA(KRNA(kring)), SK_KVA(kring));

	na_post_event(kring, TRUE, FALSE, FALSE, CHAN_FILT_HINT_FLOW_ADV_UPD);
}

static int
na_packet_pool_free_sync(struct __kern_channel_ring *kring, struct proc *p,
    uint32_t flags)
{
#pragma unused(flags, p)
	int n, ret = 0;
	slot_idx_t j;
	struct __kern_slot_desc *ksd;
	struct __user_slot_desc *usd;
	struct __kern_quantum *kqum;
	struct kern_pbufpool *pp = kring->ckr_pp;
	uint32_t nfree = 0;

	/* packet pool list is protected by channel lock */
	ASSERT(!KR_KERNEL_ONLY(kring));

	/* # of new slots */
	n = kring->ckr_rhead - kring->ckr_khead;
	if (n < 0) {
		n += kring->ckr_num_slots;
	}

	/* nothing to free */
	if (__improbable(n == 0)) {
		SK_DF(SK_VERB_MEM | SK_VERB_SYNC, "%s(%d) kr \"%s\" %s",
		    sk_proc_name_address(p), sk_proc_pid(p), kring->ckr_name,
		    "nothing to free");
		goto done;
	}

	j = kring->ckr_khead;
	PP_LOCK(pp);
	while (n--) {
		int err;

		ksd = KR_KSD(kring, j);
		usd = KR_USD(kring, j);

		if (__improbable(!SD_VALID_METADATA(usd))) {
			SK_ERR("bad slot %d 0x%llx", j, SK_KVA(ksd));
			ret = EINVAL;
			break;
		}

		kqum = pp_remove_upp_locked(pp, usd->sd_md_idx, &err);
		if (__improbable(err != 0)) {
			SK_ERR("un-allocated packet or buflet %d 0x%llx",
			    usd->sd_md_idx, SK_KVA(kqum));
			ret = EINVAL;
			break;
		}

		/* detach and free the packet */
		kqum->qum_qflags &= ~QUM_F_FINALIZED;
		kqum->qum_ksd = NULL;
		ASSERT(!KSD_VALID_METADATA(ksd));
		USD_DETACH_METADATA(usd);
		ASSERT(pp == kqum->qum_pp);
		ASSERT(nfree < kring->ckr_num_slots);
		kring->ckr_scratch[nfree++] = (uint64_t)kqum;
		j = SLOT_NEXT(j, kring->ckr_lim);
	}
	PP_UNLOCK(pp);

	if (__probable(nfree > 0)) {
		pp_free_packet_batch(pp, &kring->ckr_scratch[0], nfree);
	}

	kring->ckr_khead = j;
	kring->ckr_ktail = SLOT_PREV(j, kring->ckr_lim);

done:
	return ret;
}
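
/*
 * Editor's note: a standalone model (not from the original source) of
 * the modular index arithmetic used by the sync routines above and
 * below. Ring indices wrap, so a negative difference is corrected by
 * the ring size; e.g. with num_slots 8, khead 6 and rhead 2, the count
 * is 2 - 6 = -4, corrected to 4 new slots.
 */
#if 0 /* illustrative sketch; not compiled */
static int
ring_new_slots_sketch(int rhead, int khead, int num_slots)
{
	int n = rhead - khead;

	if (n < 0) {
		n += num_slots;
	}
	return n;
}
#endif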

#define MAX_BUFLETS 64
static int
alloc_packets(kern_pbufpool_t pp, uint64_t *buf_arr, bool large, uint32_t *ph_cnt)
{
	int err;
	uint32_t need, need_orig, remain, alloced, i;
	uint64_t buflets[MAX_BUFLETS];
	uint64_t *pkts;

	need_orig = *ph_cnt;
	err = kern_pbufpool_alloc_batch_nosleep(pp, large ? 0 : 1, buf_arr, ph_cnt);
	if (!large) {
		return err;
	}
	if (*ph_cnt == 0) {
		SK_ERR("failed to alloc %d packets for alloc ring: err %d",
		    need_orig, err);
		DTRACE_SKYWALK2(alloc__pkts__fail, uint32_t, need_orig, int, err);
		return err;
	}
	need = remain = *ph_cnt;
	alloced = 0;
	pkts = buf_arr;
	while (remain > 0) {
		uint32_t cnt, cnt_orig;

		cnt = MIN(remain, MAX_BUFLETS);
		cnt_orig = cnt;
		err = pp_alloc_buflet_batch(pp, buflets, &cnt, SKMEM_NOSLEEP, true);
		if (cnt == 0) {
			SK_ERR("failed to alloc %d buflets for alloc ring: "
			    "remain %d, err %d", cnt_orig, remain, err);
			DTRACE_SKYWALK3(alloc__bufs__fail, uint32_t, cnt_orig,
			    uint32_t, remain, int, err);
			break;
		}
		for (i = 0; i < cnt; i++) {
			kern_packet_t ph = (kern_packet_t)pkts[i];
			kern_buflet_t buf = (kern_buflet_t)buflets[i];
			kern_buflet_t pbuf = kern_packet_get_next_buflet(ph, NULL);
			VERIFY(kern_packet_add_buflet(ph, pbuf, buf) == 0);
			buflets[i] = 0;
		}
		DTRACE_SKYWALK3(alloc__bufs, uint32_t, remain, uint32_t, cnt,
		    uint32_t, cnt_orig);
		pkts += cnt;
		alloced += cnt;
		remain -= cnt;
	}
	/* free packets without attached buffers */
	if (remain > 0) {
		DTRACE_SKYWALK1(remaining__pkts, uint32_t, remain);
		ASSERT(remain + alloced == need);
		pp_free_packet_batch(pp, pkts, remain);

		/* pp_free_packet_batch() should clear the pkts array */
		for (i = 0; i < remain; i++) {
			ASSERT(pkts[i] == 0);
		}
	}
	*ph_cnt = alloced;
	if (*ph_cnt == 0) {
		err = ENOMEM;
	} else if (*ph_cnt < need_orig) {
		err = EAGAIN;
	} else {
		err = 0;
	}
	DTRACE_SKYWALK3(alloc__packets, uint32_t, need_orig, uint32_t, *ph_cnt, int, err);
	return err;
}
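
/*
 * Editor's note (not from the original source): alloc_packets() above
 * is a two-stage operation when "large" is set: bare packets are
 * allocated first (zero buflets), then large buflets are attached in
 * chunks of MAX_BUFLETS. Its return value follows the contract modeled
 * below: ENOMEM when nothing was allocated, EAGAIN on a partial batch,
 * 0 when the request was fully satisfied.
 */
#if 0 /* illustrative sketch; not compiled */
static int
alloc_result_sketch(uint32_t got, uint32_t wanted)
{
	if (got == 0) {
		return ENOMEM;          /* nothing allocated */
	}
	if (got < wanted) {
		return EAGAIN;          /* partial; caller may retry */
	}
	return 0;                       /* fully satisfied */
}
#endif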

static int
na_packet_pool_alloc_sync_common(struct __kern_channel_ring *kring, struct proc *p,
    uint32_t flags, bool large)
{
	int b, err;
	uint32_t n = 0;
	slot_idx_t j;
	uint64_t now;
	uint32_t curr_ws, ph_needed, ph_cnt;
	struct __kern_slot_desc *ksd;
	struct __user_slot_desc *usd;
	struct __kern_quantum *kqum;
	kern_pbufpool_t pp = kring->ckr_pp;
	pid_t pid = proc_pid(p);

	/* packet pool list is protected by channel lock */
	ASSERT(!KR_KERNEL_ONLY(kring));
	ASSERT(!PP_KERNEL_ONLY(pp));

	now = _net_uptime;
	if ((flags & NA_SYNCF_UPP_PURGE) != 0) {
		if (now - kring->ckr_sync_time >= na_upp_reap_interval) {
			kring->ckr_alloc_ws = na_upp_reap_min_pkts;
		}
		SK_DF(SK_VERB_MEM | SK_VERB_SYNC,
		    "%s: purged curr_ws(%d)", kring->ckr_name,
		    kring->ckr_alloc_ws);
		return 0;
	}
	/* reclaim the completed slots */
	kring->ckr_khead = kring->ckr_rhead;

	/* # of busy (unclaimed) slots */
	b = kring->ckr_ktail - kring->ckr_khead;
	if (b < 0) {
		b += kring->ckr_num_slots;
	}

	curr_ws = kring->ckr_alloc_ws;
	if (flags & NA_SYNCF_FORCE_UPP_SYNC) {
		/* increment the working set by 50% */
		curr_ws += (curr_ws >> 1);
		curr_ws = MIN(curr_ws, kring->ckr_lim);
	} else {
		if ((now - kring->ckr_sync_time >= na_upp_ws_hold_time) &&
		    (uint32_t)b >= (curr_ws >> 2)) {
			/* decrease the working set by 25% */
			curr_ws -= (curr_ws >> 2);
		}
	}
	curr_ws = MAX(curr_ws, na_upp_alloc_lowat);
	if (curr_ws > (uint32_t)b) {
		n = curr_ws - b;
	}
	kring->ckr_alloc_ws = curr_ws;
	kring->ckr_sync_time = now;

	/* min with # of avail free slots (subtract busy from max) */
	n = ph_needed = MIN(n, kring->ckr_lim - b);
	j = kring->ckr_ktail;
	SK_DF(SK_VERB_MEM | SK_VERB_SYNC,
	    "%s: curr_ws(%d), n(%d)", kring->ckr_name, curr_ws, n);

	if ((ph_cnt = ph_needed) == 0) {
		goto done;
	}

	err = alloc_packets(pp, kring->ckr_scratch,
	    PP_HAS_BUFFER_ON_DEMAND(pp) && large, &ph_cnt);
	if (__improbable(ph_cnt == 0)) {
		SK_ERR("kr 0x%llx failed to alloc %u packets (%d)",
		    SK_KVA(kring), ph_needed, err);
		kring->ckr_err_stats.cres_pkt_alloc_failures += ph_needed;
	} else {
		/*
		 * Add packets to the allocated list of user packet pool.
		 */
		pp_insert_upp_batch(pp, pid, kring->ckr_scratch, ph_cnt);
	}

	for (n = 0; n < ph_cnt; n++) {
		ksd = KR_KSD(kring, j);
		usd = KR_USD(kring, j);

		kqum = SK_PTR_ADDR_KQUM(kring->ckr_scratch[n]);
		kring->ckr_scratch[n] = 0;
		ASSERT(kqum != NULL);

		/* cleanup any stale slot mapping */
		KSD_RESET(ksd);
		ASSERT(usd != NULL);
		USD_RESET(usd);

		/*
		 * Since this packet is freshly allocated and we need to
		 * have the flag set for the attach to succeed, just set
		 * it here rather than calling __packet_finalize().
		 */
		kqum->qum_qflags |= QUM_F_FINALIZED;

		/* Attach packet to slot */
		KR_SLOT_ATTACH_METADATA(kring, ksd, kqum);
		/*
		 * externalize the packet as it is being transferred to
		 * user space.
		 */
		kr_externalize_metadata(kring, pp->pp_max_frags, kqum, p);

		j = SLOT_NEXT(j, kring->ckr_lim);
	}
done:
	ASSERT(j != kring->ckr_khead || j == kring->ckr_ktail);
	kring->ckr_ktail = j;
	return 0;
}
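
/*
 * Editor's note: a standalone model (not from the original source) of
 * the working-set heuristic above. The target grows by 50% (capped at
 * the ring limit) on a forced sync, shrinks by 25% when enough busy
 * slots linger past the hold time, and never drops below the low
 * watermark:
 */
#if 0 /* illustrative sketch; not compiled */
static uint32_t
upp_ws_sketch(uint32_t curr_ws, uint32_t busy, uint32_t lim,
    uint32_t lowat, bool force, bool past_hold_time)
{
	if (force) {
		curr_ws += (curr_ws >> 1);      /* +50% */
		curr_ws = MIN(curr_ws, lim);
	} else if (past_hold_time && busy >= (curr_ws >> 2)) {
		curr_ws -= (curr_ws >> 2);      /* -25% */
	}
	return MAX(curr_ws, lowat);
}
#endif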

static int
na_packet_pool_alloc_sync(struct __kern_channel_ring *kring, struct proc *p,
    uint32_t flags)
{
	return na_packet_pool_alloc_sync_common(kring, p, flags, false);
}

static int
na_packet_pool_alloc_large_sync(struct __kern_channel_ring *kring, struct proc *p,
    uint32_t flags)
{
	return na_packet_pool_alloc_sync_common(kring, p, flags, true);
}

static int
na_packet_pool_free_buf_sync(struct __kern_channel_ring *kring, struct proc *p,
    uint32_t flags)
{
#pragma unused(flags, p)
	int n, ret = 0;
	slot_idx_t j;
	struct __kern_slot_desc *ksd;
	struct __user_slot_desc *usd;
	struct __kern_buflet *kbft;
	struct kern_pbufpool *pp = kring->ckr_pp;

	/* packet pool list is protected by channel lock */
	ASSERT(!KR_KERNEL_ONLY(kring));

	/* # of new slots */
	n = kring->ckr_rhead - kring->ckr_khead;
	if (n < 0) {
		n += kring->ckr_num_slots;
	}

	/* nothing to free */
	if (__improbable(n == 0)) {
		SK_DF(SK_VERB_MEM | SK_VERB_SYNC, "%s(%d) kr \"%s\" %s",
		    sk_proc_name_address(p), sk_proc_pid(p), kring->ckr_name,
		    "nothing to free");
		goto done;
	}

	j = kring->ckr_khead;
	while (n--) {
		int err;

		ksd = KR_KSD(kring, j);
		usd = KR_USD(kring, j);

		if (__improbable(!SD_VALID_METADATA(usd))) {
			SK_ERR("bad slot %d 0x%llx", j, SK_KVA(ksd));
			ret = EINVAL;
			break;
		}

		kbft = pp_remove_upp_bft(pp, usd->sd_md_idx, &err);
		if (__improbable(err != 0)) {
			SK_ERR("un-allocated buflet %d 0x%llx", usd->sd_md_idx,
			    SK_KVA(kbft));
			ret = EINVAL;
			break;
		}

		/* detach and free the buflet */
		ASSERT(!KSD_VALID_METADATA(ksd));
		USD_DETACH_METADATA(usd);
		pp_free_buflet(pp, kbft);
		j = SLOT_NEXT(j, kring->ckr_lim);
	}
	kring->ckr_khead = j;
	kring->ckr_ktail = SLOT_PREV(j, kring->ckr_lim);

done:
	return ret;
}
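
/*
 * Editor's note (not from the original source): the routine above
 * mirrors na_packet_pool_free_sync(), but for standalone buflets; note
 * that buflets are returned one at a time via pp_free_buflet(), whereas
 * the packet variant batches frees through ckr_scratch[].
 */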

static int
na_packet_pool_alloc_buf_sync(struct __kern_channel_ring *kring, struct proc *p,
    uint32_t flags)
{
	int b, err;
	uint32_t n = 0;
	slot_idx_t j;
	uint64_t now;
	uint32_t curr_ws, bh_needed, bh_cnt;
	struct __kern_slot_desc *ksd;
	struct __user_slot_desc *usd;
	struct __kern_buflet *kbft;
	struct __kern_buflet_ext *kbe;
	kern_pbufpool_t pp = kring->ckr_pp;
	pid_t pid = proc_pid(p);

	/* packet pool list is protected by channel lock */
	ASSERT(!KR_KERNEL_ONLY(kring));
	ASSERT(!PP_KERNEL_ONLY(pp));

	now = _net_uptime;
	if ((flags & NA_SYNCF_UPP_PURGE) != 0) {
		if (now - kring->ckr_sync_time >= na_upp_reap_interval) {
			kring->ckr_alloc_ws = na_upp_reap_min_pkts;
		}
		SK_DF(SK_VERB_MEM | SK_VERB_SYNC,
		    "%s: purged curr_ws(%d)", kring->ckr_name,
		    kring->ckr_alloc_ws);
		return 0;
	}
	/* reclaim the completed slots */
	kring->ckr_khead = kring->ckr_rhead;

	/* # of busy (unclaimed) slots */
	b = kring->ckr_ktail - kring->ckr_khead;
	if (b < 0) {
		b += kring->ckr_num_slots;
	}

	curr_ws = kring->ckr_alloc_ws;
	if (flags & NA_SYNCF_FORCE_UPP_SYNC) {
		/* increment the working set by 50% */
		curr_ws += (curr_ws >> 1);
		curr_ws = MIN(curr_ws, kring->ckr_lim);
	} else {
		if ((now - kring->ckr_sync_time >= na_upp_ws_hold_time) &&
		    (uint32_t)b >= (curr_ws >> 2)) {
			/* decrease the working set by 25% */
			curr_ws -= (curr_ws >> 2);
		}
	}
	curr_ws = MAX(curr_ws, na_upp_alloc_buf_lowat);
	if (curr_ws > (uint32_t)b) {
		n = curr_ws - b;
	}
	kring->ckr_alloc_ws = curr_ws;
	kring->ckr_sync_time = now;

	/* min with # of avail free slots (subtract busy from max) */
	n = bh_needed = MIN(n, kring->ckr_lim - b);
	j = kring->ckr_ktail;
	SK_DF(SK_VERB_MEM | SK_VERB_SYNC,
	    "%s: curr_ws(%d), n(%d)", kring->ckr_name, curr_ws, n);

	if ((bh_cnt = bh_needed) == 0) {
		goto done;
	}

	err = pp_alloc_buflet_batch(pp, kring->ckr_scratch, &bh_cnt,
	    SKMEM_NOSLEEP, false);

	if (bh_cnt == 0) {
		SK_ERR("kr 0x%llx failed to alloc %u buflets (%d)",
		    SK_KVA(kring), bh_needed, err);
		kring->ckr_err_stats.cres_pkt_alloc_failures += bh_needed;
	}

	for (n = 0; n < bh_cnt; n++) {
		struct __user_buflet *ubft;

		ksd = KR_KSD(kring, j);
		usd = KR_USD(kring, j);

		kbft = (struct __kern_buflet *)(kring->ckr_scratch[n]);
		kbe = (struct __kern_buflet_ext *)kbft;
		kring->ckr_scratch[n] = 0;
		ASSERT(kbft != NULL);

		/*
		 * Add buflet to the allocated list of user packet pool.
		 */
		pp_insert_upp_bft(pp, kbft, pid);

		/*
		 * externalize the buflet as it is being transferred to
		 * user space.
		 */
		ubft = __DECONST(struct __user_buflet *, kbe->kbe_buf_user);
		KBUF_EXTERNALIZE(kbft, ubft, pp);

		/* cleanup any stale slot mapping */
		KSD_RESET(ksd);
		ASSERT(usd != NULL);
		USD_RESET(usd);

		/* Attach buflet to slot */
		KR_SLOT_ATTACH_BUF_METADATA(kring, ksd, kbft);

		j = SLOT_NEXT(j, kring->ckr_lim);
	}
done:
	ASSERT(j != kring->ckr_khead || j == kring->ckr_ktail);
	kring->ckr_ktail = j;
	return 0;
}
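
/*
 * Editor's note (not from the original source): both alloc-sync paths
 * share the same slot protocol. Slots in [khead, rhead) have been
 * consumed by userspace and are reclaimed up front; freshly allocated
 * packets or buflets are then attached starting at the old ktail, and
 * ktail is advanced past them to publish the new allocations.
 */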

/* The caller needs to ensure that the NA stays intact */
void
na_drain(struct nexus_adapter *na, boolean_t purge)
{
	/* will be cleared on next channel sync */
	if (!(os_atomic_or_orig(&na->na_flags, NAF_DRAINING, relaxed) &
	    NAF_DRAINING) && NA_IS_ACTIVE(na)) {
		SK_DF(SK_VERB_NA, "%s: %s na 0x%llx flags %b",
		    na->na_name, (purge ? "purging" : "pruning"),
		    SK_KVA(na), na->na_flags, NAF_BITS);

		/* reap (purge/prune) caches in the arena */
		skmem_arena_reap(na->na_arena, purge);
	}
}
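
/*
 * Editor's note: a standalone model (not from the original source) of
 * the atomic gate above. os_atomic_or_orig() returns the flags as they
 * were before the OR, so only the caller that flips NAF_DRAINING from
 * clear to set performs the reap; repeat callers are no-ops until the
 * flag is cleared on the next channel sync.
 */
#if 0 /* illustrative sketch; not compiled */
static bool
drain_gate_sketch(struct nexus_adapter *na)
{
	return (os_atomic_or_orig(&na->na_flags, NAF_DRAINING, relaxed) &
	    NAF_DRAINING) == 0;
}
#endif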