xref: /xnu-12377.81.4/bsd/skywalk/nexus/nexus.c (revision 043036a2b3718f7f0be807e2870f8f47d3fa0796)
1 /*
2  * Copyright (c) 2015-2022 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <skywalk/os_skywalk_private.h>
30 #include <skywalk/nexus/netif/nx_netif.h>
31 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
32 #include <sys/sdt.h>
33 
34 #include <kern/uipc_domain.h>
35 
36 static uint32_t disable_nxctl_check = 0;
37 #if (DEVELOPMENT || DEBUG)
38 SYSCTL_UINT(_kern_skywalk, OID_AUTO, disable_nxctl_check,
39     CTLFLAG_RW | CTLFLAG_LOCKED, &disable_nxctl_check, 0, "");
40 #endif
41 
42 LCK_GRP_DECLARE(nexus_lock_group, "sk_nx_lock");
43 LCK_GRP_DECLARE(nexus_mbq_lock_group, "sk_nx_mbq_lock");
44 LCK_GRP_DECLARE(nexus_pktq_lock_group, "sk_nx_pktq_lock");
45 LCK_ATTR_DECLARE(nexus_lock_attr, 0, 0);
46 
47 static STAILQ_HEAD(, nxctl) nxctl_head =
48     STAILQ_HEAD_INITIALIZER(nxctl_head);
49 static STAILQ_HEAD(, kern_nexus_provider) nxprov_head =
50     STAILQ_HEAD_INITIALIZER(nxprov_head);
51 
52 static int nx_cmp(const struct kern_nexus *, const struct kern_nexus *);
53 RB_HEAD(kern_nexus_tree, kern_nexus);
54 RB_PROTOTYPE_SC(static, kern_nexus_tree, kern_nexus, nx_link, nx_cmp);
55 RB_GENERATE(kern_nexus_tree, kern_nexus, nx_link, nx_cmp);
56 static struct kern_nexus_tree   nx_head;
57 
58 static int nxctl_get_nexus_prov_list(struct nxctl *, struct sockopt *);
59 static int nxctl_get_nexus_prov_entry(struct nxctl *, struct sockopt *);
60 static int nxctl_get_nexus_list(struct nxctl *, struct sockopt *);
61 static int nxctl_nexus_bind(struct nxctl *, struct sockopt *);
62 static int nxctl_nexus_unbind(struct nxctl *, struct sockopt *);
63 static int nxctl_nexus_config(struct nxctl *, struct sockopt *);
64 static int nxctl_get_channel_list(struct nxctl *, struct sockopt *);
65 static void nxctl_retain_locked(struct nxctl *);
66 static int nxctl_release_locked(struct nxctl *);
67 static void nxctl_init(struct nxctl *, struct proc *, struct fileproc *);
68 static struct nxctl *nxctl_alloc(struct proc *, struct fileproc *, zalloc_flags_t);
69 static void nxctl_free(struct nxctl *);
70 
71 static struct kern_nexus_provider *nxprov_create_common(struct nxctl *,
72     struct kern_nexus_domain_provider *, struct nxprov_reg *,
73     const struct kern_nexus_provider_init *init, int *);
74 static void nxprov_detach(struct kern_nexus_provider *, boolean_t);
75 static void nxprov_retain_locked(struct kern_nexus_provider *);
76 static int nxprov_release_locked(struct kern_nexus_provider *);
77 static struct kern_nexus_provider *nxprov_alloc(
78 	struct kern_nexus_domain_provider *, zalloc_flags_t);
79 static void nxprov_free(struct kern_nexus_provider *);
80 
81 static int nx_init_rings(struct kern_nexus *, struct kern_channel *);
82 static void nx_fini_rings(struct kern_nexus *, struct kern_channel *);
83 static int nx_init_slots(struct kern_nexus *, struct __kern_channel_ring *);
84 static void nx_fini_slots(struct kern_nexus *, struct __kern_channel_ring *);
85 static struct kern_nexus *nx_alloc(zalloc_flags_t);
86 static void nx_free(struct kern_nexus *);
87 
88 static SKMEM_TYPE_DEFINE(nxctl_zone, struct nxctl);
89 
90 static SKMEM_TYPE_DEFINE(nxbind_zone, struct nxbind);
91 
92 static SKMEM_TYPE_DEFINE(nxprov_zone, struct kern_nexus_provider);
93 
94 static SKMEM_TYPE_DEFINE(nxprov_params_zone, struct nxprov_params);
95 
96 static SKMEM_TYPE_DEFINE(nx_zone, struct kern_nexus);
97 
98 static int __nx_inited = 0;
99 
100 #define SKMEM_TAG_NX_KEY        "com.apple.skywalk.nexus.key"
101 SKMEM_TAG_DEFINE(skmem_tag_nx_key, SKMEM_TAG_NX_KEY);
102 
103 #define SKMEM_TAG_NX_MIB        "com.apple.skywalk.nexus.mib"
104 static SKMEM_TAG_DEFINE(skmem_tag_nx_mib, SKMEM_TAG_NX_MIB);
105 
106 #define SKMEM_TAG_NX_PORT        "com.apple.skywalk.nexus.port"
107 SKMEM_TAG_DEFINE(skmem_tag_nx_port, SKMEM_TAG_NX_PORT);
108 
109 #define SKMEM_TAG_NX_PORT_INFO        "com.apple.skywalk.nexus.port.info"
110 SKMEM_TAG_DEFINE(skmem_tag_nx_port_info, SKMEM_TAG_NX_PORT_INFO);
111 
112 /*
113  * Special nexus controller handle for Skywalk internal use.  Unlike all
114  * other nexus controller handles that are created by userland or kernel
115  * clients, this one never gets closed or freed.  It is also not part of
116  * the global nxctl_head list.
117  */
118 static struct nxctl _kernnxctl;
119 static struct nxctl _usernxctl;
120 struct nexus_controller kernnxctl = { .ncd_nxctl = &_kernnxctl };
121 struct nexus_controller usernxctl = { .ncd_nxctl = &_usernxctl };
122 
123 /*
124  * -fbounds-safety: For static functions where additional size variables are
125  * added, we need to mark them __unused if this file is being built without
126  * -fbounds-safety.
127  */
128 #if !__has_ptrcheck
129 #define NX_FB_ARG __unused
130 #else
131 #define NX_FB_ARG
132 #endif
133 
/*
 * One-time initialization of the nexus layer.  Called with the Skywalk
 * lock (SK_LOCK) held, before any nexus provider or instance exists.
 * Sets up the nexus tree, nexus adapters, built-in domains, and the two
 * special controller handles.  Always returns 0.
 */
int
nexus_init(void)
{
	SK_LOCK_ASSERT_HELD();
	ASSERT(!__nx_inited);

	RB_INIT(&nx_head);

	na_init();

	/* attach system built-in domains and domain providers */
	nxdom_attach_all();

	/*
	 * Initialize the private kernel and shared user nexus controller
	 * handles.
	 *
	 * The shared kernel controller is used internally for creating
	 * nexus providers and nexus instances from within the Skywalk
	 * code (e.g. netif_compat).
	 *
	 * The shared user controller is used from userspace by clients
	 * (e.g. libnetcore) that would like to call into nexus instances
	 * for use cases like configuring flow entries that they own
	 * indirectly (e.g. via NECP), so that the nexus would perform
	 * permission checks based on other info (e.g. PID, UUID) and
	 * bypass the nxctl ownership check (this nxctl has no
	 * credentials).
	 */
	nxctl_init(&_kernnxctl, kernproc, NULL);
	nxctl_retain_locked(&_kernnxctl);       /* one for us */
	nxctl_init(&_usernxctl, kernproc, NULL);
	nxctl_retain_locked(&_usernxctl);       /* one for us */
	nxctl_traffic_rule_init();

	__nx_inited = 1;

	return 0;
}
170 
/*
 * Tear down the nexus layer.  Called with SK_LOCK held; a no-op if
 * nexus_init() never completed.  All nexus instances must be gone by
 * the time this runs.
 */
void
nexus_fini(void)
{
	SK_LOCK_ASSERT_HELD();

	if (__nx_inited) {
		nxctl_traffic_rule_fini();
		/* drop the references taken in nexus_init() */
		nxctl_release_locked(&_kernnxctl);
		nxctl_release_locked(&_usernxctl);

		/* tell all domains they're going away */
		nxdom_detach_all();

		/* no nexus instance may outlive the domains */
		ASSERT(RB_EMPTY(&nx_head));

		na_fini();

		__nx_inited = 0;
	}
}
191 
/*
 * Create a nexus controller handle for a userland or kernel client,
 * attach it to the global nxctl_head list, and return it with one
 * reference held for the caller (a second reference is held by the
 * list itself).
 */
struct nxctl *
nxctl_create(struct proc *p, struct fileproc *fp, const uuid_t nxctl_uuid,
    int *err)
{
	struct nxctl *nxctl = NULL;

	ASSERT(!uuid_is_null(nxctl_uuid));

	/* privilege checks would be done when performing nxctl operations */

	SK_LOCK();

	/* Z_WAITOK allocation; nxctl_alloc() blocks rather than fail */
	nxctl = nxctl_alloc(p, fp, Z_WAITOK);

	STAILQ_INSERT_TAIL(&nxctl_head, nxctl, nxctl_link);
	nxctl->nxctl_flags |= NEXUSCTLF_ATTACHED;
	uuid_copy(nxctl->nxctl_uuid, nxctl_uuid);

	nxctl_retain_locked(nxctl);     /* one for being in the list */
	nxctl_retain_locked(nxctl);     /* one for the caller */

#if SK_LOG
	uuid_string_t uuidstr;
	SK_D("nxctl %p UUID %s", SK_KVA(nxctl),
	    sk_uuid_unparse(nxctl->nxctl_uuid, uuidstr));
#endif /* SK_LOG */

	SK_UNLOCK();

	/*
	 * NOTE(review): *err is only tested here, never assigned in this
	 * function; this relies on the caller passing in a preinitialized
	 * error value -- confirm against the callers of nxctl_create().
	 */
	if (*err != 0) {
		nxctl_free(nxctl);
		nxctl = NULL;
	}
	return nxctl;
}
227 
/*
 * Close a nexus controller handle: sever its file reference, detach it
 * from the global nxctl list (dropping the list's reference), and close
 * every nexus provider that was created through it.  May be called as
 * part of a failure cleanup path, hence the NEXUSCTLF_ATTACHED check.
 * Never called on the special kernel controller handle.
 */
void
nxctl_close(struct nxctl *nxctl)
{
	struct kern_nexus_provider *nxprov = NULL, *tnxprov;

	lck_mtx_lock(&nxctl->nxctl_lock);
	SK_LOCK();

	/* the special kernel/user controller handles never get closed */
	ASSERT(!(nxctl->nxctl_flags & NEXUSCTLF_KERNEL));

#if SK_LOG
	uuid_string_t uuidstr;
	SK_D("nxctl %p UUID %s flags 0x%x", SK_KVA(nxctl),
	    sk_uuid_unparse(nxctl->nxctl_uuid, uuidstr),
	    nxctl->nxctl_flags);
#endif /* SK_LOG */

	/* detach from the file glue exactly once */
	if (!(nxctl->nxctl_flags & NEXUSCTLF_NOFDREF)) {
		nxctl->nxctl_flags |= NEXUSCTLF_NOFDREF;
		nxctl->nxctl_fp = NULL;
	}

	/* may be called as part of failure cleanup, so check */
	if (nxctl->nxctl_flags & NEXUSCTLF_ATTACHED) {
		/* caller must hold an extra ref */
		ASSERT(nxctl->nxctl_refcnt > 1);
		(void) nxctl_release_locked(nxctl);

		STAILQ_REMOVE(&nxctl_head, nxctl, nxctl, nxctl_link);
		nxctl->nxctl_flags &= ~NEXUSCTLF_ATTACHED;
	}

repeat:
	STAILQ_FOREACH_SAFE(nxprov, &nxprov_head, nxprov_link, tnxprov) {
		/*
		 * Close provider only for those which are owned by
		 * this control instance.  Note that if we close the
		 * provider, we need to repeat this search as the
		 * list might have been changed by another thread.
		 * That's possible since SK_UNLOCK() may be called
		 * as a result of calling nxprov_close().
		 */
		if (!(nxprov->nxprov_flags & NXPROVF_CLOSED) &&
		    nxprov->nxprov_ctl == nxctl) {
			nxprov_retain_locked(nxprov);
			(void) nxprov_close(nxprov, TRUE);
			(void) nxprov_release_locked(nxprov);
			goto repeat;
		}
	}

	SK_UNLOCK();
	lck_mtx_unlock(&nxctl->nxctl_lock);
	nxctl_traffic_rule_clean(nxctl);
}
283 
284 int
nxctl_set_opt(struct nxctl * nxctl,struct sockopt * sopt)285 nxctl_set_opt(struct nxctl *nxctl, struct sockopt *sopt)
286 {
287 #pragma unused(nxctl)
288 	int err = 0;
289 
290 	NXCTL_LOCK_ASSERT_HELD(nxctl);
291 
292 	if (sopt->sopt_dir != SOPT_SET) {
293 		sopt->sopt_dir = SOPT_SET;
294 	}
295 
296 	switch (sopt->sopt_name) {
297 	case NXOPT_NEXUS_BIND:
298 		err = nxctl_nexus_bind(nxctl, sopt);
299 		break;
300 
301 	case NXOPT_NEXUS_UNBIND:
302 		err = nxctl_nexus_unbind(nxctl, sopt);
303 		break;
304 
305 	case NXOPT_NEXUS_CONFIG:
306 		err = nxctl_nexus_config(nxctl, sopt);
307 		break;
308 
309 	default:
310 		err = ENOPROTOOPT;
311 		break;
312 	}
313 
314 	return err;
315 }
316 
317 int
nxctl_get_opt(struct nxctl * nxctl,struct sockopt * sopt)318 nxctl_get_opt(struct nxctl *nxctl, struct sockopt *sopt)
319 {
320 #pragma unused(nxctl)
321 	int err = 0;
322 
323 	NXCTL_LOCK_ASSERT_HELD(nxctl);
324 
325 	if (sopt->sopt_dir != SOPT_GET) {
326 		sopt->sopt_dir = SOPT_GET;
327 	}
328 
329 	switch (sopt->sopt_name) {
330 	case NXOPT_NEXUS_PROV_LIST:
331 		err = nxctl_get_nexus_prov_list(nxctl, sopt);
332 		break;
333 
334 	case NXOPT_NEXUS_PROV_ENTRY:
335 		err = nxctl_get_nexus_prov_entry(nxctl, sopt);
336 		break;
337 
338 	case NXOPT_NEXUS_LIST:
339 		err = nxctl_get_nexus_list(nxctl, sopt);
340 		break;
341 
342 	case NXOPT_CHANNEL_LIST:
343 		err = nxctl_get_channel_list(nxctl, sopt);
344 		break;
345 
346 	default:
347 		err = ENOPROTOOPT;
348 		break;
349 	}
350 
351 	return err;
352 }
353 
354 /* Upper bound on # of nrl_num_regs that we'd return to user space */
355 #define MAX_NUM_REG_ENTRIES     256
356 
357 /* Hoisted out of line to reduce kernel stack footprint */
358 SK_NO_INLINE_ATTRIBUTE
359 static int
nxctl_get_nexus_prov_list(struct nxctl * nxctl,struct sockopt * sopt)360 nxctl_get_nexus_prov_list(struct nxctl *nxctl, struct sockopt *sopt)
361 {
362 	user_addr_t tmp_ptr = USER_ADDR_NULL;
363 	struct nxprov_reg_ent *pnre, *nres = NULL;
364 	struct nxprov_list_req nrlr;
365 	struct kern_nexus_provider *nxprov = NULL;
366 	uint32_t nregs = 0, ncregs = 0;
367 	int err = 0, observeall;
368 	size_t nres_sz;
369 
370 	NXCTL_LOCK_ASSERT_HELD(nxctl);
371 
372 	ASSERT(sopt->sopt_p != NULL);
373 	if (sopt->sopt_val == USER_ADDR_NULL) {
374 		return EINVAL;
375 	}
376 
377 	err = sooptcopyin(sopt, &nrlr, sizeof(nrlr), sizeof(nrlr));
378 	if (err != 0) {
379 		return err;
380 	}
381 
382 	if ((size_t)nrlr.nrl_num_regs > MAX_NUM_REG_ENTRIES) {
383 		nrlr.nrl_num_regs = MAX_NUM_REG_ENTRIES;
384 	}
385 
386 	/*
387 	 * If the caller specified a buffer, copy out the Nexus provider
388 	 * entries to caller gracefully.  We only copy out the number of
389 	 * entries which caller has asked for, but we always tell caller
390 	 * how big the buffer really needs to be.
391 	 */
392 	tmp_ptr = nrlr.nrl_regs;
393 	if (tmp_ptr != USER_ADDR_NULL && nrlr.nrl_num_regs > 0) {
394 		nres_sz = (size_t)nrlr.nrl_num_regs * sizeof(*nres);
395 		nres = sk_alloc_data(nres_sz, Z_WAITOK, skmem_tag_sysctl_buf);
396 		if (__improbable(nres == NULL)) {
397 			return ENOBUFS;
398 		}
399 	}
400 
401 	observeall = (skywalk_priv_check_cred(sopt->sopt_p, nxctl->nxctl_cred,
402 	    PRIV_SKYWALK_OBSERVE_ALL) == 0);
403 
404 	SK_LOCK();
405 	/*
406 	 * Count number of providers.  If buffer space exists and
407 	 * remains, copy out provider entries.
408 	 */
409 	nregs = nrlr.nrl_num_regs;
410 	pnre = nres;
411 
412 	STAILQ_FOREACH(nxprov, &nxprov_head, nxprov_link) {
413 		/*
414 		 * Return only entries that are visible to the caller,
415 		 * unless it has PRIV_SKYWALK_OBSERVE_ALL.
416 		 */
417 		if (nxprov->nxprov_ctl != nxctl && !observeall) {
418 			continue;
419 		}
420 
421 		if (nres != NULL && nregs > 0) {
422 			uuid_copy(pnre->npre_prov_uuid, nxprov->nxprov_uuid);
423 			bcopy(nxprov->nxprov_params, &pnre->npre_prov_params,
424 			    sizeof(struct nxprov_params));
425 			--nregs;
426 			++pnre;
427 			++ncregs;
428 		}
429 	}
430 	SK_UNLOCK();
431 
432 	if (ncregs == 0) {
433 		err = ENOENT;
434 	}
435 
436 	if (nres != NULL) {
437 		if (err == 0 && tmp_ptr != USER_ADDR_NULL) {
438 			if (sopt->sopt_p != kernproc) {
439 				err = copyout(nres, tmp_ptr,
440 				    ncregs * sizeof(*nres));
441 			} else {
442 				caddr_t tmp;
443 				tmp =  __unsafe_forge_bidi_indexable(caddr_t,
444 				    CAST_DOWN(caddr_t, tmp_ptr),
445 				    ncregs * sizeof(*nres));
446 				bcopy(nres, tmp, ncregs * sizeof(*nres));
447 			}
448 		}
449 		sk_free_data(nres, nres_sz);
450 		nres = NULL;
451 	}
452 
453 	if (err == 0) {
454 		nrlr.nrl_num_regs = ncregs;
455 		err = sooptcopyout(sopt, &nrlr, sizeof(nrlr));
456 	}
457 
458 	return err;
459 }
460 
/* Hoisted out of line to reduce kernel stack footprint */
/*
 * NXOPT_NEXUS_PROV_ENTRY handler: look up a single nexus provider by
 * UUID and copy its registration parameters back to the caller.
 * Returns ENOENT when no matching provider is visible to the caller.
 */
SK_NO_INLINE_ATTRIBUTE
static int
nxctl_get_nexus_prov_entry(struct nxctl *nxctl, struct sockopt *sopt)
{
	struct nxprov_reg_ent nre;
	struct kern_nexus_provider *nxprov = NULL;
	int err = 0;

	NXCTL_LOCK_ASSERT_HELD(nxctl);

	ASSERT(sopt->sopt_p != NULL);
	if (sopt->sopt_val == USER_ADDR_NULL) {
		return EINVAL;
	}

	bzero(&nre, sizeof(nre));
	err = sooptcopyin(sopt, &nre, sizeof(nre), sizeof(nre));
	if (err != 0) {
		return err;
	}

	if (uuid_is_null(nre.npre_prov_uuid)) {
		return EINVAL;
	}

	SK_LOCK();
	STAILQ_FOREACH(nxprov, &nxprov_head, nxprov_link) {
		if (uuid_compare(nxprov->nxprov_uuid,
		    nre.npre_prov_uuid) == 0) {
			/*
			 * Return only entries that are visible to the caller,
			 * unless it has PRIV_SKYWALK_OBSERVE_ALL.
			 */
			if (nxprov->nxprov_ctl != nxctl) {
				if (skywalk_priv_check_cred(sopt->sopt_p,
				    nxctl->nxctl_cred,
				    PRIV_SKYWALK_OBSERVE_ALL) != 0) {
					/* hide the entry from this caller */
					nxprov = NULL;
					break;
				}
			}

			bcopy(nxprov->nxprov_params, &nre.npre_prov_params,
			    sizeof(struct nxprov_params));
			break;
		}
	}
	SK_UNLOCK();

	/* nxprov != NULL iff a visible matching provider was found */
	if (nxprov != NULL) {
		err = sooptcopyout(sopt, &nre, sizeof(nre));
	} else {
		err = ENOENT;
	}

	return err;
}
519 
520 /* Upper bound on # of nl_num_nx_uuids that we'd return to user space */
521 #define MAX_NUM_NX_UUIDS        4096
522 
/* Hoisted out of line to reduce kernel stack footprint */
/*
 * NXOPT_NEXUS_LIST handler: given a provider UUID, return the UUIDs of
 * the nexus instances created by that provider.  The count reported
 * back in nl_num_nx_uuids always reflects the total number of
 * instances, even when the caller's buffer holds fewer, so the caller
 * can size a retry buffer.
 */
SK_NO_INLINE_ATTRIBUTE
static int
nxctl_get_nexus_list(struct nxctl *nxctl, struct sockopt *sopt)
{
	user_addr_t tmp_ptr = USER_ADDR_NULL;
	uint32_t nuuids = 0, ncuuids = 0;
	uuid_t *puuid, *uuids = NULL;
	size_t uuids_sz;
	struct nx_list_req nlr;
	struct kern_nexus_provider *nxprov = NULL;
	struct kern_nexus *nx = NULL;
	int err = 0, observeall;

	NXCTL_LOCK_ASSERT_HELD(nxctl);

	ASSERT(sopt->sopt_p != NULL);
	if (sopt->sopt_val == USER_ADDR_NULL) {
		return EINVAL;
	}

	err = sooptcopyin(sopt, &nlr, sizeof(nlr), sizeof(nlr));
	if (err != 0) {
		return err;
	}

	/* provider UUID is mandatory; clamp the requested UUID count */
	if (uuid_is_null(nlr.nl_prov_uuid)) {
		return EINVAL;
	} else if ((size_t)nlr.nl_num_nx_uuids > MAX_NUM_NX_UUIDS) {
		nlr.nl_num_nx_uuids = MAX_NUM_NX_UUIDS;
	}

	/*
	 * If the caller specified a buffer, copy out the Nexus UUIDs to
	 * caller gracefully.  We only copy out the number of UUIDs which
	 * caller has asked for, but we always tell caller how big the
	 * buffer really needs to be.
	 */
	tmp_ptr = nlr.nl_nx_uuids;
	if (tmp_ptr != USER_ADDR_NULL && nlr.nl_num_nx_uuids > 0) {
		uuids_sz = (size_t)nlr.nl_num_nx_uuids * sizeof(uuid_t);
		uuids = sk_alloc_data(uuids_sz, Z_WAITOK, skmem_tag_sysctl_buf);
		if (__improbable(uuids == NULL)) {
			return ENOBUFS;
		}
	}

	observeall = (skywalk_priv_check_cred(sopt->sopt_p, nxctl->nxctl_cred,
	    PRIV_SKYWALK_OBSERVE_ALL) == 0);

	SK_LOCK();
	/* locate the requested provider among those visible to the caller */
	STAILQ_FOREACH(nxprov, &nxprov_head, nxprov_link) {
		/*
		 * Return only entries that are visible to the caller,
		 * unless it has PRIV_SKYWALK_OBSERVE_ALL.
		 */
		if (nxprov->nxprov_ctl != nxctl && !observeall) {
			continue;
		}

		if (uuid_compare(nxprov->nxprov_uuid, nlr.nl_prov_uuid) == 0) {
			break;
		}
	}

	if (nxprov != NULL) {
		/*
		 * Count number of Nexus.  If buffer space exists
		 * and remains, copy out the Nexus UUIDs.
		 */
		nuuids = nlr.nl_num_nx_uuids;
		puuid = uuids;

		STAILQ_FOREACH(nx, &nxprov->nxprov_nx_head, nx_prov_link) {
			/* always count, even past the end of the buffer */
			++ncuuids;
			if (uuids != NULL && nuuids > 0) {
				uuid_copy(*puuid, nx->nx_uuid);
				--nuuids;
				++puuid;
			}
		}
	} else {
		err = ENOENT;
	}
	SK_UNLOCK();

	if (uuids != NULL) {
		if (err == 0 && nxprov != NULL && tmp_ptr != USER_ADDR_NULL) {
			uintptr_t cnt_uuid;

			/* Note: Pointer arithmetic */
			cnt_uuid = (uintptr_t)(puuid - uuids);
			if (cnt_uuid > 0) {
				if (sopt->sopt_p != kernproc) {
					err = copyout(uuids, tmp_ptr,
					    cnt_uuid * sizeof(uuid_t));
				} else {
					caddr_t tmp;
					tmp = __unsafe_forge_bidi_indexable(caddr_t,
					    CAST_DOWN(caddr_t, tmp_ptr),
					    cnt_uuid * sizeof(uuid_t));
					bcopy(uuids, tmp,
					    cnt_uuid * sizeof(uuid_t));
				}
			}
		}
		sk_free_data(uuids, uuids_sz);
		uuids = NULL;
	}

	if (err == 0) {
		/* report the total instance count, not the copied count */
		nlr.nl_num_nx_uuids = ncuuids;
		err = sooptcopyout(sopt, &nlr, sizeof(nlr));
	}

	return err;
}
640 
/* Hoisted out of line to reduce kernel stack footprint */
/*
 * NXOPT_NEXUS_BIND handler: associate client-matching credentials
 * (process unique ID, executable UUID and/or a key blob) with a port
 * of the nexus named in the request, so that only a matching client
 * may later claim that port.  On success the chosen port is copied
 * back to the caller in nb_port.
 */
SK_NO_INLINE_ATTRIBUTE
static int
nxctl_nexus_bind(struct nxctl *nxctl, struct sockopt *sopt)
{
	boolean_t m_pid, m_exec_uuid, m_key;
	struct nx_bind_req nbr;
	struct proc *p = PROC_NULL;
	struct nxbind *nxb = NULL;
	uint64_t p_uniqueid = -1;
	pid_t p_pid = -1;
	struct kern_nexus *nx = NULL;
#if SK_LOG
	uuid_string_t exec_uuidstr;
#endif /* SK_LOG */
	uuid_t p_uuid;
	void *key = NULL;
	int err = 0;

	NXCTL_LOCK_ASSERT_HELD(nxctl);

	if (sopt->sopt_val == USER_ADDR_NULL) {
		return EINVAL;
	}

	uuid_clear(p_uuid);
	bzero(&nbr, sizeof(nbr));
	err = sooptcopyin(sopt, &nbr, sizeof(nbr), sizeof(nbr));
	if (err != 0) {
		return err;
	}

	if (uuid_is_null(nbr.nb_nx_uuid)) {
		err = EINVAL;
		goto done_unlocked;
	}

	/* ignore any flag bits outside of the match criteria */
	nbr.nb_flags &= NBR_MATCH_MASK;
	if (nbr.nb_flags == 0) {
		/* must choose one of the match criteria */
		err = EINVAL;
		goto done_unlocked;
	}
	m_pid = !!(nbr.nb_flags & NBR_MATCH_PID);
	m_exec_uuid = !!(nbr.nb_flags & NBR_MATCH_EXEC_UUID);
	m_key = !!(nbr.nb_flags & NBR_MATCH_KEY);

	if (m_pid || m_exec_uuid) {
		/*
		 * Validate process ID.  A valid PID is needed when we're
		 * asked to match by PID, or if asked to match by executable
		 * UUID with a NULL nb_exec_uuid supplied.  The latter is
		 * to support the case when a userland Nexus provider isn't
		 * able to acquire its client's executable UUID, but is
		 * able to identify it via PID.
		 */
		if ((m_pid || uuid_is_null(nbr.nb_exec_uuid)) &&
		    (p = proc_find(nbr.nb_pid)) == PROC_NULL) {
			err = ESRCH;
			goto done_unlocked;
		}
		/* exclude kernel from the match criteria */
		if (p == kernproc) {
			err = EACCES;
			goto done_unlocked;
		} else if (p != PROC_NULL) {
			/* capture the identity of the found process */
			proc_getexecutableuuid(p, p_uuid, sizeof(p_uuid));
			p_uniqueid = proc_uniqueid(p);
			p_pid = proc_pid(p);
		} else {
			/* caller supplied the executable UUID directly */
			uuid_copy(p_uuid, nbr.nb_exec_uuid);
		}
	}

	if (m_key) {
		/* key must be present and within the supported length */
		if (nbr.nb_key_len == 0 || nbr.nb_key_len > NEXUS_MAX_KEY_LEN ||
		    nbr.nb_key == USER_ADDR_NULL) {
			err = EINVAL;
			goto done_unlocked;
		}

		key = sk_alloc_data(nbr.nb_key_len, Z_WAITOK, skmem_tag_nx_key);
		if (__improbable(key == NULL)) {
			err = ENOMEM;
			goto done_unlocked;
		}

		if (sopt->sopt_p != kernproc) {
			err = copyin(nbr.nb_key, key, nbr.nb_key_len);
			if (err != 0) {
				goto done_unlocked;
			}
		} else {
			/*
			 * -fbounds-safety: nbr.nb_key is user_addr_t. Changing
			 * it to a pointer type is risky, so we just forge it
			 * here instead.
			 */
			void *nb_key = __unsafe_forge_bidi_indexable(void *,
			    nbr.nb_key, nbr.nb_key_len);
			bcopy(nb_key, key, nbr.nb_key_len);
		}
	}

	SK_LOCK();
	nx = nx_find(nbr.nb_nx_uuid, TRUE);
	/*
	 * ENOENT is returned both when the nexus doesn't exist and when
	 * it is owned by another controller, so callers cannot probe for
	 * nexus instances they don't own.
	 */
	if (nx == NULL || (disable_nxctl_check == 0 &&
	    nx->nx_prov->nxprov_ctl != nxctl &&
	    nxctl != &_kernnxctl)) {    /* make exception for kernnxctl */
		err = ENOENT;
		goto done;
	}

	/* bind isn't applicable on anonymous nexus provider */
	if (NX_ANONYMOUS_PROV(nx)) {
		err = ENXIO;
		goto done;
	}

	/* port must be within the domain's range */
	if (nbr.nb_port != NEXUS_PORT_ANY &&
	    nbr.nb_port >= NXDOM_MAX(NX_DOM(nx), ports)) {
		err = EDOM;
		goto done;
	} else if (nbr.nb_port == NEXUS_PORT_ANY) {
		/* for now, this is allowed only for kernel clients */
		if (sopt->sopt_p != kernproc) {
			err = EPERM;
			goto done;
		}
	}

	/* Z_WAITOK allocation; blocks rather than fail */
	nxb = nxb_alloc(Z_WAITOK);

	if (m_pid) {
		nxb->nxb_flags |= NXBF_MATCH_UNIQUEID;
		nxb->nxb_uniqueid = p_uniqueid;
		nxb->nxb_pid = p_pid;
	}
	if (m_exec_uuid) {
		nxb->nxb_flags |= NXBF_MATCH_EXEC_UUID;
		ASSERT(!uuid_is_null(p_uuid));
		uuid_copy(nxb->nxb_exec_uuid, p_uuid);
	}
	if (m_key) {
		nxb->nxb_flags |= NXBF_MATCH_KEY;
		ASSERT(key != NULL);
		ASSERT(nbr.nb_key_len != 0 &&
		    nbr.nb_key_len <= NEXUS_MAX_KEY_LEN);
		/*
		 * -fbounds-safety: since nxb_key is __sized_by(nxb_key_len),
		 * its assignment needs to be done side-by-side to nxb_key_len.
		 */
		nxb->nxb_key = key;
		key = NULL;     /* let nxb_free() free it */
		nxb->nxb_key_len = nbr.nb_key_len;
	}

	/*
	 * Bind the creds to the nexus port.  If client doesn't have a port,
	 * find one, claim it, and associate the creds to it.  Upon success,
	 * the nexus may move the nxbind contents (including the key) to
	 * its own nxbind instance; in that case, nxb_free() below will not
	 * be freeing the key within.
	 */
	err = NX_DOM(nx)->nxdom_bind_port(nx, &nbr.nb_port, nxb, NULL);
	if (err != 0) {
		goto done;
	}

	/* report the (possibly newly-assigned) port back to the caller */
	ASSERT(nbr.nb_port != NEXUS_PORT_ANY);
	(void) sooptcopyout(sopt, &nbr, sizeof(nbr));

	SK_D("nexus %p nxb %p port %u flags 0x%x pid %d "
	    "(uniqueid %llu) exec_uuid %s key %p key_len %u",
	    SK_KVA(nx), SK_KVA(nxb), nbr.nb_port, nxb->nxb_flags,
	    nxb->nxb_pid, nxb->nxb_uniqueid,
	    sk_uuid_unparse(nxb->nxb_exec_uuid, exec_uuidstr),
	    (nxb->nxb_key != NULL) ? SK_KVA(nxb->nxb_key) : 0,
	    nxb->nxb_key_len);

done:
	if (nx != NULL) {
		(void) nx_release_locked(nx);
		nx = NULL;
	}
	SK_UNLOCK();

done_unlocked:
	ASSERT(nx == NULL);

	/* common cleanup for both success and failure paths */
	if (nxb != NULL) {
		nxb_free(nxb);
		nxb = NULL;
	}
	if (key != NULL) {
		sk_free_data(key, nbr.nb_key_len);
		key = NULL;
	}
	if (p != PROC_NULL) {
		proc_rele(p);
	}

	return err;
}
846 
/* Hoisted out of line to reduce kernel stack footprint */
/*
 * NXOPT_NEXUS_UNBIND handler: remove a previously-established binding
 * from a specific port of the nexus named in the request.
 *
 * NOTE(review): unlike nxctl_nexus_bind()/nxctl_nexus_config(), this
 * path does not consult disable_nxctl_check -- confirm whether that
 * asymmetry is intentional.
 */
SK_NO_INLINE_ATTRIBUTE
static int
nxctl_nexus_unbind(struct nxctl *nxctl, struct sockopt *sopt)
{
	struct nx_unbind_req nur;
	struct kern_nexus *nx = NULL;
	int err = 0;

	NXCTL_LOCK_ASSERT_HELD(nxctl);

	if (sopt->sopt_val == USER_ADDR_NULL) {
		return EINVAL;
	}

	bzero(&nur, sizeof(nur));
	err = sooptcopyin(sopt, &nur, sizeof(nur), sizeof(nur));
	if (err != 0) {
		return err;
	}

	if (uuid_is_null(nur.nu_nx_uuid)) {
		return EINVAL;
	}

	SK_LOCK();
	/* ENOENT also hides nexus instances owned by other controllers */
	nx = nx_find(nur.nu_nx_uuid, TRUE);
	if (nx == NULL || (nx->nx_prov->nxprov_ctl != nxctl &&
	    nxctl != &_kernnxctl)) {    /* make exception for kernnxctl */
		err = ENOENT;
		goto done;
	}

	/* unbind isn't applicable on anonymous nexus provider */
	if (NX_ANONYMOUS_PROV(nx)) {
		err = ENXIO;
		goto done;
	}

	/* a concrete port is required; wildcard makes no sense here */
	if (nur.nu_port == NEXUS_PORT_ANY) {
		err = EINVAL;
		goto done;
	}

	err = NX_DOM(nx)->nxdom_unbind_port(nx, nur.nu_port);

done:
	if (nx != NULL) {
		(void) nx_release_locked(nx);
		nx = NULL;
	}
	SK_UNLOCK();

	return err;
}
902 
/* Hoisted out of line to reduce kernel stack footprint */
/*
 * NXOPT_NEXUS_CONFIG handler: forward a get/set configuration request
 * to the nexus domain provider's nxdom_prov_config callback.  The
 * shared user controller (usernxctl) is permitted here; per the
 * comment in nexus_init(), such requests are expected to be permission
 * checked by the nexus itself using other identifying info.
 */
SK_NO_INLINE_ATTRIBUTE
static int
nxctl_nexus_config(struct nxctl *nxctl, struct sockopt *sopt)
{
	struct kern_nexus *nx = NULL;
	struct nx_cfg_req ncr;
	int err = 0;

	NXCTL_LOCK_ASSERT_HELD(nxctl);

	if (sopt->sopt_val == USER_ADDR_NULL) {
		return EINVAL;
	}

	bzero(&ncr, sizeof(ncr));
	err = sooptcopyin(sopt, &ncr, sizeof(ncr), sizeof(ncr));
	if (err != 0) {
		return err;
	}

	if (uuid_is_null(ncr.nc_nx_uuid)) {
		return EINVAL;
	}

	SK_LOCK();
	/* ENOENT also hides nexus instances owned by other controllers */
	nx = nx_find(ncr.nc_nx_uuid, TRUE);
	if (nx == NULL || (disable_nxctl_check == 0 &&
	    nx->nx_prov->nxprov_ctl != nxctl &&
	    nxctl != &_kernnxctl &&    /* allow kernel/shared user nxctl */
	    nxctl != &_usernxctl)) {
		err = ENOENT;
		goto done;
	}

	/* domain providers without a config callback reject all requests */
	if (NX_DOM_PROV(nx)->nxdom_prov_config != NULL) {
		err = NX_DOM_PROV(nx)->nxdom_prov_config(NX_DOM_PROV(nx),
		    nx, &ncr, sopt->sopt_dir, sopt->sopt_p, nxctl->nxctl_cred);
	} else {
		err = EPERM;
	}

	if (err == 0) {
		(void) sooptcopyout(sopt, &ncr, sizeof(ncr));
	}
done:
	if (nx != NULL) {
		(void) nx_release_locked(nx);
		nx = NULL;
	}
	SK_UNLOCK();

	return err;
}
957 
958 struct nxbind *
nxb_alloc(zalloc_flags_t how)959 nxb_alloc(zalloc_flags_t how)
960 {
961 	struct nxbind *nxb = zalloc_flags(nxbind_zone, how | Z_ZERO);
962 
963 	if (nxb) {
964 		SK_DF(SK_VERB_MEM, "nxb %p ALLOC", SK_KVA(nxb));
965 	}
966 	return nxb;
967 }
968 
/*
 * Free an nxbind structure, along with any key blob still attached
 * to it (keys that were moved out via nxb_move() or claimed by the
 * nexus are not freed here, since nxb_key would then be NULL).
 */
void
nxb_free(struct nxbind *nxb)
{
	SK_DF(SK_VERB_MEM, "nxb %p key %p FREE", SK_KVA(nxb),
	    (nxb->nxb_key != NULL) ? SK_KVA(nxb->nxb_key) : 0);

	if (nxb->nxb_key != NULL) {
		/*
		 * -fbounds-safety: nxb_key is __sized_by(nxb_key_len), so
		 * clear the pointer and its length together.
		 */
		sk_free_data_sized_by(nxb->nxb_key, nxb->nxb_key_len);
		nxb->nxb_key = NULL;
		nxb->nxb_key_len = 0;
	}
	zfree(nxbind_zone, nxb);
}
982 
983 /*
984  * nxb0 is assumed to possess the truth, compare nxb1 against it.
985  */
986 boolean_t
nxb_is_equal(struct nxbind * nxb0,struct nxbind * nxb1)987 nxb_is_equal(struct nxbind *nxb0, struct nxbind *nxb1)
988 {
989 	ASSERT(nxb0 != NULL && nxb1 != NULL);
990 	ASSERT(nxb0 != nxb1);
991 
992 	/* we always compare using uniqueid and not pid */
993 	if ((nxb0->nxb_flags & NXBF_MATCH_UNIQUEID) &&
994 	    nxb1->nxb_uniqueid != nxb0->nxb_uniqueid) {
995 		return FALSE;
996 	}
997 
998 	if ((nxb0->nxb_flags & NXBF_MATCH_EXEC_UUID) &&
999 	    uuid_compare(nxb1->nxb_exec_uuid, nxb0->nxb_exec_uuid) != 0) {
1000 		return FALSE;
1001 	}
1002 
1003 	ASSERT(!(nxb0->nxb_flags & NXBF_MATCH_KEY) ||
1004 	    (nxb0->nxb_key_len != 0 && nxb0->nxb_key != NULL));
1005 
1006 	if ((nxb0->nxb_flags & NXBF_MATCH_KEY) &&
1007 	    (nxb0->nxb_key_len != nxb1->nxb_key_len ||
1008 	    nxb1->nxb_key == NULL || timingsafe_bcmp(nxb1->nxb_key, nxb0->nxb_key,
1009 	    nxb1->nxb_key_len) != 0)) {
1010 		return FALSE;
1011 	}
1012 
1013 	return TRUE;
1014 }
1015 
/*
 * Move the contents of snxb into dnxb, transferring ownership of any
 * attached key.  snxb is zeroed afterwards, so a later nxb_free() on
 * it will not free the (now-moved) key; any key previously held by
 * dnxb is released first.
 */
void
nxb_move(struct nxbind *snxb, struct nxbind *dnxb)
{
	/* a key-matching source must actually carry a key */
	ASSERT(!(snxb->nxb_flags & NXBF_MATCH_KEY) ||
	    (snxb->nxb_key_len != 0 && snxb->nxb_key != NULL));

	/* in case the destination has a key attached, free it first */
	if (dnxb->nxb_key != NULL) {
		sk_free_data_sized_by(dnxb->nxb_key, dnxb->nxb_key_len);
		dnxb->nxb_key = NULL;
		dnxb->nxb_key_len = 0;
	}

	/* move everything from src to dst, and then wipe out src */
	bcopy(snxb, dnxb, sizeof(*dnxb));
	bzero(snxb, sizeof(*snxb));
}
1033 
/* Upper bound on # of cl_num_ch_uuids that we'd return to user space */
#define MAX_NUM_CH_UUIDS        4096

/* Hoisted out of line to reduce kernel stack footprint */
SK_NO_INLINE_ATTRIBUTE
static int
nxctl_get_channel_list(struct nxctl *nxctl, struct sockopt *sopt)
{
	user_addr_t tmp_ptr = USER_ADDR_NULL;
	uint32_t nuuids = 0, ncuuids = 0;
	uuid_t *puuid, *uuids = NULL;
	size_t uuids_sz;
	struct ch_list_req clr;
	struct kern_channel *ch = NULL;
	struct kern_nexus *nx = NULL;
	struct kern_nexus find;
	int err = 0, observeall;

	NXCTL_LOCK_ASSERT_HELD(nxctl);

	ASSERT(sopt->sopt_p != NULL);
	if (sopt->sopt_val == USER_ADDR_NULL) {
		return EINVAL;
	}

	/* fetch the request: target nexus UUID plus output buffer shape */
	err = sooptcopyin(sopt, &clr, sizeof(clr), sizeof(clr));
	if (err != 0) {
		return err;
	}

	if (uuid_is_null(clr.cl_nx_uuid)) {
		return EINVAL;
	} else if ((size_t)clr.cl_num_ch_uuids > MAX_NUM_CH_UUIDS) {
		/* clamp caller-supplied count to a sane bound */
		clr.cl_num_ch_uuids = MAX_NUM_CH_UUIDS;
	}

	/*
	 * If the caller specified a buffer, copy out the Channel UUIDs to
	 * caller gracefully.  We only copy out the number of UUIDs which
	 * caller has asked for, but we always tell caller how big the
	 * buffer really needs to be.
	 */
	tmp_ptr = clr.cl_ch_uuids;
	if (tmp_ptr != USER_ADDR_NULL && clr.cl_num_ch_uuids > 0) {
		/* kernel bounce buffer for the UUIDs to be copied out */
		uuids_sz = (size_t)clr.cl_num_ch_uuids * sizeof(uuid_t);
		uuids = sk_alloc_data(uuids_sz, Z_WAITOK, skmem_tag_sysctl_buf);
		if (uuids == NULL) {
			return ENOBUFS;
		}
	}

	/* may the caller observe nexus instances it doesn't own? */
	observeall = (skywalk_priv_check_cred(sopt->sopt_p, nxctl->nxctl_cred,
	    PRIV_SKYWALK_OBSERVE_ALL) == 0);

	SK_LOCK();
	uuid_copy(find.nx_uuid, clr.cl_nx_uuid);
	nx = RB_FIND(kern_nexus_tree, &nx_head, &find);
	if (nx != NULL && NX_PROV(nx)->nxprov_ctl != nxctl && !observeall) {
		/*
		 * Return only entries that are visible to the caller,
		 * unless it has PRIV_SKYWALK_OBSERVE_ALL.
		 */
		nx = NULL;
	}
	if (nx != NULL) {
		/*
		 * Count number of Channels.  If buffer space exists
		 * and remains, copy out the Channel UUIDs.
		 */
		nuuids = clr.cl_num_ch_uuids;
		puuid = uuids;

		STAILQ_FOREACH(ch, &nx->nx_ch_head, ch_link) {
			++ncuuids;
			if (uuids != NULL && nuuids > 0) {
				uuid_copy(*puuid, ch->ch_info->cinfo_ch_id);
				--nuuids;
				++puuid;
			}
		}
	} else {
		err = ENOENT;
	}
	SK_UNLOCK();

	if (uuids != NULL) {
		if (err == 0 && nx != NULL && tmp_ptr != USER_ADDR_NULL) {
			uintptr_t cnt_uuid;

			/* Note: Pointer arithmetic */
			cnt_uuid = (uintptr_t)(puuid - uuids);
			/*
			 * NOTE(review): this assumes a found nexus has at
			 * least one open channel; a zero count would trip
			 * this assert on development builds -- confirm.
			 */
			ASSERT(cnt_uuid > 0);

			if (sopt->sopt_p != kernproc) {
				/* user-space caller: copyout to its buffer */
				err = copyout(uuids, tmp_ptr,
				    cnt_uuid * sizeof(uuid_t));
			} else {
				/* in-kernel caller: plain copy */
				caddr_t tmp;
				tmp = __unsafe_forge_bidi_indexable(caddr_t,
				    CAST_DOWN(caddr_t, tmp_ptr),
				    cnt_uuid * sizeof(uuid_t));
				bcopy(uuids, tmp, cnt_uuid * sizeof(uuid_t));
			}
		}
		sk_free_data(uuids, uuids_sz);
		uuids = NULL;
	}

	if (err == 0) {
		/* always report the total channel count back to the caller */
		clr.cl_num_ch_uuids = ncuuids;
		err = sooptcopyout(sopt, &clr, sizeof(clr));
	}

	return err;
}
1149 
1150 static void
nxctl_init(struct nxctl * nxctl,struct proc * p,struct fileproc * fp)1151 nxctl_init(struct nxctl *nxctl, struct proc *p, struct fileproc *fp)
1152 {
1153 	uuid_t p_uuid;
1154 
1155 	bzero(nxctl, sizeof(*nxctl));
1156 
1157 	proc_getexecutableuuid(p, p_uuid, sizeof(p_uuid));
1158 
1159 	lck_mtx_init(&nxctl->nxctl_lock, &nexus_lock_group, &nexus_lock_attr);
1160 	uuid_copy(nxctl->nxctl_proc_uuid, p_uuid);
1161 	nxctl->nxctl_proc_uniqueid = proc_uniqueid(p);
1162 	nxctl->nxctl_cred = kauth_cred_proc_ref(p);
1163 	nxctl->nxctl_fp = fp;
1164 	if (nxctl == &_kernnxctl) {
1165 		ASSERT(p == kernproc);
1166 		nxctl->nxctl_flags |= NEXUSCTLF_KERNEL;
1167 	}
1168 	if (nxctl == &_usernxctl) {
1169 		ASSERT(p == kernproc);
1170 		nxctl->nxctl_cred = NULL;
1171 	}
1172 	if (fp == NULL) {
1173 		nxctl->nxctl_flags |= NEXUSCTLF_NOFDREF;
1174 	}
1175 }
1176 
1177 static struct nxctl *
nxctl_alloc(struct proc * p,struct fileproc * fp,zalloc_flags_t how)1178 nxctl_alloc(struct proc *p, struct fileproc *fp, zalloc_flags_t how)
1179 {
1180 	struct nxctl *nxctl = zalloc_flags(nxctl_zone, how);
1181 
1182 	if (nxctl != NULL) {
1183 		nxctl_init(nxctl, p, fp);
1184 	}
1185 	return nxctl;
1186 }
1187 
/*
 * Reclaim a nexus controller whose last reference has been dropped.
 * Statically-allocated kernel controllers (NEXUSCTLF_KERNEL) are
 * torn down but never returned to the zone.
 */
static void
nxctl_free(struct nxctl *nxctl)
{
	ASSERT(nxctl->nxctl_refcnt == 0);
	ASSERT(!(nxctl->nxctl_flags & NEXUSCTLF_ATTACHED));
	/*
	 * NOTE(review): assumes nxctl_cred is non-NULL here; a controller
	 * initialized with a NULL cred (e.g. _usernxctl) presumably never
	 * reaches this path -- confirm.
	 */
	kauth_cred_unref(&nxctl->nxctl_cred);
	lck_mtx_destroy(&nxctl->nxctl_lock, &nexus_lock_group);
	SK_D("nxctl %p FREE", SK_KVA(nxctl));
	if (!(nxctl->nxctl_flags & NEXUSCTLF_KERNEL)) {
		zfree(nxctl_zone, nxctl);
	}
}
1200 
1201 static void
nxctl_retain_locked(struct nxctl * nxctl)1202 nxctl_retain_locked(struct nxctl *nxctl)
1203 {
1204 	SK_LOCK_ASSERT_HELD();
1205 
1206 	nxctl->nxctl_refcnt++;
1207 	ASSERT(nxctl->nxctl_refcnt != 0);
1208 }
1209 
/*
 * Take a reference on a nexus controller, acquiring the SK lock.
 */
void
nxctl_retain(struct nxctl *nxctl)
{
	SK_LOCK();
	nxctl_retain_locked(nxctl);
	SK_UNLOCK();
}
1217 
1218 static int
nxctl_release_locked(struct nxctl * nxctl)1219 nxctl_release_locked(struct nxctl *nxctl)
1220 {
1221 	int oldref = nxctl->nxctl_refcnt;
1222 
1223 	SK_LOCK_ASSERT_HELD();
1224 
1225 	ASSERT(nxctl->nxctl_refcnt != 0);
1226 	if (--nxctl->nxctl_refcnt == 0) {
1227 		nxctl_free(nxctl);
1228 	}
1229 
1230 	return oldref == 1;
1231 }
1232 
/*
 * Drop a reference on a nexus controller, acquiring the SK lock.
 * Returns nonzero when the reference dropped was the last one.
 */
int
nxctl_release(struct nxctl *nxctl)
{
	int last;

	SK_LOCK();
	last = nxctl_release_locked(nxctl);
	SK_UNLOCK();
	return last;
}
1244 
/*
 * Destructor for a nexus controller: close it, then drop the
 * caller's reference (which frees the controller if it was the
 * last one).
 */
void
nxctl_dtor(struct nxctl *arg)
{
	nxctl_close(arg);
	SK_LOCK();
	(void) nxctl_release_locked(arg);
	SK_UNLOCK();
}
1259 
/*
 * Invoke the external nexus provider's pre-connect and connected
 * callbacks for a channel being opened.  Callbacks are issued without
 * the SK lock held (an extra channel reference keeps the channel
 * alive across the unlock window); on any failure the partially
 * established state is unwound via nxprov_advise_disconnect().
 * Returns 0 on success or if the provider implements no callbacks.
 */
int
nxprov_advise_connect(struct kern_nexus *nx, struct kern_channel *ch,
    struct proc *p)
{
	struct kern_nexus_provider *nxprov = NX_PROV(nx);
	int err = 0;

	ASSERT(!(ch->ch_flags & (CHANF_EXT_PRECONNECT | CHANF_EXT_CONNECTED)));
	ASSERT(ch->ch_ctx == NULL);

	SK_LOCK_ASSERT_HELD();
	LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);

	/* nothing to do unless the provider supplies both callbacks */
	if ((ch->ch_flags & CHANF_EXT_SKIP) ||
	    (nxprov->nxprov_ext.nxpi_pre_connect == NULL ||
	    nxprov->nxprov_ext.nxpi_connected == NULL)) {
		return 0;
	}

	/*
	 * Hold the channel, then drop the SK lock so provider callbacks
	 * run without the global lock; reacquire the channel lock.
	 */
	ch_retain_locked(ch);
	lck_mtx_unlock(&ch->ch_lock);
	SK_UNLOCK();
	lck_mtx_lock(&ch->ch_lock);

	err = nxprov->nxprov_ext.nxpi_pre_connect(nxprov, p, nx,
	    ch->ch_info->cinfo_nx_port, ch, &ch->ch_ctx);
	if (err != 0) {
		SK_D("ch %p flags %x nx %p pre_connect "
		    "error %d", SK_KVA(ch), ch->ch_flags, SK_KVA(nx), err);
		ch->ch_ctx = NULL;
		goto done;
	}
	/*
	 * Upon ring/slot init failure, this is cleared
	 * by nxprov_advise_disconnect() below.
	 */
	os_atomic_or(&ch->ch_flags, CHANF_EXT_PRECONNECT, relaxed);
	/* set up the rings (or default llink queues) for this channel */
	if (NXPROV_LLINK(nxprov)) {
		err = nx_netif_llink_ext_init_default_queues(nx);
	} else {
		err = nx_init_rings(nx, ch);
	}
	if (err != 0) {
		goto done;
	}
	ASSERT(err == 0);
	ASSERT((ch->ch_flags & (CHANF_EXT_PRECONNECT |
	    CHANF_EXT_CONNECTED)) == CHANF_EXT_PRECONNECT);

	err = nxprov->nxprov_ext.nxpi_connected(nxprov, nx, ch);
	if (err != 0) {
		SK_D("ch %p flags %x nx %p connected error %d",
		    SK_KVA(ch), ch->ch_flags, SK_KVA(nx), err);
		goto done;
	}
	os_atomic_or(&ch->ch_flags, CHANF_EXT_CONNECTED, relaxed);
	SK_D("ch %p flags %x nx %p connected",
	    SK_KVA(ch), ch->ch_flags, SK_KVA(nx));


done:
	/* reacquire locks in SK-then-channel order before unwinding */
	lck_mtx_unlock(&ch->ch_lock);
	SK_LOCK();
	lck_mtx_lock(&ch->ch_lock);
	if ((err != 0) &&
	    (ch->ch_flags & (CHANF_EXT_CONNECTED | CHANF_EXT_PRECONNECT))) {
		nxprov_advise_disconnect(nx, ch);
	}
	/* caller is expected to hold one, in addition to ourselves */
	VERIFY(ch->ch_refcnt >= 2);
	ch_release_locked(ch);

	return err;
}
1334 
/*
 * Undo nxprov_advise_connect(): invoke the external provider's
 * pre-disconnect/disconnected callbacks and tear down the channel's
 * rings, for whatever subset of CHANF_EXT_CONNECTED/PRECONNECT state
 * was established.  Like the connect path, callbacks run without the
 * SK lock held, with an extra channel reference held across the
 * unlock window.  Safe to call from error-handling paths.
 */
void
nxprov_advise_disconnect(struct kern_nexus *nx, struct kern_channel *ch)
{
	struct kern_nexus_provider *nxprov = NX_PROV(nx);

	SK_LOCK_ASSERT_HELD();
	LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);

	/* check as we might be called in the error handling path */
	if (ch->ch_flags & (CHANF_EXT_CONNECTED | CHANF_EXT_PRECONNECT)) {
		/* hold the channel, then drop the SK lock for callbacks */
		ch_retain_locked(ch);
		lck_mtx_unlock(&ch->ch_lock);
		SK_UNLOCK();
		lck_mtx_lock(&ch->ch_lock);

		ASSERT(!(ch->ch_flags & CHANF_EXT_SKIP));
		if (ch->ch_flags & CHANF_EXT_CONNECTED) {
			nxprov->nxprov_ext.nxpi_pre_disconnect(nxprov, nx, ch);
			os_atomic_andnot(&ch->ch_flags, CHANF_EXT_CONNECTED, relaxed);
		}

		/*
		 * Inform the external domain provider that the rings
		 * and slots for this channel are no longer valid.
		 */
		if (NXPROV_LLINK(nxprov)) {
			nx_netif_llink_ext_fini_default_queues(nx);
		} else {
			nx_fini_rings(nx, ch);
		}

		ASSERT(ch->ch_flags & CHANF_EXT_PRECONNECT);
		nxprov->nxprov_ext.nxpi_disconnected(nxprov, nx, ch);
		os_atomic_andnot(&ch->ch_flags, CHANF_EXT_PRECONNECT, relaxed);

		SK_D("ch %p flags %x nx %p disconnected",
		    SK_KVA(ch), ch->ch_flags, SK_KVA(nx));

		/* We're done with this channel */
		ch->ch_ctx = NULL;

		/* reacquire locks in SK-then-channel order, then unref */
		lck_mtx_unlock(&ch->ch_lock);
		SK_LOCK();
		lck_mtx_lock(&ch->ch_lock);
		/* caller is expected to hold one, in addition to ourselves */
		VERIFY(ch->ch_refcnt >= 2);
		ch_release_locked(ch);
	}
	ASSERT(!(ch->ch_flags & (CHANF_EXT_CONNECTED | CHANF_EXT_PRECONNECT)));
	ASSERT(ch->ch_ctx == NULL);
}
1386 
/*
 * Common back-end for nxprov_create() and nxprov_create_kern().
 * Validates the registration parameters against the domain provider,
 * allocates the provider, records the validated region parameters
 * and (for external providers) the callback vector, then links the
 * provider into the global list.  On success two references are
 * held: one for the list and one for the caller.  Returns NULL with
 * *err set on failure.
 */
static struct kern_nexus_provider *
nxprov_create_common(struct nxctl *nxctl,
    struct kern_nexus_domain_provider *nxdom_prov, struct nxprov_reg *reg,
    const struct kern_nexus_provider_init *init, int *err)
{
	struct skmem_region_params srp[SKMEM_REGIONS];
	struct kern_nexus_provider *nxprov = NULL;
	struct nxprov_params nxp;
	uint32_t override = 0;
	uint32_t pp_region_config_flags;
	int i;

	static_assert(sizeof(*init) == sizeof(nxprov->nxprov_ext));
	static_assert(sizeof(*init) >= sizeof(struct kern_nexus_netif_provider_init));

	SK_LOCK_ASSERT_HELD();
	ASSERT(nxctl != NULL && reg != NULL && nxdom_prov != NULL);

	pp_region_config_flags = PP_REGION_CONFIG_MD_MAGAZINE_ENABLE |
	    PP_REGION_CONFIG_BUF_IODIR_BIDIR;
	/*
	 * Special handling for external nexus providers; similar
	 * logic to what's done in kern_pbufpool_create().
	 */
	if (init != NULL) {
		if (init->nxpi_flags & NXPIF_MONOLITHIC) {
			pp_region_config_flags |=
			    PP_REGION_CONFIG_BUF_MONOLITHIC;
		}

		if (init->nxpi_flags & NXPIF_INHIBIT_CACHE) {
			pp_region_config_flags |=
			    PP_REGION_CONFIG_BUF_NOCACHE;
		}
	}

	/*
	 * For network devices, set the packet metadata memory as persistent
	 * so that it is wired at segment creation.  This allows us to access
	 * it with preemption disabled, as well as for rdar://problem/46511741.
	 */
	if (nxdom_prov->nxdom_prov_dom->nxdom_type == NEXUS_TYPE_NET_IF) {
		pp_region_config_flags |= PP_REGION_CONFIG_MD_PERSISTENT;
	}

	/* process and validate provider parameters */
	if ((*err = nxdom_prov_validate_params(nxdom_prov, reg,
	    &nxp, srp, override, pp_region_config_flags)) != 0) {
		goto done;
	}

	/* Z_WAITOK allocation; blocks rather than failing */
	nxprov = nxprov_alloc(nxdom_prov, Z_WAITOK);
	ASSERT(nxprov->nxprov_dom_prov == nxdom_prov);

	/* attach to the global provider list */
	STAILQ_INIT(&nxprov->nxprov_nx_head);
	STAILQ_INSERT_TAIL(&nxprov_head, nxprov, nxprov_link);
	nxprov->nxprov_flags |= NXPROVF_ATTACHED;
	nxprov->nxprov_ctl = nxctl;
	uuid_generate_random(nxprov->nxprov_uuid);
	bcopy(&nxp, nxprov->nxprov_params, sizeof(struct nxprov_params));

	if (init != NULL) {
		/* external provider: record its callback vector */
		if (init->nxpi_version == KERN_NEXUS_PROVIDER_VERSION_NETIF) {
			ASSERT(NXPROV_LLINK(nxprov));
			bcopy(init, &nxprov->nxprov_netif_ext,
			    sizeof(nxprov->nxprov_netif_ext));
		} else {
			ASSERT(!NXPROV_LLINK(nxprov));
			ASSERT(init->nxpi_version ==
			    KERN_NEXUS_PROVIDER_CURRENT_VERSION);
			bcopy(init, &nxprov->nxprov_ext, sizeof(*init));
		}
		nxprov->nxprov_flags |= NXPROVF_EXTERNAL;
	}

	/* store validated region parameters to the provider */
	for (i = 0; i < SKMEM_REGIONS; i++) {
		nxprov->nxprov_region_params[i] = srp[i];
	}

	if (nxprov->nxprov_flags & NXPROVF_EXTERNAL) {
		uint32_t nxpi_flags = nxprov->nxprov_ext.nxpi_flags;

		if (nxpi_flags & NXPIF_VIRTUAL_DEVICE) {
			nxprov->nxprov_flags |= NXPROVF_VIRTUAL_DEVICE;
		}
	} else if (nxdom_prov->nxdom_prov_dom->nxdom_type !=
	    NEXUS_TYPE_NET_IF) {
		/*
		 * Treat non-netif built-in nexus providers as those
		 * meant for inter-process communications, i.e. there
		 * is no actual networking hardware involved.
		 */
		nxprov->nxprov_flags |= NXPROVF_VIRTUAL_DEVICE;
	}

	nxprov_retain_locked(nxprov);   /* one for being in the list */
	nxprov_retain_locked(nxprov);   /* one for the caller */

#if SK_LOG
	uuid_string_t uuidstr;
	SK_D("nxprov %p UUID %s", SK_KVA(nxprov),
	    sk_uuid_unparse(nxprov->nxprov_uuid, uuidstr));
#endif /* SK_LOG */

done:
	return nxprov;
}
1495 
/*
 * Create a nexus provider on behalf of a user process.  Performs a
 * per-type privilege check (kernel-pipe providers cannot be created
 * from user space), then hands off to nxprov_create_common() using
 * the default domain provider registered for the requested type.
 * Returns NULL with *err set on failure.
 */
struct kern_nexus_provider *
nxprov_create(struct proc *p, struct nxctl *nxctl, struct nxprov_reg *reg,
    int *err)
{
	struct nxprov_params *nxp = &reg->nxpreg_params;
	struct kern_nexus_domain_provider *nxdom_prov = NULL;
	struct kern_nexus_provider *nxprov = NULL;

	NXCTL_LOCK_ASSERT_HELD(nxctl);

	/* this path is for user controllers only */
	ASSERT(nxctl->nxctl_cred != proc_ucred_unsafe(kernproc));
	*err = 0;

	switch (nxp->nxp_type) {
	case NEXUS_TYPE_USER_PIPE:      /* only for userland */
		*err = skywalk_priv_check_cred(p, nxctl->nxctl_cred,
		    PRIV_SKYWALK_REGISTER_USER_PIPE);
		break;

	case NEXUS_TYPE_FLOW_SWITCH:    /* allowed for userland */
		*err = skywalk_priv_check_cred(p, nxctl->nxctl_cred,
		    PRIV_SKYWALK_REGISTER_FLOW_SWITCH);
		break;

	case NEXUS_TYPE_NET_IF:         /* allowed for userland */
		*err = skywalk_priv_check_cred(p, nxctl->nxctl_cred,
		    PRIV_SKYWALK_REGISTER_NET_IF);
		break;

	case NEXUS_TYPE_KERNEL_PIPE:    /* only for kernel */
	default:
		*err = EINVAL;
		goto done;
	}

	if (*err != 0) {
		goto done;
	}

	ASSERT(nxp->nxp_type < NEXUS_TYPE_MAX);
	if ((nxdom_prov = nxdom_prov_default[nxp->nxp_type]) == NULL) {
		*err = ENXIO;
		goto done;
	}

#if CONFIG_NEXUS_NETIF
	/* make sure netif_compat is the default here */
	ASSERT(nxp->nxp_type != NEXUS_TYPE_NET_IF ||
	    strbufcmp(nxdom_prov->nxdom_prov_name, sizeof(nxdom_prov->nxdom_prov_name),
	    NEXUS_PROVIDER_NET_IF_COMPAT, sizeof(NEXUS_PROVIDER_NET_IF_COMPAT)) == 0);
#endif /* CONFIG_NEXUS_NETIF */

	SK_LOCK();
	/* callee holds a reference for our caller upon success */
	nxprov = nxprov_create_common(nxctl, nxdom_prov, reg, NULL, err);
	SK_UNLOCK();
done:
	return nxprov;
}
1555 
1556 struct kern_nexus_provider *
nxprov_create_kern(struct nxctl * nxctl,struct kern_nexus_domain_provider * nxdom_prov,struct nxprov_reg * reg,const struct kern_nexus_provider_init * init,int * err)1557 nxprov_create_kern(struct nxctl *nxctl,
1558     struct kern_nexus_domain_provider *nxdom_prov, struct nxprov_reg *reg,
1559     const struct kern_nexus_provider_init *init, int *err)
1560 {
1561 	struct nxprov_params *nxp = &reg->nxpreg_params;
1562 	struct kern_nexus_provider *nxprov = NULL;
1563 
1564 	NXCTL_LOCK_ASSERT_HELD(nxctl);
1565 	SK_LOCK_ASSERT_HELD();
1566 
1567 	ASSERT(nxctl->nxctl_cred == proc_ucred_unsafe(kernproc));
1568 	ASSERT(nxp->nxp_type == nxdom_prov->nxdom_prov_dom->nxdom_type);
1569 	ASSERT(init == NULL ||
1570 	    init->nxpi_version == KERN_NEXUS_PROVIDER_CURRENT_VERSION ||
1571 	    init->nxpi_version == KERN_NEXUS_PROVIDER_VERSION_NETIF);
1572 
1573 	*err = 0;
1574 
1575 	switch (nxp->nxp_type) {
1576 	case NEXUS_TYPE_NET_IF:
1577 		break;
1578 	case NEXUS_TYPE_KERNEL_PIPE:
1579 		if (init == NULL) {
1580 			*err = EINVAL;
1581 			goto done;
1582 		}
1583 		break;
1584 	case NEXUS_TYPE_FLOW_SWITCH:
1585 		if (init != NULL) {
1586 			*err = EINVAL;
1587 			goto done;
1588 		}
1589 		break;
1590 
1591 	case NEXUS_TYPE_USER_PIPE:      /* only for userland */
1592 	default:
1593 		*err = EINVAL;
1594 		goto done;
1595 	}
1596 
1597 	/* callee holds a reference for our caller upon success */
1598 	nxprov = nxprov_create_common(nxctl, nxdom_prov, reg, init, err);
1599 
1600 done:
1601 	return nxprov;
1602 }
1603 
1604 int
nxprov_destroy(struct nxctl * nxctl,const uuid_t nxprov_uuid)1605 nxprov_destroy(struct nxctl *nxctl, const uuid_t nxprov_uuid)
1606 {
1607 	struct kern_nexus_provider *nxprov = NULL;
1608 	int err = 0;
1609 
1610 	NXCTL_LOCK_ASSERT_HELD(nxctl);
1611 
1612 	SK_LOCK();
1613 
1614 	STAILQ_FOREACH(nxprov, &nxprov_head, nxprov_link) {
1615 		if (nxctl == nxprov->nxprov_ctl &&
1616 		    uuid_compare(nxprov_uuid, nxprov->nxprov_uuid) == 0) {
1617 			nxprov_retain_locked(nxprov);
1618 			break;
1619 		}
1620 	}
1621 
1622 	if (nxprov == NULL) {
1623 		err = ENOENT;
1624 	} else {
1625 		err = nxprov_close(nxprov, TRUE);
1626 	}
1627 
1628 	if (nxprov != NULL) {
1629 		(void) nxprov_release_locked(nxprov);
1630 	}
1631 
1632 	SK_UNLOCK();
1633 
1634 	return err;
1635 }
1636 
/*
 * Close a nexus provider: detach it from its controller, close every
 * nexus instance created on it, then either detach it from the global
 * list immediately (no instances left) or mark it NXPROVF_CLOSED so
 * the detach happens when the last nexus is destroyed.  Returns
 * EALREADY if the provider was already closed.
 */
int
nxprov_close(struct kern_nexus_provider *nxprov, boolean_t locked)
{
	int err = 0;

	if (!locked) {
		SK_LOCK();
	}

	SK_LOCK_ASSERT_HELD();

#if SK_LOG
	uuid_string_t uuidstr;
	SK_D("nxprov %p UUID %s flags 0x%x", SK_KVA(nxprov),
	    sk_uuid_unparse(nxprov->nxprov_uuid, uuidstr),
	    nxprov->nxprov_flags);
#endif /* SK_LOG */

	if (nxprov->nxprov_flags & NXPROVF_CLOSED) {
		err = EALREADY;
	} else {
		struct kern_nexus *nx, *tnx;

		/* sever the link back to the controller */
		nxprov->nxprov_ctl = NULL;

		/* close every nexus created on this provider */
		STAILQ_FOREACH_SAFE(nx, &nxprov->nxprov_nx_head,
		    nx_prov_link, tnx) {
			nx_retain_locked(nx);
			(void) nx_close(nx, TRUE);
			(void) nx_release_locked(nx);
		}

		if (STAILQ_EMPTY(&nxprov->nxprov_nx_head)) {
			/* no nexus created on this, so detach now */
			nxprov_detach(nxprov, TRUE);
		} else {
			/* detach when last nexus is destroyed */
			ASSERT(nxprov->nxprov_refcnt > 1);
			nxprov->nxprov_flags |= NXPROVF_CLOSED;
		}
	}

	if (!locked) {
		SK_UNLOCK();
	}

	return err;
}
1685 
/*
 * Remove a provider from the global provider list and release the
 * list's reference.  The caller must hold its own reference on top
 * of the one being dropped here.
 */
static void
nxprov_detach(struct kern_nexus_provider *nxprov, boolean_t locked)
{
	if (!locked) {
		SK_LOCK();
	}

	SK_LOCK_ASSERT_HELD();

#if SK_LOG
	uuid_string_t uuidstr;
	SK_D("nxprov %p UUID %s flags 0x%x", SK_KVA(nxprov),
	    sk_uuid_unparse(nxprov->nxprov_uuid, uuidstr),
	    nxprov->nxprov_flags);
#endif /* SK_LOG */

	ASSERT(nxprov->nxprov_flags & NXPROVF_ATTACHED);
	STAILQ_REMOVE(&nxprov_head, nxprov, kern_nexus_provider, nxprov_link);
	nxprov->nxprov_flags &= ~NXPROVF_ATTACHED;

	/* caller must hold an extra ref */
	ASSERT(nxprov->nxprov_refcnt > 1);
	(void) nxprov_release_locked(nxprov);

	if (!locked) {
		SK_UNLOCK();
	}
}
1714 
1715 static struct kern_nexus_provider *
nxprov_alloc(struct kern_nexus_domain_provider * nxdom_prov,zalloc_flags_t how)1716 nxprov_alloc(struct kern_nexus_domain_provider *nxdom_prov, zalloc_flags_t how)
1717 {
1718 	struct kern_nexus_provider *nxprov;
1719 	struct nxprov_params *nxp;
1720 
1721 	ASSERT(nxdom_prov != NULL);
1722 
1723 	nxp = nxprov_params_alloc(how);
1724 	if (nxp == NULL) {
1725 		SK_ERR("Failed to allocate nxprov_params");
1726 		return NULL;
1727 	}
1728 
1729 	nxprov = zalloc_flags(nxprov_zone, how | Z_ZERO);
1730 	if (nxprov == NULL) {
1731 		SK_ERR("Failed to allocate nxprov");
1732 		nxprov_params_free(nxp);
1733 		return NULL;
1734 	}
1735 
1736 	nxprov->nxprov_dom_prov = nxdom_prov;
1737 	nxprov->nxprov_params = nxp;
1738 	/* hold a reference for nxprov */
1739 	nxdom_prov_retain_locked(nxdom_prov);
1740 
1741 	return nxprov;
1742 }
1743 
1744 static void
nxprov_free(struct kern_nexus_provider * nxprov)1745 nxprov_free(struct kern_nexus_provider *nxprov)
1746 {
1747 	struct kern_nexus_domain_provider *nxdom_prov =
1748 	    nxprov->nxprov_dom_prov;
1749 
1750 	SK_LOCK_ASSERT_HELD();
1751 
1752 	ASSERT(nxdom_prov != NULL);
1753 	(void) nxdom_prov_release_locked(nxdom_prov);
1754 	nxprov->nxprov_dom_prov = NULL;
1755 	ASSERT(nxprov->nxprov_params != NULL);
1756 	nxprov_params_free(nxprov->nxprov_params);
1757 	nxprov->nxprov_params = NULL;
1758 	ASSERT(!(nxprov->nxprov_flags & NXPROVF_ATTACHED));
1759 	SK_DF(SK_VERB_MEM, "nxprov %p FREE", SK_KVA(nxprov));
1760 	zfree(nxprov_zone, nxprov);
1761 }
1762 
1763 static void
nxprov_retain_locked(struct kern_nexus_provider * nxprov)1764 nxprov_retain_locked(struct kern_nexus_provider *nxprov)
1765 {
1766 	SK_LOCK_ASSERT_HELD();
1767 
1768 	nxprov->nxprov_refcnt++;
1769 	ASSERT(nxprov->nxprov_refcnt != 0);
1770 }
1771 
/*
 * Take a reference on a nexus provider, acquiring the SK lock.
 */
void
nxprov_retain(struct kern_nexus_provider *nxprov)
{
	SK_LOCK();
	nxprov_retain_locked(nxprov);
	SK_UNLOCK();
}
1779 
1780 static int
nxprov_release_locked(struct kern_nexus_provider * nxprov)1781 nxprov_release_locked(struct kern_nexus_provider *nxprov)
1782 {
1783 	int oldref = nxprov->nxprov_refcnt;
1784 
1785 	SK_LOCK_ASSERT_HELD();
1786 
1787 	ASSERT(nxprov->nxprov_refcnt != 0);
1788 	if (--nxprov->nxprov_refcnt == 0) {
1789 		nxprov_free(nxprov);
1790 	}
1791 
1792 	return oldref == 1;
1793 }
1794 
/*
 * Drop a reference on a nexus provider, acquiring the SK lock.
 * Returns nonzero when the reference dropped was the last one.
 */
int
nxprov_release(struct kern_nexus_provider *nxprov)
{
	int last;

	SK_LOCK();
	last = nxprov_release_locked(nxprov);
	SK_UNLOCK();
	return last;
}
1806 
1807 struct nxprov_params *
nxprov_params_alloc(zalloc_flags_t how)1808 nxprov_params_alloc(zalloc_flags_t how)
1809 {
1810 	return zalloc_flags(nxprov_params_zone, how | Z_ZERO);
1811 }
1812 
1813 void
nxprov_params_free(struct nxprov_params * nxp)1814 nxprov_params_free(struct nxprov_params *nxp)
1815 {
1816 	SK_DF(SK_VERB_MEM, "nxp %p FREE", SK_KVA(nxp));
1817 	zfree(nxprov_params_zone, nxp);
1818 }
1819 
/*
 * Validate a caller-supplied packet buffer pool against a nexus
 * provider: the pool must be external and not closed, its metadata
 * type/subtype must match the provider's domain, and its monolithic
 * buffer configuration must agree with the provider's
 * NXPIF_MONOLITHIC setting.  Returns 0 when compatible.
 */
static int
nx_check_pp(struct kern_nexus_provider *nxprov, struct kern_pbufpool *pp)
{
	struct kern_nexus_domain_provider *nxdom_prov = nxprov->nxprov_dom_prov;

	/* only external, still-open pools are acceptable */
	if ((pp->pp_flags & (PPF_EXTERNAL | PPF_CLOSED)) != PPF_EXTERNAL) {
		SK_ERR("Rejecting \"%s\" built-in pp", pp->pp_name);
		return ENOTSUP;
	}

	/*
	 * Require that the nexus domain metadata type and the
	 * metadata type of the caller-provided pbufpool match.
	 */
	if (nxdom_prov->nxdom_prov_dom->nxdom_md_type !=
	    pp->pp_md_type ||
	    nxdom_prov->nxdom_prov_dom->nxdom_md_subtype !=
	    pp->pp_md_subtype) {
		SK_ERR("Mismatch in metadata type/subtype "
		    "(%u/%u != %u/%u)", pp->pp_md_type,
		    nxdom_prov->nxdom_prov_dom->nxdom_md_type,
		    pp->pp_md_subtype,
		    nxdom_prov->nxdom_prov_dom->nxdom_md_subtype);
		return EINVAL;
	}

	/*
	 * Require that the nexus provider memory configuration
	 * has the same impedance as the caller-provided one.
	 * Both need to be lacking or present; if one of them
	 * is set and the other isn't, then we bail.
	 */
	if (!!(PP_BUF_REGION_DEF(pp)->skr_mode & SKR_MODE_MONOLITHIC) ^
	    !!(nxprov->nxprov_ext.nxpi_flags & NXPIF_MONOLITHIC)) {
		SK_ERR("Memory config mismatch: monolithic mode");
		return EINVAL;
	}

	return 0;
}
1860 
/*
 * Instantiate a nexus from a previously-registered provider.
 * Validates the provider (must belong to nxctl and not be closed),
 * the optional domain type, and any caller-supplied TX/RX packet
 * pools, then allocates the nexus, runs the domain constructor, and
 * links it into the provider's list and the global RB tree.  On
 * success three references are held: one for the provider list, one
 * for the global list, and one returned to the caller.  Returns NULL
 * with *err set on failure.
 */
struct kern_nexus *
nx_create(struct nxctl *nxctl, const uuid_t nxprov_uuid,
    const nexus_type_t dom_type, const void *nx_ctx,
    nexus_ctx_release_fn_t nx_ctx_release, struct kern_pbufpool *tx_pp,
    struct kern_pbufpool *rx_pp, int *err)
{
	struct kern_nexus_domain_provider *nxdom_prov;
	struct kern_nexus_provider *nxprov = NULL;
	struct kern_nexus *nx = NULL;
#if SK_LOG
	uuid_string_t uuidstr;
#endif /* SK_LOG */

	NXCTL_LOCK_ASSERT_HELD(nxctl);

	ASSERT(dom_type < NEXUS_TYPE_MAX);
	ASSERT(!uuid_is_null(nxprov_uuid));
	*err = 0;

	SK_LOCK();

	/* look up the provider owned by this controller */
	STAILQ_FOREACH(nxprov, &nxprov_head, nxprov_link) {
		if (nxctl == nxprov->nxprov_ctl &&
		    uuid_compare(nxprov_uuid, nxprov->nxprov_uuid) == 0) {
			break;
		}
	}

	if (nxprov == NULL || (nxprov->nxprov_flags & NXPROVF_CLOSED)) {
		SK_ERR("Provider not found or has been closed");
		*err = ENOENT;
		goto done;
	}

	/* if a domain type was specified, it must match the provider's */
	nxdom_prov = nxprov->nxprov_dom_prov;
	if (dom_type != NEXUS_TYPE_UNDEFINED &&
	    (nxdom_prov->nxdom_prov_dom->nxdom_type != dom_type)) {
		SK_ERR("Mismatch in domain type (0x%u != 0x%u)",
		    dom_type, nxdom_prov->nxdom_prov_dom->nxdom_type);
		nxdom_prov = NULL;
		nxprov = NULL;
		*err = ENODEV;
		goto done;
	}

	/* netif logical-link providers must supply both packet pools */
	if ((dom_type == NEXUS_TYPE_NET_IF) && NXPROV_LLINK(nxprov) &&
	    (!tx_pp || !rx_pp)) {
#if SK_LOG
		SK_ERR("TX/RX packet pool is required for netif logical link "
		    "nexus provider UUID: %s",
		    sk_uuid_unparse(nxprov_uuid, uuidstr));
#endif /* SK_LOG */
		nxdom_prov = NULL;
		nxprov = NULL;
		*err = EINVAL;
		goto done;
	}

	/* caller-supplied pools must be compatible with the provider */
	if ((tx_pp != NULL && (*err = nx_check_pp(nxprov, tx_pp)) != 0) ||
	    (rx_pp != NULL && (*err = nx_check_pp(nxprov, rx_pp)) != 0)) {
		goto done;
	}

	nx = nx_alloc(Z_WAITOK);

	STAILQ_INIT(&nx->nx_ch_head);
	STAILQ_INIT(&nx->nx_ch_nonxref_head);
	lck_rw_init(&nx->nx_ch_if_adv_lock, &nexus_lock_group,
	    &nexus_lock_attr);
	STAILQ_INIT(&nx->nx_ch_if_adv_head);
	uuid_generate_random(nx->nx_uuid);
	nx->nx_prov = nxprov;
	nx->nx_ctx = __DECONST(void *, nx_ctx);
	nx->nx_ctx_release = nx_ctx_release;
	nx->nx_id = nxdom_prov->nxdom_prov_gencnt++;

	if (tx_pp != NULL) {
		nx->nx_tx_pp = tx_pp;
		pp_retain(tx_pp);       /* released by nx_free */
	}

	if (rx_pp != NULL) {
		nx->nx_rx_pp = rx_pp;
		pp_retain(rx_pp);       /* released by nx_free */
	}

	/* this nexus is alive; tell the nexus constructor to set it up */
	if (nxprov->nxprov_dom_prov->nxdom_prov_nx_ctor != NULL) {
		*err = nxprov->nxprov_dom_prov->nxdom_prov_nx_ctor(nx);
		if (*err != 0) {
			/* drop the provider link so nx_free won't touch it */
			nx->nx_prov = NULL;
			goto done;
		}
	}

	nxprov_retain_locked(nxprov);   /* hold a ref on the nexus reg */

	STAILQ_INSERT_TAIL(&nxprov->nxprov_nx_head, nx, nx_prov_link);
	nxprov->nxprov_nx_count++;
	RB_INSERT(kern_nexus_tree, &nx_head, nx);
	os_atomic_or(&nx->nx_flags, NXF_ATTACHED, relaxed);

	nx_retain_locked(nx);   /* one for the provider list */
	nx_retain_locked(nx);   /* one for the global list */
	nx_retain_locked(nx);   /* one for the caller */

#if SK_LOG
	SK_D("nexus %p (%s:%s) UUID %s", SK_KVA(nx),
	    nxdom_prov->nxdom_prov_dom->nxdom_name,
	    nxdom_prov->nxdom_prov_name, sk_uuid_unparse(nx->nx_uuid, uuidstr));
#endif /* SK_LOG */
done:
	SK_UNLOCK();

	if (*err != 0) {
		if (nx != NULL) {
			nx_free(nx);
			nx = NULL;
		}
	}
	return nx;
}
1983 
1984 int
nx_destroy(struct nxctl * nxctl,const uuid_t nx_uuid)1985 nx_destroy(struct nxctl *nxctl, const uuid_t nx_uuid)
1986 {
1987 	struct kern_nexus *nx = NULL;
1988 	struct kern_nexus find;
1989 	int err = 0;
1990 
1991 	NXCTL_LOCK_ASSERT_HELD(nxctl);
1992 
1993 	SK_LOCK();
1994 
1995 	uuid_copy(find.nx_uuid, nx_uuid);
1996 	nx = RB_FIND(kern_nexus_tree, &nx_head, &find);
1997 	if (nx != NULL && nxctl != NX_PROV(nx)->nxprov_ctl) {
1998 		nx = NULL;
1999 	}
2000 
2001 	if (nx != NULL) {
2002 		nx_retain_locked(nx);
2003 	}
2004 
2005 	if (nx == NULL) {
2006 		err = ENOENT;
2007 	} else {
2008 		/* prevent any opens */
2009 		os_atomic_or(&nx->nx_flags, NXF_INVALIDATED, relaxed);
2010 		err = nx_close(nx, TRUE);
2011 		(void) nx_release_locked(nx);
2012 	}
2013 
2014 	SK_UNLOCK();
2015 
2016 	return err;
2017 }
2018 
2019 static inline int
nx_cmp(const struct kern_nexus * a,const struct kern_nexus * b)2020 nx_cmp(const struct kern_nexus *a, const struct kern_nexus *b)
2021 {
2022 	return uuid_compare(a->nx_uuid, b->nx_uuid);
2023 }
2024 
2025 struct kern_nexus *
nx_find(const uuid_t nx_uuid,boolean_t locked)2026 nx_find(const uuid_t nx_uuid, boolean_t locked)
2027 {
2028 	struct kern_nexus *nx = NULL;
2029 	struct kern_nexus find;
2030 
2031 	if (!locked) {
2032 		SK_LOCK();
2033 	}
2034 
2035 	SK_LOCK_ASSERT_HELD();
2036 
2037 	uuid_copy(find.nx_uuid, nx_uuid);
2038 	nx = RB_FIND(kern_nexus_tree, &nx_head, &find);
2039 	if (nx != NULL && (nx->nx_flags & NXF_CLOSED)) {
2040 		nx = NULL;
2041 	}
2042 
2043 	/* return reference to caller */
2044 	if (nx != NULL) {
2045 		nx_retain_locked(nx);
2046 	}
2047 
2048 	if (!locked) {
2049 		SK_UNLOCK();
2050 	}
2051 
2052 	return nx;
2053 }
2054 
/*
 * Close a nexus: if no channels remain open on it, detach it right
 * away; otherwise mark it NXF_CLOSED so the detach happens when the
 * last channel closes.  Returns EALREADY if already closed.
 */
int
nx_close(struct kern_nexus *nx, boolean_t locked)
{
	int err = 0;

	if (!locked) {
		SK_LOCK();
	}

	SK_LOCK_ASSERT_HELD();


	if (nx->nx_flags & NXF_CLOSED) {
		err = EALREADY;
	} else {
#if SK_LOG
		uuid_string_t uuidstr;
		SK_D("nexus %p (%s:%s) UUID %s flags 0x%x", SK_KVA(nx),
		    NX_DOM(nx)->nxdom_name, NX_DOM_PROV(nx)->nxdom_prov_name,
		    sk_uuid_unparse(nx->nx_uuid, uuidstr), nx->nx_flags);
#endif /* SK_LOG */

		if (STAILQ_EMPTY(&nx->nx_ch_head)) {
			/* no regular channels open to it, so detach now */
			nx_detach(nx);
		} else {
			/* detach when the last channel closes */
			ASSERT(nx->nx_refcnt > 3);
			os_atomic_or(&nx->nx_flags, NXF_CLOSED, relaxed);
		}
	}

	if (!locked) {
		SK_UNLOCK();
	}

	return err;
}
2093 
2094 void
nx_stop(struct kern_nexus * nx)2095 nx_stop(struct kern_nexus *nx)
2096 {
2097 	struct kern_nexus_provider *nxprov = nx->nx_prov;
2098 
2099 	SK_LOCK_ASSERT_HELD();
2100 
2101 	/* send a stop message */
2102 	if (nxprov->nxprov_dom_prov->nxdom_prov_nx_stop != NULL) {
2103 		nxprov->nxprov_dom_prov->nxdom_prov_nx_stop(nx);
2104 	}
2105 }
2106 
/*
 * Detach a nexus from its provider and remove it from the provider's
 * list and the global tree.  Called with the SK lock held, either at
 * nx_close() time when no channels remain, or when the last channel
 * closes.  Drops the two list references; remaining references are
 * dropped by the caller.
 */
void
nx_detach(struct kern_nexus *nx)
{
	struct kern_nexus_provider *nxprov = nx->nx_prov;

	SK_LOCK_ASSERT_HELD();

#if SK_LOG
	uuid_string_t uuidstr;
	SK_D("nexus %p UUID %s flags 0x%x", SK_KVA(nx),
	    sk_uuid_unparse(nx->nx_uuid, uuidstr), nx->nx_flags);
#endif /* SK_LOG */

	/* Caller must hold extra refs, on top of the two in reg/global lists */
	ASSERT(nx->nx_refcnt >= 3);
	ASSERT(nx->nx_flags & NXF_ATTACHED);

	/* this nexus is done; let the nexus destructor do final cleanups */
	if (nxprov->nxprov_dom_prov->nxdom_prov_nx_dtor != NULL) {
		nxprov->nxprov_dom_prov->nxdom_prov_nx_dtor(nx);
	}

	/* by now all channels (regular and non-xref) must be gone */
	ASSERT(STAILQ_EMPTY(&nx->nx_ch_head));
	ASSERT(STAILQ_EMPTY(&nx->nx_ch_nonxref_head));

	/* unlink from the provider's nexus list and the global tree */
	STAILQ_REMOVE(&nxprov->nxprov_nx_head, nx, kern_nexus, nx_prov_link);
	nxprov->nxprov_nx_count--;
	RB_REMOVE(kern_nexus_tree, &nx_head, nx);
	os_atomic_andnot(&nx->nx_flags, NXF_ATTACHED, relaxed);
	nx->nx_prov = NULL;
	/* give back the client context, if a release callback was set */
	if (nx->nx_ctx_release != NULL) {
		nx->nx_ctx_release(nx->nx_ctx);
	}
	nx->nx_ctx = NULL;

	(void) nx_release_locked(nx);   /* one for the reg list */
	(void) nx_release_locked(nx);   /* one for the global list */

	/*
	 * If this was the last nexus and the provider has been closed,
	 * detach the provider and finish up the postponed job.
	 */
	if (STAILQ_EMPTY(&nxprov->nxprov_nx_head) &&
	    (nxprov->nxprov_flags & NXPROVF_CLOSED)) {
		nxprov_detach(nxprov, TRUE);
	}
	(void) nxprov_release_locked(nxprov);
}
2155 
/*
 * Create the shared nexus advisory region and carve out the single
 * advisory object, tagging it with a metadata header for the given
 * advisory type (flowswitch or netif).  Returns 0 on success, or
 * ENOMEM if the backing skmem region cannot be created.
 */
int
nx_advisory_alloc(struct kern_nexus *nx, const char *name,
    struct skmem_region_params *srp_nexusadv, nexus_advisory_type_t type)
{
	struct __kern_nexus_adv_metadata *adv_md;
	uint32_t msize = 0;
	/* -fbounds-safety: why do we need maddr? */
	void *__sized_by(msize) maddr = NULL;

	/* the metadata header must stay exactly one 64-bit word */
	static_assert(sizeof(struct __kern_nexus_adv_metadata) == sizeof(uint64_t));
	/* header + either advisory layout must fit in one advisory object */
	static_assert((sizeof(struct sk_nexusadv) +
	    sizeof(struct __kern_nexus_adv_metadata)) <= NX_NEXUSADV_MAX_SZ);
	static_assert((sizeof(struct netif_nexus_advisory) +
	    sizeof(struct __kern_nexus_adv_metadata)) <= NX_NEXUSADV_MAX_SZ);
	ASSERT(nx->nx_adv.nxv_reg == NULL);
	ASSERT(nx->nx_adv.nxv_adv == NULL);
	ASSERT(type == NEXUS_ADVISORY_TYPE_FLOWSWITCH ||
	    type == NEXUS_ADVISORY_TYPE_NETIF);

	if ((nx->nx_adv.nxv_reg = skmem_region_create(name, srp_nexusadv,
	    NULL, NULL, NULL)) == NULL) {
		return ENOMEM;
	}

	/*
	 * NOTE(review): SKMEM_PANIC presumably makes this allocation
	 * fatal on failure rather than returning NULL, which is why
	 * nxv_adv is dereferenced below without a NULL check -- confirm.
	 */
	nx->nx_adv.nxv_adv = skmem_region_alloc(nx->nx_adv.nxv_reg, &maddr,
	    NULL, NULL, (SKMEM_NOSLEEP | SKMEM_PANIC),
	    nx->nx_adv.nxv_reg->skr_c_obj_size, &msize);
	nx->nx_adv.nxv_adv_size = nx->nx_adv.nxv_reg->skr_c_obj_size;
	adv_md = nx->nx_adv.nxv_adv;
	adv_md->knam_version = NX_ADVISORY_MD_CURRENT_VERSION;
	adv_md->knam_type = type;
	adv_md->__reserved = 0;
	nx->nx_adv.nxv_adv_type = type;
	/* the type-specific advisory starts right after the metadata */
	nx->nx_adv.flowswitch_nxv_adv = (void *)(adv_md + 1);
	if (type == NEXUS_ADVISORY_TYPE_FLOWSWITCH) {
		nx->nx_adv.flowswitch_nxv_adv->nxadv_ver =
		    NX_FLOWSWITCH_ADVISORY_CURRENT_VERSION;
	} else {
		nx->nx_adv.netif_nxv_adv->nna_version =
		    NX_NETIF_ADVISORY_CURRENT_VERSION;
	}
	return 0;
}
2199 
/*
 * Tear down the nexus advisory object and its backing region, if one
 * was allocated; safe to call when no advisory exists.
 */
void
nx_advisory_free(struct kern_nexus *nx)
{
	if (nx->nx_adv.nxv_reg != NULL) {
		ASSERT(nx->nx_adv.nxv_adv != NULL);
		/* return the object to the region before releasing it */
		skmem_region_free(nx->nx_adv.nxv_reg,
		    nx->nx_adv.nxv_adv, NULL);
		nx->nx_adv.nxv_adv = NULL;
		nx->nx_adv.nxv_adv_size = 0;
		nx->nx_adv.nxv_adv_type = NEXUS_ADVISORY_TYPE_INVALID;
		nx->nx_adv.flowswitch_nxv_adv = NULL;
		skmem_region_release(nx->nx_adv.nxv_reg);
		nx->nx_adv.nxv_reg = NULL;
	}

	/* regardless of the path taken, everything must be quiesced */
	ASSERT(nx->nx_adv.nxv_reg == NULL);
	ASSERT(nx->nx_adv.nxv_adv == NULL);
	ASSERT(nx->nx_adv.nxv_adv_type == NEXUS_ADVISORY_TYPE_INVALID);
	ASSERT(nx->nx_adv.flowswitch_nxv_adv == NULL);
}
2220 
2221 static struct kern_nexus *
nx_alloc(zalloc_flags_t how)2222 nx_alloc(zalloc_flags_t how)
2223 {
2224 	SK_LOCK_ASSERT_HELD();
2225 
2226 	return zalloc_flags(nx_zone, how | Z_ZERO);
2227 }
2228 
/*
 * Final destructor for a nexus; invoked from nx_release_locked() when
 * the last reference is dropped.  The nexus must already be detached
 * from its provider and have no channels open to it.
 */
static void
nx_free(struct kern_nexus *nx)
{
	ASSERT(!(nx->nx_flags & NXF_ATTACHED) && nx->nx_prov == NULL);
	ASSERT(STAILQ_EMPTY(&nx->nx_ch_head));
	ASSERT(STAILQ_EMPTY(&nx->nx_ch_nonxref_head));

	/* release all ports, their bindings and per-port info records */
	nx_port_free_all(nx);

	/* drop this nexus's references on its TX/RX packet pools, if any */
	if (nx->nx_tx_pp != NULL) {
		pp_release(nx->nx_tx_pp);
		nx->nx_tx_pp = NULL;
	}
	if (nx->nx_rx_pp != NULL) {
		pp_release(nx->nx_rx_pp);
		nx->nx_rx_pp = NULL;
	}

	ASSERT(STAILQ_EMPTY(&nx->nx_ch_if_adv_head));
	lck_rw_destroy(&nx->nx_ch_if_adv_lock, &nexus_lock_group);

	SK_DF(SK_VERB_MEM, "nexus %p FREE", SK_KVA(nx));
	zfree(nx_zone, nx);
}
2253 
/*
 * Take a reference on a nexus; caller must hold the SK lock.
 */
void
nx_retain_locked(struct kern_nexus *nx)
{
	SK_LOCK_ASSERT_HELD();

	nx->nx_refcnt++;
	VERIFY(nx->nx_refcnt > 0);      /* guard against wrap */
}
2262 
/*
 * Take a reference on a nexus, acquiring the SK lock around the bump.
 */
void
nx_retain(struct kern_nexus *nx)
{
	SK_LOCK();
	nx_retain_locked(nx);
	SK_UNLOCK();
}
2270 
2271 int
nx_release_locked(struct kern_nexus * nx)2272 nx_release_locked(struct kern_nexus *nx)
2273 {
2274 	int oldref = nx->nx_refcnt;
2275 
2276 	SK_LOCK_ASSERT_HELD();
2277 
2278 	VERIFY(nx->nx_refcnt > 0);
2279 	if (--nx->nx_refcnt == 0) {
2280 		nx_free(nx);
2281 	}
2282 
2283 	return oldref == 1;
2284 }
2285 
/*
 * Drop a reference on a nexus without holding the SK lock; returns
 * non-zero if this was the last reference.
 */
int
nx_release(struct kern_nexus *nx)
{
	int lastref;

	SK_LOCK_ASSERT_NOTHELD();

	SK_LOCK();
	lastref = nx_release_locked(nx);
	SK_UNLOCK();

	return lastref;
}
2299 
/*
 * Invoke the external provider's per-ring (and per-slot) init callbacks
 * for every non-host ring of the channel's adapter, during the channel
 * preconnect phase.  On any failure, rings initialized so far are
 * unwound via nx_fini_rings().  Returns 0 on success or the error
 * from the provider's ring/slot init callback.
 */
static int
nx_init_rings(struct kern_nexus *nx, struct kern_channel *ch)
{
	struct kern_nexus_provider *nxprov = NX_PROV(nx);
	struct nexus_adapter *na = ch->ch_na;
	boolean_t undo = FALSE;
	int ksd_retains = 0;            /* rings needing a KSD busy retain */
	enum txrx t;
	int err = 0;

	/* only valid while preconnected but not yet connected */
	ASSERT((ch->ch_flags & (CHANF_EXT_PRECONNECT | CHANF_EXT_CONNECTED)) ==
	    CHANF_EXT_PRECONNECT);

	/* nothing to do if the provider has no ring init callback */
	if (nxprov->nxprov_ext.nxpi_ring_init == NULL) {
		return 0;
	}

	for_rx_tx(t) {
		uint32_t i;

		for (i = 0; i < na_get_nrings(na, t); i++) {
			struct __kern_channel_ring *kring = &NAKR(na, t)[i];

			/* skip host rings */
			if (kring->ckr_flags & CKRF_HOST) {
				continue;
			}

			if ((err = nxprov->nxprov_ext.nxpi_ring_init(
				    nxprov, nx, ch, kring, (kring->ckr_tx == NR_TX),
				    &kring->ckr_ctx)) != 0) {
				SK_D("ch %p flags %x nx %p kr \"%s\" "
				    "(%p) krflags %x ring_init error %d",
				    SK_KVA(ch), ch->ch_flags, SK_KVA(nx),
				    kring->ckr_name, SK_KVA(kring),
				    kring->ckr_flags, err);
				kring->ckr_ctx = NULL;
				undo = TRUE;
				break;
			}
			kring->ckr_flags |= CKRF_EXT_RING_INITED;

			if ((err = nx_init_slots(nx, kring)) != 0) {
				undo = TRUE;
				break;
			}

			/* slots inited on this ring: one more KSD retain */
			if (kring->ckr_flags & CKRF_EXT_SLOTS_INITED) {
				++ksd_retains;
			}
		}
		if (undo) {
			break;
		}
	}

	/*
	 * Note: retain KSD even in case of error, as we have set
	 * CKRF_EXT_SLOTS_INITED flag for some of the rings
	 * nx_fini_rings would take care of release based on it.
	 */
	if (ksd_retains != 0) {
		/*
		 * Mark the kernel slot descriptor region as busy; this
		 * prevents it from being torn-down at channel defunct
		 * time, as we need to invoke the slot_fini() callback
		 * for each slot and we need the descriptors until then.
		 */
		skmem_arena_nexus_sd_set_noidle(skmem_arena_nexus(na->na_arena),
		    ksd_retains);
	}

	if (err != 0) {
		ASSERT(undo);
		nx_fini_rings(nx, ch);
	}

	return err;
}
2379 
/*
 * Undo nx_init_rings(): invoke the provider's ring_fini (and, via
 * nx_fini_slots(), slot_fini) callbacks on every ring that was
 * externally initialized, then drop the KSD busy retains taken in
 * nx_init_rings().
 */
static void
nx_fini_rings(struct kern_nexus *nx, struct kern_channel *ch)
{
	struct kern_nexus_provider *nxprov = NX_PROV(nx);
	struct nexus_adapter *na = ch->ch_na;
	int ksd_releases = 0;
	enum txrx t;

	for_rx_tx(t) {
		uint32_t i;

		for (i = 0; i < na_get_nrings(na, t); i++) {
			struct __kern_channel_ring *kring = &NAKR(na, t)[i];

			/* only rings the provider saw in ring_init */
			if (!(kring->ckr_flags & CKRF_EXT_RING_INITED)) {
				continue;
			}

			ASSERT(!(kring->ckr_flags & CKRF_HOST));
			ASSERT(nxprov->nxprov_ext.nxpi_ring_fini != NULL);
			nxprov->nxprov_ext.nxpi_ring_fini(nxprov, nx, kring);
			kring->ckr_flags &= ~CKRF_EXT_RING_INITED;

			/* count KSD retains to give back below */
			if (kring->ckr_flags & CKRF_EXT_SLOTS_INITED) {
				++ksd_releases;
			}

			/*
			 * Undo the work done in nx_init_slots() and inform
			 * the external domain provider, if applicable, that
			 * the slots for this ring are no longer valid.
			 */
			nx_fini_slots(nx, kring);
			kring->ckr_ctx = NULL;
		}
	}

	if (ksd_releases != 0) {
		/*
		 * Now that we've finished invoking the slot_fini()
		 * callbacks, release the busy retain counts held
		 * earlier in nx_init_rings().  This will allow the
		 * kernel slot descriptor region to be torn down.
		 */
		skmem_arena_nexus_sd_set_noidle(
			skmem_arena_nexus(na->na_arena), -ksd_releases);
	}
}
2428 
/*
 * Invoke the provider's slot_init callback for each slot of a ring,
 * recording the per-slot context it returns.  On failure, slots
 * initialized so far are unwound via nx_fini_slots().  Returns 0 on
 * success or the error from the provider.
 */
static int
nx_init_slots(struct kern_nexus *nx, struct __kern_channel_ring *kring)
{
	struct kern_nexus_provider *nxprov = NX_PROV(nx);
	struct __slot_desc *slot = kring->ckr_ksds;
	int err = 0;
	uint32_t i;

	/*
	 * If the slot init callback was not provided, or if the
	 * kring was not created to hold any slot contexts, don't
	 * go any further.
	 */
	if (nxprov->nxprov_ext.nxpi_slot_init == NULL ||
	    kring->ckr_slot_ctxs == NULL) {
		return 0;
	}

	ASSERT(kring->ckr_slot_ctxs_set == 0);
	ASSERT(slot != NULL);

	for (i = 0; i < kring->ckr_num_slots; i++) {
		struct kern_slot_prop *__single slot_ctx_prop = NULL;
		/* -fbounds-safety: slot_ctx is unsafe anyway (mach_vmaddr_t) */
		void *__single slot_ctx_arg = NULL;

		ASSERT(&slot[i] <= kring->ckr_ksds_last);
		if ((err = nxprov->nxprov_ext.nxpi_slot_init(nxprov, nx, kring,
		    &slot[i], i, &slot_ctx_prop, &slot_ctx_arg)) != 0) {
			SK_D("nx %p kr \"%s\" (%p) krflags %x slot %u "
			    "slot_init error %d", SK_KVA(nx), kring->ckr_name,
			    SK_KVA(kring), kring->ckr_flags, i, err);
			break;
		}
		/* we don't want this to be used by client, so verify here */
		ASSERT(slot_ctx_prop == NULL);
		kring->ckr_slot_ctxs[i].slot_ctx_arg = slot_ctx_arg;
		/* ckr_slot_ctxs_set counts slots that will need fini */
		kring->ckr_slot_ctxs_set++;
	}

	if (err != 0) {
		nx_fini_slots(nx, kring);
	} else {
		kring->ckr_flags |= CKRF_EXT_SLOTS_INITED;
	}

	return err;
}
2477 
/*
 * Undo nx_init_slots(): invoke slot_fini on every slot whose context
 * was set, clear the stored contexts, and drop CKRF_EXT_SLOTS_INITED.
 * Also used for partial unwind when nx_init_slots() fails midway.
 */
static void
nx_fini_slots(struct kern_nexus *nx, struct __kern_channel_ring *kring)
{
	struct kern_nexus_provider *nxprov = NX_PROV(nx);
	struct __slot_desc *slot = kring->ckr_ksds;
	uint32_t i;

	/* fully-inited slots imply a fini callback and valid descriptors */
	ASSERT(!(kring->ckr_flags & CKRF_EXT_SLOTS_INITED) ||
	    nxprov->nxprov_ext.nxpi_slot_fini != NULL);
	ASSERT(slot != NULL || !(kring->ckr_flags & CKRF_EXT_SLOTS_INITED));

	/* only slots whose contexts were actually set need fini */
	for (i = 0; i < kring->ckr_slot_ctxs_set; i++) {
		ASSERT(slot != NULL && &slot[i] <= kring->ckr_ksds_last);
		if (nxprov->nxprov_ext.nxpi_slot_fini != NULL) {
			nxprov->nxprov_ext.nxpi_slot_fini(nxprov, nx,
			    kring, &slot[i], i);
		}
		if (kring->ckr_slot_ctxs != NULL) {
			kring->ckr_slot_ctxs[i].slot_ctx_arg = 0;
		}
	}
	kring->ckr_slot_ctxs_set = 0;

	/* We're done with this kring */
	kring->ckr_flags &= ~CKRF_EXT_SLOTS_INITED;
}
2504 
2505 
/*
 * 64-bit mask with range: bits [_beg, _end] (inclusive) set, built by
 * trimming NX_PORT_CHUNK_FREE (presumably the all-ones free-chunk
 * pattern) from both ends.
 */
#define BMASK64(_beg, _end)     \
	((NX_PORT_CHUNK_FREE >> (63 - (_end))) & ~((1ULL << (_beg)) - 1))
2509 
/*
 * Find the first available port in the half-open range [first, last).
 * Only the currently-allocated bitmap is scanned; if the range extends
 * past the current table, the next port index is returned instead so
 * that nx_port_alloc() can grow the table on demand.  Returns 0 with
 * *nx_port set, or EBUSY when an entirely in-range request finds no
 * free port.
 */
int
nx_port_find(struct kern_nexus *nx, nexus_port_t first,
    nexus_port_t last, nexus_port_t *nx_port)
{
	int err = 0;

	ASSERT(first < last);
	*nx_port = NEXUS_PORT_ANY;

	if (nx->nx_num_ports == 0 || (first + 1) >= nx->nx_num_ports) {
		/*
		 * Left edge of the range is beyond the current map;
		 * let nx_port_alloc() handle the growing later.
		 */
		*nx_port = first;
	} else {
		nexus_port_size_t fc = (first / NX_PORT_CHUNK);
		nexus_port_size_t lc = (MIN(last, nx->nx_num_ports) / NX_PORT_CHUNK);
		nexus_port_size_t lim = (nx->nx_num_ports / NX_PORT_CHUNK);
		nexus_port_size_t i, j;
		bitmap_t *bmap;

		/*
		 * The right edge of the range is either within or
		 * beyond the current map; scan thru the current
		 * map and find the first available port.
		 */
		for (i = fc; i <= lc; i++) {
			bitmap_t mask;
			nexus_port_size_t beg = 0, end = 63;

			/* clamp the mask to the range edges in this chunk */
			if (i == fc) {
				beg = (first % NX_PORT_CHUNK);
			}
			if (i == (last / NX_PORT_CHUNK)) {
				end = (last % NX_PORT_CHUNK);
			}

			if (i < lim) {
				bmap = &nx->nx_ports_bmap[i];
				mask = BMASK64(beg, end);

				/* ffsll() is 1-based; 0 means no free bit */
				j = (nexus_port_size_t)ffsll((*bmap) & mask);
				if (j == 0) {
					continue;
				}

				--j;
				*nx_port = (i * NX_PORT_CHUNK) + j;
			}
			break;
		}

		/*
		 * If the requested range is within the current map and we
		 * couldn't find a port, return an err.  Otherwise, return
		 * the next port index to trigger growing later.
		 */
		if (*nx_port == NEXUS_PORT_ANY) {
			if (lc == (last / NX_PORT_CHUNK)) {
				err = EBUSY;
				SK_ERR("port unavail in [%u, %u)", first, last);
			} else {
				*nx_port = nx->nx_num_ports;
			}
		}
	}

	SK_DF(SK_VERB_NXPORT, "nx %p nx_port %d (err %d)", SK_KVA(nx),
	    (int)*nx_port, err);

	return err;
}
2583 
/*
 * Grow the nexus port table and its allocation bitmap by 'grow' ports
 * (a multiple of NX_PORT_CHUNK), bounded by the domain port maximum.
 * Returns 0 on success, EDOM if the new size would exceed the domain
 * limit, or ENOMEM on allocation failure.
 */
static int
nx_port_grow(struct kern_nexus *nx, nexus_port_size_t grow)
{
	ASSERT(NXDOM_MAX(NX_DOM(nx), ports) <= NEXUS_PORT_MAX);
	nexus_port_t dom_port_max = (nexus_port_size_t)NXDOM_MAX(NX_DOM(nx), ports);
	struct nx_port_info *ports;
	nexus_port_size_t limit, i, num_ports, old_num_ports;
	bitmap_t *bmap;

	ASSERT(grow > 0 && (grow % NX_PORT_CHUNK) == 0);
	ASSERT((nx->nx_num_ports % NX_PORT_CHUNK) == 0);
	/* one bitmap_t word tracks exactly one chunk of ports */
	static_assert((sizeof(*bmap) * 8) == NX_PORT_CHUNK);
	ASSERT(powerof2(dom_port_max));
	ASSERT(dom_port_max % NX_PORT_CHUNK == 0);

	old_num_ports = nx->nx_num_ports;
	num_ports = nx->nx_num_ports + grow;
	limit = (nexus_port_size_t)P2ROUNDUP(dom_port_max, NX_PORT_CHUNK);
	if (num_ports > limit) {
		SK_ERR("can't grow, total %u grow %u (new %u > dom_max %u)",
		    nx->nx_num_ports, grow, num_ports, limit);
		return EDOM;
	}

	/* grow the bitmap first; one word per chunk of ports */
	if ((bmap = sk_realloc_data(nx->nx_ports_bmap,
	    (old_num_ports / NX_PORT_CHUNK) * sizeof(*bmap),
	    (num_ports / NX_PORT_CHUNK) * sizeof(*bmap),
	    Z_WAITOK, skmem_tag_nx_port)) == NULL) {
		SK_ERR("bmap alloc failed, num_port %u", num_ports);
		return ENOMEM;
	}
	nx->nx_ports_bmap = bmap;
	nx->nx_ports_bmap_size = (num_ports / NX_PORT_CHUNK) * sizeof(*bmap);

	/*
	 * NOTE(review): if this second realloc fails, the bitmap has
	 * already grown while nx_num_ports is left unchanged; the
	 * comment below suggests keeping the bmap is deliberate.
	 */
	if ((ports = sk_realloc_type_array(struct nx_port_info, old_num_ports,
	    num_ports, nx->nx_ports, Z_WAITOK, skmem_tag_nx_port)) == NULL) {
		/* can't free bmap here, otherwise nexus won't work */
		SK_ERR("nx_ports alloc failed, num_port %u", num_ports);
		return ENOMEM;
	}

	/* initialize the additional new ports */
	bzero(&ports[nx->nx_num_ports], (grow * sizeof(*ports)));

	/* initialize new bitmaps (set all bits) */
	for (i = (nx->nx_num_ports / NX_PORT_CHUNK);
	    i < (num_ports / NX_PORT_CHUNK); i++) {
		bmap[i] = NX_PORT_CHUNK_FREE;
	}

	/*
	 * -fbounds-safety: Not sure if moving nx_ports assignment down here
	 * would cause a regression.
	 */
	nx->nx_ports = ports;
	nx->nx_num_ports = num_ports;

	SK_DF(SK_VERB_NXPORT, "!!! nx %p ports %u/%u, %u ports added",
	    SK_KVA(nx), nx->nx_active_ports, nx->nx_num_ports, grow);

	return 0;
}
2646 
2647 int
nx_port_alloc(struct kern_nexus * nx,nexus_port_t nx_port,struct nxbind * nxb,struct nexus_adapter ** na,struct proc * p)2648 nx_port_alloc(struct kern_nexus *nx, nexus_port_t nx_port, struct nxbind *nxb,
2649     struct nexus_adapter **na, struct proc *p)
2650 {
2651 	struct nx_port_info *npi = NULL;
2652 	struct nxbind *nxb0;
2653 	size_t g;
2654 	uint32_t i, j;
2655 	bitmap_t *bmap;
2656 	bool refonly = false;
2657 	int err = 0;
2658 
2659 	ASSERT(nx_port != NEXUS_PORT_ANY);
2660 	ASSERT((nx->nx_num_ports % NX_PORT_CHUNK) == 0);
2661 
2662 	/* port is zero-based, so adjust here */
2663 	if ((nx_port + 1) > nx->nx_num_ports) {
2664 		g = P2ROUNDUP((nx_port + 1) - nx->nx_num_ports, NX_PORT_CHUNK);
2665 		VERIFY(g <= NEXUS_PORT_MAX);
2666 		if ((err = nx_port_grow(nx, (nexus_port_size_t)g)) != 0) {
2667 			goto done;
2668 		}
2669 	}
2670 	ASSERT(err == 0);
2671 	ASSERT(nx_port < nx->nx_num_ports);
2672 	npi = &nx->nx_ports[nx_port];
2673 	nxb0 = npi->npi_nxb;
2674 	i = nx_port / NX_PORT_CHUNK;
2675 	j = nx_port % NX_PORT_CHUNK;
2676 	bmap = &nx->nx_ports_bmap[i];
2677 
2678 	if (bit_test(*bmap, j)) {
2679 		/* port is not (yet) bound or allocated */
2680 		ASSERT(npi->npi_nah == 0 && npi->npi_nxb == NULL);
2681 		if (p != kernproc && !NX_ANONYMOUS_PROV(nx)) {
2682 			/*
2683 			 * If the port allocation is requested by userland
2684 			 * and the nexus is non-anonymous, then fail the
2685 			 * request.
2686 			 */
2687 			err = EACCES;
2688 			SK_ERR("user proc alloc on named nexus needs binding");
2689 		} else if (na != NULL && *na != NULL) {
2690 			/*
2691 			 * Otherwise claim it (clear bit) if the caller
2692 			 * supplied an adapter for this port; else, it
2693 			 * is just an existential check and so there's
2694 			 * no action needed at this point (we'll skip
2695 			 * the init below since vpna is NULL).
2696 			 */
2697 			bit_clear(*bmap, j);
2698 		}
2699 	} else {
2700 		/* if port is bound, check if credentials match */
2701 		if (nxb0 != NULL && p != kernproc && !NX_ANONYMOUS_PROV(nx) &&
2702 		    (nxb == NULL || !nxb_is_equal(nxb0, nxb))) {
2703 			SK_ERR("nexus binding mismatch");
2704 			err = EACCES;
2705 		} else {
2706 			/*
2707 			 * If port is already occupied by an adapter,
2708 			 * see if the client is requesting a reference
2709 			 * to it; if so, return the adapter.  Otherwise,
2710 			 * if unoccupied and vpna is non-NULL, associate
2711 			 * it with this nexus port via the below init.
2712 			 */
2713 			if (NPI_NA(npi) != NULL) {
2714 				if (na != NULL && *na == NULL) {
2715 					*na = NPI_NA(npi);
2716 					na_retain_locked(*na);
2717 					/* skip the init below */
2718 					refonly = true;
2719 				} else {
2720 					/*
2721 					 * If the client supplied an adapter
2722 					 * (regardless of its value) for a
2723 					 * nexus port that's already occupied,
2724 					 * then we fail the request.
2725 					 */
2726 					SK_ERR("nexus adapted exits");
2727 					err = EEXIST;
2728 				}
2729 			}
2730 		}
2731 	}
2732 
2733 done:
2734 	/* initialize the nexus port and the adapter occupying it */
2735 	if (err == 0 && na != NULL && *na != NULL && !refonly) {
2736 		ASSERT(nx_port < nx->nx_num_ports);
2737 		ASSERT(npi->npi_nah == 0);
2738 		ASSERT(nx->nx_active_ports < nx->nx_num_ports);
2739 		ASSERT(!bit_test(nx->nx_ports_bmap[nx_port / NX_PORT_CHUNK],
2740 		    (nx_port % NX_PORT_CHUNK)));
2741 
2742 		nx->nx_active_ports++;
2743 		npi->npi_nah = NPI_NA_ENCODE(*na, NEXUS_PORT_STATE_WORKING);
2744 		(*na)->na_nx_port = nx_port;
2745 	}
2746 
2747 	SK_DF(SK_VERB_NXPORT, "nx %p nx_port %d, ports %u/%u (err %d)",
2748 	    SK_KVA(nx), (int)nx_port, nx->nx_active_ports, nx->nx_num_ports,
2749 	    err);
2750 
2751 	return err;
2752 }
2753 
2754 void
nx_port_defunct(struct kern_nexus * nx,nexus_port_t nx_port)2755 nx_port_defunct(struct kern_nexus *nx, nexus_port_t nx_port)
2756 {
2757 	struct nx_port_info *npi = &nx->nx_ports[nx_port];
2758 
2759 	npi->npi_nah = NPI_NA_ENCODE(npi->npi_nah,
2760 	    NEXUS_PORT_STATE_DEFUNCT);
2761 }
2762 
/*
 * Release a previously-allocated nexus port: clear its adapter
 * encoding and, if no binding remains, mark it free in the bitmap
 * again (a bound port stays claimed until nx_port_unbind()).
 */
void
nx_port_free(struct kern_nexus *nx, nexus_port_t nx_port)
{
	struct nx_port_info *npi = NULL;
	bitmap_t *bmap;
	uint32_t i, j;

	ASSERT((nx->nx_num_ports % NX_PORT_CHUNK) == 0);
	ASSERT(nx_port != NEXUS_PORT_ANY && nx_port < nx->nx_num_ports);
	ASSERT(nx->nx_active_ports != 0);

	i = nx_port / NX_PORT_CHUNK;
	j = nx_port % NX_PORT_CHUNK;
	bmap = &nx->nx_ports_bmap[i];
	/* an allocated port must have its free bit cleared */
	ASSERT(!bit_test(*bmap, j));

	npi = &nx->nx_ports[nx_port];
	npi->npi_nah = 0;
	if (npi->npi_nxb == NULL) {
		/* it's vacant, release it (set bit) */
		bit_set(*bmap, j);
	}

	nx->nx_active_ports--;

	//XXX [email protected] --- try to shrink bitmap & nx_ports ???

	SK_DF(SK_VERB_NXPORT, "--- nx %p nx_port %d, ports %u/%u",
	    SK_KVA(nx), (int)nx_port, nx->nx_active_ports, nx->nx_num_ports);
}
2793 
2794 int
nx_port_bind_info(struct kern_nexus * nx,nexus_port_t nx_port,struct nxbind * nxb0,void * info)2795 nx_port_bind_info(struct kern_nexus *nx, nexus_port_t nx_port,
2796     struct nxbind *nxb0, void *info)
2797 {
2798 	struct nx_port_info *npi = NULL;
2799 	size_t g;
2800 	uint32_t i, j;
2801 	bitmap_t *bmap;
2802 	int err = 0;
2803 
2804 	ASSERT(nx_port != NEXUS_PORT_ANY);
2805 	ASSERT(nx_port < NXDOM_MAX(NX_DOM(nx), ports));
2806 	ASSERT((nx->nx_num_ports % NX_PORT_CHUNK) == 0);
2807 	ASSERT(nxb0 != NULL);
2808 
2809 	if ((nx_port) + 1 > nx->nx_num_ports) {
2810 		g = P2ROUNDUP((nx_port + 1) - nx->nx_num_ports, NX_PORT_CHUNK);
2811 		VERIFY(g <= NEXUS_PORT_MAX);
2812 		if ((err = nx_port_grow(nx, (nexus_port_size_t)g)) != 0) {
2813 			goto done;
2814 		}
2815 	}
2816 	ASSERT(err == 0);
2817 
2818 	npi = &nx->nx_ports[nx_port];
2819 	i = nx_port / NX_PORT_CHUNK;
2820 	j = nx_port % NX_PORT_CHUNK;
2821 	bmap = &nx->nx_ports_bmap[i];
2822 	if (bit_test(*bmap, j)) {
2823 		/* port is not (yet) bound or allocated */
2824 		ASSERT(npi->npi_nah == 0 && npi->npi_nxb == NULL);
2825 
2826 		bit_clear(*bmap, j);
2827 		struct nxbind *nxb = nxb_alloc(Z_WAITOK);
2828 		nxb_move(nxb0, nxb);
2829 		npi->npi_nxb = nxb;
2830 		npi->npi_info = info;
2831 		/* claim it (clear bit) */
2832 		bit_clear(*bmap, j);
2833 		ASSERT(err == 0);
2834 	} else {
2835 		/* port is already taken */
2836 		ASSERT(NPI_NA(npi) != NULL || npi->npi_nxb != NULL);
2837 		err = EEXIST;
2838 	}
2839 done:
2840 
2841 	SK_DF(err ? SK_VERB_ERROR : SK_VERB_NXPORT,
2842 	    "+++ nx %p nx_port %d, ports %u/%u (err %d)", SK_KVA(nx),
2843 	    (int)nx_port, nx->nx_active_ports, nx->nx_num_ports, err);
2844 
2845 	return err;
2846 }
2847 
/*
 * Bind a nexus port to the credentials in *nxb0, with no info record.
 */
int
nx_port_bind(struct kern_nexus *nx, nexus_port_t nx_port, struct nxbind *nxb0)
{
	return nx_port_bind_info(nx, nx_port, nxb0, NULL);
}
2853 
2854 /*
2855  * -fbounds-safety: all callers pass npi_info. Why don't we just change the
2856  * input type to nx_port_info_header *?
2857  */
2858 static int
nx_port_info_size(struct nx_port_info_header * info,size_t * sz)2859 nx_port_info_size(struct nx_port_info_header *info, size_t *sz)
2860 {
2861 	struct nx_port_info_header *hdr = info;
2862 
2863 	switch (hdr->ih_type) {
2864 	case NX_PORT_INFO_TYPE_NETIF:
2865 		break;
2866 	default:
2867 		return EINVAL;
2868 	}
2869 	*sz = hdr->ih_size;
2870 	return 0;
2871 }
2872 
/*
 * Remove the credential binding from a nexus port, freeing the nxbind
 * and any attached info record.  The port's free bit is set again only
 * if no adapter currently occupies it.  Returns EDOM for an
 * out-of-range port, ENOENT if the port has no binding.
 */
int
nx_port_unbind(struct kern_nexus *nx, nexus_port_t nx_port)
{
	struct nx_port_info *npi = NULL;
	struct nxbind *nxb;
	uint32_t i, j;
	bitmap_t *bmap;
	int err = 0;

	ASSERT(nx_port != NEXUS_PORT_ANY);

	if (nx_port >= nx->nx_num_ports) {
		err = EDOM;
		goto done;
	}

	npi = &nx->nx_ports[nx_port];
	i = nx_port / NX_PORT_CHUNK;
	j = nx_port % NX_PORT_CHUNK;
	bmap = &nx->nx_ports_bmap[i];

	if ((nxb = npi->npi_nxb) == NULL) {
		/* must be either free or allocated */
		ASSERT(NPI_NA(npi) == NULL ||
		    (!bit_test(*bmap, j) && nx->nx_active_ports > 0));
		err = ENOENT;
	} else {
		nxb_free(nxb);
		npi->npi_nxb = NULL;
		/* tear down the attached info record, if any */
		if (npi->npi_info != NULL) {
			size_t sz;

			VERIFY(nx_port_info_size(npi->npi_info, &sz) == 0);
			sk_free_data(npi->npi_info, sz);
			npi->npi_info = NULL;
		}
		/* a bound port always has its free bit cleared */
		ASSERT(!bit_test(*bmap, j));
		if (NPI_NA(npi) == NULL) {
			/* it's vacant, release it (set bit) */
			bit_set(*bmap, j);
		}
	}

done:
	SK_DF(err ? SK_VERB_ERROR : SK_VERB_NXPORT,
	    "--- nx %p nx_port %d, ports %u/%u (err %d)", SK_KVA(nx),
	    (int)nx_port, nx->nx_active_ports, nx->nx_num_ports, err);

	return err;
}
2923 
2924 struct nexus_adapter *
nx_port_get_na(struct kern_nexus * nx,nexus_port_t nx_port)2925 nx_port_get_na(struct kern_nexus *nx, nexus_port_t nx_port)
2926 {
2927 	if (nx->nx_ports != NULL && nx->nx_num_ports > nx_port) {
2928 		return NPI_NA(&nx->nx_ports[nx_port]);
2929 	} else {
2930 		return NULL;
2931 	}
2932 }
2933 
/*
 * Copy the info record attached to a nexus port (via
 * nx_port_bind_info()) into the caller's buffer, after checking that
 * the stored record's type matches.  Returns ENXIO for an invalid
 * port, ENOENT if no info is attached, EINVAL on type mismatch.
 */
int
nx_port_get_info(struct kern_nexus *nx, nexus_port_t port,
    nx_port_info_type_t type, void *__sized_by(len)info, uint32_t len)
{
	struct nx_port_info *npi;
	struct nx_port_info_header *hdr;

	if (nx->nx_ports == NULL || port >= nx->nx_num_ports) {
		return ENXIO;
	}
	npi = &nx->nx_ports[port];
	/*
	 * -fbounds-safety: Changing npi_info to be __sized_by is a major
	 * surgery. Just forge it here for now.
	 */
	hdr = __unsafe_forge_bidi_indexable(struct nx_port_info_header *,
	    npi->npi_info, len);
	if (hdr == NULL) {
		return ENOENT;
	}

	if (hdr->ih_type != type) {
		return EINVAL;
	}

	/*
	 * NOTE(review): len is caller-supplied and is not checked
	 * against the stored record's ih_size -- confirm callers
	 * never pass a larger len than what was attached.
	 */
	bcopy(hdr, info, len);
	return 0;
}
2962 
2963 bool
nx_port_is_valid(struct kern_nexus * nx,nexus_port_t nx_port)2964 nx_port_is_valid(struct kern_nexus *nx, nexus_port_t nx_port)
2965 {
2966 	return nx_port < nx->nx_num_ports;
2967 }
2968 
2969 bool
nx_port_is_defunct(struct kern_nexus * nx,nexus_port_t nx_port)2970 nx_port_is_defunct(struct kern_nexus *nx, nexus_port_t nx_port)
2971 {
2972 	ASSERT(nx_port_is_valid(nx, nx_port));
2973 
2974 	return NPI_IS_DEFUNCT(&nx->nx_ports[nx_port]);
2975 }
2976 
/*
 * Release every claimed port's binding and info record, then free the
 * port table and the allocation bitmap themselves.  Called from
 * nx_free() during final nexus teardown.
 */
void
nx_port_free_all(struct kern_nexus *nx)
{
	/* uncrustify doesn't handle C blocks properly */
	/* BEGIN IGNORE CODESTYLE */
	nx_port_foreach(nx, ^(nexus_port_t p) {
		struct nxbind *nxb;
		/*
		 * XXX -fbounds-safety: Come back to this after fixing npi_info
		 */
		void *__single info;
		nxb = nx->nx_ports[p].npi_nxb;
		info = nx->nx_ports[p].npi_info;
		if (nxb != NULL) {
			nxb_free(nxb);
			nx->nx_ports[p].npi_nxb = NULL;
		}
		if (info != NULL) {
			size_t sz;

			/* recover the recorded size before freeing */
			VERIFY(nx_port_info_size(info, &sz) == 0);
			skn_free_data(info, info, sz);
			nx->nx_ports[p].npi_info = NULL;
		}
	});
	/* END IGNORE CODESTYLE */

	nx->nx_active_ports = 0;
	sk_free_data_sized_by(nx->nx_ports_bmap, nx->nx_ports_bmap_size);
	nx->nx_ports_bmap = NULL;
	nx->nx_ports_bmap_size = 0;
	sk_free_type_array_counted_by(struct nx_port_info, nx->nx_num_ports, nx->nx_ports);
	nx->nx_ports = NULL;
	nx->nx_num_ports = 0;
}
3012 
3013 void
3014 nx_port_foreach(struct kern_nexus *nx,
3015     void (^port_handle)(nexus_port_t nx_port))
3016 {
3017 	for (nexus_port_size_t i = 0; i < (nx->nx_num_ports / NX_PORT_CHUNK); i++) {
3018 		bitmap_t bmap = nx->nx_ports_bmap[i];
3019 
3020 		if (bmap == NX_PORT_CHUNK_FREE) {
3021 			continue;
3022 		}
3023 
3024 		for (nexus_port_size_t j = 0; j < NX_PORT_CHUNK; j++) {
3025 			if (bit_test(bmap, j)) {
3026 				continue;
3027 			}
3028 			port_handle((i * NX_PORT_CHUNK) + j);
3029 		}
3030 	}
3031 }
3032 
/*
 * sysctl interfaces
 */
/* handlers are implemented further below in this file */
static int nexus_provider_list_sysctl SYSCTL_HANDLER_ARGS;
static int nexus_channel_list_sysctl SYSCTL_HANDLER_ARGS;
static int nexus_mib_get_sysctl SYSCTL_HANDLER_ARGS;

/* kern.skywalk.*: nexus provider/channel/llink enumeration */
SYSCTL_PROC(_kern_skywalk, OID_AUTO, nexus_provider_list,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, nexus_provider_list_sysctl, "S,nexus_provider_info_t", "");

SYSCTL_PROC(_kern_skywalk, OID_AUTO, nexus_channel_list,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, nexus_channel_list_sysctl, "S,nexus_channel_entry_t", "");

SYSCTL_PROC(_kern_skywalk, OID_AUTO, llink_list,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, NXMIB_LLINK_LIST, nexus_mib_get_sysctl, "S,nx_llink_info",
    "A list of logical links");

/*
 * kern.skywalk.stats.*: MIB-style stat dumps, all funneled through
 * nexus_mib_get_sysctl with the NXMIB_* selector passed as arg2.
 */
SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, flow,
    CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY | CTLFLAG_KERN,
    0, NXMIB_FLOW, nexus_mib_get_sysctl, "S,sk_stats_flow",
    "Nexus inet flows with stats collected in kernel");

SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, flow_owner,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, NXMIB_FLOW_OWNER, nexus_mib_get_sysctl, "S,sk_stats_flow_owner",
    "Nexus flow owners");

SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, flow_route,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, NXMIB_FLOW_ROUTE, nexus_mib_get_sysctl, "S,sk_stats_flow_route",
    "Nexus flow routes");

SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, net_if,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, NXMIB_NETIF_STATS, nexus_mib_get_sysctl, "S,sk_stats_net_if",
    "Nexus netif statistics collected in kernel");

SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, flow_switch,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, NXMIB_FSW_STATS, nexus_mib_get_sysctl, "S,sk_stats_flow_switch",
    "Nexus flowswitch statistics collected in kernel");

SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, userstack,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, NXMIB_USERSTACK_STATS, nexus_mib_get_sysctl, "S,sk_stats_userstack",
    "Nexus userstack statistics counter");

SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, flow_adv,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, NXMIB_FLOW_ADV, nexus_mib_get_sysctl, "S,sk_stats_flow_adv",
    "Nexus flow advisory dump");

SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, netif_queue,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, NXMIB_NETIF_QUEUE_STATS, nexus_mib_get_sysctl, "S,netif_qstats_info",
    "A list of netif queue stats entries");
3092 
3093 /*
3094  * Provider list sysctl
3095  */
/*
 * Fill one nexus_provider_info record: the provider's UUID, a copy of
 * its parameters, and the UUID of every nexus instance attached to it.
 * Caller must hold SK_LOCK (keeps nxprov_nx_head and nxprov_nx_count
 * stable) and must have sized 'info' for nxprov_nx_count instance UUIDs
 * (see NEXUS_PROVIDER_INFO_SIZE in the caller).
 */
static void
nexus_provider_info_populate(struct kern_nexus_provider *nxprov,
    nexus_provider_info_t info)
{
	struct kern_nexus *nx;
	uuid_t *uuids;

	SK_LOCK_ASSERT_HELD();

	/* provider UUID + params */
	uuid_copy(info->npi_prov_uuid, nxprov->nxprov_uuid);
	bcopy(nxprov->nxprov_params, &info->npi_prov_params,
	    sizeof(struct nxprov_params));
	info->npi_instance_uuids_count = nxprov->nxprov_nx_count;

	/*
	 * instance UUID list
	 * -fbounds-safety: forge a bidi-indexable cursor over the trailing
	 * UUID array, bounded by the count recorded just above.
	 */
	uuids = __unsafe_forge_bidi_indexable(uuid_t *,
	    info->npi_instance_uuids, sizeof(uuid_t) * info->npi_instance_uuids_count);
	STAILQ_FOREACH(nx, &nxprov->nxprov_nx_head, nx_prov_link) {
		uuid_copy(*uuids, nx->nx_uuid);
		uuids++;
	}
}
3119 
3120 static int
3121 nexus_provider_list_sysctl SYSCTL_HANDLER_ARGS
3122 {
3123 #pragma unused(arg1, arg2, oidp)
3124 	size_t actual_space;
3125 	caddr_t buffer = NULL;
3126 	size_t buffer_space;
3127 	size_t allocated_space;
3128 	int out_error;
3129 	int error = 0;
3130 	struct kern_nexus_provider *nxprov;
3131 	caddr_t scan;
3132 
3133 	if (!kauth_cred_issuser(kauth_cred_get())) {
3134 		return EPERM;
3135 	}
3136 
3137 	net_update_uptime();
3138 	buffer_space = req->oldlen;
3139 	if (req->oldptr != USER_ADDR_NULL && buffer_space != 0) {
3140 		if (buffer_space > SK_SYSCTL_ALLOC_MAX) {
3141 			buffer_space = SK_SYSCTL_ALLOC_MAX;
3142 		}
3143 		allocated_space = buffer_space;
3144 		buffer = sk_alloc_data(allocated_space, Z_WAITOK, skmem_tag_sysctl_buf);
3145 		if (__improbable(buffer == NULL)) {
3146 			return ENOBUFS;
3147 		}
3148 	} else if (req->oldptr == USER_ADDR_NULL) {
3149 		buffer_space = 0;
3150 	}
3151 	actual_space = 0;
3152 	scan = buffer;
3153 	SK_LOCK();
3154 	STAILQ_FOREACH(nxprov, &nxprov_head, nxprov_link) {
3155 		size_t                  info_size;
3156 
3157 		info_size
3158 		        = NEXUS_PROVIDER_INFO_SIZE(nxprov->nxprov_nx_count);
3159 		if (scan != NULL) {
3160 			if (buffer_space < info_size) {
3161 				/* supplied buffer too small, stop copying */
3162 				error = ENOMEM;
3163 				break;
3164 			}
3165 			nexus_provider_info_populate(nxprov, (void *)scan);
3166 			scan += info_size;
3167 			buffer_space -= info_size;
3168 		}
3169 		actual_space += info_size;
3170 	}
3171 	SK_UNLOCK();
3172 
3173 	out_error = SYSCTL_OUT(req, buffer, actual_space);
3174 	if (out_error != 0) {
3175 		error = out_error;
3176 	}
3177 
3178 	if (buffer != NULL) {
3179 		sk_free_data(buffer, allocated_space);
3180 	}
3181 
3182 	return error;
3183 }
3184 
3185 /*
3186  * Channel list sysctl
3187  */
3188 static uint32_t
channel_ring_count(struct kern_channel * ch,enum txrx which)3189 channel_ring_count(struct kern_channel *ch, enum txrx which)
3190 {
3191 	return ch->ch_last[which] - ch->ch_first[which];
3192 }
3193 
3194 /*
3195  * -fbounds-safety: kring's range is [first..last]. Marking it
3196  * __counted_by(last) means range is [0..first..last]. The [0..first) might be
3197  * problematic. However, the for loop in this function starts indexing from
3198  * 'first', not 0, so that should be okay.
3199  * XXX Until BATS starts using uncrustify-7 (rdar://90709826), having a space
3200  * between __counted_by(entry_count) entries will be considered invalid code
3201  * style and build will fail. Until rdar://117811249 is resolved, either stick
3202  * to what makes BATS happy, or wrap IGNORE CODESTYLE around.
3203  */
/*
 * Copy per-ring statistics for rings [first..last) of 'kring' into
 * entries[0..last-first).  Caller guarantees entry_count covers that
 * span.  When the kr_stat_enable sysctl is off, per-ring stats are
 * reported as zeros; error stats are reported either way.
 */
static void
populate_ring_entries(struct __kern_channel_ring *__counted_by(last)kring,
    ring_id_t first, ring_id_t last,
    nexus_channel_ring_entry *__counted_by(entry_count)entries,
    uint32_t NX_FB_ARG entry_count)
{
	/* snapshot uptime once so all entries age against the same instant */
	uint64_t now = net_uptime();
	ring_id_t i;
	nexus_channel_ring_entry_t scan;
	struct __kern_channel_ring *ring;

	scan = entries;
	for (i = first; i < last; i++, scan++) {
		ring = &kring[i];

		DTRACE_SKYWALK1(populate__ring, struct __kern_channel_ring *,
		    ring);
		if (kr_stat_enable == 0) {
			/* stats collection disabled: report zeroed stats */
			bzero(&scan->ncre_stats, sizeof(scan->ncre_stats));
			bzero(&scan->ncre_user_stats,
			    sizeof(scan->ncre_user_stats));
		} else {
			scan->ncre_stats = ring->ckr_stats;
			/* convert the absolute last-update time into an age */
			scan->ncre_stats.crs_seconds_since_last_update = now -
			    scan->ncre_stats.crs_last_update_net_uptime;
			scan->ncre_user_stats = ring->ckr_usr_stats;
		}
		/* error counters are exported regardless of kr_stat_enable */
		scan->ncre_error_stats = ring->ckr_err_stats;
		scan->ncre_ring_id = i;
	}
}
3235 
3236 /* combine/convert ch_mode/ch_flags into nexus_channel_entry flags */
3237 static uint32_t
nexus_channel_get_flags(uint32_t ch_mode,uint32_t ch_flags)3238 nexus_channel_get_flags(uint32_t ch_mode, uint32_t ch_flags)
3239 {
3240 	uint32_t flags = 0;
3241 
3242 	flags |= (ch_mode & CHMODE_USER_PACKET_POOL) ? SCHF_USER_PACKET_POOL : 0;
3243 	flags |= (ch_mode & CHMODE_DEFUNCT_OK) ? SCHF_DEFUNCT_OK : 0;
3244 	flags |= (ch_mode & CHMODE_FILTER) ? SCHF_FILTER : 0;
3245 	flags |= (ch_mode & CHMODE_EVENT_RING) ? SCHF_EVENT_RING : 0;
3246 	flags |= (ch_mode & CHMODE_EXCLUSIVE) ? SCHF_EXCLUSIVE : 0;
3247 	flags |= (ch_flags & CHANF_IF_ADV) ? SCHF_IF_ADV : 0;
3248 	flags |= (ch_flags & CHANF_DEFUNCT_SKIP) ? SCHF_DEFUNCT_SKIP : 0;
3249 	flags |= (ch_flags & CHANF_CLOSING) ? SCHF_CLOSING : 0;
3250 	flags |= (ch_flags & CHANF_DEFUNCT) ? SCHF_DEFUNCT : 0;
3251 	flags |= (ch_mode & CHMODE_LOW_LATENCY) ? SCHF_LOW_LATENCY : 0;
3252 
3253 	return flags;
3254 }
3255 
/*
 * Fill one nexus_channel_entry: identity (UUID, port, pid, fd),
 * translated flags, ring counts, and per-ring stats.  TX ring entries
 * occupy the front of nce_ring_entries, RX entries follow immediately
 * after.  Caller must have set entry->nce_ring_count and sized the
 * entry via NEXUS_CHANNEL_ENTRY_SIZE for tx+rx rings.
 */
SK_NO_INLINE_ATTRIBUTE
static void
nexus_channel_entry_populate(struct kern_channel *ch,
    nexus_channel_entry_t entry)
{
	uint32_t ch_mode = ch->ch_info->cinfo_ch_mode;
	uint32_t ch_flags = ch->ch_flags;
	ring_id_t rx_first = ch->ch_first[NR_RX];
	ring_id_t rx_last = ch->ch_last[NR_RX];
	ring_id_t tx_last = ch->ch_last[NR_TX];
	ring_id_t tx_first = ch->ch_first[NR_TX];

	uuid_copy(entry->nce_uuid, ch->ch_info->cinfo_ch_id);
	entry->nce_flags = nexus_channel_get_flags(ch_mode, ch_flags);
	entry->nce_port = ch->ch_info->cinfo_nx_port;
	entry->nce_pid = ch->ch_pid;
	entry->nce_fd = ch->ch_fd;
	entry->nce_tx_rings = tx_last - tx_first;
	entry->nce_rx_rings = rx_last - rx_first;
	/* TX entries go at the start of the trailing ring-entry array */
	populate_ring_entries(ch->ch_na->na_tx_rings, tx_first, tx_last,
	    entry->nce_ring_entries, entry->nce_tx_rings);

	/*
	 * -fbounds-safety: If entry->nce_tx_rings > 0 and
	 * entry->nce_rx_rings == 0 (i.e. entry->nce_ring_count ==
	 * entry->nce_tx_rings), simply passing
	 * entry->nce_ring_entries + entry->nce_tx_rings to populate_ring_entries
	 * will fail bounds check, because it is equivalent to assigning
	 * nce_ring_entries + nce_tx_rings to a __single variable, and in this
	 * case it goes out of bounds. It's same thing as having:
	 *     int a[1];
	 *     some_func(a + 1);  <-- bounds check will fail
	 */
	if (rx_first < rx_last) {
		populate_ring_entries(ch->ch_na->na_rx_rings, rx_first, rx_last,
		    entry->nce_ring_entries + entry->nce_tx_rings,
		    entry->nce_rx_rings);
	}
}
3295 
/*
 * Emit (or size) the channel-info record for one nexus: a fixed header
 * followed by one variable-length entry per open channel.  With
 * info == NULL this is a pure sizing pass.  Returns the number of bytes
 * required; if that exceeds buffer_size the output is incomplete and
 * the caller treats it as overflow.  Caller must hold SK_LOCK (keeps
 * nx_ch_head and nx_ch_count stable while we walk).
 */
SK_NO_INLINE_ATTRIBUTE
static size_t
nexus_channel_info_populate(struct kern_nexus *nx,
    nexus_channel_info *__sized_by(buffer_size) info, size_t buffer_size)
{
	struct kern_channel *ch = NULL;
	size_t info_size;
	caddr_t scan = NULL;
	nexus_channel_entry *entry;

	SK_LOCK_ASSERT_HELD();

	/* fixed header; per-channel entries are appended below */
	info_size = sizeof(nexus_channel_info);

	/* channel list */
	if (info != NULL) {
		if (buffer_size < info_size) {
			/* not even room for the header; report the need */
			return info_size;
		}

		/* instance UUID */
		uuid_copy(info->nci_instance_uuid, nx->nx_uuid);
		info->nci_channel_entries_count = nx->nx_ch_count;
		scan = (caddr_t __bidi_indexable)info->nci_channel_entries;
	}
	STAILQ_FOREACH(ch, &nx->nx_ch_head, ch_link) {
		size_t          entry_size;
		uint32_t        ring_count;

		/* entry size scales with the channel's total ring count */
		ring_count = channel_ring_count(ch, NR_TX) +
		    channel_ring_count(ch, NR_RX);
		entry_size = NEXUS_CHANNEL_ENTRY_SIZE(ring_count);
		info_size += entry_size;
		if (scan != NULL) {
			if (buffer_size < info_size) {
				/* ran out of room; return partial requirement */
				return info_size;
			}
			entry = (nexus_channel_entry *)(void *)scan;
			/* ring count must be set before populate (bounds) */
			entry->nce_ring_count = ring_count;

			nexus_channel_entry_populate(ch, entry);
			scan += entry_size;
		}
	}
	return info_size;
}
3342 
3343 static int
3344 nexus_channel_list_sysctl SYSCTL_HANDLER_ARGS
3345 {
3346 #pragma unused(arg1, arg2, oidp)
3347 	size_t actual_space;
3348 	caddr_t buffer = NULL;
3349 	size_t buffer_space;
3350 	size_t allocated_space;
3351 	int out_error;
3352 	struct kern_nexus *nx;
3353 	int error = 0;
3354 	caddr_t scan;
3355 
3356 	if (!kauth_cred_issuser(kauth_cred_get())) {
3357 		return EPERM;
3358 	}
3359 
3360 	net_update_uptime();
3361 	buffer_space = req->oldlen;
3362 	if (req->oldptr != USER_ADDR_NULL && buffer_space != 0) {
3363 		if (buffer_space > SK_SYSCTL_ALLOC_MAX) {
3364 			buffer_space = SK_SYSCTL_ALLOC_MAX;
3365 		}
3366 		allocated_space = buffer_space;
3367 		buffer = sk_alloc_data(allocated_space, Z_WAITOK, skmem_tag_sysctl_buf);
3368 		if (__improbable(buffer == NULL)) {
3369 			return ENOBUFS;
3370 		}
3371 	} else if (req->oldptr == USER_ADDR_NULL) {
3372 		buffer_space = 0;
3373 	}
3374 	actual_space = 0;
3375 	scan = buffer;
3376 	SK_LOCK();
3377 	RB_FOREACH(nx, kern_nexus_tree, &nx_head) {
3378 		size_t info_size;
3379 
3380 		info_size = nexus_channel_info_populate(nx, (void *)scan,
3381 		    buffer_space);
3382 		if (scan != NULL) {
3383 			if (buffer_space < info_size) {
3384 				/* supplied buffer too small, stop copying */
3385 				error = ENOMEM;
3386 				break;
3387 			}
3388 			scan += info_size;
3389 			buffer_space -= info_size;
3390 		}
3391 		actual_space += info_size;
3392 	}
3393 	SK_UNLOCK();
3394 
3395 	if (actual_space != 0) {
3396 		out_error = SYSCTL_OUT(req, buffer, actual_space);
3397 		if (out_error != 0) {
3398 			error = out_error;
3399 		}
3400 	}
3401 	if (buffer != NULL) {
3402 		sk_free_data(buffer, allocated_space);
3403 	}
3404 
3405 	return error;
3406 }
3407 
/*
 * Shared handler for all NXMIB_* statistics nodes.  oid_arg2 selects
 * the subcommand; an optional nexus_mib_filter may be passed via
 * newptr to narrow the query.  Dispatches to each nexus domain
 * provider's nxdom_prov_nx_mib_get callback.  A request with
 * oldptr == NULL is a sizing pass.
 *
 * Access policy: userstack stats are root-only; flow stats require
 * either root or a 5-tuple filter (NXMIB_FILTER_INFO_TUPLE) so that
 * non-root callers can only look up their own flows.
 */
static int
nexus_mib_get_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)
	struct proc *p = req->p;
	struct nexus_mib_filter filter;
	int error = 0;
	size_t actual_space;
	size_t allocated_space = 0;
	caddr_t __sized_by(allocated_space) buffer = NULL;
	size_t buffer_space;
	int out_error;
	struct kern_nexus *nx;
	caddr_t scan;

	/* Restrict protocol stats access to root user only (like netstat). */
	if (oidp->oid_arg2 == NXMIB_USERSTACK_STATS &&
	    !kauth_cred_issuser(kauth_cred_get())) {
		SK_ERR("mib request rejected, EPERM");
		return EPERM;
	}

	if (req->newptr == USER_ADDR_NULL) {
		/*
		 * For flow stats requests, non-root users need to provide a
		 * 5-tuple. Otherwise, we do not grant access.
		 */
		if (oidp->oid_arg2 == NXMIB_FLOW &&
		    !kauth_cred_issuser(kauth_cred_get())) {
			SK_ERR("mib request rejected: tuple not provided");
			return EPERM;
		}
		/* use subcommand for multiple nodes */
		filter.nmf_type = oidp->oid_arg2;
		filter.nmf_bitmap = 0x0;
	} else if (req->newlen != sizeof(struct nexus_mib_filter)) {
		/* a supplied filter must be exactly one nexus_mib_filter */
		SK_ERR("mis-matching newlen");
		return EINVAL;
	} else {
		error = SYSCTL_IN(req, &filter, sizeof(struct nexus_mib_filter));
		if (error != 0) {
			SK_ERR("SYSCTL_IN err %d", error);
			return error;
		}
		/* the filter's type must agree with the node it came in on */
		if (filter.nmf_type != oidp->oid_arg2) {
			SK_ERR("mis-matching nmf_type");
			return EINVAL;
		}
		/*
		 * For flow stats requests, non-root users need to set the nexus
		 * mib filter to NXMIB_FILTER_INFO_TUPLE. Otherwise, we do not
		 * grant access. This ensures that fsw_mib_get_flow looks for a
		 * flow entry that matches the given tuple of the non-root user.
		 */
		if (filter.nmf_type == NXMIB_FLOW &&
		    (filter.nmf_bitmap & NXMIB_FILTER_INFO_TUPLE) == 0 &&
		    !kauth_cred_issuser(kauth_cred_get())) {
			SK_ERR("mib request rejected: tuple filter not set");
			return EPERM;
		}
	}

	net_update_uptime();
	buffer_space = req->oldlen;
	if (req->oldptr != USER_ADDR_NULL && buffer_space != 0) {
		/* cap the kernel-side staging buffer */
		if (buffer_space > SK_SYSCTL_ALLOC_MAX) {
			buffer_space = SK_SYSCTL_ALLOC_MAX;
		}
		buffer = sk_alloc_data(buffer_space, Z_WAITOK, skmem_tag_sysctl_buf);
		allocated_space = buffer_space;
		if (__improbable(buffer == NULL)) {
			return ENOBUFS;
		}
	} else if (req->oldptr == USER_ADDR_NULL) {
		/* sizing pass: only accumulate actual_space below */
		buffer_space = 0;
	}
	actual_space = 0;
	scan = buffer;

	SK_LOCK();
	RB_FOREACH(nx, kern_nexus_tree, &nx_head) {
		/* some domain providers don't export MIB data at all */
		if (NX_DOM_PROV(nx)->nxdom_prov_nx_mib_get == NULL) {
			continue;
		}

		size_t size = 0;
		struct kern_nexus_domain_provider *nx_dp = NX_DOM_PROV(nx);

		/*
		 * -fbounds-safety: Because scan takes the bounds of buffer
		 * (which is __sized_by(allocated_space)), at some point scan
		 * will reach its bounds (because of scan += size). When it
		 * does, it won't pass the bounds check when scan is passed to
		 * nxdom_prov_nx_mib_get function. We need to avoid passing scan
		 * to nxdom_prov_nx_mib_get when it reaches its upper bound,
		 * i.e. when buffer_space reaches 0 (see buffer_space -= size).
		 */
		if (req->oldptr == USER_ADDR_NULL || buffer_space) {
			size = nx_dp->nxdom_prov_nx_mib_get(nx, &filter, scan,
			    buffer_space, p);
		}

		if (scan != NULL) {
			if (buffer_space < size) {
				/* supplied buffer too small, stop copying */
				error = ENOMEM;
				break;
			}
			scan += size;
			buffer_space -= size;
		}
		actual_space += size;
	}
	SK_UNLOCK();

	if (actual_space != 0) {
		out_error = SYSCTL_OUT(req, buffer, actual_space);
		if (out_error != 0) {
			error = out_error;
		}
	}
	if (buffer != NULL) {
		sk_free_data_sized_by(buffer, allocated_space);
	}

	return error;
}
3535 
3536 void
kern_nexus_walktree(kern_nexus_walktree_f_t * f,void * arg0,boolean_t is_sk_locked)3537 kern_nexus_walktree(kern_nexus_walktree_f_t *f, void *arg0,
3538     boolean_t is_sk_locked)
3539 {
3540 	struct kern_nexus *nx = NULL;
3541 
3542 	if (!is_sk_locked) {
3543 		SK_LOCK();
3544 	} else {
3545 		SK_LOCK_ASSERT_HELD();
3546 	}
3547 
3548 	RB_FOREACH(nx, kern_nexus_tree, &nx_head) {
3549 		(*f)(nx, arg0);
3550 	}
3551 
3552 	if (!is_sk_locked) {
3553 		SK_UNLOCK();
3554 	}
3555 }
3556 
3557 errno_t
kern_nexus_get_pbufpool_info(const uuid_t nx_uuid,struct kern_pbufpool_memory_info * rx_pool_info,struct kern_pbufpool_memory_info * tx_pool_info)3558 kern_nexus_get_pbufpool_info(const uuid_t nx_uuid,
3559     struct kern_pbufpool_memory_info *rx_pool_info,
3560     struct kern_pbufpool_memory_info *tx_pool_info)
3561 {
3562 	struct kern_pbufpool *__single tpp, *__single rpp;
3563 	struct kern_nexus *nx;
3564 	errno_t err = 0;
3565 
3566 	nx = nx_find(nx_uuid, FALSE);
3567 	if (nx == NULL) {
3568 		err = ENOENT;
3569 		goto done;
3570 	}
3571 
3572 	if (nx->nx_prov->nxprov_params->nxp_type != NEXUS_TYPE_NET_IF) {
3573 		err = ENOTSUP;
3574 		goto done;
3575 	}
3576 
3577 	err = nx_netif_prov_nx_mem_info(nx, &tpp, &rpp);
3578 	if (err != 0) {
3579 		goto done;
3580 	}
3581 
3582 	if ((tpp == NULL) && (rpp == NULL)) {
3583 		err = ENOENT;
3584 		goto done;
3585 	}
3586 
3587 	if (tx_pool_info != NULL) {
3588 		bzero(tx_pool_info, sizeof(*tx_pool_info));
3589 	}
3590 	if (rx_pool_info != NULL) {
3591 		bzero(rx_pool_info, sizeof(*rx_pool_info));
3592 	}
3593 
3594 	if ((tx_pool_info != NULL) && (tpp != NULL)) {
3595 		err = kern_pbufpool_get_memory_info(tpp, tx_pool_info);
3596 		if (err != 0) {
3597 			goto done;
3598 		}
3599 	}
3600 
3601 	if ((rx_pool_info != NULL) && (rpp != NULL)) {
3602 		err = kern_pbufpool_get_memory_info(rpp, rx_pool_info);
3603 	}
3604 
3605 done:
3606 	if (nx != NULL) {
3607 		(void) nx_release(nx);
3608 		nx = NULL;
3609 	}
3610 	return err;
3611 }
3612 
/*
 * Post an interface-advisory update event to every channel registered
 * on this nexus's advisory list.  Best-effort: if the advisory list is
 * being modified (writer holds nx_ch_if_adv_lock exclusively), the
 * update is counted as dropped rather than blocking the caller.
 * Only netif and flowswitch nexus types may call this.
 */
void
nx_interface_advisory_notify(struct kern_nexus *nx)
{
	struct kern_channel *ch;
	struct netif_stats *nifs;
	struct fsw_stats *fsw_stats;
	nexus_type_t nxdom_type = NX_DOM(nx)->nxdom_type;

	/* exactly one of nifs/fsw_stats is set, matching nxdom_type */
	if (nxdom_type == NEXUS_TYPE_NET_IF) {
		nifs = &NX_NETIF_PRIVATE(nx)->nif_stats;
	} else if (nxdom_type == NEXUS_TYPE_FLOW_SWITCH) {
		fsw_stats = &NX_FSW_PRIVATE(nx)->fsw_stats;
	} else {
		VERIFY(0);
		__builtin_unreachable();
	}
	/* try-lock: never block on a contended advisory list */
	if (!lck_rw_try_lock_shared(&nx->nx_ch_if_adv_lock)) {
		if (nxdom_type == NEXUS_TYPE_NET_IF) {
			STATS_INC(nifs, NETIF_STATS_IF_ADV_UPD_DROP);
		} else {
			STATS_INC(fsw_stats, FSW_STATS_IF_ADV_UPD_DROP);
		}
		return;
	}
	/*
	 * if the channel is in "nx_ch_if_adv_head" list, then we can
	 * safely assume that the channel is not closed yet.
	 * In ch_close_common(), the channel is removed from the
	 * "nx_ch_if_adv_head" list holding the "nx_ch_if_adv_lock" in
	 * exclusive mode, prior to closing the channel.
	 */
	STAILQ_FOREACH(ch, &nx->nx_ch_if_adv_head, ch_link_if_adv) {
		struct nexus_adapter *na = ch->ch_na;

		ASSERT(na != NULL);
		/* wake the channel's first TX ring with the advisory hint */
		na_post_event(&na->na_tx_rings[ch->ch_first[NR_TX]],
		    TRUE, FALSE, FALSE, CHAN_FILT_HINT_IF_ADV_UPD);
		if (nxdom_type == NEXUS_TYPE_NET_IF) {
			STATS_INC(nifs, NETIF_STATS_IF_ADV_UPD_SENT);
		} else {
			STATS_INC(fsw_stats, FSW_STATS_IF_ADV_UPD_SENT);
		}
	}
	lck_rw_done(&nx->nx_ch_if_adv_lock);
}
3658