xref: /xnu-11215.41.3/bsd/skywalk/nexus/nexus.c (revision 33de042d024d46de5ff4e89f2471de6608e37fa4)
1 /*
2  * Copyright (c) 2015-2022 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <skywalk/os_skywalk_private.h>
30 #include <skywalk/nexus/netif/nx_netif.h>
31 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
32 #include <sys/sdt.h>
33 
/*
 * When non-zero, relaxes the controller-ownership check performed by
 * nxctl_nexus_bind() and nxctl_nexus_config(); tunable via sysctl on
 * DEVELOPMENT/DEBUG kernels only.
 */
static uint32_t disable_nxctl_check = 0;
#if (DEVELOPMENT || DEBUG)
SYSCTL_UINT(_kern_skywalk, OID_AUTO, disable_nxctl_check,
    CTLFLAG_RW | CTLFLAG_LOCKED, &disable_nxctl_check, 0, "");
#endif

/* lock groups/attributes shared by nexus-related locks */
LCK_GRP_DECLARE(nexus_lock_group, "sk_nx_lock");
LCK_GRP_DECLARE(nexus_mbq_lock_group, "sk_nx_mbq_lock");
LCK_GRP_DECLARE(nexus_pktq_lock_group, "sk_nx_pktq_lock");
LCK_ATTR_DECLARE(nexus_lock_attr, 0, 0);

/*
 * Global lists of nexus controllers and nexus providers; the accessors
 * in this file traverse/mutate them while holding SK_LOCK.
 */
static STAILQ_HEAD(, nxctl) nxctl_head =
    STAILQ_HEAD_INITIALIZER(nxctl_head);
static STAILQ_HEAD(, kern_nexus_provider) nxprov_head =
    STAILQ_HEAD_INITIALIZER(nxprov_head);

/* red-black tree of all nexus instances, ordered by nx_cmp() */
static int nx_cmp(const struct kern_nexus *, const struct kern_nexus *);
RB_HEAD(kern_nexus_tree, kern_nexus);
RB_PROTOTYPE_SC(static, kern_nexus_tree, kern_nexus, nx_link, nx_cmp);
RB_GENERATE(kern_nexus_tree, kern_nexus, nx_link, nx_cmp);
static struct kern_nexus_tree   nx_head;

/* sockopt handlers behind nxctl_set_opt()/nxctl_get_opt() */
static int nxctl_get_nexus_prov_list(struct nxctl *, struct sockopt *);
static int nxctl_get_nexus_prov_entry(struct nxctl *, struct sockopt *);
static int nxctl_get_nexus_list(struct nxctl *, struct sockopt *);
static int nxctl_nexus_bind(struct nxctl *, struct sockopt *);
static int nxctl_nexus_unbind(struct nxctl *, struct sockopt *);
static int nxctl_nexus_config(struct nxctl *, struct sockopt *);
static int nxctl_get_channel_list(struct nxctl *, struct sockopt *);

/* nxctl lifecycle helpers (reference counting, allocation) */
static void nxctl_retain_locked(struct nxctl *);
static int nxctl_release_locked(struct nxctl *);
static void nxctl_init(struct nxctl *, struct proc *, struct fileproc *);
static struct nxctl *nxctl_alloc(struct proc *, struct fileproc *, zalloc_flags_t);
static void nxctl_free(struct nxctl *);

/* nexus provider lifecycle helpers */
static struct kern_nexus_provider *nxprov_create_common(struct nxctl *,
    struct kern_nexus_domain_provider *, struct nxprov_reg *,
    const struct kern_nexus_provider_init *init, int *);
static void nxprov_detach(struct kern_nexus_provider *, boolean_t);
static void nxprov_retain_locked(struct kern_nexus_provider *);
static int nxprov_release_locked(struct kern_nexus_provider *);
static struct kern_nexus_provider *nxprov_alloc(
	struct kern_nexus_domain_provider *, zalloc_flags_t);
static void nxprov_free(struct kern_nexus_provider *);

/* nexus instance ring/slot setup and teardown */
static int nx_init_rings(struct kern_nexus *, struct kern_channel *);
static void nx_fini_rings(struct kern_nexus *, struct kern_channel *);
static int nx_init_slots(struct kern_nexus *, struct __kern_channel_ring *);
static void nx_fini_slots(struct kern_nexus *, struct __kern_channel_ring *);
static struct kern_nexus *nx_alloc(zalloc_flags_t);
static void nx_free(struct kern_nexus *);

/* zones backing the objects managed by this file */
static SKMEM_TYPE_DEFINE(nxctl_zone, struct nxctl);

static SKMEM_TYPE_DEFINE(nxbind_zone, struct nxbind);

static SKMEM_TYPE_DEFINE(nxprov_zone, struct kern_nexus_provider);

static SKMEM_TYPE_DEFINE(nxprov_params_zone, struct nxprov_params);

static SKMEM_TYPE_DEFINE(nx_zone, struct kern_nexus);

/* set once nexus_init() has completed; cleared by nexus_fini() */
static int __nx_inited = 0;

/*
 * Allocation tags.  NOTE(review): skmem_tag_nx_key, skmem_tag_nx_port and
 * skmem_tag_nx_port_info are non-static (unlike skmem_tag_nx_mib) —
 * presumably referenced from other Skywalk files; confirm before changing.
 */
#define SKMEM_TAG_NX_KEY        "com.apple.skywalk.nexus.key"
SKMEM_TAG_DEFINE(skmem_tag_nx_key, SKMEM_TAG_NX_KEY);

#define SKMEM_TAG_NX_MIB        "com.apple.skywalk.nexus.mib"
static SKMEM_TAG_DEFINE(skmem_tag_nx_mib, SKMEM_TAG_NX_MIB);

#define SKMEM_TAG_NX_PORT        "com.apple.skywalk.nexus.port"
SKMEM_TAG_DEFINE(skmem_tag_nx_port, SKMEM_TAG_NX_PORT);

#define SKMEM_TAG_NX_PORT_INFO        "com.apple.skywalk.nexus.port.info"
SKMEM_TAG_DEFINE(skmem_tag_nx_port_info, SKMEM_TAG_NX_PORT_INFO);

/*
 * Special nexus controller handle for Skywalk internal use.  Unlike all
 * other nexus controller handles that are created by userland or kernel
 * clients, this one never gets closed or freed.  It is also not part of
 * the global nxctl_head list.
 */
static struct nxctl _kernnxctl;
static struct nxctl _usernxctl;
struct nexus_controller kernnxctl = { .ncd_nxctl = &_kernnxctl };
struct nexus_controller usernxctl = { .ncd_nxctl = &_usernxctl };

/*
 * -fbounds-safety: For static functions where additional size variables are
 * added, we need to mark them __unused if this file is being built without
 * -fbounds-safety.
 */
#if !__has_ptrcheck
#define NX_FB_ARG __unused
#else
#define NX_FB_ARG
#endif
131 
/*
 * One-time initialization of the nexus layer.  Called with the Skywalk
 * lock held, before any nexus provider or instance can be created.
 * Returns 0 (cannot fail).
 */
int
nexus_init(void)
{
	SK_LOCK_ASSERT_HELD();
	ASSERT(!__nx_inited);

	RB_INIT(&nx_head);

	na_init();

	/* attach system built-in domains and domain providers */
	nxdom_attach_all();

	/*
	 * Initialize the private kernel and shared user nexus controller
	 * handles.
	 *
	 * The shared kernel controller is used internally for creating nexus
	 * providers and nexus instances from within the Skywalk code
	 * (e.g. netif_compat).
	 *
	 * The shared user controller is used from userspace by clients
	 * (e.g. libnetcore) that would like to call nexus instances for use
	 * cases like configuring a flow entry that they own indirectly
	 * (e.g. via NECP), so that the nexus would perform permission checks
	 * based on other info (e.g. PID, UUID) and bypass the nxctl check
	 * (this nxctl has no credentials).
	 */
	nxctl_init(&_kernnxctl, kernproc, NULL);
	nxctl_retain_locked(&_kernnxctl);       /* one for us */
	nxctl_init(&_usernxctl, kernproc, NULL);
	nxctl_retain_locked(&_usernxctl);       /* one for us */
	nxctl_traffic_rule_init();

	__nx_inited = 1;

	return 0;
}
168 
169 void
nexus_fini(void)170 nexus_fini(void)
171 {
172 	SK_LOCK_ASSERT_HELD();
173 
174 	if (__nx_inited) {
175 		nxctl_traffic_rule_fini();
176 		nxctl_release_locked(&_kernnxctl);
177 		nxctl_release_locked(&_usernxctl);
178 
179 		/* tell all domains they're going away */
180 		nxdom_detach_all();
181 
182 		ASSERT(RB_EMPTY(&nx_head));
183 
184 		na_fini();
185 
186 		__nx_inited = 0;
187 	}
188 }
189 
/*
 * Create a nexus controller handle on behalf of process p, tag it with
 * nxctl_uuid, and publish it on the global controller list.  Returns the
 * new handle with a reference held for the caller, or NULL on error.
 *
 * NOTE(review): *err is only ever read here (after SK_UNLOCK) and is never
 * written by this function; the caller is presumably expected to have
 * initialized it before calling — confirm with call sites, as an
 * uninitialized *err would make the cleanup branch below unpredictable.
 */
struct nxctl *
nxctl_create(struct proc *p, struct fileproc *fp, const uuid_t nxctl_uuid,
    int *err)
{
	struct nxctl *nxctl = NULL;

	ASSERT(!uuid_is_null(nxctl_uuid));

	/* privilege checks would be done when performing nxctl operations */

	SK_LOCK();

	/* Z_WAITOK allocation */
	nxctl = nxctl_alloc(p, fp, Z_WAITOK);

	/* publish on the global list of controllers */
	STAILQ_INSERT_TAIL(&nxctl_head, nxctl, nxctl_link);
	nxctl->nxctl_flags |= NEXUSCTLF_ATTACHED;
	uuid_copy(nxctl->nxctl_uuid, nxctl_uuid);

	nxctl_retain_locked(nxctl);     /* one for being in the list */
	nxctl_retain_locked(nxctl);     /* one for the caller */

#if SK_LOG
	uuid_string_t uuidstr;
	SK_D("nxctl 0x%llx UUID %s", SK_KVA(nxctl),
	    sk_uuid_unparse(nxctl->nxctl_uuid, uuidstr));
#endif /* SK_LOG */

	SK_UNLOCK();

	if (*err != 0) {
		nxctl_free(nxctl);
		nxctl = NULL;
	}
	return nxctl;
}
225 
/*
 * Close a nexus controller handle: detach it from the global list and
 * close every nexus provider that was created through it.  May be called
 * as part of a failure-cleanup path, hence the flag checks below.
 */
void
nxctl_close(struct nxctl *nxctl)
{
	struct kern_nexus_provider *nxprov = NULL, *tnxprov;

	lck_mtx_lock(&nxctl->nxctl_lock);
	SK_LOCK();

	/* the shared kernel controller is never closed */
	ASSERT(!(nxctl->nxctl_flags & NEXUSCTLF_KERNEL));

#if SK_LOG
	uuid_string_t uuidstr;
	SK_D("nxctl 0x%llx UUID %s flags 0x%b", SK_KVA(nxctl),
	    sk_uuid_unparse(nxctl->nxctl_uuid, uuidstr),
	    nxctl->nxctl_flags, NEXUSCTLF_BITS);
#endif /* SK_LOG */

	/* sever the file descriptor linkage, if still present */
	if (!(nxctl->nxctl_flags & NEXUSCTLF_NOFDREF)) {
		nxctl->nxctl_flags |= NEXUSCTLF_NOFDREF;
		nxctl->nxctl_fp = NULL;
	}

	/* may be called as part of failure cleanup, so check */
	if (nxctl->nxctl_flags & NEXUSCTLF_ATTACHED) {
		/* caller must hold an extra ref */
		ASSERT(nxctl->nxctl_refcnt > 1);
		(void) nxctl_release_locked(nxctl);

		STAILQ_REMOVE(&nxctl_head, nxctl, nxctl, nxctl_link);
		nxctl->nxctl_flags &= ~NEXUSCTLF_ATTACHED;
	}

repeat:
	STAILQ_FOREACH_SAFE(nxprov, &nxprov_head, nxprov_link, tnxprov) {
		/*
		 * Close provider only for those which are owned by
		 * this control instance.  Note that if we close the
		 * provider, we need to repeat this search as the
		 * list might have been changed by another thread.
		 * That's possible since SK_UNLOCK() may be called
		 * as a result of calling nxprov_close().
		 */
		if (!(nxprov->nxprov_flags & NXPROVF_CLOSED) &&
		    nxprov->nxprov_ctl == nxctl) {
			/* hold a ref across the close; it may drop SK_LOCK */
			nxprov_retain_locked(nxprov);
			(void) nxprov_close(nxprov, TRUE);
			(void) nxprov_release_locked(nxprov);
			goto repeat;
		}
	}

	SK_UNLOCK();
	lck_mtx_unlock(&nxctl->nxctl_lock);
	/* finally, clean up this nxctl's traffic rules (unlocked) */
	nxctl_traffic_rule_clean(nxctl);
}
281 
282 int
nxctl_set_opt(struct nxctl * nxctl,struct sockopt * sopt)283 nxctl_set_opt(struct nxctl *nxctl, struct sockopt *sopt)
284 {
285 #pragma unused(nxctl)
286 	int err = 0;
287 
288 	NXCTL_LOCK_ASSERT_HELD(nxctl);
289 
290 	if (sopt->sopt_dir != SOPT_SET) {
291 		sopt->sopt_dir = SOPT_SET;
292 	}
293 
294 	switch (sopt->sopt_name) {
295 	case NXOPT_NEXUS_BIND:
296 		err = nxctl_nexus_bind(nxctl, sopt);
297 		break;
298 
299 	case NXOPT_NEXUS_UNBIND:
300 		err = nxctl_nexus_unbind(nxctl, sopt);
301 		break;
302 
303 	case NXOPT_NEXUS_CONFIG:
304 		err = nxctl_nexus_config(nxctl, sopt);
305 		break;
306 
307 	default:
308 		err = ENOPROTOOPT;
309 		break;
310 	}
311 
312 	return err;
313 }
314 
315 int
nxctl_get_opt(struct nxctl * nxctl,struct sockopt * sopt)316 nxctl_get_opt(struct nxctl *nxctl, struct sockopt *sopt)
317 {
318 #pragma unused(nxctl)
319 	int err = 0;
320 
321 	NXCTL_LOCK_ASSERT_HELD(nxctl);
322 
323 	if (sopt->sopt_dir != SOPT_GET) {
324 		sopt->sopt_dir = SOPT_GET;
325 	}
326 
327 	switch (sopt->sopt_name) {
328 	case NXOPT_NEXUS_PROV_LIST:
329 		err = nxctl_get_nexus_prov_list(nxctl, sopt);
330 		break;
331 
332 	case NXOPT_NEXUS_PROV_ENTRY:
333 		err = nxctl_get_nexus_prov_entry(nxctl, sopt);
334 		break;
335 
336 	case NXOPT_NEXUS_LIST:
337 		err = nxctl_get_nexus_list(nxctl, sopt);
338 		break;
339 
340 	case NXOPT_CHANNEL_LIST:
341 		err = nxctl_get_channel_list(nxctl, sopt);
342 		break;
343 
344 	default:
345 		err = ENOPROTOOPT;
346 		break;
347 	}
348 
349 	return err;
350 }
351 
352 /* Upper bound on # of nrl_num_regs that we'd return to user space */
353 #define MAX_NUM_REG_ENTRIES     256
354 
355 /* Hoisted out of line to reduce kernel stack footprint */
356 SK_NO_INLINE_ATTRIBUTE
357 static int
nxctl_get_nexus_prov_list(struct nxctl * nxctl,struct sockopt * sopt)358 nxctl_get_nexus_prov_list(struct nxctl *nxctl, struct sockopt *sopt)
359 {
360 	user_addr_t tmp_ptr = USER_ADDR_NULL;
361 	struct nxprov_reg_ent *pnre, *nres = NULL;
362 	struct nxprov_list_req nrlr;
363 	struct kern_nexus_provider *nxprov = NULL;
364 	uint32_t nregs = 0, ncregs = 0;
365 	int err = 0, observeall;
366 	size_t nres_sz;
367 
368 	NXCTL_LOCK_ASSERT_HELD(nxctl);
369 
370 	ASSERT(sopt->sopt_p != NULL);
371 	if (sopt->sopt_val == USER_ADDR_NULL) {
372 		return EINVAL;
373 	}
374 
375 	err = sooptcopyin(sopt, &nrlr, sizeof(nrlr), sizeof(nrlr));
376 	if (err != 0) {
377 		return err;
378 	}
379 
380 	if ((size_t)nrlr.nrl_num_regs > MAX_NUM_REG_ENTRIES) {
381 		nrlr.nrl_num_regs = MAX_NUM_REG_ENTRIES;
382 	}
383 
384 	/*
385 	 * If the caller specified a buffer, copy out the Nexus provider
386 	 * entries to caller gracefully.  We only copy out the number of
387 	 * entries which caller has asked for, but we always tell caller
388 	 * how big the buffer really needs to be.
389 	 */
390 	tmp_ptr = nrlr.nrl_regs;
391 	if (tmp_ptr != USER_ADDR_NULL && nrlr.nrl_num_regs > 0) {
392 		nres_sz = (size_t)nrlr.nrl_num_regs * sizeof(*nres);
393 		nres = sk_alloc_data(nres_sz, Z_WAITOK, skmem_tag_sysctl_buf);
394 		if (__improbable(nres == NULL)) {
395 			return ENOBUFS;
396 		}
397 	}
398 
399 	observeall = (skywalk_priv_check_cred(sopt->sopt_p, nxctl->nxctl_cred,
400 	    PRIV_SKYWALK_OBSERVE_ALL) == 0);
401 
402 	SK_LOCK();
403 	/*
404 	 * Count number of providers.  If buffer space exists and
405 	 * remains, copy out provider entries.
406 	 */
407 	nregs = nrlr.nrl_num_regs;
408 	pnre = nres;
409 
410 	STAILQ_FOREACH(nxprov, &nxprov_head, nxprov_link) {
411 		/*
412 		 * Return only entries that are visible to the caller,
413 		 * unless it has PRIV_SKYWALK_OBSERVE_ALL.
414 		 */
415 		if (nxprov->nxprov_ctl != nxctl && !observeall) {
416 			continue;
417 		}
418 
419 		if (nres != NULL && nregs > 0) {
420 			uuid_copy(pnre->npre_prov_uuid, nxprov->nxprov_uuid);
421 			bcopy(nxprov->nxprov_params, &pnre->npre_prov_params,
422 			    sizeof(struct nxprov_params));
423 			--nregs;
424 			++pnre;
425 			++ncregs;
426 		}
427 	}
428 	SK_UNLOCK();
429 
430 	if (ncregs == 0) {
431 		err = ENOENT;
432 	}
433 
434 	if (nres != NULL) {
435 		if (err == 0 && tmp_ptr != USER_ADDR_NULL) {
436 			if (sopt->sopt_p != kernproc) {
437 				err = copyout(nres, tmp_ptr,
438 				    ncregs * sizeof(*nres));
439 			} else {
440 				caddr_t tmp;
441 				tmp =  __unsafe_forge_bidi_indexable(caddr_t,
442 				    CAST_DOWN(caddr_t, tmp_ptr),
443 				    ncregs * sizeof(*nres));
444 				bcopy(nres, tmp, ncregs * sizeof(*nres));
445 			}
446 		}
447 		sk_free_data(nres, nres_sz);
448 		nres = NULL;
449 	}
450 
451 	if (err == 0) {
452 		nrlr.nrl_num_regs = ncregs;
453 		err = sooptcopyout(sopt, &nrlr, sizeof(nrlr));
454 	}
455 
456 	return err;
457 }
458 
459 /* Hoisted out of line to reduce kernel stack footprint */
460 SK_NO_INLINE_ATTRIBUTE
461 static int
nxctl_get_nexus_prov_entry(struct nxctl * nxctl,struct sockopt * sopt)462 nxctl_get_nexus_prov_entry(struct nxctl *nxctl, struct sockopt *sopt)
463 {
464 	struct nxprov_reg_ent nre;
465 	struct kern_nexus_provider *nxprov = NULL;
466 	int err = 0;
467 
468 	NXCTL_LOCK_ASSERT_HELD(nxctl);
469 
470 	ASSERT(sopt->sopt_p != NULL);
471 	if (sopt->sopt_val == USER_ADDR_NULL) {
472 		return EINVAL;
473 	}
474 
475 	bzero(&nre, sizeof(nre));
476 	err = sooptcopyin(sopt, &nre, sizeof(nre), sizeof(nre));
477 	if (err != 0) {
478 		return err;
479 	}
480 
481 	if (uuid_is_null(nre.npre_prov_uuid)) {
482 		return EINVAL;
483 	}
484 
485 	SK_LOCK();
486 	STAILQ_FOREACH(nxprov, &nxprov_head, nxprov_link) {
487 		if (uuid_compare(nxprov->nxprov_uuid,
488 		    nre.npre_prov_uuid) == 0) {
489 			/*
490 			 * Return only entries that are visible to the caller,
491 			 * unless it has PRIV_SKYWALK_OBSERVE_ALL.
492 			 */
493 			if (nxprov->nxprov_ctl != nxctl) {
494 				if (skywalk_priv_check_cred(sopt->sopt_p,
495 				    nxctl->nxctl_cred,
496 				    PRIV_SKYWALK_OBSERVE_ALL) != 0) {
497 					nxprov = NULL;
498 					break;
499 				}
500 			}
501 
502 			bcopy(nxprov->nxprov_params, &nre.npre_prov_params,
503 			    sizeof(struct nxprov_params));
504 			break;
505 		}
506 	}
507 	SK_UNLOCK();
508 
509 	if (nxprov != NULL) {
510 		err = sooptcopyout(sopt, &nre, sizeof(nre));
511 	} else {
512 		err = ENOENT;
513 	}
514 
515 	return err;
516 }
517 
/* Upper bound on # of nl_num_nx_uuids that we'd return to user space */
#define MAX_NUM_NX_UUIDS        4096

/*
 * NXOPT_NEXUS_LIST handler: given a provider UUID, copy out the UUIDs of
 * that provider's nexus instances.  nl_num_nx_uuids is updated to the
 * total instance count so a caller can size its buffer; at most the
 * requested number of UUIDs is copied out.  Returns 0, ENOENT if the
 * provider is not found (or not visible), or another errno-style error.
 */
/* Hoisted out of line to reduce kernel stack footprint */
SK_NO_INLINE_ATTRIBUTE
static int
nxctl_get_nexus_list(struct nxctl *nxctl, struct sockopt *sopt)
{
	user_addr_t tmp_ptr = USER_ADDR_NULL;
	uint32_t nuuids = 0, ncuuids = 0;
	uuid_t *puuid, *uuids = NULL;
	size_t uuids_sz;
	struct nx_list_req nlr;
	struct kern_nexus_provider *nxprov = NULL;
	struct kern_nexus *nx = NULL;
	int err = 0, observeall;

	NXCTL_LOCK_ASSERT_HELD(nxctl);

	ASSERT(sopt->sopt_p != NULL);
	if (sopt->sopt_val == USER_ADDR_NULL) {
		return EINVAL;
	}

	err = sooptcopyin(sopt, &nlr, sizeof(nlr), sizeof(nlr));
	if (err != 0) {
		return err;
	}

	/* a provider UUID is mandatory; clamp the requested count */
	if (uuid_is_null(nlr.nl_prov_uuid)) {
		return EINVAL;
	} else if ((size_t)nlr.nl_num_nx_uuids > MAX_NUM_NX_UUIDS) {
		nlr.nl_num_nx_uuids = MAX_NUM_NX_UUIDS;
	}

	/*
	 * If the caller specified a buffer, copy out the Nexus UUIDs to
	 * caller gracefully.  We only copy out the number of UUIDs which
	 * caller has asked for, but we always tell caller how big the
	 * buffer really needs to be.
	 */
	tmp_ptr = nlr.nl_nx_uuids;
	if (tmp_ptr != USER_ADDR_NULL && nlr.nl_num_nx_uuids > 0) {
		uuids_sz = (size_t)nlr.nl_num_nx_uuids * sizeof(uuid_t);
		uuids = sk_alloc_data(uuids_sz, Z_WAITOK, skmem_tag_sysctl_buf);
		if (__improbable(uuids == NULL)) {
			return ENOBUFS;
		}
	}

	observeall = (skywalk_priv_check_cred(sopt->sopt_p, nxctl->nxctl_cred,
	    PRIV_SKYWALK_OBSERVE_ALL) == 0);

	SK_LOCK();
	STAILQ_FOREACH(nxprov, &nxprov_head, nxprov_link) {
		/*
		 * Return only entries that are visible to the caller,
		 * unless it has PRIV_SKYWALK_OBSERVE_ALL.
		 */
		if (nxprov->nxprov_ctl != nxctl && !observeall) {
			continue;
		}

		if (uuid_compare(nxprov->nxprov_uuid, nlr.nl_prov_uuid) == 0) {
			break;
		}
	}

	if (nxprov != NULL) {
		/*
		 * Count number of Nexus.  If buffer space exists
		 * and remains, copy out the Nexus UUIDs.
		 */
		nuuids = nlr.nl_num_nx_uuids;
		puuid = uuids;

		STAILQ_FOREACH(nx, &nxprov->nxprov_nx_head, nx_prov_link) {
			++ncuuids;
			if (uuids != NULL && nuuids > 0) {
				uuid_copy(*puuid, nx->nx_uuid);
				--nuuids;
				++puuid;
			}
		}
	} else {
		err = ENOENT;
	}
	SK_UNLOCK();

	if (uuids != NULL) {
		if (err == 0 && nxprov != NULL && tmp_ptr != USER_ADDR_NULL) {
			uintptr_t cnt_uuid;

			/* Note: Pointer arithmetic */
			cnt_uuid = (uintptr_t)(puuid - uuids);
			if (cnt_uuid > 0) {
				if (sopt->sopt_p != kernproc) {
					err = copyout(uuids, tmp_ptr,
					    cnt_uuid * sizeof(uuid_t));
				} else {
					caddr_t tmp;
					tmp = __unsafe_forge_bidi_indexable(caddr_t,
					    CAST_DOWN(caddr_t, tmp_ptr),
					    cnt_uuid * sizeof(uuid_t));
					bcopy(uuids, tmp,
					    cnt_uuid * sizeof(uuid_t));
				}
			}
		}
		sk_free_data(uuids, uuids_sz);
		uuids = NULL;
	}

	if (err == 0) {
		/* report the total instance count back to the caller */
		nlr.nl_num_nx_uuids = ncuuids;
		err = sooptcopyout(sopt, &nlr, sizeof(nlr));
	}

	return err;
}
638 
/*
 * NXOPT_NEXUS_BIND handler: associate client-matching credentials (process
 * uniqueid/PID, executable UUID, and/or a key blob) with a port on a nexus
 * instance, claiming a port if NEXUS_PORT_ANY was requested (kernel
 * clients only).  On success the chosen port is copied back to the caller.
 */
/* Hoisted out of line to reduce kernel stack footprint */
SK_NO_INLINE_ATTRIBUTE
static int
nxctl_nexus_bind(struct nxctl *nxctl, struct sockopt *sopt)
{
	boolean_t m_pid, m_exec_uuid, m_key;
	struct nx_bind_req nbr;
	struct proc *p = PROC_NULL;
	struct nxbind *nxb = NULL;
	uint64_t p_uniqueid = -1;
	pid_t p_pid = -1;
	struct kern_nexus *nx = NULL;
#if SK_LOG
	uuid_string_t exec_uuidstr;
#endif /* SK_LOG */
	uuid_t p_uuid;
	void *key = NULL;
	int err = 0;

	NXCTL_LOCK_ASSERT_HELD(nxctl);

	if (sopt->sopt_val == USER_ADDR_NULL) {
		return EINVAL;
	}

	uuid_clear(p_uuid);
	bzero(&nbr, sizeof(nbr));
	err = sooptcopyin(sopt, &nbr, sizeof(nbr), sizeof(nbr));
	if (err != 0) {
		return err;
	}

	if (uuid_is_null(nbr.nb_nx_uuid)) {
		err = EINVAL;
		goto done_unlocked;
	}

	nbr.nb_flags &= NBR_MATCH_MASK;
	if (nbr.nb_flags == 0) {
		/* must choose one of the match criteria */
		err = EINVAL;
		goto done_unlocked;
	}
	m_pid = !!(nbr.nb_flags & NBR_MATCH_PID);
	m_exec_uuid = !!(nbr.nb_flags & NBR_MATCH_EXEC_UUID);
	m_key = !!(nbr.nb_flags & NBR_MATCH_KEY);

	if (m_pid || m_exec_uuid) {
		/*
		 * Validate process ID.  A valid PID is needed when we're
		 * asked to match by PID, or if asked to match by executable
		 * UUID with a NULL nb_exec_uuid supplied.  The latter is
		 * to support the case when a userland Nexus provider isn't
		 * able to acquire its client's executable UUID, but is
		 * able to identify it via PID.
		 */
		if ((m_pid || uuid_is_null(nbr.nb_exec_uuid)) &&
		    (p = proc_find(nbr.nb_pid)) == PROC_NULL) {
			err = ESRCH;
			goto done_unlocked;
		}
		/* exclude kernel from the match criteria */
		if (p == kernproc) {
			err = EACCES;
			goto done_unlocked;
		} else if (p != PROC_NULL) {
			/* derive identity from the found process */
			proc_getexecutableuuid(p, p_uuid, sizeof(p_uuid));
			p_uniqueid = proc_uniqueid(p);
			p_pid = proc_pid(p);
		} else {
			/* no process lookup; trust the supplied exec UUID */
			uuid_copy(p_uuid, nbr.nb_exec_uuid);
		}
	}

	if (m_key) {
		/* key matching requires a bounded, non-NULL key blob */
		if (nbr.nb_key_len == 0 || nbr.nb_key_len > NEXUS_MAX_KEY_LEN ||
		    nbr.nb_key == USER_ADDR_NULL) {
			err = EINVAL;
			goto done_unlocked;
		}

		key = sk_alloc_data(nbr.nb_key_len, Z_WAITOK, skmem_tag_nx_key);
		if (__improbable(key == NULL)) {
			err = ENOMEM;
			goto done_unlocked;
		}

		if (sopt->sopt_p != kernproc) {
			err = copyin(nbr.nb_key, key, nbr.nb_key_len);
			if (err != 0) {
				goto done_unlocked;
			}
		} else {
			/*
			 * -fbounds-safety: nbr.nb_key is user_addr_t. Changing
			 * it to a pointer type is risky, so we just forge it
			 * here instead.
			 */
			void *nb_key = __unsafe_forge_bidi_indexable(void *,
			    nbr.nb_key, nbr.nb_key_len);
			bcopy(nb_key, key, nbr.nb_key_len);
		}
	}

	SK_LOCK();
	/* nx_find() returns the nexus with a reference held */
	nx = nx_find(nbr.nb_nx_uuid, TRUE);
	if (nx == NULL || (disable_nxctl_check == 0 &&
	    nx->nx_prov->nxprov_ctl != nxctl &&
	    nxctl != &_kernnxctl)) {    /* make exception for kernnxctl */
		err = ENOENT;
		goto done;
	}

	/* bind isn't applicable on anonymous nexus provider */
	if (NX_ANONYMOUS_PROV(nx)) {
		err = ENXIO;
		goto done;
	}

	/* port must be within the domain's range */
	if (nbr.nb_port != NEXUS_PORT_ANY &&
	    nbr.nb_port >= NXDOM_MAX(NX_DOM(nx), ports)) {
		err = EDOM;
		goto done;
	} else if (nbr.nb_port == NEXUS_PORT_ANY) {
		/* for now, this is allowed only for kernel clients */
		if (sopt->sopt_p != kernproc) {
			err = EPERM;
			goto done;
		}
	}

	nxb = nxb_alloc(Z_WAITOK);

	if (m_pid) {
		nxb->nxb_flags |= NXBF_MATCH_UNIQUEID;
		nxb->nxb_uniqueid = p_uniqueid;
		nxb->nxb_pid = p_pid;
	}
	if (m_exec_uuid) {
		nxb->nxb_flags |= NXBF_MATCH_EXEC_UUID;
		ASSERT(!uuid_is_null(p_uuid));
		uuid_copy(nxb->nxb_exec_uuid, p_uuid);
	}
	if (m_key) {
		nxb->nxb_flags |= NXBF_MATCH_KEY;
		ASSERT(key != NULL);
		ASSERT(nbr.nb_key_len != 0 &&
		    nbr.nb_key_len <= NEXUS_MAX_KEY_LEN);
		/*
		 * -fbounds-safety: since nxb_key is __sized_by(nxb_key_len),
		 * its assignment needs to be done side-by-side to nxb_key_len.
		 */
		nxb->nxb_key = key;
		key = NULL;     /* let nxb_free() free it */
		nxb->nxb_key_len = nbr.nb_key_len;
	}

	/*
	 * Bind the creds to the nexus port.  If client doesn't have a port,
	 * find one, claim it, and associate the creds to it.  Upon success,
	 * the nexus may move the nxbind contents (including the key) to
	 * its own nxbind instance; in that case, nxb_free() below will not
	 * be freeing the key within.
	 */
	err = NX_DOM(nx)->nxdom_bind_port(nx, &nbr.nb_port, nxb, NULL);
	if (err != 0) {
		goto done;
	}

	/* report the (possibly newly claimed) port back to the caller */
	ASSERT(nbr.nb_port != NEXUS_PORT_ANY);
	(void) sooptcopyout(sopt, &nbr, sizeof(nbr));

	SK_D("nexus 0x%llx nxb 0x%llx port %u flags 0x%b pid %d "
	    "(uniqueid %llu) exec_uuid %s key 0x%llx key_len %u",
	    SK_KVA(nx), SK_KVA(nxb), nbr.nb_port, nxb->nxb_flags,
	    NXBF_BITS, nxb->nxb_pid, nxb->nxb_uniqueid,
	    sk_uuid_unparse(nxb->nxb_exec_uuid, exec_uuidstr),
	    (nxb->nxb_key != NULL) ? SK_KVA(nxb->nxb_key) : 0,
	    nxb->nxb_key_len);

done:
	if (nx != NULL) {
		(void) nx_release_locked(nx);
		nx = NULL;
	}
	SK_UNLOCK();

done_unlocked:
	ASSERT(nx == NULL);

	/* common cleanup: nxb/key (unless ownership moved), proc ref */
	if (nxb != NULL) {
		nxb_free(nxb);
		nxb = NULL;
	}
	if (key != NULL) {
		sk_free_data(key, nbr.nb_key_len);
		key = NULL;
	}
	if (p != PROC_NULL) {
		proc_rele(p);
	}

	return err;
}
844 
845 /* Hoisted out of line to reduce kernel stack footprint */
846 SK_NO_INLINE_ATTRIBUTE
847 static int
nxctl_nexus_unbind(struct nxctl * nxctl,struct sockopt * sopt)848 nxctl_nexus_unbind(struct nxctl *nxctl, struct sockopt *sopt)
849 {
850 	struct nx_unbind_req nur;
851 	struct kern_nexus *nx = NULL;
852 	int err = 0;
853 
854 	NXCTL_LOCK_ASSERT_HELD(nxctl);
855 
856 	if (sopt->sopt_val == USER_ADDR_NULL) {
857 		return EINVAL;
858 	}
859 
860 	bzero(&nur, sizeof(nur));
861 	err = sooptcopyin(sopt, &nur, sizeof(nur), sizeof(nur));
862 	if (err != 0) {
863 		return err;
864 	}
865 
866 	if (uuid_is_null(nur.nu_nx_uuid)) {
867 		return EINVAL;
868 	}
869 
870 	SK_LOCK();
871 	nx = nx_find(nur.nu_nx_uuid, TRUE);
872 	if (nx == NULL || (nx->nx_prov->nxprov_ctl != nxctl &&
873 	    nxctl != &_kernnxctl)) {    /* make exception for kernnxctl */
874 		err = ENOENT;
875 		goto done;
876 	}
877 
878 	/* unbind isn't applicable on anonymous nexus provider */
879 	if (NX_ANONYMOUS_PROV(nx)) {
880 		err = ENXIO;
881 		goto done;
882 	}
883 
884 	if (nur.nu_port == NEXUS_PORT_ANY) {
885 		err = EINVAL;
886 		goto done;
887 	}
888 
889 	err = NX_DOM(nx)->nxdom_unbind_port(nx, nur.nu_port);
890 
891 done:
892 	if (nx != NULL) {
893 		(void) nx_release_locked(nx);
894 		nx = NULL;
895 	}
896 	SK_UNLOCK();
897 
898 	return err;
899 }
900 
/*
 * NXOPT_NEXUS_CONFIG handler: forward a get/set configuration request to
 * the nexus's domain provider callback, if one is registered.  The shared
 * kernel and shared user controllers are always allowed; otherwise the
 * request must come from the owning controller (unless the check is
 * disabled via the disable_nxctl_check sysctl).
 */
/* Hoisted out of line to reduce kernel stack footprint */
SK_NO_INLINE_ATTRIBUTE
static int
nxctl_nexus_config(struct nxctl *nxctl, struct sockopt *sopt)
{
	struct kern_nexus *nx = NULL;
	struct nx_cfg_req ncr;
	int err = 0;

	NXCTL_LOCK_ASSERT_HELD(nxctl);

	if (sopt->sopt_val == USER_ADDR_NULL) {
		return EINVAL;
	}

	bzero(&ncr, sizeof(ncr));
	err = sooptcopyin(sopt, &ncr, sizeof(ncr), sizeof(ncr));
	if (err != 0) {
		return err;
	}

	if (uuid_is_null(ncr.nc_nx_uuid)) {
		return EINVAL;
	}

	SK_LOCK();
	/* nx_find() returns the nexus with a reference held */
	nx = nx_find(ncr.nc_nx_uuid, TRUE);
	if (nx == NULL || (disable_nxctl_check == 0 &&
	    nx->nx_prov->nxprov_ctl != nxctl &&
	    nxctl != &_kernnxctl &&    /* allow kernel/shared user nxctl */
	    nxctl != &_usernxctl)) {
		err = ENOENT;
		goto done;
	}

	if (NX_DOM_PROV(nx)->nxdom_prov_config != NULL) {
		err = NX_DOM_PROV(nx)->nxdom_prov_config(NX_DOM_PROV(nx),
		    nx, &ncr, sopt->sopt_dir, sopt->sopt_p, nxctl->nxctl_cred);
	} else {
		/* domain provider offers no config callback */
		err = EPERM;
	}

	if (err == 0) {
		/* return the (possibly updated) request to the caller */
		(void) sooptcopyout(sopt, &ncr, sizeof(ncr));
	}
done:
	if (nx != NULL) {
		(void) nx_release_locked(nx);
		nx = NULL;
	}
	SK_UNLOCK();

	return err;
}
955 
956 struct nxbind *
nxb_alloc(zalloc_flags_t how)957 nxb_alloc(zalloc_flags_t how)
958 {
959 	struct nxbind *nxb = zalloc_flags(nxbind_zone, how | Z_ZERO);
960 
961 	if (nxb) {
962 		SK_DF(SK_VERB_MEM, "nxb 0x%llx ALLOC", SK_KVA(nxb));
963 	}
964 	return nxb;
965 }
966 
/*
 * Free a nexus binding descriptor and, if it still owns a key blob,
 * release the key first (nxb_key is __sized_by(nxb_key_len), so the
 * pointer and length are cleared together).
 */
void
nxb_free(struct nxbind *nxb)
{
	SK_DF(SK_VERB_MEM, "nxb 0x%llx key 0x%llx FREE", SK_KVA(nxb),
	    (nxb->nxb_key != NULL) ? SK_KVA(nxb->nxb_key) : 0);

	if (nxb->nxb_key != NULL) {
		sk_free_data_sized_by(nxb->nxb_key, nxb->nxb_key_len);
		nxb->nxb_key = NULL;
		nxb->nxb_key_len = 0;
	}
	zfree(nxbind_zone, nxb);
}
980 
981 /*
982  * nxb0 is assumed to possess the truth, compare nxb1 against it.
983  */
984 boolean_t
nxb_is_equal(struct nxbind * nxb0,struct nxbind * nxb1)985 nxb_is_equal(struct nxbind *nxb0, struct nxbind *nxb1)
986 {
987 	ASSERT(nxb0 != NULL && nxb1 != NULL);
988 	ASSERT(nxb0 != nxb1);
989 
990 	/* we always compare using uniqueid and not pid */
991 	if ((nxb0->nxb_flags & NXBF_MATCH_UNIQUEID) &&
992 	    nxb1->nxb_uniqueid != nxb0->nxb_uniqueid) {
993 		return FALSE;
994 	}
995 
996 	if ((nxb0->nxb_flags & NXBF_MATCH_EXEC_UUID) &&
997 	    uuid_compare(nxb1->nxb_exec_uuid, nxb0->nxb_exec_uuid) != 0) {
998 		return FALSE;
999 	}
1000 
1001 	ASSERT(!(nxb0->nxb_flags & NXBF_MATCH_KEY) ||
1002 	    (nxb0->nxb_key_len != 0 && nxb0->nxb_key != NULL));
1003 
1004 	if ((nxb0->nxb_flags & NXBF_MATCH_KEY) &&
1005 	    (nxb0->nxb_key_len != nxb1->nxb_key_len ||
1006 	    nxb1->nxb_key == NULL || timingsafe_bcmp(nxb1->nxb_key, nxb0->nxb_key,
1007 	    nxb1->nxb_key_len) != 0)) {
1008 		return FALSE;
1009 	}
1010 
1011 	return TRUE;
1012 }
1013 
/*
 * Transfer the entire contents of one nxbind to another, including
 * ownership of the key blob, then scrub the source.  Any key already
 * held by the destination is released first so nothing leaks.
 */
void
nxb_move(struct nxbind *snxb, struct nxbind *dnxb)
{
	/* a key-matching source must carry a non-empty key */
	ASSERT(!(snxb->nxb_flags & NXBF_MATCH_KEY) ||
	    (snxb->nxb_key_len != 0 && snxb->nxb_key != NULL));

	/* in case the destination has a key attached, free it first */
	if (dnxb->nxb_key != NULL) {
		sk_free_data_sized_by(dnxb->nxb_key, dnxb->nxb_key_len);
		dnxb->nxb_key = NULL;
		dnxb->nxb_key_len = 0;
	}

	/* move everything from src to dst, and then wipe out src */
	bcopy(snxb, dnxb, sizeof(*dnxb));
	bzero(snxb, sizeof(*snxb));
}
1031 
1032 /* Upper bound on # of cl_num_ch_uuids that we'd return to user space */
1033 #define MAX_NUM_CH_UUIDS        4096
1034 
1035 /* Hoisted out of line to reduce kernel stack footprint */
/*
 * Handle the get-channel-list sockopt: report the UUIDs of channels
 * opened to the nexus named by clr.cl_nx_uuid, and always report the
 * total channel count so the caller can size its buffer.
 */
SK_NO_INLINE_ATTRIBUTE
static int
nxctl_get_channel_list(struct nxctl *nxctl, struct sockopt *sopt)
{
	user_addr_t tmp_ptr = USER_ADDR_NULL;
	uint32_t nuuids = 0, ncuuids = 0;
	uuid_t *puuid, *uuids = NULL;
	size_t uuids_sz;
	struct ch_list_req clr;
	struct kern_channel *ch = NULL;
	struct kern_nexus *nx = NULL;
	struct kern_nexus find;
	int err = 0, observeall;

	NXCTL_LOCK_ASSERT_HELD(nxctl);

	ASSERT(sopt->sopt_p != NULL);
	if (sopt->sopt_val == USER_ADDR_NULL) {
		return EINVAL;
	}

	err = sooptcopyin(sopt, &clr, sizeof(clr), sizeof(clr));
	if (err != 0) {
		return err;
	}

	if (uuid_is_null(clr.cl_nx_uuid)) {
		return EINVAL;
	} else if ((size_t)clr.cl_num_ch_uuids > MAX_NUM_CH_UUIDS) {
		/* silently clamp oversized requests to a sane bound */
		clr.cl_num_ch_uuids = MAX_NUM_CH_UUIDS;
	}

	/*
	 * If the caller specified a buffer, copy out the Channel UUIDs to
	 * caller gracefully.  We only copy out the number of UUIDs which
	 * caller has asked for, but we always tell caller how big the
	 * buffer really needs to be.
	 */
	tmp_ptr = clr.cl_ch_uuids;
	if (tmp_ptr != USER_ADDR_NULL && clr.cl_num_ch_uuids > 0) {
		uuids_sz = (size_t)clr.cl_num_ch_uuids * sizeof(uuid_t);
		uuids = sk_alloc_data(uuids_sz, Z_WAITOK, skmem_tag_sysctl_buf);
		if (uuids == NULL) {
			return ENOBUFS;
		}
	}

	/* may the caller observe nexus instances owned by others? */
	observeall = (skywalk_priv_check_cred(sopt->sopt_p, nxctl->nxctl_cred,
	    PRIV_SKYWALK_OBSERVE_ALL) == 0);

	SK_LOCK();
	uuid_copy(find.nx_uuid, clr.cl_nx_uuid);
	nx = RB_FIND(kern_nexus_tree, &nx_head, &find);
	if (nx != NULL && NX_PROV(nx)->nxprov_ctl != nxctl && !observeall) {
		/*
		 * Return only entries that are visible to the caller,
		 * unless it has PRIV_SKYWALK_OBSERVE_ALL.
		 */
		nx = NULL;
	}
	if (nx != NULL) {
		/*
		 * Count number of Channels.  If buffer space exists
		 * and remains, copy out the Channel UUIDs.
		 */
		nuuids = clr.cl_num_ch_uuids;
		puuid = uuids;

		STAILQ_FOREACH(ch, &nx->nx_ch_head, ch_link) {
			++ncuuids;
			if (uuids != NULL && nuuids > 0) {
				uuid_copy(*puuid, ch->ch_info->cinfo_ch_id);
				--nuuids;
				++puuid;
			}
		}
	} else {
		err = ENOENT;
	}
	SK_UNLOCK();

	if (uuids != NULL) {
		if (err == 0 && nx != NULL && tmp_ptr != USER_ADDR_NULL) {
			uintptr_t cnt_uuid;

			/* Note: Pointer arithmetic */
			cnt_uuid = (uintptr_t)(puuid - uuids);
			/*
			 * NOTE(review): assumes a found nexus always has at
			 * least one channel here; cnt_uuid would be 0 for a
			 * channel-less nexus -- confirm that cannot happen.
			 */
			ASSERT(cnt_uuid > 0);

			if (sopt->sopt_p != kernproc) {
				err = copyout(uuids, tmp_ptr,
				    cnt_uuid * sizeof(uuid_t));
			} else {
				/* in-kernel caller: direct bcopy, no copyout */
				caddr_t tmp;
				tmp = __unsafe_forge_bidi_indexable(caddr_t,
				    CAST_DOWN(caddr_t, tmp_ptr),
				    cnt_uuid * sizeof(uuid_t));
				bcopy(uuids, tmp, cnt_uuid * sizeof(uuid_t));
			}
		}
		sk_free_data(uuids, uuids_sz);
		uuids = NULL;
	}

	if (err == 0) {
		/* report the total count, even if it exceeds the buffer */
		clr.cl_num_ch_uuids = ncuuids;
		err = sooptcopyout(sopt, &clr, sizeof(clr));
	}

	return err;
}
1147 
1148 static void
nxctl_init(struct nxctl * nxctl,struct proc * p,struct fileproc * fp)1149 nxctl_init(struct nxctl *nxctl, struct proc *p, struct fileproc *fp)
1150 {
1151 	uuid_t p_uuid;
1152 
1153 	bzero(nxctl, sizeof(*nxctl));
1154 
1155 	proc_getexecutableuuid(p, p_uuid, sizeof(p_uuid));
1156 
1157 	lck_mtx_init(&nxctl->nxctl_lock, &nexus_lock_group, &nexus_lock_attr);
1158 	uuid_copy(nxctl->nxctl_proc_uuid, p_uuid);
1159 	nxctl->nxctl_proc_uniqueid = proc_uniqueid(p);
1160 	nxctl->nxctl_cred = kauth_cred_proc_ref(p);
1161 	nxctl->nxctl_fp = fp;
1162 	if (nxctl == &_kernnxctl) {
1163 		ASSERT(p == kernproc);
1164 		nxctl->nxctl_flags |= NEXUSCTLF_KERNEL;
1165 	}
1166 	if (nxctl == &_usernxctl) {
1167 		ASSERT(p == kernproc);
1168 		nxctl->nxctl_cred = NULL;
1169 	}
1170 	if (fp == NULL) {
1171 		nxctl->nxctl_flags |= NEXUSCTLF_NOFDREF;
1172 	}
1173 }
1174 
1175 static struct nxctl *
nxctl_alloc(struct proc * p,struct fileproc * fp,zalloc_flags_t how)1176 nxctl_alloc(struct proc *p, struct fileproc *fp, zalloc_flags_t how)
1177 {
1178 	struct nxctl *nxctl = zalloc_flags(nxctl_zone, how);
1179 
1180 	if (nxctl != NULL) {
1181 		nxctl_init(nxctl, p, fp);
1182 	}
1183 	return nxctl;
1184 }
1185 
1186 static void
nxctl_free(struct nxctl * nxctl)1187 nxctl_free(struct nxctl *nxctl)
1188 {
1189 	ASSERT(nxctl->nxctl_refcnt == 0);
1190 	ASSERT(!(nxctl->nxctl_flags & NEXUSCTLF_ATTACHED));
1191 	kauth_cred_unref(&nxctl->nxctl_cred);
1192 	lck_mtx_destroy(&nxctl->nxctl_lock, &nexus_lock_group);
1193 	SK_D("nxctl 0x%llx FREE", SK_KVA(nxctl));
1194 	if (!(nxctl->nxctl_flags & NEXUSCTLF_KERNEL)) {
1195 		zfree(nxctl_zone, nxctl);
1196 	}
1197 }
1198 
/*
 * Take a reference on nxctl; caller must hold the global SK_LOCK.
 */
static void
nxctl_retain_locked(struct nxctl *nxctl)
{
	SK_LOCK_ASSERT_HELD();

	nxctl->nxctl_refcnt++;
	/* catch reference count wraparound */
	ASSERT(nxctl->nxctl_refcnt != 0);
}
1207 
/*
 * Take a reference on nxctl, acquiring SK_LOCK around the operation.
 */
void
nxctl_retain(struct nxctl *nxctl)
{
	SK_LOCK();
	nxctl_retain_locked(nxctl);
	SK_UNLOCK();
}
1215 
1216 static int
nxctl_release_locked(struct nxctl * nxctl)1217 nxctl_release_locked(struct nxctl *nxctl)
1218 {
1219 	int oldref = nxctl->nxctl_refcnt;
1220 
1221 	SK_LOCK_ASSERT_HELD();
1222 
1223 	ASSERT(nxctl->nxctl_refcnt != 0);
1224 	if (--nxctl->nxctl_refcnt == 0) {
1225 		nxctl_free(nxctl);
1226 	}
1227 
1228 	return oldref == 1;
1229 }
1230 
/*
 * Drop a reference on nxctl, taking SK_LOCK around the operation.
 * Returns non-zero if this was the last reference.
 */
int
nxctl_release(struct nxctl *nxctl)
{
	int lastref;

	SK_LOCK();
	lastref = nxctl_release_locked(nxctl);
	SK_UNLOCK();

	return lastref;
}
1242 
/* XXX
 * -fbounds-safety: This historically took a void *, but every caller passes
 * a struct nxctl *, so the signature now says so.  There is still no matching
 * nxctl_ctor; initialization happens in nxctl_init()/nxctl_alloc().
 */
void
nxctl_dtor(struct nxctl *arg)
{
	/* close the controller, then drop the caller's reference */
	nxctl_close(arg);

	SK_LOCK();
	(void) nxctl_release_locked(arg);
	SK_UNLOCK();
}
1257 
/*
 * Run the external nexus provider's pre-connect and connected
 * callbacks for a newly opened channel.  Entered with SK_LOCK and the
 * channel lock held; both are dropped and re-acquired around the
 * provider callouts, with an extra channel reference held across the
 * unlocked window.  On failure, any partially completed state is
 * unwound via nxprov_advise_disconnect().  Returns 0 on success.
 */
int
nxprov_advise_connect(struct kern_nexus *nx, struct kern_channel *ch,
    struct proc *p)
{
	struct kern_nexus_provider *nxprov = NX_PROV(nx);
	int err = 0;

	ASSERT(!(ch->ch_flags & (CHANF_EXT_PRECONNECT | CHANF_EXT_CONNECTED)));
	ASSERT(ch->ch_ctx == NULL);

	SK_LOCK_ASSERT_HELD();
	LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);

	/* monitor channels aren't externally visible/usable, so ignore */
	if ((ch->ch_info->cinfo_ch_mode & CHMODE_MONITOR) ||
	    (ch->ch_flags & CHANF_EXT_SKIP) ||
	    (nxprov->nxprov_ext.nxpi_pre_connect == NULL ||
	    nxprov->nxprov_ext.nxpi_connected == NULL)) {
		return 0;
	}

	/* keep ch alive, then shed SK_LOCK before calling out */
	ch_retain_locked(ch);
	lck_mtx_unlock(&ch->ch_lock);
	SK_UNLOCK();
	lck_mtx_lock(&ch->ch_lock);

	err = nxprov->nxprov_ext.nxpi_pre_connect(nxprov, p, nx,
	    ch->ch_info->cinfo_nx_port, ch, &ch->ch_ctx);
	if (err != 0) {
		SK_D("ch 0x%llx flags %b nx 0x%llx pre_connect "
		    "error %d", SK_KVA(ch), ch->ch_flags,
		    CHANF_BITS, SK_KVA(nx), err);
		/* don't keep a context left behind by a failed callback */
		ch->ch_ctx = NULL;
		goto done;
	}
	/*
	 * Upon ring/slot init failure, this is cleared
	 * by nxprov_advise_disconnect() below.
	 */
	os_atomic_or(&ch->ch_flags, CHANF_EXT_PRECONNECT, relaxed);
	if (NXPROV_LLINK(nxprov)) {
		err = nx_netif_llink_ext_init_default_queues(nx);
	} else {
		err = nx_init_rings(nx, ch);
	}
	if (err != 0) {
		goto done;
	}
	ASSERT(err == 0);
	ASSERT((ch->ch_flags & (CHANF_EXT_PRECONNECT |
	    CHANF_EXT_CONNECTED)) == CHANF_EXT_PRECONNECT);

	err = nxprov->nxprov_ext.nxpi_connected(nxprov, nx, ch);
	if (err != 0) {
		SK_D("ch 0x%llx flags %b nx 0x%llx connected error %d",
		    SK_KVA(ch), ch->ch_flags, CHANF_BITS, SK_KVA(nx), err);
		goto done;
	}
	os_atomic_or(&ch->ch_flags, CHANF_EXT_CONNECTED, relaxed);
	SK_D("ch 0x%llx flags %b nx 0x%llx connected",
	    SK_KVA(ch), ch->ch_flags, CHANF_BITS, SK_KVA(nx));


done:
	/* drop ch_lock so SK_LOCK can be re-acquired first */
	lck_mtx_unlock(&ch->ch_lock);
	SK_LOCK();
	lck_mtx_lock(&ch->ch_lock);
	if ((err != 0) &&
	    (ch->ch_flags & (CHANF_EXT_CONNECTED | CHANF_EXT_PRECONNECT))) {
		/* unwind whatever stage of the connect sequence we reached */
		nxprov_advise_disconnect(nx, ch);
	}
	/* caller is expected to hold one, in addition to ourselves */
	VERIFY(ch->ch_refcnt >= 2);
	ch_release_locked(ch);

	return err;
}
1335 
/*
 * Run the external nexus provider's disconnect callbacks for a channel
 * that went through some or all of the connect sequence.  Entered with
 * SK_LOCK and the channel lock held; both are dropped and re-acquired
 * around the provider callouts.  Also invoked from the
 * nxprov_advise_connect() error path, hence the flag checks.
 */
void
nxprov_advise_disconnect(struct kern_nexus *nx, struct kern_channel *ch)
{
	struct kern_nexus_provider *nxprov = NX_PROV(nx);

	SK_LOCK_ASSERT_HELD();
	LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);

	/* check as we might be called in the error handling path */
	if (ch->ch_flags & (CHANF_EXT_CONNECTED | CHANF_EXT_PRECONNECT)) {
		/* keep ch alive across the unlocked callout window */
		ch_retain_locked(ch);
		lck_mtx_unlock(&ch->ch_lock);
		SK_UNLOCK();
		lck_mtx_lock(&ch->ch_lock);

		ASSERT(!(ch->ch_flags & CHANF_EXT_SKIP));
		if (ch->ch_flags & CHANF_EXT_CONNECTED) {
			nxprov->nxprov_ext.nxpi_pre_disconnect(nxprov, nx, ch);
			os_atomic_andnot(&ch->ch_flags, CHANF_EXT_CONNECTED, relaxed);
		}

		/*
		 * Inform the external domain provider that the rings
		 * and slots for this channel are no longer valid.
		 */
		if (NXPROV_LLINK(nxprov)) {
			nx_netif_llink_ext_fini_default_queues(nx);
		} else {
			nx_fini_rings(nx, ch);
		}

		ASSERT(ch->ch_flags & CHANF_EXT_PRECONNECT);
		nxprov->nxprov_ext.nxpi_disconnected(nxprov, nx, ch);
		os_atomic_andnot(&ch->ch_flags, CHANF_EXT_PRECONNECT, relaxed);

		SK_D("ch 0x%llx flags %b nx 0x%llx disconnected",
		    SK_KVA(ch), ch->ch_flags, CHANF_BITS, SK_KVA(nx));

		/* We're done with this channel */
		ch->ch_ctx = NULL;

		/* drop ch_lock so SK_LOCK can be re-acquired first */
		lck_mtx_unlock(&ch->ch_lock);
		SK_LOCK();
		lck_mtx_lock(&ch->ch_lock);
		/* caller is expected to hold one, in addition to ourselves */
		VERIFY(ch->ch_refcnt >= 2);
		ch_release_locked(ch);
	}
	ASSERT(!(ch->ch_flags & (CHANF_EXT_CONNECTED | CHANF_EXT_PRECONNECT)));
	ASSERT(ch->ch_ctx == NULL);
}
1387 
/*
 * Common back end for nxprov_create() and nxprov_create_kern():
 * validate the registration parameters, allocate the provider, attach
 * it to the global provider list, and return it with a reference held
 * for the caller.  Returns NULL with *err set on failure.
 */
static struct kern_nexus_provider *
nxprov_create_common(struct nxctl *nxctl,
    struct kern_nexus_domain_provider *nxdom_prov, struct nxprov_reg *reg,
    const struct kern_nexus_provider_init *init, int *err)
{
	struct skmem_region_params srp[SKMEM_REGIONS];
	struct kern_nexus_provider *nxprov = NULL;
	struct nxprov_params nxp;
	uint32_t override = 0;
	uint32_t pp_region_config_flags;
	int i;

	_CASSERT(sizeof(*init) == sizeof(nxprov->nxprov_ext));
	_CASSERT(sizeof(*init) >=
	    sizeof(struct kern_nexus_netif_provider_init));

	SK_LOCK_ASSERT_HELD();
	ASSERT(nxctl != NULL && reg != NULL && nxdom_prov != NULL);

	pp_region_config_flags = PP_REGION_CONFIG_MD_MAGAZINE_ENABLE |
	    PP_REGION_CONFIG_BUF_IODIR_BIDIR;
	/*
	 * Special handling for external nexus providers; similar
	 * logic to what's done in kern_pbufpool_create().
	 */
	if (init != NULL) {
		if (init->nxpi_flags & NXPIF_MONOLITHIC) {
			pp_region_config_flags |=
			    PP_REGION_CONFIG_BUF_MONOLITHIC;
		}

		if (init->nxpi_flags & NXPIF_INHIBIT_CACHE) {
			pp_region_config_flags |=
			    PP_REGION_CONFIG_BUF_NOCACHE;
		}
	}

	/*
	 * For network devices, set the packet metadata memory as persistent
	 * so that it is wired at segment creation.  This allows us to access
	 * it with preemption disabled, as well as for rdar://problem/46511741.
	 */
	if (nxdom_prov->nxdom_prov_dom->nxdom_type == NEXUS_TYPE_NET_IF) {
		pp_region_config_flags |= PP_REGION_CONFIG_MD_PERSISTENT;
	}

	/* process and validate provider parameters */
	if ((*err = nxdom_prov_validate_params(nxdom_prov, reg,
	    &nxp, srp, override, pp_region_config_flags)) != 0) {
		goto done;
	}

	/* Z_WAITOK allocation; nxprov is non-NULL on return */
	nxprov = nxprov_alloc(nxdom_prov, Z_WAITOK);
	ASSERT(nxprov->nxprov_dom_prov == nxdom_prov);

	STAILQ_INIT(&nxprov->nxprov_nx_head);
	STAILQ_INSERT_TAIL(&nxprov_head, nxprov, nxprov_link);
	nxprov->nxprov_flags |= NXPROVF_ATTACHED;
	nxprov->nxprov_ctl = nxctl;
	uuid_generate_random(nxprov->nxprov_uuid);
	bcopy(&nxp, nxprov->nxprov_params, sizeof(struct nxprov_params));

	if (init != NULL) {
		/* netif llink providers use the netif variant of init */
		if (init->nxpi_version == KERN_NEXUS_PROVIDER_VERSION_NETIF) {
			ASSERT(NXPROV_LLINK(nxprov));
			bcopy(init, &nxprov->nxprov_netif_ext,
			    sizeof(nxprov->nxprov_netif_ext));
		} else {
			ASSERT(!NXPROV_LLINK(nxprov));
			ASSERT(init->nxpi_version ==
			    KERN_NEXUS_PROVIDER_CURRENT_VERSION);
			bcopy(init, &nxprov->nxprov_ext, sizeof(*init));
		}
		nxprov->nxprov_flags |= NXPROVF_EXTERNAL;
	}

	/* store validated region parameters to the provider */
	for (i = 0; i < SKMEM_REGIONS; i++) {
		nxprov->nxprov_region_params[i] = srp[i];
	}

	if (nxprov->nxprov_flags & NXPROVF_EXTERNAL) {
		uint32_t nxpi_flags = nxprov->nxprov_ext.nxpi_flags;

		if (nxpi_flags & NXPIF_VIRTUAL_DEVICE) {
			nxprov->nxprov_flags |= NXPROVF_VIRTUAL_DEVICE;
		}
	} else if (nxdom_prov->nxdom_prov_dom->nxdom_type !=
	    NEXUS_TYPE_NET_IF) {
		/*
		 * Treat non-netif built-in nexus providers as those
		 * meant for inter-process communications, i.e. there
		 * is no actual networking hardware involved.
		 */
		nxprov->nxprov_flags |= NXPROVF_VIRTUAL_DEVICE;
	}

	nxprov_retain_locked(nxprov);   /* one for being in the list */
	nxprov_retain_locked(nxprov);   /* one for the caller */

#if SK_LOG
	uuid_string_t uuidstr;
	SK_D("nxprov 0x%llx UUID %s", SK_KVA(nxprov),
	    sk_uuid_unparse(nxprov->nxprov_uuid, uuidstr));
#endif /* SK_LOG */

done:
	return nxprov;
}
1497 
/*
 * Create a nexus provider on behalf of a user process.  The requested
 * nexus type determines the Skywalk privilege the caller must hold;
 * kernel-pipe and monitor types are rejected outright.  On success the
 * returned provider carries a reference for the caller; on failure
 * NULL is returned with *err set.
 */
struct kern_nexus_provider *
nxprov_create(struct proc *p, struct nxctl *nxctl, struct nxprov_reg *reg,
    int *err)
{
	struct nxprov_params *nxp = &reg->nxpreg_params;
	struct kern_nexus_domain_provider *nxdom_prov = NULL;
	struct kern_nexus_provider *nxprov = NULL;

	NXCTL_LOCK_ASSERT_HELD(nxctl);

	/* the user path must not come in with the kernel credential */
	ASSERT(nxctl->nxctl_cred != proc_ucred_unsafe(kernproc));
	*err = 0;

	switch (nxp->nxp_type) {
	case NEXUS_TYPE_USER_PIPE:      /* only for userland */
		*err = skywalk_priv_check_cred(p, nxctl->nxctl_cred,
		    PRIV_SKYWALK_REGISTER_USER_PIPE);
		break;

	case NEXUS_TYPE_FLOW_SWITCH:    /* allowed for userland */
		*err = skywalk_priv_check_cred(p, nxctl->nxctl_cred,
		    PRIV_SKYWALK_REGISTER_FLOW_SWITCH);
		break;

	case NEXUS_TYPE_NET_IF:         /* allowed for userland */
		*err = skywalk_priv_check_cred(p, nxctl->nxctl_cred,
		    PRIV_SKYWALK_REGISTER_NET_IF);
		break;

	case NEXUS_TYPE_KERNEL_PIPE:    /* only for kernel */
	case NEXUS_TYPE_MONITOR:        /* invalid */
	default:
		*err = EINVAL;
		goto done;
	}

	if (*err != 0) {
		goto done;
	}

	ASSERT(nxp->nxp_type < NEXUS_TYPE_MAX);
	if ((nxdom_prov = nxdom_prov_default[nxp->nxp_type]) == NULL) {
		*err = ENXIO;
		goto done;
	}

#if CONFIG_NEXUS_NETIF
	/* make sure netif_compat is the default here */
	ASSERT(nxp->nxp_type != NEXUS_TYPE_NET_IF ||
	    strbufcmp(nxdom_prov->nxdom_prov_name, sizeof(nxdom_prov->nxdom_prov_name),
	    NEXUS_PROVIDER_NET_IF_COMPAT, sizeof(NEXUS_PROVIDER_NET_IF_COMPAT)) == 0);
#endif /* CONFIG_NEXUS_NETIF */

	SK_LOCK();
	/* callee holds a reference for our caller upon success */
	nxprov = nxprov_create_common(nxctl, nxdom_prov, reg, NULL, err);
	SK_UNLOCK();
done:
	return nxprov;
}
1558 
/*
 * Create a nexus provider from within the kernel.  Unlike
 * nxprov_create(), callers enter with SK_LOCK held and the kernel
 * credential.  The nexus type dictates whether an external init
 * structure is mandatory (kernel pipe), forbidden (flowswitch), or
 * optional (netif).  Returns NULL with *err set on failure.
 */
struct kern_nexus_provider *
nxprov_create_kern(struct nxctl *nxctl,
    struct kern_nexus_domain_provider *nxdom_prov, struct nxprov_reg *reg,
    const struct kern_nexus_provider_init *init, int *err)
{
	struct nxprov_params *nxp = &reg->nxpreg_params;
	struct kern_nexus_provider *nxprov = NULL;

	NXCTL_LOCK_ASSERT_HELD(nxctl);
	SK_LOCK_ASSERT_HELD();

	ASSERT(nxctl->nxctl_cred == proc_ucred_unsafe(kernproc));
	ASSERT(nxp->nxp_type == nxdom_prov->nxdom_prov_dom->nxdom_type);
	ASSERT(init == NULL ||
	    init->nxpi_version == KERN_NEXUS_PROVIDER_CURRENT_VERSION ||
	    init->nxpi_version == KERN_NEXUS_PROVIDER_VERSION_NETIF);

	*err = 0;

	switch (nxp->nxp_type) {
	case NEXUS_TYPE_NET_IF:
		break;
	case NEXUS_TYPE_KERNEL_PIPE:
		/* kernel pipe requires provider callbacks */
		if (init == NULL) {
			*err = EINVAL;
			goto done;
		}
		break;
	case NEXUS_TYPE_FLOW_SWITCH:
		/* flowswitch is built-in; external callbacks not allowed */
		if (init != NULL) {
			*err = EINVAL;
			goto done;
		}
		break;

	case NEXUS_TYPE_USER_PIPE:      /* only for userland */
	case NEXUS_TYPE_MONITOR:        /* invalid */
	default:
		*err = EINVAL;
		goto done;
	}

	/* callee holds a reference for our caller upon success */
	nxprov = nxprov_create_common(nxctl, nxdom_prov, reg, init, err);

done:
	return nxprov;
}
1607 
1608 int
nxprov_destroy(struct nxctl * nxctl,const uuid_t nxprov_uuid)1609 nxprov_destroy(struct nxctl *nxctl, const uuid_t nxprov_uuid)
1610 {
1611 	struct kern_nexus_provider *nxprov = NULL;
1612 	int err = 0;
1613 
1614 	NXCTL_LOCK_ASSERT_HELD(nxctl);
1615 
1616 	SK_LOCK();
1617 
1618 	STAILQ_FOREACH(nxprov, &nxprov_head, nxprov_link) {
1619 		if (nxctl == nxprov->nxprov_ctl &&
1620 		    uuid_compare(nxprov_uuid, nxprov->nxprov_uuid) == 0) {
1621 			nxprov_retain_locked(nxprov);
1622 			break;
1623 		}
1624 	}
1625 
1626 	if (nxprov == NULL) {
1627 		err = ENOENT;
1628 	} else {
1629 		err = nxprov_close(nxprov, TRUE);
1630 	}
1631 
1632 	if (nxprov != NULL) {
1633 		(void) nxprov_release_locked(nxprov);
1634 	}
1635 
1636 	SK_UNLOCK();
1637 
1638 	return err;
1639 }
1640 
/*
 * Close a nexus provider: tear down every nexus instance created on
 * it.  If no instances remain the provider is detached immediately;
 * otherwise the detach is deferred (NXPROVF_CLOSED) until the last
 * instance is destroyed.  Returns EALREADY if already closed.
 */
int
nxprov_close(struct kern_nexus_provider *nxprov, boolean_t locked)
{
	int err = 0;

	if (!locked) {
		SK_LOCK();
	}

	SK_LOCK_ASSERT_HELD();

#if SK_LOG
	uuid_string_t uuidstr;
	SK_D("nxprov 0x%llx UUID %s flags 0x%b", SK_KVA(nxprov),
	    sk_uuid_unparse(nxprov->nxprov_uuid, uuidstr),
	    nxprov->nxprov_flags, NXPROVF_BITS);
#endif /* SK_LOG */

	if (nxprov->nxprov_flags & NXPROVF_CLOSED) {
		err = EALREADY;
	} else {
		struct kern_nexus *nx, *tnx;

		/* provider is no longer reachable through its controller */
		nxprov->nxprov_ctl = NULL;

		STAILQ_FOREACH_SAFE(nx, &nxprov->nxprov_nx_head,
		    nx_prov_link, tnx) {
			/* hold a ref across nx_close(), which may detach nx */
			nx_retain_locked(nx);
			(void) nx_close(nx, TRUE);
			(void) nx_release_locked(nx);
		}

		if (STAILQ_EMPTY(&nxprov->nxprov_nx_head)) {
			/* no nexus created on this, so detach now */
			nxprov_detach(nxprov, TRUE);
		} else {
			/* detach when last nexus is destroyed */
			ASSERT(nxprov->nxprov_refcnt > 1);
			nxprov->nxprov_flags |= NXPROVF_CLOSED;
		}
	}

	if (!locked) {
		SK_UNLOCK();
	}

	return err;
}
1689 
/*
 * Remove a nexus provider from the global provider list and drop the
 * list's reference.  The caller must hold an additional reference.
 */
static void
nxprov_detach(struct kern_nexus_provider *nxprov, boolean_t locked)
{
	if (!locked) {
		SK_LOCK();
	}

	SK_LOCK_ASSERT_HELD();

#if SK_LOG
	uuid_string_t uuidstr;
	SK_D("nxprov 0x%llx UUID %s flags 0x%b", SK_KVA(nxprov),
	    sk_uuid_unparse(nxprov->nxprov_uuid, uuidstr),
	    nxprov->nxprov_flags, NXPROVF_BITS);
#endif /* SK_LOG */

	ASSERT(nxprov->nxprov_flags & NXPROVF_ATTACHED);
	STAILQ_REMOVE(&nxprov_head, nxprov, kern_nexus_provider, nxprov_link);
	nxprov->nxprov_flags &= ~NXPROVF_ATTACHED;

	/* caller must hold an extra ref */
	ASSERT(nxprov->nxprov_refcnt > 1);
	(void) nxprov_release_locked(nxprov);

	if (!locked) {
		SK_UNLOCK();
	}
}
1718 
1719 static struct kern_nexus_provider *
nxprov_alloc(struct kern_nexus_domain_provider * nxdom_prov,zalloc_flags_t how)1720 nxprov_alloc(struct kern_nexus_domain_provider *nxdom_prov, zalloc_flags_t how)
1721 {
1722 	struct kern_nexus_provider *nxprov;
1723 	struct nxprov_params *nxp;
1724 
1725 	ASSERT(nxdom_prov != NULL);
1726 
1727 	nxp = nxprov_params_alloc(how);
1728 	if (nxp == NULL) {
1729 		SK_ERR("Failed to allocate nxprov_params");
1730 		return NULL;
1731 	}
1732 
1733 	nxprov = zalloc_flags(nxprov_zone, how | Z_ZERO);
1734 	if (nxprov == NULL) {
1735 		SK_ERR("Failed to allocate nxprov");
1736 		nxprov_params_free(nxp);
1737 		return NULL;
1738 	}
1739 
1740 	nxprov->nxprov_dom_prov = nxdom_prov;
1741 	nxprov->nxprov_params = nxp;
1742 	/* hold a reference for nxprov */
1743 	nxdom_prov_retain_locked(nxdom_prov);
1744 
1745 	return nxprov;
1746 }
1747 
/*
 * Last-reference destructor for a nexus provider: drops the domain
 * provider reference, frees the parameters block, and returns the
 * provider to its zone.  Called from nxprov_release_locked().
 */
static void
nxprov_free(struct kern_nexus_provider *nxprov)
{
	struct kern_nexus_domain_provider *nxdom_prov =
	    nxprov->nxprov_dom_prov;

	SK_LOCK_ASSERT_HELD();

	ASSERT(nxdom_prov != NULL);
	(void) nxdom_prov_release_locked(nxdom_prov);
	nxprov->nxprov_dom_prov = NULL;
	ASSERT(nxprov->nxprov_params != NULL);
	nxprov_params_free(nxprov->nxprov_params);
	nxprov->nxprov_params = NULL;
	/* must have been detached from the global list by now */
	ASSERT(!(nxprov->nxprov_flags & NXPROVF_ATTACHED));
	SK_DF(SK_VERB_MEM, "nxprov 0x%llx FREE", SK_KVA(nxprov));
	zfree(nxprov_zone, nxprov);
}
1766 
/*
 * Take a reference on nxprov; caller must hold the global SK_LOCK.
 */
static void
nxprov_retain_locked(struct kern_nexus_provider *nxprov)
{
	SK_LOCK_ASSERT_HELD();

	nxprov->nxprov_refcnt++;
	/* catch reference count wraparound */
	ASSERT(nxprov->nxprov_refcnt != 0);
}
1775 
/*
 * Take a reference on nxprov, acquiring SK_LOCK around the operation.
 */
void
nxprov_retain(struct kern_nexus_provider *nxprov)
{
	SK_LOCK();
	nxprov_retain_locked(nxprov);
	SK_UNLOCK();
}
1783 
1784 static int
nxprov_release_locked(struct kern_nexus_provider * nxprov)1785 nxprov_release_locked(struct kern_nexus_provider *nxprov)
1786 {
1787 	int oldref = nxprov->nxprov_refcnt;
1788 
1789 	SK_LOCK_ASSERT_HELD();
1790 
1791 	ASSERT(nxprov->nxprov_refcnt != 0);
1792 	if (--nxprov->nxprov_refcnt == 0) {
1793 		nxprov_free(nxprov);
1794 	}
1795 
1796 	return oldref == 1;
1797 }
1798 
/*
 * Drop a reference on nxprov, taking SK_LOCK around the operation.
 * Returns non-zero if this was the last reference.
 */
int
nxprov_release(struct kern_nexus_provider *nxprov)
{
	int lastref;

	SK_LOCK();
	lastref = nxprov_release_locked(nxprov);
	SK_UNLOCK();

	return lastref;
}
1810 
/*
 * Allocate a zero-filled nexus provider parameters block.
 */
struct nxprov_params *
nxprov_params_alloc(zalloc_flags_t how)
{
	return zalloc_flags(nxprov_params_zone, how | Z_ZERO);
}
1816 
/*
 * Return a nexus provider parameters block to its zone.
 */
void
nxprov_params_free(struct nxprov_params *nxp)
{
	SK_DF(SK_VERB_MEM, "nxp 0x%llx FREE", SK_KVA(nxp));
	zfree(nxprov_params_zone, nxp);
}
1823 
/*
 * Validate a caller-supplied packet buffer pool against the
 * requirements of a nexus provider.  Returns 0 if compatible,
 * ENOTSUP for built-in/closed pools, EINVAL on config mismatch.
 */
static int
nx_check_pp(struct kern_nexus_provider *nxprov, struct kern_pbufpool *pp)
{
	struct kern_nexus_domain_provider *nxdom_prov = nxprov->nxprov_dom_prov;

	/* only externally-created pools that are still open qualify */
	if ((pp->pp_flags & (PPF_EXTERNAL | PPF_CLOSED)) != PPF_EXTERNAL) {
		SK_ERR("Rejecting \"%s\" built-in pp", pp->pp_name);
		return ENOTSUP;
	}

	/*
	 * Require that the nexus domain metadata type and the
	 * metadata type of the caller-provided pbufpool match.
	 */
	if (nxdom_prov->nxdom_prov_dom->nxdom_md_type !=
	    pp->pp_md_type ||
	    nxdom_prov->nxdom_prov_dom->nxdom_md_subtype !=
	    pp->pp_md_subtype) {
		SK_ERR("Mismatch in metadata type/subtype "
		    "(%u/%u != %u/%u)", pp->pp_md_type,
		    nxdom_prov->nxdom_prov_dom->nxdom_md_type,
		    pp->pp_md_subtype,
		    nxdom_prov->nxdom_prov_dom->nxdom_md_subtype);
		return EINVAL;
	}

	/*
	 * Require that the nexus provider memory configuration
	 * has the same impedance as the caller-provided one.
	 * Both need to be lacking or present; if one of them
	 * is set and the other isn't, then we bail.
	 */
	if (!!(PP_BUF_REGION_DEF(pp)->skr_mode & SKR_MODE_MONOLITHIC) ^
	    !!(nxprov->nxprov_ext.nxpi_flags & NXPIF_MONOLITHIC)) {
		SK_ERR("Memory config mismatch: monolithic mode");
		return EINVAL;
	}

	return 0;
}
1864 
/*
 * Create a nexus instance on the provider identified by nxprov_uuid
 * (which must be owned by nxctl).  dom_type, when not UNDEFINED, must
 * match the provider's domain.  Optional tx/rx packet pools are
 * validated and retained; netif logical-link providers require both.
 * On success the nexus is inserted into the provider and global lists
 * and returned with a reference held for the caller; on failure NULL
 * is returned with *err set.
 */
struct kern_nexus *
nx_create(struct nxctl *nxctl, const uuid_t nxprov_uuid,
    const nexus_type_t dom_type, const void *nx_ctx,
    nexus_ctx_release_fn_t nx_ctx_release, struct kern_pbufpool *tx_pp,
    struct kern_pbufpool *rx_pp, int *err)
{
	struct kern_nexus_domain_provider *nxdom_prov;
	struct kern_nexus_provider *nxprov = NULL;
	struct kern_nexus *nx = NULL;
#if SK_LOG
	uuid_string_t uuidstr;
#endif /* SK_LOG */

	NXCTL_LOCK_ASSERT_HELD(nxctl);

	ASSERT(dom_type < NEXUS_TYPE_MAX);
	ASSERT(!uuid_is_null(nxprov_uuid));
	*err = 0;

	SK_LOCK();

	/* find the provider; it must belong to this controller */
	STAILQ_FOREACH(nxprov, &nxprov_head, nxprov_link) {
		if (nxctl == nxprov->nxprov_ctl &&
		    uuid_compare(nxprov_uuid, nxprov->nxprov_uuid) == 0) {
			break;
		}
	}

	if (nxprov == NULL || (nxprov->nxprov_flags & NXPROVF_CLOSED)) {
		SK_ERR("Provider not found or has been closed");
		*err = ENOENT;
		goto done;
	}

	nxdom_prov = nxprov->nxprov_dom_prov;
	if (dom_type != NEXUS_TYPE_UNDEFINED &&
	    (nxdom_prov->nxdom_prov_dom->nxdom_type != dom_type)) {
		SK_ERR("Mismatch in domain type (0x%u != 0x%u)",
		    dom_type, nxdom_prov->nxdom_prov_dom->nxdom_type);
		nxdom_prov = NULL;
		nxprov = NULL;
		*err = ENODEV;
		goto done;
	}

	/* netif logical-link providers must supply both packet pools */
	if ((dom_type == NEXUS_TYPE_NET_IF) && NXPROV_LLINK(nxprov) &&
	    (!tx_pp || !rx_pp)) {
#if SK_LOG
		SK_ERR("TX/RX packet pool is required for netif logical link "
		    "nexus provider UUID: %s",
		    sk_uuid_unparse(nxprov_uuid, uuidstr));
#endif /* SK_LOG */
		nxdom_prov = NULL;
		nxprov = NULL;
		*err = EINVAL;
		goto done;
	}

	if ((tx_pp != NULL && (*err = nx_check_pp(nxprov, tx_pp)) != 0) ||
	    (rx_pp != NULL && (*err = nx_check_pp(nxprov, rx_pp)) != 0)) {
		goto done;
	}

	nx = nx_alloc(Z_WAITOK);

	STAILQ_INIT(&nx->nx_ch_head);
	STAILQ_INIT(&nx->nx_ch_nonxref_head);
	lck_rw_init(&nx->nx_ch_if_adv_lock, &nexus_lock_group,
	    &nexus_lock_attr);
	STAILQ_INIT(&nx->nx_ch_if_adv_head);
	uuid_generate_random(nx->nx_uuid);
	nx->nx_prov = nxprov;
	nx->nx_ctx = __DECONST(void *, nx_ctx);
	nx->nx_ctx_release = nx_ctx_release;
	nx->nx_id = nxdom_prov->nxdom_prov_gencnt++;

	if (tx_pp != NULL) {
		nx->nx_tx_pp = tx_pp;
		pp_retain(tx_pp);       /* released by nx_free */
	}

	if (rx_pp != NULL) {
		nx->nx_rx_pp = rx_pp;
		pp_retain(rx_pp);       /* released by nx_free */
	}

	/* this nexus is alive; tell the nexus constructor to set it up */
	if (nxprov->nxprov_dom_prov->nxdom_prov_nx_ctor != NULL) {
		*err = nxprov->nxprov_dom_prov->nxdom_prov_nx_ctor(nx);
		if (*err != 0) {
			/* detach from provider; nx_free reclaims the rest */
			nx->nx_prov = NULL;
			goto done;
		}
	}

	nxprov_retain_locked(nxprov);   /* hold a ref on the nexus reg */

	STAILQ_INSERT_TAIL(&nxprov->nxprov_nx_head, nx, nx_prov_link);
	nxprov->nxprov_nx_count++;
	RB_INSERT(kern_nexus_tree, &nx_head, nx);
	os_atomic_or(&nx->nx_flags, NXF_ATTACHED, relaxed);

	nx_retain_locked(nx);   /* one for the provider list */
	nx_retain_locked(nx);   /* one for the global list */
	nx_retain_locked(nx);   /* one for the caller */

#if SK_LOG
	SK_D("nexus 0x%llx (%s:%s) UUID %s", SK_KVA(nx),
	    nxdom_prov->nxdom_prov_dom->nxdom_name,
	    nxdom_prov->nxdom_prov_name, sk_uuid_unparse(nx->nx_uuid, uuidstr));
#endif /* SK_LOG */
done:
	SK_UNLOCK();

	if (*err != 0) {
		if (nx != NULL) {
			nx_free(nx);
			nx = NULL;
		}
	}
	return nx;
}
1987 
1988 int
nx_destroy(struct nxctl * nxctl,const uuid_t nx_uuid)1989 nx_destroy(struct nxctl *nxctl, const uuid_t nx_uuid)
1990 {
1991 	struct kern_nexus *nx = NULL;
1992 	struct kern_nexus find;
1993 	int err = 0;
1994 
1995 	NXCTL_LOCK_ASSERT_HELD(nxctl);
1996 
1997 	SK_LOCK();
1998 
1999 	uuid_copy(find.nx_uuid, nx_uuid);
2000 	nx = RB_FIND(kern_nexus_tree, &nx_head, &find);
2001 	if (nx != NULL && nxctl != NX_PROV(nx)->nxprov_ctl) {
2002 		nx = NULL;
2003 	}
2004 
2005 	if (nx != NULL) {
2006 		nx_retain_locked(nx);
2007 	}
2008 
2009 	if (nx == NULL) {
2010 		err = ENOENT;
2011 	} else {
2012 		err = nx_close(nx, TRUE);
2013 		(void) nx_release_locked(nx);
2014 	}
2015 
2016 	SK_UNLOCK();
2017 
2018 	return err;
2019 }
2020 
/*
 * RB-tree comparator for nexus instances; orders by nexus UUID.
 */
static inline int
nx_cmp(const struct kern_nexus *a, const struct kern_nexus *b)
{
	return uuid_compare(a->nx_uuid, b->nx_uuid);
}
2026 
2027 struct kern_nexus *
nx_find(const uuid_t nx_uuid,boolean_t locked)2028 nx_find(const uuid_t nx_uuid, boolean_t locked)
2029 {
2030 	struct kern_nexus *nx = NULL;
2031 	struct kern_nexus find;
2032 
2033 	if (!locked) {
2034 		SK_LOCK();
2035 	}
2036 
2037 	SK_LOCK_ASSERT_HELD();
2038 
2039 	uuid_copy(find.nx_uuid, nx_uuid);
2040 	nx = RB_FIND(kern_nexus_tree, &nx_head, &find);
2041 	if (nx != NULL && (nx->nx_flags & NXF_CLOSED)) {
2042 		nx = NULL;
2043 	}
2044 
2045 	/* return reference to caller */
2046 	if (nx != NULL) {
2047 		nx_retain_locked(nx);
2048 	}
2049 
2050 	if (!locked) {
2051 		SK_UNLOCK();
2052 	}
2053 
2054 	return nx;
2055 }
2056 
/*
 * Close a nexus.  If no channels remain open to it, the nexus is
 * detached immediately; otherwise NXF_CLOSED is set and the detach is
 * deferred until the last channel closes.  Returns EALREADY if the
 * nexus was already closed.
 */
int
nx_close(struct kern_nexus *nx, boolean_t locked)
{
	int err = 0;

	if (!locked) {
		SK_LOCK();
	}

	SK_LOCK_ASSERT_HELD();


	if (nx->nx_flags & NXF_CLOSED) {
		err = EALREADY;
	} else {
#if SK_LOG
		uuid_string_t uuidstr;
		SK_D("nexus 0x%llx (%s:%s) UUID %s flags 0x%b", SK_KVA(nx),
		    NX_DOM(nx)->nxdom_name, NX_DOM_PROV(nx)->nxdom_prov_name,
		    sk_uuid_unparse(nx->nx_uuid, uuidstr), nx->nx_flags,
		    NXF_BITS);
#endif /* SK_LOG */

		if (STAILQ_EMPTY(&nx->nx_ch_head)) {
			/* no regular channels open to it, so detach now */
			nx_detach(nx);
		} else {
			/* detach when the last channel closes */
			ASSERT(nx->nx_refcnt > 3);
			os_atomic_or(&nx->nx_flags, NXF_CLOSED, relaxed);
		}
	}

	if (!locked) {
		SK_UNLOCK();
	}

	return err;
}
2096 
2097 void
nx_stop(struct kern_nexus * nx)2098 nx_stop(struct kern_nexus *nx)
2099 {
2100 	struct kern_nexus_provider *nxprov = nx->nx_prov;
2101 
2102 	SK_LOCK_ASSERT_HELD();
2103 
2104 	/* send a stop message */
2105 	if (nxprov->nxprov_dom_prov->nxdom_prov_nx_stop != NULL) {
2106 		nxprov->nxprov_dom_prov->nxdom_prov_nx_stop(nx);
2107 	}
2108 }
2109 
/*
 * Detach the nexus from its provider: run the domain provider's
 * destructor, unlink it from the provider's nexus list and the global
 * tree, release the nexus context, and drop the two list references.
 * If this was the provider's last nexus and the provider was already
 * closed, the postponed provider detach is completed here too.
 */
void
nx_detach(struct kern_nexus *nx)
{
	struct kern_nexus_provider *nxprov = nx->nx_prov;

	SK_LOCK_ASSERT_HELD();

#if SK_LOG
	uuid_string_t uuidstr;
	SK_D("nexus 0x%llx UUID %s flags 0x%b", SK_KVA(nx),
	    sk_uuid_unparse(nx->nx_uuid, uuidstr), nx->nx_flags, NXF_BITS);
#endif /* SK_LOG */

	/* Caller must hold extra refs, on top of the two in reg/global lists */
	ASSERT(nx->nx_refcnt >= 3);
	ASSERT(nx->nx_flags & NXF_ATTACHED);

	/* this nexus is done; let the nexus destructor do final cleanups */
	if (nxprov->nxprov_dom_prov->nxdom_prov_nx_dtor != NULL) {
		nxprov->nxprov_dom_prov->nxdom_prov_nx_dtor(nx);
	}

	/* no channels may remain open at this point */
	ASSERT(STAILQ_EMPTY(&nx->nx_ch_head));
	ASSERT(STAILQ_EMPTY(&nx->nx_ch_nonxref_head));

	/* unlink from provider list and the global nexus tree */
	STAILQ_REMOVE(&nxprov->nxprov_nx_head, nx, kern_nexus, nx_prov_link);
	nxprov->nxprov_nx_count--;
	RB_REMOVE(kern_nexus_tree, &nx_head, nx);
	os_atomic_andnot(&nx->nx_flags, NXF_ATTACHED, relaxed);
	nx->nx_prov = NULL;
	if (nx->nx_ctx_release != NULL) {
		nx->nx_ctx_release(nx->nx_ctx);
	}
	nx->nx_ctx = NULL;

	(void) nx_release_locked(nx);   /* one for the reg list */
	(void) nx_release_locked(nx);   /* one for the global list */

	/*
	 * If this was the last nexus and the provider has been closed,
	 * detach the provider and finish up the postponed job.
	 */
	if (STAILQ_EMPTY(&nxprov->nxprov_nx_head) &&
	    (nxprov->nxprov_flags & NXPROVF_CLOSED)) {
		nxprov_detach(nxprov, TRUE);
	}
	(void) nxprov_release_locked(nxprov);
}
2158 
/*
 * Create the shared nexus advisory region and object for this nexus.
 * "type" selects between the flowswitch and netif advisory layouts;
 * the metadata header is written first, and the type-specific area
 * follows immediately after it.  Returns ENOMEM only if the backing
 * region cannot be created (the object allocation itself panics on
 * failure via SKMEM_PANIC).
 */
int
nx_advisory_alloc(struct kern_nexus *nx, const char *name,
    struct skmem_region_params *srp_nexusadv, nexus_advisory_type_t type)
{
	struct __kern_nexus_adv_metadata *adv_md;
	uint32_t msize = 0;
	/* -fbounds-safety: why do we need maddr? */
	void *__sized_by(msize) maddr = NULL;

	/* layout invariants: header + either payload must fit the object */
	_CASSERT(sizeof(struct __kern_nexus_adv_metadata) == sizeof(uint64_t));
	_CASSERT((sizeof(struct sk_nexusadv) +
	    sizeof(struct __kern_nexus_adv_metadata)) <= NX_NEXUSADV_MAX_SZ);
	_CASSERT((sizeof(struct netif_nexus_advisory) +
	    sizeof(struct __kern_nexus_adv_metadata)) <= NX_NEXUSADV_MAX_SZ);
	ASSERT(nx->nx_adv.nxv_reg == NULL);
	ASSERT(nx->nx_adv.nxv_adv == NULL);
	ASSERT(type == NEXUS_ADVISORY_TYPE_FLOWSWITCH ||
	    type == NEXUS_ADVISORY_TYPE_NETIF);

	if ((nx->nx_adv.nxv_reg = skmem_region_create(name, srp_nexusadv,
	    NULL, NULL, NULL)) == NULL) {
		return ENOMEM;
	}

	/* SKMEM_PANIC: this allocation is not allowed to fail softly */
	nx->nx_adv.nxv_adv = skmem_region_alloc(nx->nx_adv.nxv_reg, &maddr,
	    NULL, NULL, (SKMEM_NOSLEEP | SKMEM_PANIC),
	    nx->nx_adv.nxv_reg->skr_c_obj_size, &msize);
	nx->nx_adv.nxv_adv_size = nx->nx_adv.nxv_reg->skr_c_obj_size;
	adv_md = nx->nx_adv.nxv_adv;
	adv_md->knam_version = NX_ADVISORY_MD_CURRENT_VERSION;
	adv_md->knam_type = type;
	adv_md->__reserved = 0;
	nx->nx_adv.nxv_adv_type = type;
	/* type-specific advisory area starts right after the metadata */
	nx->nx_adv.flowswitch_nxv_adv = (void *)(adv_md + 1);
	if (type == NEXUS_ADVISORY_TYPE_FLOWSWITCH) {
		nx->nx_adv.flowswitch_nxv_adv->nxadv_ver =
		    NX_FLOWSWITCH_ADVISORY_CURRENT_VERSION;
	} else {
		nx->nx_adv.netif_nxv_adv->nna_version =
		    NX_NETIF_ADVISORY_CURRENT_VERSION;
	}
	return 0;
}
2202 
/*
 * Tear down the advisory object and region created by
 * nx_advisory_alloc(); a no-op if none was ever allocated.
 */
void
nx_advisory_free(struct kern_nexus *nx)
{
	if (nx->nx_adv.nxv_reg != NULL) {
		ASSERT(nx->nx_adv.nxv_adv != NULL);
		/* return the advisory object to its region... */
		skmem_region_free(nx->nx_adv.nxv_reg,
		    nx->nx_adv.nxv_adv, NULL);
		nx->nx_adv.nxv_adv = NULL;
		nx->nx_adv.nxv_adv_size = 0;
		nx->nx_adv.nxv_adv_type = NEXUS_ADVISORY_TYPE_INVALID;
		nx->nx_adv.flowswitch_nxv_adv = NULL;
		/* ...then drop the region itself */
		skmem_region_release(nx->nx_adv.nxv_reg);
		nx->nx_adv.nxv_reg = NULL;
	}

	ASSERT(nx->nx_adv.nxv_reg == NULL);
	ASSERT(nx->nx_adv.nxv_adv == NULL);
	ASSERT(nx->nx_adv.nxv_adv_type == NEXUS_ADVISORY_TYPE_INVALID);
	ASSERT(nx->nx_adv.flowswitch_nxv_adv == NULL);
}
2223 
/*
 * Allocate a zero-filled kern_nexus from its zone; "how" controls
 * blocking behavior (e.g. Z_WAITOK vs. Z_NOWAIT).
 */
static struct kern_nexus *
nx_alloc(zalloc_flags_t how)
{
	SK_LOCK_ASSERT_HELD();

	return zalloc_flags(nx_zone, how | Z_ZERO);
}
2231 
/*
 * Final destructor for a nexus, reached when the last reference is
 * dropped.  The nexus must already be detached from its provider and
 * have no channels open.
 */
static void
nx_free(struct kern_nexus *nx)
{
	ASSERT(!(nx->nx_flags & NXF_ATTACHED) && nx->nx_prov == NULL);
	ASSERT(STAILQ_EMPTY(&nx->nx_ch_head));
	ASSERT(STAILQ_EMPTY(&nx->nx_ch_nonxref_head));

	/* release all port bindings, info records and the port arrays */
	nx_port_free_all(nx);

	/* drop our references on the TX/RX packet pools, if any */
	if (nx->nx_tx_pp != NULL) {
		pp_release(nx->nx_tx_pp);
		nx->nx_tx_pp = NULL;
	}
	if (nx->nx_rx_pp != NULL) {
		pp_release(nx->nx_rx_pp);
		nx->nx_rx_pp = NULL;
	}

	ASSERT(STAILQ_EMPTY(&nx->nx_ch_if_adv_head));
	lck_rw_destroy(&nx->nx_ch_if_adv_lock, &nexus_lock_group);

	SK_DF(SK_VERB_MEM, "nexus 0x%llx FREE", SK_KVA(nx));
	zfree(nx_zone, nx);
}
2256 
/*
 * Take a reference on the nexus; SK_LOCK must be held by the caller.
 */
void
nx_retain_locked(struct kern_nexus *nx)
{
	SK_LOCK_ASSERT_HELD();

	nx->nx_refcnt++;
	VERIFY(nx->nx_refcnt > 0);      /* catch refcount wrap */
}
2265 
/*
 * Take a reference on the nexus, acquiring SK_LOCK internally.
 */
void
nx_retain(struct kern_nexus *nx)
{
	SK_LOCK();
	nx_retain_locked(nx);
	SK_UNLOCK();
}
2273 
2274 int
nx_release_locked(struct kern_nexus * nx)2275 nx_release_locked(struct kern_nexus *nx)
2276 {
2277 	int oldref = nx->nx_refcnt;
2278 
2279 	SK_LOCK_ASSERT_HELD();
2280 
2281 	VERIFY(nx->nx_refcnt > 0);
2282 	if (--nx->nx_refcnt == 0) {
2283 		nx_free(nx);
2284 	}
2285 
2286 	return oldref == 1;
2287 }
2288 
/*
 * Drop a reference, acquiring SK_LOCK internally; returns non-zero
 * iff this was the last reference (the nexus has been freed).
 */
int
nx_release(struct kern_nexus *nx)
{
	int lastref;

	SK_LOCK_ASSERT_NOTHELD();

	SK_LOCK();
	lastref = nx_release_locked(nx);
	SK_UNLOCK();

	return lastref;
}
2302 
/*
 * Invoke the external provider's per-ring init callback (and per-slot
 * init via nx_init_slots()) on every non-host ring of the channel's
 * adapter.  Called during channel connect, after CHANF_EXT_PRECONNECT
 * is set but before CHANF_EXT_CONNECTED.  On any failure the rings
 * initialized so far are unwound through nx_fini_rings().
 */
static int
nx_init_rings(struct kern_nexus *nx, struct kern_channel *ch)
{
	struct kern_nexus_provider *nxprov = NX_PROV(nx);
	struct nexus_adapter *na = ch->ch_na;
	boolean_t undo = FALSE;
	int ksd_retains = 0;
	enum txrx t;
	int err = 0;

	ASSERT((ch->ch_flags & (CHANF_EXT_PRECONNECT | CHANF_EXT_CONNECTED)) ==
	    CHANF_EXT_PRECONNECT);

	/* nothing to do for providers without a ring init callback */
	if (nxprov->nxprov_ext.nxpi_ring_init == NULL) {
		return 0;
	}

	for_rx_tx(t) {
		uint32_t i;

		for (i = 0; i < na_get_nrings(na, t); i++) {
			struct __kern_channel_ring *kring = &NAKR(na, t)[i];

			/* skip host rings */
			if (kring->ckr_flags & CKRF_HOST) {
				continue;
			}

			if ((err = nxprov->nxprov_ext.nxpi_ring_init(
				    nxprov, nx, ch, kring, (kring->ckr_tx == NR_TX),
				    &kring->ckr_ctx)) != 0) {
				SK_D("ch 0x%llx flags %b nx 0x%llx kr \"%s\" "
				    "(0x%llx) krflags %b ring_init error %d",
				    SK_KVA(ch), ch->ch_flags, CHANF_BITS,
				    SK_KVA(nx), kring->ckr_name, SK_KVA(kring),
				    kring->ckr_flags, CKRF_BITS, err);
				kring->ckr_ctx = NULL;
				undo = TRUE;
				break;
			}
			kring->ckr_flags |= CKRF_EXT_RING_INITED;

			if ((err = nx_init_slots(nx, kring)) != 0) {
				undo = TRUE;
				break;
			}

			/* count rings whose slots now hold provider contexts */
			if (kring->ckr_flags & CKRF_EXT_SLOTS_INITED) {
				++ksd_retains;
			}
		}
		if (undo) {
			break;
		}
	}

	/*
	 * Note: retain KSD even in case of error, as we have set
	 * CKRF_EXT_SLOTS_INITED flag for some of the rings
	 * nx_fini_rings would take care of release based on it.
	 */
	if (ksd_retains != 0) {
		/*
		 * Mark the kernel slot descriptor region as busy; this
		 * prevents it from being torn-down at channel defunct
		 * time, as we need to invoke the slot_fini() callback
		 * for each slot and we need the descriptors until then.
		 */
		skmem_arena_nexus_sd_set_noidle(skmem_arena_nexus(na->na_arena),
		    ksd_retains);
	}

	if (err != 0) {
		ASSERT(undo);
		nx_fini_rings(nx, ch);
	}

	return err;
}
2382 
/*
 * Counterpart of nx_init_rings(): invoke the provider's ring_fini
 * callback and slot teardown for every ring that was successfully
 * initialized, then drop the busy retains that were placed on the
 * kernel slot descriptor region.
 */
static void
nx_fini_rings(struct kern_nexus *nx, struct kern_channel *ch)
{
	struct kern_nexus_provider *nxprov = NX_PROV(nx);
	struct nexus_adapter *na = ch->ch_na;
	int ksd_releases = 0;
	enum txrx t;

	for_rx_tx(t) {
		uint32_t i;

		for (i = 0; i < na_get_nrings(na, t); i++) {
			struct __kern_channel_ring *kring = &NAKR(na, t)[i];

			/* only rings that passed nxpi_ring_init() */
			if (!(kring->ckr_flags & CKRF_EXT_RING_INITED)) {
				continue;
			}

			ASSERT(!(kring->ckr_flags & CKRF_HOST));
			ASSERT(nxprov->nxprov_ext.nxpi_ring_fini != NULL);
			nxprov->nxprov_ext.nxpi_ring_fini(nxprov, nx, kring);
			kring->ckr_flags &= ~CKRF_EXT_RING_INITED;

			if (kring->ckr_flags & CKRF_EXT_SLOTS_INITED) {
				++ksd_releases;
			}

			/*
			 * Undo the work done in nx_init_slots() and inform
			 * the external domain provider, if applicable, that
			 * the slots for this ring are no longer valid.
			 */
			nx_fini_slots(nx, kring);
			kring->ckr_ctx = NULL;
		}
	}

	if (ksd_releases != 0) {
		/*
		 * Now that we've finished invoking the slot_fini()
		 * callbacks, release the busy retain counts held
		 * earlier in nx_init_rings().  This will allow the
		 * kernel slot descriptor region to be torn down.
		 */
		skmem_arena_nexus_sd_set_noidle(
			skmem_arena_nexus(na->na_arena), -ksd_releases);
	}
}
2431 
/*
 * Invoke the provider's per-slot init callback for every slot of the
 * kring, storing the returned per-slot context argument.  On success
 * CKRF_EXT_SLOTS_INITED is set; a partial failure is unwound via
 * nx_fini_slots() (ckr_slot_ctxs_set tracks how many slots were
 * initialized).
 */
static int
nx_init_slots(struct kern_nexus *nx, struct __kern_channel_ring *kring)
{
	struct kern_nexus_provider *nxprov = NX_PROV(nx);
	struct __slot_desc *slot = kring->ckr_ksds;
	int err = 0;
	uint32_t i;

	/*
	 * If the slot init callback was not provided, or if the
	 * kring was not created to hold any slot contexts, don't
	 * go any further.
	 */
	if (nxprov->nxprov_ext.nxpi_slot_init == NULL ||
	    kring->ckr_slot_ctxs == NULL) {
		return 0;
	}

	ASSERT(kring->ckr_slot_ctxs_set == 0);
	ASSERT(slot != NULL);

	for (i = 0; i < kring->ckr_num_slots; i++) {
		struct kern_slot_prop *__single slot_ctx_prop = NULL;
		/* -fbounds-safety: slot_ctx is unsafe anyway (mach_vmaddr_t) */
		void *__single slot_ctx_arg = NULL;

		ASSERT(&slot[i] <= kring->ckr_ksds_last);
		if ((err = nxprov->nxprov_ext.nxpi_slot_init(nxprov, nx, kring,
		    &slot[i], i, &slot_ctx_prop, &slot_ctx_arg)) != 0) {
			SK_D("nx 0x%llx kr \"%s\" (0x%llx) krflags %b slot %u "
			    "slot_init error %d", SK_KVA(nx), kring->ckr_name,
			    SK_KVA(kring), kring->ckr_flags, CKRF_BITS, i, err);
			break;
		}
		/* we don't want this to be used by client, so verify here */
		ASSERT(slot_ctx_prop == NULL);
		kring->ckr_slot_ctxs[i].slot_ctx_arg = slot_ctx_arg;
		kring->ckr_slot_ctxs_set++;
	}

	if (err != 0) {
		/* partial failure: undo the slots initialized so far */
		nx_fini_slots(nx, kring);
	} else {
		kring->ckr_flags |= CKRF_EXT_SLOTS_INITED;
	}

	return err;
}
2480 
/*
 * Invoke the provider's slot_fini callback for each slot that was
 * initialized (the first ckr_slot_ctxs_set slots), clear the stored
 * slot contexts, and drop CKRF_EXT_SLOTS_INITED.  Safe to call on a
 * partially initialized kring.
 */
static void
nx_fini_slots(struct kern_nexus *nx, struct __kern_channel_ring *kring)
{
	struct kern_nexus_provider *nxprov = NX_PROV(nx);
	struct __slot_desc *slot = kring->ckr_ksds;
	uint32_t i;

	ASSERT(!(kring->ckr_flags & CKRF_EXT_SLOTS_INITED) ||
	    nxprov->nxprov_ext.nxpi_slot_fini != NULL);
	ASSERT(slot != NULL || !(kring->ckr_flags & CKRF_EXT_SLOTS_INITED));

	for (i = 0; i < kring->ckr_slot_ctxs_set; i++) {
		ASSERT(slot != NULL && &slot[i] <= kring->ckr_ksds_last);
		if (nxprov->nxprov_ext.nxpi_slot_fini != NULL) {
			nxprov->nxprov_ext.nxpi_slot_fini(nxprov, nx,
			    kring, &slot[i], i);
		}
		if (kring->ckr_slot_ctxs != NULL) {
			kring->ckr_slot_ctxs[i].slot_ctx_arg = 0;
		}
	}
	kring->ckr_slot_ctxs_set = 0;

	/* We're done with this kring */
	kring->ckr_flags &= ~CKRF_EXT_SLOTS_INITED;
}
2507 
2508 
/*
 * 64-bit mask with range: bits [_beg, _end] (inclusive) set within a
 * single NX_PORT_CHUNK-wide bitmap word, all other bits clear.
 */
#define BMASK64(_beg, _end)     \
	((NX_PORT_CHUNK_FREE >> (63 - (_end))) & ~((1ULL << (_beg)) - 1))
2512 
/*
 * Find the first available port in the half-open range [first, last).
 * If the range extends beyond the currently allocated port map, the
 * returned *nx_port may point past the map so that a subsequent
 * nx_port_alloc() grows it.  Returns EBUSY only when the requested
 * range lies entirely within the map and every port in it is taken.
 */
int
nx_port_find(struct kern_nexus *nx, nexus_port_t first,
    nexus_port_t last, nexus_port_t *nx_port)
{
	int err = 0;

	ASSERT(first < last);
	*nx_port = NEXUS_PORT_ANY;

	if (nx->nx_num_ports == 0 || (first + 1) >= nx->nx_num_ports) {
		/*
		 * Left edge of the range is beyond the current map;
		 * let nx_port_alloc() handle the growing later.
		 */
		*nx_port = first;
	} else {
		nexus_port_size_t fc = (first / NX_PORT_CHUNK);
		nexus_port_size_t lc = (MIN(last, nx->nx_num_ports) / NX_PORT_CHUNK);
		nexus_port_size_t lim = (nx->nx_num_ports / NX_PORT_CHUNK);
		nexus_port_size_t i, j;
		bitmap_t *bmap;

		/*
		 * The right edge of the range is either within or
		 * beyond the current map; scan thru the current
		 * map and find the first available port.
		 */
		for (i = fc; i <= lc; i++) {
			bitmap_t mask;
			nexus_port_size_t beg = 0, end = 63;

			/* clamp the mask to the range's edges in this chunk */
			if (i == fc) {
				beg = (first % NX_PORT_CHUNK);
			}
			if (i == (last / NX_PORT_CHUNK)) {
				end = (last % NX_PORT_CHUNK);
			}

			if (i < lim) {
				bmap = &nx->nx_ports_bmap[i];
				mask = BMASK64(beg, end);

				/* ffsll: 1-based index of lowest set bit */
				j = (nexus_port_size_t)ffsll((*bmap) & mask);
				if (j == 0) {
					continue;
				}

				--j;
				*nx_port = (i * NX_PORT_CHUNK) + j;
			}
			break;
		}

		/*
		 * If the requested range is within the current map and we
		 * couldn't find a port, return an err.  Otherwise, return
		 * the next port index to trigger growing later.
		 */
		if (*nx_port == NEXUS_PORT_ANY) {
			if (lc == (last / NX_PORT_CHUNK)) {
				err = EBUSY;
				SK_ERR("port unavail in [%u, %u)", first, last);
			} else {
				*nx_port = nx->nx_num_ports;
			}
		}
	}

	SK_DF(SK_VERB_NXPORT, "nx 0x%llx nx_port %d (err %d)", SK_KVA(nx),
	    (int)*nx_port, err);

	return err;
}
2586 
/*
 * Grow the free-port bitmap and the nx_ports array by "grow" ports
 * (a multiple of NX_PORT_CHUNK), up to the domain's port limit.
 * Returns EDOM when the limit would be exceeded and ENOMEM on
 * allocation failure.  NOTE: if the nx_ports realloc fails after the
 * bitmap was already grown, the larger bitmap is intentionally kept
 * (nx_num_ports is not updated, so the extra chunks stay unused).
 */
static int
nx_port_grow(struct kern_nexus *nx, nexus_port_size_t grow)
{
	ASSERT(NXDOM_MAX(NX_DOM(nx), ports) <= NEXUS_PORT_MAX);
	nexus_port_t dom_port_max = (nexus_port_size_t)NXDOM_MAX(NX_DOM(nx), ports);
	struct nx_port_info *ports;
	size_t limit;
	nexus_port_size_t i, num_ports, old_num_ports;
	bitmap_t *bmap;

	ASSERT(grow > 0 && (grow % NX_PORT_CHUNK) == 0);
	ASSERT((nx->nx_num_ports % NX_PORT_CHUNK) == 0);
	_CASSERT((sizeof(*bmap) * 8) == NX_PORT_CHUNK);
	ASSERT(powerof2(dom_port_max));
	ASSERT(dom_port_max % NX_PORT_CHUNK == 0);

	old_num_ports = nx->nx_num_ports;
	num_ports = nx->nx_num_ports + grow;
	limit = P2ROUNDUP(dom_port_max, NX_PORT_CHUNK);
	if (num_ports > limit) {
		SK_ERR("can't grow, total %u grow %u (new %u > dom_max %u)",
		    nx->nx_num_ports, grow, num_ports, limit);
		return EDOM;
	}

	/* grow the bitmap first (one bitmap_t word per chunk) */
	if ((bmap = sk_realloc_data(nx->nx_ports_bmap,
	    (old_num_ports / NX_PORT_CHUNK) * sizeof(*bmap),
	    (num_ports / NX_PORT_CHUNK) * sizeof(*bmap),
	    Z_WAITOK, skmem_tag_nx_port)) == NULL) {
		SK_ERR("bmap alloc failed, num_port %u", num_ports);
		return ENOMEM;
	}
	nx->nx_ports_bmap = bmap;
	nx->nx_ports_bmap_size = (num_ports / NX_PORT_CHUNK) * sizeof(*bmap);

	if ((ports = sk_realloc_type_array(struct nx_port_info, old_num_ports,
	    num_ports, nx->nx_ports, Z_WAITOK, skmem_tag_nx_port)) == NULL) {
		/* can't free bmap here, otherwise nexus won't work */
		SK_ERR("nx_ports alloc failed, num_port %u", num_ports);
		return ENOMEM;
	}

	/* initialize the additional new ports */
	bzero(&ports[nx->nx_num_ports], (grow * sizeof(*ports)));

	/* initialize new bitmaps (set all bits) */
	for (i = (nx->nx_num_ports / NX_PORT_CHUNK);
	    i < (num_ports / NX_PORT_CHUNK); i++) {
		bmap[i] = NX_PORT_CHUNK_FREE;
	}

	/*
	 * -fbounds-safety: Not sure if moving nx_ports assignment down here
	 * would cause a regression.
	 */
	nx->nx_ports = ports;
	nx->nx_num_ports = num_ports;

	SK_DF(SK_VERB_NXPORT, "!!! nx 0x%llx ports %u/%u, %u ports added",
	    SK_KVA(nx), nx->nx_active_ports, nx->nx_num_ports, grow);

	return 0;
}
2650 
/*
 * Allocate (or look up) the given nexus port, growing the port map
 * as needed.  Semantics depend on the caller-supplied adapter slot:
 *   - na == NULL: existential check / reservation bookkeeping only.
 *   - *na == NULL: request a retained reference to the adapter
 *     already occupying the port (if any).
 *   - *na != NULL: claim the port and associate it with *na.
 * Binding credentials (nxb) are verified for user-process requests
 * on named (non-anonymous) nexus ports.
 */
int
nx_port_alloc(struct kern_nexus *nx, nexus_port_t nx_port, struct nxbind *nxb,
    struct nexus_adapter **na, struct proc *p)
{
	struct nx_port_info *npi = NULL;
	struct nxbind *nxb0;
	size_t g;
	uint32_t i, j;
	bitmap_t *bmap;
	bool refonly = false;
	int err = 0;

	ASSERT(nx_port != NEXUS_PORT_ANY);
	ASSERT((nx->nx_num_ports % NX_PORT_CHUNK) == 0);

	/* port is zero-based, so adjust here */
	if ((nx_port + 1) > nx->nx_num_ports) {
		g = P2ROUNDUP((nx_port + 1) - nx->nx_num_ports, NX_PORT_CHUNK);
		VERIFY(g <= NEXUS_PORT_MAX);
		if ((err = nx_port_grow(nx, (nexus_port_size_t)g)) != 0) {
			goto done;
		}
	}
	ASSERT(err == 0);
	ASSERT(nx_port < nx->nx_num_ports);
	npi = &nx->nx_ports[nx_port];
	nxb0 = npi->npi_nxb;
	i = nx_port / NX_PORT_CHUNK;
	j = nx_port % NX_PORT_CHUNK;
	bmap = &nx->nx_ports_bmap[i];

	if (bit_test(*bmap, j)) {
		/* port is not (yet) bound or allocated */
		ASSERT(npi->npi_nah == 0 && npi->npi_nxb == NULL);
		if (p != kernproc && !NX_ANONYMOUS_PROV(nx)) {
			/*
			 * If the port allocation is requested by userland
			 * and the nexus is non-anonymous, then fail the
			 * request.
			 */
			err = EACCES;
			SK_ERR("user proc alloc on named nexus needs binding");
		} else if (na != NULL && *na != NULL) {
			/*
			 * Otherwise claim it (clear bit) if the caller
			 * supplied an adapter for this port; else, it
			 * is just an existential check and so there's
			 * no action needed at this point (we'll skip
			 * the init below since vpna is NULL).
			 */
			bit_clear(*bmap, j);
		}
	} else {
		/* if port is bound, check if credentials match */
		if (nxb0 != NULL && p != kernproc && !NX_ANONYMOUS_PROV(nx) &&
		    (nxb == NULL || !nxb_is_equal(nxb0, nxb))) {
			SK_ERR("nexus binding mismatch");
			err = EACCES;
		} else {
			/*
			 * If port is already occupied by an adapter,
			 * see if the client is requesting a reference
			 * to it; if so, return the adapter.  Otherwise,
			 * if unoccupied and vpna is non-NULL, associate
			 * it with this nexus port via the below init.
			 */
			if (NPI_NA(npi) != NULL) {
				if (na != NULL && *na == NULL) {
					*na = NPI_NA(npi);
					na_retain_locked(*na);
					/* skip the init below */
					refonly = true;
				} else {
					/*
					 * If the client supplied an adapter
					 * (regardless of its value) for a
					 * nexus port that's already occupied,
					 * then we fail the request.
					 */
					SK_ERR("nexus adapted exits");
					err = EEXIST;
				}
			}
		}
	}

done:
	/* initialize the nexus port and the adapter occupying it */
	if (err == 0 && na != NULL && *na != NULL && !refonly) {
		ASSERT(nx_port < nx->nx_num_ports);
		ASSERT(npi->npi_nah == 0);
		ASSERT(nx->nx_active_ports < nx->nx_num_ports);
		ASSERT(!bit_test(nx->nx_ports_bmap[nx_port / NX_PORT_CHUNK],
		    (nx_port % NX_PORT_CHUNK)));

		nx->nx_active_ports++;
		npi->npi_nah = NPI_NA_ENCODE(*na, NEXUS_PORT_STATE_WORKING);
		(*na)->na_nx_port = nx_port;
	}

	SK_DF(SK_VERB_NXPORT, "nx 0x%llx nx_port %d, ports %u/%u (err %d)",
	    SK_KVA(nx), (int)nx_port, nx->nx_active_ports, nx->nx_num_ports,
	    err);

	return err;
}
2757 
/*
 * Mark the port's entry as defunct; only the state bits in npi_nah
 * are rewritten, the encoded adapter handle is preserved.
 */
void
nx_port_defunct(struct kern_nexus *nx, nexus_port_t nx_port)
{
	struct nx_port_info *npi = &nx->nx_ports[nx_port];

	npi->npi_nah = NPI_NA_ENCODE(npi->npi_nah,
	    NEXUS_PORT_STATE_DEFUNCT);
}
2766 
2767 void
nx_port_free(struct kern_nexus * nx,nexus_port_t nx_port)2768 nx_port_free(struct kern_nexus *nx, nexus_port_t nx_port)
2769 {
2770 	struct nx_port_info *npi = NULL;
2771 	bitmap_t *bmap;
2772 	uint32_t i, j;
2773 
2774 	ASSERT((nx->nx_num_ports % NX_PORT_CHUNK) == 0);
2775 	ASSERT(nx_port != NEXUS_PORT_ANY && nx_port < nx->nx_num_ports);
2776 	ASSERT(nx->nx_active_ports != 0);
2777 
2778 	i = nx_port / NX_PORT_CHUNK;
2779 	j = nx_port % NX_PORT_CHUNK;
2780 	bmap = &nx->nx_ports_bmap[i];
2781 	ASSERT(!bit_test(*bmap, j));
2782 
2783 	npi = &nx->nx_ports[nx_port];
2784 	npi->npi_nah = 0;
2785 	if (npi->npi_nxb == NULL) {
2786 		/* it's vacant, release it (set bit) */
2787 		bit_set(*bmap, j);
2788 	}
2789 
2790 	nx->nx_active_ports--;
2791 
2792 	//XXX [email protected] --- try to shrink bitmap & nx_ports ???
2793 
2794 	SK_DF(SK_VERB_NXPORT, "--- nx 0x%llx nx_port %d, ports %u/%u",
2795 	    SK_KVA(nx), (int)nx_port, nx->nx_active_ports, nx->nx_num_ports);
2796 }
2797 
2798 int
nx_port_bind_info(struct kern_nexus * nx,nexus_port_t nx_port,struct nxbind * nxb0,void * info)2799 nx_port_bind_info(struct kern_nexus *nx, nexus_port_t nx_port,
2800     struct nxbind *nxb0, void *info)
2801 {
2802 	struct nx_port_info *npi = NULL;
2803 	size_t g;
2804 	uint32_t i, j;
2805 	bitmap_t *bmap;
2806 	int err = 0;
2807 
2808 	ASSERT(nx_port != NEXUS_PORT_ANY);
2809 	ASSERT(nx_port < NXDOM_MAX(NX_DOM(nx), ports));
2810 	ASSERT((nx->nx_num_ports % NX_PORT_CHUNK) == 0);
2811 	ASSERT(nxb0 != NULL);
2812 
2813 	if ((nx_port) + 1 > nx->nx_num_ports) {
2814 		g = P2ROUNDUP((nx_port + 1) - nx->nx_num_ports, NX_PORT_CHUNK);
2815 		VERIFY(g <= NEXUS_PORT_MAX);
2816 		if ((err = nx_port_grow(nx, (nexus_port_size_t)g)) != 0) {
2817 			goto done;
2818 		}
2819 	}
2820 	ASSERT(err == 0);
2821 
2822 	npi = &nx->nx_ports[nx_port];
2823 	i = nx_port / NX_PORT_CHUNK;
2824 	j = nx_port % NX_PORT_CHUNK;
2825 	bmap = &nx->nx_ports_bmap[i];
2826 	if (bit_test(*bmap, j)) {
2827 		/* port is not (yet) bound or allocated */
2828 		ASSERT(npi->npi_nah == 0 && npi->npi_nxb == NULL);
2829 
2830 		bit_clear(*bmap, j);
2831 		struct nxbind *nxb = nxb_alloc(Z_WAITOK);
2832 		nxb_move(nxb0, nxb);
2833 		npi->npi_nxb = nxb;
2834 		npi->npi_info = info;
2835 		/* claim it (clear bit) */
2836 		bit_clear(*bmap, j);
2837 		ASSERT(err == 0);
2838 	} else {
2839 		/* port is already taken */
2840 		ASSERT(NPI_NA(npi) != NULL || npi->npi_nxb != NULL);
2841 		err = EEXIST;
2842 	}
2843 done:
2844 
2845 	SK_DF(err ? SK_VERB_ERROR : SK_VERB_NXPORT,
2846 	    "+++ nx 0x%llx nx_port %d, ports %u/%u (err %d)", SK_KVA(nx),
2847 	    (int)nx_port, nx->nx_active_ports, nx->nx_num_ports, err);
2848 
2849 	return err;
2850 }
2851 
/*
 * Bind credentials to a nexus port with no per-port info record;
 * convenience wrapper around nx_port_bind_info().
 */
int
nx_port_bind(struct kern_nexus *nx, nexus_port_t nx_port, struct nxbind *nxb0)
{
	return nx_port_bind_info(nx, nx_port, nxb0, NULL);
}
2857 
2858 /*
2859  * -fbounds-safety: all callers pass npi_info. Why don't we just change the
2860  * input type to nx_port_info_header *?
2861  */
2862 static int
nx_port_info_size(struct nx_port_info_header * info,size_t * sz)2863 nx_port_info_size(struct nx_port_info_header *info, size_t *sz)
2864 {
2865 	struct nx_port_info_header *hdr = info;
2866 
2867 	switch (hdr->ih_type) {
2868 	case NX_PORT_INFO_TYPE_NETIF:
2869 		break;
2870 	default:
2871 		return EINVAL;
2872 	}
2873 	*sz = hdr->ih_size;
2874 	return 0;
2875 }
2876 
/*
 * Remove the binding (and any attached info record) from a port.
 * Returns EDOM when the port is out of range, ENOENT when it was
 * never bound; a port left vacant (no adapter) is returned to the
 * free bitmap.
 */
int
nx_port_unbind(struct kern_nexus *nx, nexus_port_t nx_port)
{
	struct nx_port_info *npi = NULL;
	struct nxbind *nxb;
	uint32_t i, j;
	bitmap_t *bmap;
	int err = 0;

	ASSERT(nx_port != NEXUS_PORT_ANY);

	if (nx_port >= nx->nx_num_ports) {
		err = EDOM;
		goto done;
	}

	npi = &nx->nx_ports[nx_port];
	i = nx_port / NX_PORT_CHUNK;
	j = nx_port % NX_PORT_CHUNK;
	bmap = &nx->nx_ports_bmap[i];

	if ((nxb = npi->npi_nxb) == NULL) {
		/* must be either free or allocated */
		ASSERT(NPI_NA(npi) == NULL ||
		    (!bit_test(*bmap, j) && nx->nx_active_ports > 0));
		err = ENOENT;
	} else {
		nxb_free(nxb);
		npi->npi_nxb = NULL;
		/* release the info record the binding carried, if any */
		if (npi->npi_info != NULL) {
			size_t sz;

			VERIFY(nx_port_info_size(npi->npi_info, &sz) == 0);
			sk_free_data(npi->npi_info, sz);
			npi->npi_info = NULL;
		}
		ASSERT(!bit_test(*bmap, j));
		if (NPI_NA(npi) == NULL) {
			/* it's vacant, release it (set bit) */
			bit_set(*bmap, j);
		}
	}

done:
	SK_DF(err ? SK_VERB_ERROR : SK_VERB_NXPORT,
	    "--- nx 0x%llx nx_port %d, ports %u/%u (err %d)", SK_KVA(nx),
	    (int)nx_port, nx->nx_active_ports, nx->nx_num_ports, err);

	return err;
}
2927 
2928 struct nexus_adapter *
nx_port_get_na(struct kern_nexus * nx,nexus_port_t nx_port)2929 nx_port_get_na(struct kern_nexus *nx, nexus_port_t nx_port)
2930 {
2931 	if (nx->nx_ports != NULL && nx->nx_num_ports > nx_port) {
2932 		return NPI_NA(&nx->nx_ports[nx_port]);
2933 	} else {
2934 		return NULL;
2935 	}
2936 }
2937 
/*
 * Copy "len" bytes of the port's info record into "info".  Returns
 * ENXIO for a bad port, ENOENT when no record is attached, EINVAL on
 * record-type mismatch.  Caller is responsible for passing a len
 * that matches the record's actual size.
 */
int
nx_port_get_info(struct kern_nexus *nx, nexus_port_t port,
    nx_port_info_type_t type, void *__sized_by(len)info, uint32_t len)
{
	struct nx_port_info *npi;
	struct nx_port_info_header *hdr;

	if (nx->nx_ports == NULL || port >= nx->nx_num_ports) {
		return ENXIO;
	}
	npi = &nx->nx_ports[port];
	/*
	 * -fbounds-safety: Changing npi_info to be __sized_by is a major
	 * surgery. Just forge it here for now.
	 */
	hdr = __unsafe_forge_bidi_indexable(struct nx_port_info_header *,
	    npi->npi_info, len);
	if (hdr == NULL) {
		return ENOENT;
	}

	if (hdr->ih_type != type) {
		return EINVAL;
	}

	bcopy(hdr, info, len);
	return 0;
}
2966 
/*
 * True iff nx_port falls within the currently allocated port map.
 */
bool
nx_port_is_valid(struct kern_nexus *nx, nexus_port_t nx_port)
{
	return nx_port < nx->nx_num_ports;
}
2972 
/*
 * True iff the (valid) port's entry is in NEXUS_PORT_STATE_DEFUNCT.
 */
bool
nx_port_is_defunct(struct kern_nexus *nx, nexus_port_t nx_port)
{
	ASSERT(nx_port_is_valid(nx, nx_port));

	return NPI_IS_DEFUNCT(&nx->nx_ports[nx_port]);
}
2980 
/*
 * Release every in-use port's binding and info record, then free the
 * port bitmap and the nx_ports array themselves.  Called from the
 * nexus destructor (nx_free()).
 */
void
nx_port_free_all(struct kern_nexus *nx)
{
	/* uncrustify doesn't handle C blocks properly */
	/* BEGIN IGNORE CODESTYLE */
	nx_port_foreach(nx, ^(nexus_port_t p) {
		struct nxbind *nxb;
		/*
		 * XXX -fbounds-safety: Come back to this after fixing npi_info
		 */
		void *__single info;
		nxb = nx->nx_ports[p].npi_nxb;
		info = nx->nx_ports[p].npi_info;
		if (nxb != NULL) {
			nxb_free(nxb);
			nx->nx_ports[p].npi_nxb = NULL;
		}
		if (info != NULL) {
			size_t sz;

			/* size comes from the record's own header */
			VERIFY(nx_port_info_size(info, &sz) == 0);
			skn_free_data(info, info, sz);
			nx->nx_ports[p].npi_info = NULL;
		}
	});
	/* END IGNORE CODESTYLE */

	nx->nx_active_ports = 0;
	sk_free_data_sized_by(nx->nx_ports_bmap, nx->nx_ports_bmap_size);
	nx->nx_ports_bmap = NULL;
	nx->nx_ports_bmap_size = 0;
	sk_free_type_array_counted_by(struct nx_port_info, nx->nx_num_ports, nx->nx_ports);
	nx->nx_ports = NULL;
	nx->nx_num_ports = 0;
}
3016 
3017 void
3018 nx_port_foreach(struct kern_nexus *nx,
3019     void (^port_handle)(nexus_port_t nx_port))
3020 {
3021 	for (nexus_port_size_t i = 0; i < (nx->nx_num_ports / NX_PORT_CHUNK); i++) {
3022 		bitmap_t bmap = nx->nx_ports_bmap[i];
3023 
3024 		if (bmap == NX_PORT_CHUNK_FREE) {
3025 			continue;
3026 		}
3027 
3028 		for (nexus_port_size_t j = 0; j < NX_PORT_CHUNK; j++) {
3029 			if (bit_test(bmap, j)) {
3030 				continue;
3031 			}
3032 			port_handle((i * NX_PORT_CHUNK) + j);
3033 		}
3034 	}
3035 }
3036 
/*
 * sysctl interfaces
 *
 * Handlers (defined below) exporting nexus provider, channel,
 * logical-link and flow state to userland via sysctl.
 */
static int nexus_provider_list_sysctl SYSCTL_HANDLER_ARGS;
static int nexus_channel_list_sysctl SYSCTL_HANDLER_ARGS;
static int nexus_mib_get_sysctl SYSCTL_HANDLER_ARGS;

SYSCTL_PROC(_kern_skywalk, OID_AUTO, nexus_provider_list,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, nexus_provider_list_sysctl, "S,nexus_provider_info_t", "");

SYSCTL_PROC(_kern_skywalk, OID_AUTO, nexus_channel_list,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, nexus_channel_list_sysctl, "S,nexus_channel_entry_t", "");

SYSCTL_PROC(_kern_skywalk, OID_AUTO, llink_list,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, NXMIB_LLINK_LIST, nexus_mib_get_sysctl, "S,nx_llink_info",
    "A list of logical links");

SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, flow,
    CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY | CTLFLAG_KERN,
    0, NXMIB_FLOW, nexus_mib_get_sysctl, "S,sk_stats_flow",
    "Nexus inet flows with stats collected in kernel");

SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, flow_owner,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, NXMIB_FLOW_OWNER, nexus_mib_get_sysctl, "S,sk_stats_flow_owner",
    "Nexus flow owners");

SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, flow_route,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, NXMIB_FLOW_ROUTE, nexus_mib_get_sysctl, "S,sk_stats_flow_route",
    "Nexus flow routes");

SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, net_if,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, NXMIB_NETIF_STATS, nexus_mib_get_sysctl, "S,sk_stats_net_if",
    "Nexus netif statistics collected in kernel");

SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, flow_switch,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, NXMIB_FSW_STATS, nexus_mib_get_sysctl, "S,sk_stats_flow_switch",
    "Nexus flowswitch statistics collected in kernel");

SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, userstack,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, NXMIB_USERSTACK_STATS, nexus_mib_get_sysctl, "S,sk_stats_userstack",
    "Nexus userstack statistics counter");

SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, flow_adv,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, NXMIB_FLOW_ADV, nexus_mib_get_sysctl, "S,sk_stats_flow_adv",
    "Nexus flow advisory dump");
3091 
3092 SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, netif_queue,
3093     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
3094     0, NXMIB_NETIF_QUEUE_STATS, nexus_mib_get_sysctl, "S,netif_qstats_info",
3095     "A list of netif queue stats entries");
3096 
3097 /*
3098  * Provider list sysctl
3099  */
3100 static void
nexus_provider_info_populate(struct kern_nexus_provider * nxprov,nexus_provider_info_t info)3101 nexus_provider_info_populate(struct kern_nexus_provider *nxprov,
3102     nexus_provider_info_t info)
3103 {
3104 	struct kern_nexus *nx;
3105 	uuid_t *uuids;
3106 
3107 	SK_LOCK_ASSERT_HELD();
3108 
3109 	/* provider UUID + params */
3110 	uuid_copy(info->npi_prov_uuid, nxprov->nxprov_uuid);
3111 	bcopy(nxprov->nxprov_params, &info->npi_prov_params,
3112 	    sizeof(struct nxprov_params));
3113 	info->npi_instance_uuids_count = nxprov->nxprov_nx_count;
3114 
3115 	/* instance UUID list */
3116 	uuids = __unsafe_forge_bidi_indexable(uuid_t *,
3117 	    info->npi_instance_uuids, sizeof(uuid_t) * info->npi_instance_uuids_count);
3118 	STAILQ_FOREACH(nx, &nxprov->nxprov_nx_head, nx_prov_link) {
3119 		uuid_copy(*uuids, nx->nx_uuid);
3120 		uuids++;
3121 	}
3122 }
3123 
3124 static int
3125 nexus_provider_list_sysctl SYSCTL_HANDLER_ARGS
3126 {
3127 #pragma unused(arg1, arg2, oidp)
3128 	size_t actual_space;
3129 	caddr_t buffer = NULL;
3130 	size_t buffer_space;
3131 	size_t allocated_space;
3132 	int out_error;
3133 	int error = 0;
3134 	struct kern_nexus_provider *nxprov;
3135 	caddr_t scan;
3136 
3137 	if (!kauth_cred_issuser(kauth_cred_get())) {
3138 		return EPERM;
3139 	}
3140 
3141 	net_update_uptime();
3142 	buffer_space = req->oldlen;
3143 	if (req->oldptr != USER_ADDR_NULL && buffer_space != 0) {
3144 		if (buffer_space > SK_SYSCTL_ALLOC_MAX) {
3145 			buffer_space = SK_SYSCTL_ALLOC_MAX;
3146 		}
3147 		allocated_space = buffer_space;
3148 		buffer = sk_alloc_data(allocated_space, Z_WAITOK, skmem_tag_sysctl_buf);
3149 		if (__improbable(buffer == NULL)) {
3150 			return ENOBUFS;
3151 		}
3152 	} else if (req->oldptr == USER_ADDR_NULL) {
3153 		buffer_space = 0;
3154 	}
3155 	actual_space = 0;
3156 	scan = buffer;
3157 	SK_LOCK();
3158 	STAILQ_FOREACH(nxprov, &nxprov_head, nxprov_link) {
3159 		size_t                  info_size;
3160 
3161 		info_size
3162 		        = NEXUS_PROVIDER_INFO_SIZE(nxprov->nxprov_nx_count);
3163 		if (scan != NULL) {
3164 			if (buffer_space < info_size) {
3165 				/* supplied buffer too small, stop copying */
3166 				error = ENOMEM;
3167 				break;
3168 			}
3169 			nexus_provider_info_populate(nxprov, (void *)scan);
3170 			scan += info_size;
3171 			buffer_space -= info_size;
3172 		}
3173 		actual_space += info_size;
3174 	}
3175 	SK_UNLOCK();
3176 
3177 	out_error = SYSCTL_OUT(req, buffer, actual_space);
3178 	if (out_error != 0) {
3179 		error = out_error;
3180 	}
3181 
3182 	if (buffer != NULL) {
3183 		sk_free_data(buffer, allocated_space);
3184 	}
3185 
3186 	return error;
3187 }
3188 
3189 /*
3190  * Channel list sysctl
3191  */
3192 static uint32_t
channel_ring_count(struct kern_channel * ch,enum txrx which)3193 channel_ring_count(struct kern_channel *ch, enum txrx which)
3194 {
3195 	return ch->ch_last[which] - ch->ch_first[which];
3196 }
3197 
3198 /*
3199  * -fbounds-safety: kring's range is [first..last]. Marking it
3200  * __counted_by(last) means range is [0..first..last]. The [0..first) might be
3201  * problematic. However, the for loop in this function starts indexing from
3202  * 'first', not 0, so that should be okay.
3203  * XXX Until BATS starts using uncrustify-7 (rdar://90709826), having a space
3204  * between __counted_by(entry_count) entries will be considered invalid code
3205  * style and build will fail. Until rdar://117811249 is resolved, either stick
3206  * to what makes BATS happy, or wrap IGNORE CODESTYLE around.
3207  */
3208 static void
populate_ring_entries(struct __kern_channel_ring * __counted_by (last)kring,ring_id_t first,ring_id_t last,nexus_channel_ring_entry * __counted_by (entry_count)entries,uint32_t NX_FB_ARG entry_count)3209 populate_ring_entries(struct __kern_channel_ring *__counted_by(last)kring,
3210     ring_id_t first, ring_id_t last,
3211     nexus_channel_ring_entry *__counted_by(entry_count)entries,
3212     uint32_t NX_FB_ARG entry_count)
3213 {
3214 	ring_id_t i;
3215 	nexus_channel_ring_entry_t scan;
3216 	struct __kern_channel_ring *ring;
3217 
3218 	scan = entries;
3219 	for (i = first; i < last; i++, scan++) {
3220 		ring = &kring[i];
3221 
3222 		DTRACE_SKYWALK1(populate__ring, struct __kern_channel_ring *,
3223 		    ring);
3224 		if (kr_stat_enable == 0) {
3225 			bzero(&scan->ncre_stats, sizeof(scan->ncre_stats));
3226 			bzero(&scan->ncre_user_stats,
3227 			    sizeof(scan->ncre_user_stats));
3228 		} else {
3229 			scan->ncre_stats = ring->ckr_stats;
3230 			scan->ncre_user_stats = ring->ckr_usr_stats;
3231 		}
3232 		scan->ncre_error_stats = ring->ckr_err_stats;
3233 		scan->ncre_ring_id = i;
3234 	}
3235 }
3236 
3237 /* combine/convert ch_mode/ch_flags into nexus_channel_entry flags */
3238 static uint32_t
nexus_channel_get_flags(uint32_t ch_mode,uint32_t ch_flags)3239 nexus_channel_get_flags(uint32_t ch_mode, uint32_t ch_flags)
3240 {
3241 	uint32_t flags = 0;
3242 
3243 	flags |= (ch_mode & CHMODE_MONITOR_TX) ? SCHF_MONITOR_TX : 0;
3244 	flags |= (ch_mode & CHMODE_MONITOR_RX) ? SCHF_MONITOR_RX : 0;
3245 	flags |= (ch_mode & CHMODE_MONITOR_NO_COPY) ? SCHF_MONITOR_NO_COPY : 0;
3246 	flags |= (ch_mode & CHMODE_USER_PACKET_POOL) ? SCHF_USER_PACKET_POOL : 0;
3247 	flags |= (ch_mode & CHMODE_DEFUNCT_OK) ? SCHF_DEFUNCT_OK : 0;
3248 	flags |= (ch_mode & CHMODE_FILTER) ? SCHF_FILTER : 0;
3249 	flags |= (ch_mode & CHMODE_EVENT_RING) ? SCHF_EVENT_RING : 0;
3250 	flags |= (ch_mode & CHMODE_EXCLUSIVE) ? SCHF_EXCLUSIVE : 0;
3251 	flags |= (ch_flags & CHANF_IF_ADV) ? SCHF_IF_ADV : 0;
3252 	flags |= (ch_flags & CHANF_DEFUNCT_SKIP) ? SCHF_DEFUNCT_SKIP : 0;
3253 	flags |= (ch_flags & CHANF_CLOSING) ? SCHF_CLOSING : 0;
3254 	flags |= (ch_flags & CHANF_DEFUNCT) ? SCHF_DEFUNCT : 0;
3255 	flags |= (ch_mode & CHMODE_LOW_LATENCY) ? SCHF_LOW_LATENCY : 0;
3256 
3257 	return flags;
3258 }
3259 
/*
 * Fill one nexus_channel_entry from a kern_channel: channel identity
 * (UUID, nexus port, owning pid, fd), combined mode/flag bits, the
 * TX/RX ring counts, and per-ring statistics.  The caller has already
 * set entry->nce_ring_count and sized the record so nce_ring_entries
 * holds TX entries first, then RX entries.
 */
SK_NO_INLINE_ATTRIBUTE
static void
nexus_channel_entry_populate(struct kern_channel *ch,
    nexus_channel_entry_t entry)
{
	/* snapshot mode/flags and the channel's ring index ranges */
	uint32_t ch_mode = ch->ch_info->cinfo_ch_mode;
	uint32_t ch_flags = ch->ch_flags;
	ring_id_t rx_first = ch->ch_first[NR_RX];
	ring_id_t rx_last = ch->ch_last[NR_RX];
	ring_id_t tx_last = ch->ch_last[NR_TX];
	ring_id_t tx_first = ch->ch_first[NR_TX];

	uuid_copy(entry->nce_uuid, ch->ch_info->cinfo_ch_id);
	entry->nce_flags = nexus_channel_get_flags(ch_mode, ch_flags);
	entry->nce_port = ch->ch_info->cinfo_nx_port;
	entry->nce_pid = ch->ch_pid;
	entry->nce_fd = ch->ch_fd;
	entry->nce_tx_rings = tx_last - tx_first;
	entry->nce_rx_rings = rx_last - rx_first;
	/* TX ring stats occupy the front of nce_ring_entries */
	populate_ring_entries(ch->ch_na->na_tx_rings, tx_first, tx_last,
	    entry->nce_ring_entries, entry->nce_tx_rings);

	/*
	 * -fbounds-safety: If entry->nce_tx_rings > 0 and
	 * entry->nce_rx_rings == 0 (i.e. entry->nce_ring_count ==
	 * entry->nce_tx_rings), simply passing
	 * entry->nce_ring_entries + entry->nce_tx_rings to populate_ring_entries
	 * will fail bounds check, because it is equivalent to assigning
	 * nce_ring_entries + nce_tx_rings to a __single variable, and in this
	 * case it goes out of bounds. It's same thing as having:
	 *     int a[1];
	 *     some_func(a + 1);  <-- bounds check will fail
	 */
	if (rx_first < rx_last) {
		populate_ring_entries(ch->ch_na->na_rx_rings, rx_first, rx_last,
		    entry->nce_ring_entries + entry->nce_tx_rings,
		    entry->nce_rx_rings);
	}
}
3299 
/*
 * Compute the space needed to describe all channels of nx as a
 * nexus_channel_info (header plus one variable-size channel entry per
 * channel), and populate info when it is non-NULL and large enough.
 *
 * Returns the accumulated size.  If the buffer runs out part-way, the
 * returned size exceeds buffer_size, which the caller treats as
 * truncation; data written so far remains valid.  SK_LOCK must be held
 * to keep the channel list stable.
 */
SK_NO_INLINE_ATTRIBUTE
static size_t
nexus_channel_info_populate(struct kern_nexus *nx,
    nexus_channel_info *__sized_by(buffer_size) info, size_t buffer_size)
{
	struct kern_channel *ch = NULL;
	size_t info_size;
	caddr_t scan = NULL;
	nexus_channel_entry *entry;

	SK_LOCK_ASSERT_HELD();

	/* fixed-size header comes first */
	info_size = sizeof(nexus_channel_info);

	/* channel list */
	if (info != NULL) {
		if (buffer_size < info_size) {
			return info_size;
		}

		/* instance UUID */
		uuid_copy(info->nci_instance_uuid, nx->nx_uuid);
		info->nci_channel_entries_count = nx->nx_ch_count;
		scan = (caddr_t __bidi_indexable)info->nci_channel_entries;
	}
	STAILQ_FOREACH(ch, &nx->nx_ch_head, ch_link) {
		size_t          entry_size;
		uint32_t        ring_count;

		/* entry size scales with the channel's TX+RX ring count */
		ring_count = channel_ring_count(ch, NR_TX) +
		    channel_ring_count(ch, NR_RX);
		entry_size = NEXUS_CHANNEL_ENTRY_SIZE(ring_count);
		info_size += entry_size;
		if (scan != NULL) {
			if (buffer_size < info_size) {
				return info_size;
			}
			entry = (nexus_channel_entry *)(void *)scan;
			entry->nce_ring_count = ring_count;

			nexus_channel_entry_populate(ch, entry);
			scan += entry_size;
		}
	}
	return info_size;
}
3346 
/*
 * Handler for kern.skywalk.nexus_channel_list (root only).
 *
 * Walks every nexus and emits its channel info via
 * nexus_channel_info_populate().  Standard two-pass sysctl protocol:
 * an oldptr == USER_ADDR_NULL probe only measures; otherwise data is
 * staged in a kernel buffer capped at SK_SYSCTL_ALLOC_MAX and copied
 * out.  Returns ENOMEM when the supplied buffer cannot hold everything.
 */
static int
nexus_channel_list_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2, oidp)
	size_t actual_space;
	caddr_t buffer = NULL;
	size_t buffer_space;
	size_t allocated_space;
	int out_error;
	struct kern_nexus *nx;
	int error = 0;
	caddr_t scan;

	if (!kauth_cred_issuser(kauth_cred_get())) {
		return EPERM;
	}

	net_update_uptime();
	buffer_space = req->oldlen;
	if (req->oldptr != USER_ADDR_NULL && buffer_space != 0) {
		if (buffer_space > SK_SYSCTL_ALLOC_MAX) {
			buffer_space = SK_SYSCTL_ALLOC_MAX;
		}
		allocated_space = buffer_space;
		buffer = sk_alloc_data(allocated_space, Z_WAITOK, skmem_tag_sysctl_buf);
		if (__improbable(buffer == NULL)) {
			return ENOBUFS;
		}
	} else if (req->oldptr == USER_ADDR_NULL) {
		/* size probe: measure only, no staging buffer */
		buffer_space = 0;
	}
	actual_space = 0;
	scan = buffer;
	SK_LOCK();
	RB_FOREACH(nx, kern_nexus_tree, &nx_head) {
		size_t info_size;

		/* populate (when scan != NULL) and/or measure this nexus */
		info_size = nexus_channel_info_populate(nx, (void *)scan,
		    buffer_space);
		if (scan != NULL) {
			if (buffer_space < info_size) {
				/* supplied buffer too small, stop copying */
				error = ENOMEM;
				break;
			}
			scan += info_size;
			buffer_space -= info_size;
		}
		actual_space += info_size;
	}
	SK_UNLOCK();

	if (actual_space != 0) {
		out_error = SYSCTL_OUT(req, buffer, actual_space);
		if (out_error != 0) {
			error = out_error;
		}
	}
	if (buffer != NULL) {
		sk_free_data(buffer, allocated_space);
	}

	return error;
}
3411 
/*
 * Shared handler for all NXMIB_* sysctl nodes; oid_arg2 carries the
 * subcommand selecting which statistics the per-domain providers emit.
 *
 * Access control: NXMIB_USERSTACK_STATS is root only; NXMIB_FLOW allows
 * non-root callers only when they supply a nexus_mib_filter with
 * NXMIB_FILTER_INFO_TUPLE via the new value.  Data is gathered by
 * calling each domain provider's nxdom_prov_nx_mib_get callback under
 * SK_LOCK, using the standard two-pass size-probe/copy-out protocol.
 */
static int
nexus_mib_get_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)
	struct proc *p = req->p;
	struct nexus_mib_filter filter;
	int error = 0;
	size_t actual_space;
	size_t allocated_space = 0;
	caddr_t __sized_by(allocated_space) buffer = NULL;
	size_t buffer_space;
	int out_error;
	struct kern_nexus *nx;
	caddr_t scan;

	/* Restrict protocol stats access to root user only (like netstat). */
	if (oidp->oid_arg2 == NXMIB_USERSTACK_STATS &&
	    !kauth_cred_issuser(kauth_cred_get())) {
		SK_ERR("mib request rejected, EPERM");
		return EPERM;
	}

	if (req->newptr == USER_ADDR_NULL) {
		/*
		 * For flow stats requests, non-root users need to provide a
		 * 5-tuple. Otherwise, we do not grant access.
		 */
		if (oidp->oid_arg2 == NXMIB_FLOW &&
		    !kauth_cred_issuser(kauth_cred_get())) {
			SK_ERR("mib request rejected: tuple not provided");
			return EPERM;
		}
		/* use subcommand for multiple nodes */
		filter.nmf_type = oidp->oid_arg2;
		filter.nmf_bitmap = 0x0;
	} else if (req->newlen != sizeof(struct nexus_mib_filter)) {
		SK_ERR("mis-matching newlen");
		return EINVAL;
	} else {
		/* caller supplied an explicit filter; validate it */
		error = SYSCTL_IN(req, &filter, sizeof(struct nexus_mib_filter));
		if (error != 0) {
			SK_ERR("SYSCTL_IN err %d", error);
			return error;
		}
		if (filter.nmf_type != oidp->oid_arg2) {
			SK_ERR("mis-matching nmf_type");
			return EINVAL;
		}
		/*
		 * For flow stats requests, non-root users need to set the nexus
		 * mib filter to NXMIB_FILTER_INFO_TUPLE. Otherwise, we do not
		 * grant access. This ensures that fsw_mib_get_flow looks for a
		 * flow entry that matches the given tuple of the non-root user.
		 */
		if (filter.nmf_type == NXMIB_FLOW &&
		    (filter.nmf_bitmap & NXMIB_FILTER_INFO_TUPLE) == 0 &&
		    !kauth_cred_issuser(kauth_cred_get())) {
			SK_ERR("mib request rejected: tuple filter not set");
			return EPERM;
		}
	}

	net_update_uptime();
	buffer_space = req->oldlen;
	if (req->oldptr != USER_ADDR_NULL && buffer_space != 0) {
		if (buffer_space > SK_SYSCTL_ALLOC_MAX) {
			buffer_space = SK_SYSCTL_ALLOC_MAX;
		}
		buffer = sk_alloc_data(buffer_space, Z_WAITOK, skmem_tag_sysctl_buf);
		allocated_space = buffer_space;
		if (__improbable(buffer == NULL)) {
			return ENOBUFS;
		}
	} else if (req->oldptr == USER_ADDR_NULL) {
		/* size probe: measure only, no staging buffer */
		buffer_space = 0;
	}
	actual_space = 0;
	scan = buffer;

	SK_LOCK();
	RB_FOREACH(nx, kern_nexus_tree, &nx_head) {
		/* skip nexus domains that don't export MIB data */
		if (NX_DOM_PROV(nx)->nxdom_prov_nx_mib_get == NULL) {
			continue;
		}

		size_t size = 0;
		struct kern_nexus_domain_provider *nx_dp = NX_DOM_PROV(nx);

		/*
		 * -fbounds-safety: Because scan takes the bounds of buffer
		 * (which is __sized_by(allocated_space)), at some point scan
		 * will reach its bounds (because of scan += size). When it
		 * does, it won't pass the bounds check when scan is passed to
		 * nxdom_prov_nx_mib_get function. We need to avoid passing scan
		 * to nxdom_prov_nx_mib_get when it reaches its upper bound,
		 * i.e. when buffer_space reaches 0 (see buffer_space -= size).
		 */
		if (req->oldptr == USER_ADDR_NULL || buffer_space) {
			size = nx_dp->nxdom_prov_nx_mib_get(nx, &filter, scan,
			    buffer_space, p);
		}

		if (scan != NULL) {
			if (buffer_space < size) {
				/* supplied buffer too small, stop copying */
				error = ENOMEM;
				break;
			}
			scan += size;
			buffer_space -= size;
		}
		actual_space += size;
	}
	SK_UNLOCK();

	if (actual_space != 0) {
		out_error = SYSCTL_OUT(req, buffer, actual_space);
		if (out_error != 0) {
			error = out_error;
		}
	}
	if (buffer != NULL) {
		sk_free_data_sized_by(buffer, allocated_space);
	}

	return error;
}
3539 
3540 void
kern_nexus_walktree(kern_nexus_walktree_f_t * f,void * arg0,boolean_t is_sk_locked)3541 kern_nexus_walktree(kern_nexus_walktree_f_t *f, void *arg0,
3542     boolean_t is_sk_locked)
3543 {
3544 	struct kern_nexus *nx = NULL;
3545 
3546 	if (!is_sk_locked) {
3547 		SK_LOCK();
3548 	} else {
3549 		SK_LOCK_ASSERT_HELD();
3550 	}
3551 
3552 	RB_FOREACH(nx, kern_nexus_tree, &nx_head) {
3553 		(*f)(nx, arg0);
3554 	}
3555 
3556 	if (!is_sk_locked) {
3557 		SK_UNLOCK();
3558 	}
3559 }
3560 
3561 errno_t
kern_nexus_get_pbufpool_info(const uuid_t nx_uuid,struct kern_pbufpool_memory_info * rx_pool_info,struct kern_pbufpool_memory_info * tx_pool_info)3562 kern_nexus_get_pbufpool_info(const uuid_t nx_uuid,
3563     struct kern_pbufpool_memory_info *rx_pool_info,
3564     struct kern_pbufpool_memory_info *tx_pool_info)
3565 {
3566 	struct kern_pbufpool *__single tpp, *__single rpp;
3567 	struct kern_nexus *nx;
3568 	errno_t err = 0;
3569 
3570 	nx = nx_find(nx_uuid, FALSE);
3571 	if (nx == NULL) {
3572 		err = ENOENT;
3573 		goto done;
3574 	}
3575 
3576 	if (nx->nx_prov->nxprov_params->nxp_type != NEXUS_TYPE_NET_IF) {
3577 		err = ENOTSUP;
3578 		goto done;
3579 	}
3580 
3581 	err = nx_netif_prov_nx_mem_info(nx, &tpp, &rpp);
3582 	if (err != 0) {
3583 		goto done;
3584 	}
3585 
3586 	if ((tpp == NULL) && (rpp == NULL)) {
3587 		err = ENOENT;
3588 		goto done;
3589 	}
3590 
3591 	if (tx_pool_info != NULL) {
3592 		bzero(tx_pool_info, sizeof(*tx_pool_info));
3593 	}
3594 	if (rx_pool_info != NULL) {
3595 		bzero(rx_pool_info, sizeof(*rx_pool_info));
3596 	}
3597 
3598 	if ((tx_pool_info != NULL) && (tpp != NULL)) {
3599 		err = kern_pbufpool_get_memory_info(tpp, tx_pool_info);
3600 		if (err != 0) {
3601 			goto done;
3602 		}
3603 	}
3604 
3605 	if ((rx_pool_info != NULL) && (rpp != NULL)) {
3606 		err = kern_pbufpool_get_memory_info(rpp, rx_pool_info);
3607 	}
3608 
3609 done:
3610 	if (nx != NULL) {
3611 		(void) nx_release(nx);
3612 		nx = NULL;
3613 	}
3614 	return err;
3615 }
3616 
/*
 * Post an interface-advisory update event (CHAN_FILT_HINT_IF_ADV_UPD)
 * to every channel registered on this nexus' nx_ch_if_adv_head list.
 * Supported only for netif and flowswitch nexus types (VERIFY otherwise);
 * the matching NETIF/FSW stats counter records sent vs. dropped updates.
 */
void
nx_interface_advisory_notify(struct kern_nexus *nx)
{
	struct kern_channel *ch;
	struct netif_stats *nifs;
	struct fsw_stats *fsw_stats;
	nexus_type_t nxdom_type = NX_DOM(nx)->nxdom_type;

	/*
	 * Exactly one of nifs/fsw_stats is initialized below; every later
	 * use is guarded by the same nxdom_type test.
	 */
	if (nxdom_type == NEXUS_TYPE_NET_IF) {
		nifs = &NX_NETIF_PRIVATE(nx)->nif_stats;
	} else if (nxdom_type == NEXUS_TYPE_FLOW_SWITCH) {
		fsw_stats = &NX_FSW_PRIVATE(nx)->fsw_stats;
	} else {
		VERIFY(0);
		__builtin_unreachable();
	}
	/* best-effort: if the lock is contended, drop this update */
	if (!lck_rw_try_lock_shared(&nx->nx_ch_if_adv_lock)) {
		if (nxdom_type == NEXUS_TYPE_NET_IF) {
			STATS_INC(nifs, NETIF_STATS_IF_ADV_UPD_DROP);
		} else {
			STATS_INC(fsw_stats, FSW_STATS_IF_ADV_UPD_DROP);
		}
		return;
	}
	/*
	 * if the channel is in "nx_ch_if_adv_head" list, then we can
	 * safely assume that the channel is not closed yet.
	 * In ch_close_common(), the channel is removed from the
	 * "nx_ch_if_adv_head" list holding the "nx_ch_if_adv_lock" in
	 * exclusive mode, prior to closing the channel.
	 */
	STAILQ_FOREACH(ch, &nx->nx_ch_if_adv_head, ch_link_if_adv) {
		struct nexus_adapter *na = ch->ch_na;

		ASSERT(na != NULL);
		/* post on the channel's first TX ring */
		na_post_event(&na->na_tx_rings[ch->ch_first[NR_TX]],
		    TRUE, FALSE, FALSE, CHAN_FILT_HINT_IF_ADV_UPD);
		if (nxdom_type == NEXUS_TYPE_NET_IF) {
			STATS_INC(nifs, NETIF_STATS_IF_ADV_UPD_SENT);
		} else {
			STATS_INC(fsw_stats, FSW_STATS_IF_ADV_UPD_SENT);
		}
	}
	lck_rw_done(&nx->nx_ch_if_adv_lock);
}
3662