/* xref: /xnu-11215.81.4/bsd/skywalk/nexus/nexus.c (revision d4514f0bc1d3f944c22d92e68b646ac3fb40d452) */
/*
 * Copyright (c) 2015-2022 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <skywalk/os_skywalk_private.h>
#include <skywalk/nexus/netif/nx_netif.h>
#include <skywalk/nexus/flowswitch/nx_flowswitch.h>
#include <sys/sdt.h>

static uint32_t disable_nxctl_check = 0;
#if (DEVELOPMENT || DEBUG)
SYSCTL_UINT(_kern_skywalk, OID_AUTO, disable_nxctl_check,
    CTLFLAG_RW | CTLFLAG_LOCKED, &disable_nxctl_check, 0, "");
#endif

LCK_GRP_DECLARE(nexus_lock_group, "sk_nx_lock");
LCK_GRP_DECLARE(nexus_mbq_lock_group, "sk_nx_mbq_lock");
LCK_GRP_DECLARE(nexus_pktq_lock_group, "sk_nx_pktq_lock");
LCK_ATTR_DECLARE(nexus_lock_attr, 0, 0);

static STAILQ_HEAD(, nxctl) nxctl_head =
    STAILQ_HEAD_INITIALIZER(nxctl_head);
static STAILQ_HEAD(, kern_nexus_provider) nxprov_head =
    STAILQ_HEAD_INITIALIZER(nxprov_head);

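/*
 * All nexus instances are kept in nx_head, a red-black tree ordered by
 * nx_cmp.  Lookups (e.g. the RB_FIND() in nxctl_get_channel_list() below)
 * match on the nexus UUID alone, using a stack-local template with just
 * nx_uuid filled in.
 */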
static int nx_cmp(const struct kern_nexus *, const struct kern_nexus *);
RB_HEAD(kern_nexus_tree, kern_nexus);
RB_PROTOTYPE_SC(static, kern_nexus_tree, kern_nexus, nx_link, nx_cmp);
RB_GENERATE(kern_nexus_tree, kern_nexus, nx_link, nx_cmp);
static struct kern_nexus_tree   nx_head;

static int nxctl_get_nexus_prov_list(struct nxctl *, struct sockopt *);
static int nxctl_get_nexus_prov_entry(struct nxctl *, struct sockopt *);
static int nxctl_get_nexus_list(struct nxctl *, struct sockopt *);
static int nxctl_nexus_bind(struct nxctl *, struct sockopt *);
static int nxctl_nexus_unbind(struct nxctl *, struct sockopt *);
static int nxctl_nexus_config(struct nxctl *, struct sockopt *);
static int nxctl_get_channel_list(struct nxctl *, struct sockopt *);
static void nxctl_retain_locked(struct nxctl *);
static int nxctl_release_locked(struct nxctl *);
static void nxctl_init(struct nxctl *, struct proc *, struct fileproc *);
static struct nxctl *nxctl_alloc(struct proc *, struct fileproc *, zalloc_flags_t);
static void nxctl_free(struct nxctl *);

static struct kern_nexus_provider *nxprov_create_common(struct nxctl *,
    struct kern_nexus_domain_provider *, struct nxprov_reg *,
    const struct kern_nexus_provider_init *init, int *);
static void nxprov_detach(struct kern_nexus_provider *, boolean_t);
static void nxprov_retain_locked(struct kern_nexus_provider *);
static int nxprov_release_locked(struct kern_nexus_provider *);
static struct kern_nexus_provider *nxprov_alloc(
	struct kern_nexus_domain_provider *, zalloc_flags_t);
static void nxprov_free(struct kern_nexus_provider *);

static int nx_init_rings(struct kern_nexus *, struct kern_channel *);
static void nx_fini_rings(struct kern_nexus *, struct kern_channel *);
static int nx_init_slots(struct kern_nexus *, struct __kern_channel_ring *);
static void nx_fini_slots(struct kern_nexus *, struct __kern_channel_ring *);
static struct kern_nexus *nx_alloc(zalloc_flags_t);
static void nx_free(struct kern_nexus *);

static SKMEM_TYPE_DEFINE(nxctl_zone, struct nxctl);

static SKMEM_TYPE_DEFINE(nxbind_zone, struct nxbind);

static SKMEM_TYPE_DEFINE(nxprov_zone, struct kern_nexus_provider);

static SKMEM_TYPE_DEFINE(nxprov_params_zone, struct nxprov_params);

static SKMEM_TYPE_DEFINE(nx_zone, struct kern_nexus);

static int __nx_inited = 0;

#define SKMEM_TAG_NX_KEY        "com.apple.skywalk.nexus.key"
SKMEM_TAG_DEFINE(skmem_tag_nx_key, SKMEM_TAG_NX_KEY);

#define SKMEM_TAG_NX_MIB        "com.apple.skywalk.nexus.mib"
static SKMEM_TAG_DEFINE(skmem_tag_nx_mib, SKMEM_TAG_NX_MIB);

#define SKMEM_TAG_NX_PORT        "com.apple.skywalk.nexus.port"
SKMEM_TAG_DEFINE(skmem_tag_nx_port, SKMEM_TAG_NX_PORT);

#define SKMEM_TAG_NX_PORT_INFO        "com.apple.skywalk.nexus.port.info"
SKMEM_TAG_DEFINE(skmem_tag_nx_port_info, SKMEM_TAG_NX_PORT_INFO);

/*
 * Special nexus controller handles for Skywalk internal use.  Unlike all
 * other nexus controller handles, which are created by userland or kernel
 * clients, these never get closed or freed.  They are also not part of
 * the global nxctl_head list.
 */
static struct nxctl _kernnxctl;
static struct nxctl _usernxctl;
struct nexus_controller kernnxctl = { .ncd_nxctl = &_kernnxctl };
struct nexus_controller usernxctl = { .ncd_nxctl = &_usernxctl };

/*
 * -fbounds-safety: For static functions where additional size variables are
 * added, we need to mark them __unused if this file is being built without
 * -fbounds-safety.
 */
#if !__has_ptrcheck
#define NX_FB_ARG __unused
#else
#define NX_FB_ARG
#endif
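
/*
 * Illustrative sketch only (not a declaration from this file): a helper
 * whose size parameter exists solely for -fbounds-safety annotations
 * would be written as
 *
 *	static void
 *	foo_copy(uint8_t *__sized_by(len) buf, size_t NX_FB_ARG len);
 *
 * so that 'len' is marked __unused when the file is compiled without
 * -fbounds-safety (i.e. when __has_ptrcheck is false).
 */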

int
nexus_init(void)
{
	SK_LOCK_ASSERT_HELD();
	ASSERT(!__nx_inited);

	RB_INIT(&nx_head);

	na_init();

	/* attach system built-in domains and domain providers */
	nxdom_attach_all();

	/*
	 * Initialize the private kernel and shared user nexus controller
	 * handles:
	 *
	 * The shared kernel controller is used internally for creating
	 * nexus providers and nexus instances from within the Skywalk
	 * code (e.g. netif_compat).
	 *
	 * The shared user controller is used by userspace clients
	 * (e.g. libnetcore) that would like to configure nexus instances
	 * for use cases like configuring a flow entry that they own
	 * indirectly (e.g. via NECP), so that the nexus would perform
	 * permission checks based on other info (e.g. PID, UUID) and
	 * bypass the nxctl check (this nxctl has no credentials).
	 */
	nxctl_init(&_kernnxctl, kernproc, NULL);
	nxctl_retain_locked(&_kernnxctl);       /* one for us */
	nxctl_init(&_usernxctl, kernproc, NULL);
	nxctl_retain_locked(&_usernxctl);       /* one for us */
	nxctl_traffic_rule_init();

	__nx_inited = 1;

	return 0;
}

void
nexus_fini(void)
{
	SK_LOCK_ASSERT_HELD();

	if (__nx_inited) {
		nxctl_traffic_rule_fini();
		nxctl_release_locked(&_kernnxctl);
		nxctl_release_locked(&_usernxctl);

		/* tell all domains they're going away */
		nxdom_detach_all();

		ASSERT(RB_EMPTY(&nx_head));

		na_fini();

		__nx_inited = 0;
	}
}

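/*
 * Allocate a nexus controller handle for a userland or kernel client and
 * link it onto the global nxctl_head list.  On return the handle holds
 * two references: one for being on the list and one for the caller.
 * Note that *err is only consulted (never set) here, so callers must
 * initialize it before calling.
 */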
struct nxctl *
nxctl_create(struct proc *p, struct fileproc *fp, const uuid_t nxctl_uuid,
    int *err)
{
	struct nxctl *nxctl = NULL;

	ASSERT(!uuid_is_null(nxctl_uuid));

	/* privilege checks would be done when performing nxctl operations */

	SK_LOCK();

	nxctl = nxctl_alloc(p, fp, Z_WAITOK);

	STAILQ_INSERT_TAIL(&nxctl_head, nxctl, nxctl_link);
	nxctl->nxctl_flags |= NEXUSCTLF_ATTACHED;
	uuid_copy(nxctl->nxctl_uuid, nxctl_uuid);

	nxctl_retain_locked(nxctl);     /* one for being in the list */
	nxctl_retain_locked(nxctl);     /* one for the caller */

#if SK_LOG
	uuid_string_t uuidstr;
	SK_D("nxctl 0x%llx UUID %s", SK_KVA(nxctl),
	    sk_uuid_unparse(nxctl->nxctl_uuid, uuidstr));
#endif /* SK_LOG */

	SK_UNLOCK();

	if (*err != 0) {
		nxctl_free(nxctl);
		nxctl = NULL;
	}
	return nxctl;
}

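/*
 * Tear down a nexus controller handle: drop its file descriptor
 * reference, unlink it from nxctl_head, and close every nexus provider
 * that it owns.  Because nxprov_close() may drop SK_LOCK, the provider
 * scan restarts from the head of the list after each close.
 */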
void
nxctl_close(struct nxctl *nxctl)
{
	struct kern_nexus_provider *nxprov = NULL, *tnxprov;

	lck_mtx_lock(&nxctl->nxctl_lock);
	SK_LOCK();

	ASSERT(!(nxctl->nxctl_flags & NEXUSCTLF_KERNEL));

#if SK_LOG
	uuid_string_t uuidstr;
	SK_D("nxctl 0x%llx UUID %s flags 0x%b", SK_KVA(nxctl),
	    sk_uuid_unparse(nxctl->nxctl_uuid, uuidstr),
	    nxctl->nxctl_flags, NEXUSCTLF_BITS);
#endif /* SK_LOG */

	if (!(nxctl->nxctl_flags & NEXUSCTLF_NOFDREF)) {
		nxctl->nxctl_flags |= NEXUSCTLF_NOFDREF;
		nxctl->nxctl_fp = NULL;
	}

	/* may be called as part of failure cleanup, so check */
	if (nxctl->nxctl_flags & NEXUSCTLF_ATTACHED) {
		/* caller must hold an extra ref */
		ASSERT(nxctl->nxctl_refcnt > 1);
		(void) nxctl_release_locked(nxctl);

		STAILQ_REMOVE(&nxctl_head, nxctl, nxctl, nxctl_link);
		nxctl->nxctl_flags &= ~NEXUSCTLF_ATTACHED;
	}

repeat:
	STAILQ_FOREACH_SAFE(nxprov, &nxprov_head, nxprov_link, tnxprov) {
		/*
		 * Close provider only for those which are owned by
		 * this control instance.  Note that if we close the
		 * provider, we need to repeat this search as the
		 * list might have been changed by another thread.
		 * That's possible since SK_UNLOCK() may be called
		 * as a result of calling nxprov_close().
		 */
		if (!(nxprov->nxprov_flags & NXPROVF_CLOSED) &&
		    nxprov->nxprov_ctl == nxctl) {
			nxprov_retain_locked(nxprov);
			(void) nxprov_close(nxprov, TRUE);
			(void) nxprov_release_locked(nxprov);
			goto repeat;
		}
	}

	SK_UNLOCK();
	lck_mtx_unlock(&nxctl->nxctl_lock);
	nxctl_traffic_rule_clean(nxctl);
}

int
nxctl_set_opt(struct nxctl *nxctl, struct sockopt *sopt)
{
#pragma unused(nxctl)
	int err = 0;

	NXCTL_LOCK_ASSERT_HELD(nxctl);

	if (sopt->sopt_dir != SOPT_SET) {
		sopt->sopt_dir = SOPT_SET;
	}

	switch (sopt->sopt_name) {
	case NXOPT_NEXUS_BIND:
		err = nxctl_nexus_bind(nxctl, sopt);
		break;

	case NXOPT_NEXUS_UNBIND:
		err = nxctl_nexus_unbind(nxctl, sopt);
		break;

	case NXOPT_NEXUS_CONFIG:
		err = nxctl_nexus_config(nxctl, sopt);
		break;

	default:
		err = ENOPROTOOPT;
		break;
	}

	return err;
}

int
nxctl_get_opt(struct nxctl *nxctl, struct sockopt *sopt)
{
#pragma unused(nxctl)
	int err = 0;

	NXCTL_LOCK_ASSERT_HELD(nxctl);

	if (sopt->sopt_dir != SOPT_GET) {
		sopt->sopt_dir = SOPT_GET;
	}

	switch (sopt->sopt_name) {
	case NXOPT_NEXUS_PROV_LIST:
		err = nxctl_get_nexus_prov_list(nxctl, sopt);
		break;

	case NXOPT_NEXUS_PROV_ENTRY:
		err = nxctl_get_nexus_prov_entry(nxctl, sopt);
		break;

	case NXOPT_NEXUS_LIST:
		err = nxctl_get_nexus_list(nxctl, sopt);
		break;

	case NXOPT_CHANNEL_LIST:
		err = nxctl_get_channel_list(nxctl, sopt);
		break;

	default:
		err = ENOPROTOOPT;
		break;
	}

	return err;
}

/* Upper bound on # of nrl_num_regs that we'd return to user space */
#define MAX_NUM_REG_ENTRIES     256

/* Hoisted out of line to reduce kernel stack footprint */
SK_NO_INLINE_ATTRIBUTE
static int
nxctl_get_nexus_prov_list(struct nxctl *nxctl, struct sockopt *sopt)
{
	user_addr_t tmp_ptr = USER_ADDR_NULL;
	struct nxprov_reg_ent *pnre, *nres = NULL;
	struct nxprov_list_req nrlr;
	struct kern_nexus_provider *nxprov = NULL;
	uint32_t nregs = 0, ncregs = 0;
	int err = 0, observeall;
	size_t nres_sz;

	NXCTL_LOCK_ASSERT_HELD(nxctl);

	ASSERT(sopt->sopt_p != NULL);
	if (sopt->sopt_val == USER_ADDR_NULL) {
		return EINVAL;
	}

	err = sooptcopyin(sopt, &nrlr, sizeof(nrlr), sizeof(nrlr));
	if (err != 0) {
		return err;
	}

	if ((size_t)nrlr.nrl_num_regs > MAX_NUM_REG_ENTRIES) {
		nrlr.nrl_num_regs = MAX_NUM_REG_ENTRIES;
	}

	/*
	 * If the caller specified a buffer, copy out the Nexus provider
	 * entries to the caller gracefully.  We only copy out as many
	 * entries as the caller has asked for, and report back the
	 * number of entries actually copied.
	 */
	tmp_ptr = nrlr.nrl_regs;
	if (tmp_ptr != USER_ADDR_NULL && nrlr.nrl_num_regs > 0) {
		nres_sz = (size_t)nrlr.nrl_num_regs * sizeof(*nres);
		nres = sk_alloc_data(nres_sz, Z_WAITOK, skmem_tag_sysctl_buf);
		if (__improbable(nres == NULL)) {
			return ENOBUFS;
		}
	}

	observeall = (skywalk_priv_check_cred(sopt->sopt_p, nxctl->nxctl_cred,
	    PRIV_SKYWALK_OBSERVE_ALL) == 0);

	SK_LOCK();
	/*
	 * Walk the provider list; while buffer space remains, copy out
	 * the provider entries.
	 */
	nregs = nrlr.nrl_num_regs;
	pnre = nres;

	STAILQ_FOREACH(nxprov, &nxprov_head, nxprov_link) {
		/*
		 * Return only entries that are visible to the caller,
		 * unless it has PRIV_SKYWALK_OBSERVE_ALL.
		 */
		if (nxprov->nxprov_ctl != nxctl && !observeall) {
			continue;
		}

		if (nres != NULL && nregs > 0) {
			uuid_copy(pnre->npre_prov_uuid, nxprov->nxprov_uuid);
			bcopy(nxprov->nxprov_params, &pnre->npre_prov_params,
			    sizeof(struct nxprov_params));
			--nregs;
			++pnre;
			++ncregs;
		}
	}
	SK_UNLOCK();

	if (ncregs == 0) {
		err = ENOENT;
	}

	if (nres != NULL) {
		if (err == 0 && tmp_ptr != USER_ADDR_NULL) {
			if (sopt->sopt_p != kernproc) {
				err = copyout(nres, tmp_ptr,
				    ncregs * sizeof(*nres));
			} else {
				caddr_t tmp;
				tmp = __unsafe_forge_bidi_indexable(caddr_t,
				    CAST_DOWN(caddr_t, tmp_ptr),
				    ncregs * sizeof(*nres));
				bcopy(nres, tmp, ncregs * sizeof(*nres));
			}
		}
		sk_free_data(nres, nres_sz);
		nres = NULL;
	}

	if (err == 0) {
		nrlr.nrl_num_regs = ncregs;
		err = sooptcopyout(sopt, &nrlr, sizeof(nrlr));
	}

	return err;
}

/* Hoisted out of line to reduce kernel stack footprint */
SK_NO_INLINE_ATTRIBUTE
static int
nxctl_get_nexus_prov_entry(struct nxctl *nxctl, struct sockopt *sopt)
{
	struct nxprov_reg_ent nre;
	struct kern_nexus_provider *nxprov = NULL;
	int err = 0;

	NXCTL_LOCK_ASSERT_HELD(nxctl);

	ASSERT(sopt->sopt_p != NULL);
	if (sopt->sopt_val == USER_ADDR_NULL) {
		return EINVAL;
	}

	bzero(&nre, sizeof(nre));
	err = sooptcopyin(sopt, &nre, sizeof(nre), sizeof(nre));
	if (err != 0) {
		return err;
	}

	if (uuid_is_null(nre.npre_prov_uuid)) {
		return EINVAL;
	}

	SK_LOCK();
	STAILQ_FOREACH(nxprov, &nxprov_head, nxprov_link) {
		if (uuid_compare(nxprov->nxprov_uuid,
		    nre.npre_prov_uuid) == 0) {
			/*
			 * Return only entries that are visible to the caller,
			 * unless it has PRIV_SKYWALK_OBSERVE_ALL.
			 */
			if (nxprov->nxprov_ctl != nxctl) {
				if (skywalk_priv_check_cred(sopt->sopt_p,
				    nxctl->nxctl_cred,
				    PRIV_SKYWALK_OBSERVE_ALL) != 0) {
					nxprov = NULL;
					break;
				}
			}

			bcopy(nxprov->nxprov_params, &nre.npre_prov_params,
			    sizeof(struct nxprov_params));
			break;
		}
	}
	SK_UNLOCK();

	if (nxprov != NULL) {
		err = sooptcopyout(sopt, &nre, sizeof(nre));
	} else {
		err = ENOENT;
	}

	return err;
}

/* Upper bound on # of nl_num_nx_uuids that we'd return to user space */
#define MAX_NUM_NX_UUIDS        4096
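
/*
 * nxctl_get_nexus_list() supports the usual two-pass sizing idiom: the
 * total number of nexus instances (ncuuids) is counted regardless of how
 * much buffer space was supplied, and is always reported back in
 * nl_num_nx_uuids.  A hypothetical caller sketch (fetch_opt stands in for
 * whatever wrapper issues the NXOPT_NEXUS_LIST request; it is not part of
 * this file):
 *
 *	struct nx_list_req nlr = { .nl_num_nx_uuids = 0 };
 *	uuid_copy(nlr.nl_prov_uuid, prov_uuid);
 *	fetch_opt(NXOPT_NEXUS_LIST, &nlr);	// learn the required count
 *	nlr.nl_nx_uuids = buf_addr;		// buffer sized accordingly
 *	fetch_opt(NXOPT_NEXUS_LIST, &nlr);	// fetch the UUIDs
 */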

/* Hoisted out of line to reduce kernel stack footprint */
SK_NO_INLINE_ATTRIBUTE
static int
nxctl_get_nexus_list(struct nxctl *nxctl, struct sockopt *sopt)
{
	user_addr_t tmp_ptr = USER_ADDR_NULL;
	uint32_t nuuids = 0, ncuuids = 0;
	uuid_t *puuid, *uuids = NULL;
	size_t uuids_sz;
	struct nx_list_req nlr;
	struct kern_nexus_provider *nxprov = NULL;
	struct kern_nexus *nx = NULL;
	int err = 0, observeall;

	NXCTL_LOCK_ASSERT_HELD(nxctl);

	ASSERT(sopt->sopt_p != NULL);
	if (sopt->sopt_val == USER_ADDR_NULL) {
		return EINVAL;
	}

	err = sooptcopyin(sopt, &nlr, sizeof(nlr), sizeof(nlr));
	if (err != 0) {
		return err;
	}

	if (uuid_is_null(nlr.nl_prov_uuid)) {
		return EINVAL;
	} else if ((size_t)nlr.nl_num_nx_uuids > MAX_NUM_NX_UUIDS) {
		nlr.nl_num_nx_uuids = MAX_NUM_NX_UUIDS;
	}

	/*
	 * If the caller specified a buffer, copy out the Nexus UUIDs to
	 * the caller gracefully.  We only copy out the number of UUIDs
	 * the caller has asked for, but we always tell the caller how
	 * big the buffer really needs to be.
	 */
	tmp_ptr = nlr.nl_nx_uuids;
	if (tmp_ptr != USER_ADDR_NULL && nlr.nl_num_nx_uuids > 0) {
		uuids_sz = (size_t)nlr.nl_num_nx_uuids * sizeof(uuid_t);
		uuids = sk_alloc_data(uuids_sz, Z_WAITOK, skmem_tag_sysctl_buf);
		if (__improbable(uuids == NULL)) {
			return ENOBUFS;
		}
	}

	observeall = (skywalk_priv_check_cred(sopt->sopt_p, nxctl->nxctl_cred,
	    PRIV_SKYWALK_OBSERVE_ALL) == 0);

	SK_LOCK();
	STAILQ_FOREACH(nxprov, &nxprov_head, nxprov_link) {
		/*
		 * Return only entries that are visible to the caller,
		 * unless it has PRIV_SKYWALK_OBSERVE_ALL.
		 */
		if (nxprov->nxprov_ctl != nxctl && !observeall) {
			continue;
		}

		if (uuid_compare(nxprov->nxprov_uuid, nlr.nl_prov_uuid) == 0) {
			break;
		}
	}

	if (nxprov != NULL) {
		/*
		 * Count the number of nexus instances.  If buffer space
		 * exists and remains, copy out the Nexus UUIDs.
		 */
		nuuids = nlr.nl_num_nx_uuids;
		puuid = uuids;

		STAILQ_FOREACH(nx, &nxprov->nxprov_nx_head, nx_prov_link) {
			++ncuuids;
			if (uuids != NULL && nuuids > 0) {
				uuid_copy(*puuid, nx->nx_uuid);
				--nuuids;
				++puuid;
			}
		}
	} else {
		err = ENOENT;
	}
	SK_UNLOCK();

	if (uuids != NULL) {
		if (err == 0 && nxprov != NULL && tmp_ptr != USER_ADDR_NULL) {
			uintptr_t cnt_uuid;

			/* Note: Pointer arithmetic */
			cnt_uuid = (uintptr_t)(puuid - uuids);
			if (cnt_uuid > 0) {
				if (sopt->sopt_p != kernproc) {
					err = copyout(uuids, tmp_ptr,
					    cnt_uuid * sizeof(uuid_t));
				} else {
					caddr_t tmp;
					tmp = __unsafe_forge_bidi_indexable(caddr_t,
					    CAST_DOWN(caddr_t, tmp_ptr),
					    cnt_uuid * sizeof(uuid_t));
					bcopy(uuids, tmp,
					    cnt_uuid * sizeof(uuid_t));
				}
			}
		}
		sk_free_data(uuids, uuids_sz);
		uuids = NULL;
	}

	if (err == 0) {
		nlr.nl_num_nx_uuids = ncuuids;
		err = sooptcopyout(sopt, &nlr, sizeof(nlr));
	}

	return err;
}

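/*
 * Handle NXOPT_NEXUS_BIND: validate the requested match criteria
 * (PID, executable UUID, and/or key), build an nxbind describing them,
 * and ask the nexus domain to bind that identity to a nexus port.  The
 * domain may move the nxbind contents (including the key) into its own
 * instance, in which case the local copy freed below no longer owns them.
 */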
/* Hoisted out of line to reduce kernel stack footprint */
SK_NO_INLINE_ATTRIBUTE
static int
nxctl_nexus_bind(struct nxctl *nxctl, struct sockopt *sopt)
{
	boolean_t m_pid, m_exec_uuid, m_key;
	struct nx_bind_req nbr;
	struct proc *p = PROC_NULL;
	struct nxbind *nxb = NULL;
	uint64_t p_uniqueid = -1;
	pid_t p_pid = -1;
	struct kern_nexus *nx = NULL;
#if SK_LOG
	uuid_string_t exec_uuidstr;
#endif /* SK_LOG */
	uuid_t p_uuid;
	void *key = NULL;
	int err = 0;

	NXCTL_LOCK_ASSERT_HELD(nxctl);

	if (sopt->sopt_val == USER_ADDR_NULL) {
		return EINVAL;
	}

	uuid_clear(p_uuid);
	bzero(&nbr, sizeof(nbr));
	err = sooptcopyin(sopt, &nbr, sizeof(nbr), sizeof(nbr));
	if (err != 0) {
		return err;
	}

	if (uuid_is_null(nbr.nb_nx_uuid)) {
		err = EINVAL;
		goto done_unlocked;
	}

	nbr.nb_flags &= NBR_MATCH_MASK;
	if (nbr.nb_flags == 0) {
		/* must choose one of the match criteria */
		err = EINVAL;
		goto done_unlocked;
	}
	m_pid = !!(nbr.nb_flags & NBR_MATCH_PID);
	m_exec_uuid = !!(nbr.nb_flags & NBR_MATCH_EXEC_UUID);
	m_key = !!(nbr.nb_flags & NBR_MATCH_KEY);

	if (m_pid || m_exec_uuid) {
		/*
		 * Validate process ID.  A valid PID is needed when we're
		 * asked to match by PID, or if asked to match by executable
		 * UUID with a NULL nb_exec_uuid supplied.  The latter is
		 * to support the case when a userland Nexus provider isn't
		 * able to acquire its client's executable UUID, but is
		 * able to identify it via PID.
		 */
		if ((m_pid || uuid_is_null(nbr.nb_exec_uuid)) &&
		    (p = proc_find(nbr.nb_pid)) == PROC_NULL) {
			err = ESRCH;
			goto done_unlocked;
		}
		/* exclude kernel from the match criteria */
		if (p == kernproc) {
			err = EACCES;
			goto done_unlocked;
		} else if (p != PROC_NULL) {
			proc_getexecutableuuid(p, p_uuid, sizeof(p_uuid));
			p_uniqueid = proc_uniqueid(p);
			p_pid = proc_pid(p);
		} else {
			uuid_copy(p_uuid, nbr.nb_exec_uuid);
		}
	}

	if (m_key) {
		if (nbr.nb_key_len == 0 || nbr.nb_key_len > NEXUS_MAX_KEY_LEN ||
		    nbr.nb_key == USER_ADDR_NULL) {
			err = EINVAL;
			goto done_unlocked;
		}

		key = sk_alloc_data(nbr.nb_key_len, Z_WAITOK, skmem_tag_nx_key);
		if (__improbable(key == NULL)) {
			err = ENOMEM;
			goto done_unlocked;
		}

		if (sopt->sopt_p != kernproc) {
			err = copyin(nbr.nb_key, key, nbr.nb_key_len);
			if (err != 0) {
				goto done_unlocked;
			}
		} else {
			/*
			 * -fbounds-safety: nbr.nb_key is user_addr_t. Changing
			 * it to a pointer type is risky, so we just forge it
			 * here instead.
			 */
			void *nb_key = __unsafe_forge_bidi_indexable(void *,
			    nbr.nb_key, nbr.nb_key_len);
			bcopy(nb_key, key, nbr.nb_key_len);
		}
	}

	SK_LOCK();
	nx = nx_find(nbr.nb_nx_uuid, TRUE);
	if (nx == NULL || (disable_nxctl_check == 0 &&
	    nx->nx_prov->nxprov_ctl != nxctl &&
	    nxctl != &_kernnxctl)) {    /* make exception for kernnxctl */
		err = ENOENT;
		goto done;
	}

	/* bind isn't applicable to an anonymous nexus provider */
	if (NX_ANONYMOUS_PROV(nx)) {
		err = ENXIO;
		goto done;
	}

	/* port must be within the domain's range */
	if (nbr.nb_port != NEXUS_PORT_ANY &&
	    nbr.nb_port >= NXDOM_MAX(NX_DOM(nx), ports)) {
		err = EDOM;
		goto done;
	} else if (nbr.nb_port == NEXUS_PORT_ANY) {
		/* for now, this is allowed only for kernel clients */
		if (sopt->sopt_p != kernproc) {
			err = EPERM;
			goto done;
		}
	}

	nxb = nxb_alloc(Z_WAITOK);

	if (m_pid) {
		nxb->nxb_flags |= NXBF_MATCH_UNIQUEID;
		nxb->nxb_uniqueid = p_uniqueid;
		nxb->nxb_pid = p_pid;
	}
	if (m_exec_uuid) {
		nxb->nxb_flags |= NXBF_MATCH_EXEC_UUID;
		ASSERT(!uuid_is_null(p_uuid));
		uuid_copy(nxb->nxb_exec_uuid, p_uuid);
	}
	if (m_key) {
		nxb->nxb_flags |= NXBF_MATCH_KEY;
		ASSERT(key != NULL);
		ASSERT(nbr.nb_key_len != 0 &&
		    nbr.nb_key_len <= NEXUS_MAX_KEY_LEN);
		/*
		 * -fbounds-safety: since nxb_key is __sized_by(nxb_key_len),
		 * its assignment needs to be done alongside that of
		 * nxb_key_len.
		 */
		nxb->nxb_key = key;
		key = NULL;     /* let nxb_free() free it */
		nxb->nxb_key_len = nbr.nb_key_len;
	}

	/*
	 * Bind the creds to the nexus port.  If the client doesn't have a
	 * port, find one, claim it, and associate the creds to it.  Upon
	 * success, the nexus may move the nxbind contents (including the
	 * key) to its own nxbind instance; in that case, nxb_free() below
	 * will not be freeing the key within.
	 */
	err = NX_DOM(nx)->nxdom_bind_port(nx, &nbr.nb_port, nxb, NULL);
	if (err != 0) {
		goto done;
	}

	ASSERT(nbr.nb_port != NEXUS_PORT_ANY);
	(void) sooptcopyout(sopt, &nbr, sizeof(nbr));

	SK_D("nexus 0x%llx nxb 0x%llx port %u flags 0x%b pid %d "
	    "(uniqueid %llu) exec_uuid %s key 0x%llx key_len %u",
	    SK_KVA(nx), SK_KVA(nxb), nbr.nb_port, nxb->nxb_flags,
	    NXBF_BITS, nxb->nxb_pid, nxb->nxb_uniqueid,
	    sk_uuid_unparse(nxb->nxb_exec_uuid, exec_uuidstr),
	    (nxb->nxb_key != NULL) ? SK_KVA(nxb->nxb_key) : 0,
	    nxb->nxb_key_len);

done:
	if (nx != NULL) {
		(void) nx_release_locked(nx);
		nx = NULL;
	}
	SK_UNLOCK();

done_unlocked:
	ASSERT(nx == NULL);

	if (nxb != NULL) {
		nxb_free(nxb);
		nxb = NULL;
	}
	if (key != NULL) {
		sk_free_data(key, nbr.nb_key_len);
		key = NULL;
	}
	if (p != PROC_NULL) {
		proc_rele(p);
	}

	return err;
}

/* Hoisted out of line to reduce kernel stack footprint */
SK_NO_INLINE_ATTRIBUTE
static int
nxctl_nexus_unbind(struct nxctl *nxctl, struct sockopt *sopt)
{
	struct nx_unbind_req nur;
	struct kern_nexus *nx = NULL;
	int err = 0;

	NXCTL_LOCK_ASSERT_HELD(nxctl);

	if (sopt->sopt_val == USER_ADDR_NULL) {
		return EINVAL;
	}

	bzero(&nur, sizeof(nur));
	err = sooptcopyin(sopt, &nur, sizeof(nur), sizeof(nur));
	if (err != 0) {
		return err;
	}

	if (uuid_is_null(nur.nu_nx_uuid)) {
		return EINVAL;
	}

	SK_LOCK();
	nx = nx_find(nur.nu_nx_uuid, TRUE);
	if (nx == NULL || (nx->nx_prov->nxprov_ctl != nxctl &&
	    nxctl != &_kernnxctl)) {    /* make exception for kernnxctl */
		err = ENOENT;
		goto done;
	}

	/* unbind isn't applicable to an anonymous nexus provider */
	if (NX_ANONYMOUS_PROV(nx)) {
		err = ENXIO;
		goto done;
	}

	if (nur.nu_port == NEXUS_PORT_ANY) {
		err = EINVAL;
		goto done;
	}

	err = NX_DOM(nx)->nxdom_unbind_port(nx, nur.nu_port);

done:
	if (nx != NULL) {
		(void) nx_release_locked(nx);
		nx = NULL;
	}
	SK_UNLOCK();

	return err;
}

/* Hoisted out of line to reduce kernel stack footprint */
SK_NO_INLINE_ATTRIBUTE
static int
nxctl_nexus_config(struct nxctl *nxctl, struct sockopt *sopt)
{
	struct kern_nexus *nx = NULL;
	struct nx_cfg_req ncr;
	int err = 0;

	NXCTL_LOCK_ASSERT_HELD(nxctl);

	if (sopt->sopt_val == USER_ADDR_NULL) {
		return EINVAL;
	}

	bzero(&ncr, sizeof(ncr));
	err = sooptcopyin(sopt, &ncr, sizeof(ncr), sizeof(ncr));
	if (err != 0) {
		return err;
	}

	if (uuid_is_null(ncr.nc_nx_uuid)) {
		return EINVAL;
	}

	SK_LOCK();
	nx = nx_find(ncr.nc_nx_uuid, TRUE);
	if (nx == NULL || (disable_nxctl_check == 0 &&
	    nx->nx_prov->nxprov_ctl != nxctl &&
	    nxctl != &_kernnxctl &&    /* allow kernel/shared user nxctl */
	    nxctl != &_usernxctl)) {
		err = ENOENT;
		goto done;
	}

	if (NX_DOM_PROV(nx)->nxdom_prov_config != NULL) {
		err = NX_DOM_PROV(nx)->nxdom_prov_config(NX_DOM_PROV(nx),
		    nx, &ncr, sopt->sopt_dir, sopt->sopt_p, nxctl->nxctl_cred);
	} else {
		err = EPERM;
	}

	if (err == 0) {
		(void) sooptcopyout(sopt, &ncr, sizeof(ncr));
	}
done:
	if (nx != NULL) {
		(void) nx_release_locked(nx);
		nx = NULL;
	}
	SK_UNLOCK();

	return err;
}

struct nxbind *
nxb_alloc(zalloc_flags_t how)
{
	struct nxbind *nxb = zalloc_flags(nxbind_zone, how | Z_ZERO);

	if (nxb) {
		SK_DF(SK_VERB_MEM, "nxb 0x%llx ALLOC", SK_KVA(nxb));
	}
	return nxb;
}

void
nxb_free(struct nxbind *nxb)
{
	SK_DF(SK_VERB_MEM, "nxb 0x%llx key 0x%llx FREE", SK_KVA(nxb),
	    (nxb->nxb_key != NULL) ? SK_KVA(nxb->nxb_key) : 0);

	if (nxb->nxb_key != NULL) {
		sk_free_data_sized_by(nxb->nxb_key, nxb->nxb_key_len);
		nxb->nxb_key = NULL;
		nxb->nxb_key_len = 0;
	}
	zfree(nxbind_zone, nxb);
}

/*
 * nxb0 is assumed to hold the authoritative binding; compare nxb1
 * against it.
 */
boolean_t
nxb_is_equal(struct nxbind *nxb0, struct nxbind *nxb1)
{
	ASSERT(nxb0 != NULL && nxb1 != NULL);
	ASSERT(nxb0 != nxb1);

	/* we always compare using uniqueid and not pid */
	if ((nxb0->nxb_flags & NXBF_MATCH_UNIQUEID) &&
	    nxb1->nxb_uniqueid != nxb0->nxb_uniqueid) {
		return FALSE;
	}

	if ((nxb0->nxb_flags & NXBF_MATCH_EXEC_UUID) &&
	    uuid_compare(nxb1->nxb_exec_uuid, nxb0->nxb_exec_uuid) != 0) {
		return FALSE;
	}

	ASSERT(!(nxb0->nxb_flags & NXBF_MATCH_KEY) ||
	    (nxb0->nxb_key_len != 0 && nxb0->nxb_key != NULL));

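	/* compare keys in constant time to avoid leaking key bytes via timing */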
	if ((nxb0->nxb_flags & NXBF_MATCH_KEY) &&
	    (nxb0->nxb_key_len != nxb1->nxb_key_len ||
	    nxb1->nxb_key == NULL || timingsafe_bcmp(nxb1->nxb_key, nxb0->nxb_key,
	    nxb1->nxb_key_len) != 0)) {
		return FALSE;
	}

	return TRUE;
}

void
nxb_move(struct nxbind *snxb, struct nxbind *dnxb)
{
	ASSERT(!(snxb->nxb_flags & NXBF_MATCH_KEY) ||
	    (snxb->nxb_key_len != 0 && snxb->nxb_key != NULL));

	/* in case the destination has a key attached, free it first */
	if (dnxb->nxb_key != NULL) {
		sk_free_data_sized_by(dnxb->nxb_key, dnxb->nxb_key_len);
		dnxb->nxb_key = NULL;
		dnxb->nxb_key_len = 0;
	}

	/* move everything from src to dst, and then wipe out src */
	bcopy(snxb, dnxb, sizeof(*dnxb));
	bzero(snxb, sizeof(*snxb));
}

/* Upper bound on # of cl_num_ch_uuids that we'd return to user space */
#define MAX_NUM_CH_UUIDS        4096

/* Hoisted out of line to reduce kernel stack footprint */
SK_NO_INLINE_ATTRIBUTE
static int
nxctl_get_channel_list(struct nxctl *nxctl, struct sockopt *sopt)
{
	user_addr_t tmp_ptr = USER_ADDR_NULL;
	uint32_t nuuids = 0, ncuuids = 0;
	uuid_t *puuid, *uuids = NULL;
	size_t uuids_sz;
	struct ch_list_req clr;
	struct kern_channel *ch = NULL;
	struct kern_nexus *nx = NULL;
	struct kern_nexus find;
	int err = 0, observeall;

	NXCTL_LOCK_ASSERT_HELD(nxctl);

	ASSERT(sopt->sopt_p != NULL);
	if (sopt->sopt_val == USER_ADDR_NULL) {
		return EINVAL;
	}

	err = sooptcopyin(sopt, &clr, sizeof(clr), sizeof(clr));
	if (err != 0) {
		return err;
	}

	if (uuid_is_null(clr.cl_nx_uuid)) {
		return EINVAL;
	} else if ((size_t)clr.cl_num_ch_uuids > MAX_NUM_CH_UUIDS) {
		clr.cl_num_ch_uuids = MAX_NUM_CH_UUIDS;
	}

	/*
	 * If the caller specified a buffer, copy out the Channel UUIDs to
	 * the caller gracefully.  We only copy out the number of UUIDs
	 * the caller has asked for, but we always tell the caller how
	 * big the buffer really needs to be.
	 */
	tmp_ptr = clr.cl_ch_uuids;
	if (tmp_ptr != USER_ADDR_NULL && clr.cl_num_ch_uuids > 0) {
		uuids_sz = (size_t)clr.cl_num_ch_uuids * sizeof(uuid_t);
		uuids = sk_alloc_data(uuids_sz, Z_WAITOK, skmem_tag_sysctl_buf);
		if (uuids == NULL) {
			return ENOBUFS;
		}
	}

	observeall = (skywalk_priv_check_cred(sopt->sopt_p, nxctl->nxctl_cred,
	    PRIV_SKYWALK_OBSERVE_ALL) == 0);

	SK_LOCK();
	uuid_copy(find.nx_uuid, clr.cl_nx_uuid);
	nx = RB_FIND(kern_nexus_tree, &nx_head, &find);
	if (nx != NULL && NX_PROV(nx)->nxprov_ctl != nxctl && !observeall) {
		/*
		 * Return only entries that are visible to the caller,
		 * unless it has PRIV_SKYWALK_OBSERVE_ALL.
		 */
		nx = NULL;
	}
	if (nx != NULL) {
		/*
		 * Count the number of channels.  If buffer space exists
		 * and remains, copy out the Channel UUIDs.
		 */
		nuuids = clr.cl_num_ch_uuids;
		puuid = uuids;

		STAILQ_FOREACH(ch, &nx->nx_ch_head, ch_link) {
			++ncuuids;
			if (uuids != NULL && nuuids > 0) {
				uuid_copy(*puuid, ch->ch_info->cinfo_ch_id);
				--nuuids;
				++puuid;
			}
		}
	} else {
		err = ENOENT;
	}
	SK_UNLOCK();

	if (uuids != NULL) {
		if (err == 0 && nx != NULL && tmp_ptr != USER_ADDR_NULL) {
			uintptr_t cnt_uuid;

			/* Note: Pointer arithmetic */
			cnt_uuid = (uintptr_t)(puuid - uuids);
			ASSERT(cnt_uuid > 0);

			if (sopt->sopt_p != kernproc) {
				err = copyout(uuids, tmp_ptr,
				    cnt_uuid * sizeof(uuid_t));
			} else {
				caddr_t tmp;
				tmp = __unsafe_forge_bidi_indexable(caddr_t,
				    CAST_DOWN(caddr_t, tmp_ptr),
				    cnt_uuid * sizeof(uuid_t));
				bcopy(uuids, tmp, cnt_uuid * sizeof(uuid_t));
			}
		}
		sk_free_data(uuids, uuids_sz);
		uuids = NULL;
	}

	if (err == 0) {
		clr.cl_num_ch_uuids = ncuuids;
		err = sooptcopyout(sopt, &clr, sizeof(clr));
	}

	return err;
}

static void
nxctl_init(struct nxctl *nxctl, struct proc *p, struct fileproc *fp)
{
	uuid_t p_uuid;

	bzero(nxctl, sizeof(*nxctl));

	proc_getexecutableuuid(p, p_uuid, sizeof(p_uuid));

	lck_mtx_init(&nxctl->nxctl_lock, &nexus_lock_group, &nexus_lock_attr);
	uuid_copy(nxctl->nxctl_proc_uuid, p_uuid);
	nxctl->nxctl_proc_uniqueid = proc_uniqueid(p);
	nxctl->nxctl_cred = kauth_cred_proc_ref(p);
	nxctl->nxctl_fp = fp;
	if (nxctl == &_kernnxctl) {
		ASSERT(p == kernproc);
		nxctl->nxctl_flags |= NEXUSCTLF_KERNEL;
	}
	if (nxctl == &_usernxctl) {
		ASSERT(p == kernproc);
		nxctl->nxctl_cred = NULL;
	}
	if (fp == NULL) {
		nxctl->nxctl_flags |= NEXUSCTLF_NOFDREF;
	}
}

static struct nxctl *
nxctl_alloc(struct proc *p, struct fileproc *fp, zalloc_flags_t how)
{
	struct nxctl *nxctl = zalloc_flags(nxctl_zone, how);

	if (nxctl != NULL) {
		nxctl_init(nxctl, p, fp);
	}
	return nxctl;
}

static void
nxctl_free(struct nxctl *nxctl)
{
	ASSERT(nxctl->nxctl_refcnt == 0);
	ASSERT(!(nxctl->nxctl_flags & NEXUSCTLF_ATTACHED));
	kauth_cred_unref(&nxctl->nxctl_cred);
	lck_mtx_destroy(&nxctl->nxctl_lock, &nexus_lock_group);
	SK_D("nxctl 0x%llx FREE", SK_KVA(nxctl));
	if (!(nxctl->nxctl_flags & NEXUSCTLF_KERNEL)) {
		zfree(nxctl_zone, nxctl);
	}
}

static void
nxctl_retain_locked(struct nxctl *nxctl)
{
	SK_LOCK_ASSERT_HELD();

	nxctl->nxctl_refcnt++;
	ASSERT(nxctl->nxctl_refcnt != 0);
}

void
nxctl_retain(struct nxctl *nxctl)
{
	SK_LOCK();
	nxctl_retain_locked(nxctl);
	SK_UNLOCK();
}

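/*
 * Drop one reference to the nxctl; the handle is freed when the count
 * hits zero.  Like nxprov_release_locked(), returns nonzero iff this
 * call dropped the final reference.
 */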
static int
nxctl_release_locked(struct nxctl *nxctl)
{
	int oldref = nxctl->nxctl_refcnt;

	SK_LOCK_ASSERT_HELD();

	ASSERT(nxctl->nxctl_refcnt != 0);
	if (--nxctl->nxctl_refcnt == 0) {
		nxctl_free(nxctl);
	}

	return oldref == 1;
}

int
nxctl_release(struct nxctl *nxctl)
{
	int lastref;

	SK_LOCK();
	lastref = nxctl_release_locked(nxctl);
	SK_UNLOCK();

	return lastref;
}

/* XXX
 * -fbounds-safety: this previously took a void *; all callers pass an
 * nxctl, so it now takes struct nxctl * directly.  How come there's no
 * nxctl_ctor?
 */
void
nxctl_dtor(struct nxctl *arg)
{
	struct nxctl *nxctl = arg;

	nxctl_close(nxctl);
	SK_LOCK();
	(void) nxctl_release_locked(nxctl);
	SK_UNLOCK();
}

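/*
 * Invoke an external nexus provider's pre-connect and connected callbacks
 * for a new channel.  Both SK_LOCK and the channel lock are dropped across
 * the callouts (the channel is retained to keep it around); on failure,
 * any partially established state is unwound via
 * nxprov_advise_disconnect().
 */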
int
nxprov_advise_connect(struct kern_nexus *nx, struct kern_channel *ch,
    struct proc *p)
{
	struct kern_nexus_provider *nxprov = NX_PROV(nx);
	int err = 0;

	ASSERT(!(ch->ch_flags & (CHANF_EXT_PRECONNECT | CHANF_EXT_CONNECTED)));
	ASSERT(ch->ch_ctx == NULL);

	SK_LOCK_ASSERT_HELD();
	LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);

	/* monitor channels aren't externally visible/usable, so ignore */
	if ((ch->ch_info->cinfo_ch_mode & CHMODE_MONITOR) ||
	    (ch->ch_flags & CHANF_EXT_SKIP) ||
	    (nxprov->nxprov_ext.nxpi_pre_connect == NULL ||
	    nxprov->nxprov_ext.nxpi_connected == NULL)) {
		return 0;
	}

	ch_retain_locked(ch);
	lck_mtx_unlock(&ch->ch_lock);
	SK_UNLOCK();
	lck_mtx_lock(&ch->ch_lock);

	err = nxprov->nxprov_ext.nxpi_pre_connect(nxprov, p, nx,
	    ch->ch_info->cinfo_nx_port, ch, &ch->ch_ctx);
	if (err != 0) {
		SK_D("ch 0x%llx flags %b nx 0x%llx pre_connect "
		    "error %d", SK_KVA(ch), ch->ch_flags,
		    CHANF_BITS, SK_KVA(nx), err);
		ch->ch_ctx = NULL;
		goto done;
	}
	/*
	 * Upon ring/slot init failure, this is cleared
	 * by nxprov_advise_disconnect() below.
	 */
	os_atomic_or(&ch->ch_flags, CHANF_EXT_PRECONNECT, relaxed);
	if (NXPROV_LLINK(nxprov)) {
		err = nx_netif_llink_ext_init_default_queues(nx);
	} else {
		err = nx_init_rings(nx, ch);
	}
	if (err != 0) {
		goto done;
	}
	ASSERT(err == 0);
	ASSERT((ch->ch_flags & (CHANF_EXT_PRECONNECT |
	    CHANF_EXT_CONNECTED)) == CHANF_EXT_PRECONNECT);

	err = nxprov->nxprov_ext.nxpi_connected(nxprov, nx, ch);
	if (err != 0) {
		SK_D("ch 0x%llx flags %b nx 0x%llx connected error %d",
		    SK_KVA(ch), ch->ch_flags, CHANF_BITS, SK_KVA(nx), err);
		goto done;
	}
	os_atomic_or(&ch->ch_flags, CHANF_EXT_CONNECTED, relaxed);
	SK_D("ch 0x%llx flags %b nx 0x%llx connected",
	    SK_KVA(ch), ch->ch_flags, CHANF_BITS, SK_KVA(nx));

done:
	lck_mtx_unlock(&ch->ch_lock);
	SK_LOCK();
	lck_mtx_lock(&ch->ch_lock);
	if ((err != 0) &&
	    (ch->ch_flags & (CHANF_EXT_CONNECTED | CHANF_EXT_PRECONNECT))) {
		nxprov_advise_disconnect(nx, ch);
	}
	/* caller is expected to hold one, in addition to ourselves */
	VERIFY(ch->ch_refcnt >= 2);
	ch_release_locked(ch);

	return err;
}

void
nxprov_advise_disconnect(struct kern_nexus *nx, struct kern_channel *ch)
{
	struct kern_nexus_provider *nxprov = NX_PROV(nx);

	SK_LOCK_ASSERT_HELD();
	LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);

	/* check as we might be called in the error handling path */
	if (ch->ch_flags & (CHANF_EXT_CONNECTED | CHANF_EXT_PRECONNECT)) {
		ch_retain_locked(ch);
		lck_mtx_unlock(&ch->ch_lock);
		SK_UNLOCK();
		lck_mtx_lock(&ch->ch_lock);

		ASSERT(!(ch->ch_flags & CHANF_EXT_SKIP));
		if (ch->ch_flags & CHANF_EXT_CONNECTED) {
			nxprov->nxprov_ext.nxpi_pre_disconnect(nxprov, nx, ch);
			os_atomic_andnot(&ch->ch_flags, CHANF_EXT_CONNECTED, relaxed);
		}

		/*
		 * Inform the external domain provider that the rings
		 * and slots for this channel are no longer valid.
		 */
		if (NXPROV_LLINK(nxprov)) {
			nx_netif_llink_ext_fini_default_queues(nx);
		} else {
			nx_fini_rings(nx, ch);
		}

		ASSERT(ch->ch_flags & CHANF_EXT_PRECONNECT);
		nxprov->nxprov_ext.nxpi_disconnected(nxprov, nx, ch);
		os_atomic_andnot(&ch->ch_flags, CHANF_EXT_PRECONNECT, relaxed);

		SK_D("ch 0x%llx flags %b nx 0x%llx disconnected",
		    SK_KVA(ch), ch->ch_flags, CHANF_BITS, SK_KVA(nx));

		/* We're done with this channel */
		ch->ch_ctx = NULL;

		lck_mtx_unlock(&ch->ch_lock);
		SK_LOCK();
		lck_mtx_lock(&ch->ch_lock);
		/* caller is expected to hold one, in addition to ourselves */
		VERIFY(ch->ch_refcnt >= 2);
		ch_release_locked(ch);
	}
	ASSERT(!(ch->ch_flags & (CHANF_EXT_CONNECTED | CHANF_EXT_PRECONNECT)));
	ASSERT(ch->ch_ctx == NULL);
}

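/*
 * Common provider-creation path shared by nxprov_create() (userland) and
 * nxprov_create_kern() (kernel): validate the registration parameters
 * against the domain provider, allocate the provider, copy in any
 * external init callbacks, and link it onto nxprov_head with two
 * references held (one for the list, one for the caller).
 */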
static struct kern_nexus_provider *
nxprov_create_common(struct nxctl *nxctl,
    struct kern_nexus_domain_provider *nxdom_prov, struct nxprov_reg *reg,
    const struct kern_nexus_provider_init *init, int *err)
{
	struct skmem_region_params srp[SKMEM_REGIONS];
	struct kern_nexus_provider *nxprov = NULL;
	struct nxprov_params nxp;
	uint32_t override = 0;
	uint32_t pp_region_config_flags;
	int i;

	_CASSERT(sizeof(*init) == sizeof(nxprov->nxprov_ext));
	_CASSERT(sizeof(*init) >=
	    sizeof(struct kern_nexus_netif_provider_init));

	SK_LOCK_ASSERT_HELD();
	ASSERT(nxctl != NULL && reg != NULL && nxdom_prov != NULL);

	pp_region_config_flags = PP_REGION_CONFIG_MD_MAGAZINE_ENABLE |
	    PP_REGION_CONFIG_BUF_IODIR_BIDIR;
	/*
	 * Special handling for external nexus providers; similar
	 * logic to what's done in kern_pbufpool_create().
	 */
	if (init != NULL) {
		if (init->nxpi_flags & NXPIF_MONOLITHIC) {
			pp_region_config_flags |=
			    PP_REGION_CONFIG_BUF_MONOLITHIC;
		}

		if (init->nxpi_flags & NXPIF_INHIBIT_CACHE) {
			pp_region_config_flags |=
			    PP_REGION_CONFIG_BUF_NOCACHE;
		}
	}

	/*
	 * For network devices, set the packet metadata memory as persistent
	 * so that it is wired at segment creation.  This allows us to access
	 * it with preemption disabled, as well as for rdar://problem/46511741.
	 */
	if (nxdom_prov->nxdom_prov_dom->nxdom_type == NEXUS_TYPE_NET_IF) {
		pp_region_config_flags |= PP_REGION_CONFIG_MD_PERSISTENT;
	}

	/* process and validate provider parameters */
	if ((*err = nxdom_prov_validate_params(nxdom_prov, reg,
	    &nxp, srp, override, pp_region_config_flags)) != 0) {
		goto done;
	}

	nxprov = nxprov_alloc(nxdom_prov, Z_WAITOK);
	ASSERT(nxprov->nxprov_dom_prov == nxdom_prov);

	STAILQ_INIT(&nxprov->nxprov_nx_head);
	STAILQ_INSERT_TAIL(&nxprov_head, nxprov, nxprov_link);
	nxprov->nxprov_flags |= NXPROVF_ATTACHED;
	nxprov->nxprov_ctl = nxctl;
	uuid_generate_random(nxprov->nxprov_uuid);
	bcopy(&nxp, nxprov->nxprov_params, sizeof(struct nxprov_params));

	if (init != NULL) {
		if (init->nxpi_version == KERN_NEXUS_PROVIDER_VERSION_NETIF) {
			ASSERT(NXPROV_LLINK(nxprov));
			bcopy(init, &nxprov->nxprov_netif_ext,
			    sizeof(nxprov->nxprov_netif_ext));
		} else {
			ASSERT(!NXPROV_LLINK(nxprov));
			ASSERT(init->nxpi_version ==
			    KERN_NEXUS_PROVIDER_CURRENT_VERSION);
			bcopy(init, &nxprov->nxprov_ext, sizeof(*init));
		}
		nxprov->nxprov_flags |= NXPROVF_EXTERNAL;
	}

	/* store validated region parameters to the provider */
	for (i = 0; i < SKMEM_REGIONS; i++) {
		nxprov->nxprov_region_params[i] = srp[i];
	}

	if (nxprov->nxprov_flags & NXPROVF_EXTERNAL) {
		uint32_t nxpi_flags = nxprov->nxprov_ext.nxpi_flags;

		if (nxpi_flags & NXPIF_VIRTUAL_DEVICE) {
			nxprov->nxprov_flags |= NXPROVF_VIRTUAL_DEVICE;
		}
	} else if (nxdom_prov->nxdom_prov_dom->nxdom_type !=
	    NEXUS_TYPE_NET_IF) {
		/*
		 * Treat non-netif built-in nexus providers as those
		 * meant for inter-process communications, i.e. there
		 * is no actual networking hardware involved.
		 */
		nxprov->nxprov_flags |= NXPROVF_VIRTUAL_DEVICE;
	}

	nxprov_retain_locked(nxprov);   /* one for being in the list */
	nxprov_retain_locked(nxprov);   /* one for the caller */

#if SK_LOG
	uuid_string_t uuidstr;
	SK_D("nxprov 0x%llx UUID %s", SK_KVA(nxprov),
	    sk_uuid_unparse(nxprov->nxprov_uuid, uuidstr));
#endif /* SK_LOG */

done:
	return nxprov;
}

struct kern_nexus_provider *
nxprov_create(struct proc *p, struct nxctl *nxctl, struct nxprov_reg *reg,
    int *err)
{
	struct nxprov_params *nxp = &reg->nxpreg_params;
	struct kern_nexus_domain_provider *nxdom_prov = NULL;
	struct kern_nexus_provider *nxprov = NULL;

	NXCTL_LOCK_ASSERT_HELD(nxctl);

	ASSERT(nxctl->nxctl_cred != proc_ucred_unsafe(kernproc));
	*err = 0;

	switch (nxp->nxp_type) {
	case NEXUS_TYPE_USER_PIPE:      /* only for userland */
		*err = skywalk_priv_check_cred(p, nxctl->nxctl_cred,
		    PRIV_SKYWALK_REGISTER_USER_PIPE);
		break;

	case NEXUS_TYPE_FLOW_SWITCH:    /* allowed for userland */
		*err = skywalk_priv_check_cred(p, nxctl->nxctl_cred,
		    PRIV_SKYWALK_REGISTER_FLOW_SWITCH);
		break;

	case NEXUS_TYPE_NET_IF:         /* allowed for userland */
		*err = skywalk_priv_check_cred(p, nxctl->nxctl_cred,
		    PRIV_SKYWALK_REGISTER_NET_IF);
		break;

	case NEXUS_TYPE_KERNEL_PIPE:    /* only for kernel */
	case NEXUS_TYPE_MONITOR:        /* invalid */
	default:
		*err = EINVAL;
		goto done;
	}

	if (*err != 0) {
		goto done;
	}

	ASSERT(nxp->nxp_type < NEXUS_TYPE_MAX);
	if ((nxdom_prov = nxdom_prov_default[nxp->nxp_type]) == NULL) {
		*err = ENXIO;
		goto done;
	}

#if CONFIG_NEXUS_NETIF
	/* make sure netif_compat is the default here */
	ASSERT(nxp->nxp_type != NEXUS_TYPE_NET_IF ||
	    strbufcmp(nxdom_prov->nxdom_prov_name, sizeof(nxdom_prov->nxdom_prov_name),
	    NEXUS_PROVIDER_NET_IF_COMPAT, sizeof(NEXUS_PROVIDER_NET_IF_COMPAT)) == 0);
#endif /* CONFIG_NEXUS_NETIF */

	SK_LOCK();
	/* callee holds a reference for our caller upon success */
	nxprov = nxprov_create_common(nxctl, nxdom_prov, reg, NULL, err);
	SK_UNLOCK();
done:
	return nxprov;
}

struct kern_nexus_provider *
nxprov_create_kern(struct nxctl *nxctl,
    struct kern_nexus_domain_provider *nxdom_prov, struct nxprov_reg *reg,
    const struct kern_nexus_provider_init *init, int *err)
{
	struct nxprov_params *nxp = &reg->nxpreg_params;
	struct kern_nexus_provider *nxprov = NULL;

	NXCTL_LOCK_ASSERT_HELD(nxctl);
	SK_LOCK_ASSERT_HELD();

	ASSERT(nxctl->nxctl_cred == proc_ucred_unsafe(kernproc));
	ASSERT(nxp->nxp_type == nxdom_prov->nxdom_prov_dom->nxdom_type);
	ASSERT(init == NULL ||
	    init->nxpi_version == KERN_NEXUS_PROVIDER_CURRENT_VERSION ||
	    init->nxpi_version == KERN_NEXUS_PROVIDER_VERSION_NETIF);

	*err = 0;

	switch (nxp->nxp_type) {
	case NEXUS_TYPE_NET_IF:
		break;
	case NEXUS_TYPE_KERNEL_PIPE:
		if (init == NULL) {
			*err = EINVAL;
			goto done;
		}
		break;
	case NEXUS_TYPE_FLOW_SWITCH:
		if (init != NULL) {
			*err = EINVAL;
			goto done;
		}
		break;

	case NEXUS_TYPE_USER_PIPE:      /* only for userland */
	case NEXUS_TYPE_MONITOR:        /* invalid */
	default:
		*err = EINVAL;
		goto done;
	}

	/* callee holds a reference for our caller upon success */
	nxprov = nxprov_create_common(nxctl, nxdom_prov, reg, init, err);

done:
	return nxprov;
}

int
nxprov_destroy(struct nxctl *nxctl, const uuid_t nxprov_uuid)
{
	struct kern_nexus_provider *nxprov = NULL;
	int err = 0;

	NXCTL_LOCK_ASSERT_HELD(nxctl);

	SK_LOCK();

	STAILQ_FOREACH(nxprov, &nxprov_head, nxprov_link) {
		if (nxctl == nxprov->nxprov_ctl &&
		    uuid_compare(nxprov_uuid, nxprov->nxprov_uuid) == 0) {
			nxprov_retain_locked(nxprov);
			break;
		}
	}

	if (nxprov == NULL) {
		err = ENOENT;
	} else {
		err = nxprov_close(nxprov, TRUE);
	}

	if (nxprov != NULL) {
		(void) nxprov_release_locked(nxprov);
	}

	SK_UNLOCK();

	return err;
}

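/*
 * Close a nexus provider: detach it from its controlling nxctl and close
 * every nexus instance created on it.  If instances remain, the provider
 * is marked NXPROVF_CLOSED and detached only when the last one goes away;
 * otherwise it is detached immediately.
 */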
int
nxprov_close(struct kern_nexus_provider *nxprov, boolean_t locked)
{
	int err = 0;

	if (!locked) {
		SK_LOCK();
	}

	SK_LOCK_ASSERT_HELD();

#if SK_LOG
	uuid_string_t uuidstr;
	SK_D("nxprov 0x%llx UUID %s flags 0x%b", SK_KVA(nxprov),
	    sk_uuid_unparse(nxprov->nxprov_uuid, uuidstr),
	    nxprov->nxprov_flags, NXPROVF_BITS);
#endif /* SK_LOG */

	if (nxprov->nxprov_flags & NXPROVF_CLOSED) {
		err = EALREADY;
	} else {
		struct kern_nexus *nx, *tnx;

		nxprov->nxprov_ctl = NULL;

		STAILQ_FOREACH_SAFE(nx, &nxprov->nxprov_nx_head,
		    nx_prov_link, tnx) {
			nx_retain_locked(nx);
			(void) nx_close(nx, TRUE);
			(void) nx_release_locked(nx);
		}

		if (STAILQ_EMPTY(&nxprov->nxprov_nx_head)) {
			/* no nexus created on this, so detach now */
			nxprov_detach(nxprov, TRUE);
		} else {
			/* detach when last nexus is destroyed */
			ASSERT(nxprov->nxprov_refcnt > 1);
			nxprov->nxprov_flags |= NXPROVF_CLOSED;
		}
	}

	if (!locked) {
		SK_UNLOCK();
	}

	return err;
}

static void
nxprov_detach(struct kern_nexus_provider *nxprov, boolean_t locked)
{
	if (!locked) {
		SK_LOCK();
	}

	SK_LOCK_ASSERT_HELD();

#if SK_LOG
	uuid_string_t uuidstr;
	SK_D("nxprov 0x%llx UUID %s flags 0x%b", SK_KVA(nxprov),
	    sk_uuid_unparse(nxprov->nxprov_uuid, uuidstr),
	    nxprov->nxprov_flags, NXPROVF_BITS);
#endif /* SK_LOG */

	ASSERT(nxprov->nxprov_flags & NXPROVF_ATTACHED);
	STAILQ_REMOVE(&nxprov_head, nxprov, kern_nexus_provider, nxprov_link);
	nxprov->nxprov_flags &= ~NXPROVF_ATTACHED;

	/* caller must hold an extra ref */
	ASSERT(nxprov->nxprov_refcnt > 1);
	(void) nxprov_release_locked(nxprov);

	if (!locked) {
		SK_UNLOCK();
	}
}

static struct kern_nexus_provider *
nxprov_alloc(struct kern_nexus_domain_provider *nxdom_prov, zalloc_flags_t how)
{
	struct kern_nexus_provider *nxprov;
	struct nxprov_params *nxp;

	ASSERT(nxdom_prov != NULL);

	nxp = nxprov_params_alloc(how);
	if (nxp == NULL) {
		SK_ERR("Failed to allocate nxprov_params");
		return NULL;
	}

	nxprov = zalloc_flags(nxprov_zone, how | Z_ZERO);
	if (nxprov == NULL) {
		SK_ERR("Failed to allocate nxprov");
		nxprov_params_free(nxp);
		return NULL;
	}

	nxprov->nxprov_dom_prov = nxdom_prov;
	nxprov->nxprov_params = nxp;
	/* hold a reference for nxprov */
	nxdom_prov_retain_locked(nxdom_prov);

	return nxprov;
}

static void
nxprov_free(struct kern_nexus_provider *nxprov)
{
	struct kern_nexus_domain_provider *nxdom_prov =
	    nxprov->nxprov_dom_prov;

	SK_LOCK_ASSERT_HELD();

	ASSERT(nxdom_prov != NULL);
	(void) nxdom_prov_release_locked(nxdom_prov);
	nxprov->nxprov_dom_prov = NULL;
	ASSERT(nxprov->nxprov_params != NULL);
	nxprov_params_free(nxprov->nxprov_params);
	nxprov->nxprov_params = NULL;
	ASSERT(!(nxprov->nxprov_flags & NXPROVF_ATTACHED));
	SK_DF(SK_VERB_MEM, "nxprov 0x%llx FREE", SK_KVA(nxprov));
	zfree(nxprov_zone, nxprov);
}

static void
nxprov_retain_locked(struct kern_nexus_provider *nxprov)
{
	SK_LOCK_ASSERT_HELD();

	nxprov->nxprov_refcnt++;
	ASSERT(nxprov->nxprov_refcnt != 0);
}

void
nxprov_retain(struct kern_nexus_provider *nxprov)
{
	SK_LOCK();
	nxprov_retain_locked(nxprov);
	SK_UNLOCK();
}

static int
nxprov_release_locked(struct kern_nexus_provider *nxprov)
{
	int oldref = nxprov->nxprov_refcnt;

	SK_LOCK_ASSERT_HELD();

	ASSERT(nxprov->nxprov_refcnt != 0);
	if (--nxprov->nxprov_refcnt == 0) {
		nxprov_free(nxprov);
	}

	return oldref == 1;
}

int
nxprov_release(struct kern_nexus_provider *nxprov)
{
	int lastref;

	SK_LOCK();
	lastref = nxprov_release_locked(nxprov);
	SK_UNLOCK();

	return lastref;
}

struct nxprov_params *
nxprov_params_alloc(zalloc_flags_t how)
{
	return zalloc_flags(nxprov_params_zone, how | Z_ZERO);
}

void
nxprov_params_free(struct nxprov_params *nxp)
{
	SK_DF(SK_VERB_MEM, "nxp 0x%llx FREE", SK_KVA(nxp));
	zfree(nxprov_params_zone, nxp);
}

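/*
 * Validate a caller-supplied packet buffer pool against a provider: the
 * pool must be external (and not closed), its metadata type/subtype must
 * match the provider's nexus domain, and its monolithic-buffer setting
 * must agree with the provider's NXPIF_MONOLITHIC flag.
 */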
1824 static int
nx_check_pp(struct kern_nexus_provider * nxprov,struct kern_pbufpool * pp)1825 nx_check_pp(struct kern_nexus_provider *nxprov, struct kern_pbufpool *pp)
1826 {
1827 	struct kern_nexus_domain_provider *nxdom_prov = nxprov->nxprov_dom_prov;
1828 
1829 	if ((pp->pp_flags & (PPF_EXTERNAL | PPF_CLOSED)) != PPF_EXTERNAL) {
1830 		SK_ERR("Rejecting \"%s\" built-in pp", pp->pp_name);
1831 		return ENOTSUP;
1832 	}
1833 
1834 	/*
1835 	 * Require that the nexus domain metadata type and the
1836 	 * metadata type of the caller-provided pbufpool match.
1837 	 */
1838 	if (nxdom_prov->nxdom_prov_dom->nxdom_md_type !=
1839 	    pp->pp_md_type ||
1840 	    nxdom_prov->nxdom_prov_dom->nxdom_md_subtype !=
1841 	    pp->pp_md_subtype) {
1842 		SK_ERR("Mismatch in metadata type/subtype "
1843 		    "(%u/%u != %u/%u)", pp->pp_md_type,
1844 		    nxdom_prov->nxdom_prov_dom->nxdom_md_type,
1845 		    pp->pp_md_subtype,
1846 		    nxdom_prov->nxdom_prov_dom->nxdom_md_subtype);
1847 		return EINVAL;
1848 	}
1849 
1850 	/*
1851 	 * Require that the nexus provider memory configuration
1852 	 * has the same impedance as the caller-provided one.
1853 	 * Both need to be lacking or present; if one of them
1854 	 * is set and the other isn't, then we bail.
1855 	 */
1856 	if (!!(PP_BUF_REGION_DEF(pp)->skr_mode & SKR_MODE_MONOLITHIC) ^
1857 	    !!(nxprov->nxprov_ext.nxpi_flags & NXPIF_MONOLITHIC)) {
1858 		SK_ERR("Memory config mismatch: monolithic mode");
1859 		return EINVAL;
1860 	}
1861 
1862 	return 0;
1863 }
1864 
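/*
 * Create a nexus instance from a registered provider.  On success
 * the instance holds three references: one for the provider's
 * instance list, one for the global RB tree, and one returned to
 * the caller (to be dropped via nx_release()).  Optional TX/RX
 * packet pools are validated by nx_check_pp() and retained until
 * nx_free().
 */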
1865 struct kern_nexus *
1866 nx_create(struct nxctl *nxctl, const uuid_t nxprov_uuid,
1867     const nexus_type_t dom_type, const void *nx_ctx,
1868     nexus_ctx_release_fn_t nx_ctx_release, struct kern_pbufpool *tx_pp,
1869     struct kern_pbufpool *rx_pp, int *err)
1870 {
1871 	struct kern_nexus_domain_provider *nxdom_prov;
1872 	struct kern_nexus_provider *nxprov = NULL;
1873 	struct kern_nexus *nx = NULL;
1874 #if SK_LOG
1875 	uuid_string_t uuidstr;
1876 #endif /* SK_LOG */
1877 
1878 	NXCTL_LOCK_ASSERT_HELD(nxctl);
1879 
1880 	ASSERT(dom_type < NEXUS_TYPE_MAX);
1881 	ASSERT(!uuid_is_null(nxprov_uuid));
1882 	*err = 0;
1883 
1884 	SK_LOCK();
1885 
1886 	STAILQ_FOREACH(nxprov, &nxprov_head, nxprov_link) {
1887 		if (nxctl == nxprov->nxprov_ctl &&
1888 		    uuid_compare(nxprov_uuid, nxprov->nxprov_uuid) == 0) {
1889 			break;
1890 		}
1891 	}
1892 
1893 	if (nxprov == NULL || (nxprov->nxprov_flags & NXPROVF_CLOSED)) {
1894 		SK_ERR("Provider not found or has been closed");
1895 		*err = ENOENT;
1896 		goto done;
1897 	}
1898 
1899 	nxdom_prov = nxprov->nxprov_dom_prov;
1900 	if (dom_type != NEXUS_TYPE_UNDEFINED &&
1901 	    (nxdom_prov->nxdom_prov_dom->nxdom_type != dom_type)) {
1902 		SK_ERR("Mismatch in domain type (%u != %u)",
1903 		    dom_type, nxdom_prov->nxdom_prov_dom->nxdom_type);
1904 		nxdom_prov = NULL;
1905 		nxprov = NULL;
1906 		*err = ENODEV;
1907 		goto done;
1908 	}
1909 
1910 	if ((dom_type == NEXUS_TYPE_NET_IF) && NXPROV_LLINK(nxprov) &&
1911 	    (!tx_pp || !rx_pp)) {
1912 #if SK_LOG
1913 		SK_ERR("TX/RX packet pool is required for netif logical link "
1914 		    "nexus provider UUID: %s",
1915 		    sk_uuid_unparse(nxprov_uuid, uuidstr));
1916 #endif /* SK_LOG */
1917 		nxdom_prov = NULL;
1918 		nxprov = NULL;
1919 		*err = EINVAL;
1920 		goto done;
1921 	}
1922 
1923 	if ((tx_pp != NULL && (*err = nx_check_pp(nxprov, tx_pp)) != 0) ||
1924 	    (rx_pp != NULL && (*err = nx_check_pp(nxprov, rx_pp)) != 0)) {
1925 		goto done;
1926 	}
1927 
1928 	nx = nx_alloc(Z_WAITOK);
1929 
1930 	STAILQ_INIT(&nx->nx_ch_head);
1931 	STAILQ_INIT(&nx->nx_ch_nonxref_head);
1932 	lck_rw_init(&nx->nx_ch_if_adv_lock, &nexus_lock_group,
1933 	    &nexus_lock_attr);
1934 	STAILQ_INIT(&nx->nx_ch_if_adv_head);
1935 	uuid_generate_random(nx->nx_uuid);
1936 	nx->nx_prov = nxprov;
1937 	nx->nx_ctx = __DECONST(void *, nx_ctx);
1938 	nx->nx_ctx_release = nx_ctx_release;
1939 	nx->nx_id = nxdom_prov->nxdom_prov_gencnt++;
1940 
1941 	if (tx_pp != NULL) {
1942 		nx->nx_tx_pp = tx_pp;
1943 		pp_retain(tx_pp);       /* released by nx_free */
1944 	}
1945 
1946 	if (rx_pp != NULL) {
1947 		nx->nx_rx_pp = rx_pp;
1948 		pp_retain(rx_pp);       /* released by nx_free */
1949 	}
1950 
1951 	/* this nexus is alive; tell the nexus constructor to set it up */
1952 	if (nxprov->nxprov_dom_prov->nxdom_prov_nx_ctor != NULL) {
1953 		*err = nxprov->nxprov_dom_prov->nxdom_prov_nx_ctor(nx);
1954 		if (*err != 0) {
1955 			nx->nx_prov = NULL;
1956 			goto done;
1957 		}
1958 	}
1959 
1960 	nxprov_retain_locked(nxprov);   /* hold a ref on the nexus reg */
1961 
1962 	STAILQ_INSERT_TAIL(&nxprov->nxprov_nx_head, nx, nx_prov_link);
1963 	nxprov->nxprov_nx_count++;
1964 	RB_INSERT(kern_nexus_tree, &nx_head, nx);
1965 	os_atomic_or(&nx->nx_flags, NXF_ATTACHED, relaxed);
1966 
1967 	nx_retain_locked(nx);   /* one for the provider list */
1968 	nx_retain_locked(nx);   /* one for the global list */
1969 	nx_retain_locked(nx);   /* one for the caller */
1970 
1971 #if SK_LOG
1972 	SK_D("nexus 0x%llx (%s:%s) UUID %s", SK_KVA(nx),
1973 	    nxdom_prov->nxdom_prov_dom->nxdom_name,
1974 	    nxdom_prov->nxdom_prov_name, sk_uuid_unparse(nx->nx_uuid, uuidstr));
1975 #endif /* SK_LOG */
1976 done:
1977 	SK_UNLOCK();
1978 
1979 	if (*err != 0) {
1980 		if (nx != NULL) {
1981 			nx_free(nx);
1982 			nx = NULL;
1983 		}
1984 	}
1985 	return nx;
1986 }
1987 
1988 int
1989 nx_destroy(struct nxctl *nxctl, const uuid_t nx_uuid)
1990 {
1991 	struct kern_nexus *nx = NULL;
1992 	struct kern_nexus find;
1993 	int err = 0;
1994 
1995 	NXCTL_LOCK_ASSERT_HELD(nxctl);
1996 
1997 	SK_LOCK();
1998 
1999 	uuid_copy(find.nx_uuid, nx_uuid);
2000 	nx = RB_FIND(kern_nexus_tree, &nx_head, &find);
2001 	if (nx != NULL && nxctl != NX_PROV(nx)->nxprov_ctl) {
2002 		nx = NULL;
2003 	}
2004 
2005 	if (nx != NULL) {
2006 		nx_retain_locked(nx);
2007 	}
2008 
2009 	if (nx == NULL) {
2010 		err = ENOENT;
2011 	} else {
2012 		/* prevent any opens */
2013 		os_atomic_or(&nx->nx_flags, NXF_INVALIDATED, relaxed);
2014 		err = nx_close(nx, TRUE);
2015 		(void) nx_release_locked(nx);
2016 	}
2017 
2018 	SK_UNLOCK();
2019 
2020 	return err;
2021 }
2022 
2023 static inline int
2024 nx_cmp(const struct kern_nexus *a, const struct kern_nexus *b)
2025 {
2026 	return uuid_compare(a->nx_uuid, b->nx_uuid);
2027 }
2028 
2029 struct kern_nexus *
2030 nx_find(const uuid_t nx_uuid, boolean_t locked)
2031 {
2032 	struct kern_nexus *nx = NULL;
2033 	struct kern_nexus find;
2034 
2035 	if (!locked) {
2036 		SK_LOCK();
2037 	}
2038 
2039 	SK_LOCK_ASSERT_HELD();
2040 
2041 	uuid_copy(find.nx_uuid, nx_uuid);
2042 	nx = RB_FIND(kern_nexus_tree, &nx_head, &find);
2043 	if (nx != NULL && (nx->nx_flags & NXF_CLOSED)) {
2044 		nx = NULL;
2045 	}
2046 
2047 	/* return reference to caller */
2048 	if (nx != NULL) {
2049 		nx_retain_locked(nx);
2050 	}
2051 
2052 	if (!locked) {
2053 		SK_UNLOCK();
2054 	}
2055 
2056 	return nx;
2057 }
2058 
2059 int
2060 nx_close(struct kern_nexus *nx, boolean_t locked)
2061 {
2062 	int err = 0;
2063 
2064 	if (!locked) {
2065 		SK_LOCK();
2066 	}
2067 
2068 	SK_LOCK_ASSERT_HELD();
2069 
2070 
2071 	if (nx->nx_flags & NXF_CLOSED) {
2072 		err = EALREADY;
2073 	} else {
2074 #if SK_LOG
2075 		uuid_string_t uuidstr;
2076 		SK_D("nexus 0x%llx (%s:%s) UUID %s flags 0x%b", SK_KVA(nx),
2077 		    NX_DOM(nx)->nxdom_name, NX_DOM_PROV(nx)->nxdom_prov_name,
2078 		    sk_uuid_unparse(nx->nx_uuid, uuidstr), nx->nx_flags,
2079 		    NXF_BITS);
2080 #endif /* SK_LOG */
2081 
2082 		if (STAILQ_EMPTY(&nx->nx_ch_head)) {
2083 			/* no regular channels open to it, so detach now */
2084 			nx_detach(nx);
2085 		} else {
2086 			/* detach when the last channel closes */
2087 			ASSERT(nx->nx_refcnt > 3);
2088 			os_atomic_or(&nx->nx_flags, NXF_CLOSED, relaxed);
2089 		}
2090 	}
2091 
2092 	if (!locked) {
2093 		SK_UNLOCK();
2094 	}
2095 
2096 	return err;
2097 }
2098 
2099 void
2100 nx_stop(struct kern_nexus *nx)
2101 {
2102 	struct kern_nexus_provider *nxprov = nx->nx_prov;
2103 
2104 	SK_LOCK_ASSERT_HELD();
2105 
2106 	/* send a stop message */
2107 	if (nxprov->nxprov_dom_prov->nxdom_prov_nx_stop != NULL) {
2108 		nxprov->nxprov_dom_prov->nxdom_prov_nx_stop(nx);
2109 	}
2110 }
2111 
2112 void
2113 nx_detach(struct kern_nexus *nx)
2114 {
2115 	struct kern_nexus_provider *nxprov = nx->nx_prov;
2116 
2117 	SK_LOCK_ASSERT_HELD();
2118 
2119 #if SK_LOG
2120 	uuid_string_t uuidstr;
2121 	SK_D("nexus 0x%llx UUID %s flags 0x%b", SK_KVA(nx),
2122 	    sk_uuid_unparse(nx->nx_uuid, uuidstr), nx->nx_flags, NXF_BITS);
2123 #endif /* SK_LOG */
2124 
2125 	/* Caller must hold extra refs, on top of the two in reg/global lists */
2126 	ASSERT(nx->nx_refcnt >= 3);
2127 	ASSERT(nx->nx_flags & NXF_ATTACHED);
2128 
2129 	/* this nexus is done; let the nexus destructor do final cleanups */
2130 	if (nxprov->nxprov_dom_prov->nxdom_prov_nx_dtor != NULL) {
2131 		nxprov->nxprov_dom_prov->nxdom_prov_nx_dtor(nx);
2132 	}
2133 
2134 	ASSERT(STAILQ_EMPTY(&nx->nx_ch_head));
2135 	ASSERT(STAILQ_EMPTY(&nx->nx_ch_nonxref_head));
2136 
2137 	STAILQ_REMOVE(&nxprov->nxprov_nx_head, nx, kern_nexus, nx_prov_link);
2138 	nxprov->nxprov_nx_count--;
2139 	RB_REMOVE(kern_nexus_tree, &nx_head, nx);
2140 	os_atomic_andnot(&nx->nx_flags, NXF_ATTACHED, relaxed);
2141 	nx->nx_prov = NULL;
2142 	if (nx->nx_ctx_release != NULL) {
2143 		nx->nx_ctx_release(nx->nx_ctx);
2144 	}
2145 	nx->nx_ctx = NULL;
2146 
2147 	(void) nx_release_locked(nx);   /* one for the reg list */
2148 	(void) nx_release_locked(nx);   /* one for the global list */
2149 
2150 	/*
2151 	 * If this was the last nexus and the provider has been closed,
2152 	 * detach the provider and finish up the postponed job.
2153 	 */
2154 	if (STAILQ_EMPTY(&nxprov->nxprov_nx_head) &&
2155 	    (nxprov->nxprov_flags & NXPROVF_CLOSED)) {
2156 		nxprov_detach(nxprov, TRUE);
2157 	}
2158 	(void) nxprov_release_locked(nxprov);
2159 }
2160 
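/*
 * Allocate the shared nexus advisory region.  The object layout is
 * an 8-byte __kern_nexus_adv_metadata header immediately followed
 * by the type-specific advisory structure (illustrative):
 *
 *	+----------------------------+ <- nxv_adv
 *	| __kern_nexus_adv_metadata  |    version/type/reserved
 *	+----------------------------+ <- (adv_md + 1)
 *	| sk_nexusadv (flowswitch)   |
 *	| or netif_nexus_advisory    |
 *	+----------------------------+
 */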
2161 int
2162 nx_advisory_alloc(struct kern_nexus *nx, const char *name,
2163     struct skmem_region_params *srp_nexusadv, nexus_advisory_type_t type)
2164 {
2165 	struct __kern_nexus_adv_metadata *adv_md;
2166 	uint32_t msize = 0;
2167 	/* -fbounds-safety: why do we need maddr? */
2168 	void *__sized_by(msize) maddr = NULL;
2169 
2170 	_CASSERT(sizeof(struct __kern_nexus_adv_metadata) == sizeof(uint64_t));
2171 	_CASSERT((sizeof(struct sk_nexusadv) +
2172 	    sizeof(struct __kern_nexus_adv_metadata)) <= NX_NEXUSADV_MAX_SZ);
2173 	_CASSERT((sizeof(struct netif_nexus_advisory) +
2174 	    sizeof(struct __kern_nexus_adv_metadata)) <= NX_NEXUSADV_MAX_SZ);
2175 	ASSERT(nx->nx_adv.nxv_reg == NULL);
2176 	ASSERT(nx->nx_adv.nxv_adv == NULL);
2177 	ASSERT(type == NEXUS_ADVISORY_TYPE_FLOWSWITCH ||
2178 	    type == NEXUS_ADVISORY_TYPE_NETIF);
2179 
2180 	if ((nx->nx_adv.nxv_reg = skmem_region_create(name, srp_nexusadv,
2181 	    NULL, NULL, NULL)) == NULL) {
2182 		return ENOMEM;
2183 	}
2184 
2185 	nx->nx_adv.nxv_adv = skmem_region_alloc(nx->nx_adv.nxv_reg, &maddr,
2186 	    NULL, NULL, (SKMEM_NOSLEEP | SKMEM_PANIC),
2187 	    nx->nx_adv.nxv_reg->skr_c_obj_size, &msize);
2188 	nx->nx_adv.nxv_adv_size = nx->nx_adv.nxv_reg->skr_c_obj_size;
2189 	adv_md = nx->nx_adv.nxv_adv;
2190 	adv_md->knam_version = NX_ADVISORY_MD_CURRENT_VERSION;
2191 	adv_md->knam_type = type;
2192 	adv_md->__reserved = 0;
2193 	nx->nx_adv.nxv_adv_type = type;
2194 	nx->nx_adv.flowswitch_nxv_adv = (void *)(adv_md + 1);
2195 	if (type == NEXUS_ADVISORY_TYPE_FLOWSWITCH) {
2196 		nx->nx_adv.flowswitch_nxv_adv->nxadv_ver =
2197 		    NX_FLOWSWITCH_ADVISORY_CURRENT_VERSION;
2198 	} else {
2199 		nx->nx_adv.netif_nxv_adv->nna_version =
2200 		    NX_NETIF_ADVISORY_CURRENT_VERSION;
2201 	}
2202 	return 0;
2203 }
2204 
2205 void
2206 nx_advisory_free(struct kern_nexus *nx)
2207 {
2208 	if (nx->nx_adv.nxv_reg != NULL) {
2209 		ASSERT(nx->nx_adv.nxv_adv != NULL);
2210 		skmem_region_free(nx->nx_adv.nxv_reg,
2211 		    nx->nx_adv.nxv_adv, NULL);
2212 		nx->nx_adv.nxv_adv = NULL;
2213 		nx->nx_adv.nxv_adv_size = 0;
2214 		nx->nx_adv.nxv_adv_type = NEXUS_ADVISORY_TYPE_INVALID;
2215 		nx->nx_adv.flowswitch_nxv_adv = NULL;
2216 		skmem_region_release(nx->nx_adv.nxv_reg);
2217 		nx->nx_adv.nxv_reg = NULL;
2218 	}
2219 
2220 	ASSERT(nx->nx_adv.nxv_reg == NULL);
2221 	ASSERT(nx->nx_adv.nxv_adv == NULL);
2222 	ASSERT(nx->nx_adv.nxv_adv_type == NEXUS_ADVISORY_TYPE_INVALID);
2223 	ASSERT(nx->nx_adv.flowswitch_nxv_adv == NULL);
2224 }
2225 
2226 static struct kern_nexus *
2227 nx_alloc(zalloc_flags_t how)
2228 {
2229 	SK_LOCK_ASSERT_HELD();
2230 
2231 	return zalloc_flags(nx_zone, how | Z_ZERO);
2232 }
2233 
2234 static void
2235 nx_free(struct kern_nexus *nx)
2236 {
2237 	ASSERT(!(nx->nx_flags & NXF_ATTACHED) && nx->nx_prov == NULL);
2238 	ASSERT(STAILQ_EMPTY(&nx->nx_ch_head));
2239 	ASSERT(STAILQ_EMPTY(&nx->nx_ch_nonxref_head));
2240 
2241 	nx_port_free_all(nx);
2242 
2243 	if (nx->nx_tx_pp != NULL) {
2244 		pp_release(nx->nx_tx_pp);
2245 		nx->nx_tx_pp = NULL;
2246 	}
2247 	if (nx->nx_rx_pp != NULL) {
2248 		pp_release(nx->nx_rx_pp);
2249 		nx->nx_rx_pp = NULL;
2250 	}
2251 
2252 	ASSERT(STAILQ_EMPTY(&nx->nx_ch_if_adv_head));
2253 	lck_rw_destroy(&nx->nx_ch_if_adv_lock, &nexus_lock_group);
2254 
2255 	SK_DF(SK_VERB_MEM, "nexus 0x%llx FREE", SK_KVA(nx));
2256 	zfree(nx_zone, nx);
2257 }
2258 
2259 void
2260 nx_retain_locked(struct kern_nexus *nx)
2261 {
2262 	SK_LOCK_ASSERT_HELD();
2263 
2264 	nx->nx_refcnt++;
2265 	VERIFY(nx->nx_refcnt > 0);
2266 }
2267 
2268 void
2269 nx_retain(struct kern_nexus *nx)
2270 {
2271 	SK_LOCK();
2272 	nx_retain_locked(nx);
2273 	SK_UNLOCK();
2274 }
2275 
2276 int
2277 nx_release_locked(struct kern_nexus *nx)
2278 {
2279 	int oldref = nx->nx_refcnt;
2280 
2281 	SK_LOCK_ASSERT_HELD();
2282 
2283 	VERIFY(nx->nx_refcnt > 0);
2284 	if (--nx->nx_refcnt == 0) {
2285 		nx_free(nx);
2286 	}
2287 
2288 	return oldref == 1;
2289 }
2290 
2291 int
2292 nx_release(struct kern_nexus *nx)
2293 {
2294 	int lastref;
2295 
2296 	SK_LOCK_ASSERT_NOTHELD();
2297 
2298 	SK_LOCK();
2299 	lastref = nx_release_locked(nx);
2300 	SK_UNLOCK();
2301 
2302 	return lastref;
2303 }
2304 
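/*
 * Call the external provider's ring_init() on every non-host ring
 * of this channel, then set up per-slot contexts via nx_init_slots().
 * Rings that succeed are flagged CKRF_EXT_RING_INITED, so that
 * nx_fini_rings() can unwind exactly what was initialized, whether
 * on error here or at teardown.
 */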
2305 static int
2306 nx_init_rings(struct kern_nexus *nx, struct kern_channel *ch)
2307 {
2308 	struct kern_nexus_provider *nxprov = NX_PROV(nx);
2309 	struct nexus_adapter *na = ch->ch_na;
2310 	boolean_t undo = FALSE;
2311 	int ksd_retains = 0;
2312 	enum txrx t;
2313 	int err = 0;
2314 
2315 	ASSERT((ch->ch_flags & (CHANF_EXT_PRECONNECT | CHANF_EXT_CONNECTED)) ==
2316 	    CHANF_EXT_PRECONNECT);
2317 
2318 	if (nxprov->nxprov_ext.nxpi_ring_init == NULL) {
2319 		return 0;
2320 	}
2321 
2322 	for_rx_tx(t) {
2323 		uint32_t i;
2324 
2325 		for (i = 0; i < na_get_nrings(na, t); i++) {
2326 			struct __kern_channel_ring *kring = &NAKR(na, t)[i];
2327 
2328 			/* skip host rings */
2329 			if (kring->ckr_flags & CKRF_HOST) {
2330 				continue;
2331 			}
2332 
2333 			if ((err = nxprov->nxprov_ext.nxpi_ring_init(
2334 				    nxprov, nx, ch, kring, (kring->ckr_tx == NR_TX),
2335 				    &kring->ckr_ctx)) != 0) {
2336 				SK_D("ch 0x%llx flags %b nx 0x%llx kr \"%s\" "
2337 				    "(0x%llx) krflags %b ring_init error %d",
2338 				    SK_KVA(ch), ch->ch_flags, CHANF_BITS,
2339 				    SK_KVA(nx), kring->ckr_name, SK_KVA(kring),
2340 				    kring->ckr_flags, CKRF_BITS, err);
2341 				kring->ckr_ctx = NULL;
2342 				undo = TRUE;
2343 				break;
2344 			}
2345 			kring->ckr_flags |= CKRF_EXT_RING_INITED;
2346 
2347 			if ((err = nx_init_slots(nx, kring)) != 0) {
2348 				undo = TRUE;
2349 				break;
2350 			}
2351 
2352 			if (kring->ckr_flags & CKRF_EXT_SLOTS_INITED) {
2353 				++ksd_retains;
2354 			}
2355 		}
2356 		if (undo) {
2357 			break;
2358 		}
2359 	}
2360 
2361 	/*
2362 	 * Note: retain the KSD region even on error, since we may have
2363 	 * set the CKRF_EXT_SLOTS_INITED flag on some of the rings;
2364 	 * nx_fini_rings() will take care of the release based on it.
2365 	 */
2366 	if (ksd_retains != 0) {
2367 		/*
2368 		 * Mark the kernel slot descriptor region as busy; this
2369 		 * prevents it from being torn-down at channel defunct
2370 		 * time, as we need to invoke the slot_fini() callback
2371 		 * for each slot and we need the descriptors until then.
2372 		 */
2373 		skmem_arena_nexus_sd_set_noidle(skmem_arena_nexus(na->na_arena),
2374 		    ksd_retains);
2375 	}
2376 
2377 	if (err != 0) {
2378 		ASSERT(undo);
2379 		nx_fini_rings(nx, ch);
2380 	}
2381 
2382 	return err;
2383 }
2384 
2385 static void
2386 nx_fini_rings(struct kern_nexus *nx, struct kern_channel *ch)
2387 {
2388 	struct kern_nexus_provider *nxprov = NX_PROV(nx);
2389 	struct nexus_adapter *na = ch->ch_na;
2390 	int ksd_releases = 0;
2391 	enum txrx t;
2392 
2393 	for_rx_tx(t) {
2394 		uint32_t i;
2395 
2396 		for (i = 0; i < na_get_nrings(na, t); i++) {
2397 			struct __kern_channel_ring *kring = &NAKR(na, t)[i];
2398 
2399 			if (!(kring->ckr_flags & CKRF_EXT_RING_INITED)) {
2400 				continue;
2401 			}
2402 
2403 			ASSERT(!(kring->ckr_flags & CKRF_HOST));
2404 			ASSERT(nxprov->nxprov_ext.nxpi_ring_fini != NULL);
2405 			nxprov->nxprov_ext.nxpi_ring_fini(nxprov, nx, kring);
2406 			kring->ckr_flags &= ~CKRF_EXT_RING_INITED;
2407 
2408 			if (kring->ckr_flags & CKRF_EXT_SLOTS_INITED) {
2409 				++ksd_releases;
2410 			}
2411 
2412 			/*
2413 			 * Undo the work done in nx_init_slots() and inform
2414 			 * the external domain provider, if applicable, that
2415 			 * the slots for this ring are no longer valid.
2416 			 */
2417 			nx_fini_slots(nx, kring);
2418 			kring->ckr_ctx = NULL;
2419 		}
2420 	}
2421 
2422 	if (ksd_releases != 0) {
2423 		/*
2424 		 * Now that we've finished invoking the slot_fini()
2425 		 * callbacks, release the busy retain counts held
2426 		 * earlier in nx_init_rings().  This will allow the
2427 		 * kernel slot descriptor region to be torn down.
2428 		 */
2429 		skmem_arena_nexus_sd_set_noidle(
2430 			skmem_arena_nexus(na->na_arena), -ksd_releases);
2431 	}
2432 }
2433 
2434 static int
2435 nx_init_slots(struct kern_nexus *nx, struct __kern_channel_ring *kring)
2436 {
2437 	struct kern_nexus_provider *nxprov = NX_PROV(nx);
2438 	struct __slot_desc *slot = kring->ckr_ksds;
2439 	int err = 0;
2440 	uint32_t i;
2441 
2442 	/*
2443 	 * If the slot init callback was not provided, or if the
2444 	 * kring was not created to hold any slot contexts, don't
2445 	 * go any further.
2446 	 */
2447 	if (nxprov->nxprov_ext.nxpi_slot_init == NULL ||
2448 	    kring->ckr_slot_ctxs == NULL) {
2449 		return 0;
2450 	}
2451 
2452 	ASSERT(kring->ckr_slot_ctxs_set == 0);
2453 	ASSERT(slot != NULL);
2454 
2455 	for (i = 0; i < kring->ckr_num_slots; i++) {
2456 		struct kern_slot_prop *__single slot_ctx_prop = NULL;
2457 		/* -fbounds-safety: slot_ctx is unsafe anyway (mach_vmaddr_t) */
2458 		void *__single slot_ctx_arg = NULL;
2459 
2460 		ASSERT(&slot[i] <= kring->ckr_ksds_last);
2461 		if ((err = nxprov->nxprov_ext.nxpi_slot_init(nxprov, nx, kring,
2462 		    &slot[i], i, &slot_ctx_prop, &slot_ctx_arg)) != 0) {
2463 			SK_D("nx 0x%llx kr \"%s\" (0x%llx) krflags %b slot %u "
2464 			    "slot_init error %d", SK_KVA(nx), kring->ckr_name,
2465 			    SK_KVA(kring), kring->ckr_flags, CKRF_BITS, i, err);
2466 			break;
2467 		}
2468 		/* we don't want this to be used by client, so verify here */
2469 		ASSERT(slot_ctx_prop == NULL);
2470 		kring->ckr_slot_ctxs[i].slot_ctx_arg = slot_ctx_arg;
2471 		kring->ckr_slot_ctxs_set++;
2472 	}
2473 
2474 	if (err != 0) {
2475 		nx_fini_slots(nx, kring);
2476 	} else {
2477 		kring->ckr_flags |= CKRF_EXT_SLOTS_INITED;
2478 	}
2479 
2480 	return err;
2481 }
2482 
2483 static void
2484 nx_fini_slots(struct kern_nexus *nx, struct __kern_channel_ring *kring)
2485 {
2486 	struct kern_nexus_provider *nxprov = NX_PROV(nx);
2487 	struct __slot_desc *slot = kring->ckr_ksds;
2488 	uint32_t i;
2489 
2490 	ASSERT(!(kring->ckr_flags & CKRF_EXT_SLOTS_INITED) ||
2491 	    nxprov->nxprov_ext.nxpi_slot_fini != NULL);
2492 	ASSERT(slot != NULL || !(kring->ckr_flags & CKRF_EXT_SLOTS_INITED));
2493 
2494 	for (i = 0; i < kring->ckr_slot_ctxs_set; i++) {
2495 		ASSERT(slot != NULL && &slot[i] <= kring->ckr_ksds_last);
2496 		if (nxprov->nxprov_ext.nxpi_slot_fini != NULL) {
2497 			nxprov->nxprov_ext.nxpi_slot_fini(nxprov, nx,
2498 			    kring, &slot[i], i);
2499 		}
2500 		if (kring->ckr_slot_ctxs != NULL) {
2501 			kring->ckr_slot_ctxs[i].slot_ctx_arg = 0;
2502 		}
2503 	}
2504 	kring->ckr_slot_ctxs_set = 0;
2505 
2506 	/* We're done with this kring */
2507 	kring->ckr_flags &= ~CKRF_EXT_SLOTS_INITED;
2508 }
2509 
2510 
2511 /* 64-bit mask with range */
2512 #define BMASK64(_beg, _end)     \
2513 	((NX_PORT_CHUNK_FREE >> (63 - (_end))) & ~((1ULL << (_beg)) - 1))
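/*
 * Illustrative example: with NX_PORT_CHUNK_FREE being all-ones,
 * BMASK64(2, 5) == 0x3c, i.e. bits 2 through 5 inclusive are set.
 */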
2514 
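/*
 * Find the first available port in [first, last).  A set bit in
 * nx_ports_bmap means the port is free; if the range extends past
 * the currently-allocated map, the next unallocated index is
 * returned so that nx_port_alloc() can grow the map on demand.
 */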
2515 int
2516 nx_port_find(struct kern_nexus *nx, nexus_port_t first,
2517     nexus_port_t last, nexus_port_t *nx_port)
2518 {
2519 	int err = 0;
2520 
2521 	ASSERT(first < last);
2522 	*nx_port = NEXUS_PORT_ANY;
2523 
2524 	if (nx->nx_num_ports == 0 || (first + 1) >= nx->nx_num_ports) {
2525 		/*
2526 		 * Left edge of the range is beyond the current map;
2527 		 * let nx_port_alloc() handle the growing later.
2528 		 */
2529 		*nx_port = first;
2530 	} else {
2531 		nexus_port_size_t fc = (first / NX_PORT_CHUNK);
2532 		nexus_port_size_t lc = (MIN(last, nx->nx_num_ports) / NX_PORT_CHUNK);
2533 		nexus_port_size_t lim = (nx->nx_num_ports / NX_PORT_CHUNK);
2534 		nexus_port_size_t i, j;
2535 		bitmap_t *bmap;
2536 
2537 		/*
2538 		 * The right edge of the range is either within or
2539 		 * beyond the current map; scan thru the current
2540 		 * map and find the first available port.
2541 		 */
2542 		for (i = fc; i <= lc; i++) {
2543 			bitmap_t mask;
2544 			nexus_port_size_t beg = 0, end = 63;
2545 
2546 			if (i == fc) {
2547 				beg = (first % NX_PORT_CHUNK);
2548 			}
2549 			if (i == (last / NX_PORT_CHUNK)) {
2550 				end = (last % NX_PORT_CHUNK);
2551 			}
2552 
2553 			if (i < lim) {
2554 				bmap = &nx->nx_ports_bmap[i];
2555 				mask = BMASK64(beg, end);
2556 
2557 				j = (nexus_port_size_t)ffsll((*bmap) & mask);
2558 				if (j == 0) {
2559 					continue;
2560 				}
2561 
2562 				--j;
2563 				*nx_port = (i * NX_PORT_CHUNK) + j;
2564 			}
2565 			break;
2566 		}
2567 
2568 		/*
2569 		 * If the requested range is within the current map and we
2570 	 * couldn't find a port, return an error.  Otherwise, return
2571 		 * the next port index to trigger growing later.
2572 		 */
2573 		if (*nx_port == NEXUS_PORT_ANY) {
2574 			if (lc == (last / NX_PORT_CHUNK)) {
2575 				err = EBUSY;
2576 				SK_ERR("port unavail in [%u, %u)", first, last);
2577 			} else {
2578 				*nx_port = nx->nx_num_ports;
2579 			}
2580 		}
2581 	}
2582 
2583 	SK_DF(SK_VERB_NXPORT, "nx 0x%llx nx_port %d (err %d)", SK_KVA(nx),
2584 	    (int)*nx_port, err);
2585 
2586 	return err;
2587 }
2588 
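/*
 * Grow the port map by `grow' ports, a multiple of NX_PORT_CHUNK
 * (64, per the _CASSERT below), so each increment adds exactly one
 * bitmap_t word.  Both the free bitmap and the nx_port_info array
 * are reallocated; new bitmap words start out as NX_PORT_CHUNK_FREE
 * (all ports free).
 */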
2589 static int
2590 nx_port_grow(struct kern_nexus *nx, nexus_port_size_t grow)
2591 {
2592 	ASSERT(NXDOM_MAX(NX_DOM(nx), ports) <= NEXUS_PORT_MAX);
2593 	nexus_port_t dom_port_max = (nexus_port_size_t)NXDOM_MAX(NX_DOM(nx), ports);
2594 	struct nx_port_info *ports;
2595 	size_t limit;
2596 	nexus_port_size_t i, num_ports, old_num_ports;
2597 	bitmap_t *bmap;
2598 
2599 	ASSERT(grow > 0 && (grow % NX_PORT_CHUNK) == 0);
2600 	ASSERT((nx->nx_num_ports % NX_PORT_CHUNK) == 0);
2601 	_CASSERT((sizeof(*bmap) * 8) == NX_PORT_CHUNK);
2602 	ASSERT(powerof2(dom_port_max));
2603 	ASSERT(dom_port_max % NX_PORT_CHUNK == 0);
2604 
2605 	old_num_ports = nx->nx_num_ports;
2606 	num_ports = nx->nx_num_ports + grow;
2607 	limit = P2ROUNDUP(dom_port_max, NX_PORT_CHUNK);
2608 	if (num_ports > limit) {
2609 		SK_ERR("can't grow, total %u grow %u (new %u > dom_max %u)",
2610 		    nx->nx_num_ports, grow, num_ports, limit);
2611 		return EDOM;
2612 	}
2613 
2614 	if ((bmap = sk_realloc_data(nx->nx_ports_bmap,
2615 	    (old_num_ports / NX_PORT_CHUNK) * sizeof(*bmap),
2616 	    (num_ports / NX_PORT_CHUNK) * sizeof(*bmap),
2617 	    Z_WAITOK, skmem_tag_nx_port)) == NULL) {
2618 		SK_ERR("bmap alloc failed, num_port %u", num_ports);
2619 		return ENOMEM;
2620 	}
2621 	nx->nx_ports_bmap = bmap;
2622 	nx->nx_ports_bmap_size = (num_ports / NX_PORT_CHUNK) * sizeof(*bmap);
2623 
2624 	if ((ports = sk_realloc_type_array(struct nx_port_info, old_num_ports,
2625 	    num_ports, nx->nx_ports, Z_WAITOK, skmem_tag_nx_port)) == NULL) {
2626 		/* can't free bmap here, otherwise nexus won't work */
2627 		SK_ERR("nx_ports alloc failed, num_port %u", num_ports);
2628 		return ENOMEM;
2629 	}
2630 
2631 	/* initialize the additional new ports */
2632 	bzero(&ports[nx->nx_num_ports], (grow * sizeof(*ports)));
2633 
2634 	/* initialize new bitmaps (set all bits) */
2635 	for (i = (nx->nx_num_ports / NX_PORT_CHUNK);
2636 	    i < (num_ports / NX_PORT_CHUNK); i++) {
2637 		bmap[i] = NX_PORT_CHUNK_FREE;
2638 	}
2639 
2640 	/*
2641 	 * -fbounds-safety: Not sure if moving nx_ports assignment down here
2642 	 * would cause a regression.
2643 	 */
2644 	nx->nx_ports = ports;
2645 	nx->nx_num_ports = num_ports;
2646 
2647 	SK_DF(SK_VERB_NXPORT, "!!! nx 0x%llx ports %u/%u, %u ports added",
2648 	    SK_KVA(nx), nx->nx_active_ports, nx->nx_num_ports, grow);
2649 
2650 	return 0;
2651 }
2652 
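/*
 * Claim a specific port, growing the map first if needed.  Three
 * outcomes: a free port is claimed (bit cleared) when the caller
 * supplies an adapter; a bound port is granted only if the caller's
 * nxbind credentials match; an occupied port yields a retained
 * reference to the resident adapter when *na is NULL, or EEXIST
 * otherwise.
 */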
2653 int
2654 nx_port_alloc(struct kern_nexus *nx, nexus_port_t nx_port, struct nxbind *nxb,
2655     struct nexus_adapter **na, struct proc *p)
2656 {
2657 	struct nx_port_info *npi = NULL;
2658 	struct nxbind *nxb0;
2659 	size_t g;
2660 	uint32_t i, j;
2661 	bitmap_t *bmap;
2662 	bool refonly = false;
2663 	int err = 0;
2664 
2665 	ASSERT(nx_port != NEXUS_PORT_ANY);
2666 	ASSERT((nx->nx_num_ports % NX_PORT_CHUNK) == 0);
2667 
2668 	/* port is zero-based, so adjust here */
2669 	if ((nx_port + 1) > nx->nx_num_ports) {
2670 		g = P2ROUNDUP((nx_port + 1) - nx->nx_num_ports, NX_PORT_CHUNK);
2671 		VERIFY(g <= NEXUS_PORT_MAX);
2672 		if ((err = nx_port_grow(nx, (nexus_port_size_t)g)) != 0) {
2673 			goto done;
2674 		}
2675 	}
2676 	ASSERT(err == 0);
2677 	ASSERT(nx_port < nx->nx_num_ports);
2678 	npi = &nx->nx_ports[nx_port];
2679 	nxb0 = npi->npi_nxb;
2680 	i = nx_port / NX_PORT_CHUNK;
2681 	j = nx_port % NX_PORT_CHUNK;
2682 	bmap = &nx->nx_ports_bmap[i];
2683 
2684 	if (bit_test(*bmap, j)) {
2685 		/* port is not (yet) bound or allocated */
2686 		ASSERT(npi->npi_nah == 0 && npi->npi_nxb == NULL);
2687 		if (p != kernproc && !NX_ANONYMOUS_PROV(nx)) {
2688 			/*
2689 			 * If the port allocation is requested by userland
2690 			 * and the nexus is non-anonymous, then fail the
2691 			 * request.
2692 			 */
2693 			err = EACCES;
2694 			SK_ERR("user proc alloc on named nexus needs binding");
2695 		} else if (na != NULL && *na != NULL) {
2696 			/*
2697 			 * Otherwise claim it (clear bit) if the caller
2698 			 * supplied an adapter for this port; else, it
2699 			 * is just an existential check and so there's
2700 			 * no action needed at this point (we'll skip
2701 			 * the init below since vpna is NULL).
2702 			 */
2703 			bit_clear(*bmap, j);
2704 		}
2705 	} else {
2706 		/* if port is bound, check if credentials match */
2707 		if (nxb0 != NULL && p != kernproc && !NX_ANONYMOUS_PROV(nx) &&
2708 		    (nxb == NULL || !nxb_is_equal(nxb0, nxb))) {
2709 			SK_ERR("nexus binding mismatch");
2710 			err = EACCES;
2711 		} else {
2712 			/*
2713 			 * If port is already occupied by an adapter,
2714 			 * see if the client is requesting a reference
2715 			 * to it; if so, return the adapter.  Otherwise,
2716 			 * if unoccupied and vpna is non-NULL, associate
2717 			 * it with this nexus port via the below init.
2718 			 */
2719 			if (NPI_NA(npi) != NULL) {
2720 				if (na != NULL && *na == NULL) {
2721 					*na = NPI_NA(npi);
2722 					na_retain_locked(*na);
2723 					/* skip the init below */
2724 					refonly = true;
2725 				} else {
2726 					/*
2727 					 * If the client supplied an adapter
2728 					 * (regardless of its value) for a
2729 					 * nexus port that's already occupied,
2730 					 * then we fail the request.
2731 					 */
2732 					SK_ERR("nexus adapter exists");
2733 					err = EEXIST;
2734 				}
2735 			}
2736 		}
2737 	}
2738 
2739 done:
2740 	/* initialize the nexus port and the adapter occupying it */
2741 	if (err == 0 && na != NULL && *na != NULL && !refonly) {
2742 		ASSERT(nx_port < nx->nx_num_ports);
2743 		ASSERT(npi->npi_nah == 0);
2744 		ASSERT(nx->nx_active_ports < nx->nx_num_ports);
2745 		ASSERT(!bit_test(nx->nx_ports_bmap[nx_port / NX_PORT_CHUNK],
2746 		    (nx_port % NX_PORT_CHUNK)));
2747 
2748 		nx->nx_active_ports++;
2749 		npi->npi_nah = NPI_NA_ENCODE(*na, NEXUS_PORT_STATE_WORKING);
2750 		(*na)->na_nx_port = nx_port;
2751 	}
2752 
2753 	SK_DF(SK_VERB_NXPORT, "nx 0x%llx nx_port %d, ports %u/%u (err %d)",
2754 	    SK_KVA(nx), (int)nx_port, nx->nx_active_ports, nx->nx_num_ports,
2755 	    err);
2756 
2757 	return err;
2758 }
2759 
2760 void
2761 nx_port_defunct(struct kern_nexus *nx, nexus_port_t nx_port)
2762 {
2763 	struct nx_port_info *npi = &nx->nx_ports[nx_port];
2764 
2765 	npi->npi_nah = NPI_NA_ENCODE(npi->npi_nah,
2766 	    NEXUS_PORT_STATE_DEFUNCT);
2767 }
2768 
2769 void
2770 nx_port_free(struct kern_nexus *nx, nexus_port_t nx_port)
2771 {
2772 	struct nx_port_info *npi = NULL;
2773 	bitmap_t *bmap;
2774 	uint32_t i, j;
2775 
2776 	ASSERT((nx->nx_num_ports % NX_PORT_CHUNK) == 0);
2777 	ASSERT(nx_port != NEXUS_PORT_ANY && nx_port < nx->nx_num_ports);
2778 	ASSERT(nx->nx_active_ports != 0);
2779 
2780 	i = nx_port / NX_PORT_CHUNK;
2781 	j = nx_port % NX_PORT_CHUNK;
2782 	bmap = &nx->nx_ports_bmap[i];
2783 	ASSERT(!bit_test(*bmap, j));
2784 
2785 	npi = &nx->nx_ports[nx_port];
2786 	npi->npi_nah = 0;
2787 	if (npi->npi_nxb == NULL) {
2788 		/* it's vacant, release it (set bit) */
2789 		bit_set(*bmap, j);
2790 	}
2791 
2792 	nx->nx_active_ports--;
2793 
2794 	//XXX [email protected] --- try to shrink bitmap & nx_ports ???
2795 
2796 	SK_DF(SK_VERB_NXPORT, "--- nx 0x%llx nx_port %d, ports %u/%u",
2797 	    SK_KVA(nx), (int)nx_port, nx->nx_active_ports, nx->nx_num_ports);
2798 }
2799 
2800 int
2801 nx_port_bind_info(struct kern_nexus *nx, nexus_port_t nx_port,
2802     struct nxbind *nxb0, void *info)
2803 {
2804 	struct nx_port_info *npi = NULL;
2805 	size_t g;
2806 	uint32_t i, j;
2807 	bitmap_t *bmap;
2808 	int err = 0;
2809 
2810 	ASSERT(nx_port != NEXUS_PORT_ANY);
2811 	ASSERT(nx_port < NXDOM_MAX(NX_DOM(nx), ports));
2812 	ASSERT((nx->nx_num_ports % NX_PORT_CHUNK) == 0);
2813 	ASSERT(nxb0 != NULL);
2814 
2815 	if ((nx_port + 1) > nx->nx_num_ports) {
2816 		g = P2ROUNDUP((nx_port + 1) - nx->nx_num_ports, NX_PORT_CHUNK);
2817 		VERIFY(g <= NEXUS_PORT_MAX);
2818 		if ((err = nx_port_grow(nx, (nexus_port_size_t)g)) != 0) {
2819 			goto done;
2820 		}
2821 	}
2822 	ASSERT(err == 0);
2823 
2824 	npi = &nx->nx_ports[nx_port];
2825 	i = nx_port / NX_PORT_CHUNK;
2826 	j = nx_port % NX_PORT_CHUNK;
2827 	bmap = &nx->nx_ports_bmap[i];
2828 	if (bit_test(*bmap, j)) {
2829 		/* port is not (yet) bound or allocated */
2830 		ASSERT(npi->npi_nah == 0 && npi->npi_nxb == NULL);
2831 
2832 		bit_clear(*bmap, j);
2833 		struct nxbind *nxb = nxb_alloc(Z_WAITOK);
2834 		nxb_move(nxb0, nxb);
2835 		npi->npi_nxb = nxb;
2836 		npi->npi_info = info;
2837 		/* claim it (clear bit) */
2838 		bit_clear(*bmap, j);
2839 		ASSERT(err == 0);
2840 	} else {
2841 		/* port is already taken */
2842 		ASSERT(NPI_NA(npi) != NULL || npi->npi_nxb != NULL);
2843 		err = EEXIST;
2844 	}
2845 done:
2846 
2847 	SK_DF(err ? SK_VERB_ERROR : SK_VERB_NXPORT,
2848 	    "+++ nx 0x%llx nx_port %d, ports %u/%u (err %d)", SK_KVA(nx),
2849 	    (int)nx_port, nx->nx_active_ports, nx->nx_num_ports, err);
2850 
2851 	return err;
2852 }
2853 
2854 int
2855 nx_port_bind(struct kern_nexus *nx, nexus_port_t nx_port, struct nxbind *nxb0)
2856 {
2857 	return nx_port_bind_info(nx, nx_port, nxb0, NULL);
2858 }
2859 
2860 /*
2861  * -fbounds-safety: all callers pass npi_info. Why don't we just change the
2862  * input type to nx_port_info_header *?
2863  */
2864 static int
2865 nx_port_info_size(struct nx_port_info_header *info, size_t *sz)
2866 {
2867 	struct nx_port_info_header *hdr = info;
2868 
2869 	switch (hdr->ih_type) {
2870 	case NX_PORT_INFO_TYPE_NETIF:
2871 		break;
2872 	default:
2873 		return EINVAL;
2874 	}
2875 	*sz = hdr->ih_size;
2876 	return 0;
2877 }
2878 
2879 int
2880 nx_port_unbind(struct kern_nexus *nx, nexus_port_t nx_port)
2881 {
2882 	struct nx_port_info *npi = NULL;
2883 	struct nxbind *nxb;
2884 	uint32_t i, j;
2885 	bitmap_t *bmap;
2886 	int err = 0;
2887 
2888 	ASSERT(nx_port != NEXUS_PORT_ANY);
2889 
2890 	if (nx_port >= nx->nx_num_ports) {
2891 		err = EDOM;
2892 		goto done;
2893 	}
2894 
2895 	npi = &nx->nx_ports[nx_port];
2896 	i = nx_port / NX_PORT_CHUNK;
2897 	j = nx_port % NX_PORT_CHUNK;
2898 	bmap = &nx->nx_ports_bmap[i];
2899 
2900 	if ((nxb = npi->npi_nxb) == NULL) {
2901 		/* must be either free or allocated */
2902 		ASSERT(NPI_NA(npi) == NULL ||
2903 		    (!bit_test(*bmap, j) && nx->nx_active_ports > 0));
2904 		err = ENOENT;
2905 	} else {
2906 		nxb_free(nxb);
2907 		npi->npi_nxb = NULL;
2908 		if (npi->npi_info != NULL) {
2909 			size_t sz;
2910 
2911 			VERIFY(nx_port_info_size(npi->npi_info, &sz) == 0);
2912 			sk_free_data(npi->npi_info, sz);
2913 			npi->npi_info = NULL;
2914 		}
2915 		ASSERT(!bit_test(*bmap, j));
2916 		if (NPI_NA(npi) == NULL) {
2917 			/* it's vacant, release it (set bit) */
2918 			bit_set(*bmap, j);
2919 		}
2920 	}
2921 
2922 done:
2923 	SK_DF(err ? SK_VERB_ERROR : SK_VERB_NXPORT,
2924 	    "--- nx 0x%llx nx_port %d, ports %u/%u (err %d)", SK_KVA(nx),
2925 	    (int)nx_port, nx->nx_active_ports, nx->nx_num_ports, err);
2926 
2927 	return err;
2928 }
2929 
2930 struct nexus_adapter *
2931 nx_port_get_na(struct kern_nexus *nx, nexus_port_t nx_port)
2932 {
2933 	if (nx->nx_ports != NULL && nx->nx_num_ports > nx_port) {
2934 		return NPI_NA(&nx->nx_ports[nx_port]);
2935 	} else {
2936 		return NULL;
2937 	}
2938 }
2939 
2940 int
2941 nx_port_get_info(struct kern_nexus *nx, nexus_port_t port,
2942     nx_port_info_type_t type, void *__sized_by(len)info, uint32_t len)
2943 {
2944 	struct nx_port_info *npi;
2945 	struct nx_port_info_header *hdr;
2946 
2947 	if (nx->nx_ports == NULL || port >= nx->nx_num_ports) {
2948 		return ENXIO;
2949 	}
2950 	npi = &nx->nx_ports[port];
2951 	/*
2952 	 * -fbounds-safety: Changing npi_info to be __sized_by is a major
2953 	 * surgery. Just forge it here for now.
2954 	 */
2955 	hdr = __unsafe_forge_bidi_indexable(struct nx_port_info_header *,
2956 	    npi->npi_info, len);
2957 	if (hdr == NULL) {
2958 		return ENOENT;
2959 	}
2960 
2961 	if (hdr->ih_type != type) {
2962 		return EINVAL;
2963 	}
2964 
2965 	bcopy(hdr, info, len);
2966 	return 0;
2967 }
2968 
2969 bool
2970 nx_port_is_valid(struct kern_nexus *nx, nexus_port_t nx_port)
2971 {
2972 	return nx_port < nx->nx_num_ports;
2973 }
2974 
2975 bool
2976 nx_port_is_defunct(struct kern_nexus *nx, nexus_port_t nx_port)
2977 {
2978 	ASSERT(nx_port_is_valid(nx, nx_port));
2979 
2980 	return NPI_IS_DEFUNCT(&nx->nx_ports[nx_port]);
2981 }
2982 
2983 void
2984 nx_port_free_all(struct kern_nexus *nx)
2985 {
2986 	/* uncrustify doesn't handle C blocks properly */
2987 	/* BEGIN IGNORE CODESTYLE */
2988 	nx_port_foreach(nx, ^(nexus_port_t p) {
2989 		struct nxbind *nxb;
2990 		/*
2991 		 * XXX -fbounds-safety: Come back to this after fixing npi_info
2992 		 */
2993 		void *__single info;
2994 		nxb = nx->nx_ports[p].npi_nxb;
2995 		info = nx->nx_ports[p].npi_info;
2996 		if (nxb != NULL) {
2997 			nxb_free(nxb);
2998 			nx->nx_ports[p].npi_nxb = NULL;
2999 		}
3000 		if (info != NULL) {
3001 			size_t sz;
3002 
3003 			VERIFY(nx_port_info_size(info, &sz) == 0);
3004 			skn_free_data(info, info, sz);
3005 			nx->nx_ports[p].npi_info = NULL;
3006 		}
3007 	});
3008 	/* END IGNORE CODESTYLE */
3009 
3010 	nx->nx_active_ports = 0;
3011 	sk_free_data_sized_by(nx->nx_ports_bmap, nx->nx_ports_bmap_size);
3012 	nx->nx_ports_bmap = NULL;
3013 	nx->nx_ports_bmap_size = 0;
3014 	sk_free_type_array_counted_by(struct nx_port_info, nx->nx_num_ports, nx->nx_ports);
3015 	nx->nx_ports = NULL;
3016 	nx->nx_num_ports = 0;
3017 }
3018 
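/*
 * Apply `port_handle' to every in-use port; a cleared bit means the
 * port is bound or allocated.  Chunks whose bitmap word is
 * NX_PORT_CHUNK_FREE (entirely free) are skipped wholesale.
 */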
3019 void
3020 nx_port_foreach(struct kern_nexus *nx,
3021     void (^port_handle)(nexus_port_t nx_port))
3022 {
3023 	for (nexus_port_size_t i = 0; i < (nx->nx_num_ports / NX_PORT_CHUNK); i++) {
3024 		bitmap_t bmap = nx->nx_ports_bmap[i];
3025 
3026 		if (bmap == NX_PORT_CHUNK_FREE) {
3027 			continue;
3028 		}
3029 
3030 		for (nexus_port_size_t j = 0; j < NX_PORT_CHUNK; j++) {
3031 			if (bit_test(bmap, j)) {
3032 				continue;
3033 			}
3034 			port_handle((i * NX_PORT_CHUNK) + j);
3035 		}
3036 	}
3037 }
3038 
3039 /*
3040  * sysctl interfaces
3041  */
3042 static int nexus_provider_list_sysctl SYSCTL_HANDLER_ARGS;
3043 static int nexus_channel_list_sysctl SYSCTL_HANDLER_ARGS;
3044 static int nexus_mib_get_sysctl SYSCTL_HANDLER_ARGS;
3045 
3046 SYSCTL_PROC(_kern_skywalk, OID_AUTO, nexus_provider_list,
3047     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
3048     0, 0, nexus_provider_list_sysctl, "S,nexus_provider_info_t", "");
3049 
3050 SYSCTL_PROC(_kern_skywalk, OID_AUTO, nexus_channel_list,
3051     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
3052     0, 0, nexus_channel_list_sysctl, "S,nexus_channel_entry_t", "");
3053 
3054 SYSCTL_PROC(_kern_skywalk, OID_AUTO, llink_list,
3055     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
3056     0, NXMIB_LLINK_LIST, nexus_mib_get_sysctl, "S,nx_llink_info",
3057     "A list of logical links");
3058 
3059 SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, flow,
3060     CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY | CTLFLAG_KERN,
3061     0, NXMIB_FLOW, nexus_mib_get_sysctl, "S,sk_stats_flow",
3062     "Nexus inet flows with stats collected in kernel");
3063 
3064 SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, flow_owner,
3065     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
3066     0, NXMIB_FLOW_OWNER, nexus_mib_get_sysctl, "S,sk_stats_flow_owner",
3067     "Nexus flow owners");
3068 
3069 SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, flow_route,
3070     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
3071     0, NXMIB_FLOW_ROUTE, nexus_mib_get_sysctl, "S,sk_stats_flow_route",
3072     "Nexus flow routes");
3073 
3074 SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, net_if,
3075     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
3076     0, NXMIB_NETIF_STATS, nexus_mib_get_sysctl, "S,sk_stats_net_if",
3077     "Nexus netif statistics collected in kernel");
3078 
3079 SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, flow_switch,
3080     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
3081     0, NXMIB_FSW_STATS, nexus_mib_get_sysctl, "S,sk_stats_flow_switch",
3082     "Nexus flowswitch statistics collected in kernel");
3083 
3084 SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, userstack,
3085     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
3086     0, NXMIB_USERSTACK_STATS, nexus_mib_get_sysctl, "S,sk_stats_userstack",
3087     "Nexus userstack statistics counter");
3088 
3089 SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, flow_adv,
3090     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
3091     0, NXMIB_FLOW_ADV, nexus_mib_get_sysctl, "S,sk_stats_flow_adv",
3092     "Nexus flow advisory dump");
3093 
3094 SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, netif_queue,
3095     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
3096     0, NXMIB_NETIF_QUEUE_STATS, nexus_mib_get_sysctl, "S,netif_qstats_info",
3097     "A list of netif queue stats entries");
3098 
3099 /*
3100  * Provider list sysctl
3101  */
3102 static void
3103 nexus_provider_info_populate(struct kern_nexus_provider *nxprov,
3104     nexus_provider_info_t info)
3105 {
3106 	struct kern_nexus *nx;
3107 	uuid_t *uuids;
3108 
3109 	SK_LOCK_ASSERT_HELD();
3110 
3111 	/* provider UUID + params */
3112 	uuid_copy(info->npi_prov_uuid, nxprov->nxprov_uuid);
3113 	bcopy(nxprov->nxprov_params, &info->npi_prov_params,
3114 	    sizeof(struct nxprov_params));
3115 	info->npi_instance_uuids_count = nxprov->nxprov_nx_count;
3116 
3117 	/* instance UUID list */
3118 	uuids = __unsafe_forge_bidi_indexable(uuid_t *,
3119 	    info->npi_instance_uuids, sizeof(uuid_t) * info->npi_instance_uuids_count);
3120 	STAILQ_FOREACH(nx, &nxprov->nxprov_nx_head, nx_prov_link) {
3121 		uuid_copy(*uuids, nx->nx_uuid);
3122 		uuids++;
3123 	}
3124 }
3125 
3126 static int
3127 nexus_provider_list_sysctl SYSCTL_HANDLER_ARGS
3128 {
3129 #pragma unused(arg1, arg2, oidp)
3130 	size_t actual_space;
3131 	caddr_t buffer = NULL;
3132 	size_t buffer_space;
3133 	size_t allocated_space;
3134 	int out_error;
3135 	int error = 0;
3136 	struct kern_nexus_provider *nxprov;
3137 	caddr_t scan;
3138 
3139 	if (!kauth_cred_issuser(kauth_cred_get())) {
3140 		return EPERM;
3141 	}
3142 
3143 	net_update_uptime();
3144 	buffer_space = req->oldlen;
3145 	if (req->oldptr != USER_ADDR_NULL && buffer_space != 0) {
3146 		if (buffer_space > SK_SYSCTL_ALLOC_MAX) {
3147 			buffer_space = SK_SYSCTL_ALLOC_MAX;
3148 		}
3149 		allocated_space = buffer_space;
3150 		buffer = sk_alloc_data(allocated_space, Z_WAITOK, skmem_tag_sysctl_buf);
3151 		if (__improbable(buffer == NULL)) {
3152 			return ENOBUFS;
3153 		}
3154 	} else if (req->oldptr == USER_ADDR_NULL) {
3155 		buffer_space = 0;
3156 	}
3157 	actual_space = 0;
3158 	scan = buffer;
3159 	SK_LOCK();
3160 	STAILQ_FOREACH(nxprov, &nxprov_head, nxprov_link) {
3161 		size_t                  info_size;
3162 
3163 		info_size =
3164 		    NEXUS_PROVIDER_INFO_SIZE(nxprov->nxprov_nx_count);
3165 		if (scan != NULL) {
3166 			if (buffer_space < info_size) {
3167 				/* supplied buffer too small, stop copying */
3168 				error = ENOMEM;
3169 				break;
3170 			}
3171 			nexus_provider_info_populate(nxprov, (void *)scan);
3172 			scan += info_size;
3173 			buffer_space -= info_size;
3174 		}
3175 		actual_space += info_size;
3176 	}
3177 	SK_UNLOCK();
3178 
3179 	out_error = SYSCTL_OUT(req, buffer, actual_space);
3180 	if (out_error != 0) {
3181 		error = out_error;
3182 	}
3183 
3184 	if (buffer != NULL) {
3185 		sk_free_data(buffer, allocated_space);
3186 	}
3187 
3188 	return error;
3189 }
3190 
3191 /*
3192  * Channel list sysctl
3193  */
3194 static uint32_t
3195 channel_ring_count(struct kern_channel *ch, enum txrx which)
3196 {
3197 	return ch->ch_last[which] - ch->ch_first[which];
3198 }
3199 
3200 /*
3201  * -fbounds-safety: kring's range is [first..last]. Marking it
3202  * __counted_by(last) means range is [0..first..last]. The [0..first) might be
3203  * problematic. However, the for loop in this function starts indexing from
3204  * 'first', not 0, so that should be okay.
3205  * XXX Until BATS starts using uncrustify-7 (rdar://90709826), having a space
3206  * between __counted_by(entry_count) entries will be considered invalid code
3207  * style and build will fail. Until rdar://117811249 is resolved, either stick
3208  * to what makes BATS happy, or wrap IGNORE CODESTYLE around.
3209  */
3210 static void
3211 populate_ring_entries(struct __kern_channel_ring *__counted_by(last)kring,
3212     ring_id_t first, ring_id_t last,
3213     nexus_channel_ring_entry *__counted_by(entry_count)entries,
3214     uint32_t NX_FB_ARG entry_count)
3215 {
3216 	ring_id_t i;
3217 	nexus_channel_ring_entry_t scan;
3218 	struct __kern_channel_ring *ring;
3219 
3220 	scan = entries;
3221 	for (i = first; i < last; i++, scan++) {
3222 		ring = &kring[i];
3223 
3224 		DTRACE_SKYWALK1(populate__ring, struct __kern_channel_ring *,
3225 		    ring);
3226 		if (kr_stat_enable == 0) {
3227 			bzero(&scan->ncre_stats, sizeof(scan->ncre_stats));
3228 			bzero(&scan->ncre_user_stats,
3229 			    sizeof(scan->ncre_user_stats));
3230 		} else {
3231 			scan->ncre_stats = ring->ckr_stats;
3232 			scan->ncre_user_stats = ring->ckr_usr_stats;
3233 		}
3234 		scan->ncre_error_stats = ring->ckr_err_stats;
3235 		scan->ncre_ring_id = i;
3236 	}
3237 }
3238 
3239 /* combine/convert ch_mode/ch_flags into nexus_channel_entry flags */
3240 static uint32_t
3241 nexus_channel_get_flags(uint32_t ch_mode, uint32_t ch_flags)
3242 {
3243 	uint32_t flags = 0;
3244 
3245 	flags |= (ch_mode & CHMODE_MONITOR_TX) ? SCHF_MONITOR_TX : 0;
3246 	flags |= (ch_mode & CHMODE_MONITOR_RX) ? SCHF_MONITOR_RX : 0;
3247 	flags |= (ch_mode & CHMODE_MONITOR_NO_COPY) ? SCHF_MONITOR_NO_COPY : 0;
3248 	flags |= (ch_mode & CHMODE_USER_PACKET_POOL) ? SCHF_USER_PACKET_POOL : 0;
3249 	flags |= (ch_mode & CHMODE_DEFUNCT_OK) ? SCHF_DEFUNCT_OK : 0;
3250 	flags |= (ch_mode & CHMODE_FILTER) ? SCHF_FILTER : 0;
3251 	flags |= (ch_mode & CHMODE_EVENT_RING) ? SCHF_EVENT_RING : 0;
3252 	flags |= (ch_mode & CHMODE_EXCLUSIVE) ? SCHF_EXCLUSIVE : 0;
3253 	flags |= (ch_flags & CHANF_IF_ADV) ? SCHF_IF_ADV : 0;
3254 	flags |= (ch_flags & CHANF_DEFUNCT_SKIP) ? SCHF_DEFUNCT_SKIP : 0;
3255 	flags |= (ch_flags & CHANF_CLOSING) ? SCHF_CLOSING : 0;
3256 	flags |= (ch_flags & CHANF_DEFUNCT) ? SCHF_DEFUNCT : 0;
3257 	flags |= (ch_mode & CHMODE_LOW_LATENCY) ? SCHF_LOW_LATENCY : 0;
3258 
3259 	return flags;
3260 }
3261 
3262 SK_NO_INLINE_ATTRIBUTE
3263 static void
3264 nexus_channel_entry_populate(struct kern_channel *ch,
3265     nexus_channel_entry_t entry)
3266 {
3267 	uint32_t ch_mode = ch->ch_info->cinfo_ch_mode;
3268 	uint32_t ch_flags = ch->ch_flags;
3269 	ring_id_t rx_first = ch->ch_first[NR_RX];
3270 	ring_id_t rx_last = ch->ch_last[NR_RX];
3271 	ring_id_t tx_last = ch->ch_last[NR_TX];
3272 	ring_id_t tx_first = ch->ch_first[NR_TX];
3273 
3274 	uuid_copy(entry->nce_uuid, ch->ch_info->cinfo_ch_id);
3275 	entry->nce_flags = nexus_channel_get_flags(ch_mode, ch_flags);
3276 	entry->nce_port = ch->ch_info->cinfo_nx_port;
3277 	entry->nce_pid = ch->ch_pid;
3278 	entry->nce_fd = ch->ch_fd;
3279 	entry->nce_tx_rings = tx_last - tx_first;
3280 	entry->nce_rx_rings = rx_last - rx_first;
3281 	populate_ring_entries(ch->ch_na->na_tx_rings, tx_first, tx_last,
3282 	    entry->nce_ring_entries, entry->nce_tx_rings);
3283 
3284 	/*
3285 	 * -fbounds-safety: If entry->nce_tx_rings > 0 and
3286 	 * entry->nce_rx_rings == 0 (i.e. entry->nce_ring_count ==
3287 	 * entry->nce_tx_rings), simply passing
3288 	 * entry->nce_ring_entries + entry->nce_tx_rings to populate_ring_entries
3289 	 * will fail bounds check, because it is equivalent to assigning
3290 	 * nce_ring_entries + nce_tx_rings to a __single variable, and in this
3291 	 * case it goes out of bounds. It's same thing as having:
3292 	 *     int a[1];
3293 	 *     some_func(a + 1);  <-- bounds check will fail
3294 	 */
3295 	if (rx_first < rx_last) {
3296 		populate_ring_entries(ch->ch_na->na_rx_rings, rx_first, rx_last,
3297 		    entry->nce_ring_entries + entry->nce_tx_rings,
3298 		    entry->nce_rx_rings);
3299 	}
3300 }
3301 
3302 SK_NO_INLINE_ATTRIBUTE
3303 static size_t
3304 nexus_channel_info_populate(struct kern_nexus *nx,
3305     nexus_channel_info *__sized_by(buffer_size) info, size_t buffer_size)
3306 {
3307 	struct kern_channel *ch = NULL;
3308 	size_t info_size;
3309 	caddr_t scan = NULL;
3310 	nexus_channel_entry *entry;
3311 
3312 	SK_LOCK_ASSERT_HELD();
3313 
3314 	info_size = sizeof(nexus_channel_info);
3315 
3316 	/* channel list */
3317 	if (info != NULL) {
3318 		if (buffer_size < info_size) {
3319 			return info_size;
3320 		}
3321 
3322 		/* instance UUID */
3323 		uuid_copy(info->nci_instance_uuid, nx->nx_uuid);
3324 		info->nci_channel_entries_count = nx->nx_ch_count;
3325 		scan = (caddr_t __bidi_indexable)info->nci_channel_entries;
3326 	}
3327 	STAILQ_FOREACH(ch, &nx->nx_ch_head, ch_link) {
3328 		size_t          entry_size;
3329 		uint32_t        ring_count;
3330 
3331 		ring_count = channel_ring_count(ch, NR_TX) +
3332 		    channel_ring_count(ch, NR_RX);
3333 		entry_size = NEXUS_CHANNEL_ENTRY_SIZE(ring_count);
3334 		info_size += entry_size;
3335 		if (scan != NULL) {
3336 			if (buffer_size < info_size) {
3337 				return info_size;
3338 			}
3339 			entry = (nexus_channel_entry *)(void *)scan;
3340 			entry->nce_ring_count = ring_count;
3341 
3342 			nexus_channel_entry_populate(ch, entry);
3343 			scan += entry_size;
3344 		}
3345 	}
3346 	return info_size;
3347 }
3348 
3349 static int
3350 nexus_channel_list_sysctl SYSCTL_HANDLER_ARGS
3351 {
3352 #pragma unused(arg1, arg2, oidp)
3353 	size_t actual_space;
3354 	caddr_t buffer = NULL;
3355 	size_t buffer_space;
3356 	size_t allocated_space;
3357 	int out_error;
3358 	struct kern_nexus *nx;
3359 	int error = 0;
3360 	caddr_t scan;
3361 
3362 	if (!kauth_cred_issuser(kauth_cred_get())) {
3363 		return EPERM;
3364 	}
3365 
3366 	net_update_uptime();
3367 	buffer_space = req->oldlen;
3368 	if (req->oldptr != USER_ADDR_NULL && buffer_space != 0) {
3369 		if (buffer_space > SK_SYSCTL_ALLOC_MAX) {
3370 			buffer_space = SK_SYSCTL_ALLOC_MAX;
3371 		}
3372 		allocated_space = buffer_space;
3373 		buffer = sk_alloc_data(allocated_space, Z_WAITOK, skmem_tag_sysctl_buf);
3374 		if (__improbable(buffer == NULL)) {
3375 			return ENOBUFS;
3376 		}
3377 	} else if (req->oldptr == USER_ADDR_NULL) {
3378 		buffer_space = 0;
3379 	}
3380 	actual_space = 0;
3381 	scan = buffer;
3382 	SK_LOCK();
3383 	RB_FOREACH(nx, kern_nexus_tree, &nx_head) {
3384 		size_t info_size;
3385 
3386 		info_size = nexus_channel_info_populate(nx, (void *)scan,
3387 		    buffer_space);
3388 		if (scan != NULL) {
3389 			if (buffer_space < info_size) {
3390 				/* supplied buffer too small, stop copying */
3391 				error = ENOMEM;
3392 				break;
3393 			}
3394 			scan += info_size;
3395 			buffer_space -= info_size;
3396 		}
3397 		actual_space += info_size;
3398 	}
3399 	SK_UNLOCK();
3400 
3401 	if (actual_space != 0) {
3402 		out_error = SYSCTL_OUT(req, buffer, actual_space);
3403 		if (out_error != 0) {
3404 			error = out_error;
3405 		}
3406 	}
3407 	if (buffer != NULL) {
3408 		sk_free_data(buffer, allocated_space);
3409 	}
3410 
3411 	return error;
3412 }
3413 
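/*
 * A minimal userland sketch (illustrative; not part of this file)
 * of querying one of the MIB nodes above.  The optional "new"
 * buffer carries a struct nexus_mib_filter whose nmf_type must
 * match the node's subcommand:
 *
 *	struct nexus_mib_filter nmf = { .nmf_type = NXMIB_FLOW };
 *	size_t len = 0;
 *	// sizing call; a second call with a buffer fetches the records
 *	sysctlbyname("kern.skywalk.stats.flow", NULL, &len,
 *	    &nmf, sizeof(nmf));
 */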
3414 static int
3415 nexus_mib_get_sysctl SYSCTL_HANDLER_ARGS
3416 {
3417 #pragma unused(arg1, arg2)
3418 	struct proc *p = req->p;
3419 	struct nexus_mib_filter filter;
3420 	int error = 0;
3421 	size_t actual_space;
3422 	size_t allocated_space = 0;
3423 	caddr_t __sized_by(allocated_space) buffer = NULL;
3424 	size_t buffer_space;
3425 	int out_error;
3426 	struct kern_nexus *nx;
3427 	caddr_t scan;
3428 
3429 	/* Restrict protocol stats access to root user only (like netstat). */
3430 	if (oidp->oid_arg2 == NXMIB_USERSTACK_STATS &&
3431 	    !kauth_cred_issuser(kauth_cred_get())) {
3432 		SK_ERR("mib request rejected, EPERM");
3433 		return EPERM;
3434 	}
3435 
3436 	if (req->newptr == USER_ADDR_NULL) {
3437 		/*
3438 		 * For flow stats requests, non-root users need to provide a
3439 		 * 5-tuple. Otherwise, we do not grant access.
3440 		 */
3441 		if (oidp->oid_arg2 == NXMIB_FLOW &&
3442 		    !kauth_cred_issuser(kauth_cred_get())) {
3443 			SK_ERR("mib request rejected: tuple not provided");
3444 			return EPERM;
3445 		}
3446 		/* use subcommand for multiple nodes */
3447 		filter.nmf_type = oidp->oid_arg2;
3448 		filter.nmf_bitmap = 0x0;
3449 	} else if (req->newlen != sizeof(struct nexus_mib_filter)) {
3450 		SK_ERR("mis-matching newlen");
3451 		return EINVAL;
3452 	} else {
3453 		error = SYSCTL_IN(req, &filter, sizeof(struct nexus_mib_filter));
3454 		if (error != 0) {
3455 			SK_ERR("SYSCTL_IN err %d", error);
3456 			return error;
3457 		}
3458 		if (filter.nmf_type != oidp->oid_arg2) {
3459 			SK_ERR("mis-matching nmf_type");
3460 			return EINVAL;
3461 		}
3462 		/*
3463 		 * For flow stats requests, non-root users need to set the nexus
3464 		 * mib filter to NXMIB_FILTER_INFO_TUPLE. Otherwise, we do not
3465 		 * grant access. This ensures that fsw_mib_get_flow looks for a
3466 		 * flow entry that matches the given tuple of the non-root user.
3467 		 */
3468 		if (filter.nmf_type == NXMIB_FLOW &&
3469 		    (filter.nmf_bitmap & NXMIB_FILTER_INFO_TUPLE) == 0 &&
3470 		    !kauth_cred_issuser(kauth_cred_get())) {
3471 			SK_ERR("mib request rejected: tuple filter not set");
3472 			return EPERM;
3473 		}
3474 	}
3475 
3476 	net_update_uptime();
3477 	buffer_space = req->oldlen;
3478 	if (req->oldptr != USER_ADDR_NULL && buffer_space != 0) {
3479 		if (buffer_space > SK_SYSCTL_ALLOC_MAX) {
3480 			buffer_space = SK_SYSCTL_ALLOC_MAX;
3481 		}
3482 		buffer = sk_alloc_data(buffer_space, Z_WAITOK, skmem_tag_sysctl_buf);
3483 		allocated_space = buffer_space;
3484 		if (__improbable(buffer == NULL)) {
3485 			return ENOBUFS;
3486 		}
3487 	} else if (req->oldptr == USER_ADDR_NULL) {
3488 		buffer_space = 0;
3489 	}
3490 	actual_space = 0;
3491 	scan = buffer;
3492 
3493 	SK_LOCK();
3494 	RB_FOREACH(nx, kern_nexus_tree, &nx_head) {
3495 		if (NX_DOM_PROV(nx)->nxdom_prov_nx_mib_get == NULL) {
3496 			continue;
3497 		}
3498 
3499 		size_t size = 0;
3500 		struct kern_nexus_domain_provider *nx_dp = NX_DOM_PROV(nx);
3501 
3502 		/*
3503 		 * -fbounds-safety: Because scan takes the bounds of buffer
3504 		 * (which is __sized_by(allocated_space)), at some point scan
3505 		 * will reach its bounds (because of scan += size). When it
3506 		 * does, it won't pass the bounds check when scan is passed to
3507 		 * nxdom_prov_nx_mib_get function. We need to avoid passing scan
3508 		 * to nxdom_prov_nx_mib_get when it reaches its upper bound,
3509 		 * i.e. when buffer_space reaches 0 (see buffer_space -= size).
3510 		 */
3511 		if (req->oldptr == USER_ADDR_NULL || buffer_space) {
3512 			size = nx_dp->nxdom_prov_nx_mib_get(nx, &filter, scan,
3513 			    buffer_space, p);
3514 		}
3515 
3516 		if (scan != NULL) {
3517 			if (buffer_space < size) {
3518 				/* supplied buffer too small, stop copying */
3519 				error = ENOMEM;
3520 				break;
3521 			}
3522 			scan += size;
3523 			buffer_space -= size;
3524 		}
3525 		actual_space += size;
3526 	}
3527 	SK_UNLOCK();
3528 
3529 	if (actual_space != 0) {
3530 		out_error = SYSCTL_OUT(req, buffer, actual_space);
3531 		if (out_error != 0) {
3532 			error = out_error;
3533 		}
3534 	}
3535 	if (buffer != NULL) {
3536 		sk_free_data_sized_by(buffer, allocated_space);
3537 	}
3538 
3539 	return error;
3540 }
3541 
3542 void
3543 kern_nexus_walktree(kern_nexus_walktree_f_t *f, void *arg0,
3544     boolean_t is_sk_locked)
3545 {
3546 	struct kern_nexus *nx = NULL;
3547 
3548 	if (!is_sk_locked) {
3549 		SK_LOCK();
3550 	} else {
3551 		SK_LOCK_ASSERT_HELD();
3552 	}
3553 
3554 	RB_FOREACH(nx, kern_nexus_tree, &nx_head) {
3555 		(*f)(nx, arg0);
3556 	}
3557 
3558 	if (!is_sk_locked) {
3559 		SK_UNLOCK();
3560 	}
3561 }
3562 
3563 errno_t
3564 kern_nexus_get_pbufpool_info(const uuid_t nx_uuid,
3565     struct kern_pbufpool_memory_info *rx_pool_info,
3566     struct kern_pbufpool_memory_info *tx_pool_info)
3567 {
3568 	struct kern_pbufpool *__single tpp, *__single rpp;
3569 	struct kern_nexus *nx;
3570 	errno_t err = 0;
3571 
3572 	nx = nx_find(nx_uuid, FALSE);
3573 	if (nx == NULL) {
3574 		err = ENOENT;
3575 		goto done;
3576 	}
3577 
3578 	if (nx->nx_prov->nxprov_params->nxp_type != NEXUS_TYPE_NET_IF) {
3579 		err = ENOTSUP;
3580 		goto done;
3581 	}
3582 
3583 	err = nx_netif_prov_nx_mem_info(nx, &tpp, &rpp);
3584 	if (err != 0) {
3585 		goto done;
3586 	}
3587 
3588 	if ((tpp == NULL) && (rpp == NULL)) {
3589 		err = ENOENT;
3590 		goto done;
3591 	}
3592 
3593 	if (tx_pool_info != NULL) {
3594 		bzero(tx_pool_info, sizeof(*tx_pool_info));
3595 	}
3596 	if (rx_pool_info != NULL) {
3597 		bzero(rx_pool_info, sizeof(*rx_pool_info));
3598 	}
3599 
3600 	if ((tx_pool_info != NULL) && (tpp != NULL)) {
3601 		err = kern_pbufpool_get_memory_info(tpp, tx_pool_info);
3602 		if (err != 0) {
3603 			goto done;
3604 		}
3605 	}
3606 
3607 	if ((rx_pool_info != NULL) && (rpp != NULL)) {
3608 		err = kern_pbufpool_get_memory_info(rpp, rx_pool_info);
3609 	}
3610 
3611 done:
3612 	if (nx != NULL) {
3613 		(void) nx_release(nx);
3614 		nx = NULL;
3615 	}
3616 	return err;
3617 }
3618 
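/*
 * Post an interface-advisory update to every channel registered on
 * the nx_ch_if_adv_head list.  The shared try-lock keeps this path
 * non-blocking: if ch_close_common() is concurrently pruning the
 * list (holding the lock exclusively), the update is dropped and
 * accounted for in the netif/flowswitch stats instead.
 */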
3619 void
3620 nx_interface_advisory_notify(struct kern_nexus *nx)
3621 {
3622 	struct kern_channel *ch;
3623 	struct netif_stats *nifs;
3624 	struct fsw_stats *fsw_stats;
3625 	nexus_type_t nxdom_type = NX_DOM(nx)->nxdom_type;
3626 
3627 	if (nxdom_type == NEXUS_TYPE_NET_IF) {
3628 		nifs = &NX_NETIF_PRIVATE(nx)->nif_stats;
3629 	} else if (nxdom_type == NEXUS_TYPE_FLOW_SWITCH) {
3630 		fsw_stats = &NX_FSW_PRIVATE(nx)->fsw_stats;
3631 	} else {
3632 		VERIFY(0);
3633 		__builtin_unreachable();
3634 	}
3635 	if (!lck_rw_try_lock_shared(&nx->nx_ch_if_adv_lock)) {
3636 		if (nxdom_type == NEXUS_TYPE_NET_IF) {
3637 			STATS_INC(nifs, NETIF_STATS_IF_ADV_UPD_DROP);
3638 		} else {
3639 			STATS_INC(fsw_stats, FSW_STATS_IF_ADV_UPD_DROP);
3640 		}
3641 		return;
3642 	}
3643 	/*
3644 	 * if the channel is in "nx_ch_if_adv_head" list, then we can
3645 	 * safely assume that the channel is not closed yet.
3646 	 * In ch_close_common(), the channel is removed from the
3647 	 * "nx_ch_if_adv_head" list holding the "nx_ch_if_adv_lock" in
3648 	 * exclusive mode, prior to closing the channel.
3649 	 */
3650 	STAILQ_FOREACH(ch, &nx->nx_ch_if_adv_head, ch_link_if_adv) {
3651 		struct nexus_adapter *na = ch->ch_na;
3652 
3653 		ASSERT(na != NULL);
3654 		na_post_event(&na->na_tx_rings[ch->ch_first[NR_TX]],
3655 		    TRUE, FALSE, FALSE, CHAN_FILT_HINT_IF_ADV_UPD);
3656 		if (nxdom_type == NEXUS_TYPE_NET_IF) {
3657 			STATS_INC(nifs, NETIF_STATS_IF_ADV_UPD_SENT);
3658 		} else {
3659 			STATS_INC(fsw_stats, FSW_STATS_IF_ADV_UPD_SENT);
3660 		}
3661 	}
3662 	lck_rw_done(&nx->nx_ch_if_adv_lock);
3663 }
3664