xref: /xnu-10063.101.15/bsd/skywalk/nexus/nexus.c (revision 94d3b452840153a99b38a3a9659680b2a006908e)
1 /*
2  * Copyright (c) 2015-2022 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <skywalk/os_skywalk_private.h>
30 #include <skywalk/nexus/netif/nx_netif.h>
31 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
32 #include <sys/sdt.h>
33 
34 static uint32_t disable_nxctl_check = 0;
35 #if (DEVELOPMENT || DEBUG)
36 SYSCTL_UINT(_kern_skywalk, OID_AUTO, disable_nxctl_check,
37     CTLFLAG_RW | CTLFLAG_LOCKED, &disable_nxctl_check, 0, "");
38 #endif
39 
40 LCK_GRP_DECLARE(nexus_lock_group, "sk_nx_lock");
41 LCK_GRP_DECLARE(nexus_mbq_lock_group, "sk_nx_mbq_lock");
42 LCK_GRP_DECLARE(nexus_pktq_lock_group, "sk_nx_pktq_lock");
43 LCK_ATTR_DECLARE(nexus_lock_attr, 0, 0);
44 
45 static STAILQ_HEAD(, nxctl) nxctl_head =
46     STAILQ_HEAD_INITIALIZER(nxctl_head);
47 static STAILQ_HEAD(, kern_nexus_provider) nxprov_head =
48     STAILQ_HEAD_INITIALIZER(nxprov_head);
49 
50 static int nx_cmp(const struct kern_nexus *, const struct kern_nexus *);
51 RB_HEAD(kern_nexus_tree, kern_nexus);
52 RB_PROTOTYPE_SC(static, kern_nexus_tree, kern_nexus, nx_link, nx_cmp);
53 RB_GENERATE(kern_nexus_tree, kern_nexus, nx_link, nx_cmp);
54 static struct kern_nexus_tree   nx_head;
55 
56 static int nxctl_get_nexus_prov_list(struct nxctl *, struct sockopt *);
57 static int nxctl_get_nexus_prov_entry(struct nxctl *, struct sockopt *);
58 static int nxctl_get_nexus_list(struct nxctl *, struct sockopt *);
59 static int nxctl_nexus_bind(struct nxctl *, struct sockopt *);
60 static int nxctl_nexus_unbind(struct nxctl *, struct sockopt *);
61 static int nxctl_nexus_config(struct nxctl *, struct sockopt *);
62 static int nxctl_get_channel_list(struct nxctl *, struct sockopt *);
63 static void nxctl_retain_locked(struct nxctl *);
64 static int nxctl_release_locked(struct nxctl *);
65 static void nxctl_init(struct nxctl *, struct proc *, struct fileproc *);
66 static struct nxctl *nxctl_alloc(struct proc *, struct fileproc *, zalloc_flags_t);
67 static void nxctl_free(struct nxctl *);
68 
69 static struct kern_nexus_provider *nxprov_create_common(struct nxctl *,
70     struct kern_nexus_domain_provider *, struct nxprov_reg *,
71     const struct kern_nexus_provider_init *init, int *);
72 static void nxprov_detach(struct kern_nexus_provider *, boolean_t);
73 static void nxprov_retain_locked(struct kern_nexus_provider *);
74 static int nxprov_release_locked(struct kern_nexus_provider *);
75 static struct kern_nexus_provider *nxprov_alloc(
76 	struct kern_nexus_domain_provider *, zalloc_flags_t);
77 static void nxprov_free(struct kern_nexus_provider *);
78 
79 static int nx_init_rings(struct kern_nexus *, struct kern_channel *);
80 static void nx_fini_rings(struct kern_nexus *, struct kern_channel *);
81 static int nx_init_slots(struct kern_nexus *, struct __kern_channel_ring *);
82 static void nx_fini_slots(struct kern_nexus *, struct __kern_channel_ring *);
83 static struct kern_nexus *nx_alloc(zalloc_flags_t);
84 static void nx_free(struct kern_nexus *);
85 
86 static SKMEM_TYPE_DEFINE(nxctl_zone, struct nxctl);
87 
88 static SKMEM_TYPE_DEFINE(nxbind_zone, struct nxbind);
89 
90 static SKMEM_TYPE_DEFINE(nxprov_zone, struct kern_nexus_provider);
91 
92 static SKMEM_TYPE_DEFINE(nxprov_params_zone, struct nxprov_params);
93 
94 static SKMEM_TYPE_DEFINE(nx_zone, struct kern_nexus);
95 
96 static int __nx_inited = 0;
97 
98 #define SKMEM_TAG_NX_KEY        "com.apple.skywalk.nexus.key"
99 SKMEM_TAG_DEFINE(skmem_tag_nx_key, SKMEM_TAG_NX_KEY);
100 
101 #define SKMEM_TAG_NX_MIB        "com.apple.skywalk.nexus.mib"
102 static SKMEM_TAG_DEFINE(skmem_tag_nx_mib, SKMEM_TAG_NX_MIB);
103 
104 #define SKMEM_TAG_NX_PORT        "com.apple.skywalk.nexus.port"
105 SKMEM_TAG_DEFINE(skmem_tag_nx_port, SKMEM_TAG_NX_PORT);
106 
107 #define SKMEM_TAG_NX_PORT_INFO        "com.apple.skywalk.nexus.port.info"
108 SKMEM_TAG_DEFINE(skmem_tag_nx_port_info, SKMEM_TAG_NX_PORT_INFO);
109 
110 /*
111  * Special nexus controller handle for Skywalk internal use.  Unlike all
112  * other nexus controller handles that are created by userland or kernel
113  * clients, this one never gets closed or freed.  It is also not part of
114  * the global nxctl_head list.
115  */
116 static struct nxctl _kernnxctl;
117 static struct nxctl _usernxctl;
118 struct nexus_controller kernnxctl = { .ncd_nxctl = &_kernnxctl };
119 struct nexus_controller usernxctl = { .ncd_nxctl = &_usernxctl };
120 
121 int
nexus_init(void)122 nexus_init(void)
123 {
124 	SK_LOCK_ASSERT_HELD();
125 	ASSERT(!__nx_inited);
126 
127 	RB_INIT(&nx_head);
128 
129 	na_init();
130 
131 	/* attach system built-in domains and domain providers */
132 	nxdom_attach_all();
133 
134 	/*
135 	 * Initialize private kernel and shared user nexus controller handle;
136 	 *
137 	 * Shared Kernel controller is used internally for creating nexus providers
138 	 * and nexus instances from within the Skywalk code (e.g. netif_compat).
139 	 *
140 	 * Shared User controller is used userspace by clients(e.g. libnetcore)
141 	 * that would like to call nexus instances for use cases like
142 	 * configuring flow entry that they own indirectly (e.g. via NECP), so
143 	 * that the nexus would perform permission check based on other info
144 	 * (e.g. PID, UUID) and bypass nxctl check (this nxctl has no
145 	 * credentials).
146 	 */
147 	nxctl_init(&_kernnxctl, kernproc, NULL);
148 	nxctl_retain_locked(&_kernnxctl);       /* one for us */
149 	nxctl_init(&_usernxctl, kernproc, NULL);
150 	nxctl_retain_locked(&_usernxctl);       /* one for us */
151 	nxctl_traffic_rule_init();
152 
153 	__nx_inited = 1;
154 
155 	return 0;
156 }
157 
158 void
nexus_fini(void)159 nexus_fini(void)
160 {
161 	SK_LOCK_ASSERT_HELD();
162 
163 	if (__nx_inited) {
164 		nxctl_traffic_rule_fini();
165 		nxctl_release_locked(&_kernnxctl);
166 		nxctl_release_locked(&_usernxctl);
167 
168 		/* tell all domains they're going away */
169 		nxdom_detach_all();
170 
171 		ASSERT(RB_EMPTY(&nx_head));
172 
173 		na_fini();
174 
175 		__nx_inited = 0;
176 	}
177 }
178 
179 struct nxctl *
nxctl_create(struct proc * p,struct fileproc * fp,const uuid_t nxctl_uuid,int * err)180 nxctl_create(struct proc *p, struct fileproc *fp, const uuid_t nxctl_uuid,
181     int *err)
182 {
183 	struct nxctl *nxctl = NULL;
184 
185 	ASSERT(!uuid_is_null(nxctl_uuid));
186 
187 	/* privilege checks would be done when performing nxctl operations */
188 
189 	SK_LOCK();
190 
191 	nxctl = nxctl_alloc(p, fp, Z_WAITOK);
192 
193 	STAILQ_INSERT_TAIL(&nxctl_head, nxctl, nxctl_link);
194 	nxctl->nxctl_flags |= NEXUSCTLF_ATTACHED;
195 	uuid_copy(nxctl->nxctl_uuid, nxctl_uuid);
196 
197 	nxctl_retain_locked(nxctl);     /* one for being in the list */
198 	nxctl_retain_locked(nxctl);     /* one for the caller */
199 
200 #if SK_LOG
201 	uuid_string_t uuidstr;
202 	SK_D("nxctl 0x%llx UUID %s", SK_KVA(nxctl),
203 	    sk_uuid_unparse(nxctl->nxctl_uuid, uuidstr));
204 #endif /* SK_LOG */
205 
206 	SK_UNLOCK();
207 
208 	if (*err != 0) {
209 		nxctl_free(nxctl);
210 		nxctl = NULL;
211 	}
212 	return nxctl;
213 }
214 
215 void
nxctl_close(struct nxctl * nxctl)216 nxctl_close(struct nxctl *nxctl)
217 {
218 	struct kern_nexus_provider *nxprov = NULL, *tnxprov;
219 
220 	lck_mtx_lock(&nxctl->nxctl_lock);
221 	SK_LOCK();
222 
223 	ASSERT(!(nxctl->nxctl_flags & NEXUSCTLF_KERNEL));
224 
225 #if SK_LOG
226 	uuid_string_t uuidstr;
227 	SK_D("nxctl 0x%llx UUID %s flags 0x%b", SK_KVA(nxctl),
228 	    sk_uuid_unparse(nxctl->nxctl_uuid, uuidstr),
229 	    nxctl->nxctl_flags, NEXUSCTLF_BITS);
230 #endif /* SK_LOG */
231 
232 	if (!(nxctl->nxctl_flags & NEXUSCTLF_NOFDREF)) {
233 		nxctl->nxctl_flags |= NEXUSCTLF_NOFDREF;
234 		nxctl->nxctl_fp = NULL;
235 	}
236 
237 	/* may be called as part of failure cleanup, so check */
238 	if (nxctl->nxctl_flags & NEXUSCTLF_ATTACHED) {
239 		/* caller must hold an extra ref */
240 		ASSERT(nxctl->nxctl_refcnt > 1);
241 		(void) nxctl_release_locked(nxctl);
242 
243 		STAILQ_REMOVE(&nxctl_head, nxctl, nxctl, nxctl_link);
244 		nxctl->nxctl_flags &= ~NEXUSCTLF_ATTACHED;
245 	}
246 
247 repeat:
248 	STAILQ_FOREACH_SAFE(nxprov, &nxprov_head, nxprov_link, tnxprov) {
249 		/*
250 		 * Close provider only for those which are owned by
251 		 * this control instance.  Note that if we close the
252 		 * provider, we need to repeat this search as the
253 		 * list might have been changed by another thread.
254 		 * That's possible since SK_UNLOCK() may be called
255 		 * as a result of calling nxprov_close().
256 		 */
257 		if (!(nxprov->nxprov_flags & NXPROVF_CLOSED) &&
258 		    nxprov->nxprov_ctl == nxctl) {
259 			nxprov_retain_locked(nxprov);
260 			(void) nxprov_close(nxprov, TRUE);
261 			(void) nxprov_release_locked(nxprov);
262 			goto repeat;
263 		}
264 	}
265 
266 	SK_UNLOCK();
267 	lck_mtx_unlock(&nxctl->nxctl_lock);
268 	nxctl_traffic_rule_clean(nxctl);
269 }
270 
271 int
nxctl_set_opt(struct nxctl * nxctl,struct sockopt * sopt)272 nxctl_set_opt(struct nxctl *nxctl, struct sockopt *sopt)
273 {
274 #pragma unused(nxctl)
275 	int err = 0;
276 
277 	NXCTL_LOCK_ASSERT_HELD(nxctl);
278 
279 	if (sopt->sopt_dir != SOPT_SET) {
280 		sopt->sopt_dir = SOPT_SET;
281 	}
282 
283 	switch (sopt->sopt_name) {
284 	case NXOPT_NEXUS_BIND:
285 		err = nxctl_nexus_bind(nxctl, sopt);
286 		break;
287 
288 	case NXOPT_NEXUS_UNBIND:
289 		err = nxctl_nexus_unbind(nxctl, sopt);
290 		break;
291 
292 	case NXOPT_NEXUS_CONFIG:
293 		err = nxctl_nexus_config(nxctl, sopt);
294 		break;
295 
296 	default:
297 		err = ENOPROTOOPT;
298 		break;
299 	}
300 
301 	return err;
302 }
303 
304 int
nxctl_get_opt(struct nxctl * nxctl,struct sockopt * sopt)305 nxctl_get_opt(struct nxctl *nxctl, struct sockopt *sopt)
306 {
307 #pragma unused(nxctl)
308 	int err = 0;
309 
310 	NXCTL_LOCK_ASSERT_HELD(nxctl);
311 
312 	if (sopt->sopt_dir != SOPT_GET) {
313 		sopt->sopt_dir = SOPT_GET;
314 	}
315 
316 	switch (sopt->sopt_name) {
317 	case NXOPT_NEXUS_PROV_LIST:
318 		err = nxctl_get_nexus_prov_list(nxctl, sopt);
319 		break;
320 
321 	case NXOPT_NEXUS_PROV_ENTRY:
322 		err = nxctl_get_nexus_prov_entry(nxctl, sopt);
323 		break;
324 
325 	case NXOPT_NEXUS_LIST:
326 		err = nxctl_get_nexus_list(nxctl, sopt);
327 		break;
328 
329 	case NXOPT_CHANNEL_LIST:
330 		err = nxctl_get_channel_list(nxctl, sopt);
331 		break;
332 
333 	default:
334 		err = ENOPROTOOPT;
335 		break;
336 	}
337 
338 	return err;
339 }
340 
341 /* Upper bound on # of nrl_num_regs that we'd return to user space */
342 #define MAX_NUM_REG_ENTRIES     256
343 
344 /* Hoisted out of line to reduce kernel stack footprint */
345 SK_NO_INLINE_ATTRIBUTE
346 static int
nxctl_get_nexus_prov_list(struct nxctl * nxctl,struct sockopt * sopt)347 nxctl_get_nexus_prov_list(struct nxctl *nxctl, struct sockopt *sopt)
348 {
349 	user_addr_t tmp_ptr = USER_ADDR_NULL;
350 	struct nxprov_reg_ent *pnre, *nres = NULL;
351 	struct nxprov_list_req nrlr;
352 	struct kern_nexus_provider *nxprov = NULL;
353 	uint32_t nregs = 0, ncregs = 0;
354 	int err = 0, observeall;
355 	size_t nres_sz;
356 
357 	NXCTL_LOCK_ASSERT_HELD(nxctl);
358 
359 	ASSERT(sopt->sopt_p != NULL);
360 	if (sopt->sopt_val == USER_ADDR_NULL) {
361 		return EINVAL;
362 	}
363 
364 	err = sooptcopyin(sopt, &nrlr, sizeof(nrlr), sizeof(nrlr));
365 	if (err != 0) {
366 		return err;
367 	}
368 
369 	if ((size_t)nrlr.nrl_num_regs > MAX_NUM_REG_ENTRIES) {
370 		nrlr.nrl_num_regs = MAX_NUM_REG_ENTRIES;
371 	}
372 
373 	/*
374 	 * If the caller specified a buffer, copy out the Nexus provider
375 	 * entries to caller gracefully.  We only copy out the number of
376 	 * entries which caller has asked for, but we always tell caller
377 	 * how big the buffer really needs to be.
378 	 */
379 	tmp_ptr = nrlr.nrl_regs;
380 	if (tmp_ptr != USER_ADDR_NULL && nrlr.nrl_num_regs > 0) {
381 		nres_sz = (size_t)nrlr.nrl_num_regs * sizeof(*nres);
382 		nres = sk_alloc_data(nres_sz, Z_WAITOK, skmem_tag_sysctl_buf);
383 		if (__improbable(nres == NULL)) {
384 			return ENOBUFS;
385 		}
386 	}
387 
388 	observeall = (skywalk_priv_check_cred(sopt->sopt_p, nxctl->nxctl_cred,
389 	    PRIV_SKYWALK_OBSERVE_ALL) == 0);
390 
391 	SK_LOCK();
392 	/*
393 	 * Count number of providers.  If buffer space exists and
394 	 * remains, copy out provider entries.
395 	 */
396 	nregs = nrlr.nrl_num_regs;
397 	pnre = nres;
398 
399 	STAILQ_FOREACH(nxprov, &nxprov_head, nxprov_link) {
400 		/*
401 		 * Return only entries that are visible to the caller,
402 		 * unless it has PRIV_SKYWALK_OBSERVE_ALL.
403 		 */
404 		if (nxprov->nxprov_ctl != nxctl && !observeall) {
405 			continue;
406 		}
407 
408 		if (nres != NULL && nregs > 0) {
409 			uuid_copy(pnre->npre_prov_uuid, nxprov->nxprov_uuid);
410 			bcopy(nxprov->nxprov_params, &pnre->npre_prov_params,
411 			    sizeof(struct nxprov_params));
412 			--nregs;
413 			++pnre;
414 			++ncregs;
415 		}
416 	}
417 	SK_UNLOCK();
418 
419 	if (ncregs == 0) {
420 		err = ENOENT;
421 	}
422 
423 	if (nres != NULL) {
424 		if (err == 0 && tmp_ptr != USER_ADDR_NULL) {
425 			if (sopt->sopt_p != kernproc) {
426 				err = copyout(nres, tmp_ptr,
427 				    ncregs * sizeof(*nres));
428 			} else {
429 				bcopy(nres, CAST_DOWN(caddr_t, tmp_ptr),
430 				    ncregs * sizeof(*nres));
431 			}
432 		}
433 		sk_free_data(nres, nres_sz);
434 		nres = NULL;
435 	}
436 
437 	if (err == 0) {
438 		nrlr.nrl_num_regs = ncregs;
439 		err = sooptcopyout(sopt, &nrlr, sizeof(nrlr));
440 	}
441 
442 	return err;
443 }
444 
445 /* Hoisted out of line to reduce kernel stack footprint */
446 SK_NO_INLINE_ATTRIBUTE
447 static int
nxctl_get_nexus_prov_entry(struct nxctl * nxctl,struct sockopt * sopt)448 nxctl_get_nexus_prov_entry(struct nxctl *nxctl, struct sockopt *sopt)
449 {
450 	struct nxprov_reg_ent nre;
451 	struct kern_nexus_provider *nxprov = NULL;
452 	int err = 0;
453 
454 	NXCTL_LOCK_ASSERT_HELD(nxctl);
455 
456 	ASSERT(sopt->sopt_p != NULL);
457 	if (sopt->sopt_val == USER_ADDR_NULL) {
458 		return EINVAL;
459 	}
460 
461 	bzero(&nre, sizeof(nre));
462 	err = sooptcopyin(sopt, &nre, sizeof(nre), sizeof(nre));
463 	if (err != 0) {
464 		return err;
465 	}
466 
467 	if (uuid_is_null(nre.npre_prov_uuid)) {
468 		return EINVAL;
469 	}
470 
471 	SK_LOCK();
472 	STAILQ_FOREACH(nxprov, &nxprov_head, nxprov_link) {
473 		if (uuid_compare(nxprov->nxprov_uuid,
474 		    nre.npre_prov_uuid) == 0) {
475 			/*
476 			 * Return only entries that are visible to the caller,
477 			 * unless it has PRIV_SKYWALK_OBSERVE_ALL.
478 			 */
479 			if (nxprov->nxprov_ctl != nxctl) {
480 				if (skywalk_priv_check_cred(sopt->sopt_p,
481 				    nxctl->nxctl_cred,
482 				    PRIV_SKYWALK_OBSERVE_ALL) != 0) {
483 					nxprov = NULL;
484 					break;
485 				}
486 			}
487 
488 			bcopy(nxprov->nxprov_params, &nre.npre_prov_params,
489 			    sizeof(struct nxprov_params));
490 			break;
491 		}
492 	}
493 	SK_UNLOCK();
494 
495 	if (nxprov != NULL) {
496 		err = sooptcopyout(sopt, &nre, sizeof(nre));
497 	} else {
498 		err = ENOENT;
499 	}
500 
501 	return err;
502 }
503 
504 /* Upper bound on # of nl_num_nx_uuids that we'd return to user space */
505 #define MAX_NUM_NX_UUIDS        4096
506 
507 /* Hoisted out of line to reduce kernel stack footprint */
508 SK_NO_INLINE_ATTRIBUTE
509 static int
nxctl_get_nexus_list(struct nxctl * nxctl,struct sockopt * sopt)510 nxctl_get_nexus_list(struct nxctl *nxctl, struct sockopt *sopt)
511 {
512 	user_addr_t tmp_ptr = USER_ADDR_NULL;
513 	uint32_t nuuids = 0, ncuuids = 0;
514 	uuid_t *puuid, *uuids = NULL;
515 	size_t uuids_sz;
516 	struct nx_list_req nlr;
517 	struct kern_nexus_provider *nxprov = NULL;
518 	struct kern_nexus *nx = NULL;
519 	int err = 0, observeall;
520 
521 	NXCTL_LOCK_ASSERT_HELD(nxctl);
522 
523 	ASSERT(sopt->sopt_p != NULL);
524 	if (sopt->sopt_val == USER_ADDR_NULL) {
525 		return EINVAL;
526 	}
527 
528 	err = sooptcopyin(sopt, &nlr, sizeof(nlr), sizeof(nlr));
529 	if (err != 0) {
530 		return err;
531 	}
532 
533 	if (uuid_is_null(nlr.nl_prov_uuid)) {
534 		return EINVAL;
535 	} else if ((size_t)nlr.nl_num_nx_uuids > MAX_NUM_NX_UUIDS) {
536 		nlr.nl_num_nx_uuids = MAX_NUM_NX_UUIDS;
537 	}
538 
539 	/*
540 	 * If the caller specified a buffer, copy out the Nexus UUIDs to
541 	 * caller gracefully.  We only copy out the number of UUIDs which
542 	 * caller has asked for, but we always tell caller how big the
543 	 * buffer really needs to be.
544 	 */
545 	tmp_ptr = nlr.nl_nx_uuids;
546 	if (tmp_ptr != USER_ADDR_NULL && nlr.nl_num_nx_uuids > 0) {
547 		uuids_sz = (size_t)nlr.nl_num_nx_uuids * sizeof(uuid_t);
548 		uuids = sk_alloc_data(uuids_sz, Z_WAITOK, skmem_tag_sysctl_buf);
549 		if (__improbable(uuids == NULL)) {
550 			return ENOBUFS;
551 		}
552 	}
553 
554 	observeall = (skywalk_priv_check_cred(sopt->sopt_p, nxctl->nxctl_cred,
555 	    PRIV_SKYWALK_OBSERVE_ALL) == 0);
556 
557 	SK_LOCK();
558 	STAILQ_FOREACH(nxprov, &nxprov_head, nxprov_link) {
559 		/*
560 		 * Return only entries that are visible to the caller,
561 		 * unless it has PRIV_SKYWALK_OBSERVE_ALL.
562 		 */
563 		if (nxprov->nxprov_ctl != nxctl && !observeall) {
564 			continue;
565 		}
566 
567 		if (uuid_compare(nxprov->nxprov_uuid, nlr.nl_prov_uuid) == 0) {
568 			break;
569 		}
570 	}
571 
572 	if (nxprov != NULL) {
573 		/*
574 		 * Count number of Nexus.  If buffer space exists
575 		 * and remains, copy out the Nexus UUIDs.
576 		 */
577 		nuuids = nlr.nl_num_nx_uuids;
578 		puuid = uuids;
579 
580 		STAILQ_FOREACH(nx, &nxprov->nxprov_nx_head, nx_prov_link) {
581 			++ncuuids;
582 			if (uuids != NULL && nuuids > 0) {
583 				uuid_copy(*puuid, nx->nx_uuid);
584 				--nuuids;
585 				++puuid;
586 			}
587 		}
588 	} else {
589 		err = ENOENT;
590 	}
591 	SK_UNLOCK();
592 
593 	if (uuids != NULL) {
594 		if (err == 0 && nxprov != NULL && tmp_ptr != USER_ADDR_NULL) {
595 			uintptr_t cnt_uuid;
596 
597 			/* Note: Pointer arithmetic */
598 			cnt_uuid = (uintptr_t)(puuid - uuids);
599 			if (cnt_uuid > 0) {
600 				if (sopt->sopt_p != kernproc) {
601 					err = copyout(uuids, tmp_ptr,
602 					    cnt_uuid * sizeof(uuid_t));
603 				} else {
604 					bcopy(uuids,
605 					    CAST_DOWN(caddr_t, tmp_ptr),
606 					    cnt_uuid * sizeof(uuid_t));
607 				}
608 			}
609 		}
610 		sk_free_data(uuids, uuids_sz);
611 		uuids = NULL;
612 	}
613 
614 	if (err == 0) {
615 		nlr.nl_num_nx_uuids = ncuuids;
616 		err = sooptcopyout(sopt, &nlr, sizeof(nlr));
617 	}
618 
619 	return err;
620 }
621 
622 /* Hoisted out of line to reduce kernel stack footprint */
623 SK_NO_INLINE_ATTRIBUTE
624 static int
nxctl_nexus_bind(struct nxctl * nxctl,struct sockopt * sopt)625 nxctl_nexus_bind(struct nxctl *nxctl, struct sockopt *sopt)
626 {
627 	boolean_t m_pid, m_exec_uuid, m_key;
628 	struct nx_bind_req nbr;
629 	struct proc *p = PROC_NULL;
630 	struct nxbind *nxb = NULL;
631 	uint64_t p_uniqueid = -1;
632 	pid_t p_pid = -1;
633 	struct kern_nexus *nx = NULL;
634 #if SK_LOG
635 	uuid_string_t exec_uuidstr;
636 #endif /* SK_LOG */
637 	uuid_t p_uuid;
638 	void *key = NULL;
639 	int err = 0;
640 
641 	NXCTL_LOCK_ASSERT_HELD(nxctl);
642 
643 	if (sopt->sopt_val == USER_ADDR_NULL) {
644 		return EINVAL;
645 	}
646 
647 	uuid_clear(p_uuid);
648 	bzero(&nbr, sizeof(nbr));
649 	err = sooptcopyin(sopt, &nbr, sizeof(nbr), sizeof(nbr));
650 	if (err != 0) {
651 		return err;
652 	}
653 
654 	if (uuid_is_null(nbr.nb_nx_uuid)) {
655 		err = EINVAL;
656 		goto done_unlocked;
657 	}
658 
659 	nbr.nb_flags &= NBR_MATCH_MASK;
660 	if (nbr.nb_flags == 0) {
661 		/* must choose one of the match criteria */
662 		err = EINVAL;
663 		goto done_unlocked;
664 	}
665 	m_pid = !!(nbr.nb_flags & NBR_MATCH_PID);
666 	m_exec_uuid = !!(nbr.nb_flags & NBR_MATCH_EXEC_UUID);
667 	m_key = !!(nbr.nb_flags & NBR_MATCH_KEY);
668 
669 	if (m_pid || m_exec_uuid) {
670 		/*
671 		 * Validate process ID.  A valid PID is needed when we're
672 		 * asked to match by PID, or if asked to match by executable
673 		 * UUID with a NULL nb_exec_uuid supplied.  The latter is
674 		 * to support the case when a userland Nexus provider isn't
675 		 * able to acquire its client's executable UUID, but is
676 		 * able to identify it via PID.
677 		 */
678 		if ((m_pid || uuid_is_null(nbr.nb_exec_uuid)) &&
679 		    (p = proc_find(nbr.nb_pid)) == PROC_NULL) {
680 			err = ESRCH;
681 			goto done_unlocked;
682 		}
683 		/* exclude kernel from the match criteria */
684 		if (p == kernproc) {
685 			err = EACCES;
686 			goto done_unlocked;
687 		} else if (p != PROC_NULL) {
688 			proc_getexecutableuuid(p, p_uuid, sizeof(p_uuid));
689 			p_uniqueid = proc_uniqueid(p);
690 			p_pid = proc_pid(p);
691 		} else {
692 			uuid_copy(p_uuid, nbr.nb_exec_uuid);
693 		}
694 	}
695 
696 	if (m_key) {
697 		if (nbr.nb_key_len == 0 || nbr.nb_key_len > NEXUS_MAX_KEY_LEN ||
698 		    nbr.nb_key == USER_ADDR_NULL) {
699 			err = EINVAL;
700 			goto done_unlocked;
701 		}
702 
703 		key = sk_alloc_data(nbr.nb_key_len, Z_WAITOK, skmem_tag_nx_key);
704 		if (__improbable(key == NULL)) {
705 			err = ENOMEM;
706 			goto done_unlocked;
707 		}
708 
709 		if (sopt->sopt_p != kernproc) {
710 			err = copyin(nbr.nb_key, key, nbr.nb_key_len);
711 			if (err != 0) {
712 				goto done_unlocked;
713 			}
714 		} else {
715 			bcopy((void *)nbr.nb_key, key, nbr.nb_key_len);
716 		}
717 	}
718 
719 	SK_LOCK();
720 	nx = nx_find(nbr.nb_nx_uuid, TRUE);
721 	if (nx == NULL || (disable_nxctl_check == 0 &&
722 	    nx->nx_prov->nxprov_ctl != nxctl &&
723 	    nxctl != &_kernnxctl)) {    /* make exception for kernnxctl */
724 		err = ENOENT;
725 		goto done;
726 	}
727 
728 	/* bind isn't applicable on anonymous nexus provider */
729 	if (NX_ANONYMOUS_PROV(nx)) {
730 		err = ENXIO;
731 		goto done;
732 	}
733 
734 	/* port must be within the domain's range */
735 	if (nbr.nb_port != NEXUS_PORT_ANY &&
736 	    nbr.nb_port >= NXDOM_MAX(NX_DOM(nx), ports)) {
737 		err = EDOM;
738 		goto done;
739 	} else if (nbr.nb_port == NEXUS_PORT_ANY) {
740 		/* for now, this is allowed only for kernel clients */
741 		if (sopt->sopt_p != kernproc) {
742 			err = EPERM;
743 			goto done;
744 		}
745 	}
746 
747 	nxb = nxb_alloc(Z_WAITOK);
748 
749 	if (m_pid) {
750 		nxb->nxb_flags |= NXBF_MATCH_UNIQUEID;
751 		nxb->nxb_uniqueid = p_uniqueid;
752 		nxb->nxb_pid = p_pid;
753 	}
754 	if (m_exec_uuid) {
755 		nxb->nxb_flags |= NXBF_MATCH_EXEC_UUID;
756 		ASSERT(!uuid_is_null(p_uuid));
757 		uuid_copy(nxb->nxb_exec_uuid, p_uuid);
758 	}
759 	if (m_key) {
760 		nxb->nxb_flags |= NXBF_MATCH_KEY;
761 		ASSERT(key != NULL);
762 		nxb->nxb_key = key;
763 		key = NULL;     /* let nxb_free() free it */
764 		ASSERT(nbr.nb_key_len != 0 &&
765 		    nbr.nb_key_len <= NEXUS_MAX_KEY_LEN);
766 		nxb->nxb_key_len = nbr.nb_key_len;
767 	}
768 
769 	/*
770 	 * Bind the creds to the nexus port.  If client doesn't have a port,
771 	 * find one, claim it, and associate the creds to it.  Upon success,
772 	 * the nexus may move the nxbind contents (including the key) to
773 	 * its own nxbind instance; in that case, nxb_free() below will not
774 	 * be freeing the key within.
775 	 */
776 	err = NX_DOM(nx)->nxdom_bind_port(nx, &nbr.nb_port, nxb, NULL);
777 	if (err != 0) {
778 		goto done;
779 	}
780 
781 	ASSERT(nbr.nb_port != NEXUS_PORT_ANY);
782 	(void) sooptcopyout(sopt, &nbr, sizeof(nbr));
783 
784 	SK_D("nexus 0x%llx nxb 0x%llx port %u flags 0x%b pid %d "
785 	    "(uniqueid %llu) exec_uuid %s key 0x%llx key_len %u",
786 	    SK_KVA(nx), SK_KVA(nxb), nbr.nb_port, nxb->nxb_flags,
787 	    NXBF_BITS, nxb->nxb_pid, nxb->nxb_uniqueid,
788 	    sk_uuid_unparse(nxb->nxb_exec_uuid, exec_uuidstr),
789 	    (nxb->nxb_key != NULL) ? SK_KVA(nxb->nxb_key) : 0,
790 	    nxb->nxb_key_len);
791 
792 done:
793 	if (nx != NULL) {
794 		(void) nx_release_locked(nx);
795 		nx = NULL;
796 	}
797 	SK_UNLOCK();
798 
799 done_unlocked:
800 	ASSERT(nx == NULL);
801 
802 	if (nxb != NULL) {
803 		nxb_free(nxb);
804 		nxb = NULL;
805 	}
806 	if (key != NULL) {
807 		sk_free_data(key, nbr.nb_key_len);
808 		key = NULL;
809 	}
810 	if (p != PROC_NULL) {
811 		proc_rele(p);
812 	}
813 
814 	return err;
815 }
816 
817 /* Hoisted out of line to reduce kernel stack footprint */
818 SK_NO_INLINE_ATTRIBUTE
819 static int
nxctl_nexus_unbind(struct nxctl * nxctl,struct sockopt * sopt)820 nxctl_nexus_unbind(struct nxctl *nxctl, struct sockopt *sopt)
821 {
822 	struct nx_unbind_req nur;
823 	struct kern_nexus *nx = NULL;
824 	int err = 0;
825 
826 	NXCTL_LOCK_ASSERT_HELD(nxctl);
827 
828 	if (sopt->sopt_val == USER_ADDR_NULL) {
829 		return EINVAL;
830 	}
831 
832 	bzero(&nur, sizeof(nur));
833 	err = sooptcopyin(sopt, &nur, sizeof(nur), sizeof(nur));
834 	if (err != 0) {
835 		return err;
836 	}
837 
838 	if (uuid_is_null(nur.nu_nx_uuid)) {
839 		return EINVAL;
840 	}
841 
842 	SK_LOCK();
843 	nx = nx_find(nur.nu_nx_uuid, TRUE);
844 	if (nx == NULL || (nx->nx_prov->nxprov_ctl != nxctl &&
845 	    nxctl != &_kernnxctl)) {    /* make exception for kernnxctl */
846 		err = ENOENT;
847 		goto done;
848 	}
849 
850 	/* unbind isn't applicable on anonymous nexus provider */
851 	if (NX_ANONYMOUS_PROV(nx)) {
852 		err = ENXIO;
853 		goto done;
854 	}
855 
856 	if (nur.nu_port == NEXUS_PORT_ANY) {
857 		err = EINVAL;
858 		goto done;
859 	}
860 
861 	err = NX_DOM(nx)->nxdom_unbind_port(nx, nur.nu_port);
862 
863 done:
864 	if (nx != NULL) {
865 		(void) nx_release_locked(nx);
866 		nx = NULL;
867 	}
868 	SK_UNLOCK();
869 
870 	return err;
871 }
872 
873 /* Hoisted out of line to reduce kernel stack footprint */
874 SK_NO_INLINE_ATTRIBUTE
875 static int
nxctl_nexus_config(struct nxctl * nxctl,struct sockopt * sopt)876 nxctl_nexus_config(struct nxctl *nxctl, struct sockopt *sopt)
877 {
878 	struct kern_nexus *nx = NULL;
879 	struct nx_cfg_req ncr;
880 	int err = 0;
881 
882 	NXCTL_LOCK_ASSERT_HELD(nxctl);
883 
884 	if (sopt->sopt_val == USER_ADDR_NULL) {
885 		return EINVAL;
886 	}
887 
888 	bzero(&ncr, sizeof(ncr));
889 	err = sooptcopyin(sopt, &ncr, sizeof(ncr), sizeof(ncr));
890 	if (err != 0) {
891 		return err;
892 	}
893 
894 	if (uuid_is_null(ncr.nc_nx_uuid)) {
895 		return EINVAL;
896 	}
897 
898 	SK_LOCK();
899 	nx = nx_find(ncr.nc_nx_uuid, TRUE);
900 	if (nx == NULL || (disable_nxctl_check == 0 &&
901 	    nx->nx_prov->nxprov_ctl != nxctl &&
902 	    nxctl != &_kernnxctl &&    /* allow kernel/shared user nxctl */
903 	    nxctl != &_usernxctl)) {
904 		err = ENOENT;
905 		goto done;
906 	}
907 
908 	if (NX_DOM_PROV(nx)->nxdom_prov_config != NULL) {
909 		err = NX_DOM_PROV(nx)->nxdom_prov_config(NX_DOM_PROV(nx),
910 		    nx, &ncr, sopt->sopt_dir, sopt->sopt_p, nxctl->nxctl_cred);
911 	} else {
912 		err = EPERM;
913 	}
914 
915 	if (err == 0) {
916 		(void) sooptcopyout(sopt, &ncr, sizeof(ncr));
917 	}
918 done:
919 	if (nx != NULL) {
920 		(void) nx_release_locked(nx);
921 		nx = NULL;
922 	}
923 	SK_UNLOCK();
924 
925 	return err;
926 }
927 
928 struct nxbind *
nxb_alloc(zalloc_flags_t how)929 nxb_alloc(zalloc_flags_t how)
930 {
931 	struct nxbind *nxb = zalloc_flags(nxbind_zone, how | Z_ZERO);
932 
933 	if (nxb) {
934 		SK_DF(SK_VERB_MEM, "nxb 0x%llx ALLOC", SK_KVA(nxb));
935 	}
936 	return nxb;
937 }
938 
939 void
nxb_free(struct nxbind * nxb)940 nxb_free(struct nxbind *nxb)
941 {
942 	SK_DF(SK_VERB_MEM, "nxb 0x%llx key 0x%llx FREE", SK_KVA(nxb),
943 	    (nxb->nxb_key != NULL) ? SK_KVA(nxb->nxb_key) : 0);
944 
945 	if (nxb->nxb_key != NULL) {
946 		sk_free_data(nxb->nxb_key, nxb->nxb_key_len);
947 		nxb->nxb_key = NULL;
948 	}
949 	zfree(nxbind_zone, nxb);
950 }
951 
952 /*
953  * nxb0 is assumed to possess the truth, compare nxb1 against it.
954  */
955 boolean_t
nxb_is_equal(struct nxbind * nxb0,struct nxbind * nxb1)956 nxb_is_equal(struct nxbind *nxb0, struct nxbind *nxb1)
957 {
958 	ASSERT(nxb0 != NULL && nxb1 != NULL);
959 	ASSERT(nxb0 != nxb1);
960 
961 	/* we always compare using uniqueid and not pid */
962 	if ((nxb0->nxb_flags & NXBF_MATCH_UNIQUEID) &&
963 	    nxb1->nxb_uniqueid != nxb0->nxb_uniqueid) {
964 		return FALSE;
965 	}
966 
967 	if ((nxb0->nxb_flags & NXBF_MATCH_EXEC_UUID) &&
968 	    uuid_compare(nxb1->nxb_exec_uuid, nxb0->nxb_exec_uuid) != 0) {
969 		return FALSE;
970 	}
971 
972 	ASSERT(!(nxb0->nxb_flags & NXBF_MATCH_KEY) ||
973 	    (nxb0->nxb_key_len != 0 && nxb0->nxb_key != NULL));
974 
975 	if ((nxb0->nxb_flags & NXBF_MATCH_KEY) &&
976 	    (nxb0->nxb_key_len != nxb1->nxb_key_len ||
977 	    nxb1->nxb_key == NULL || timingsafe_bcmp(nxb1->nxb_key, nxb0->nxb_key,
978 	    nxb1->nxb_key_len) != 0)) {
979 		return FALSE;
980 	}
981 
982 	return TRUE;
983 }
984 
985 void
nxb_move(struct nxbind * snxb,struct nxbind * dnxb)986 nxb_move(struct nxbind *snxb, struct nxbind *dnxb)
987 {
988 	ASSERT(!(snxb->nxb_flags & NXBF_MATCH_KEY) ||
989 	    (snxb->nxb_key_len != 0 && snxb->nxb_key != NULL));
990 
991 	/* in case the destination has a key attached, free it first */
992 	if (dnxb->nxb_key != NULL) {
993 		sk_free_data(dnxb->nxb_key, dnxb->nxb_key_len);
994 		dnxb->nxb_key = NULL;
995 	}
996 
997 	/* move everything from src to dst, and then wipe out src */
998 	bcopy(snxb, dnxb, sizeof(*dnxb));
999 	bzero(snxb, sizeof(*snxb));
1000 }
1001 
1002 /* Upper bound on # of cl_num_ch_uuids that we'd return to user space */
1003 #define MAX_NUM_CH_UUIDS        4096
1004 
1005 /* Hoisted out of line to reduce kernel stack footprint */
1006 SK_NO_INLINE_ATTRIBUTE
1007 static int
nxctl_get_channel_list(struct nxctl * nxctl,struct sockopt * sopt)1008 nxctl_get_channel_list(struct nxctl *nxctl, struct sockopt *sopt)
1009 {
1010 	user_addr_t tmp_ptr = USER_ADDR_NULL;
1011 	uint32_t nuuids = 0, ncuuids = 0;
1012 	uuid_t *puuid, *uuids = NULL;
1013 	size_t uuids_sz;
1014 	struct ch_list_req clr;
1015 	struct kern_channel *ch = NULL;
1016 	struct kern_nexus *nx = NULL;
1017 	struct kern_nexus find;
1018 	int err = 0, observeall;
1019 
1020 	NXCTL_LOCK_ASSERT_HELD(nxctl);
1021 
1022 	ASSERT(sopt->sopt_p != NULL);
1023 	if (sopt->sopt_val == USER_ADDR_NULL) {
1024 		return EINVAL;
1025 	}
1026 
1027 	err = sooptcopyin(sopt, &clr, sizeof(clr), sizeof(clr));
1028 	if (err != 0) {
1029 		return err;
1030 	}
1031 
1032 	if (uuid_is_null(clr.cl_nx_uuid)) {
1033 		return EINVAL;
1034 	} else if ((size_t)clr.cl_num_ch_uuids > MAX_NUM_CH_UUIDS) {
1035 		clr.cl_num_ch_uuids = MAX_NUM_CH_UUIDS;
1036 	}
1037 
1038 	/*
1039 	 * If the caller specified a buffer, copy out the Channel UUIDs to
1040 	 * caller gracefully.  We only copy out the number of UUIDs which
1041 	 * caller has asked for, but we always tell caller how big the
1042 	 * buffer really needs to be.
1043 	 */
1044 	tmp_ptr = clr.cl_ch_uuids;
1045 	if (tmp_ptr != USER_ADDR_NULL && clr.cl_num_ch_uuids > 0) {
1046 		uuids_sz = (size_t)clr.cl_num_ch_uuids * sizeof(uuid_t);
1047 		uuids = sk_alloc_data(uuids_sz, Z_WAITOK, skmem_tag_sysctl_buf);
1048 		if (uuids == NULL) {
1049 			return ENOBUFS;
1050 		}
1051 	}
1052 
1053 	observeall = (skywalk_priv_check_cred(sopt->sopt_p, nxctl->nxctl_cred,
1054 	    PRIV_SKYWALK_OBSERVE_ALL) == 0);
1055 
1056 	SK_LOCK();
1057 	uuid_copy(find.nx_uuid, clr.cl_nx_uuid);
1058 	nx = RB_FIND(kern_nexus_tree, &nx_head, &find);
1059 	if (nx != NULL && NX_PROV(nx)->nxprov_ctl != nxctl && !observeall) {
1060 		/*
1061 		 * Return only entries that are visible to the caller,
1062 		 * unless it has PRIV_SKYWALK_OBSERVE_ALL.
1063 		 */
1064 		nx = NULL;
1065 	}
1066 	if (nx != NULL) {
1067 		/*
1068 		 * Count number of Channels.  If buffer space exists
1069 		 * and remains, copy out the Channel UUIDs.
1070 		 */
1071 		nuuids = clr.cl_num_ch_uuids;
1072 		puuid = uuids;
1073 
1074 		STAILQ_FOREACH(ch, &nx->nx_ch_head, ch_link) {
1075 			++ncuuids;
1076 			if (uuids != NULL && nuuids > 0) {
1077 				uuid_copy(*puuid, ch->ch_info->cinfo_ch_id);
1078 				--nuuids;
1079 				++puuid;
1080 			}
1081 		}
1082 	} else {
1083 		err = ENOENT;
1084 	}
1085 	SK_UNLOCK();
1086 
1087 	if (uuids != NULL) {
1088 		if (err == 0 && nx != NULL && tmp_ptr != USER_ADDR_NULL) {
1089 			uintptr_t cnt_uuid;
1090 
1091 			/* Note: Pointer arithmetic */
1092 			cnt_uuid = (uintptr_t)(puuid - uuids);
1093 			ASSERT(cnt_uuid > 0);
1094 
1095 			if (sopt->sopt_p != kernproc) {
1096 				err = copyout(uuids, tmp_ptr,
1097 				    cnt_uuid * sizeof(uuid_t));
1098 			} else {
1099 				bcopy(uuids, CAST_DOWN(caddr_t, tmp_ptr),
1100 				    cnt_uuid * sizeof(uuid_t));
1101 			}
1102 		}
1103 		sk_free_data(uuids, uuids_sz);
1104 		uuids = NULL;
1105 	}
1106 
1107 	if (err == 0) {
1108 		clr.cl_num_ch_uuids = ncuuids;
1109 		err = sooptcopyout(sopt, &clr, sizeof(clr));
1110 	}
1111 
1112 	return err;
1113 }
1114 
1115 static void
nxctl_init(struct nxctl * nxctl,struct proc * p,struct fileproc * fp)1116 nxctl_init(struct nxctl *nxctl, struct proc *p, struct fileproc *fp)
1117 {
1118 	uuid_t p_uuid;
1119 
1120 	bzero(nxctl, sizeof(*nxctl));
1121 
1122 	proc_getexecutableuuid(p, p_uuid, sizeof(p_uuid));
1123 
1124 	lck_mtx_init(&nxctl->nxctl_lock, &nexus_lock_group, &nexus_lock_attr);
1125 	uuid_copy(nxctl->nxctl_proc_uuid, p_uuid);
1126 	nxctl->nxctl_proc_uniqueid = proc_uniqueid(p);
1127 	nxctl->nxctl_cred = kauth_cred_proc_ref(p);
1128 	nxctl->nxctl_fp = fp;
1129 	if (nxctl == &_kernnxctl) {
1130 		ASSERT(p == kernproc);
1131 		nxctl->nxctl_flags |= NEXUSCTLF_KERNEL;
1132 	}
1133 	if (nxctl == &_usernxctl) {
1134 		ASSERT(p == kernproc);
1135 		nxctl->nxctl_cred = NULL;
1136 	}
1137 	if (fp == NULL) {
1138 		nxctl->nxctl_flags |= NEXUSCTLF_NOFDREF;
1139 	}
1140 }
1141 
1142 static struct nxctl *
nxctl_alloc(struct proc * p,struct fileproc * fp,zalloc_flags_t how)1143 nxctl_alloc(struct proc *p, struct fileproc *fp, zalloc_flags_t how)
1144 {
1145 	struct nxctl *nxctl = zalloc_flags(nxctl_zone, how);
1146 
1147 	if (nxctl != NULL) {
1148 		nxctl_init(nxctl, p, fp);
1149 	}
1150 	return nxctl;
1151 }
1152 
1153 static void
nxctl_free(struct nxctl * nxctl)1154 nxctl_free(struct nxctl *nxctl)
1155 {
1156 	ASSERT(nxctl->nxctl_refcnt == 0);
1157 	ASSERT(!(nxctl->nxctl_flags & NEXUSCTLF_ATTACHED));
1158 	kauth_cred_unref(&nxctl->nxctl_cred);
1159 	lck_mtx_destroy(&nxctl->nxctl_lock, &nexus_lock_group);
1160 	SK_D("nxctl 0x%llx FREE", SK_KVA(nxctl));
1161 	if (!(nxctl->nxctl_flags & NEXUSCTLF_KERNEL)) {
1162 		zfree(nxctl_zone, nxctl);
1163 	}
1164 }
1165 
1166 static void
nxctl_retain_locked(struct nxctl * nxctl)1167 nxctl_retain_locked(struct nxctl *nxctl)
1168 {
1169 	SK_LOCK_ASSERT_HELD();
1170 
1171 	nxctl->nxctl_refcnt++;
1172 	ASSERT(nxctl->nxctl_refcnt != 0);
1173 }
1174 
1175 void
nxctl_retain(struct nxctl * nxctl)1176 nxctl_retain(struct nxctl *nxctl)
1177 {
1178 	SK_LOCK();
1179 	nxctl_retain_locked(nxctl);
1180 	SK_UNLOCK();
1181 }
1182 
1183 static int
nxctl_release_locked(struct nxctl * nxctl)1184 nxctl_release_locked(struct nxctl *nxctl)
1185 {
1186 	int oldref = nxctl->nxctl_refcnt;
1187 
1188 	SK_LOCK_ASSERT_HELD();
1189 
1190 	ASSERT(nxctl->nxctl_refcnt != 0);
1191 	if (--nxctl->nxctl_refcnt == 0) {
1192 		nxctl_free(nxctl);
1193 	}
1194 
1195 	return oldref == 1;
1196 }
1197 
1198 int
nxctl_release(struct nxctl * nxctl)1199 nxctl_release(struct nxctl *nxctl)
1200 {
1201 	int lastref;
1202 
1203 	SK_LOCK();
1204 	lastref = nxctl_release_locked(nxctl);
1205 	SK_UNLOCK();
1206 
1207 	return lastref;
1208 }
1209 
1210 void
nxctl_dtor(void * arg)1211 nxctl_dtor(void *arg)
1212 {
1213 	struct nxctl *nxctl = arg;
1214 
1215 	nxctl_close(nxctl);
1216 	SK_LOCK();
1217 	(void) nxctl_release_locked(nxctl);
1218 	SK_UNLOCK();
1219 }
1220 
1221 int
nxprov_advise_connect(struct kern_nexus * nx,struct kern_channel * ch,struct proc * p)1222 nxprov_advise_connect(struct kern_nexus *nx, struct kern_channel *ch,
1223     struct proc *p)
1224 {
1225 	struct kern_nexus_provider *nxprov = NX_PROV(nx);
1226 	int err = 0;
1227 
1228 	ASSERT(!(ch->ch_flags & (CHANF_EXT_PRECONNECT | CHANF_EXT_CONNECTED)));
1229 	ASSERT(ch->ch_ctx == NULL);
1230 
1231 	SK_LOCK_ASSERT_HELD();
1232 	LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
1233 
1234 	/* monitor channels aren't externally visible/usable, so ignore */
1235 	if ((ch->ch_info->cinfo_ch_mode & CHMODE_MONITOR) ||
1236 	    (ch->ch_flags & CHANF_EXT_SKIP) ||
1237 	    (nxprov->nxprov_ext.nxpi_pre_connect == NULL ||
1238 	    nxprov->nxprov_ext.nxpi_connected == NULL)) {
1239 		return 0;
1240 	}
1241 
1242 	ch_retain_locked(ch);
1243 	lck_mtx_unlock(&ch->ch_lock);
1244 	SK_UNLOCK();
1245 	lck_mtx_lock(&ch->ch_lock);
1246 
1247 	err = nxprov->nxprov_ext.nxpi_pre_connect(nxprov, p, nx,
1248 	    ch->ch_info->cinfo_nx_port, ch, &ch->ch_ctx);
1249 	if (err != 0) {
1250 		SK_D("ch 0x%llx flags %b nx 0x%llx pre_connect "
1251 		    "error %d", SK_KVA(ch), ch->ch_flags,
1252 		    CHANF_BITS, SK_KVA(nx), err);
1253 		ch->ch_ctx = NULL;
1254 		goto done;
1255 	}
1256 	/*
1257 	 * Upon ring/slot init failure, this is cleared
1258 	 * by nxprov_advise_disconnect() below.
1259 	 */
1260 	os_atomic_or(&ch->ch_flags, CHANF_EXT_PRECONNECT, relaxed);
1261 	if (NXPROV_LLINK(nxprov)) {
1262 		err = nx_netif_llink_ext_init_default_queues(nx);
1263 	} else {
1264 		err = nx_init_rings(nx, ch);
1265 	}
1266 	if (err != 0) {
1267 		goto done;
1268 	}
1269 	ASSERT(err == 0);
1270 	ASSERT((ch->ch_flags & (CHANF_EXT_PRECONNECT |
1271 	    CHANF_EXT_CONNECTED)) == CHANF_EXT_PRECONNECT);
1272 
1273 	err = nxprov->nxprov_ext.nxpi_connected(nxprov, nx, ch);
1274 	if (err != 0) {
1275 		SK_D("ch 0x%llx flags %b nx 0x%llx connected error %d",
1276 		    SK_KVA(ch), ch->ch_flags, CHANF_BITS, SK_KVA(nx), err);
1277 		goto done;
1278 	}
1279 	os_atomic_or(&ch->ch_flags, CHANF_EXT_CONNECTED, relaxed);
1280 	SK_D("ch 0x%llx flags %b nx 0x%llx connected",
1281 	    SK_KVA(ch), ch->ch_flags, CHANF_BITS, SK_KVA(nx));
1282 
1283 
1284 done:
1285 	lck_mtx_unlock(&ch->ch_lock);
1286 	SK_LOCK();
1287 	lck_mtx_lock(&ch->ch_lock);
1288 	if ((err != 0) &&
1289 	    (ch->ch_flags & (CHANF_EXT_CONNECTED | CHANF_EXT_PRECONNECT))) {
1290 		nxprov_advise_disconnect(nx, ch);
1291 	}
1292 	/* caller is expected to hold one, in addition to ourselves */
1293 	VERIFY(ch->ch_refcnt >= 2);
1294 	ch_release_locked(ch);
1295 
1296 	return err;
1297 }
1298 
1299 void
nxprov_advise_disconnect(struct kern_nexus * nx,struct kern_channel * ch)1300 nxprov_advise_disconnect(struct kern_nexus *nx, struct kern_channel *ch)
1301 {
1302 	struct kern_nexus_provider *nxprov = NX_PROV(nx);
1303 
1304 	SK_LOCK_ASSERT_HELD();
1305 	LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
1306 
1307 	/* check as we might be called in the error handling path */
1308 	if (ch->ch_flags & (CHANF_EXT_CONNECTED | CHANF_EXT_PRECONNECT)) {
1309 		ch_retain_locked(ch);
1310 		lck_mtx_unlock(&ch->ch_lock);
1311 		SK_UNLOCK();
1312 		lck_mtx_lock(&ch->ch_lock);
1313 
1314 		ASSERT(!(ch->ch_flags & CHANF_EXT_SKIP));
1315 		if (ch->ch_flags & CHANF_EXT_CONNECTED) {
1316 			nxprov->nxprov_ext.nxpi_pre_disconnect(nxprov, nx, ch);
1317 			os_atomic_andnot(&ch->ch_flags, CHANF_EXT_CONNECTED, relaxed);
1318 		}
1319 
1320 		/*
1321 		 * Inform the external domain provider that the rings
1322 		 * and slots for this channel are no longer valid.
1323 		 */
1324 		if (NXPROV_LLINK(nxprov)) {
1325 			nx_netif_llink_ext_fini_default_queues(nx);
1326 		} else {
1327 			nx_fini_rings(nx, ch);
1328 		}
1329 
1330 		ASSERT(ch->ch_flags & CHANF_EXT_PRECONNECT);
1331 		nxprov->nxprov_ext.nxpi_disconnected(nxprov, nx, ch);
1332 		os_atomic_andnot(&ch->ch_flags, CHANF_EXT_PRECONNECT, relaxed);
1333 
1334 		SK_D("ch 0x%llx flags %b nx 0x%llx disconnected",
1335 		    SK_KVA(ch), ch->ch_flags, CHANF_BITS, SK_KVA(nx));
1336 
1337 		/* We're done with this channel */
1338 		ch->ch_ctx = NULL;
1339 
1340 		lck_mtx_unlock(&ch->ch_lock);
1341 		SK_LOCK();
1342 		lck_mtx_lock(&ch->ch_lock);
1343 		/* caller is expected to hold one, in addition to ourselves */
1344 		VERIFY(ch->ch_refcnt >= 2);
1345 		ch_release_locked(ch);
1346 	}
1347 	ASSERT(!(ch->ch_flags & (CHANF_EXT_CONNECTED | CHANF_EXT_PRECONNECT)));
1348 	ASSERT(ch->ch_ctx == NULL);
1349 }
1350 
1351 static struct kern_nexus_provider *
nxprov_create_common(struct nxctl * nxctl,struct kern_nexus_domain_provider * nxdom_prov,struct nxprov_reg * reg,const struct kern_nexus_provider_init * init,int * err)1352 nxprov_create_common(struct nxctl *nxctl,
1353     struct kern_nexus_domain_provider *nxdom_prov, struct nxprov_reg *reg,
1354     const struct kern_nexus_provider_init *init, int *err)
1355 {
1356 	struct skmem_region_params srp[SKMEM_REGIONS];
1357 	struct kern_nexus_provider *nxprov = NULL;
1358 	struct nxprov_params nxp;
1359 	uint32_t override = 0;
1360 	uint32_t pp_region_config_flags;
1361 	int i;
1362 
1363 	_CASSERT(sizeof(*init) == sizeof(nxprov->nxprov_ext));
1364 	_CASSERT(sizeof(*init) >=
1365 	    sizeof(struct kern_nexus_netif_provider_init));
1366 
1367 	SK_LOCK_ASSERT_HELD();
1368 	ASSERT(nxctl != NULL && reg != NULL && nxdom_prov != NULL);
1369 
1370 	pp_region_config_flags = PP_REGION_CONFIG_MD_MAGAZINE_ENABLE |
1371 	    PP_REGION_CONFIG_BUF_IODIR_BIDIR;
1372 	/*
1373 	 * Special handling for external nexus providers; similar
1374 	 * logic to what's done in kern_pbufpool_create().
1375 	 */
1376 	if (init != NULL) {
1377 		if (init->nxpi_flags & NXPIF_MONOLITHIC) {
1378 			pp_region_config_flags |=
1379 			    PP_REGION_CONFIG_BUF_MONOLITHIC;
1380 		}
1381 
1382 		if (init->nxpi_flags & NXPIF_INHIBIT_CACHE) {
1383 			pp_region_config_flags |=
1384 			    PP_REGION_CONFIG_BUF_NOCACHE;
1385 		}
1386 	}
1387 
1388 	/*
1389 	 * For network devices, set the packet metadata memory as persistent
1390 	 * so that it is wired at segment creation.  This allows us to access
1391 	 * it with preemption disabled, as well as for rdar://problem/46511741.
1392 	 */
1393 	if (nxdom_prov->nxdom_prov_dom->nxdom_type == NEXUS_TYPE_NET_IF) {
1394 		pp_region_config_flags |= PP_REGION_CONFIG_MD_PERSISTENT;
1395 	}
1396 
1397 	/* process and validate provider parameters */
1398 	if ((*err = nxdom_prov_validate_params(nxdom_prov, reg,
1399 	    &nxp, srp, override, pp_region_config_flags)) != 0) {
1400 		goto done;
1401 	}
1402 
1403 	nxprov = nxprov_alloc(nxdom_prov, Z_WAITOK);
1404 	ASSERT(nxprov->nxprov_dom_prov == nxdom_prov);
1405 
1406 	STAILQ_INIT(&nxprov->nxprov_nx_head);
1407 	STAILQ_INSERT_TAIL(&nxprov_head, nxprov, nxprov_link);
1408 	nxprov->nxprov_flags |= NXPROVF_ATTACHED;
1409 	nxprov->nxprov_ctl = nxctl;
1410 	uuid_generate_random(nxprov->nxprov_uuid);
1411 	bcopy(&nxp, nxprov->nxprov_params, sizeof(struct nxprov_params));
1412 
1413 	if (init != NULL) {
1414 		if (init->nxpi_version == KERN_NEXUS_PROVIDER_VERSION_NETIF) {
1415 			ASSERT(NXPROV_LLINK(nxprov));
1416 			bcopy(init, &nxprov->nxprov_netif_ext,
1417 			    sizeof(nxprov->nxprov_netif_ext));
1418 		} else {
1419 			ASSERT(!NXPROV_LLINK(nxprov));
1420 			ASSERT(init->nxpi_version ==
1421 			    KERN_NEXUS_PROVIDER_CURRENT_VERSION);
1422 			bcopy(init, &nxprov->nxprov_ext, sizeof(*init));
1423 		}
1424 		nxprov->nxprov_flags |= NXPROVF_EXTERNAL;
1425 	}
1426 
1427 	/* store validated region parameters to the provider */
1428 	for (i = 0; i < SKMEM_REGIONS; i++) {
1429 		nxprov->nxprov_region_params[i] = srp[i];
1430 	}
1431 
1432 	if (nxprov->nxprov_flags & NXPROVF_EXTERNAL) {
1433 		uint32_t nxpi_flags = nxprov->nxprov_ext.nxpi_flags;
1434 
1435 		if (nxpi_flags & NXPIF_VIRTUAL_DEVICE) {
1436 			nxprov->nxprov_flags |= NXPROVF_VIRTUAL_DEVICE;
1437 		}
1438 	} else if (nxdom_prov->nxdom_prov_dom->nxdom_type !=
1439 	    NEXUS_TYPE_NET_IF) {
1440 		/*
1441 		 * Treat non-netif built-in nexus providers as those
1442 		 * meant for inter-process communications, i.e. there
1443 		 * is no actual networking hardware involved.
1444 		 */
1445 		nxprov->nxprov_flags |= NXPROVF_VIRTUAL_DEVICE;
1446 	}
1447 
1448 	nxprov_retain_locked(nxprov);   /* one for being in the list */
1449 	nxprov_retain_locked(nxprov);   /* one for the caller */
1450 
1451 #if SK_LOG
1452 	uuid_string_t uuidstr;
1453 	SK_D("nxprov 0x%llx UUID %s", SK_KVA(nxprov),
1454 	    sk_uuid_unparse(nxprov->nxprov_uuid, uuidstr));
1455 #endif /* SK_LOG */
1456 
1457 done:
1458 	return nxprov;
1459 }
1460 
1461 struct kern_nexus_provider *
nxprov_create(struct proc * p,struct nxctl * nxctl,struct nxprov_reg * reg,int * err)1462 nxprov_create(struct proc *p, struct nxctl *nxctl, struct nxprov_reg *reg,
1463     int *err)
1464 {
1465 	struct nxprov_params *nxp = &reg->nxpreg_params;
1466 	struct kern_nexus_domain_provider *nxdom_prov = NULL;
1467 	struct kern_nexus_provider *nxprov = NULL;
1468 
1469 	NXCTL_LOCK_ASSERT_HELD(nxctl);
1470 
1471 	ASSERT(nxctl->nxctl_cred != proc_ucred_unsafe(kernproc));
1472 	*err = 0;
1473 
1474 	switch (nxp->nxp_type) {
1475 	case NEXUS_TYPE_USER_PIPE:      /* only for userland */
1476 		*err = skywalk_priv_check_cred(p, nxctl->nxctl_cred,
1477 		    PRIV_SKYWALK_REGISTER_USER_PIPE);
1478 		break;
1479 
1480 	case NEXUS_TYPE_FLOW_SWITCH:    /* allowed for userland */
1481 		*err = skywalk_priv_check_cred(p, nxctl->nxctl_cred,
1482 		    PRIV_SKYWALK_REGISTER_FLOW_SWITCH);
1483 		break;
1484 
1485 	case NEXUS_TYPE_NET_IF:         /* allowed for userland */
1486 		*err = skywalk_priv_check_cred(p, nxctl->nxctl_cred,
1487 		    PRIV_SKYWALK_REGISTER_NET_IF);
1488 		break;
1489 
1490 	case NEXUS_TYPE_KERNEL_PIPE:    /* only for kernel */
1491 	case NEXUS_TYPE_MONITOR:        /* invalid */
1492 	default:
1493 		*err = EINVAL;
1494 		goto done;
1495 	}
1496 
1497 	if (*err != 0) {
1498 		goto done;
1499 	}
1500 
1501 	ASSERT(nxp->nxp_type < NEXUS_TYPE_MAX);
1502 	if ((nxdom_prov = nxdom_prov_default[nxp->nxp_type]) == NULL) {
1503 		*err = ENXIO;
1504 		goto done;
1505 	}
1506 
1507 #if CONFIG_NEXUS_NETIF
1508 	/* make sure netif_compat is the default here */
1509 	ASSERT(nxp->nxp_type != NEXUS_TYPE_NET_IF ||
1510 	    strcmp(nxdom_prov->nxdom_prov_name,
1511 	    NEXUS_PROVIDER_NET_IF_COMPAT) == 0);
1512 #endif /* CONFIG_NEXUS_NETIF */
1513 
1514 	SK_LOCK();
1515 	/* callee holds a reference for our caller upon success */
1516 	nxprov = nxprov_create_common(nxctl, nxdom_prov, reg, NULL, err);
1517 	SK_UNLOCK();
1518 done:
1519 	return nxprov;
1520 }
1521 
1522 struct kern_nexus_provider *
nxprov_create_kern(struct nxctl * nxctl,struct kern_nexus_domain_provider * nxdom_prov,struct nxprov_reg * reg,const struct kern_nexus_provider_init * init,int * err)1523 nxprov_create_kern(struct nxctl *nxctl,
1524     struct kern_nexus_domain_provider *nxdom_prov, struct nxprov_reg *reg,
1525     const struct kern_nexus_provider_init *init, int *err)
1526 {
1527 	struct nxprov_params *nxp = &reg->nxpreg_params;
1528 	struct kern_nexus_provider *nxprov = NULL;
1529 
1530 	NXCTL_LOCK_ASSERT_HELD(nxctl);
1531 	SK_LOCK_ASSERT_HELD();
1532 
1533 	ASSERT(nxctl->nxctl_cred == proc_ucred_unsafe(kernproc));
1534 	ASSERT(nxp->nxp_type == nxdom_prov->nxdom_prov_dom->nxdom_type);
1535 	ASSERT(init == NULL ||
1536 	    init->nxpi_version == KERN_NEXUS_PROVIDER_CURRENT_VERSION ||
1537 	    init->nxpi_version == KERN_NEXUS_PROVIDER_VERSION_NETIF);
1538 
1539 	*err = 0;
1540 
1541 	switch (nxp->nxp_type) {
1542 	case NEXUS_TYPE_NET_IF:
1543 		break;
1544 	case NEXUS_TYPE_KERNEL_PIPE:
1545 		if (init == NULL) {
1546 			*err = EINVAL;
1547 			goto done;
1548 		}
1549 		break;
1550 	case NEXUS_TYPE_FLOW_SWITCH:
1551 		if (init != NULL) {
1552 			*err = EINVAL;
1553 			goto done;
1554 		}
1555 		break;
1556 
1557 	case NEXUS_TYPE_USER_PIPE:      /* only for userland */
1558 	case NEXUS_TYPE_MONITOR:        /* invalid */
1559 	default:
1560 		*err = EINVAL;
1561 		goto done;
1562 	}
1563 
1564 	/* callee holds a reference for our caller upon success */
1565 	nxprov = nxprov_create_common(nxctl, nxdom_prov, reg, init, err);
1566 
1567 done:
1568 	return nxprov;
1569 }
1570 
1571 int
nxprov_destroy(struct nxctl * nxctl,const uuid_t nxprov_uuid)1572 nxprov_destroy(struct nxctl *nxctl, const uuid_t nxprov_uuid)
1573 {
1574 	struct kern_nexus_provider *nxprov = NULL;
1575 	int err = 0;
1576 
1577 	NXCTL_LOCK_ASSERT_HELD(nxctl);
1578 
1579 	SK_LOCK();
1580 
1581 	STAILQ_FOREACH(nxprov, &nxprov_head, nxprov_link) {
1582 		if (nxctl == nxprov->nxprov_ctl &&
1583 		    uuid_compare(nxprov_uuid, nxprov->nxprov_uuid) == 0) {
1584 			nxprov_retain_locked(nxprov);
1585 			break;
1586 		}
1587 	}
1588 
1589 	if (nxprov == NULL) {
1590 		err = ENOENT;
1591 	} else {
1592 		err = nxprov_close(nxprov, TRUE);
1593 	}
1594 
1595 	if (nxprov != NULL) {
1596 		(void) nxprov_release_locked(nxprov);
1597 	}
1598 
1599 	SK_UNLOCK();
1600 
1601 	return err;
1602 }
1603 
1604 int
nxprov_close(struct kern_nexus_provider * nxprov,boolean_t locked)1605 nxprov_close(struct kern_nexus_provider *nxprov, boolean_t locked)
1606 {
1607 	int err = 0;
1608 
1609 	if (!locked) {
1610 		SK_LOCK();
1611 	}
1612 
1613 	SK_LOCK_ASSERT_HELD();
1614 
1615 #if SK_LOG
1616 	uuid_string_t uuidstr;
1617 	SK_D("nxprov 0x%llx UUID %s flags 0x%b", SK_KVA(nxprov),
1618 	    sk_uuid_unparse(nxprov->nxprov_uuid, uuidstr),
1619 	    nxprov->nxprov_flags, NXPROVF_BITS);
1620 #endif /* SK_LOG */
1621 
1622 	if (nxprov->nxprov_flags & NXPROVF_CLOSED) {
1623 		err = EALREADY;
1624 	} else {
1625 		struct kern_nexus *nx, *tnx;
1626 
1627 		nxprov->nxprov_ctl = NULL;
1628 
1629 		STAILQ_FOREACH_SAFE(nx, &nxprov->nxprov_nx_head,
1630 		    nx_prov_link, tnx) {
1631 			nx_retain_locked(nx);
1632 			(void) nx_close(nx, TRUE);
1633 			(void) nx_release_locked(nx);
1634 		}
1635 
1636 		if (STAILQ_EMPTY(&nxprov->nxprov_nx_head)) {
1637 			/* no nexus created on this, so detach now */
1638 			nxprov_detach(nxprov, TRUE);
1639 		} else {
1640 			/* detach when last nexus is destroyed */
1641 			ASSERT(nxprov->nxprov_refcnt > 1);
1642 			nxprov->nxprov_flags |= NXPROVF_CLOSED;
1643 		}
1644 	}
1645 
1646 	if (!locked) {
1647 		SK_UNLOCK();
1648 	}
1649 
1650 	return err;
1651 }
1652 
1653 static void
nxprov_detach(struct kern_nexus_provider * nxprov,boolean_t locked)1654 nxprov_detach(struct kern_nexus_provider *nxprov, boolean_t locked)
1655 {
1656 	if (!locked) {
1657 		SK_LOCK();
1658 	}
1659 
1660 	SK_LOCK_ASSERT_HELD();
1661 
1662 #if SK_LOG
1663 	uuid_string_t uuidstr;
1664 	SK_D("nxprov 0x%llx UUID %s flags 0x%b", SK_KVA(nxprov),
1665 	    sk_uuid_unparse(nxprov->nxprov_uuid, uuidstr),
1666 	    nxprov->nxprov_flags, NXPROVF_BITS);
1667 #endif /* SK_LOG */
1668 
1669 	ASSERT(nxprov->nxprov_flags & NXPROVF_ATTACHED);
1670 	STAILQ_REMOVE(&nxprov_head, nxprov, kern_nexus_provider, nxprov_link);
1671 	nxprov->nxprov_flags &= ~NXPROVF_ATTACHED;
1672 
1673 	/* caller must hold an extra ref */
1674 	ASSERT(nxprov->nxprov_refcnt > 1);
1675 	(void) nxprov_release_locked(nxprov);
1676 
1677 	if (!locked) {
1678 		SK_UNLOCK();
1679 	}
1680 }
1681 
1682 static struct kern_nexus_provider *
nxprov_alloc(struct kern_nexus_domain_provider * nxdom_prov,zalloc_flags_t how)1683 nxprov_alloc(struct kern_nexus_domain_provider *nxdom_prov, zalloc_flags_t how)
1684 {
1685 	struct kern_nexus_provider *nxprov;
1686 	struct nxprov_params *nxp;
1687 
1688 	ASSERT(nxdom_prov != NULL);
1689 
1690 	nxp = nxprov_params_alloc(how);
1691 	if (nxp == NULL) {
1692 		SK_ERR("Failed to allocate nxprov_params");
1693 		return NULL;
1694 	}
1695 
1696 	nxprov = zalloc_flags(nxprov_zone, how | Z_ZERO);
1697 	if (nxprov == NULL) {
1698 		SK_ERR("Failed to allocate nxprov");
1699 		nxprov_params_free(nxp);
1700 		return NULL;
1701 	}
1702 
1703 	nxprov->nxprov_dom_prov = nxdom_prov;
1704 	nxprov->nxprov_params = nxp;
1705 	/* hold a reference for nxprov */
1706 	nxdom_prov_retain_locked(nxdom_prov);
1707 
1708 	return nxprov;
1709 }
1710 
1711 static void
nxprov_free(struct kern_nexus_provider * nxprov)1712 nxprov_free(struct kern_nexus_provider *nxprov)
1713 {
1714 	struct kern_nexus_domain_provider *nxdom_prov =
1715 	    nxprov->nxprov_dom_prov;
1716 
1717 	SK_LOCK_ASSERT_HELD();
1718 
1719 	ASSERT(nxdom_prov != NULL);
1720 	(void) nxdom_prov_release_locked(nxdom_prov);
1721 	nxprov->nxprov_dom_prov = NULL;
1722 	ASSERT(nxprov->nxprov_params != NULL);
1723 	nxprov_params_free(nxprov->nxprov_params);
1724 	nxprov->nxprov_params = NULL;
1725 	ASSERT(!(nxprov->nxprov_flags & NXPROVF_ATTACHED));
1726 	SK_DF(SK_VERB_MEM, "nxprov 0x%llx FREE", SK_KVA(nxprov));
1727 	zfree(nxprov_zone, nxprov);
1728 }
1729 
1730 static void
nxprov_retain_locked(struct kern_nexus_provider * nxprov)1731 nxprov_retain_locked(struct kern_nexus_provider *nxprov)
1732 {
1733 	SK_LOCK_ASSERT_HELD();
1734 
1735 	nxprov->nxprov_refcnt++;
1736 	ASSERT(nxprov->nxprov_refcnt != 0);
1737 }
1738 
1739 void
nxprov_retain(struct kern_nexus_provider * nxprov)1740 nxprov_retain(struct kern_nexus_provider *nxprov)
1741 {
1742 	SK_LOCK();
1743 	nxprov_retain_locked(nxprov);
1744 	SK_UNLOCK();
1745 }
1746 
1747 static int
nxprov_release_locked(struct kern_nexus_provider * nxprov)1748 nxprov_release_locked(struct kern_nexus_provider *nxprov)
1749 {
1750 	int oldref = nxprov->nxprov_refcnt;
1751 
1752 	SK_LOCK_ASSERT_HELD();
1753 
1754 	ASSERT(nxprov->nxprov_refcnt != 0);
1755 	if (--nxprov->nxprov_refcnt == 0) {
1756 		nxprov_free(nxprov);
1757 	}
1758 
1759 	return oldref == 1;
1760 }
1761 
1762 int
nxprov_release(struct kern_nexus_provider * nxprov)1763 nxprov_release(struct kern_nexus_provider *nxprov)
1764 {
1765 	int lastref;
1766 
1767 	SK_LOCK();
1768 	lastref = nxprov_release_locked(nxprov);
1769 	SK_UNLOCK();
1770 
1771 	return lastref;
1772 }
1773 
1774 struct nxprov_params *
nxprov_params_alloc(zalloc_flags_t how)1775 nxprov_params_alloc(zalloc_flags_t how)
1776 {
1777 	return zalloc_flags(nxprov_params_zone, how | Z_ZERO);
1778 }
1779 
1780 void
nxprov_params_free(struct nxprov_params * nxp)1781 nxprov_params_free(struct nxprov_params *nxp)
1782 {
1783 	SK_DF(SK_VERB_MEM, "nxp 0x%llx FREE", SK_KVA(nxp));
1784 	zfree(nxprov_params_zone, nxp);
1785 }
1786 
1787 static int
nx_check_pp(struct kern_nexus_provider * nxprov,struct kern_pbufpool * pp)1788 nx_check_pp(struct kern_nexus_provider *nxprov, struct kern_pbufpool *pp)
1789 {
1790 	struct kern_nexus_domain_provider *nxdom_prov = nxprov->nxprov_dom_prov;
1791 
1792 	if ((pp->pp_flags & (PPF_EXTERNAL | PPF_CLOSED)) != PPF_EXTERNAL) {
1793 		SK_ERR("Rejecting \"%s\" built-in pp", pp->pp_name);
1794 		return ENOTSUP;
1795 	}
1796 
1797 	/*
1798 	 * Require that the nexus domain metadata type and the
1799 	 * metadata type of the caller-provided pbufpool match.
1800 	 */
1801 	if (nxdom_prov->nxdom_prov_dom->nxdom_md_type !=
1802 	    pp->pp_md_type ||
1803 	    nxdom_prov->nxdom_prov_dom->nxdom_md_subtype !=
1804 	    pp->pp_md_subtype) {
1805 		SK_ERR("Mismatch in metadata type/subtype "
1806 		    "(%u/%u != %u/%u)", pp->pp_md_type,
1807 		    nxdom_prov->nxdom_prov_dom->nxdom_md_type,
1808 		    pp->pp_md_subtype,
1809 		    nxdom_prov->nxdom_prov_dom->nxdom_md_subtype);
1810 		return EINVAL;
1811 	}
1812 
1813 	/*
1814 	 * Require that the nexus provider memory configuration
1815 	 * has the same impedance as the caller-provided one.
1816 	 * Both need to be lacking or present; if one of them
1817 	 * is set and the other isn't, then we bail.
1818 	 */
1819 	if (!!(PP_BUF_REGION_DEF(pp)->skr_mode & SKR_MODE_MONOLITHIC) ^
1820 	    !!(nxprov->nxprov_ext.nxpi_flags & NXPIF_MONOLITHIC)) {
1821 		SK_ERR("Memory config mismatch: monolithic mode");
1822 		return EINVAL;
1823 	}
1824 
1825 	return 0;
1826 }
1827 
1828 struct kern_nexus *
nx_create(struct nxctl * nxctl,const uuid_t nxprov_uuid,const nexus_type_t dom_type,const void * nx_ctx,nexus_ctx_release_fn_t nx_ctx_release,struct kern_pbufpool * tx_pp,struct kern_pbufpool * rx_pp,int * err)1829 nx_create(struct nxctl *nxctl, const uuid_t nxprov_uuid,
1830     const nexus_type_t dom_type, const void *nx_ctx,
1831     nexus_ctx_release_fn_t nx_ctx_release, struct kern_pbufpool *tx_pp,
1832     struct kern_pbufpool *rx_pp, int *err)
1833 {
1834 	struct kern_nexus_domain_provider *nxdom_prov;
1835 	struct kern_nexus_provider *nxprov = NULL;
1836 	struct kern_nexus *nx = NULL;
1837 #if SK_LOG
1838 	uuid_string_t uuidstr;
1839 #endif /* SK_LOG */
1840 
1841 	NXCTL_LOCK_ASSERT_HELD(nxctl);
1842 
1843 	ASSERT(dom_type < NEXUS_TYPE_MAX);
1844 	ASSERT(!uuid_is_null(nxprov_uuid));
1845 	*err = 0;
1846 
1847 	SK_LOCK();
1848 
1849 	STAILQ_FOREACH(nxprov, &nxprov_head, nxprov_link) {
1850 		if (nxctl == nxprov->nxprov_ctl &&
1851 		    uuid_compare(nxprov_uuid, nxprov->nxprov_uuid) == 0) {
1852 			break;
1853 		}
1854 	}
1855 
1856 	if (nxprov == NULL || (nxprov->nxprov_flags & NXPROVF_CLOSED)) {
1857 		SK_ERR("Provider not found or has been closed");
1858 		*err = ENOENT;
1859 		goto done;
1860 	}
1861 
1862 	nxdom_prov = nxprov->nxprov_dom_prov;
1863 	if (dom_type != NEXUS_TYPE_UNDEFINED &&
1864 	    (nxdom_prov->nxdom_prov_dom->nxdom_type != dom_type)) {
1865 		SK_ERR("Mismatch in domain type (0x%u != 0x%u)",
1866 		    dom_type, nxdom_prov->nxdom_prov_dom->nxdom_type);
1867 		nxdom_prov = NULL;
1868 		nxprov = NULL;
1869 		*err = ENODEV;
1870 		goto done;
1871 	}
1872 
1873 	if ((dom_type == NEXUS_TYPE_NET_IF) && NXPROV_LLINK(nxprov) &&
1874 	    (!tx_pp || !rx_pp)) {
1875 #if SK_LOG
1876 		SK_ERR("TX/RX packet pool is required for netif logical link "
1877 		    "nexus provider UUID: %s",
1878 		    sk_uuid_unparse(nxprov_uuid, uuidstr));
1879 #endif /* SK_LOG */
1880 		nxdom_prov = NULL;
1881 		nxprov = NULL;
1882 		*err = EINVAL;
1883 		goto done;
1884 	}
1885 
1886 	if ((tx_pp != NULL && (*err = nx_check_pp(nxprov, tx_pp)) != 0) ||
1887 	    (rx_pp != NULL && (*err = nx_check_pp(nxprov, rx_pp)) != 0)) {
1888 		goto done;
1889 	}
1890 
1891 	nx = nx_alloc(Z_WAITOK);
1892 
1893 	STAILQ_INIT(&nx->nx_ch_head);
1894 	STAILQ_INIT(&nx->nx_ch_nonxref_head);
1895 	lck_rw_init(&nx->nx_ch_if_adv_lock, &nexus_lock_group,
1896 	    &nexus_lock_attr);
1897 	STAILQ_INIT(&nx->nx_ch_if_adv_head);
1898 	uuid_generate_random(nx->nx_uuid);
1899 	nx->nx_prov = nxprov;
1900 	nx->nx_ctx = (void *)(uintptr_t)nx_ctx;
1901 	nx->nx_ctx_release = nx_ctx_release;
1902 	nx->nx_id = nxdom_prov->nxdom_prov_gencnt++;
1903 
1904 	if (tx_pp != NULL) {
1905 		nx->nx_tx_pp = tx_pp;
1906 		pp_retain(tx_pp);       /* released by nx_free */
1907 	}
1908 
1909 	if (rx_pp != NULL) {
1910 		nx->nx_rx_pp = rx_pp;
1911 		pp_retain(rx_pp);       /* released by nx_free */
1912 	}
1913 
1914 	/* this nexus is alive; tell the nexus constructor to set it up */
1915 	if (nxprov->nxprov_dom_prov->nxdom_prov_nx_ctor != NULL) {
1916 		*err = nxprov->nxprov_dom_prov->nxdom_prov_nx_ctor(nx);
1917 		if (*err != 0) {
1918 			nx->nx_prov = NULL;
1919 			goto done;
1920 		}
1921 	}
1922 
1923 	nxprov_retain_locked(nxprov);   /* hold a ref on the nexus reg */
1924 
1925 	STAILQ_INSERT_TAIL(&nxprov->nxprov_nx_head, nx, nx_prov_link);
1926 	nxprov->nxprov_nx_count++;
1927 	RB_INSERT(kern_nexus_tree, &nx_head, nx);
1928 	os_atomic_or(&nx->nx_flags, NXF_ATTACHED, relaxed);
1929 
1930 	nx_retain_locked(nx);   /* one for the provider list */
1931 	nx_retain_locked(nx);   /* one for the global list */
1932 	nx_retain_locked(nx);   /* one for the caller */
1933 
1934 #if SK_LOG
1935 	SK_D("nexus 0x%llx (%s:%s) UUID %s", SK_KVA(nx),
1936 	    nxdom_prov->nxdom_prov_dom->nxdom_name,
1937 	    nxdom_prov->nxdom_prov_name, sk_uuid_unparse(nx->nx_uuid, uuidstr));
1938 #endif /* SK_LOG */
1939 done:
1940 	SK_UNLOCK();
1941 
1942 	if (*err != 0) {
1943 		if (nx != NULL) {
1944 			nx_free(nx);
1945 			nx = NULL;
1946 		}
1947 	}
1948 	return nx;
1949 }
1950 
1951 int
nx_destroy(struct nxctl * nxctl,const uuid_t nx_uuid)1952 nx_destroy(struct nxctl *nxctl, const uuid_t nx_uuid)
1953 {
1954 	struct kern_nexus *nx = NULL;
1955 	struct kern_nexus find;
1956 	int err = 0;
1957 
1958 	NXCTL_LOCK_ASSERT_HELD(nxctl);
1959 
1960 	SK_LOCK();
1961 
1962 	uuid_copy(find.nx_uuid, nx_uuid);
1963 	nx = RB_FIND(kern_nexus_tree, &nx_head, &find);
1964 	if (nx != NULL && nxctl != NX_PROV(nx)->nxprov_ctl) {
1965 		nx = NULL;
1966 	}
1967 
1968 	if (nx != NULL) {
1969 		nx_retain_locked(nx);
1970 	}
1971 
1972 	if (nx == NULL) {
1973 		err = ENOENT;
1974 	} else {
1975 		err = nx_close(nx, TRUE);
1976 		(void) nx_release_locked(nx);
1977 	}
1978 
1979 	SK_UNLOCK();
1980 
1981 	return err;
1982 }
1983 
1984 static inline int
nx_cmp(const struct kern_nexus * a,const struct kern_nexus * b)1985 nx_cmp(const struct kern_nexus *a, const struct kern_nexus *b)
1986 {
1987 	return uuid_compare(a->nx_uuid, b->nx_uuid);
1988 }
1989 
1990 struct kern_nexus *
nx_find(const uuid_t nx_uuid,boolean_t locked)1991 nx_find(const uuid_t nx_uuid, boolean_t locked)
1992 {
1993 	struct kern_nexus *nx = NULL;
1994 	struct kern_nexus find;
1995 
1996 	if (!locked) {
1997 		SK_LOCK();
1998 	}
1999 
2000 	SK_LOCK_ASSERT_HELD();
2001 
2002 	uuid_copy(find.nx_uuid, nx_uuid);
2003 	nx = RB_FIND(kern_nexus_tree, &nx_head, &find);
2004 	if (nx != NULL && (nx->nx_flags & NXF_CLOSED)) {
2005 		nx = NULL;
2006 	}
2007 
2008 	/* return reference to caller */
2009 	if (nx != NULL) {
2010 		nx_retain_locked(nx);
2011 	}
2012 
2013 	if (!locked) {
2014 		SK_UNLOCK();
2015 	}
2016 
2017 	return nx;
2018 }
2019 
2020 int
nx_close(struct kern_nexus * nx,boolean_t locked)2021 nx_close(struct kern_nexus *nx, boolean_t locked)
2022 {
2023 	int err = 0;
2024 
2025 	if (!locked) {
2026 		SK_LOCK();
2027 	}
2028 
2029 	SK_LOCK_ASSERT_HELD();
2030 
2031 
2032 	if (nx->nx_flags & NXF_CLOSED) {
2033 		err = EALREADY;
2034 	} else {
2035 #if SK_LOG
2036 		uuid_string_t uuidstr;
2037 		SK_D("nexus 0x%llx (%s:%s) UUID %s flags 0x%b", SK_KVA(nx),
2038 		    NX_DOM(nx)->nxdom_name, NX_DOM_PROV(nx)->nxdom_prov_name,
2039 		    sk_uuid_unparse(nx->nx_uuid, uuidstr), nx->nx_flags,
2040 		    NXF_BITS);
2041 #endif /* SK_LOG */
2042 
2043 		if (STAILQ_EMPTY(&nx->nx_ch_head)) {
2044 			/* no regular channels open to it, so detach now */
2045 			nx_detach(nx);
2046 		} else {
2047 			/* detach when the last channel closes */
2048 			ASSERT(nx->nx_refcnt > 3);
2049 			os_atomic_or(&nx->nx_flags, NXF_CLOSED, relaxed);
2050 		}
2051 	}
2052 
2053 	if (!locked) {
2054 		SK_UNLOCK();
2055 	}
2056 
2057 	return err;
2058 }
2059 
2060 void
nx_stop(struct kern_nexus * nx)2061 nx_stop(struct kern_nexus *nx)
2062 {
2063 	struct kern_nexus_provider *nxprov = nx->nx_prov;
2064 
2065 	SK_LOCK_ASSERT_HELD();
2066 
2067 	/* send a stop message */
2068 	if (nxprov->nxprov_dom_prov->nxdom_prov_nx_stop != NULL) {
2069 		nxprov->nxprov_dom_prov->nxdom_prov_nx_stop(nx);
2070 	}
2071 }
2072 
2073 void
nx_detach(struct kern_nexus * nx)2074 nx_detach(struct kern_nexus *nx)
2075 {
2076 	struct kern_nexus_provider *nxprov = nx->nx_prov;
2077 
2078 	SK_LOCK_ASSERT_HELD();
2079 
2080 #if SK_LOG
2081 	uuid_string_t uuidstr;
2082 	SK_D("nexus 0x%llx UUID %s flags 0x%b", SK_KVA(nx),
2083 	    sk_uuid_unparse(nx->nx_uuid, uuidstr), nx->nx_flags, NXF_BITS);
2084 #endif /* SK_LOG */
2085 
2086 	/* Caller must hold extra refs, on top of the two in reg/global lists */
2087 	ASSERT(nx->nx_refcnt >= 3);
2088 	ASSERT(nx->nx_flags & NXF_ATTACHED);
2089 
2090 	/* this nexus is done; let the nexus destructor do final cleanups */
2091 	if (nxprov->nxprov_dom_prov->nxdom_prov_nx_dtor != NULL) {
2092 		nxprov->nxprov_dom_prov->nxdom_prov_nx_dtor(nx);
2093 	}
2094 
2095 	ASSERT(STAILQ_EMPTY(&nx->nx_ch_head));
2096 	ASSERT(STAILQ_EMPTY(&nx->nx_ch_nonxref_head));
2097 
2098 	STAILQ_REMOVE(&nxprov->nxprov_nx_head, nx, kern_nexus, nx_prov_link);
2099 	nxprov->nxprov_nx_count--;
2100 	RB_REMOVE(kern_nexus_tree, &nx_head, nx);
2101 	os_atomic_andnot(&nx->nx_flags, NXF_ATTACHED, relaxed);
2102 	nx->nx_prov = NULL;
2103 	if (nx->nx_ctx_release != NULL) {
2104 		nx->nx_ctx_release(nx->nx_ctx);
2105 	}
2106 	nx->nx_ctx = NULL;
2107 
2108 	(void) nx_release_locked(nx);   /* one for the reg list */
2109 	(void) nx_release_locked(nx);   /* one for the global list */
2110 
2111 	/*
2112 	 * If this was the last nexus and the provider has been closed,
2113 	 * detach the provider and and finish up the postponed job.
2114 	 */
2115 	if (STAILQ_EMPTY(&nxprov->nxprov_nx_head) &&
2116 	    (nxprov->nxprov_flags & NXPROVF_CLOSED)) {
2117 		nxprov_detach(nxprov, TRUE);
2118 	}
2119 	(void) nxprov_release_locked(nxprov);
2120 }
2121 
2122 int
nx_advisory_alloc(struct kern_nexus * nx,const char * name,struct skmem_region_params * srp_nexusadv,nexus_advisory_type_t type)2123 nx_advisory_alloc(struct kern_nexus *nx, const char *name,
2124     struct skmem_region_params *srp_nexusadv, nexus_advisory_type_t type)
2125 {
2126 	struct __kern_nexus_adv_metadata *adv_md;
2127 
2128 	_CASSERT(sizeof(struct __kern_nexus_adv_metadata) == sizeof(uint64_t));
2129 	_CASSERT((sizeof(struct sk_nexusadv) +
2130 	    sizeof(struct __kern_nexus_adv_metadata)) <= NX_NEXUSADV_MAX_SZ);
2131 	_CASSERT((sizeof(struct netif_nexus_advisory) +
2132 	    sizeof(struct __kern_nexus_adv_metadata)) <= NX_NEXUSADV_MAX_SZ);
2133 	ASSERT(nx->nx_adv.nxv_reg == NULL);
2134 	ASSERT(nx->nx_adv.nxv_adv == NULL);
2135 	ASSERT(type == NEXUS_ADVISORY_TYPE_FLOWSWITCH ||
2136 	    type == NEXUS_ADVISORY_TYPE_NETIF);
2137 
2138 	if ((nx->nx_adv.nxv_reg = skmem_region_create(name, srp_nexusadv,
2139 	    NULL, NULL, NULL)) == NULL) {
2140 		return ENOMEM;
2141 	}
2142 
2143 	nx->nx_adv.nxv_adv = skmem_region_alloc(nx->nx_adv.nxv_reg, NULL,
2144 	    NULL, NULL, (SKMEM_NOSLEEP | SKMEM_PANIC));
2145 	adv_md = nx->nx_adv.nxv_adv;
2146 	adv_md->knam_version = NX_ADVISORY_MD_CURRENT_VERSION;
2147 	adv_md->knam_type = type;
2148 	adv_md->__reserved = 0;
2149 	nx->nx_adv.nxv_adv_type = type;
2150 	nx->nx_adv.flowswitch_nxv_adv = (void *)(adv_md + 1);
2151 	if (type == NEXUS_ADVISORY_TYPE_FLOWSWITCH) {
2152 		nx->nx_adv.flowswitch_nxv_adv->nxadv_ver =
2153 		    NX_FLOWSWITCH_ADVISORY_CURRENT_VERSION;
2154 	} else {
2155 		nx->nx_adv.netif_nxv_adv->nna_version =
2156 		    NX_NETIF_ADVISORY_CURRENT_VERSION;
2157 	}
2158 	return 0;
2159 }
2160 
2161 void
nx_advisory_free(struct kern_nexus * nx)2162 nx_advisory_free(struct kern_nexus *nx)
2163 {
2164 	if (nx->nx_adv.nxv_reg != NULL) {
2165 		ASSERT(nx->nx_adv.nxv_adv != NULL);
2166 		skmem_region_free(nx->nx_adv.nxv_reg,
2167 		    nx->nx_adv.nxv_adv, NULL);
2168 		nx->nx_adv.nxv_adv = NULL;
2169 		nx->nx_adv.nxv_adv_type = NEXUS_ADVISORY_TYPE_INVALID;
2170 		nx->nx_adv.flowswitch_nxv_adv = NULL;
2171 		skmem_region_release(nx->nx_adv.nxv_reg);
2172 		nx->nx_adv.nxv_reg = NULL;
2173 	}
2174 
2175 	ASSERT(nx->nx_adv.nxv_reg == NULL);
2176 	ASSERT(nx->nx_adv.nxv_adv == NULL);
2177 	ASSERT(nx->nx_adv.nxv_adv_type == NEXUS_ADVISORY_TYPE_INVALID);
2178 	ASSERT(nx->nx_adv.flowswitch_nxv_adv == NULL);
2179 }
2180 
2181 static struct kern_nexus *
nx_alloc(zalloc_flags_t how)2182 nx_alloc(zalloc_flags_t how)
2183 {
2184 	SK_LOCK_ASSERT_HELD();
2185 
2186 	return zalloc_flags(nx_zone, how | Z_ZERO);
2187 }
2188 
2189 static void
nx_free(struct kern_nexus * nx)2190 nx_free(struct kern_nexus *nx)
2191 {
2192 	ASSERT(!(nx->nx_flags & NXF_ATTACHED) && nx->nx_prov == NULL);
2193 	ASSERT(STAILQ_EMPTY(&nx->nx_ch_head));
2194 	ASSERT(STAILQ_EMPTY(&nx->nx_ch_nonxref_head));
2195 
2196 	nx_port_free_all(nx);
2197 
2198 	if (nx->nx_tx_pp != NULL) {
2199 		pp_release(nx->nx_tx_pp);
2200 		nx->nx_tx_pp = NULL;
2201 	}
2202 	if (nx->nx_rx_pp != NULL) {
2203 		pp_release(nx->nx_rx_pp);
2204 		nx->nx_rx_pp = NULL;
2205 	}
2206 
2207 	ASSERT(STAILQ_EMPTY(&nx->nx_ch_if_adv_head));
2208 	lck_rw_destroy(&nx->nx_ch_if_adv_lock, &nexus_lock_group);
2209 
2210 	SK_DF(SK_VERB_MEM, "nexus 0x%llx FREE", SK_KVA(nx));
2211 	zfree(nx_zone, nx);
2212 }
2213 
2214 void
nx_retain_locked(struct kern_nexus * nx)2215 nx_retain_locked(struct kern_nexus *nx)
2216 {
2217 	SK_LOCK_ASSERT_HELD();
2218 
2219 	nx->nx_refcnt++;
2220 	VERIFY(nx->nx_refcnt > 0);
2221 }
2222 
2223 void
nx_retain(struct kern_nexus * nx)2224 nx_retain(struct kern_nexus *nx)
2225 {
2226 	SK_LOCK();
2227 	nx_retain_locked(nx);
2228 	SK_UNLOCK();
2229 }
2230 
2231 int
nx_release_locked(struct kern_nexus * nx)2232 nx_release_locked(struct kern_nexus *nx)
2233 {
2234 	int oldref = nx->nx_refcnt;
2235 
2236 	SK_LOCK_ASSERT_HELD();
2237 
2238 	VERIFY(nx->nx_refcnt > 0);
2239 	if (--nx->nx_refcnt == 0) {
2240 		nx_free(nx);
2241 	}
2242 
2243 	return oldref == 1;
2244 }
2245 
2246 int
nx_release(struct kern_nexus * nx)2247 nx_release(struct kern_nexus *nx)
2248 {
2249 	int lastref;
2250 
2251 	SK_LOCK_ASSERT_NOTHELD();
2252 
2253 	SK_LOCK();
2254 	lastref = nx_release_locked(nx);
2255 	SK_UNLOCK();
2256 
2257 	return lastref;
2258 }
2259 
2260 static int
nx_init_rings(struct kern_nexus * nx,struct kern_channel * ch)2261 nx_init_rings(struct kern_nexus *nx, struct kern_channel *ch)
2262 {
2263 	struct kern_nexus_provider *nxprov = NX_PROV(nx);
2264 	struct nexus_adapter *na = ch->ch_na;
2265 	boolean_t undo = FALSE;
2266 	int ksd_retains = 0;
2267 	enum txrx t;
2268 	int err = 0;
2269 
2270 	ASSERT((ch->ch_flags & (CHANF_EXT_PRECONNECT | CHANF_EXT_CONNECTED)) ==
2271 	    CHANF_EXT_PRECONNECT);
2272 
2273 	if (nxprov->nxprov_ext.nxpi_ring_init == NULL) {
2274 		return 0;
2275 	}
2276 
2277 	for_rx_tx(t) {
2278 		uint32_t i;
2279 
2280 		for (i = 0; i < na_get_nrings(na, t); i++) {
2281 			struct __kern_channel_ring *kring = &NAKR(na, t)[i];
2282 
2283 			/* skip host rings */
2284 			if (kring->ckr_flags & CKRF_HOST) {
2285 				continue;
2286 			}
2287 
2288 			if ((err = nxprov->nxprov_ext.nxpi_ring_init(
2289 				    nxprov, nx, ch, kring, (kring->ckr_tx == NR_TX),
2290 				    &kring->ckr_ctx)) != 0) {
2291 				SK_D("ch 0x%llx flags %b nx 0x%llx kr \"%s\" "
2292 				    "(0x%llx) krflags %b ring_init error %d",
2293 				    SK_KVA(ch), ch->ch_flags, CHANF_BITS,
2294 				    SK_KVA(nx), kring->ckr_name, SK_KVA(kring),
2295 				    kring->ckr_flags, CKRF_BITS, err);
2296 				kring->ckr_ctx = NULL;
2297 				undo = TRUE;
2298 				break;
2299 			}
2300 			kring->ckr_flags |= CKRF_EXT_RING_INITED;
2301 
2302 			if ((err = nx_init_slots(nx, kring)) != 0) {
2303 				undo = TRUE;
2304 				break;
2305 			}
2306 
2307 			if (kring->ckr_flags & CKRF_EXT_SLOTS_INITED) {
2308 				++ksd_retains;
2309 			}
2310 		}
2311 		if (undo) {
2312 			break;
2313 		}
2314 	}
2315 
2316 	/*
2317 	 * Note: retain KSD even in case of error, as we have set
2318 	 * CKRF_EXT_SLOTS_INITED flag for some of the rings
2319 	 * nx_fini_rings would take care of release based on it.
2320 	 */
2321 	if (ksd_retains != 0) {
2322 		/*
2323 		 * Mark the kernel slot descriptor region as busy; this
2324 		 * prevents it from being torn-down at channel defunct
2325 		 * time, as we need to invoke the slot_fini() callback
2326 		 * for each slot and we need the descriptors until then.
2327 		 */
2328 		skmem_arena_nexus_sd_set_noidle(skmem_arena_nexus(na->na_arena),
2329 		    ksd_retains);
2330 	}
2331 
2332 	if (err != 0) {
2333 		ASSERT(undo);
2334 		nx_fini_rings(nx, ch);
2335 	}
2336 
2337 	return err;
2338 }
2339 
2340 static void
nx_fini_rings(struct kern_nexus * nx,struct kern_channel * ch)2341 nx_fini_rings(struct kern_nexus *nx, struct kern_channel *ch)
2342 {
2343 	struct kern_nexus_provider *nxprov = NX_PROV(nx);
2344 	struct nexus_adapter *na = ch->ch_na;
2345 	int ksd_releases = 0;
2346 	enum txrx t;
2347 
2348 	for_rx_tx(t) {
2349 		uint32_t i;
2350 
2351 		for (i = 0; i < na_get_nrings(na, t); i++) {
2352 			struct __kern_channel_ring *kring = &NAKR(na, t)[i];
2353 
2354 			if (!(kring->ckr_flags & CKRF_EXT_RING_INITED)) {
2355 				continue;
2356 			}
2357 
2358 			ASSERT(!(kring->ckr_flags & CKRF_HOST));
2359 			ASSERT(nxprov->nxprov_ext.nxpi_ring_fini != NULL);
2360 			nxprov->nxprov_ext.nxpi_ring_fini(nxprov, nx, kring);
2361 			kring->ckr_flags &= ~CKRF_EXT_RING_INITED;
2362 
2363 			if (kring->ckr_flags & CKRF_EXT_SLOTS_INITED) {
2364 				++ksd_releases;
2365 			}
2366 
2367 			/*
2368 			 * Undo the work done in nx_init_slots() and inform
2369 			 * the external domain provider, if applicable, that
2370 			 * the slots for this ring are no longer valid.
2371 			 */
2372 			nx_fini_slots(nx, kring);
2373 			kring->ckr_ctx = NULL;
2374 		}
2375 	}
2376 
2377 	if (ksd_releases != 0) {
2378 		/*
2379 		 * Now that we've finished invoking the slot_fini()
2380 		 * callbacks, release the busy retain counts held
2381 		 * earlier in nx_init_rings().  This will allow the
2382 		 * kernel slot descriptor region to be torn down.
2383 		 */
2384 		skmem_arena_nexus_sd_set_noidle(
2385 			skmem_arena_nexus(na->na_arena), -ksd_releases);
2386 	}
2387 }
2388 
2389 static int
nx_init_slots(struct kern_nexus * nx,struct __kern_channel_ring * kring)2390 nx_init_slots(struct kern_nexus *nx, struct __kern_channel_ring *kring)
2391 {
2392 	struct kern_nexus_provider *nxprov = NX_PROV(nx);
2393 	struct __slot_desc *slot = kring->ckr_ksds;
2394 	int err = 0;
2395 	uint32_t i;
2396 
2397 	/*
2398 	 * If the slot init callback was not provided, or if the
2399 	 * kring was not created to hold any slot contexts, don't
2400 	 * go any further.
2401 	 */
2402 	if (nxprov->nxprov_ext.nxpi_slot_init == NULL ||
2403 	    kring->ckr_slot_ctxs == NULL) {
2404 		return 0;
2405 	}
2406 
2407 	ASSERT(kring->ckr_slot_ctxs_set == 0);
2408 	ASSERT(slot != NULL);
2409 
2410 	for (i = 0; i < kring->ckr_num_slots; i++) {
2411 		struct kern_slot_prop *slot_ctx_prop = NULL;
2412 		void *slot_ctx_arg = NULL;
2413 
2414 		ASSERT(&slot[i] <= kring->ckr_ksds_last);
2415 		if ((err = nxprov->nxprov_ext.nxpi_slot_init(nxprov, nx, kring,
2416 		    &slot[i], i, &slot_ctx_prop, &slot_ctx_arg)) != 0) {
2417 			SK_D("nx 0x%llx kr \"%s\" (0x%llx) krflags %b slot %u "
2418 			    "slot_init error %d", SK_KVA(nx), kring->ckr_name,
2419 			    SK_KVA(kring), kring->ckr_flags, CKRF_BITS, i, err);
2420 			break;
2421 		}
2422 		/* we don't want this to be used by client, so verify here */
2423 		ASSERT(slot_ctx_prop == NULL);
2424 		kring->ckr_slot_ctxs[i].slot_ctx_arg =
2425 		    (mach_vm_address_t)slot_ctx_arg;
2426 		kring->ckr_slot_ctxs_set++;
2427 	}
2428 
2429 	if (err != 0) {
2430 		nx_fini_slots(nx, kring);
2431 	} else {
2432 		kring->ckr_flags |= CKRF_EXT_SLOTS_INITED;
2433 	}
2434 
2435 	return err;
2436 }
2437 
2438 static void
nx_fini_slots(struct kern_nexus * nx,struct __kern_channel_ring * kring)2439 nx_fini_slots(struct kern_nexus *nx, struct __kern_channel_ring *kring)
2440 {
2441 	struct kern_nexus_provider *nxprov = NX_PROV(nx);
2442 	struct __slot_desc *slot = kring->ckr_ksds;
2443 	uint32_t i;
2444 
2445 	ASSERT(!(kring->ckr_flags & CKRF_EXT_SLOTS_INITED) ||
2446 	    nxprov->nxprov_ext.nxpi_slot_fini != NULL);
2447 	ASSERT(slot != NULL || !(kring->ckr_flags & CKRF_EXT_SLOTS_INITED));
2448 
2449 	for (i = 0; i < kring->ckr_slot_ctxs_set; i++) {
2450 		ASSERT(slot != NULL && &slot[i] <= kring->ckr_ksds_last);
2451 		if (nxprov->nxprov_ext.nxpi_slot_fini != NULL) {
2452 			nxprov->nxprov_ext.nxpi_slot_fini(nxprov, nx,
2453 			    kring, &slot[i], i);
2454 		}
2455 		if (kring->ckr_slot_ctxs != NULL) {
2456 			kring->ckr_slot_ctxs[i].slot_ctx_arg = 0;
2457 		}
2458 	}
2459 	kring->ckr_slot_ctxs_set = 0;
2460 
2461 	/* We're done with this kring */
2462 	kring->ckr_flags &= ~CKRF_EXT_SLOTS_INITED;
2463 }
2464 
2465 
2466 /* 64-bit mask with range */
2467 #define BMASK64(_beg, _end)     \
2468 	((NX_PORT_CHUNK_FREE >> (63 - (_end))) & ~((1ULL << (_beg)) - 1))
2469 
2470 int
nx_port_find(struct kern_nexus * nx,nexus_port_t first,nexus_port_t last,nexus_port_t * nx_port)2471 nx_port_find(struct kern_nexus *nx, nexus_port_t first,
2472     nexus_port_t last, nexus_port_t *nx_port)
2473 {
2474 	int err = 0;
2475 
2476 	ASSERT(first < last);
2477 	*nx_port = NEXUS_PORT_ANY;
2478 
2479 	if (nx->nx_num_ports == 0 || (first + 1) >= nx->nx_num_ports) {
2480 		/*
2481 		 * Left edge of the range is beyond the current map;
2482 		 * let nx_port_alloc() handle the growing later.
2483 		 */
2484 		*nx_port = first;
2485 	} else {
2486 		nexus_port_size_t fc = (first / NX_PORT_CHUNK);
2487 		nexus_port_size_t lc = (MIN(last, nx->nx_num_ports) / NX_PORT_CHUNK);
2488 		nexus_port_size_t lim = (nx->nx_num_ports / NX_PORT_CHUNK);
2489 		nexus_port_size_t i, j;
2490 		bitmap_t *bmap;
2491 
2492 		/*
2493 		 * The right edge of the range is either within or
2494 		 * beyond the current map; scan thru the current
2495 		 * map and find the first available port.
2496 		 */
2497 		for (i = fc; i <= lc; i++) {
2498 			bitmap_t mask;
2499 			nexus_port_size_t beg = 0, end = 63;
2500 
2501 			if (i == fc) {
2502 				beg = (first % NX_PORT_CHUNK);
2503 			}
2504 			if (i == (last / NX_PORT_CHUNK)) {
2505 				end = (last % NX_PORT_CHUNK);
2506 			}
2507 
2508 			if (i < lim) {
2509 				bmap = &nx->nx_ports_bmap[i];
2510 				mask = BMASK64(beg, end);
2511 
2512 				j = (nexus_port_size_t)ffsll((*bmap) & mask);
2513 				if (j == 0) {
2514 					continue;
2515 				}
2516 
2517 				--j;
2518 				*nx_port = (i * NX_PORT_CHUNK) + j;
2519 			}
2520 			break;
2521 		}
2522 
2523 		/*
2524 		 * If the requested range is within the current map and we
2525 		 * couldn't find a port, return an err.  Otherwise, return
2526 		 * the next port index to trigger growing later.
2527 		 */
2528 		if (*nx_port == NEXUS_PORT_ANY) {
2529 			if (lc == (last / NX_PORT_CHUNK)) {
2530 				err = EBUSY;
2531 				SK_ERR("port unavail in [%u, %u)", first, last);
2532 			} else {
2533 				*nx_port = nx->nx_num_ports;
2534 			}
2535 		}
2536 	}
2537 
2538 	SK_DF(SK_VERB_NXPORT, "nx 0x%llx nx_port %d (err %d)", SK_KVA(nx),
2539 	    (int)*nx_port, err);
2540 
2541 	return err;
2542 }
2543 
2544 static int
nx_port_grow(struct kern_nexus * nx,nexus_port_size_t grow)2545 nx_port_grow(struct kern_nexus *nx, nexus_port_size_t grow)
2546 {
2547 	ASSERT(NXDOM_MAX(NX_DOM(nx), ports) <= NEXUS_PORT_MAX);
2548 	nexus_port_t dom_port_max = (nexus_port_size_t)NXDOM_MAX(NX_DOM(nx), ports);
2549 	struct nx_port_info *ports;
2550 	size_t limit;
2551 	nexus_port_size_t i, num_ports, old_num_ports;
2552 	bitmap_t *bmap;
2553 
2554 	ASSERT(grow > 0 && (grow % NX_PORT_CHUNK) == 0);
2555 	ASSERT((nx->nx_num_ports % NX_PORT_CHUNK) == 0);
2556 	_CASSERT((sizeof(*bmap) * 8) == NX_PORT_CHUNK);
2557 	ASSERT(powerof2(dom_port_max));
2558 	ASSERT(dom_port_max % NX_PORT_CHUNK == 0);
2559 
2560 	old_num_ports = nx->nx_num_ports;
2561 	num_ports = nx->nx_num_ports + grow;
2562 	limit = P2ROUNDUP(dom_port_max, NX_PORT_CHUNK);
2563 	if (num_ports > limit) {
2564 		SK_ERR("can't grow, total %u grow %u (new %u > dom_max %u)",
2565 		    nx->nx_num_ports, grow, num_ports, limit);
2566 		return EDOM;
2567 	}
2568 
2569 	if ((bmap = sk_realloc_data(nx->nx_ports_bmap,
2570 	    (old_num_ports / NX_PORT_CHUNK) * sizeof(*bmap),
2571 	    (num_ports / NX_PORT_CHUNK) * sizeof(*bmap),
2572 	    Z_WAITOK, skmem_tag_nx_port)) == NULL) {
2573 		SK_ERR("bmap alloc failed, num_port %u", num_ports);
2574 		return ENOMEM;
2575 	}
2576 	nx->nx_ports_bmap = bmap;
2577 
2578 	if ((ports = sk_realloc_type_array(struct nx_port_info, old_num_ports,
2579 	    num_ports, nx->nx_ports, Z_WAITOK, skmem_tag_nx_port)) == NULL) {
2580 		/* can't free bmap here, otherwise nexus won't work */
2581 		SK_ERR("nx_ports alloc failed, num_port %u", num_ports);
2582 		return ENOMEM;
2583 	}
2584 
2585 	/* initialize the additional new ports */
2586 	bzero(&ports[nx->nx_num_ports], (grow * sizeof(*ports)));
2587 	nx->nx_ports = ports;
2588 
2589 	/* initialize new bitmaps (set all bits) */
2590 	for (i = (nx->nx_num_ports / NX_PORT_CHUNK);
2591 	    i < (num_ports / NX_PORT_CHUNK); i++) {
2592 		bmap[i] = NX_PORT_CHUNK_FREE;
2593 	}
2594 
2595 	nx->nx_num_ports = num_ports;
2596 
2597 	SK_DF(SK_VERB_NXPORT, "!!! nx 0x%llx ports %u/%u, %u ports added",
2598 	    SK_KVA(nx), nx->nx_active_ports, nx->nx_num_ports, grow);
2599 
2600 	return 0;
2601 }
2602 
2603 int
nx_port_alloc(struct kern_nexus * nx,nexus_port_t nx_port,struct nxbind * nxb,struct nexus_adapter ** na,struct proc * p)2604 nx_port_alloc(struct kern_nexus *nx, nexus_port_t nx_port, struct nxbind *nxb,
2605     struct nexus_adapter **na, struct proc *p)
2606 {
2607 	struct nx_port_info *npi = NULL;
2608 	struct nxbind *nxb0;
2609 	size_t g;
2610 	uint32_t i, j;
2611 	bitmap_t *bmap;
2612 	bool refonly = false;
2613 	int err = 0;
2614 
2615 	ASSERT(nx_port != NEXUS_PORT_ANY);
2616 	ASSERT((nx->nx_num_ports % NX_PORT_CHUNK) == 0);
2617 
2618 	/* port is zero-based, so adjust here */
2619 	if ((nx_port + 1) > nx->nx_num_ports) {
2620 		g = P2ROUNDUP((nx_port + 1) - nx->nx_num_ports, NX_PORT_CHUNK);
2621 		VERIFY(g <= NEXUS_PORT_MAX);
2622 		if ((err = nx_port_grow(nx, (nexus_port_size_t)g)) != 0) {
2623 			goto done;
2624 		}
2625 	}
2626 	ASSERT(err == 0);
2627 	ASSERT(nx_port < nx->nx_num_ports);
2628 	npi = &nx->nx_ports[nx_port];
2629 	nxb0 = npi->npi_nxb;
2630 	i = nx_port / NX_PORT_CHUNK;
2631 	j = nx_port % NX_PORT_CHUNK;
2632 	bmap = &nx->nx_ports_bmap[i];
2633 
2634 	if (bit_test(*bmap, j)) {
2635 		/* port is not (yet) bound or allocated */
2636 		ASSERT(npi->npi_nah == 0 && npi->npi_nxb == NULL);
2637 		if (p != kernproc && !NX_ANONYMOUS_PROV(nx)) {
2638 			/*
2639 			 * If the port allocation is requested by userland
2640 			 * and the nexus is non-anonymous, then fail the
2641 			 * request.
2642 			 */
2643 			err = EACCES;
2644 			SK_ERR("user proc alloc on named nexus needs binding");
2645 		} else if (na != NULL && *na != NULL) {
2646 			/*
2647 			 * Otherwise claim it (clear bit) if the caller
2648 			 * supplied an adapter for this port; else, it
2649 			 * is just an existential check and so there's
2650 			 * no action needed at this point (we'll skip
2651 			 * the init below since vpna is NULL).
2652 			 */
2653 			bit_clear(*bmap, j);
2654 		}
2655 	} else {
2656 		/* if port is bound, check if credentials match */
2657 		if (nxb0 != NULL && p != kernproc && !NX_ANONYMOUS_PROV(nx) &&
2658 		    (nxb == NULL || !nxb_is_equal(nxb0, nxb))) {
2659 			SK_ERR("nexus binding mismatch");
2660 			err = EACCES;
2661 		} else {
2662 			/*
2663 			 * If port is already occupied by an adapter,
2664 			 * see if the client is requesting a reference
2665 			 * to it; if so, return the adapter.  Otherwise,
2666 			 * if unoccupied and vpna is non-NULL, associate
2667 			 * it with this nexus port via the below init.
2668 			 */
2669 			if (NPI_NA(npi) != NULL) {
2670 				if (na != NULL && *na == NULL) {
2671 					*na = NPI_NA(npi);
2672 					na_retain_locked(*na);
2673 					/* skip the init below */
2674 					refonly = true;
2675 				} else {
2676 					/*
2677 					 * If the client supplied an adapter
2678 					 * (regardless of its value) for a
2679 					 * nexus port that's already occupied,
2680 					 * then we fail the request.
2681 					 */
2682 					SK_ERR("nexus adapted exits");
2683 					err = EEXIST;
2684 				}
2685 			}
2686 		}
2687 	}
2688 
2689 done:
2690 	/* initialize the nexus port and the adapter occupying it */
2691 	if (err == 0 && na != NULL && *na != NULL && !refonly) {
2692 		ASSERT(nx_port < nx->nx_num_ports);
2693 		ASSERT(npi->npi_nah == 0);
2694 		ASSERT(nx->nx_active_ports < nx->nx_num_ports);
2695 		ASSERT(!bit_test(nx->nx_ports_bmap[nx_port / NX_PORT_CHUNK],
2696 		    (nx_port % NX_PORT_CHUNK)));
2697 
2698 		nx->nx_active_ports++;
2699 		npi->npi_nah = NPI_NA_ENCODE(*na, NEXUS_PORT_STATE_WORKING);
2700 		(*na)->na_nx_port = nx_port;
2701 	}
2702 
2703 	SK_DF(SK_VERB_NXPORT, "nx 0x%llx nx_port %d, ports %u/%u (err %d)",
2704 	    SK_KVA(nx), (int)nx_port, nx->nx_active_ports, nx->nx_num_ports,
2705 	    err);
2706 
2707 	return err;
2708 }
2709 
2710 void
nx_port_defunct(struct kern_nexus * nx,nexus_port_t nx_port)2711 nx_port_defunct(struct kern_nexus *nx, nexus_port_t nx_port)
2712 {
2713 	struct nx_port_info *npi = &nx->nx_ports[nx_port];
2714 
2715 	npi->npi_nah = NPI_NA_ENCODE(npi->npi_nah,
2716 	    NEXUS_PORT_STATE_DEFUNCT);
2717 }
2718 
2719 void
nx_port_free(struct kern_nexus * nx,nexus_port_t nx_port)2720 nx_port_free(struct kern_nexus *nx, nexus_port_t nx_port)
2721 {
2722 	struct nx_port_info *npi = NULL;
2723 	bitmap_t *bmap;
2724 	uint32_t i, j;
2725 
2726 	ASSERT((nx->nx_num_ports % NX_PORT_CHUNK) == 0);
2727 	ASSERT(nx_port != NEXUS_PORT_ANY && nx_port < nx->nx_num_ports);
2728 	ASSERT(nx->nx_active_ports != 0);
2729 
2730 	i = nx_port / NX_PORT_CHUNK;
2731 	j = nx_port % NX_PORT_CHUNK;
2732 	bmap = &nx->nx_ports_bmap[i];
2733 	ASSERT(!bit_test(*bmap, j));
2734 
2735 	npi = &nx->nx_ports[nx_port];
2736 	npi->npi_nah = 0;
2737 	if (npi->npi_nxb == NULL) {
2738 		/* it's vacant, release it (set bit) */
2739 		bit_set(*bmap, j);
2740 	}
2741 
2742 	nx->nx_active_ports--;
2743 
2744 	//XXX [email protected] --- try to shrink bitmap & nx_ports ???
2745 
2746 	SK_DF(SK_VERB_NXPORT, "--- nx 0x%llx nx_port %d, ports %u/%u",
2747 	    SK_KVA(nx), (int)nx_port, nx->nx_active_ports, nx->nx_num_ports);
2748 }
2749 
2750 int
nx_port_bind_info(struct kern_nexus * nx,nexus_port_t nx_port,struct nxbind * nxb0,void * info)2751 nx_port_bind_info(struct kern_nexus *nx, nexus_port_t nx_port,
2752     struct nxbind *nxb0, void *info)
2753 {
2754 	struct nx_port_info *npi = NULL;
2755 	size_t g;
2756 	uint32_t i, j;
2757 	bitmap_t *bmap;
2758 	int err = 0;
2759 
2760 	ASSERT(nx_port != NEXUS_PORT_ANY);
2761 	ASSERT(nx_port < NXDOM_MAX(NX_DOM(nx), ports));
2762 	ASSERT((nx->nx_num_ports % NX_PORT_CHUNK) == 0);
2763 	ASSERT(nxb0 != NULL);
2764 
2765 	if ((nx_port) + 1 > nx->nx_num_ports) {
2766 		g = P2ROUNDUP((nx_port + 1) - nx->nx_num_ports, NX_PORT_CHUNK);
2767 		VERIFY(g <= NEXUS_PORT_MAX);
2768 		if ((err = nx_port_grow(nx, (nexus_port_size_t)g)) != 0) {
2769 			goto done;
2770 		}
2771 	}
2772 	ASSERT(err == 0);
2773 
2774 	npi = &nx->nx_ports[nx_port];
2775 	i = nx_port / NX_PORT_CHUNK;
2776 	j = nx_port % NX_PORT_CHUNK;
2777 	bmap = &nx->nx_ports_bmap[i];
2778 	if (bit_test(*bmap, j)) {
2779 		/* port is not (yet) bound or allocated */
2780 		ASSERT(npi->npi_nah == 0 && npi->npi_nxb == NULL);
2781 
2782 		bit_clear(*bmap, j);
2783 		struct nxbind *nxb = nxb_alloc(Z_WAITOK);
2784 		nxb_move(nxb0, nxb);
2785 		npi->npi_nxb = nxb;
2786 		npi->npi_info = info;
2787 		/* claim it (clear bit) */
2788 		bit_clear(*bmap, j);
2789 		ASSERT(err == 0);
2790 	} else {
2791 		/* port is already taken */
2792 		ASSERT(NPI_NA(npi) != NULL || npi->npi_nxb != NULL);
2793 		err = EEXIST;
2794 	}
2795 done:
2796 
2797 	SK_DF(err ? SK_VERB_ERROR : SK_VERB_NXPORT,
2798 	    "+++ nx 0x%llx nx_port %d, ports %u/%u (err %d)", SK_KVA(nx),
2799 	    (int)nx_port, nx->nx_active_ports, nx->nx_num_ports, err);
2800 
2801 	return err;
2802 }
2803 
2804 int
nx_port_bind(struct kern_nexus * nx,nexus_port_t nx_port,struct nxbind * nxb0)2805 nx_port_bind(struct kern_nexus *nx, nexus_port_t nx_port, struct nxbind *nxb0)
2806 {
2807 	return nx_port_bind_info(nx, nx_port, nxb0, NULL);
2808 }
2809 
2810 static int
nx_port_info_size(void * info,size_t * sz)2811 nx_port_info_size(void *info, size_t *sz)
2812 {
2813 	struct nx_port_info_header *hdr = info;
2814 
2815 	switch (hdr->ih_type) {
2816 	case NX_PORT_INFO_TYPE_NETIF:
2817 		break;
2818 	default:
2819 		return EINVAL;
2820 	}
2821 	*sz = hdr->ih_size;
2822 	return 0;
2823 }
2824 
2825 int
nx_port_unbind(struct kern_nexus * nx,nexus_port_t nx_port)2826 nx_port_unbind(struct kern_nexus *nx, nexus_port_t nx_port)
2827 {
2828 	struct nx_port_info *npi = NULL;
2829 	struct nxbind *nxb;
2830 	uint32_t i, j;
2831 	bitmap_t *bmap;
2832 	int err = 0;
2833 
2834 	ASSERT(nx_port != NEXUS_PORT_ANY);
2835 
2836 	if (nx_port >= nx->nx_num_ports) {
2837 		err = EDOM;
2838 		goto done;
2839 	}
2840 
2841 	npi = &nx->nx_ports[nx_port];
2842 	i = nx_port / NX_PORT_CHUNK;
2843 	j = nx_port % NX_PORT_CHUNK;
2844 	bmap = &nx->nx_ports_bmap[i];
2845 
2846 	if ((nxb = npi->npi_nxb) == NULL) {
2847 		/* must be either free or allocated */
2848 		ASSERT(NPI_NA(npi) == NULL ||
2849 		    (!bit_test(*bmap, j) && nx->nx_active_ports > 0));
2850 		err = ENOENT;
2851 	} else {
2852 		nxb_free(nxb);
2853 		npi->npi_nxb = NULL;
2854 		if (npi->npi_info != NULL) {
2855 			size_t sz;
2856 
2857 			VERIFY(nx_port_info_size(npi->npi_info, &sz) == 0);
2858 			sk_free_data(npi->npi_info, sz);
2859 			npi->npi_info = NULL;
2860 		}
2861 		ASSERT(!bit_test(*bmap, j));
2862 		if (NPI_NA(npi) == NULL) {
2863 			/* it's vacant, release it (set bit) */
2864 			bit_set(*bmap, j);
2865 		}
2866 	}
2867 
2868 done:
2869 	SK_DF(err ? SK_VERB_ERROR : SK_VERB_NXPORT,
2870 	    "--- nx 0x%llx nx_port %d, ports %u/%u (err %d)", SK_KVA(nx),
2871 	    (int)nx_port, nx->nx_active_ports, nx->nx_num_ports, err);
2872 
2873 	return err;
2874 }
2875 
2876 struct nexus_adapter *
nx_port_get_na(struct kern_nexus * nx,nexus_port_t nx_port)2877 nx_port_get_na(struct kern_nexus *nx, nexus_port_t nx_port)
2878 {
2879 	if (nx->nx_ports != NULL && nx->nx_num_ports > nx_port) {
2880 		return NPI_NA(&nx->nx_ports[nx_port]);
2881 	} else {
2882 		return NULL;
2883 	}
2884 }
2885 
2886 int
nx_port_get_info(struct kern_nexus * nx,nexus_port_t port,nx_port_info_type_t type,void * info,uint32_t len)2887 nx_port_get_info(struct kern_nexus *nx, nexus_port_t port,
2888     nx_port_info_type_t type, void *info, uint32_t len)
2889 {
2890 	struct nx_port_info *npi;
2891 	struct nx_port_info_header *hdr;
2892 
2893 	if (nx->nx_ports == NULL || port >= nx->nx_num_ports) {
2894 		return ENXIO;
2895 	}
2896 	npi = &nx->nx_ports[port];
2897 	hdr = npi->npi_info;
2898 	if (hdr == NULL) {
2899 		return ENOENT;
2900 	}
2901 
2902 	if (hdr->ih_type != type) {
2903 		return EINVAL;
2904 	}
2905 
2906 	bcopy(npi->npi_info, info, len);
2907 	return 0;
2908 }
2909 
2910 bool
nx_port_is_valid(struct kern_nexus * nx,nexus_port_t nx_port)2911 nx_port_is_valid(struct kern_nexus *nx, nexus_port_t nx_port)
2912 {
2913 	return nx_port < nx->nx_num_ports;
2914 }
2915 
2916 bool
nx_port_is_defunct(struct kern_nexus * nx,nexus_port_t nx_port)2917 nx_port_is_defunct(struct kern_nexus *nx, nexus_port_t nx_port)
2918 {
2919 	ASSERT(nx_port_is_valid(nx, nx_port));
2920 
2921 	return NPI_IS_DEFUNCT(&nx->nx_ports[nx_port]);
2922 }
2923 
2924 void
nx_port_free_all(struct kern_nexus * nx)2925 nx_port_free_all(struct kern_nexus *nx)
2926 {
2927 	uint32_t num_ports;
2928 
2929 	/* uncrustify doesn't handle C blocks properly */
2930 	/* BEGIN IGNORE CODESTYLE */
2931 	nx_port_foreach(nx, ^(nexus_port_t p) {
2932 		struct nxbind *nxb;
2933 		void *info;
2934 		nxb = nx->nx_ports[p].npi_nxb;
2935 		info = nx->nx_ports[p].npi_info;
2936 		if (nxb != NULL) {
2937 			nxb_free(nxb);
2938 			nx->nx_ports[p].npi_nxb = NULL;
2939 		}
2940 		if (info != NULL) {
2941 			size_t sz;
2942 
2943 			VERIFY(nx_port_info_size(info, &sz) == 0);
2944 			skn_free_data(info, info, sz);
2945 			nx->nx_ports[p].npi_info = NULL;
2946 		}
2947 	});
2948 	/* END IGNORE CODESTYLE */
2949 
2950 	num_ports = nx->nx_num_ports;
2951 	nx->nx_num_ports = 0;
2952 	nx->nx_active_ports = 0;
2953 	skn_free_data(ports_bmap,
2954 	    nx->nx_ports_bmap, (num_ports / NX_PORT_CHUNK) * sizeof(bitmap_t));
2955 	nx->nx_ports_bmap = NULL;
2956 	sk_free_type_array(struct nx_port_info, num_ports, nx->nx_ports);
2957 	nx->nx_ports = NULL;
2958 }
2959 
2960 void
2961 nx_port_foreach(struct kern_nexus *nx,
2962     void (^port_handle)(nexus_port_t nx_port))
2963 {
2964 	for (nexus_port_size_t i = 0; i < (nx->nx_num_ports / NX_PORT_CHUNK); i++) {
2965 		bitmap_t bmap = nx->nx_ports_bmap[i];
2966 
2967 		if (bmap == NX_PORT_CHUNK_FREE) {
2968 			continue;
2969 		}
2970 
2971 		for (nexus_port_size_t j = 0; j < NX_PORT_CHUNK; j++) {
2972 			if (bit_test(bmap, j)) {
2973 				continue;
2974 			}
2975 			port_handle((i * NX_PORT_CHUNK) + j);
2976 		}
2977 	}
2978 }
2979 
2980 /*
2981  * sysctl interfaces
2982  */
2983 static int nexus_provider_list_sysctl SYSCTL_HANDLER_ARGS;
2984 static int nexus_channel_list_sysctl SYSCTL_HANDLER_ARGS;
2985 static int nexus_mib_get_sysctl SYSCTL_HANDLER_ARGS;
2986 
2987 SYSCTL_PROC(_kern_skywalk, OID_AUTO, nexus_provider_list,
2988     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
2989     0, 0, nexus_provider_list_sysctl, "S,nexus_provider_info_t", "");
2990 
2991 SYSCTL_PROC(_kern_skywalk, OID_AUTO, nexus_channel_list,
2992     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
2993     0, 0, nexus_channel_list_sysctl, "S,nexus_channel_entry_t", "");
2994 
2995 SYSCTL_PROC(_kern_skywalk, OID_AUTO, llink_list,
2996     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
2997     0, NXMIB_LLINK_LIST, nexus_mib_get_sysctl, "S,nx_llink_info",
2998     "A list of logical links");
2999 
3000 SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, flow,
3001     CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY | CTLFLAG_KERN,
3002     0, NXMIB_FLOW, nexus_mib_get_sysctl, "S,sk_stats_flow",
3003     "Nexus inet flows with stats collected in kernel");
3004 
3005 SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, flow_owner,
3006     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
3007     0, NXMIB_FLOW_OWNER, nexus_mib_get_sysctl, "S,sk_stats_flow_owner",
3008     "Nexus flow owners");
3009 
3010 SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, flow_route,
3011     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
3012     0, NXMIB_FLOW_ROUTE, nexus_mib_get_sysctl, "S,sk_stats_flow_route",
3013     "Nexus flow routes");
3014 
3015 SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, net_if,
3016     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
3017     0, NXMIB_NETIF_STATS, nexus_mib_get_sysctl, "S,sk_stats_net_if",
3018     "Nexus netif statistics collected in kernel");
3019 
3020 SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, flow_switch,
3021     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
3022     0, NXMIB_FSW_STATS, nexus_mib_get_sysctl, "S,sk_stats_flow_switch",
3023     "Nexus flowswitch statistics collected in kernel");
3024 
3025 SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, userstack,
3026     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
3027     0, NXMIB_USERSTACK_STATS, nexus_mib_get_sysctl, "S,sk_stats_userstack",
3028     "Nexus userstack statistics counter");
3029 
3030 SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, flow_adv,
3031     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
3032     0, NXMIB_FLOW_ADV, nexus_mib_get_sysctl, "S,sk_stats_flow_adv",
3033     "Nexus flow advisory dump");
3034 
3035 SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, netif_queue,
3036     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
3037     0, NXMIB_NETIF_QUEUE_STATS, nexus_mib_get_sysctl, "S,netif_qstats_info",
3038     "A list of netif queue stats entries");
3039 
3040 /*
3041  * Provider list sysctl
3042  */
3043 static void
nexus_provider_info_populate(struct kern_nexus_provider * nxprov,nexus_provider_info_t info)3044 nexus_provider_info_populate(struct kern_nexus_provider *nxprov,
3045     nexus_provider_info_t info)
3046 {
3047 	struct kern_nexus *nx;
3048 	uuid_t *uuids;
3049 
3050 	SK_LOCK_ASSERT_HELD();
3051 
3052 	/* provider UUID + params */
3053 	uuid_copy(info->npi_prov_uuid, nxprov->nxprov_uuid);
3054 	bcopy(nxprov->nxprov_params, &info->npi_prov_params,
3055 	    sizeof(struct nxprov_params));
3056 	info->npi_instance_uuids_count = nxprov->nxprov_nx_count;
3057 
3058 	/* instance UUID list */
3059 	uuids = info->npi_instance_uuids;
3060 	STAILQ_FOREACH(nx, &nxprov->nxprov_nx_head, nx_prov_link) {
3061 		uuid_copy(*uuids, nx->nx_uuid);
3062 		uuids++;
3063 	}
3064 }
3065 
3066 static int
3067 nexus_provider_list_sysctl SYSCTL_HANDLER_ARGS
3068 {
3069 #pragma unused(arg1, arg2, oidp)
3070 	size_t actual_space;
3071 	caddr_t buffer = NULL;
3072 	size_t buffer_space;
3073 	size_t allocated_space;
3074 	int out_error;
3075 	int error = 0;
3076 	struct kern_nexus_provider *nxprov;
3077 	caddr_t scan;
3078 
3079 	if (!kauth_cred_issuser(kauth_cred_get())) {
3080 		return EPERM;
3081 	}
3082 
3083 	net_update_uptime();
3084 	buffer_space = req->oldlen;
3085 	if (req->oldptr != USER_ADDR_NULL && buffer_space != 0) {
3086 		if (buffer_space > SK_SYSCTL_ALLOC_MAX) {
3087 			buffer_space = SK_SYSCTL_ALLOC_MAX;
3088 		}
3089 		allocated_space = buffer_space;
3090 		buffer = sk_alloc_data(allocated_space, Z_WAITOK, skmem_tag_sysctl_buf);
3091 		if (__improbable(buffer == NULL)) {
3092 			return ENOBUFS;
3093 		}
3094 	} else if (req->oldptr == USER_ADDR_NULL) {
3095 		buffer_space = 0;
3096 	}
3097 	actual_space = 0;
3098 	scan = buffer;
3099 	SK_LOCK();
3100 	STAILQ_FOREACH(nxprov, &nxprov_head, nxprov_link) {
3101 		size_t                  info_size;
3102 
3103 		info_size
3104 		        = NEXUS_PROVIDER_INFO_SIZE(nxprov->nxprov_nx_count);
3105 		if (scan != NULL) {
3106 			if (buffer_space < info_size) {
3107 				/* supplied buffer too small, stop copying */
3108 				error = ENOMEM;
3109 				break;
3110 			}
3111 			nexus_provider_info_populate(nxprov, (void *)scan);
3112 			scan += info_size;
3113 			buffer_space -= info_size;
3114 		}
3115 		actual_space += info_size;
3116 	}
3117 	SK_UNLOCK();
3118 
3119 	out_error = SYSCTL_OUT(req, buffer, actual_space);
3120 	if (out_error != 0) {
3121 		error = out_error;
3122 	}
3123 
3124 	if (buffer != NULL) {
3125 		sk_free_data(buffer, allocated_space);
3126 	}
3127 
3128 	return error;
3129 }
3130 
3131 /*
3132  * Channel list sysctl
3133  */
3134 static uint32_t
channel_ring_count(struct kern_channel * ch,enum txrx which)3135 channel_ring_count(struct kern_channel *ch, enum txrx which)
3136 {
3137 	return ch->ch_last[which] - ch->ch_first[which];
3138 }
3139 
3140 static void
populate_ring_entries(struct __kern_channel_ring * kring,ring_id_t first,ring_id_t last,nexus_channel_ring_entry_t entries)3141 populate_ring_entries(struct __kern_channel_ring *kring,
3142     ring_id_t first, ring_id_t last, nexus_channel_ring_entry_t entries)
3143 {
3144 	ring_id_t i;
3145 	nexus_channel_ring_entry_t scan;
3146 	struct __kern_channel_ring *ring;
3147 
3148 	scan = entries;
3149 	for (i = first; i < last; i++, scan++) {
3150 		ring = &kring[i];
3151 
3152 		DTRACE_SKYWALK1(populate__ring, struct __kern_channel_ring *,
3153 		    ring);
3154 		if (kr_stat_enable == 0) {
3155 			bzero(&scan->ncre_stats, sizeof(scan->ncre_stats));
3156 			bzero(&scan->ncre_user_stats,
3157 			    sizeof(scan->ncre_user_stats));
3158 		} else {
3159 			scan->ncre_stats = ring->ckr_stats;
3160 			scan->ncre_user_stats = ring->ckr_usr_stats;
3161 		}
3162 		scan->ncre_error_stats = ring->ckr_err_stats;
3163 		scan->ncre_ring_id = i;
3164 	}
3165 }
3166 
3167 /* combine/convert ch_mode/ch_flags into nexus_channel_entry flags */
3168 static uint32_t
nexus_channel_get_flags(uint32_t ch_mode,uint32_t ch_flags)3169 nexus_channel_get_flags(uint32_t ch_mode, uint32_t ch_flags)
3170 {
3171 	uint32_t flags = 0;
3172 
3173 	flags |= (ch_mode & CHMODE_MONITOR_TX) ? SCHF_MONITOR_TX : 0;
3174 	flags |= (ch_mode & CHMODE_MONITOR_RX) ? SCHF_MONITOR_RX : 0;
3175 	flags |= (ch_mode & CHMODE_MONITOR_NO_COPY) ? SCHF_MONITOR_NO_COPY : 0;
3176 	flags |= (ch_mode & CHMODE_USER_PACKET_POOL) ? SCHF_USER_PACKET_POOL : 0;
3177 	flags |= (ch_mode & CHMODE_DEFUNCT_OK) ? SCHF_DEFUNCT_OK : 0;
3178 	flags |= (ch_mode & CHMODE_FILTER) ? SCHF_FILTER : 0;
3179 	flags |= (ch_mode & CHMODE_EVENT_RING) ? SCHF_EVENT_RING : 0;
3180 	flags |= (ch_mode & CHMODE_EXCLUSIVE) ? SCHF_EXCLUSIVE : 0;
3181 	flags |= (ch_flags & CHANF_IF_ADV) ? SCHF_IF_ADV : 0;
3182 	flags |= (ch_flags & CHANF_DEFUNCT_SKIP) ? SCHF_DEFUNCT_SKIP : 0;
3183 	flags |= (ch_flags & CHANF_CLOSING) ? SCHF_CLOSING : 0;
3184 	flags |= (ch_flags & CHANF_DEFUNCT) ? SCHF_DEFUNCT : 0;
3185 	flags |= (ch_mode & CHMODE_LOW_LATENCY) ? SCHF_LOW_LATENCY : 0;
3186 
3187 	return flags;
3188 }
3189 
3190 SK_NO_INLINE_ATTRIBUTE
3191 static void
nexus_channel_entry_populate(struct kern_channel * ch,nexus_channel_entry_t entry)3192 nexus_channel_entry_populate(struct kern_channel *ch,
3193     nexus_channel_entry_t entry)
3194 {
3195 	uint32_t ch_mode = ch->ch_info->cinfo_ch_mode;
3196 	uint32_t ch_flags = ch->ch_flags;
3197 	ring_id_t rx_first = ch->ch_first[NR_RX];
3198 	ring_id_t rx_last = ch->ch_last[NR_RX];
3199 	ring_id_t tx_last = ch->ch_last[NR_TX];
3200 	ring_id_t tx_first = ch->ch_first[NR_TX];
3201 
3202 	uuid_copy(entry->nce_uuid, ch->ch_info->cinfo_ch_id);
3203 	entry->nce_flags = nexus_channel_get_flags(ch_mode, ch_flags);
3204 	entry->nce_port = ch->ch_info->cinfo_nx_port;
3205 	entry->nce_pid = ch->ch_pid;
3206 	entry->nce_fd = ch->ch_fd;
3207 	entry->nce_tx_rings = tx_last - tx_first;
3208 	entry->nce_rx_rings = rx_last - rx_first;
3209 	populate_ring_entries(ch->ch_na->na_tx_rings, tx_first, tx_last,
3210 	    entry->nce_ring_entries);
3211 	populate_ring_entries(ch->ch_na->na_rx_rings, rx_first, rx_last,
3212 	    entry->nce_ring_entries + entry->nce_tx_rings);
3213 }
3214 
3215 SK_NO_INLINE_ATTRIBUTE
3216 static size_t
nexus_channel_info_populate(struct kern_nexus * nx,nexus_channel_info_t info,size_t buffer_size)3217 nexus_channel_info_populate(struct kern_nexus *nx,
3218     nexus_channel_info_t info, size_t buffer_size)
3219 {
3220 	struct kern_channel *ch = NULL;
3221 	size_t info_size;
3222 	caddr_t scan = NULL;
3223 
3224 	SK_LOCK_ASSERT_HELD();
3225 
3226 	info_size = sizeof(*info);
3227 
3228 	/* channel list */
3229 	if (info != NULL) {
3230 		if (buffer_size < info_size) {
3231 			return info_size;
3232 		}
3233 
3234 		/* instance UUID */
3235 		uuid_copy(info->nci_instance_uuid, nx->nx_uuid);
3236 		info->nci_channel_entries_count = nx->nx_ch_count;
3237 		scan = (caddr_t)info->nci_channel_entries;
3238 	}
3239 	STAILQ_FOREACH(ch, &nx->nx_ch_head, ch_link) {
3240 		size_t          entry_size;
3241 		uint32_t        ring_count;
3242 
3243 		ring_count = channel_ring_count(ch, NR_TX) +
3244 		    channel_ring_count(ch, NR_RX);
3245 		entry_size = NEXUS_CHANNEL_ENTRY_SIZE(ring_count);
3246 		info_size += entry_size;
3247 		if (scan != NULL) {
3248 			if (buffer_size < info_size) {
3249 				return info_size;
3250 			}
3251 
3252 			nexus_channel_entry_populate(ch, (void *)scan);
3253 			scan += entry_size;
3254 		}
3255 	}
3256 	return info_size;
3257 }
3258 
3259 static int
3260 nexus_channel_list_sysctl SYSCTL_HANDLER_ARGS
3261 {
3262 #pragma unused(arg1, arg2, oidp)
3263 	size_t actual_space;
3264 	caddr_t buffer = NULL;
3265 	size_t buffer_space;
3266 	size_t allocated_space;
3267 	int out_error;
3268 	struct kern_nexus *nx;
3269 	int error = 0;
3270 	caddr_t scan;
3271 
3272 	if (!kauth_cred_issuser(kauth_cred_get())) {
3273 		return EPERM;
3274 	}
3275 
3276 	net_update_uptime();
3277 	buffer_space = req->oldlen;
3278 	if (req->oldptr != USER_ADDR_NULL && buffer_space != 0) {
3279 		if (buffer_space > SK_SYSCTL_ALLOC_MAX) {
3280 			buffer_space = SK_SYSCTL_ALLOC_MAX;
3281 		}
3282 		allocated_space = buffer_space;
3283 		buffer = sk_alloc_data(allocated_space, Z_WAITOK, skmem_tag_sysctl_buf);
3284 		if (__improbable(buffer == NULL)) {
3285 			return ENOBUFS;
3286 		}
3287 	} else if (req->oldptr == USER_ADDR_NULL) {
3288 		buffer_space = 0;
3289 	}
3290 	actual_space = 0;
3291 	scan = buffer;
3292 	SK_LOCK();
3293 	RB_FOREACH(nx, kern_nexus_tree, &nx_head) {
3294 		size_t info_size;
3295 
3296 		info_size = nexus_channel_info_populate(nx, (void *)scan,
3297 		    buffer_space);
3298 		if (scan != NULL) {
3299 			if (buffer_space < info_size) {
3300 				/* supplied buffer too small, stop copying */
3301 				error = ENOMEM;
3302 				break;
3303 			}
3304 			scan += info_size;
3305 			buffer_space -= info_size;
3306 		}
3307 		actual_space += info_size;
3308 	}
3309 	SK_UNLOCK();
3310 
3311 	if (actual_space != 0) {
3312 		out_error = SYSCTL_OUT(req, buffer, actual_space);
3313 		if (out_error != 0) {
3314 			error = out_error;
3315 		}
3316 	}
3317 	if (buffer != NULL) {
3318 		sk_free_data(buffer, allocated_space);
3319 	}
3320 
3321 	return error;
3322 }
3323 
3324 static int
3325 nexus_mib_get_sysctl SYSCTL_HANDLER_ARGS
3326 {
3327 #pragma unused(arg1, arg2)
3328 	struct proc *p = req->p;
3329 	struct nexus_mib_filter filter;
3330 	int error = 0;
3331 	size_t actual_space;
3332 	caddr_t buffer = NULL;
3333 	size_t buffer_space;
3334 	size_t allocated_space;
3335 	int out_error;
3336 	struct kern_nexus *nx;
3337 	caddr_t scan;
3338 
3339 	/* Restrict protocol stats access to root user only (like netstat). */
3340 	if (oidp->oid_arg2 == NXMIB_USERSTACK_STATS &&
3341 	    !kauth_cred_issuser(kauth_cred_get())) {
3342 		SK_ERR("mib request rejected, EPERM");
3343 		return EPERM;
3344 	}
3345 
3346 	if (req->newptr == USER_ADDR_NULL) {
3347 		/*
3348 		 * For flow stats requests, non-root users need to provide a
3349 		 * 5-tuple. Otherwise, we do not grant access.
3350 		 */
3351 		if (oidp->oid_arg2 == NXMIB_FLOW &&
3352 		    !kauth_cred_issuser(kauth_cred_get())) {
3353 			SK_ERR("mib request rejected: tuple not provided");
3354 			return EPERM;
3355 		}
3356 		/* use subcommand for multiple nodes */
3357 		filter.nmf_type = oidp->oid_arg2;
3358 		filter.nmf_bitmap = 0x0;
3359 	} else if (req->newlen != sizeof(struct nexus_mib_filter)) {
3360 		SK_ERR("mis-matching newlen");
3361 		return EINVAL;
3362 	} else {
3363 		error = SYSCTL_IN(req, &filter, sizeof(struct nexus_mib_filter));
3364 		if (error != 0) {
3365 			SK_ERR("SYSCTL_IN err %d", error);
3366 			return error;
3367 		}
3368 		if (filter.nmf_type != oidp->oid_arg2) {
3369 			SK_ERR("mis-matching nmf_type");
3370 			return EINVAL;
3371 		}
3372 		/*
3373 		 * For flow stats requests, non-root users need to set the nexus
3374 		 * mib filter to NXMIB_FILTER_INFO_TUPLE. Otherwise, we do not
3375 		 * grant access. This ensures that fsw_mib_get_flow looks for a
3376 		 * flow entry that matches the given tuple of the non-root user.
3377 		 */
3378 		if (filter.nmf_type == NXMIB_FLOW &&
3379 		    (filter.nmf_bitmap & NXMIB_FILTER_INFO_TUPLE) == 0 &&
3380 		    !kauth_cred_issuser(kauth_cred_get())) {
3381 			SK_ERR("mib request rejected: tuple filter not set");
3382 			return EPERM;
3383 		}
3384 	}
3385 
3386 	net_update_uptime();
3387 	buffer_space = req->oldlen;
3388 	if (req->oldptr != USER_ADDR_NULL && buffer_space != 0) {
3389 		if (buffer_space > SK_SYSCTL_ALLOC_MAX) {
3390 			buffer_space = SK_SYSCTL_ALLOC_MAX;
3391 		}
3392 		allocated_space = buffer_space;
3393 		buffer = sk_alloc_data(allocated_space, Z_WAITOK, skmem_tag_sysctl_buf);
3394 		if (__improbable(buffer == NULL)) {
3395 			return ENOBUFS;
3396 		}
3397 	} else if (req->oldptr == USER_ADDR_NULL) {
3398 		buffer_space = 0;
3399 	}
3400 	actual_space = 0;
3401 	scan = buffer;
3402 
3403 	SK_LOCK();
3404 	RB_FOREACH(nx, kern_nexus_tree, &nx_head) {
3405 		if (NX_DOM_PROV(nx)->nxdom_prov_nx_mib_get == NULL) {
3406 			continue;
3407 		}
3408 
3409 		size_t size;
3410 		struct kern_nexus_domain_provider *nx_dp = NX_DOM_PROV(nx);
3411 
3412 		size = nx_dp->nxdom_prov_nx_mib_get(nx, &filter, scan,
3413 		    buffer_space, p);
3414 
3415 		if (scan != NULL) {
3416 			if (buffer_space < size) {
3417 				/* supplied buffer too small, stop copying */
3418 				error = ENOMEM;
3419 				break;
3420 			}
3421 			scan += size;
3422 			buffer_space -= size;
3423 		}
3424 		actual_space += size;
3425 	}
3426 	SK_UNLOCK();
3427 
3428 	if (actual_space != 0) {
3429 		out_error = SYSCTL_OUT(req, buffer, actual_space);
3430 		if (out_error != 0) {
3431 			error = out_error;
3432 		}
3433 	}
3434 	if (buffer != NULL) {
3435 		sk_free_data(buffer, allocated_space);
3436 	}
3437 
3438 	return error;
3439 }
3440 
3441 void
kern_nexus_walktree(kern_nexus_walktree_f_t * f,void * arg0,boolean_t is_sk_locked)3442 kern_nexus_walktree(kern_nexus_walktree_f_t *f, void *arg0,
3443     boolean_t is_sk_locked)
3444 {
3445 	struct kern_nexus *nx = NULL;
3446 
3447 	if (!is_sk_locked) {
3448 		SK_LOCK();
3449 	} else {
3450 		SK_LOCK_ASSERT_HELD();
3451 	}
3452 
3453 	RB_FOREACH(nx, kern_nexus_tree, &nx_head) {
3454 		(*f)(nx, arg0);
3455 	}
3456 
3457 	if (!is_sk_locked) {
3458 		SK_UNLOCK();
3459 	}
3460 }
3461 
3462 errno_t
kern_nexus_get_pbufpool_info(const uuid_t nx_uuid,struct kern_pbufpool_memory_info * rx_pool_info,struct kern_pbufpool_memory_info * tx_pool_info)3463 kern_nexus_get_pbufpool_info(const uuid_t nx_uuid,
3464     struct kern_pbufpool_memory_info *rx_pool_info,
3465     struct kern_pbufpool_memory_info *tx_pool_info)
3466 {
3467 	struct kern_pbufpool *tpp, *rpp;
3468 	struct kern_nexus *nx;
3469 	errno_t err = 0;
3470 
3471 	nx = nx_find(nx_uuid, FALSE);
3472 	if (nx == NULL) {
3473 		err = ENOENT;
3474 		goto done;
3475 	}
3476 
3477 	if (nx->nx_prov->nxprov_params->nxp_type != NEXUS_TYPE_NET_IF) {
3478 		err = ENOTSUP;
3479 		goto done;
3480 	}
3481 
3482 	err = nx_netif_prov_nx_mem_info(nx, &tpp, &rpp);
3483 	if (err != 0) {
3484 		goto done;
3485 	}
3486 
3487 	if ((tpp == NULL) && (rpp == NULL)) {
3488 		err = ENOENT;
3489 		goto done;
3490 	}
3491 
3492 	if (tx_pool_info != NULL) {
3493 		bzero(tx_pool_info, sizeof(*tx_pool_info));
3494 	}
3495 	if (rx_pool_info != NULL) {
3496 		bzero(rx_pool_info, sizeof(*rx_pool_info));
3497 	}
3498 
3499 	if ((tx_pool_info != NULL) && (tpp != NULL)) {
3500 		err = kern_pbufpool_get_memory_info(tpp, tx_pool_info);
3501 		if (err != 0) {
3502 			goto done;
3503 		}
3504 	}
3505 
3506 	if ((rx_pool_info != NULL) && (rpp != NULL)) {
3507 		err = kern_pbufpool_get_memory_info(rpp, rx_pool_info);
3508 	}
3509 
3510 done:
3511 	if (nx != NULL) {
3512 		(void) nx_release(nx);
3513 		nx = NULL;
3514 	}
3515 	return err;
3516 }
3517 
3518 void
nx_interface_advisory_notify(struct kern_nexus * nx)3519 nx_interface_advisory_notify(struct kern_nexus *nx)
3520 {
3521 	struct kern_channel *ch;
3522 	struct netif_stats *nifs;
3523 	struct fsw_stats *fsw_stats;
3524 	nexus_type_t nxdom_type = NX_DOM(nx)->nxdom_type;
3525 
3526 	if (nxdom_type == NEXUS_TYPE_NET_IF) {
3527 		nifs = &NX_NETIF_PRIVATE(nx)->nif_stats;
3528 	} else if (nxdom_type == NEXUS_TYPE_FLOW_SWITCH) {
3529 		fsw_stats = &NX_FSW_PRIVATE(nx)->fsw_stats;
3530 	} else {
3531 		VERIFY(0);
3532 		__builtin_unreachable();
3533 	}
3534 	if (!lck_rw_try_lock_shared(&nx->nx_ch_if_adv_lock)) {
3535 		if (nxdom_type == NEXUS_TYPE_NET_IF) {
3536 			STATS_INC(nifs, NETIF_STATS_IF_ADV_UPD_DROP);
3537 		} else {
3538 			STATS_INC(fsw_stats, FSW_STATS_IF_ADV_UPD_DROP);
3539 		}
3540 		return;
3541 	}
3542 	/*
3543 	 * if the channel is in "nx_ch_if_adv_head" list, then we can
3544 	 * safely assume that the channel is not closed yet.
3545 	 * In ch_close_common(), the channel is removed from the
3546 	 * "nx_ch_if_adv_head" list holding the "nx_ch_if_adv_lock" in
3547 	 * exclusive mode, prior to closing the channel.
3548 	 */
3549 	STAILQ_FOREACH(ch, &nx->nx_ch_if_adv_head, ch_link_if_adv) {
3550 		struct nexus_adapter *na = ch->ch_na;
3551 
3552 		ASSERT(na != NULL);
3553 		na_post_event(&na->na_tx_rings[ch->ch_first[NR_TX]],
3554 		    TRUE, FALSE, FALSE, CHAN_FILT_HINT_IF_ADV_UPD);
3555 		if (nxdom_type == NEXUS_TYPE_NET_IF) {
3556 			STATS_INC(nifs, NETIF_STATS_IF_ADV_UPD_SENT);
3557 		} else {
3558 			STATS_INC(fsw_stats, FSW_STATS_IF_ADV_UPD_SENT);
3559 		}
3560 	}
3561 	lck_rw_done(&nx->nx_ch_if_adv_lock);
3562 }
3563