xref: /xnu-8796.141.3/bsd/skywalk/nexus/nexus.c (revision 1b191cb58250d0705d8a51287127505aa4bc0789)
1 /*
2  * Copyright (c) 2015-2022 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <skywalk/os_skywalk_private.h>
30 #include <skywalk/nexus/netif/nx_netif.h>
31 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
32 #include <sys/sdt.h>
33 
/*
 * When non-zero, relaxes the ownership check that normally requires a
 * nexus to be manipulated only by the nxctl that created its provider
 * (see nxctl_nexus_bind/nxctl_nexus_config).  Tunable via sysctl on
 * DEVELOPMENT/DEBUG kernels only; always 0 on RELEASE.
 */
static uint32_t disable_nxctl_check = 0;
#if (DEVELOPMENT || DEBUG)
SYSCTL_UINT(_kern_skywalk, OID_AUTO, disable_nxctl_check,
    CTLFLAG_RW | CTLFLAG_LOCKED, &disable_nxctl_check, 0, "");
#endif

/* lock groups/attributes shared by the nexus subsystem */
LCK_GRP_DECLARE(nexus_lock_group, "sk_nx_lock");
LCK_GRP_DECLARE(nexus_mbq_lock_group, "sk_nx_mbq_lock");
LCK_GRP_DECLARE(nexus_pktq_lock_group, "sk_nx_pktq_lock");
LCK_ATTR_DECLARE(nexus_lock_attr, 0, 0);

/*
 * Global lists of all nexus controllers and all registered nexus
 * providers; both are protected by SK_LOCK.
 */
static STAILQ_HEAD(, nxctl) nxctl_head =
    STAILQ_HEAD_INITIALIZER(nxctl_head);
static STAILQ_HEAD(, kern_nexus_provider) nxprov_head =
    STAILQ_HEAD_INITIALIZER(nxprov_head);

/*
 * Red-black tree of all nexus instances, ordered by nx_cmp (keyed by
 * the nexus UUID); also protected by SK_LOCK.
 */
static int nx_cmp(const struct kern_nexus *, const struct kern_nexus *);
RB_HEAD(kern_nexus_tree, kern_nexus);
RB_PROTOTYPE_SC(static, kern_nexus_tree, kern_nexus, nx_link, nx_cmp);
RB_GENERATE(kern_nexus_tree, kern_nexus, nx_link, nx_cmp);
static struct kern_nexus_tree   nx_head;
55 
/* sockopt handlers and life-cycle helpers for nexus controllers */
static int nxctl_get_nexus_prov_list(struct nxctl *, struct sockopt *);
static int nxctl_get_nexus_prov_entry(struct nxctl *, struct sockopt *);
static int nxctl_get_nexus_list(struct nxctl *, struct sockopt *);
static int nxctl_nexus_bind(struct nxctl *, struct sockopt *);
static int nxctl_nexus_unbind(struct nxctl *, struct sockopt *);
static int nxctl_nexus_config(struct nxctl *, struct sockopt *);
static int nxctl_get_channel_list(struct nxctl *, struct sockopt *);
static void nxctl_retain_locked(struct nxctl *);
static int nxctl_release_locked(struct nxctl *);
static void nxctl_init(struct nxctl *, struct proc *, struct fileproc *);
static struct nxctl *nxctl_alloc(struct proc *, struct fileproc *, zalloc_flags_t);
static void nxctl_free(struct nxctl *);

/* nexus provider life-cycle helpers */
static struct kern_nexus_provider *nxprov_create_common(struct nxctl *,
    struct kern_nexus_domain_provider *, struct nxprov_reg *,
    const struct kern_nexus_provider_init *init, int *);
static void nxprov_detach(struct kern_nexus_provider *, boolean_t);
static void nxprov_retain_locked(struct kern_nexus_provider *);
static int nxprov_release_locked(struct kern_nexus_provider *);
static struct kern_nexus_provider *nxprov_alloc(
	struct kern_nexus_domain_provider *, zalloc_flags_t);
static void nxprov_free(struct kern_nexus_provider *);

/* nexus instance ring/slot setup and teardown helpers */
static int nx_init_rings(struct kern_nexus *, struct kern_channel *);
static void nx_fini_rings(struct kern_nexus *, struct kern_channel *);
static int nx_init_slots(struct kern_nexus *, struct __kern_channel_ring *);
static void nx_fini_slots(struct kern_nexus *, struct __kern_channel_ring *);
static struct kern_nexus *nx_alloc(zalloc_flags_t);
static void nx_free(struct kern_nexus *);

/* zalloc zones for the fixed-size objects managed by this file */
static SKMEM_TYPE_DEFINE(nxctl_zone, struct nxctl);

static SKMEM_TYPE_DEFINE(nxbind_zone, struct nxbind);

static SKMEM_TYPE_DEFINE(nxprov_zone, struct kern_nexus_provider);

static SKMEM_TYPE_DEFINE(nxprov_params_zone, struct nxprov_params);

static SKMEM_TYPE_DEFINE(nx_zone, struct kern_nexus);

/* set once nexus_init() has completed; cleared by nexus_fini() */
static int __nx_inited = 0;

/* skmem allocation tags for variable-size nexus data */
#define SKMEM_TAG_NX_KEY        "com.apple.skywalk.nexus.key"
SKMEM_TAG_DEFINE(skmem_tag_nx_key, SKMEM_TAG_NX_KEY);

#define SKMEM_TAG_NX_MIB        "com.apple.skywalk.nexus.mib"
static SKMEM_TAG_DEFINE(skmem_tag_nx_mib, SKMEM_TAG_NX_MIB);

#define SKMEM_TAG_NX_PORT        "com.apple.skywalk.nexus.port"
SKMEM_TAG_DEFINE(skmem_tag_nx_port, SKMEM_TAG_NX_PORT);

#define SKMEM_TAG_NX_PORT_INFO        "com.apple.skywalk.nexus.port.info"
SKMEM_TAG_DEFINE(skmem_tag_nx_port_info, SKMEM_TAG_NX_PORT_INFO);

/*
 * Special nexus controller handle for Skywalk internal use.  Unlike all
 * other nexus controller handles that are created by userland or kernel
 * clients, this one never gets closed or freed.  It is also not part of
 * the global nxctl_head list.
 */
static struct nxctl _kernnxctl;
struct nexus_controller kernnxctl = { .ncd_nxctl = &_kernnxctl };
118 
/*
 * One-time initialization of the nexus subsystem; called with SK_LOCK
 * held, exactly once (asserted via __nx_inited).  Returns 0.
 */
int
nexus_init(void)
{
	SK_LOCK_ASSERT_HELD();
	ASSERT(!__nx_inited);

	/* empty tree of nexus instances */
	RB_INIT(&nx_head);

	/* bring up the nexus adapter layer first; nexus_fini() undoes this */
	na_init();

	/* attach system built-in domains and domain providers */
	nxdom_attach_all();

	/*
	 * Initialize private kernel nexus controller handle; this is used
	 * internally for creating nexus providers and nexus instances from
	 * within the Skywalk code (e.g. netif_compat).
	 */
	nxctl_init(&_kernnxctl, kernproc, NULL);
	nxctl_retain_locked(&_kernnxctl);       /* one for us */
	nxctl_traffic_rule_init();

	__nx_inited = 1;

	return 0;
}
145 
146 void
nexus_fini(void)147 nexus_fini(void)
148 {
149 	SK_LOCK_ASSERT_HELD();
150 
151 	if (__nx_inited) {
152 		nxctl_traffic_rule_fini();
153 		nxctl_release_locked(&_kernnxctl);
154 
155 		/* tell all domains they're going away */
156 		nxdom_detach_all();
157 
158 		ASSERT(RB_EMPTY(&nx_head));
159 
160 		na_fini();
161 
162 		__nx_inited = 0;
163 	}
164 }
165 
166 struct nxctl *
nxctl_create(struct proc * p,struct fileproc * fp,const uuid_t nxctl_uuid,int * err)167 nxctl_create(struct proc *p, struct fileproc *fp, const uuid_t nxctl_uuid,
168     int *err)
169 {
170 	struct nxctl *nxctl = NULL;
171 
172 	ASSERT(!uuid_is_null(nxctl_uuid));
173 
174 	/* privilege checks would be done when performing nxctl operations */
175 
176 	SK_LOCK();
177 
178 	nxctl = nxctl_alloc(p, fp, Z_WAITOK);
179 
180 	STAILQ_INSERT_TAIL(&nxctl_head, nxctl, nxctl_link);
181 	nxctl->nxctl_flags |= NEXUSCTLF_ATTACHED;
182 	uuid_copy(nxctl->nxctl_uuid, nxctl_uuid);
183 
184 	nxctl_retain_locked(nxctl);     /* one for being in the list */
185 	nxctl_retain_locked(nxctl);     /* one for the caller */
186 
187 #if SK_LOG
188 	uuid_string_t uuidstr;
189 	SK_D("nxctl 0x%llx UUID %s", SK_KVA(nxctl),
190 	    sk_uuid_unparse(nxctl->nxctl_uuid, uuidstr));
191 #endif /* SK_LOG */
192 
193 	SK_UNLOCK();
194 
195 	if (*err != 0) {
196 		nxctl_free(nxctl);
197 		nxctl = NULL;
198 	}
199 	return nxctl;
200 }
201 
/*
 * Close a (non-kernel) nexus controller handle: detach it from the
 * global list, drop its file-descriptor linkage, and close every nexus
 * provider that was created through it.  The caller must still hold a
 * reference of its own; that reference is not dropped here.
 */
void
nxctl_close(struct nxctl *nxctl)
{
	struct kern_nexus_provider *nxprov = NULL, *tnxprov;

	/* lock order: per-nxctl mutex first, then the global SK_LOCK */
	lck_mtx_lock(&nxctl->nxctl_lock);
	SK_LOCK();

	/* the embedded kernel handle (_kernnxctl) is never closed */
	ASSERT(!(nxctl->nxctl_flags & NEXUSCTLF_KERNEL));

#if SK_LOG
	uuid_string_t uuidstr;
	SK_D("nxctl 0x%llx UUID %s flags 0x%b", SK_KVA(nxctl),
	    sk_uuid_unparse(nxctl->nxctl_uuid, uuidstr),
	    nxctl->nxctl_flags, NEXUSCTLF_BITS);
#endif /* SK_LOG */

	/* sever the fileproc back-pointer exactly once */
	if (!(nxctl->nxctl_flags & NEXUSCTLF_NOFDREF)) {
		nxctl->nxctl_flags |= NEXUSCTLF_NOFDREF;
		nxctl->nxctl_fp = NULL;
	}

	/* may be called as part of failure cleanup, so check */
	if (nxctl->nxctl_flags & NEXUSCTLF_ATTACHED) {
		/* caller must hold an extra ref */
		ASSERT(nxctl->nxctl_refcnt > 1);
		(void) nxctl_release_locked(nxctl);

		STAILQ_REMOVE(&nxctl_head, nxctl, nxctl, nxctl_link);
		nxctl->nxctl_flags &= ~NEXUSCTLF_ATTACHED;
	}

repeat:
	STAILQ_FOREACH_SAFE(nxprov, &nxprov_head, nxprov_link, tnxprov) {
		/*
		 * Close provider only for those which are owned by
		 * this control instance.  Note that if we close the
		 * provider, we need to repeat this search as the
		 * list might have been changed by another thread.
		 * That's possible since SK_UNLOCK() may be called
		 * as a result of calling nxprov_close().
		 */
		if (!(nxprov->nxprov_flags & NXPROVF_CLOSED) &&
		    nxprov->nxprov_ctl == nxctl) {
			/* hold the provider across the close */
			nxprov_retain_locked(nxprov);
			(void) nxprov_close(nxprov, TRUE);
			(void) nxprov_release_locked(nxprov);
			goto repeat;
		}
	}

	SK_UNLOCK();
	lck_mtx_unlock(&nxctl->nxctl_lock);
	/* must run unlocked; cleans up traffic rules owned by this nxctl */
	nxctl_traffic_rule_clean(nxctl);
}
257 
258 int
nxctl_set_opt(struct nxctl * nxctl,struct sockopt * sopt)259 nxctl_set_opt(struct nxctl *nxctl, struct sockopt *sopt)
260 {
261 #pragma unused(nxctl)
262 	int err = 0;
263 
264 	NXCTL_LOCK_ASSERT_HELD(nxctl);
265 
266 	if (sopt->sopt_dir != SOPT_SET) {
267 		sopt->sopt_dir = SOPT_SET;
268 	}
269 
270 	switch (sopt->sopt_name) {
271 	case NXOPT_NEXUS_BIND:
272 		err = nxctl_nexus_bind(nxctl, sopt);
273 		break;
274 
275 	case NXOPT_NEXUS_UNBIND:
276 		err = nxctl_nexus_unbind(nxctl, sopt);
277 		break;
278 
279 	case NXOPT_NEXUS_CONFIG:
280 		err = nxctl_nexus_config(nxctl, sopt);
281 		break;
282 
283 	default:
284 		err = ENOPROTOOPT;
285 		break;
286 	}
287 
288 	return err;
289 }
290 
291 int
nxctl_get_opt(struct nxctl * nxctl,struct sockopt * sopt)292 nxctl_get_opt(struct nxctl *nxctl, struct sockopt *sopt)
293 {
294 #pragma unused(nxctl)
295 	int err = 0;
296 
297 	NXCTL_LOCK_ASSERT_HELD(nxctl);
298 
299 	if (sopt->sopt_dir != SOPT_GET) {
300 		sopt->sopt_dir = SOPT_GET;
301 	}
302 
303 	switch (sopt->sopt_name) {
304 	case NXOPT_NEXUS_PROV_LIST:
305 		err = nxctl_get_nexus_prov_list(nxctl, sopt);
306 		break;
307 
308 	case NXOPT_NEXUS_PROV_ENTRY:
309 		err = nxctl_get_nexus_prov_entry(nxctl, sopt);
310 		break;
311 
312 	case NXOPT_NEXUS_LIST:
313 		err = nxctl_get_nexus_list(nxctl, sopt);
314 		break;
315 
316 	case NXOPT_CHANNEL_LIST:
317 		err = nxctl_get_channel_list(nxctl, sopt);
318 		break;
319 
320 	default:
321 		err = ENOPROTOOPT;
322 		break;
323 	}
324 
325 	return err;
326 }
327 
/* Upper bound on # of nrl_num_regs that we'd return to user space */
#define MAX_NUM_REG_ENTRIES     256

/* Hoisted out of line to reduce kernel stack footprint */
SK_NO_INLINE_ATTRIBUTE
static int
nxctl_get_nexus_prov_list(struct nxctl *nxctl, struct sockopt *sopt)
{
	user_addr_t tmp_ptr = USER_ADDR_NULL;
	struct nxprov_reg_ent *pnre, *nres = NULL;
	struct nxprov_list_req nrlr;
	struct kern_nexus_provider *nxprov = NULL;
	uint32_t nregs = 0, ncregs = 0;
	int err = 0, observeall;
	size_t nres_sz;

	NXCTL_LOCK_ASSERT_HELD(nxctl);

	ASSERT(sopt->sopt_p != NULL);
	if (sopt->sopt_val == USER_ADDR_NULL) {
		return EINVAL;
	}

	/* fetch the request header (count + user buffer pointer) */
	err = sooptcopyin(sopt, &nrlr, sizeof(nrlr), sizeof(nrlr));
	if (err != 0) {
		return err;
	}

	/* clamp the requested count to a sane upper bound */
	if ((size_t)nrlr.nrl_num_regs > MAX_NUM_REG_ENTRIES) {
		nrlr.nrl_num_regs = MAX_NUM_REG_ENTRIES;
	}

	/*
	 * If the caller specified a buffer, copy out the Nexus provider
	 * entries to caller gracefully.  We only copy out the number of
	 * entries which caller has asked for, but we always tell caller
	 * how big the buffer really needs to be.
	 *
	 * NOTE(review): ncregs below is only incremented while buffer
	 * space remains, so the nrl_num_regs value returned to the
	 * caller is the number of entries actually copied, not the
	 * total number of visible providers (unlike the analogous loop
	 * in nxctl_get_nexus_list()).  Verify this matches userland's
	 * expectation of the comment above.
	 */
	tmp_ptr = nrlr.nrl_regs;
	if (tmp_ptr != USER_ADDR_NULL && nrlr.nrl_num_regs > 0) {
		nres_sz = (size_t)nrlr.nrl_num_regs * sizeof(*nres);
		nres = sk_alloc_data(nres_sz, Z_WAITOK, skmem_tag_sysctl_buf);
		if (__improbable(nres == NULL)) {
			return ENOBUFS;
		}
	}

	/* can the caller observe providers owned by other nxctls? */
	observeall = (skywalk_priv_check_cred(sopt->sopt_p, nxctl->nxctl_cred,
	    PRIV_SKYWALK_OBSERVE_ALL) == 0);

	SK_LOCK();
	/*
	 * Count number of providers.  If buffer space exists and
	 * remains, copy out provider entries.
	 */
	nregs = nrlr.nrl_num_regs;
	pnre = nres;

	STAILQ_FOREACH(nxprov, &nxprov_head, nxprov_link) {
		/*
		 * Return only entries that are visible to the caller,
		 * unless it has PRIV_SKYWALK_OBSERVE_ALL.
		 */
		if (nxprov->nxprov_ctl != nxctl && !observeall) {
			continue;
		}

		if (nres != NULL && nregs > 0) {
			uuid_copy(pnre->npre_prov_uuid, nxprov->nxprov_uuid);
			bcopy(nxprov->nxprov_params, &pnre->npre_prov_params,
			    sizeof(struct nxprov_params));
			--nregs;
			++pnre;
			++ncregs;
		}
	}
	SK_UNLOCK();

	/* nothing visible to the caller at all */
	if (ncregs == 0) {
		err = ENOENT;
	}

	if (nres != NULL) {
		if (err == 0 && tmp_ptr != USER_ADDR_NULL) {
			/* kernel callers pass a kernel VA as tmp_ptr */
			if (sopt->sopt_p != kernproc) {
				err = copyout(nres, tmp_ptr,
				    ncregs * sizeof(*nres));
			} else {
				bcopy(nres, CAST_DOWN(caddr_t, tmp_ptr),
				    ncregs * sizeof(*nres));
			}
		}
		sk_free_data(nres, nres_sz);
		nres = NULL;
	}

	/* report the entry count back through the request header */
	if (err == 0) {
		nrlr.nrl_num_regs = ncregs;
		err = sooptcopyout(sopt, &nrlr, sizeof(nrlr));
	}

	return err;
}
431 
432 /* Hoisted out of line to reduce kernel stack footprint */
433 SK_NO_INLINE_ATTRIBUTE
434 static int
nxctl_get_nexus_prov_entry(struct nxctl * nxctl,struct sockopt * sopt)435 nxctl_get_nexus_prov_entry(struct nxctl *nxctl, struct sockopt *sopt)
436 {
437 	struct nxprov_reg_ent nre;
438 	struct kern_nexus_provider *nxprov = NULL;
439 	int err = 0;
440 
441 	NXCTL_LOCK_ASSERT_HELD(nxctl);
442 
443 	ASSERT(sopt->sopt_p != NULL);
444 	if (sopt->sopt_val == USER_ADDR_NULL) {
445 		return EINVAL;
446 	}
447 
448 	bzero(&nre, sizeof(nre));
449 	err = sooptcopyin(sopt, &nre, sizeof(nre), sizeof(nre));
450 	if (err != 0) {
451 		return err;
452 	}
453 
454 	if (uuid_is_null(nre.npre_prov_uuid)) {
455 		return EINVAL;
456 	}
457 
458 	SK_LOCK();
459 	STAILQ_FOREACH(nxprov, &nxprov_head, nxprov_link) {
460 		if (uuid_compare(nxprov->nxprov_uuid,
461 		    nre.npre_prov_uuid) == 0) {
462 			/*
463 			 * Return only entries that are visible to the caller,
464 			 * unless it has PRIV_SKYWALK_OBSERVE_ALL.
465 			 */
466 			if (nxprov->nxprov_ctl != nxctl) {
467 				if (skywalk_priv_check_cred(sopt->sopt_p,
468 				    nxctl->nxctl_cred,
469 				    PRIV_SKYWALK_OBSERVE_ALL) != 0) {
470 					nxprov = NULL;
471 					break;
472 				}
473 			}
474 
475 			bcopy(nxprov->nxprov_params, &nre.npre_prov_params,
476 			    sizeof(struct nxprov_params));
477 			break;
478 		}
479 	}
480 	SK_UNLOCK();
481 
482 	if (nxprov != NULL) {
483 		err = sooptcopyout(sopt, &nre, sizeof(nre));
484 	} else {
485 		err = ENOENT;
486 	}
487 
488 	return err;
489 }
490 
/* Upper bound on # of nl_num_nx_uuids that we'd return to user space */
#define MAX_NUM_NX_UUIDS        4096

/* Hoisted out of line to reduce kernel stack footprint */
SK_NO_INLINE_ATTRIBUTE
static int
nxctl_get_nexus_list(struct nxctl *nxctl, struct sockopt *sopt)
{
	user_addr_t tmp_ptr = USER_ADDR_NULL;
	uint32_t nuuids = 0, ncuuids = 0;
	uuid_t *puuid, *uuids = NULL;
	size_t uuids_sz;
	struct nx_list_req nlr;
	struct kern_nexus_provider *nxprov = NULL;
	struct kern_nexus *nx = NULL;
	int err = 0, observeall;

	NXCTL_LOCK_ASSERT_HELD(nxctl);

	ASSERT(sopt->sopt_p != NULL);
	if (sopt->sopt_val == USER_ADDR_NULL) {
		return EINVAL;
	}

	/* fetch the request (provider UUID, count, user buffer pointer) */
	err = sooptcopyin(sopt, &nlr, sizeof(nlr), sizeof(nlr));
	if (err != 0) {
		return err;
	}

	if (uuid_is_null(nlr.nl_prov_uuid)) {
		return EINVAL;
	} else if ((size_t)nlr.nl_num_nx_uuids > MAX_NUM_NX_UUIDS) {
		/* clamp the requested count to a sane upper bound */
		nlr.nl_num_nx_uuids = MAX_NUM_NX_UUIDS;
	}

	/*
	 * If the caller specified a buffer, copy out the Nexus UUIDs to
	 * caller gracefully.  We only copy out the number of UUIDs which
	 * caller has asked for, but we always tell caller how big the
	 * buffer really needs to be.
	 */
	tmp_ptr = nlr.nl_nx_uuids;
	if (tmp_ptr != USER_ADDR_NULL && nlr.nl_num_nx_uuids > 0) {
		uuids_sz = (size_t)nlr.nl_num_nx_uuids * sizeof(uuid_t);
		uuids = sk_alloc_data(uuids_sz, Z_WAITOK, skmem_tag_sysctl_buf);
		if (__improbable(uuids == NULL)) {
			return ENOBUFS;
		}
	}

	/* can the caller observe providers owned by other nxctls? */
	observeall = (skywalk_priv_check_cred(sopt->sopt_p, nxctl->nxctl_cred,
	    PRIV_SKYWALK_OBSERVE_ALL) == 0);

	SK_LOCK();
	/* locate the (visible) provider whose instances are requested */
	STAILQ_FOREACH(nxprov, &nxprov_head, nxprov_link) {
		/*
		 * Return only entries that are visible to the caller,
		 * unless it has PRIV_SKYWALK_OBSERVE_ALL.
		 */
		if (nxprov->nxprov_ctl != nxctl && !observeall) {
			continue;
		}

		if (uuid_compare(nxprov->nxprov_uuid, nlr.nl_prov_uuid) == 0) {
			break;
		}
	}

	if (nxprov != NULL) {
		/*
		 * Count number of Nexus.  If buffer space exists
		 * and remains, copy out the Nexus UUIDs.
		 */
		nuuids = nlr.nl_num_nx_uuids;
		puuid = uuids;

		STAILQ_FOREACH(nx, &nxprov->nxprov_nx_head, nx_prov_link) {
			/* always count; copy only while space remains */
			++ncuuids;
			if (uuids != NULL && nuuids > 0) {
				uuid_copy(*puuid, nx->nx_uuid);
				--nuuids;
				++puuid;
			}
		}
	} else {
		err = ENOENT;
	}
	SK_UNLOCK();

	if (uuids != NULL) {
		if (err == 0 && nxprov != NULL && tmp_ptr != USER_ADDR_NULL) {
			uintptr_t cnt_uuid;

			/* Note: Pointer arithmetic */
			cnt_uuid = (uintptr_t)(puuid - uuids);
			if (cnt_uuid > 0) {
				/* kernel callers pass a kernel VA */
				if (sopt->sopt_p != kernproc) {
					err = copyout(uuids, tmp_ptr,
					    cnt_uuid * sizeof(uuid_t));
				} else {
					bcopy(uuids,
					    CAST_DOWN(caddr_t, tmp_ptr),
					    cnt_uuid * sizeof(uuid_t));
				}
			}
		}
		sk_free_data(uuids, uuids_sz);
		uuids = NULL;
	}

	/* report the total instance count back to the caller */
	if (err == 0) {
		nlr.nl_num_nx_uuids = ncuuids;
		err = sooptcopyout(sopt, &nlr, sizeof(nlr));
	}

	return err;
}
608 
/* Hoisted out of line to reduce kernel stack footprint */
SK_NO_INLINE_ATTRIBUTE
static int
nxctl_nexus_bind(struct nxctl *nxctl, struct sockopt *sopt)
{
	boolean_t m_pid, m_exec_uuid, m_key;
	struct nx_bind_req nbr;
	struct proc *p = PROC_NULL;
	struct nxbind *nxb = NULL;
	uint64_t p_uniqueid = -1;
	pid_t p_pid = -1;
	struct kern_nexus *nx = NULL;
#if SK_LOG
	uuid_string_t exec_uuidstr;
#endif /* SK_LOG */
	uuid_t p_uuid;
	void *key = NULL;
	int err = 0;

	NXCTL_LOCK_ASSERT_HELD(nxctl);

	if (sopt->sopt_val == USER_ADDR_NULL) {
		return EINVAL;
	}

	/* fetch the bind request (nexus UUID, port, match criteria) */
	uuid_clear(p_uuid);
	bzero(&nbr, sizeof(nbr));
	err = sooptcopyin(sopt, &nbr, sizeof(nbr), sizeof(nbr));
	if (err != 0) {
		return err;
	}

	if (uuid_is_null(nbr.nb_nx_uuid)) {
		err = EINVAL;
		goto done_unlocked;
	}

	nbr.nb_flags &= NBR_MATCH_MASK;
	if (nbr.nb_flags == 0) {
		/* must choose one of the match criteria */
		err = EINVAL;
		goto done_unlocked;
	}
	m_pid = !!(nbr.nb_flags & NBR_MATCH_PID);
	m_exec_uuid = !!(nbr.nb_flags & NBR_MATCH_EXEC_UUID);
	m_key = !!(nbr.nb_flags & NBR_MATCH_KEY);

	if (m_pid || m_exec_uuid) {
		/*
		 * Validate process ID.  A valid PID is needed when we're
		 * asked to match by PID, or if asked to match by executable
		 * UUID with a NULL nb_exec_uuid supplied.  The latter is
		 * to support the case when a userland Nexus provider isn't
		 * able to acquire its client's executable UUID, but is
		 * able to identify it via PID.
		 */
		if ((m_pid || uuid_is_null(nbr.nb_exec_uuid)) &&
		    (p = proc_find(nbr.nb_pid)) == PROC_NULL) {
			err = ESRCH;
			goto done_unlocked;
		}
		/* exclude kernel from the match criteria */
		if (p == kernproc) {
			err = EACCES;
			goto done_unlocked;
		} else if (p != PROC_NULL) {
			/* derive the identity from the resolved process */
			proc_getexecutableuuid(p, p_uuid, sizeof(p_uuid));
			p_uniqueid = proc_uniqueid(p);
			p_pid = proc_pid(p);
		} else {
			/* caller supplied the executable UUID directly */
			uuid_copy(p_uuid, nbr.nb_exec_uuid);
		}
	}

	if (m_key) {
		/* key must be present and of a bounded length */
		if (nbr.nb_key_len == 0 || nbr.nb_key_len > NEXUS_MAX_KEY_LEN ||
		    nbr.nb_key == USER_ADDR_NULL) {
			err = EINVAL;
			goto done_unlocked;
		}

		key = sk_alloc_data(nbr.nb_key_len, Z_WAITOK, skmem_tag_nx_key);
		if (__improbable(key == NULL)) {
			err = ENOMEM;
			goto done_unlocked;
		}

		/* kernel callers pass a kernel VA in nb_key */
		if (sopt->sopt_p != kernproc) {
			err = copyin(nbr.nb_key, key, nbr.nb_key_len);
			if (err != 0) {
				goto done_unlocked;
			}
		} else {
			bcopy((void *)nbr.nb_key, key, nbr.nb_key_len);
		}
	}

	SK_LOCK();
	/* returns the nexus with a reference held, if found */
	nx = nx_find(nbr.nb_nx_uuid, TRUE);
	if (nx == NULL || (disable_nxctl_check == 0 &&
	    nx->nx_prov->nxprov_ctl != nxctl &&
	    nxctl != &_kernnxctl)) {    /* make exception for kernnxctl */
		err = ENOENT;
		goto done;
	}

	/* bind isn't applicable on anonymous nexus provider */
	if (NX_ANONYMOUS_PROV(nx)) {
		err = ENXIO;
		goto done;
	}

	/* port must be within the domain's range */
	if (nbr.nb_port != NEXUS_PORT_ANY &&
	    nbr.nb_port >= NXDOM_MAX(NX_DOM(nx), ports)) {
		err = EDOM;
		goto done;
	} else if (nbr.nb_port == NEXUS_PORT_ANY) {
		/* for now, this is allowed only for kernel clients */
		if (sopt->sopt_p != kernproc) {
			err = EPERM;
			goto done;
		}
	}

	/* assemble the nxbind from the validated match criteria */
	nxb = nxb_alloc(Z_WAITOK);

	if (m_pid) {
		nxb->nxb_flags |= NXBF_MATCH_UNIQUEID;
		nxb->nxb_uniqueid = p_uniqueid;
		nxb->nxb_pid = p_pid;
	}
	if (m_exec_uuid) {
		nxb->nxb_flags |= NXBF_MATCH_EXEC_UUID;
		ASSERT(!uuid_is_null(p_uuid));
		uuid_copy(nxb->nxb_exec_uuid, p_uuid);
	}
	if (m_key) {
		nxb->nxb_flags |= NXBF_MATCH_KEY;
		ASSERT(key != NULL);
		nxb->nxb_key = key;
		key = NULL;     /* let nxb_free() free it */
		ASSERT(nbr.nb_key_len != 0 &&
		    nbr.nb_key_len <= NEXUS_MAX_KEY_LEN);
		nxb->nxb_key_len = nbr.nb_key_len;
	}

	/*
	 * Bind the creds to the nexus port.  If client doesn't have a port,
	 * find one, claim it, and associate the creds to it.  Upon success,
	 * the nexus may move the nxbind contents (including the key) to
	 * its own nxbind instance; in that case, nxb_free() below will not
	 * be freeing the key within.
	 */
	err = NX_DOM(nx)->nxdom_bind_port(nx, &nbr.nb_port, nxb, NULL);
	if (err != 0) {
		goto done;
	}

	/* report the (possibly newly-claimed) port back to the caller */
	ASSERT(nbr.nb_port != NEXUS_PORT_ANY);
	(void) sooptcopyout(sopt, &nbr, sizeof(nbr));

	SK_D("nexus 0x%llx nxb 0x%llx port %u flags 0x%b pid %d "
	    "(uniqueid %llu) exec_uuid %s key 0x%llx key_len %u",
	    SK_KVA(nx), SK_KVA(nxb), nbr.nb_port, nxb->nxb_flags,
	    NXBF_BITS, nxb->nxb_pid, nxb->nxb_uniqueid,
	    sk_uuid_unparse(nxb->nxb_exec_uuid, exec_uuidstr),
	    (nxb->nxb_key != NULL) ? SK_KVA(nxb->nxb_key) : 0,
	    nxb->nxb_key_len);

done:
	if (nx != NULL) {
		(void) nx_release_locked(nx);
		nx = NULL;
	}
	SK_UNLOCK();

done_unlocked:
	ASSERT(nx == NULL);

	/* nxb/key are freed here whether or not the bind succeeded */
	if (nxb != NULL) {
		nxb_free(nxb);
		nxb = NULL;
	}
	if (key != NULL) {
		sk_free_data(key, nbr.nb_key_len);
		key = NULL;
	}
	if (p != PROC_NULL) {
		proc_rele(p);
	}

	return err;
}
803 
804 /* Hoisted out of line to reduce kernel stack footprint */
805 SK_NO_INLINE_ATTRIBUTE
806 static int
nxctl_nexus_unbind(struct nxctl * nxctl,struct sockopt * sopt)807 nxctl_nexus_unbind(struct nxctl *nxctl, struct sockopt *sopt)
808 {
809 	struct nx_unbind_req nur;
810 	struct kern_nexus *nx = NULL;
811 	int err = 0;
812 
813 	NXCTL_LOCK_ASSERT_HELD(nxctl);
814 
815 	if (sopt->sopt_val == USER_ADDR_NULL) {
816 		return EINVAL;
817 	}
818 
819 	bzero(&nur, sizeof(nur));
820 	err = sooptcopyin(sopt, &nur, sizeof(nur), sizeof(nur));
821 	if (err != 0) {
822 		return err;
823 	}
824 
825 	if (uuid_is_null(nur.nu_nx_uuid)) {
826 		return EINVAL;
827 	}
828 
829 	SK_LOCK();
830 	nx = nx_find(nur.nu_nx_uuid, TRUE);
831 	if (nx == NULL || (nx->nx_prov->nxprov_ctl != nxctl &&
832 	    nxctl != &_kernnxctl)) {    /* make exception for kernnxctl */
833 		err = ENOENT;
834 		goto done;
835 	}
836 
837 	/* unbind isn't applicable on anonymous nexus provider */
838 	if (NX_ANONYMOUS_PROV(nx)) {
839 		err = ENXIO;
840 		goto done;
841 	}
842 
843 	if (nur.nu_port == NEXUS_PORT_ANY) {
844 		err = EINVAL;
845 		goto done;
846 	}
847 
848 	err = NX_DOM(nx)->nxdom_unbind_port(nx, nur.nu_port);
849 
850 done:
851 	if (nx != NULL) {
852 		(void) nx_release_locked(nx);
853 		nx = NULL;
854 	}
855 	SK_UNLOCK();
856 
857 	return err;
858 }
859 
860 /* Hoisted out of line to reduce kernel stack footprint */
861 SK_NO_INLINE_ATTRIBUTE
862 static int
nxctl_nexus_config(struct nxctl * nxctl,struct sockopt * sopt)863 nxctl_nexus_config(struct nxctl *nxctl, struct sockopt *sopt)
864 {
865 	struct kern_nexus *nx = NULL;
866 	struct nx_cfg_req ncr;
867 	int err = 0;
868 
869 	NXCTL_LOCK_ASSERT_HELD(nxctl);
870 
871 	if (sopt->sopt_val == USER_ADDR_NULL) {
872 		return EINVAL;
873 	}
874 
875 	bzero(&ncr, sizeof(ncr));
876 	err = sooptcopyin(sopt, &ncr, sizeof(ncr), sizeof(ncr));
877 	if (err != 0) {
878 		return err;
879 	}
880 
881 	if (uuid_is_null(ncr.nc_nx_uuid)) {
882 		return EINVAL;
883 	}
884 
885 	SK_LOCK();
886 	nx = nx_find(ncr.nc_nx_uuid, TRUE);
887 	if (nx == NULL || (disable_nxctl_check == 0 &&
888 	    nx->nx_prov->nxprov_ctl != nxctl &&
889 	    nxctl != &_kernnxctl)) {    /* make exception for kernnxctl */
890 		err = ENOENT;
891 		goto done;
892 	}
893 
894 	if (NX_DOM_PROV(nx)->nxdom_prov_config != NULL) {
895 		err = NX_DOM_PROV(nx)->nxdom_prov_config(NX_DOM_PROV(nx),
896 		    nx, &ncr, sopt->sopt_dir, sopt->sopt_p, nxctl->nxctl_cred);
897 	} else {
898 		err = EPERM;
899 	}
900 
901 	if (err == 0) {
902 		(void) sooptcopyout(sopt, &ncr, sizeof(ncr));
903 	}
904 done:
905 	if (nx != NULL) {
906 		(void) nx_release_locked(nx);
907 		nx = NULL;
908 	}
909 	SK_UNLOCK();
910 
911 	return err;
912 }
913 
914 struct nxbind *
nxb_alloc(zalloc_flags_t how)915 nxb_alloc(zalloc_flags_t how)
916 {
917 	struct nxbind *nxb = zalloc_flags(nxbind_zone, how | Z_ZERO);
918 
919 	if (nxb) {
920 		SK_DF(SK_VERB_MEM, "nxb 0x%llx ALLOC", SK_KVA(nxb));
921 	}
922 	return nxb;
923 }
924 
925 void
nxb_free(struct nxbind * nxb)926 nxb_free(struct nxbind *nxb)
927 {
928 	SK_DF(SK_VERB_MEM, "nxb 0x%llx key 0x%llx FREE", SK_KVA(nxb),
929 	    (nxb->nxb_key != NULL) ? SK_KVA(nxb->nxb_key) : 0);
930 
931 	if (nxb->nxb_key != NULL) {
932 		sk_free_data(nxb->nxb_key, nxb->nxb_key_len);
933 		nxb->nxb_key = NULL;
934 	}
935 	zfree(nxbind_zone, nxb);
936 }
937 
938 /*
939  * nxb0 is assumed to possess the truth, compare nxb1 against it.
940  */
941 boolean_t
nxb_is_equal(struct nxbind * nxb0,struct nxbind * nxb1)942 nxb_is_equal(struct nxbind *nxb0, struct nxbind *nxb1)
943 {
944 	ASSERT(nxb0 != NULL && nxb1 != NULL);
945 	ASSERT(nxb0 != nxb1);
946 
947 	/* we always compare using uniqueid and not pid */
948 	if ((nxb0->nxb_flags & NXBF_MATCH_UNIQUEID) &&
949 	    nxb1->nxb_uniqueid != nxb0->nxb_uniqueid) {
950 		return FALSE;
951 	}
952 
953 	if ((nxb0->nxb_flags & NXBF_MATCH_EXEC_UUID) &&
954 	    uuid_compare(nxb1->nxb_exec_uuid, nxb0->nxb_exec_uuid) != 0) {
955 		return FALSE;
956 	}
957 
958 	ASSERT(!(nxb0->nxb_flags & NXBF_MATCH_KEY) ||
959 	    (nxb0->nxb_key_len != 0 && nxb0->nxb_key != NULL));
960 
961 	if ((nxb0->nxb_flags & NXBF_MATCH_KEY) &&
962 	    (nxb0->nxb_key_len != nxb1->nxb_key_len ||
963 	    nxb1->nxb_key == NULL || timingsafe_bcmp(nxb1->nxb_key, nxb0->nxb_key,
964 	    nxb1->nxb_key_len) != 0)) {
965 		return FALSE;
966 	}
967 
968 	return TRUE;
969 }
970 
971 void
nxb_move(struct nxbind * snxb,struct nxbind * dnxb)972 nxb_move(struct nxbind *snxb, struct nxbind *dnxb)
973 {
974 	ASSERT(!(snxb->nxb_flags & NXBF_MATCH_KEY) ||
975 	    (snxb->nxb_key_len != 0 && snxb->nxb_key != NULL));
976 
977 	/* in case the destination has a key attached, free it first */
978 	if (dnxb->nxb_key != NULL) {
979 		sk_free_data(dnxb->nxb_key, dnxb->nxb_key_len);
980 		dnxb->nxb_key = NULL;
981 	}
982 
983 	/* move everything from src to dst, and then wipe out src */
984 	bcopy(snxb, dnxb, sizeof(*dnxb));
985 	bzero(snxb, sizeof(*snxb));
986 }
987 
988 /* Upper bound on # of cl_num_ch_uuids that we'd return to user space */
989 #define MAX_NUM_CH_UUIDS        4096
990 
991 /* Hoisted out of line to reduce kernel stack footprint */
992 SK_NO_INLINE_ATTRIBUTE
993 static int
nxctl_get_channel_list(struct nxctl * nxctl,struct sockopt * sopt)994 nxctl_get_channel_list(struct nxctl *nxctl, struct sockopt *sopt)
995 {
996 	user_addr_t tmp_ptr = USER_ADDR_NULL;
997 	uint32_t nuuids = 0, ncuuids = 0;
998 	uuid_t *puuid, *uuids = NULL;
999 	size_t uuids_sz;
1000 	struct ch_list_req clr;
1001 	struct kern_channel *ch = NULL;
1002 	struct kern_nexus *nx = NULL;
1003 	struct kern_nexus find;
1004 	int err = 0, observeall;
1005 
1006 	NXCTL_LOCK_ASSERT_HELD(nxctl);
1007 
1008 	ASSERT(sopt->sopt_p != NULL);
1009 	if (sopt->sopt_val == USER_ADDR_NULL) {
1010 		return EINVAL;
1011 	}
1012 
1013 	err = sooptcopyin(sopt, &clr, sizeof(clr), sizeof(clr));
1014 	if (err != 0) {
1015 		return err;
1016 	}
1017 
1018 	if (uuid_is_null(clr.cl_nx_uuid)) {
1019 		return EINVAL;
1020 	} else if ((size_t)clr.cl_num_ch_uuids > MAX_NUM_CH_UUIDS) {
1021 		clr.cl_num_ch_uuids = MAX_NUM_CH_UUIDS;
1022 	}
1023 
1024 	/*
1025 	 * If the caller specified a buffer, copy out the Channel UUIDs to
1026 	 * caller gracefully.  We only copy out the number of UUIDs which
1027 	 * caller has asked for, but we always tell caller how big the
1028 	 * buffer really needs to be.
1029 	 */
1030 	tmp_ptr = clr.cl_ch_uuids;
1031 	if (tmp_ptr != USER_ADDR_NULL && clr.cl_num_ch_uuids > 0) {
1032 		uuids_sz = (size_t)clr.cl_num_ch_uuids * sizeof(uuid_t);
1033 		uuids = sk_alloc_data(uuids_sz, Z_WAITOK, skmem_tag_sysctl_buf);
1034 		if (uuids == NULL) {
1035 			return ENOBUFS;
1036 		}
1037 	}
1038 
1039 	observeall = (skywalk_priv_check_cred(sopt->sopt_p, nxctl->nxctl_cred,
1040 	    PRIV_SKYWALK_OBSERVE_ALL) == 0);
1041 
1042 	SK_LOCK();
1043 	uuid_copy(find.nx_uuid, clr.cl_nx_uuid);
1044 	nx = RB_FIND(kern_nexus_tree, &nx_head, &find);
1045 	if (nx != NULL && NX_PROV(nx)->nxprov_ctl != nxctl && !observeall) {
1046 		/*
1047 		 * Return only entries that are visible to the caller,
1048 		 * unless it has PRIV_SKYWALK_OBSERVE_ALL.
1049 		 */
1050 		nx = NULL;
1051 	}
1052 	if (nx != NULL) {
1053 		/*
1054 		 * Count number of Channels.  If buffer space exists
1055 		 * and remains, copy out the Channel UUIDs.
1056 		 */
1057 		nuuids = clr.cl_num_ch_uuids;
1058 		puuid = uuids;
1059 
1060 		STAILQ_FOREACH(ch, &nx->nx_ch_head, ch_link) {
1061 			++ncuuids;
1062 			if (uuids != NULL && nuuids > 0) {
1063 				uuid_copy(*puuid, ch->ch_info->cinfo_ch_id);
1064 				--nuuids;
1065 				++puuid;
1066 			}
1067 		}
1068 	} else {
1069 		err = ENOENT;
1070 	}
1071 	SK_UNLOCK();
1072 
1073 	if (uuids != NULL) {
1074 		if (err == 0 && nx != NULL && tmp_ptr != USER_ADDR_NULL) {
1075 			uintptr_t cnt_uuid;
1076 
1077 			/* Note: Pointer arithmetic */
1078 			cnt_uuid = (uintptr_t)(puuid - uuids);
1079 			ASSERT(cnt_uuid > 0);
1080 
1081 			if (sopt->sopt_p != kernproc) {
1082 				err = copyout(uuids, tmp_ptr,
1083 				    cnt_uuid * sizeof(uuid_t));
1084 			} else {
1085 				bcopy(uuids, CAST_DOWN(caddr_t, tmp_ptr),
1086 				    cnt_uuid * sizeof(uuid_t));
1087 			}
1088 		}
1089 		sk_free_data(uuids, uuids_sz);
1090 		uuids = NULL;
1091 	}
1092 
1093 	if (err == 0) {
1094 		clr.cl_num_ch_uuids = ncuuids;
1095 		err = sooptcopyout(sopt, &clr, sizeof(clr));
1096 	}
1097 
1098 	return err;
1099 }
1100 
1101 static void
nxctl_init(struct nxctl * nxctl,struct proc * p,struct fileproc * fp)1102 nxctl_init(struct nxctl *nxctl, struct proc *p, struct fileproc *fp)
1103 {
1104 	uuid_t p_uuid;
1105 
1106 	bzero(nxctl, sizeof(*nxctl));
1107 
1108 	proc_getexecutableuuid(p, p_uuid, sizeof(p_uuid));
1109 
1110 	lck_mtx_init(&nxctl->nxctl_lock, &nexus_lock_group, &nexus_lock_attr);
1111 	uuid_copy(nxctl->nxctl_proc_uuid, p_uuid);
1112 	nxctl->nxctl_proc_uniqueid = proc_uniqueid(p);
1113 	nxctl->nxctl_cred = kauth_cred_proc_ref(p);
1114 	nxctl->nxctl_fp = fp;
1115 	if (nxctl == &_kernnxctl) {
1116 		ASSERT(p == kernproc);
1117 		nxctl->nxctl_flags |= NEXUSCTLF_KERNEL;
1118 	}
1119 	if (fp == NULL) {
1120 		nxctl->nxctl_flags |= NEXUSCTLF_NOFDREF;
1121 	}
1122 }
1123 
1124 static struct nxctl *
nxctl_alloc(struct proc * p,struct fileproc * fp,zalloc_flags_t how)1125 nxctl_alloc(struct proc *p, struct fileproc *fp, zalloc_flags_t how)
1126 {
1127 	struct nxctl *nxctl = zalloc_flags(nxctl_zone, how);
1128 
1129 	if (nxctl != NULL) {
1130 		nxctl_init(nxctl, p, fp);
1131 	}
1132 	return nxctl;
1133 }
1134 
1135 static void
nxctl_free(struct nxctl * nxctl)1136 nxctl_free(struct nxctl *nxctl)
1137 {
1138 	ASSERT(nxctl->nxctl_refcnt == 0);
1139 	ASSERT(!(nxctl->nxctl_flags & NEXUSCTLF_ATTACHED));
1140 	kauth_cred_unref(&nxctl->nxctl_cred);
1141 	lck_mtx_destroy(&nxctl->nxctl_lock, &nexus_lock_group);
1142 	SK_D("nxctl 0x%llx FREE", SK_KVA(nxctl));
1143 	if (!(nxctl->nxctl_flags & NEXUSCTLF_KERNEL)) {
1144 		zfree(nxctl_zone, nxctl);
1145 	}
1146 }
1147 
1148 static void
nxctl_retain_locked(struct nxctl * nxctl)1149 nxctl_retain_locked(struct nxctl *nxctl)
1150 {
1151 	SK_LOCK_ASSERT_HELD();
1152 
1153 	nxctl->nxctl_refcnt++;
1154 	ASSERT(nxctl->nxctl_refcnt != 0);
1155 }
1156 
/* Unlocked wrapper: take sk_lock around nxctl_retain_locked(). */
void
nxctl_retain(struct nxctl *nxctl)
{
	SK_LOCK();
	nxctl_retain_locked(nxctl);
	SK_UNLOCK();
}
1164 
1165 static int
nxctl_release_locked(struct nxctl * nxctl)1166 nxctl_release_locked(struct nxctl *nxctl)
1167 {
1168 	int oldref = nxctl->nxctl_refcnt;
1169 
1170 	SK_LOCK_ASSERT_HELD();
1171 
1172 	ASSERT(nxctl->nxctl_refcnt != 0);
1173 	if (--nxctl->nxctl_refcnt == 0) {
1174 		nxctl_free(nxctl);
1175 	}
1176 
1177 	return oldref == 1;
1178 }
1179 
/*
 * Unlocked wrapper around nxctl_release_locked(); returns nonzero iff
 * the last reference was dropped.
 */
int
nxctl_release(struct nxctl *nxctl)
{
	int last;

	SK_LOCK();
	last = nxctl_release_locked(nxctl);
	SK_UNLOCK();

	return last;
}
1191 
/*
 * Destructor callback: close the control instance, then drop the
 * reference it held.
 */
void
nxctl_dtor(void *arg)
{
	struct nxctl *nxctl = (struct nxctl *)arg;

	nxctl_close(nxctl);

	SK_LOCK();
	(void) nxctl_release_locked(nxctl);
	SK_UNLOCK();
}
1202 
/*
 * Notify an external nexus provider that a channel is connecting.
 *
 * Sequence: nxpi_pre_connect callback, then ring/slot initialization
 * (or the default logical-link queues for llink providers), then the
 * nxpi_connected callback.  If anything fails after pre_connect has
 * succeeded, the partial state is rolled back via
 * nxprov_advise_disconnect() before returning.
 *
 * Called with sk_lock and the channel lock held.  Both are dropped and
 * reacquired around the provider callbacks, so an extra channel
 * reference is taken to keep the channel alive in the interim.
 *
 * Returns 0 on success (or if the channel/provider combination needs
 * no callbacks), otherwise the error from the callbacks or ring setup.
 */
int
nxprov_advise_connect(struct kern_nexus *nx, struct kern_channel *ch,
    struct proc *p)
{
	struct kern_nexus_provider *nxprov = NX_PROV(nx);
	int err = 0;

	ASSERT(!(ch->ch_flags & (CHANF_EXT_PRECONNECT | CHANF_EXT_CONNECTED)));
	ASSERT(ch->ch_ctx == NULL);

	SK_LOCK_ASSERT_HELD();
	LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);

	/* monitor channels aren't externally visible/usable, so ignore */
	if ((ch->ch_info->cinfo_ch_mode & CHMODE_MONITOR) ||
	    (ch->ch_flags & CHANF_EXT_SKIP) ||
	    (nxprov->nxprov_ext.nxpi_pre_connect == NULL ||
	    nxprov->nxprov_ext.nxpi_connected == NULL)) {
		return 0;
	}

	/* keep the channel alive while sk_lock is dropped below */
	ch_retain_locked(ch);
	lck_mtx_unlock(&ch->ch_lock);
	SK_UNLOCK();
	lck_mtx_lock(&ch->ch_lock);

	/* provider may store its per-channel context in ch->ch_ctx */
	err = nxprov->nxprov_ext.nxpi_pre_connect(nxprov, p, nx,
	    ch->ch_info->cinfo_nx_port, ch, &ch->ch_ctx);
	if (err != 0) {
		SK_D("ch 0x%llx flags %b nx 0x%llx pre_connect "
		    "error %d", SK_KVA(ch), ch->ch_flags,
		    CHANF_BITS, SK_KVA(nx), err);
		/* discard anything the failed callback may have set */
		ch->ch_ctx = NULL;
		goto done;
	}
	/*
	 * Upon ring/slot init failure, this is cleared
	 * by nxprov_advise_disconnect() below.
	 */
	atomic_bitset_32(&ch->ch_flags, CHANF_EXT_PRECONNECT);
	if (NXPROV_LLINK(nxprov)) {
		err = nx_netif_llink_ext_init_default_queues(nx);
	} else {
		err = nx_init_rings(nx, ch);
	}
	if (err != 0) {
		goto done;
	}
	ASSERT(err == 0);
	ASSERT((ch->ch_flags & (CHANF_EXT_PRECONNECT |
	    CHANF_EXT_CONNECTED)) == CHANF_EXT_PRECONNECT);

	err = nxprov->nxprov_ext.nxpi_connected(nxprov, nx, ch);
	if (err != 0) {
		SK_D("ch 0x%llx flags %b nx 0x%llx connected error %d",
		    SK_KVA(ch), ch->ch_flags, CHANF_BITS, SK_KVA(nx), err);
		goto done;
	}
	atomic_bitset_32(&ch->ch_flags, CHANF_EXT_CONNECTED);
	SK_D("ch 0x%llx flags %b nx 0x%llx connected",
	    SK_KVA(ch), ch->ch_flags, CHANF_BITS, SK_KVA(nx));


done:
	/* drop the channel lock, retake sk_lock first, then relock */
	lck_mtx_unlock(&ch->ch_lock);
	SK_LOCK();
	lck_mtx_lock(&ch->ch_lock);
	if ((err != 0) &&
	    (ch->ch_flags & (CHANF_EXT_CONNECTED | CHANF_EXT_PRECONNECT))) {
		/* roll back whatever stage of the connect we reached */
		nxprov_advise_disconnect(nx, ch);
	}
	/* caller is expected to hold one, in addition to ourselves */
	VERIFY(ch->ch_refcnt >= 2);
	ch_release_locked(ch);

	return err;
}
1280 
/*
 * Notify an external nexus provider that a channel is disconnecting;
 * the inverse of nxprov_advise_connect().  Also invoked on that
 * function's error path, hence the flag check rather than an assert.
 *
 * Calls nxpi_pre_disconnect (only if fully connected), tears down the
 * rings or default llink queues, then calls nxpi_disconnected.  As in
 * the connect path, sk_lock and the channel lock are dropped and
 * reacquired around the callbacks with an extra channel reference
 * held.  On return the channel carries neither CHANF_EXT_CONNECTED nor
 * CHANF_EXT_PRECONNECT, and its provider context is cleared.
 */
void
nxprov_advise_disconnect(struct kern_nexus *nx, struct kern_channel *ch)
{
	struct kern_nexus_provider *nxprov = NX_PROV(nx);

	SK_LOCK_ASSERT_HELD();
	LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);

	/* check as we might be called in the error handling path */
	if (ch->ch_flags & (CHANF_EXT_CONNECTED | CHANF_EXT_PRECONNECT)) {
		/* keep the channel alive while sk_lock is dropped */
		ch_retain_locked(ch);
		lck_mtx_unlock(&ch->ch_lock);
		SK_UNLOCK();
		lck_mtx_lock(&ch->ch_lock);

		ASSERT(!(ch->ch_flags & CHANF_EXT_SKIP));
		if (ch->ch_flags & CHANF_EXT_CONNECTED) {
			nxprov->nxprov_ext.nxpi_pre_disconnect(nxprov, nx, ch);
			atomic_bitclear_32(&ch->ch_flags, CHANF_EXT_CONNECTED);
		}

		/*
		 * Inform the external domain provider that the rings
		 * and slots for this channel are no longer valid.
		 */
		if (NXPROV_LLINK(nxprov)) {
			nx_netif_llink_ext_fini_default_queues(nx);
		} else {
			nx_fini_rings(nx, ch);
		}

		ASSERT(ch->ch_flags & CHANF_EXT_PRECONNECT);
		nxprov->nxprov_ext.nxpi_disconnected(nxprov, nx, ch);
		atomic_bitclear_32(&ch->ch_flags, CHANF_EXT_PRECONNECT);

		SK_D("ch 0x%llx flags %b nx 0x%llx disconnected",
		    SK_KVA(ch), ch->ch_flags, CHANF_BITS, SK_KVA(nx));

		/* We're done with this channel */
		ch->ch_ctx = NULL;

		lck_mtx_unlock(&ch->ch_lock);
		SK_LOCK();
		lck_mtx_lock(&ch->ch_lock);
		/* caller is expected to hold one, in addition to ourselves */
		VERIFY(ch->ch_refcnt >= 2);
		ch_release_locked(ch);
	}
	ASSERT(!(ch->ch_flags & (CHANF_EXT_CONNECTED | CHANF_EXT_PRECONNECT)));
	ASSERT(ch->ch_ctx == NULL);
}
1332 
/*
 * Common nexus provider creation path shared by nxprov_create()
 * (userland) and nxprov_create_kern() (in-kernel callers).
 *
 * Validates the registration parameters against the domain provider,
 * allocates the provider, links it onto the global provider list, and
 * stores the validated per-region memory parameters.  On success, two
 * references are held: one for the provider list and one returned to
 * the caller.  On failure, returns NULL with *err set.
 *
 * `init' is non-NULL only for external (kernel-extension) providers;
 * it selects the external-callback variant and packet-pool region
 * configuration flags.  Called with sk_lock held.
 */
static struct kern_nexus_provider *
nxprov_create_common(struct nxctl *nxctl,
    struct kern_nexus_domain_provider *nxdom_prov, struct nxprov_reg *reg,
    const struct kern_nexus_provider_init *init, int *err)
{
	struct skmem_region_params srp[SKMEM_REGIONS];
	struct kern_nexus_provider *nxprov = NULL;
	struct nxprov_params nxp;
	uint32_t override = 0;
	uint32_t pp_region_config_flags;
	int i;

	/* the netif init variant must fit inside the generic one */
	_CASSERT(sizeof(*init) == sizeof(nxprov->nxprov_ext));
	_CASSERT(sizeof(*init) >=
	    sizeof(struct kern_nexus_netif_provider_init));

	SK_LOCK_ASSERT_HELD();
	ASSERT(nxctl != NULL && reg != NULL && nxdom_prov != NULL);

	pp_region_config_flags = PP_REGION_CONFIG_MD_MAGAZINE_ENABLE |
	    PP_REGION_CONFIG_BUF_IODIR_BIDIR;
	/*
	 * Special handling for external nexus providers; similar
	 * logic to what's done in kern_pbufpool_create().
	 */
	if (init != NULL) {
		if (init->nxpi_flags & NXPIF_MONOLITHIC) {
			pp_region_config_flags |=
			    PP_REGION_CONFIG_BUF_MONOLITHIC;
		}

		if (init->nxpi_flags & NXPIF_INHIBIT_CACHE) {
			pp_region_config_flags |=
			    PP_REGION_CONFIG_BUF_NOCACHE;
		}
	}

	/*
	 * For network devices, set the packet metadata memory as persistent
	 * so that it is wired at segment creation.  This allows us to access
	 * it with preemption disabled, as well as for rdar://problem/46511741.
	 */
	if (nxdom_prov->nxdom_prov_dom->nxdom_type == NEXUS_TYPE_NET_IF) {
		pp_region_config_flags |= PP_REGION_CONFIG_MD_PERSISTENT;
	}

	/* process and validate provider parameters */
	if ((*err = nxdom_prov_validate_params(nxdom_prov, reg,
	    &nxp, srp, override, pp_region_config_flags)) != 0) {
		goto done;
	}

	nxprov = nxprov_alloc(nxdom_prov, Z_WAITOK);
	ASSERT(nxprov->nxprov_dom_prov == nxdom_prov);

	/* attach to the global provider list */
	STAILQ_INIT(&nxprov->nxprov_nx_head);
	STAILQ_INSERT_TAIL(&nxprov_head, nxprov, nxprov_link);
	nxprov->nxprov_flags |= NXPROVF_ATTACHED;
	nxprov->nxprov_ctl = nxctl;
	uuid_generate_random(nxprov->nxprov_uuid);
	bcopy(&nxp, nxprov->nxprov_params, sizeof(struct nxprov_params));

	if (init != NULL) {
		/* record the external callbacks in the matching variant */
		if (init->nxpi_version == KERN_NEXUS_PROVIDER_VERSION_NETIF) {
			ASSERT(NXPROV_LLINK(nxprov));
			bcopy(init, &nxprov->nxprov_netif_ext,
			    sizeof(nxprov->nxprov_netif_ext));
		} else {
			ASSERT(!NXPROV_LLINK(nxprov));
			ASSERT(init->nxpi_version ==
			    KERN_NEXUS_PROVIDER_CURRENT_VERSION);
			bcopy(init, &nxprov->nxprov_ext, sizeof(*init));
		}
		nxprov->nxprov_flags |= NXPROVF_EXTERNAL;
	}

	/* store validated region parameters to the provider */
	for (i = 0; i < SKMEM_REGIONS; i++) {
		nxprov->nxprov_region_params[i] = srp[i];
	}

	if (nxprov->nxprov_flags & NXPROVF_EXTERNAL) {
		uint32_t nxpi_flags = nxprov->nxprov_ext.nxpi_flags;

		if (nxpi_flags & NXPIF_VIRTUAL_DEVICE) {
			nxprov->nxprov_flags |= NXPROVF_VIRTUAL_DEVICE;
		}
	} else if (nxdom_prov->nxdom_prov_dom->nxdom_type !=
	    NEXUS_TYPE_NET_IF) {
		/*
		 * Treat non-netif built-in nexus providers as those
		 * meant for inter-process communications, i.e. there
		 * is no actual networking hardware involved.
		 */
		nxprov->nxprov_flags |= NXPROVF_VIRTUAL_DEVICE;
	}

	nxprov_retain_locked(nxprov);   /* one for being in the list */
	nxprov_retain_locked(nxprov);   /* one for the caller */

#if SK_LOG
	uuid_string_t uuidstr;
	SK_D("nxprov 0x%llx UUID %s", SK_KVA(nxprov),
	    sk_uuid_unparse(nxprov->nxprov_uuid, uuidstr));
#endif /* SK_LOG */

done:
	return nxprov;
}
1442 
1443 struct kern_nexus_provider *
nxprov_create(struct proc * p,struct nxctl * nxctl,struct nxprov_reg * reg,int * err)1444 nxprov_create(struct proc *p, struct nxctl *nxctl, struct nxprov_reg *reg,
1445     int *err)
1446 {
1447 	struct nxprov_params *nxp = &reg->nxpreg_params;
1448 	struct kern_nexus_domain_provider *nxdom_prov = NULL;
1449 	struct kern_nexus_provider *nxprov = NULL;
1450 
1451 	NXCTL_LOCK_ASSERT_HELD(nxctl);
1452 
1453 	ASSERT(nxctl->nxctl_cred != proc_ucred(kernproc));
1454 	*err = 0;
1455 
1456 	switch (nxp->nxp_type) {
1457 	case NEXUS_TYPE_USER_PIPE:      /* only for userland */
1458 		*err = skywalk_priv_check_cred(p, nxctl->nxctl_cred,
1459 		    PRIV_SKYWALK_REGISTER_USER_PIPE);
1460 		break;
1461 
1462 	case NEXUS_TYPE_FLOW_SWITCH:    /* allowed for userland */
1463 		*err = skywalk_priv_check_cred(p, nxctl->nxctl_cred,
1464 		    PRIV_SKYWALK_REGISTER_FLOW_SWITCH);
1465 		break;
1466 
1467 	case NEXUS_TYPE_NET_IF:         /* allowed for userland */
1468 		*err = skywalk_priv_check_cred(p, nxctl->nxctl_cred,
1469 		    PRIV_SKYWALK_REGISTER_NET_IF);
1470 		break;
1471 
1472 	case NEXUS_TYPE_KERNEL_PIPE:    /* only for kernel */
1473 	case NEXUS_TYPE_MONITOR:        /* invalid */
1474 	default:
1475 		*err = EINVAL;
1476 		goto done;
1477 	}
1478 
1479 	if (*err != 0) {
1480 		goto done;
1481 	}
1482 
1483 	ASSERT(nxp->nxp_type < NEXUS_TYPE_MAX);
1484 	if ((nxdom_prov = nxdom_prov_default[nxp->nxp_type]) == NULL) {
1485 		*err = ENXIO;
1486 		goto done;
1487 	}
1488 
1489 #if CONFIG_NEXUS_NETIF
1490 	/* make sure netif_compat is the default here */
1491 	ASSERT(nxp->nxp_type != NEXUS_TYPE_NET_IF ||
1492 	    strcmp(nxdom_prov->nxdom_prov_name,
1493 	    NEXUS_PROVIDER_NET_IF_COMPAT) == 0);
1494 #endif /* CONFIG_NEXUS_NETIF */
1495 
1496 	SK_LOCK();
1497 	/* callee holds a reference for our caller upon success */
1498 	nxprov = nxprov_create_common(nxctl, nxdom_prov, reg, NULL, err);
1499 	SK_UNLOCK();
1500 done:
1501 	return nxprov;
1502 }
1503 
1504 struct kern_nexus_provider *
nxprov_create_kern(struct nxctl * nxctl,struct kern_nexus_domain_provider * nxdom_prov,struct nxprov_reg * reg,const struct kern_nexus_provider_init * init,int * err)1505 nxprov_create_kern(struct nxctl *nxctl,
1506     struct kern_nexus_domain_provider *nxdom_prov, struct nxprov_reg *reg,
1507     const struct kern_nexus_provider_init *init, int *err)
1508 {
1509 	struct nxprov_params *nxp = &reg->nxpreg_params;
1510 	struct kern_nexus_provider *nxprov = NULL;
1511 
1512 	NXCTL_LOCK_ASSERT_HELD(nxctl);
1513 	SK_LOCK_ASSERT_HELD();
1514 
1515 	ASSERT(nxctl->nxctl_cred == proc_ucred(kernproc));
1516 	ASSERT(nxp->nxp_type == nxdom_prov->nxdom_prov_dom->nxdom_type);
1517 	ASSERT(init == NULL ||
1518 	    init->nxpi_version == KERN_NEXUS_PROVIDER_CURRENT_VERSION ||
1519 	    init->nxpi_version == KERN_NEXUS_PROVIDER_VERSION_NETIF);
1520 
1521 	*err = 0;
1522 
1523 	switch (nxp->nxp_type) {
1524 	case NEXUS_TYPE_NET_IF:
1525 		break;
1526 	case NEXUS_TYPE_KERNEL_PIPE:
1527 		if (init == NULL) {
1528 			*err = EINVAL;
1529 			goto done;
1530 		}
1531 		break;
1532 	case NEXUS_TYPE_FLOW_SWITCH:
1533 		if (init != NULL) {
1534 			*err = EINVAL;
1535 			goto done;
1536 		}
1537 		break;
1538 
1539 	case NEXUS_TYPE_USER_PIPE:      /* only for userland */
1540 	case NEXUS_TYPE_MONITOR:        /* invalid */
1541 	default:
1542 		*err = EINVAL;
1543 		goto done;
1544 	}
1545 
1546 	/* callee holds a reference for our caller upon success */
1547 	nxprov = nxprov_create_common(nxctl, nxdom_prov, reg, init, err);
1548 
1549 done:
1550 	return nxprov;
1551 }
1552 
1553 int
nxprov_destroy(struct nxctl * nxctl,const uuid_t nxprov_uuid)1554 nxprov_destroy(struct nxctl *nxctl, const uuid_t nxprov_uuid)
1555 {
1556 	struct kern_nexus_provider *nxprov = NULL;
1557 	int err = 0;
1558 
1559 	NXCTL_LOCK_ASSERT_HELD(nxctl);
1560 
1561 	SK_LOCK();
1562 
1563 	STAILQ_FOREACH(nxprov, &nxprov_head, nxprov_link) {
1564 		if (nxctl == nxprov->nxprov_ctl &&
1565 		    uuid_compare(nxprov_uuid, nxprov->nxprov_uuid) == 0) {
1566 			nxprov_retain_locked(nxprov);
1567 			break;
1568 		}
1569 	}
1570 
1571 	if (nxprov == NULL) {
1572 		err = ENOENT;
1573 	} else {
1574 		err = nxprov_close(nxprov, TRUE);
1575 	}
1576 
1577 	if (nxprov != NULL) {
1578 		(void) nxprov_release_locked(nxprov);
1579 	}
1580 
1581 	SK_UNLOCK();
1582 
1583 	return err;
1584 }
1585 
/*
 * Close a nexus provider: close every nexus created on it, then either
 * detach it immediately (no nexus remains) or mark it NXPROVF_CLOSED
 * so the final nx_detach() completes the detach later.
 *
 * Returns EALREADY if the provider was already closed, else 0.
 * Takes sk_lock itself unless `locked' says the caller holds it.
 */
int
nxprov_close(struct kern_nexus_provider *nxprov, boolean_t locked)
{
	int err = 0;

	if (!locked) {
		SK_LOCK();
	}

	SK_LOCK_ASSERT_HELD();

#if SK_LOG
	uuid_string_t uuidstr;
	SK_D("nxprov 0x%llx UUID %s flags 0x%b", SK_KVA(nxprov),
	    sk_uuid_unparse(nxprov->nxprov_uuid, uuidstr),
	    nxprov->nxprov_flags, NXPROVF_BITS);
#endif /* SK_LOG */

	if (nxprov->nxprov_flags & NXPROVF_CLOSED) {
		err = EALREADY;
	} else {
		struct kern_nexus *nx, *tnx;

		/* sever the link back to the owning control instance */
		nxprov->nxprov_ctl = NULL;

		/* close every nexus created on this provider */
		STAILQ_FOREACH_SAFE(nx, &nxprov->nxprov_nx_head,
		    nx_prov_link, tnx) {
			nx_retain_locked(nx);
			(void) nx_close(nx, TRUE);
			(void) nx_release_locked(nx);
		}

		if (STAILQ_EMPTY(&nxprov->nxprov_nx_head)) {
			/* no nexus created on this, so detach now */
			nxprov_detach(nxprov, TRUE);
		} else {
			/* detach when last nexus is destroyed */
			ASSERT(nxprov->nxprov_refcnt > 1);
			nxprov->nxprov_flags |= NXPROVF_CLOSED;
		}
	}

	if (!locked) {
		SK_UNLOCK();
	}

	return err;
}
1634 
/*
 * Remove a provider from the global provider list and drop the list
 * reference.  Reached from nxprov_close() when the provider has no
 * nexus instances, or from nx_detach() when the last nexus of a
 * previously-closed provider goes away.  The caller must hold an
 * extra reference on top of the list's.  Takes sk_lock itself unless
 * `locked' says the caller holds it.
 */
static void
nxprov_detach(struct kern_nexus_provider *nxprov, boolean_t locked)
{
	if (!locked) {
		SK_LOCK();
	}

	SK_LOCK_ASSERT_HELD();

#if SK_LOG
	uuid_string_t uuidstr;
	SK_D("nxprov 0x%llx UUID %s flags 0x%b", SK_KVA(nxprov),
	    sk_uuid_unparse(nxprov->nxprov_uuid, uuidstr),
	    nxprov->nxprov_flags, NXPROVF_BITS);
#endif /* SK_LOG */

	ASSERT(nxprov->nxprov_flags & NXPROVF_ATTACHED);
	STAILQ_REMOVE(&nxprov_head, nxprov, kern_nexus_provider, nxprov_link);
	nxprov->nxprov_flags &= ~NXPROVF_ATTACHED;

	/* caller must hold an extra ref */
	ASSERT(nxprov->nxprov_refcnt > 1);
	(void) nxprov_release_locked(nxprov);

	if (!locked) {
		SK_UNLOCK();
	}
}
1663 
1664 static struct kern_nexus_provider *
nxprov_alloc(struct kern_nexus_domain_provider * nxdom_prov,zalloc_flags_t how)1665 nxprov_alloc(struct kern_nexus_domain_provider *nxdom_prov, zalloc_flags_t how)
1666 {
1667 	struct kern_nexus_provider *nxprov;
1668 	struct nxprov_params *nxp;
1669 
1670 	ASSERT(nxdom_prov != NULL);
1671 
1672 	nxp = nxprov_params_alloc(how);
1673 	if (nxp == NULL) {
1674 		SK_ERR("Failed to allocate nxprov_params");
1675 		return NULL;
1676 	}
1677 
1678 	nxprov = zalloc_flags(nxprov_zone, how | Z_ZERO);
1679 	if (nxprov == NULL) {
1680 		SK_ERR("Failed to allocate nxprov");
1681 		nxprov_params_free(nxp);
1682 		return NULL;
1683 	}
1684 
1685 	nxprov->nxprov_dom_prov = nxdom_prov;
1686 	nxprov->nxprov_params = nxp;
1687 	/* hold a reference for nxprov */
1688 	nxdom_prov_retain_locked(nxdom_prov);
1689 
1690 	return nxprov;
1691 }
1692 
1693 static void
nxprov_free(struct kern_nexus_provider * nxprov)1694 nxprov_free(struct kern_nexus_provider *nxprov)
1695 {
1696 	struct kern_nexus_domain_provider *nxdom_prov =
1697 	    nxprov->nxprov_dom_prov;
1698 
1699 	SK_LOCK_ASSERT_HELD();
1700 
1701 	ASSERT(nxdom_prov != NULL);
1702 	(void) nxdom_prov_release_locked(nxdom_prov);
1703 	nxprov->nxprov_dom_prov = NULL;
1704 	ASSERT(nxprov->nxprov_params != NULL);
1705 	nxprov_params_free(nxprov->nxprov_params);
1706 	nxprov->nxprov_params = NULL;
1707 	ASSERT(!(nxprov->nxprov_flags & NXPROVF_ATTACHED));
1708 	SK_DF(SK_VERB_MEM, "nxprov 0x%llx FREE", SK_KVA(nxprov));
1709 	zfree(nxprov_zone, nxprov);
1710 }
1711 
1712 static void
nxprov_retain_locked(struct kern_nexus_provider * nxprov)1713 nxprov_retain_locked(struct kern_nexus_provider *nxprov)
1714 {
1715 	SK_LOCK_ASSERT_HELD();
1716 
1717 	nxprov->nxprov_refcnt++;
1718 	ASSERT(nxprov->nxprov_refcnt != 0);
1719 }
1720 
/* Unlocked wrapper: take sk_lock around nxprov_retain_locked(). */
void
nxprov_retain(struct kern_nexus_provider *nxprov)
{
	SK_LOCK();
	nxprov_retain_locked(nxprov);
	SK_UNLOCK();
}
1728 
1729 static int
nxprov_release_locked(struct kern_nexus_provider * nxprov)1730 nxprov_release_locked(struct kern_nexus_provider *nxprov)
1731 {
1732 	int oldref = nxprov->nxprov_refcnt;
1733 
1734 	SK_LOCK_ASSERT_HELD();
1735 
1736 	ASSERT(nxprov->nxprov_refcnt != 0);
1737 	if (--nxprov->nxprov_refcnt == 0) {
1738 		nxprov_free(nxprov);
1739 	}
1740 
1741 	return oldref == 1;
1742 }
1743 
/*
 * Unlocked wrapper around nxprov_release_locked(); returns nonzero
 * iff the last reference was dropped.
 */
int
nxprov_release(struct kern_nexus_provider *nxprov)
{
	int last;

	SK_LOCK();
	last = nxprov_release_locked(nxprov);
	SK_UNLOCK();

	return last;
}
1755 
1756 struct nxprov_params *
nxprov_params_alloc(zalloc_flags_t how)1757 nxprov_params_alloc(zalloc_flags_t how)
1758 {
1759 	return zalloc_flags(nxprov_params_zone, how | Z_ZERO);
1760 }
1761 
1762 void
nxprov_params_free(struct nxprov_params * nxp)1763 nxprov_params_free(struct nxprov_params *nxp)
1764 {
1765 	SK_DF(SK_VERB_MEM, "nxp 0x%llx FREE", SK_KVA(nxp));
1766 	zfree(nxprov_params_zone, nxp);
1767 }
1768 
1769 static int
nx_check_pp(struct kern_nexus_provider * nxprov,struct kern_pbufpool * pp)1770 nx_check_pp(struct kern_nexus_provider *nxprov, struct kern_pbufpool *pp)
1771 {
1772 	struct kern_nexus_domain_provider *nxdom_prov = nxprov->nxprov_dom_prov;
1773 
1774 	if ((pp->pp_flags & (PPF_EXTERNAL | PPF_CLOSED)) != PPF_EXTERNAL) {
1775 		SK_ERR("Rejecting \"%s\" built-in pp", pp->pp_name);
1776 		return ENOTSUP;
1777 	}
1778 
1779 	/*
1780 	 * Require that the nexus domain metadata type and the
1781 	 * metadata type of the caller-provided pbufpool match.
1782 	 */
1783 	if (nxdom_prov->nxdom_prov_dom->nxdom_md_type !=
1784 	    pp->pp_md_type ||
1785 	    nxdom_prov->nxdom_prov_dom->nxdom_md_subtype !=
1786 	    pp->pp_md_subtype) {
1787 		SK_ERR("Mismatch in metadata type/subtype "
1788 		    "(%u/%u != %u/%u)", pp->pp_md_type,
1789 		    nxdom_prov->nxdom_prov_dom->nxdom_md_type,
1790 		    pp->pp_md_subtype,
1791 		    nxdom_prov->nxdom_prov_dom->nxdom_md_subtype);
1792 		return EINVAL;
1793 	}
1794 
1795 	/*
1796 	 * Require that the nexus provider memory configuration
1797 	 * has the same impedance as the caller-provided one.
1798 	 * Both need to be lacking or present; if one of them
1799 	 * is set and the other isn't, then we bail.
1800 	 */
1801 	if (!!(PP_BUF_REGION_DEF(pp)->skr_mode & SKR_MODE_MONOLITHIC) ^
1802 	    !!(nxprov->nxprov_ext.nxpi_flags & NXPIF_MONOLITHIC)) {
1803 		SK_ERR("Memory config mismatch: monolithic mode");
1804 		return EINVAL;
1805 	}
1806 
1807 	return 0;
1808 }
1809 
/*
 * Create a nexus instance from a previously-registered provider.
 *
 * Looks up the provider by UUID — it must belong to nxctl and not be
 * closed — and, for netif logical-link providers, requires both TX
 * and RX packet pools.  Any supplied pool is validated with
 * nx_check_pp().  The nexus is then allocated, initialized, and run
 * through the domain provider's constructor.
 *
 * On success the nexus is linked into the provider's list and the
 * global RB-tree, with three references held: provider list, global
 * list, and the caller's.  On failure, returns NULL with *err set and
 * any partially-built nexus freed.
 */
struct kern_nexus *
nx_create(struct nxctl *nxctl, const uuid_t nxprov_uuid,
    const nexus_type_t dom_type, const void *nx_ctx,
    nexus_ctx_release_fn_t nx_ctx_release, struct kern_pbufpool *tx_pp,
    struct kern_pbufpool *rx_pp, int *err)
{
	struct kern_nexus_domain_provider *nxdom_prov;
	struct kern_nexus_provider *nxprov = NULL;
	struct kern_nexus *nx = NULL;
#if SK_LOG
	uuid_string_t uuidstr;
#endif /* SK_LOG */

	NXCTL_LOCK_ASSERT_HELD(nxctl);

	ASSERT(dom_type < NEXUS_TYPE_MAX);
	ASSERT(!uuid_is_null(nxprov_uuid));
	*err = 0;

	SK_LOCK();

	/* locate the caller's provider by UUID */
	STAILQ_FOREACH(nxprov, &nxprov_head, nxprov_link) {
		if (nxctl == nxprov->nxprov_ctl &&
		    uuid_compare(nxprov_uuid, nxprov->nxprov_uuid) == 0) {
			break;
		}
	}

	if (nxprov == NULL || (nxprov->nxprov_flags & NXPROVF_CLOSED)) {
		SK_ERR("Provider not found or has been closed");
		*err = ENOENT;
		goto done;
	}

	/* unless undefined, the requested domain type must match */
	nxdom_prov = nxprov->nxprov_dom_prov;
	if (dom_type != NEXUS_TYPE_UNDEFINED &&
	    (nxdom_prov->nxdom_prov_dom->nxdom_type != dom_type)) {
		SK_ERR("Mismatch in domain type (0x%u != 0x%u)",
		    dom_type, nxdom_prov->nxdom_prov_dom->nxdom_type);
		nxdom_prov = NULL;
		nxprov = NULL;
		*err = ENODEV;
		goto done;
	}

	if ((dom_type == NEXUS_TYPE_NET_IF) && NXPROV_LLINK(nxprov) &&
	    (!tx_pp || !rx_pp)) {
#if SK_LOG
		SK_ERR("TX/RX packet pool is required for netif logical link "
		    "nexus provider UUID: %s",
		    sk_uuid_unparse(nxprov_uuid, uuidstr));
#endif /* SK_LOG */
		nxdom_prov = NULL;
		nxprov = NULL;
		*err = EINVAL;
		goto done;
	}

	/* any caller-supplied pool must be compatible with the provider */
	if ((tx_pp != NULL && (*err = nx_check_pp(nxprov, tx_pp)) != 0) ||
	    (rx_pp != NULL && (*err = nx_check_pp(nxprov, rx_pp)) != 0)) {
		goto done;
	}

	nx = nx_alloc(Z_WAITOK);

	STAILQ_INIT(&nx->nx_ch_head);
	STAILQ_INIT(&nx->nx_ch_nonxref_head);
	lck_rw_init(&nx->nx_ch_if_adv_lock, &nexus_lock_group,
	    &nexus_lock_attr);
	STAILQ_INIT(&nx->nx_ch_if_adv_head);
	uuid_generate_random(nx->nx_uuid);
	nx->nx_prov = nxprov;
	nx->nx_ctx = (void *)(uintptr_t)nx_ctx;
	nx->nx_ctx_release = nx_ctx_release;
	nx->nx_id = nxdom_prov->nxdom_prov_gencnt++;

	if (tx_pp != NULL) {
		nx->nx_tx_pp = tx_pp;
		pp_retain(tx_pp);       /* released by nx_free */
	}

	if (rx_pp != NULL) {
		nx->nx_rx_pp = rx_pp;
		pp_retain(rx_pp);       /* released by nx_free */
	}

	/* this nexus is alive; tell the nexus constructor to set it up */
	if (nxprov->nxprov_dom_prov->nxdom_prov_nx_ctor != NULL) {
		*err = nxprov->nxprov_dom_prov->nxdom_prov_nx_ctor(nx);
		if (*err != 0) {
			/* ctor failed: clear the provider linkage and bail */
			nx->nx_prov = NULL;
			goto done;
		}
	}

	nxprov_retain_locked(nxprov);   /* hold a ref on the nexus reg */

	STAILQ_INSERT_TAIL(&nxprov->nxprov_nx_head, nx, nx_prov_link);
	nxprov->nxprov_nx_count++;
	RB_INSERT(kern_nexus_tree, &nx_head, nx);
	atomic_bitset_32(&nx->nx_flags, NXF_ATTACHED);

	nx_retain_locked(nx);   /* one for the provider list */
	nx_retain_locked(nx);   /* one for the global list */
	nx_retain_locked(nx);   /* one for the caller */

#if SK_LOG
	SK_D("nexus 0x%llx (%s:%s) UUID %s", SK_KVA(nx),
	    nxdom_prov->nxdom_prov_dom->nxdom_name,
	    nxdom_prov->nxdom_prov_name, sk_uuid_unparse(nx->nx_uuid, uuidstr));
#endif /* SK_LOG */
done:
	SK_UNLOCK();

	if (*err != 0) {
		/* dispose of the partially-built nexus, if any */
		if (nx != NULL) {
			nx_free(nx);
			nx = NULL;
		}
	}
	return nx;
}
1932 
1933 int
nx_destroy(struct nxctl * nxctl,const uuid_t nx_uuid)1934 nx_destroy(struct nxctl *nxctl, const uuid_t nx_uuid)
1935 {
1936 	struct kern_nexus *nx = NULL;
1937 	struct kern_nexus find;
1938 	int err = 0;
1939 
1940 	NXCTL_LOCK_ASSERT_HELD(nxctl);
1941 
1942 	SK_LOCK();
1943 
1944 	uuid_copy(find.nx_uuid, nx_uuid);
1945 	nx = RB_FIND(kern_nexus_tree, &nx_head, &find);
1946 	if (nx != NULL && nxctl != NX_PROV(nx)->nxprov_ctl) {
1947 		nx = NULL;
1948 	}
1949 
1950 	if (nx != NULL) {
1951 		nx_retain_locked(nx);
1952 	}
1953 
1954 	if (nx == NULL) {
1955 		err = ENOENT;
1956 	} else {
1957 		err = nx_close(nx, TRUE);
1958 		(void) nx_release_locked(nx);
1959 	}
1960 
1961 	SK_UNLOCK();
1962 
1963 	return err;
1964 }
1965 
/* Ordering function for the kern_nexus RB-tree: compare instance UUIDs. */
static inline int
nx_cmp(const struct kern_nexus *a, const struct kern_nexus *b)
{
	return uuid_compare(a->nx_uuid, b->nx_uuid);
}
1971 
1972 struct kern_nexus *
nx_find(const uuid_t nx_uuid,boolean_t locked)1973 nx_find(const uuid_t nx_uuid, boolean_t locked)
1974 {
1975 	struct kern_nexus *nx = NULL;
1976 	struct kern_nexus find;
1977 
1978 	if (!locked) {
1979 		SK_LOCK();
1980 	}
1981 
1982 	SK_LOCK_ASSERT_HELD();
1983 
1984 	uuid_copy(find.nx_uuid, nx_uuid);
1985 	nx = RB_FIND(kern_nexus_tree, &nx_head, &find);
1986 	if (nx != NULL && (nx->nx_flags & NXF_CLOSED)) {
1987 		nx = NULL;
1988 	}
1989 
1990 	/* return reference to caller */
1991 	if (nx != NULL) {
1992 		nx_retain_locked(nx);
1993 	}
1994 
1995 	if (!locked) {
1996 		SK_UNLOCK();
1997 	}
1998 
1999 	return nx;
2000 }
2001 
/*
 * Close a nexus instance.  If no regular channels remain open to it,
 * the nexus is detached immediately; otherwise it is marked
 * NXF_CLOSED and nx_detach() is deferred until the last channel
 * closes.  Returns EALREADY if the nexus was already closed, else 0.
 * Takes sk_lock itself unless `locked' says the caller holds it.
 */
int
nx_close(struct kern_nexus *nx, boolean_t locked)
{
	int err = 0;

	if (!locked) {
		SK_LOCK();
	}

	SK_LOCK_ASSERT_HELD();


	if (nx->nx_flags & NXF_CLOSED) {
		err = EALREADY;
	} else {
#if SK_LOG
		uuid_string_t uuidstr;
		SK_D("nexus 0x%llx (%s:%s) UUID %s flags 0x%b", SK_KVA(nx),
		    NX_DOM(nx)->nxdom_name, NX_DOM_PROV(nx)->nxdom_prov_name,
		    sk_uuid_unparse(nx->nx_uuid, uuidstr), nx->nx_flags,
		    NXF_BITS);
#endif /* SK_LOG */

		if (STAILQ_EMPTY(&nx->nx_ch_head)) {
			/* no regular channels open to it, so detach now */
			nx_detach(nx);
		} else {
			/* detach when the last channel closes */
			ASSERT(nx->nx_refcnt > 3);
			atomic_bitset_32(&nx->nx_flags, NXF_CLOSED);
		}
	}

	if (!locked) {
		SK_UNLOCK();
	}

	return err;
}
2041 
2042 void
nx_stop(struct kern_nexus * nx)2043 nx_stop(struct kern_nexus *nx)
2044 {
2045 	struct kern_nexus_provider *nxprov = nx->nx_prov;
2046 
2047 	SK_LOCK_ASSERT_HELD();
2048 
2049 	/* send a stop message */
2050 	if (nxprov->nxprov_dom_prov->nxdom_prov_nx_stop != NULL) {
2051 		nxprov->nxprov_dom_prov->nxdom_prov_nx_stop(nx);
2052 	}
2053 }
2054 
/*
 * Final teardown of a nexus: run the domain provider's destructor,
 * unlink the nexus from the provider's list and the global RB-tree,
 * release the nexus context, and drop the two list references.  If
 * this was the last nexus of a provider already marked NXPROVF_CLOSED,
 * finish the provider detach that nxprov_close() postponed.
 * Caller must hold sk_lock plus references beyond the two list refs.
 */
void
nx_detach(struct kern_nexus *nx)
{
	struct kern_nexus_provider *nxprov = nx->nx_prov;

	SK_LOCK_ASSERT_HELD();

#if SK_LOG
	uuid_string_t uuidstr;
	SK_D("nexus 0x%llx UUID %s flags 0x%b", SK_KVA(nx),
	    sk_uuid_unparse(nx->nx_uuid, uuidstr), nx->nx_flags, NXF_BITS);
#endif /* SK_LOG */

	/* Caller must hold extra refs, on top of the two in reg/global lists */
	ASSERT(nx->nx_refcnt >= 3);
	ASSERT(nx->nx_flags & NXF_ATTACHED);

	/* this nexus is done; let the nexus destructor do final cleanups */
	if (nxprov->nxprov_dom_prov->nxdom_prov_nx_dtor != NULL) {
		nxprov->nxprov_dom_prov->nxdom_prov_nx_dtor(nx);
	}

	ASSERT(STAILQ_EMPTY(&nx->nx_ch_head));
	ASSERT(STAILQ_EMPTY(&nx->nx_ch_nonxref_head));

	/* unlink from the provider list and the global tree */
	STAILQ_REMOVE(&nxprov->nxprov_nx_head, nx, kern_nexus, nx_prov_link);
	nxprov->nxprov_nx_count--;
	RB_REMOVE(kern_nexus_tree, &nx_head, nx);
	atomic_bitclear_32(&nx->nx_flags, NXF_ATTACHED);
	nx->nx_prov = NULL;
	if (nx->nx_ctx_release != NULL) {
		nx->nx_ctx_release(nx->nx_ctx);
	}
	nx->nx_ctx = NULL;

	(void) nx_release_locked(nx);   /* one for the reg list */
	(void) nx_release_locked(nx);   /* one for the global list */

	/*
	 * If this was the last nexus and the provider has been closed,
	 * detach the provider and and finish up the postponed job.
	 */
	if (STAILQ_EMPTY(&nxprov->nxprov_nx_head) &&
	    (nxprov->nxprov_flags & NXPROVF_CLOSED)) {
		nxprov_detach(nxprov, TRUE);
	}
	(void) nxprov_release_locked(nxprov);
}
2103 
/*
 * Allocate and initialize the shared nexus advisory region for "nx".
 * The region holds a one-word metadata header immediately followed by
 * the type-specific advisory structure (flowswitch or netif).
 * Returns 0 on success, ENOMEM if the backing skmem region cannot be
 * created.
 */
int
nx_advisory_alloc(struct kern_nexus *nx, const char *name,
    struct skmem_region_params *srp_nexusadv, nexus_advisory_type_t type)
{
	struct __kern_nexus_adv_metadata *adv_md;

	/* the metadata header must be exactly one 64-bit word */
	_CASSERT(sizeof(struct __kern_nexus_adv_metadata) == sizeof(uint64_t));
	/* header plus either advisory flavor must fit the region object */
	_CASSERT((sizeof(struct sk_nexusadv) +
	    sizeof(struct __kern_nexus_adv_metadata)) <= NX_NEXUSADV_MAX_SZ);
	_CASSERT((sizeof(struct netif_nexus_advisory) +
	    sizeof(struct __kern_nexus_adv_metadata)) <= NX_NEXUSADV_MAX_SZ);
	ASSERT(nx->nx_adv.nxv_reg == NULL);
	ASSERT(nx->nx_adv.nxv_adv == NULL);
	ASSERT(type == NEXUS_ADVISORY_TYPE_FLOWSWITCH ||
	    type == NEXUS_ADVISORY_TYPE_NETIF);

	if ((nx->nx_adv.nxv_reg = skmem_region_create(name, srp_nexusadv,
	    NULL, NULL, NULL)) == NULL) {
		return ENOMEM;
	}

	/* SKMEM_PANIC: allocation failure panics, so no NULL check here */
	nx->nx_adv.nxv_adv = skmem_region_alloc(nx->nx_adv.nxv_reg, NULL,
	    NULL, NULL, (SKMEM_NOSLEEP | SKMEM_PANIC));
	adv_md = nx->nx_adv.nxv_adv;
	adv_md->knam_version = NX_ADVISORY_MD_CURRENT_VERSION;
	adv_md->knam_type = type;
	adv_md->__reserved = 0;
	nx->nx_adv.nxv_adv_type = type;
	/*
	 * The type-specific advisory lives right after the metadata
	 * header.  NOTE(review): this stores through flowswitch_nxv_adv
	 * even for the netif case, then reads netif_nxv_adv below --
	 * presumably the two fields alias (union); confirm in the
	 * nx_adv declaration.
	 */
	nx->nx_adv.flowswitch_nxv_adv = (void *)(adv_md + 1);
	if (type == NEXUS_ADVISORY_TYPE_FLOWSWITCH) {
		nx->nx_adv.flowswitch_nxv_adv->nxadv_ver =
		    NX_FLOWSWITCH_ADVISORY_CURRENT_VERSION;
	} else {
		nx->nx_adv.netif_nxv_adv->nna_version =
		    NX_NETIF_ADVISORY_CURRENT_VERSION;
	}
	return 0;
}
2142 
/*
 * Tear down the nexus advisory region allocated by nx_advisory_alloc().
 * Safe to call when no advisory was ever allocated (no-op in that
 * case).  The object is freed back to the region before the region
 * itself is released.
 */
void
nx_advisory_free(struct kern_nexus *nx)
{
	if (nx->nx_adv.nxv_reg != NULL) {
		ASSERT(nx->nx_adv.nxv_adv != NULL);
		/* return the advisory object to its region first */
		skmem_region_free(nx->nx_adv.nxv_reg,
		    nx->nx_adv.nxv_adv, NULL);
		nx->nx_adv.nxv_adv = NULL;
		nx->nx_adv.nxv_adv_type = NEXUS_ADVISORY_TYPE_INVALID;
		nx->nx_adv.flowswitch_nxv_adv = NULL;
		/* then drop the region reference itself */
		skmem_region_release(nx->nx_adv.nxv_reg);
		nx->nx_adv.nxv_reg = NULL;
	}

	/* post-conditions: advisory state fully cleared */
	ASSERT(nx->nx_adv.nxv_reg == NULL);
	ASSERT(nx->nx_adv.nxv_adv == NULL);
	ASSERT(nx->nx_adv.nxv_adv_type == NEXUS_ADVISORY_TYPE_INVALID);
	ASSERT(nx->nx_adv.flowswitch_nxv_adv == NULL);
}
2162 
2163 static struct kern_nexus *
nx_alloc(zalloc_flags_t how)2164 nx_alloc(zalloc_flags_t how)
2165 {
2166 	SK_LOCK_ASSERT_HELD();
2167 
2168 	return zalloc_flags(nx_zone, how | Z_ZERO);
2169 }
2170 
/*
 * Final destructor for a nexus; invoked from nx_release_locked() when
 * the last reference drops.  The nexus must already be detached from
 * its provider and have no channels open (asserted below).
 */
static void
nx_free(struct kern_nexus *nx)
{
	ASSERT(!(nx->nx_flags & NXF_ATTACHED) && nx->nx_prov == NULL);
	ASSERT(STAILQ_EMPTY(&nx->nx_ch_head));
	ASSERT(STAILQ_EMPTY(&nx->nx_ch_nonxref_head));

	/* release all port bindings, the port bitmap and the port array */
	nx_port_free_all(nx);

	/* drop the TX/RX packet-pool references, if any were taken */
	if (nx->nx_tx_pp != NULL) {
		pp_release(nx->nx_tx_pp);
		nx->nx_tx_pp = NULL;
	}
	if (nx->nx_rx_pp != NULL) {
		pp_release(nx->nx_rx_pp);
		nx->nx_rx_pp = NULL;
	}

	ASSERT(STAILQ_EMPTY(&nx->nx_ch_if_adv_head));
	lck_rw_destroy(&nx->nx_ch_if_adv_lock, &nexus_lock_group);

	SK_DF(SK_VERB_MEM, "nexus 0x%llx FREE", SK_KVA(nx));
	zfree(nx_zone, nx);
}
2195 
/*
 * Take a reference on the nexus.  SK_LOCK must be held.
 */
void
nx_retain_locked(struct kern_nexus *nx)
{
	SK_LOCK_ASSERT_HELD();

	nx->nx_refcnt++;
	VERIFY(nx->nx_refcnt > 0);      /* catch refcount wraparound */
}
2204 
/*
 * Take a reference on the nexus, acquiring SK_LOCK around the bump.
 */
void
nx_retain(struct kern_nexus *nx)
{
	SK_LOCK();
	nx_retain_locked(nx);
	SK_UNLOCK();
}
2212 
2213 int
nx_release_locked(struct kern_nexus * nx)2214 nx_release_locked(struct kern_nexus *nx)
2215 {
2216 	int oldref = nx->nx_refcnt;
2217 
2218 	SK_LOCK_ASSERT_HELD();
2219 
2220 	VERIFY(nx->nx_refcnt > 0);
2221 	if (--nx->nx_refcnt == 0) {
2222 		nx_free(nx);
2223 	}
2224 
2225 	return oldref == 1;
2226 }
2227 
/*
 * Drop a reference on the nexus, taking SK_LOCK around the release.
 * Returns non-zero iff this call dropped the last reference.
 */
int
nx_release(struct kern_nexus *nx)
{
	SK_LOCK_ASSERT_NOTHELD();

	SK_LOCK();
	int last = nx_release_locked(nx);
	SK_UNLOCK();

	return last;
}
2241 
/*
 * Invoke the external provider's per-ring init callback (and per-slot
 * init via nx_init_slots()) for every non-host ring of the channel's
 * adapter.  Called while the channel is in the EXT_PRECONNECT phase.
 * On any failure, rings initialized so far are undone via
 * nx_fini_rings() and the error is returned.
 */
static int
nx_init_rings(struct kern_nexus *nx, struct kern_channel *ch)
{
	struct kern_nexus_provider *nxprov = NX_PROV(nx);
	struct nexus_adapter *na = ch->ch_na;
	boolean_t undo = FALSE;
	int ksd_retains = 0;    /* count of rings whose slots got inited */
	enum txrx t;
	int err = 0;

	ASSERT((ch->ch_flags & (CHANF_EXT_PRECONNECT | CHANF_EXT_CONNECTED)) ==
	    CHANF_EXT_PRECONNECT);

	/* nothing to do for providers without a ring init callback */
	if (nxprov->nxprov_ext.nxpi_ring_init == NULL) {
		return 0;
	}

	for_rx_tx(t) {
		uint32_t i;

		for (i = 0; i < na_get_nrings(na, t); i++) {
			struct __kern_channel_ring *kring = &NAKR(na, t)[i];

			/* skip host rings */
			if (kring->ckr_flags & CKRF_HOST) {
				continue;
			}

			if ((err = nxprov->nxprov_ext.nxpi_ring_init(
				    nxprov, nx, ch, kring, (kring->ckr_tx == NR_TX),
				    &kring->ckr_ctx)) != 0) {
				SK_D("ch 0x%llx flags %b nx 0x%llx kr \"%s\" "
				    "(0x%llx) krflags %b ring_init error %d",
				    SK_KVA(ch), ch->ch_flags, CHANF_BITS,
				    SK_KVA(nx), kring->ckr_name, SK_KVA(kring),
				    kring->ckr_flags, CKRF_BITS, err);
				kring->ckr_ctx = NULL;
				undo = TRUE;
				break;
			}
			/* mark so nx_fini_rings() knows to call ring_fini */
			kring->ckr_flags |= CKRF_EXT_RING_INITED;

			if ((err = nx_init_slots(nx, kring)) != 0) {
				undo = TRUE;
				break;
			}

			if (kring->ckr_flags & CKRF_EXT_SLOTS_INITED) {
				++ksd_retains;
			}
		}
		if (undo) {
			break;
		}
	}

	/*
	 * Note: retain KSD even in case of error, as we have set
	 * CKRF_EXT_SLOTS_INITED flag for some of the rings
	 * nx_fini_rings would take care of release based on it.
	 */
	if (ksd_retains != 0) {
		/*
		 * Mark the kernel slot descriptor region as busy; this
		 * prevents it from being torn-down at channel defunct
		 * time, as we need to invoke the slot_fini() callback
		 * for each slot and we need the descriptors until then.
		 */
		skmem_arena_nexus_sd_set_noidle(skmem_arena_nexus(na->na_arena),
		    ksd_retains);
	}

	if (err != 0) {
		ASSERT(undo);
		nx_fini_rings(nx, ch);
	}

	return err;
}
2321 
/*
 * Undo nx_init_rings(): invoke the provider's ring_fini callback and
 * tear down per-slot state for every ring flagged EXT_RING_INITED,
 * then drop the busy retain counts taken on the kernel slot
 * descriptor region.  Safe on partially initialized channels.
 */
static void
nx_fini_rings(struct kern_nexus *nx, struct kern_channel *ch)
{
	struct kern_nexus_provider *nxprov = NX_PROV(nx);
	struct nexus_adapter *na = ch->ch_na;
	int ksd_releases = 0;   /* must mirror ksd_retains of init */
	enum txrx t;

	for_rx_tx(t) {
		uint32_t i;

		for (i = 0; i < na_get_nrings(na, t); i++) {
			struct __kern_channel_ring *kring = &NAKR(na, t)[i];

			/* only rings that completed ring_init need fini */
			if (!(kring->ckr_flags & CKRF_EXT_RING_INITED)) {
				continue;
			}

			ASSERT(!(kring->ckr_flags & CKRF_HOST));
			ASSERT(nxprov->nxprov_ext.nxpi_ring_fini != NULL);
			nxprov->nxprov_ext.nxpi_ring_fini(nxprov, nx, kring);
			kring->ckr_flags &= ~CKRF_EXT_RING_INITED;

			if (kring->ckr_flags & CKRF_EXT_SLOTS_INITED) {
				++ksd_releases;
			}

			/*
			 * Undo the work done in nx_init_slots() and inform
			 * the external domain provider, if applicable, that
			 * the slots for this ring are no longer valid.
			 */
			nx_fini_slots(nx, kring);
			kring->ckr_ctx = NULL;
		}
	}

	if (ksd_releases != 0) {
		/*
		 * Now that we've finished invoking the slot_fini()
		 * callbacks, release the busy retain counts held
		 * earlier in nx_init_rings().  This will allow the
		 * kernel slot descriptor region to be torn down.
		 */
		skmem_arena_nexus_sd_set_noidle(
			skmem_arena_nexus(na->na_arena), -ksd_releases);
	}
}
2370 
/*
 * Invoke the external provider's per-slot init callback for each slot
 * of "kring", recording the per-slot context argument it returns.
 * Sets CKRF_EXT_SLOTS_INITED on success; on failure, slots initialized
 * so far are torn down via nx_fini_slots() and the error is returned.
 */
static int
nx_init_slots(struct kern_nexus *nx, struct __kern_channel_ring *kring)
{
	struct kern_nexus_provider *nxprov = NX_PROV(nx);
	struct __slot_desc *slot = kring->ckr_ksds;
	int err = 0;
	uint32_t i;

	/*
	 * If the slot init callback was not provided, or if the
	 * kring was not created to hold any slot contexts, don't
	 * go any further.
	 */
	if (nxprov->nxprov_ext.nxpi_slot_init == NULL ||
	    kring->ckr_slot_ctxs == NULL) {
		return 0;
	}

	ASSERT(kring->ckr_slot_ctxs_set == 0);
	ASSERT(slot != NULL);

	for (i = 0; i < kring->ckr_num_slots; i++) {
		struct kern_slot_prop *slot_ctx_prop = NULL;
		void *slot_ctx_arg = NULL;

		ASSERT(&slot[i] <= kring->ckr_ksds_last);
		if ((err = nxprov->nxprov_ext.nxpi_slot_init(nxprov, nx, kring,
		    &slot[i], i, &slot_ctx_prop, &slot_ctx_arg)) != 0) {
			SK_D("nx 0x%llx kr \"%s\" (0x%llx) krflags %b slot %u "
			    "slot_init error %d", SK_KVA(nx), kring->ckr_name,
			    SK_KVA(kring), kring->ckr_flags, CKRF_BITS, i, err);
			break;
		}
		/* we don't want this to be used by client, so verify here */
		ASSERT(slot_ctx_prop == NULL);
		kring->ckr_slot_ctxs[i].slot_ctx_arg =
		    (mach_vm_address_t)slot_ctx_arg;
		/* ckr_slot_ctxs_set counts slots successfully inited */
		kring->ckr_slot_ctxs_set++;
	}

	if (err != 0) {
		/* partial failure: fini the slots inited so far */
		nx_fini_slots(nx, kring);
	} else {
		kring->ckr_flags |= CKRF_EXT_SLOTS_INITED;
	}

	return err;
}
2419 
/*
 * Invoke the provider's per-slot fini callback for each slot that was
 * initialized (ckr_slot_ctxs_set counts them) and clear the stored
 * slot context arguments.  Clears CKRF_EXT_SLOTS_INITED when done.
 * Also called on partial-init failure paths, where the flag may not
 * yet be set.
 */
static void
nx_fini_slots(struct kern_nexus *nx, struct __kern_channel_ring *kring)
{
	struct kern_nexus_provider *nxprov = NX_PROV(nx);
	struct __slot_desc *slot = kring->ckr_ksds;
	uint32_t i;

	/* a fully-inited kring implies a slot_fini callback exists */
	ASSERT(!(kring->ckr_flags & CKRF_EXT_SLOTS_INITED) ||
	    nxprov->nxprov_ext.nxpi_slot_fini != NULL);
	ASSERT(slot != NULL || !(kring->ckr_flags & CKRF_EXT_SLOTS_INITED));

	for (i = 0; i < kring->ckr_slot_ctxs_set; i++) {
		ASSERT(slot != NULL && &slot[i] <= kring->ckr_ksds_last);
		if (nxprov->nxprov_ext.nxpi_slot_fini != NULL) {
			nxprov->nxprov_ext.nxpi_slot_fini(nxprov, nx,
			    kring, &slot[i], i);
		}
		if (kring->ckr_slot_ctxs != NULL) {
			kring->ckr_slot_ctxs[i].slot_ctx_arg = 0;
		}
	}
	kring->ckr_slot_ctxs_set = 0;

	/* We're done with this kring */
	kring->ckr_flags &= ~CKRF_EXT_SLOTS_INITED;
}
2446 
2447 
/*
 * 64-bit mask with range: evaluates to a bitmap with bits
 * [_beg, _end] (inclusive, 0-based, _end <= 63) set.
 * NOTE(review): relies on NX_PORT_CHUNK_FREE being the all-ones
 * 64-bit pattern (consistent with its use as the "all free" chunk
 * value in nx_port_foreach()) -- confirm at its definition.
 */
#define BMASK64(_beg, _end)     \
	((NX_PORT_CHUNK_FREE >> (63 - (_end))) & ~((1ULL << (_beg)) - 1))
2451 
/*
 * Find the first free nexus port in the half-open range [first, last).
 * On success *nx_port is set either to a free port within the current
 * port map, or to an index at/past the map's end (a later
 * nx_port_alloc() grows the map to cover it).  Returns EBUSY when the
 * whole requested range lies within the current map and every port in
 * it is taken.  In the port bitmap, a set bit means "free".
 */
int
nx_port_find(struct kern_nexus *nx, nexus_port_t first,
    nexus_port_t last, nexus_port_t *nx_port)
{
	int err = 0;

	ASSERT(first < last);
	*nx_port = NEXUS_PORT_ANY;

	if (nx->nx_num_ports == 0 || (first + 1) >= nx->nx_num_ports) {
		/*
		 * Left edge of the range is beyond the current map;
		 * let nx_port_alloc() handle the growing later.
		 */
		*nx_port = first;
	} else {
		nexus_port_size_t fc = (first / NX_PORT_CHUNK);
		nexus_port_size_t lc = (MIN(last, nx->nx_num_ports) / NX_PORT_CHUNK);
		nexus_port_size_t lim = (nx->nx_num_ports / NX_PORT_CHUNK);
		nexus_port_size_t i, j;
		bitmap_t *bmap;

		/*
		 * The right edge of the range is either within or
		 * beyond the current map; scan thru the current
		 * map and find the first available port.
		 */
		for (i = fc; i <= lc; i++) {
			bitmap_t mask;
			nexus_port_size_t beg = 0, end = 63;

			/* clip the first/last chunk to the requested range */
			if (i == fc) {
				beg = (first % NX_PORT_CHUNK);
			}
			if (i == (last / NX_PORT_CHUNK)) {
				end = (last % NX_PORT_CHUNK);
			}

			if (i < lim) {
				bmap = &nx->nx_ports_bmap[i];
				mask = BMASK64(beg, end);

				/* set bit == free port; ffsll is 1-based */
				j = (nexus_port_size_t)ffsll((*bmap) & mask);
				if (j == 0) {
					/* no free port in this chunk */
					continue;
				}

				--j;
				*nx_port = (i * NX_PORT_CHUNK) + j;
			}
			break;
		}

		/*
		 * If the requested range is within the current map and we
		 * couldn't find a port, return an err.  Otherwise, return
		 * the next port index to trigger growing later.
		 */
		if (*nx_port == NEXUS_PORT_ANY) {
			if (lc == (last / NX_PORT_CHUNK)) {
				err = EBUSY;
				SK_ERR("port unavail in [%u, %u)", first, last);
			} else {
				*nx_port = nx->nx_num_ports;
			}
		}
	}

	SK_DF(SK_VERB_NXPORT, "nx 0x%llx nx_port %d (err %d)", SK_KVA(nx),
	    (int)*nx_port, err);

	return err;
}
2525 
2526 static int
nx_port_grow(struct kern_nexus * nx,nexus_port_size_t grow)2527 nx_port_grow(struct kern_nexus *nx, nexus_port_size_t grow)
2528 {
2529 	ASSERT(NXDOM_MAX(NX_DOM(nx), ports) <= NEXUS_PORT_MAX);
2530 	nexus_port_t dom_port_max = (nexus_port_size_t)NXDOM_MAX(NX_DOM(nx), ports);
2531 	struct nx_port_info *ports;
2532 	size_t limit;
2533 	nexus_port_size_t i, num_ports, old_num_ports;
2534 	bitmap_t *bmap;
2535 
2536 	ASSERT(grow > 0 && (grow % NX_PORT_CHUNK) == 0);
2537 	ASSERT((nx->nx_num_ports % NX_PORT_CHUNK) == 0);
2538 	_CASSERT((sizeof(*bmap) * 8) == NX_PORT_CHUNK);
2539 	ASSERT(powerof2(dom_port_max));
2540 	ASSERT(dom_port_max % NX_PORT_CHUNK == 0);
2541 
2542 	old_num_ports = nx->nx_num_ports;
2543 	num_ports = nx->nx_num_ports + grow;
2544 	limit = P2ROUNDUP(dom_port_max, NX_PORT_CHUNK);
2545 	if (num_ports > limit) {
2546 		SK_ERR("can't grow, total %u grow %u (new %u > dom_max %u)",
2547 		    nx->nx_num_ports, grow, num_ports, limit);
2548 		return EDOM;
2549 	}
2550 
2551 	if ((bmap = sk_realloc_data(nx->nx_ports_bmap,
2552 	    (old_num_ports / NX_PORT_CHUNK) * sizeof(*bmap),
2553 	    (num_ports / NX_PORT_CHUNK) * sizeof(*bmap),
2554 	    Z_WAITOK, skmem_tag_nx_port)) == NULL) {
2555 		SK_ERR("bmap alloc failed, num_port %u", num_ports);
2556 		return ENOMEM;
2557 	}
2558 	nx->nx_ports_bmap = bmap;
2559 
2560 	if ((ports = sk_realloc_type_array(struct nx_port_info, old_num_ports,
2561 	    num_ports, nx->nx_ports, Z_WAITOK, skmem_tag_nx_port)) == NULL) {
2562 		/* can't free bmap here, otherwise nexus won't work */
2563 		SK_ERR("nx_ports alloc failed, num_port %u", num_ports);
2564 		return ENOMEM;
2565 	}
2566 
2567 	/* initialize the additional new ports */
2568 	bzero(&ports[nx->nx_num_ports], (grow * sizeof(*ports)));
2569 	nx->nx_ports = ports;
2570 
2571 	/* initialize new bitmaps (set all bits) */
2572 	for (i = (nx->nx_num_ports / NX_PORT_CHUNK);
2573 	    i < (num_ports / NX_PORT_CHUNK); i++) {
2574 		bmap[i] = NX_PORT_CHUNK_FREE;
2575 	}
2576 
2577 	nx->nx_num_ports = num_ports;
2578 
2579 	SK_DF(SK_VERB_NXPORT, "!!! nx 0x%llx ports %u/%u, %u ports added",
2580 	    SK_KVA(nx), nx->nx_active_ports, nx->nx_num_ports, grow);
2581 
2582 	return 0;
2583 }
2584 
2585 int
nx_port_alloc(struct kern_nexus * nx,nexus_port_t nx_port,struct nxbind * nxb,struct nexus_adapter ** na,struct proc * p)2586 nx_port_alloc(struct kern_nexus *nx, nexus_port_t nx_port, struct nxbind *nxb,
2587     struct nexus_adapter **na, struct proc *p)
2588 {
2589 	struct nx_port_info *npi = NULL;
2590 	struct nxbind *nxb0;
2591 	size_t g;
2592 	uint32_t i, j;
2593 	bitmap_t *bmap;
2594 	bool refonly = false;
2595 	int err = 0;
2596 
2597 	ASSERT(nx_port != NEXUS_PORT_ANY);
2598 	ASSERT((nx->nx_num_ports % NX_PORT_CHUNK) == 0);
2599 
2600 	/* port is zero-based, so adjust here */
2601 	if ((nx_port + 1) > nx->nx_num_ports) {
2602 		g = P2ROUNDUP((nx_port + 1) - nx->nx_num_ports, NX_PORT_CHUNK);
2603 		VERIFY(g <= NEXUS_PORT_MAX);
2604 		if ((err = nx_port_grow(nx, (nexus_port_size_t)g)) != 0) {
2605 			goto done;
2606 		}
2607 	}
2608 	ASSERT(err == 0);
2609 	ASSERT(nx_port < nx->nx_num_ports);
2610 	npi = &nx->nx_ports[nx_port];
2611 	nxb0 = npi->npi_nxb;
2612 	i = nx_port / NX_PORT_CHUNK;
2613 	j = nx_port % NX_PORT_CHUNK;
2614 	bmap = &nx->nx_ports_bmap[i];
2615 
2616 	if (bit_test(*bmap, j)) {
2617 		/* port is not (yet) bound or allocated */
2618 		ASSERT(npi->npi_nah == 0 && npi->npi_nxb == NULL);
2619 		if (p != kernproc && !NX_ANONYMOUS_PROV(nx)) {
2620 			/*
2621 			 * If the port allocation is requested by userland
2622 			 * and the nexus is non-anonymous, then fail the
2623 			 * request.
2624 			 */
2625 			err = EACCES;
2626 			SK_ERR("user proc alloc on named nexus needs binding");
2627 		} else if (na != NULL && *na != NULL) {
2628 			/*
2629 			 * Otherwise claim it (clear bit) if the caller
2630 			 * supplied an adapter for this port; else, it
2631 			 * is just an existential check and so there's
2632 			 * no action needed at this point (we'll skip
2633 			 * the init below since vpna is NULL).
2634 			 */
2635 			bit_clear(*bmap, j);
2636 		}
2637 	} else {
2638 		/* if port is bound, check if credentials match */
2639 		if (nxb0 != NULL && p != kernproc && !NX_ANONYMOUS_PROV(nx) &&
2640 		    (nxb == NULL || !nxb_is_equal(nxb0, nxb))) {
2641 			SK_ERR("nexus binding mismatch");
2642 			err = EACCES;
2643 		} else {
2644 			/*
2645 			 * If port is already occupied by an adapter,
2646 			 * see if the client is requesting a reference
2647 			 * to it; if so, return the adapter.  Otherwise,
2648 			 * if unoccupied and vpna is non-NULL, associate
2649 			 * it with this nexus port via the below init.
2650 			 */
2651 			if (NPI_NA(npi) != NULL) {
2652 				if (na != NULL && *na == NULL) {
2653 					*na = NPI_NA(npi);
2654 					na_retain_locked(*na);
2655 					/* skip the init below */
2656 					refonly = true;
2657 				} else {
2658 					/*
2659 					 * If the client supplied an adapter
2660 					 * (regardless of its value) for a
2661 					 * nexus port that's already occupied,
2662 					 * then we fail the request.
2663 					 */
2664 					SK_ERR("nexus adapted exits");
2665 					err = EEXIST;
2666 				}
2667 			}
2668 		}
2669 	}
2670 
2671 done:
2672 	/* initialize the nexus port and the adapter occupying it */
2673 	if (err == 0 && na != NULL && *na != NULL && !refonly) {
2674 		ASSERT(nx_port < nx->nx_num_ports);
2675 		ASSERT(npi->npi_nah == 0);
2676 		ASSERT(nx->nx_active_ports < nx->nx_num_ports);
2677 		ASSERT(!bit_test(nx->nx_ports_bmap[nx_port / NX_PORT_CHUNK],
2678 		    (nx_port % NX_PORT_CHUNK)));
2679 
2680 		nx->nx_active_ports++;
2681 		npi->npi_nah = NPI_NA_ENCODE(*na, NEXUS_PORT_STATE_WORKING);
2682 		(*na)->na_nx_port = nx_port;
2683 	}
2684 
2685 	SK_DF(SK_VERB_NXPORT, "nx 0x%llx nx_port %d, ports %u/%u (err %d)",
2686 	    SK_KVA(nx), (int)nx_port, nx->nx_active_ports, nx->nx_num_ports,
2687 	    err);
2688 
2689 	return err;
2690 }
2691 
/*
 * Mark the port's encoded adapter handle as defunct.
 * NOTE(review): NPI_NA_ENCODE is fed the existing npi_nah here
 * (whereas nx_port_alloc() feeds it an adapter pointer) --
 * presumably the macro re-encodes the handle bits with the new
 * state; confirm at the NPI_NA_ENCODE definition.
 */
void
nx_port_defunct(struct kern_nexus *nx, nexus_port_t nx_port)
{
	struct nx_port_info *npi = &nx->nx_ports[nx_port];

	npi->npi_nah = NPI_NA_ENCODE(npi->npi_nah,
	    NEXUS_PORT_STATE_DEFUNCT);
}
2700 
/*
 * Release an allocated nexus port.  The bitmap bit is set (marking
 * the port free) only when no binding remains on it; a bound port
 * stays claimed until nx_port_unbind().
 */
void
nx_port_free(struct kern_nexus *nx, nexus_port_t nx_port)
{
	struct nx_port_info *npi = NULL;
	bitmap_t *bmap;
	uint32_t i, j;

	ASSERT((nx->nx_num_ports % NX_PORT_CHUNK) == 0);
	ASSERT(nx_port != NEXUS_PORT_ANY && nx_port < nx->nx_num_ports);
	ASSERT(nx->nx_active_ports != 0);

	i = nx_port / NX_PORT_CHUNK;
	j = nx_port % NX_PORT_CHUNK;
	bmap = &nx->nx_ports_bmap[i];
	/* the port must currently be claimed (bit clear) */
	ASSERT(!bit_test(*bmap, j));

	npi = &nx->nx_ports[nx_port];
	npi->npi_nah = 0;
	if (npi->npi_nxb == NULL) {
		/* it's vacant, release it (set bit) */
		bit_set(*bmap, j);
	}

	nx->nx_active_ports--;

	//XXX [email protected] --- try to shrink bitmap & nx_ports ???

	SK_DF(SK_VERB_NXPORT, "--- nx 0x%llx nx_port %d, ports %u/%u",
	    SK_KVA(nx), (int)nx_port, nx->nx_active_ports, nx->nx_num_ports);
}
2731 
2732 int
nx_port_bind_info(struct kern_nexus * nx,nexus_port_t nx_port,struct nxbind * nxb0,void * info)2733 nx_port_bind_info(struct kern_nexus *nx, nexus_port_t nx_port,
2734     struct nxbind *nxb0, void *info)
2735 {
2736 	struct nx_port_info *npi = NULL;
2737 	size_t g;
2738 	uint32_t i, j;
2739 	bitmap_t *bmap;
2740 	int err = 0;
2741 
2742 	ASSERT(nx_port != NEXUS_PORT_ANY);
2743 	ASSERT(nx_port < NXDOM_MAX(NX_DOM(nx), ports));
2744 	ASSERT((nx->nx_num_ports % NX_PORT_CHUNK) == 0);
2745 	ASSERT(nxb0 != NULL);
2746 
2747 	if ((nx_port) + 1 > nx->nx_num_ports) {
2748 		g = P2ROUNDUP((nx_port + 1) - nx->nx_num_ports, NX_PORT_CHUNK);
2749 		VERIFY(g <= NEXUS_PORT_MAX);
2750 		if ((err = nx_port_grow(nx, (nexus_port_size_t)g)) != 0) {
2751 			goto done;
2752 		}
2753 	}
2754 	ASSERT(err == 0);
2755 
2756 	npi = &nx->nx_ports[nx_port];
2757 	i = nx_port / NX_PORT_CHUNK;
2758 	j = nx_port % NX_PORT_CHUNK;
2759 	bmap = &nx->nx_ports_bmap[i];
2760 	if (bit_test(*bmap, j)) {
2761 		/* port is not (yet) bound or allocated */
2762 		ASSERT(npi->npi_nah == 0 && npi->npi_nxb == NULL);
2763 
2764 		bit_clear(*bmap, j);
2765 		struct nxbind *nxb = nxb_alloc(Z_WAITOK);
2766 		nxb_move(nxb0, nxb);
2767 		npi->npi_nxb = nxb;
2768 		npi->npi_info = info;
2769 		/* claim it (clear bit) */
2770 		bit_clear(*bmap, j);
2771 		ASSERT(err == 0);
2772 	} else {
2773 		/* port is already taken */
2774 		ASSERT(NPI_NA(npi) != NULL || npi->npi_nxb != NULL);
2775 		err = EEXIST;
2776 	}
2777 done:
2778 
2779 	SK_DF(err ? SK_VERB_ERROR : SK_VERB_NXPORT,
2780 	    "+++ nx 0x%llx nx_port %d, ports %u/%u (err %d)", SK_KVA(nx),
2781 	    (int)nx_port, nx->nx_active_ports, nx->nx_num_ports, err);
2782 
2783 	return err;
2784 }
2785 
/*
 * Bind credentials to a nexus port without any extra port info blob;
 * thin wrapper around nx_port_bind_info().
 */
int
nx_port_bind(struct kern_nexus *nx, nexus_port_t nx_port, struct nxbind *nxb0)
{
	return nx_port_bind_info(nx, nx_port, nxb0, NULL);
}
2791 
2792 static int
nx_port_info_size(void * info,size_t * sz)2793 nx_port_info_size(void *info, size_t *sz)
2794 {
2795 	struct nx_port_info_header *hdr = info;
2796 
2797 	switch (hdr->ih_type) {
2798 	case NX_PORT_INFO_TYPE_NETIF:
2799 		break;
2800 	default:
2801 		return EINVAL;
2802 	}
2803 	*sz = hdr->ih_size;
2804 	return 0;
2805 }
2806 
/*
 * Remove the binding (and any attached info blob) from a nexus port.
 * Returns EDOM if nx_port is outside the current map, ENOENT if the
 * port carries no binding, else 0.  The port becomes free (bit set)
 * only if no adapter still occupies it.
 */
int
nx_port_unbind(struct kern_nexus *nx, nexus_port_t nx_port)
{
	struct nx_port_info *npi = NULL;
	struct nxbind *nxb;
	uint32_t i, j;
	bitmap_t *bmap;
	int err = 0;

	ASSERT(nx_port != NEXUS_PORT_ANY);

	if (nx_port >= nx->nx_num_ports) {
		err = EDOM;
		goto done;
	}

	npi = &nx->nx_ports[nx_port];
	i = nx_port / NX_PORT_CHUNK;
	j = nx_port % NX_PORT_CHUNK;
	bmap = &nx->nx_ports_bmap[i];

	if ((nxb = npi->npi_nxb) == NULL) {
		/* must be either free or allocated */
		ASSERT(NPI_NA(npi) == NULL ||
		    (!bit_test(*bmap, j) && nx->nx_active_ports > 0));
		err = ENOENT;
	} else {
		nxb_free(nxb);
		npi->npi_nxb = NULL;
		if (npi->npi_info != NULL) {
			size_t sz;

			/* the blob's size is self-described by its header */
			VERIFY(nx_port_info_size(npi->npi_info, &sz) == 0);
			sk_free_data(npi->npi_info, sz);
			npi->npi_info = NULL;
		}
		/* a bound port is always claimed (bit clear) */
		ASSERT(!bit_test(*bmap, j));
		if (NPI_NA(npi) == NULL) {
			/* it's vacant, release it (set bit) */
			bit_set(*bmap, j);
		}
	}

done:
	SK_DF(err ? SK_VERB_ERROR : SK_VERB_NXPORT,
	    "--- nx 0x%llx nx_port %d, ports %u/%u (err %d)", SK_KVA(nx),
	    (int)nx_port, nx->nx_active_ports, nx->nx_num_ports, err);

	return err;
}
2857 
2858 struct nexus_adapter *
nx_port_get_na(struct kern_nexus * nx,nexus_port_t nx_port)2859 nx_port_get_na(struct kern_nexus *nx, nexus_port_t nx_port)
2860 {
2861 	if (nx->nx_ports != NULL && nx->nx_num_ports > nx_port) {
2862 		return NPI_NA(&nx->nx_ports[nx_port]);
2863 	} else {
2864 		return NULL;
2865 	}
2866 }
2867 
/*
 * Copy the port's info blob of the given type into "info" (len bytes).
 * Returns ENXIO if the port is out of range, ENOENT if no info blob is
 * attached, EINVAL on info-type mismatch, 0 on success.
 *
 * NOTE(review): "len" is not validated against hdr->ih_size, so a
 * caller passing a larger len would over-read the blob -- presumably
 * all callers pass the exact structure size for the type; confirm.
 */
int
nx_port_get_info(struct kern_nexus *nx, nexus_port_t port,
    nx_port_info_type_t type, void *info, uint32_t len)
{
	struct nx_port_info *npi;
	struct nx_port_info_header *hdr;

	if (nx->nx_ports == NULL || port >= nx->nx_num_ports) {
		return ENXIO;
	}
	npi = &nx->nx_ports[port];
	hdr = npi->npi_info;
	if (hdr == NULL) {
		return ENOENT;
	}

	if (hdr->ih_type != type) {
		return EINVAL;
	}

	bcopy(npi->npi_info, info, len);
	return 0;
}
2891 
/*
 * Return true if nx_port falls within the currently allocated
 * port map.
 */
bool
nx_port_is_valid(struct kern_nexus *nx, nexus_port_t nx_port)
{
	return nx_port < nx->nx_num_ports;
}
2897 
/*
 * Return true if the port's adapter handle carries the DEFUNCT state
 * (see nx_port_defunct()).  nx_port must be valid (asserted).
 */
bool
nx_port_is_defunct(struct kern_nexus *nx, nexus_port_t nx_port)
{
	ASSERT(nx_port_is_valid(nx, nx_port));

	return NPI_IS_DEFUNCT(&nx->nx_ports[nx_port]);
}
2905 
/*
 * Free every port's binding and info blob, then release the port
 * bitmap and the port array themselves, leaving the nexus with zero
 * ports.  Called from nx_free() during final teardown.
 */
void
nx_port_free_all(struct kern_nexus *nx)
{
	uint32_t num_ports;

	/* uncrustify doesn't handle C blocks properly */
	/* BEGIN IGNORE CODESTYLE */
	nx_port_foreach(nx, ^(nexus_port_t p) {
		struct nxbind *nxb;
		void *info;
		nxb = nx->nx_ports[p].npi_nxb;
		info = nx->nx_ports[p].npi_info;
		if (nxb != NULL) {
			nxb_free(nxb);
			nx->nx_ports[p].npi_nxb = NULL;
		}
		if (info != NULL) {
			size_t sz;

			/* blob size is self-described by its header */
			VERIFY(nx_port_info_size(info, &sz) == 0);
			skn_free_data(info, info, sz);
			nx->nx_ports[p].npi_info = NULL;
		}
	});
	/* END IGNORE CODESTYLE */

	num_ports = nx->nx_num_ports;
	nx->nx_num_ports = 0;
	nx->nx_active_ports = 0;
	skn_free_data(ports_bmap,
	    nx->nx_ports_bmap, (num_ports / NX_PORT_CHUNK) * sizeof(bitmap_t));
	nx->nx_ports_bmap = NULL;
	sk_free_type_array(struct nx_port_info, num_ports, nx->nx_ports);
	nx->nx_ports = NULL;
}
2941 
2942 void
2943 nx_port_foreach(struct kern_nexus *nx,
2944     void (^port_handle)(nexus_port_t nx_port))
2945 {
2946 	for (nexus_port_size_t i = 0; i < (nx->nx_num_ports / NX_PORT_CHUNK); i++) {
2947 		bitmap_t bmap = nx->nx_ports_bmap[i];
2948 
2949 		if (bmap == NX_PORT_CHUNK_FREE) {
2950 			continue;
2951 		}
2952 
2953 		for (nexus_port_size_t j = 0; j < NX_PORT_CHUNK; j++) {
2954 			if (bit_test(bmap, j)) {
2955 				continue;
2956 			}
2957 			port_handle((i * NX_PORT_CHUNK) + j);
2958 		}
2959 	}
2960 }
2961 
/*
 * sysctl interfaces
 */
/* forward declarations for the handlers defined later in this file */
static int nexus_provider_list_sysctl SYSCTL_HANDLER_ARGS;
static int nexus_channel_list_sysctl SYSCTL_HANDLER_ARGS;
static int nexus_mib_get_sysctl SYSCTL_HANDLER_ARGS;

SYSCTL_PROC(_kern_skywalk, OID_AUTO, nexus_provider_list,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, nexus_provider_list_sysctl, "S,nexus_provider_info_t", "");

SYSCTL_PROC(_kern_skywalk, OID_AUTO, nexus_channel_list,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, nexus_channel_list_sysctl, "S,nexus_channel_entry_t", "");

/*
 * The remaining nodes all share nexus_mib_get_sysctl; the NXMIB_*
 * value passed as arg2 selects which table the handler exports.
 */
SYSCTL_PROC(_kern_skywalk, OID_AUTO, llink_list,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, NXMIB_LLINK_LIST, nexus_mib_get_sysctl, "S,nx_llink_info",
    "A list of logical links");

SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, flow,
    CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY | CTLFLAG_KERN,
    0, NXMIB_FLOW, nexus_mib_get_sysctl, "S,sk_stats_flow",
    "Nexus inet flows with stats collected in kernel");

SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, flow_owner,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, NXMIB_FLOW_OWNER, nexus_mib_get_sysctl, "S,sk_stats_flow_owner",
    "Nexus flow owners");

SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, flow_route,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, NXMIB_FLOW_ROUTE, nexus_mib_get_sysctl, "S,sk_stats_flow_route",
    "Nexus flow routes");

SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, net_if,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, NXMIB_NETIF_STATS, nexus_mib_get_sysctl, "S,sk_stats_net_if",
    "Nexus netif statistics collected in kernel");

SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, flow_switch,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, NXMIB_FSW_STATS, nexus_mib_get_sysctl, "S,sk_stats_flow_switch",
    "Nexus flowswitch statistics collected in kernel");

SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, userstack,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, NXMIB_USERSTACK_STATS, nexus_mib_get_sysctl, "S,sk_stats_userstack",
    "Nexus userstack statistics counter");

SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, flow_adv,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, NXMIB_FLOW_ADV, nexus_mib_get_sysctl, "S,sk_stats_flow_adv",
    "Nexus flow advisory dump");

SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, netif_queue,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, NXMIB_NETIF_QUEUE_STATS, nexus_mib_get_sysctl, "S,netif_qstats_info",
    "A list of netif queue stats entries");
3021 
3022 /*
3023  * Provider list sysctl
3024  */
/*
 * Fill one nexus_provider_info record: the provider's UUID and
 * parameters, followed by the UUID of every nexus instance attached
 * to it.  The caller must have sized "info" for nxprov_nx_count
 * instance UUIDs (see NEXUS_PROVIDER_INFO_SIZE in the sysctl handler)
 * and must hold SK_LOCK so the instance list stays stable.
 */
static void
nexus_provider_info_populate(struct kern_nexus_provider *nxprov,
    nexus_provider_info_t info)
{
	struct kern_nexus *nx;
	uuid_t *uuids;

	SK_LOCK_ASSERT_HELD();

	/* provider UUID + params */
	uuid_copy(info->npi_prov_uuid, nxprov->nxprov_uuid);
	bcopy(nxprov->nxprov_params, &info->npi_prov_params,
	    sizeof(struct nxprov_params));
	info->npi_instance_uuids_count = nxprov->nxprov_nx_count;

	/* instance UUID list */
	uuids = info->npi_instance_uuids;
	STAILQ_FOREACH(nx, &nxprov->nxprov_nx_head, nx_prov_link) {
		uuid_copy(*uuids, nx->nx_uuid);
		uuids++;
	}
}
3047 
/*
 * sysctl handler for kern.skywalk.nexus_provider_list.  Root only.
 * Implements the usual two-pass sysctl protocol: with a NULL oldptr
 * the walk just totals the required size (actual_space) for SYSCTL_OUT
 * to report back; with a buffer supplied, records are copied until
 * the buffer runs out (ENOMEM on truncation).  Records are
 * variable-length: NEXUS_PROVIDER_INFO_SIZE() depends on each
 * provider's instance count.
 */
static int
nexus_provider_list_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2, oidp)
	size_t actual_space;
	caddr_t buffer = NULL;
	size_t buffer_space;
	size_t allocated_space;
	int out_error;
	int error = 0;
	struct kern_nexus_provider *nxprov;
	caddr_t scan;

	/* privileged: provider details are not for unprivileged callers */
	if (!kauth_cred_issuser(kauth_cred_get())) {
		return EPERM;
	}

	net_update_uptime();
	buffer_space = req->oldlen;
	if (req->oldptr != USER_ADDR_NULL && buffer_space != 0) {
		/* clamp the caller-supplied buffer to our sysctl cap */
		if (buffer_space > SK_SYSCTL_ALLOC_MAX) {
			buffer_space = SK_SYSCTL_ALLOC_MAX;
		}
		allocated_space = buffer_space;
		buffer = sk_alloc_data(allocated_space, Z_WAITOK, skmem_tag_sysctl_buf);
		if (__improbable(buffer == NULL)) {
			return ENOBUFS;
		}
	} else if (req->oldptr == USER_ADDR_NULL) {
		buffer_space = 0;
	}
	actual_space = 0;
	scan = buffer;
	SK_LOCK();
	STAILQ_FOREACH(nxprov, &nxprov_head, nxprov_link) {
		size_t                  info_size;

		info_size
		        = NEXUS_PROVIDER_INFO_SIZE(nxprov->nxprov_nx_count);
		if (scan != NULL) {
			if (buffer_space < info_size) {
				/* supplied buffer too small, stop copying */
				error = ENOMEM;
				break;
			}
			nexus_provider_info_populate(nxprov, (void *)scan);
			scan += info_size;
			buffer_space -= info_size;
		}
		actual_space += info_size;
	}
	SK_UNLOCK();

	/* copy out (or, with a NULL buffer, report the required size) */
	out_error = SYSCTL_OUT(req, buffer, actual_space);
	if (out_error != 0) {
		error = out_error;
	}

	if (buffer != NULL) {
		sk_free_data(buffer, allocated_space);
	}

	return error;
}
3112 
3113 /*
3114  * Channel list sysctl
3115  */
3116 static uint32_t
channel_ring_count(struct kern_channel * ch,enum txrx which)3117 channel_ring_count(struct kern_channel *ch, enum txrx which)
3118 {
3119 	return ch->ch_last[which] - ch->ch_first[which];
3120 }
3121 
/*
 * Copy per-ring statistics for rings [first, last) of the kring array
 * into the caller-provided array of nexus_channel_ring_entry records.
 * When ring stats collection is globally disabled (kr_stat_enable == 0)
 * the kernel and user stats are zeroed instead of copied; error stats
 * and the ring id are always reported.
 */
static void
populate_ring_entries(struct __kern_channel_ring *kring,
    ring_id_t first, ring_id_t last, nexus_channel_ring_entry_t entries)
{
	ring_id_t i;
	nexus_channel_ring_entry_t scan;
	struct __kern_channel_ring *ring;

	scan = entries;
	for (i = first; i < last; i++, scan++) {
		ring = &kring[i];

		DTRACE_SKYWALK1(populate__ring, struct __kern_channel_ring *,
		    ring);
		if (kr_stat_enable == 0) {
			/* stats collection is off: report zeroes */
			bzero(&scan->ncre_stats, sizeof(scan->ncre_stats));
			bzero(&scan->ncre_user_stats,
			    sizeof(scan->ncre_user_stats));
		} else {
			scan->ncre_stats = ring->ckr_stats;
			scan->ncre_user_stats = ring->ckr_usr_stats;
		}
		scan->ncre_error_stats = ring->ckr_err_stats;
		scan->ncre_ring_id = i;
	}
}
3148 
3149 /* combine/convert ch_mode/ch_flags into nexus_channel_entry flags */
3150 static uint32_t
nexus_channel_get_flags(uint32_t ch_mode,uint32_t ch_flags)3151 nexus_channel_get_flags(uint32_t ch_mode, uint32_t ch_flags)
3152 {
3153 	uint32_t flags = 0;
3154 
3155 	flags |= (ch_mode & CHMODE_MONITOR_TX) ? SCHF_MONITOR_TX : 0;
3156 	flags |= (ch_mode & CHMODE_MONITOR_RX) ? SCHF_MONITOR_RX : 0;
3157 	flags |= (ch_mode & CHMODE_MONITOR_NO_COPY) ? SCHF_MONITOR_NO_COPY : 0;
3158 	flags |= (ch_mode & CHMODE_USER_PACKET_POOL) ? SCHF_USER_PACKET_POOL : 0;
3159 	flags |= (ch_mode & CHMODE_DEFUNCT_OK) ? SCHF_DEFUNCT_OK : 0;
3160 	flags |= (ch_mode & CHMODE_FILTER) ? SCHF_FILTER : 0;
3161 	flags |= (ch_mode & CHMODE_EVENT_RING) ? SCHF_EVENT_RING : 0;
3162 	flags |= (ch_mode & CHMODE_EXCLUSIVE) ? SCHF_EXCLUSIVE : 0;
3163 	flags |= (ch_flags & CHANF_IF_ADV) ? SCHF_IF_ADV : 0;
3164 	flags |= (ch_flags & CHANF_DEFUNCT_SKIP) ? SCHF_DEFUNCT_SKIP : 0;
3165 	flags |= (ch_flags & CHANF_CLOSING) ? SCHF_CLOSING : 0;
3166 	flags |= (ch_flags & CHANF_DEFUNCT) ? SCHF_DEFUNCT : 0;
3167 	flags |= (ch_mode & CHMODE_LOW_LATENCY) ? SCHF_LOW_LATENCY : 0;
3168 
3169 	return flags;
3170 }
3171 
/*
 * Fill in one nexus_channel_entry for sysctl export: channel identity
 * (UUID, nexus port, owning pid, fd), combined mode/state flags, ring
 * counts, and per-ring statistics.  TX ring entries are written first,
 * immediately followed by RX, into the variable-length
 * nce_ring_entries array; the caller must have sized the entry with
 * NEXUS_CHANNEL_ENTRY_SIZE() for the total (TX + RX) ring count, as
 * nexus_channel_info_populate does.
 */
SK_NO_INLINE_ATTRIBUTE
static void
nexus_channel_entry_populate(struct kern_channel *ch,
    nexus_channel_entry_t entry)
{
	uint32_t ch_mode = ch->ch_info->cinfo_ch_mode;
	uint32_t ch_flags = ch->ch_flags;
	ring_id_t rx_first = ch->ch_first[NR_RX];
	ring_id_t rx_last = ch->ch_last[NR_RX];
	ring_id_t tx_last = ch->ch_last[NR_TX];
	ring_id_t tx_first = ch->ch_first[NR_TX];

	uuid_copy(entry->nce_uuid, ch->ch_info->cinfo_ch_id);
	entry->nce_flags = nexus_channel_get_flags(ch_mode, ch_flags);
	entry->nce_port = ch->ch_info->cinfo_nx_port;
	entry->nce_pid = ch->ch_pid;
	entry->nce_fd = ch->ch_fd;
	entry->nce_tx_rings = tx_last - tx_first;
	entry->nce_rx_rings = rx_last - rx_first;
	/* TX ring stats first ... */
	populate_ring_entries(ch->ch_na->na_tx_rings, tx_first, tx_last,
	    entry->nce_ring_entries);
	/* ... then RX, right after the TX entries */
	populate_ring_entries(ch->ch_na->na_rx_rings, rx_first, rx_last,
	    entry->nce_ring_entries + entry->nce_tx_rings);
}
3196 
/*
 * Compute (and, when info != NULL, populate) the sysctl record for one
 * nexus: a fixed nexus_channel_info header followed by one
 * variable-length channel entry per open channel.
 *
 * Returns the number of bytes required for the record accumulated so
 * far.  With info == NULL this is a pure size computation.  When a
 * buffer is supplied and the running size exceeds buffer_size, the
 * walk stops early and the partial required size is returned; the
 * caller detects the short buffer by comparing the return value
 * against the space it supplied (see nexus_channel_list_sysctl).
 */
SK_NO_INLINE_ATTRIBUTE
static size_t
nexus_channel_info_populate(struct kern_nexus *nx,
    nexus_channel_info_t info, size_t buffer_size)
{
	struct kern_channel *ch = NULL;
	size_t info_size;
	caddr_t scan = NULL;

	SK_LOCK_ASSERT_HELD();

	info_size = sizeof(*info);

	/* channel list */
	if (info != NULL) {
		if (buffer_size < info_size) {
			return info_size;
		}

		/* instance UUID */
		uuid_copy(info->nci_instance_uuid, nx->nx_uuid);
		info->nci_channel_entries_count = nx->nx_ch_count;
		scan = (caddr_t)info->nci_channel_entries;
	}
	STAILQ_FOREACH(ch, &nx->nx_ch_head, ch_link) {
		size_t          entry_size;
		uint32_t        ring_count;

		/* entry size depends on this channel's total ring count */
		ring_count = channel_ring_count(ch, NR_TX) +
		    channel_ring_count(ch, NR_RX);
		entry_size = NEXUS_CHANNEL_ENTRY_SIZE(ring_count);
		info_size += entry_size;
		if (scan != NULL) {
			if (buffer_size < info_size) {
				return info_size;
			}

			nexus_channel_entry_populate(ch, (void *)scan);
			scan += entry_size;
		}
	}
	return info_size;
}
3240 
/*
 * sysctl handler exporting, for every nexus in the system, the list of
 * channels open on it (record layout built by
 * nexus_channel_info_populate).  NULL oldptr performs a size probe; a
 * buffer too small for the full listing yields ENOMEM.  Root-only.
 */
static int
nexus_channel_list_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2, oidp)
	size_t actual_space;
	caddr_t buffer = NULL;
	size_t buffer_space;
	size_t allocated_space;
	int out_error;
	struct kern_nexus *nx;
	int error = 0;
	caddr_t scan;

	if (!kauth_cred_issuser(kauth_cred_get())) {
		return EPERM;
	}

	net_update_uptime();
	buffer_space = req->oldlen;
	if (req->oldptr != USER_ADDR_NULL && buffer_space != 0) {
		/* cap the kernel-side staging buffer */
		if (buffer_space > SK_SYSCTL_ALLOC_MAX) {
			buffer_space = SK_SYSCTL_ALLOC_MAX;
		}
		allocated_space = buffer_space;
		buffer = sk_alloc_data(allocated_space, Z_WAITOK, skmem_tag_sysctl_buf);
		if (__improbable(buffer == NULL)) {
			return ENOBUFS;
		}
	} else if (req->oldptr == USER_ADDR_NULL) {
		/* size probe: walk the tree but copy nothing */
		buffer_space = 0;
	}
	actual_space = 0;
	scan = buffer;
	SK_LOCK();
	RB_FOREACH(nx, kern_nexus_tree, &nx_head) {
		size_t info_size;

		/* populates when scan != NULL, otherwise just sizes */
		info_size = nexus_channel_info_populate(nx, (void *)scan,
		    buffer_space);
		if (scan != NULL) {
			if (buffer_space < info_size) {
				/* supplied buffer too small, stop copying */
				error = ENOMEM;
				break;
			}
			scan += info_size;
			buffer_space -= info_size;
		}
		actual_space += info_size;
	}
	SK_UNLOCK();

	if (actual_space != 0) {
		out_error = SYSCTL_OUT(req, buffer, actual_space);
		if (out_error != 0) {
			error = out_error;
		}
	}
	if (buffer != NULL) {
		sk_free_data(buffer, allocated_space);
	}

	return error;
}
3305 
/*
 * Shared sysctl handler behind all the _kern_skywalk_stats nodes
 * declared above.  The NXMIB_* subcommand is carried in oid_arg2; a
 * caller may optionally pass a struct nexus_mib_filter through newptr
 * to narrow the result set.
 *
 * Access control:
 *  - NXMIB_USERSTACK_STATS is root-only (like netstat's protocol
 *    stats);
 *  - NXMIB_FLOW from a non-root caller requires a filter with
 *    NXMIB_FILTER_INFO_TUPLE set, so fsw_mib_get_flow only matches a
 *    flow entry for the caller's own 5-tuple.
 *
 * Output follows the usual Skywalk list-sysctl convention: NULL oldptr
 * is a size probe, a too-small buffer returns ENOMEM.
 */
static int
nexus_mib_get_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)
	struct proc *p = req->p;
	struct nexus_mib_filter filter;
	int error = 0;
	size_t actual_space;
	caddr_t buffer = NULL;
	size_t buffer_space;
	size_t allocated_space;
	int out_error;
	struct kern_nexus *nx;
	caddr_t scan;

	/* Restrict protocol stats access to root user only (like netstat). */
	if (oidp->oid_arg2 == NXMIB_USERSTACK_STATS &&
	    !kauth_cred_issuser(kauth_cred_get())) {
		SK_ERR("mib request rejected, EPERM");
		return EPERM;
	}

	if (req->newptr == USER_ADDR_NULL) {
		/*
		 * For flow stats requests, non-root users need to provide a
		 * 5-tuple. Otherwise, we do not grant access.
		 */
		if (oidp->oid_arg2 == NXMIB_FLOW &&
		    !kauth_cred_issuser(kauth_cred_get())) {
			SK_ERR("mib request rejected: tuple not provided");
			return EPERM;
		}
		/* use subcommand for multiple nodes */
		filter.nmf_type = oidp->oid_arg2;
		filter.nmf_bitmap = 0x0;
	} else if (req->newlen != sizeof(struct nexus_mib_filter)) {
		SK_ERR("mis-matching newlen");
		return EINVAL;
	} else {
		/* caller supplied an explicit filter; validate it */
		error = SYSCTL_IN(req, &filter, sizeof(struct nexus_mib_filter));
		if (error != 0) {
			SK_ERR("SYSCTL_IN err %d", error);
			return error;
		}
		if (filter.nmf_type != oidp->oid_arg2) {
			SK_ERR("mis-matching nmf_type");
			return EINVAL;
		}
		/*
		 * For flow stats requests, non-root users need to set the nexus
		 * mib filter to NXMIB_FILTER_INFO_TUPLE. Otherwise, we do not
		 * grant access. This ensures that fsw_mib_get_flow looks for a
		 * flow entry that matches the given tuple of the non-root user.
		 */
		if (filter.nmf_type == NXMIB_FLOW &&
		    (filter.nmf_bitmap & NXMIB_FILTER_INFO_TUPLE) == 0 &&
		    !kauth_cred_issuser(kauth_cred_get())) {
			SK_ERR("mib request rejected: tuple filter not set");
			return EPERM;
		}
	}

	net_update_uptime();
	buffer_space = req->oldlen;
	if (req->oldptr != USER_ADDR_NULL && buffer_space != 0) {
		/* cap the kernel-side staging buffer */
		if (buffer_space > SK_SYSCTL_ALLOC_MAX) {
			buffer_space = SK_SYSCTL_ALLOC_MAX;
		}
		allocated_space = buffer_space;
		buffer = sk_alloc_data(allocated_space, Z_WAITOK, skmem_tag_sysctl_buf);
		if (__improbable(buffer == NULL)) {
			return ENOBUFS;
		}
	} else if (req->oldptr == USER_ADDR_NULL) {
		/* size probe: accumulate sizes but copy nothing */
		buffer_space = 0;
	}
	actual_space = 0;
	scan = buffer;

	SK_LOCK();
	RB_FOREACH(nx, kern_nexus_tree, &nx_head) {
		/* only domains that implement a MIB getter contribute */
		if (NX_DOM_PROV(nx)->nxdom_prov_nx_mib_get == NULL) {
			continue;
		}

		size_t size;
		struct kern_nexus_domain_provider *nx_dp = NX_DOM_PROV(nx);

		/*
		 * Returns the bytes this nexus needs; the provider callback
		 * presumably writes into scan only when it is non-NULL and
		 * large enough — contract defined per domain provider.
		 */
		size = nx_dp->nxdom_prov_nx_mib_get(nx, &filter, scan,
		    buffer_space, p);

		if (scan != NULL) {
			if (buffer_space < size) {
				/* supplied buffer too small, stop copying */
				error = ENOMEM;
				break;
			}
			scan += size;
			buffer_space -= size;
		}
		actual_space += size;
	}
	SK_UNLOCK();

	if (actual_space != 0) {
		out_error = SYSCTL_OUT(req, buffer, actual_space);
		if (out_error != 0) {
			error = out_error;
		}
	}
	if (buffer != NULL) {
		sk_free_data(buffer, allocated_space);
	}

	return error;
}
3422 
3423 void
kern_nexus_walktree(kern_nexus_walktree_f_t * f,void * arg0,boolean_t is_sk_locked)3424 kern_nexus_walktree(kern_nexus_walktree_f_t *f, void *arg0,
3425     boolean_t is_sk_locked)
3426 {
3427 	struct kern_nexus *nx = NULL;
3428 
3429 	if (!is_sk_locked) {
3430 		SK_LOCK();
3431 	} else {
3432 		SK_LOCK_ASSERT_HELD();
3433 	}
3434 
3435 	RB_FOREACH(nx, kern_nexus_tree, &nx_head) {
3436 		(*f)(nx, arg0);
3437 	}
3438 
3439 	if (!is_sk_locked) {
3440 		SK_UNLOCK();
3441 	}
3442 }
3443 
/*
 * Look up the netif nexus identified by nx_uuid and report memory info
 * for its TX and/or RX packet buffer pools.  Either output pointer may
 * be NULL if the caller is not interested in that direction; non-NULL
 * outputs are zeroed before being (possibly) filled.
 *
 * Returns ENOENT if no nexus matches the UUID or it has no pools,
 * ENOTSUP if the nexus is not a netif, or any error from the
 * mem-info lookups.
 */
errno_t
kern_nexus_get_pbufpool_info(const uuid_t nx_uuid,
    struct kern_pbufpool_memory_info *rx_pool_info,
    struct kern_pbufpool_memory_info *tx_pool_info)
{
	struct kern_pbufpool *tpp, *rpp;
	struct kern_nexus *nx;
	errno_t err = 0;

	/* holds a reference on the nexus; dropped via nx_release at done */
	nx = nx_find(nx_uuid, FALSE);
	if (nx == NULL) {
		err = ENOENT;
		goto done;
	}

	/* pbufpool info is only exposed for netif nexuses */
	if (nx->nx_prov->nxprov_params->nxp_type != NEXUS_TYPE_NET_IF) {
		err = ENOTSUP;
		goto done;
	}

	err = nx_netif_prov_nx_mem_info(nx, &tpp, &rpp);
	if (err != 0) {
		goto done;
	}

	if ((tpp == NULL) && (rpp == NULL)) {
		err = ENOENT;
		goto done;
	}

	if (tx_pool_info != NULL) {
		bzero(tx_pool_info, sizeof(*tx_pool_info));
	}
	if (rx_pool_info != NULL) {
		bzero(rx_pool_info, sizeof(*rx_pool_info));
	}

	if ((tx_pool_info != NULL) && (tpp != NULL)) {
		err = kern_pbufpool_get_memory_info(tpp, tx_pool_info);
		if (err != 0) {
			goto done;
		}
	}

	if ((rx_pool_info != NULL) && (rpp != NULL)) {
		err = kern_pbufpool_get_memory_info(rpp, rx_pool_info);
	}

done:
	if (nx != NULL) {
		(void) nx_release(nx);
		nx = NULL;
	}
	return err;
}
3499 
/*
 * Deliver an interface-advisory update notification to every channel
 * on this nexus that registered for them, by posting a
 * CHAN_FILT_HINT_IF_ADV_UPD event on each channel's first TX ring.
 * Only netif and flowswitch nexuses may call this (enforced by the
 * VERIFY below).  The advisory lock is taken with a try-lock so this
 * path never blocks; on contention the update is dropped and counted.
 */
void
nx_interface_advisory_notify(struct kern_nexus *nx)
{
	struct kern_channel *ch;
	struct netif_stats *nifs;
	struct fsw_stats *fsw_stats;
	nexus_type_t nxdom_type = NX_DOM(nx)->nxdom_type;

	/* pick the stats block matching the nexus domain type */
	if (nxdom_type == NEXUS_TYPE_NET_IF) {
		nifs = &NX_NETIF_PRIVATE(nx)->nif_stats;
	} else if (nxdom_type == NEXUS_TYPE_FLOW_SWITCH) {
		fsw_stats = &NX_FSW_PRIVATE(nx)->fsw_stats;
	} else {
		VERIFY(0);
		__builtin_unreachable();
	}
	if (!lck_rw_try_lock_shared(&nx->nx_ch_if_adv_lock)) {
		/* contended: drop this update rather than block */
		if (nxdom_type == NEXUS_TYPE_NET_IF) {
			STATS_INC(nifs, NETIF_STATS_IF_ADV_UPD_DROP);
		} else {
			STATS_INC(fsw_stats, FSW_STATS_IF_ADV_UPD_DROP);
		}
		return;
	}
	/*
	 * if the channel is in "nx_ch_if_adv_head" list, then we can
	 * safely assume that the channel is not closed yet.
	 * In ch_close_common(), the channel is removed from the
	 * "nx_ch_if_adv_head" list holding the "nx_ch_if_adv_lock" in
	 * exclusive mode, prior to closing the channel.
	 */
	STAILQ_FOREACH(ch, &nx->nx_ch_if_adv_head, ch_link_if_adv) {
		struct nexus_adapter *na = ch->ch_na;

		ASSERT(na != NULL);
		na_post_event(&na->na_tx_rings[ch->ch_first[NR_TX]],
		    TRUE, FALSE, FALSE, CHAN_FILT_HINT_IF_ADV_UPD);
		if (nxdom_type == NEXUS_TYPE_NET_IF) {
			STATS_INC(nifs, NETIF_STATS_IF_ADV_UPD_SENT);
		} else {
			STATS_INC(fsw_stats, FSW_STATS_IF_ADV_UPD_SENT);
		}
	}
	lck_rw_done(&nx->nx_ch_if_adv_lock);
}
3545