1 /*
2 * Copyright (c) 2015-2022 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 #include <skywalk/os_skywalk_private.h>
30 #include <skywalk/nexus/netif/nx_netif.h>
31 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
32 #include <sys/sdt.h>
33
/*
 * When non-zero, the nexus-controller ownership check in
 * nxctl_nexus_bind()/nxctl_nexus_config() is bypassed; tunable only
 * on DEVELOPMENT/DEBUG kernels via kern.skywalk.disable_nxctl_check.
 */
static uint32_t disable_nxctl_check = 0;
#if (DEVELOPMENT || DEBUG)
SYSCTL_UINT(_kern_skywalk, OID_AUTO, disable_nxctl_check,
    CTLFLAG_RW | CTLFLAG_LOCKED, &disable_nxctl_check, 0, "");
#endif

/* lock groups/attributes shared by nexus, mbuf-queue and packet-queue locks */
LCK_GRP_DECLARE(nexus_lock_group, "sk_nx_lock");
LCK_GRP_DECLARE(nexus_mbq_lock_group, "sk_nx_mbq_lock");
LCK_GRP_DECLARE(nexus_pktq_lock_group, "sk_nx_pktq_lock");
LCK_ATTR_DECLARE(nexus_lock_attr, 0, 0);

/* global lists of nexus controllers and providers; protected by SK_LOCK */
static STAILQ_HEAD(, nxctl) nxctl_head =
    STAILQ_HEAD_INITIALIZER(nxctl_head);
static STAILQ_HEAD(, kern_nexus_provider) nxprov_head =
    STAILQ_HEAD_INITIALIZER(nxprov_head);

/*
 * Red-black tree of all nexus instances; lookups are performed by
 * nexus UUID (see RB_FIND usage in nxctl_get_channel_list()).
 */
static int nx_cmp(const struct kern_nexus *, const struct kern_nexus *);
RB_HEAD(kern_nexus_tree, kern_nexus);
RB_PROTOTYPE_SC(static, kern_nexus_tree, kern_nexus, nx_link, nx_cmp);
RB_GENERATE(kern_nexus_tree, kern_nexus, nx_link, nx_cmp);
static struct kern_nexus_tree nx_head;

/* nexus controller (nxctl) operations */
static int nxctl_get_nexus_prov_list(struct nxctl *, struct sockopt *);
static int nxctl_get_nexus_prov_entry(struct nxctl *, struct sockopt *);
static int nxctl_get_nexus_list(struct nxctl *, struct sockopt *);
static int nxctl_nexus_bind(struct nxctl *, struct sockopt *);
static int nxctl_nexus_unbind(struct nxctl *, struct sockopt *);
static int nxctl_nexus_config(struct nxctl *, struct sockopt *);
static int nxctl_get_channel_list(struct nxctl *, struct sockopt *);
static void nxctl_retain_locked(struct nxctl *);
static int nxctl_release_locked(struct nxctl *);
static void nxctl_init(struct nxctl *, struct proc *, struct fileproc *);
static struct nxctl *nxctl_alloc(struct proc *, struct fileproc *, zalloc_flags_t);
static void nxctl_free(struct nxctl *);

/* nexus provider (nxprov) lifecycle */
static struct kern_nexus_provider *nxprov_create_common(struct nxctl *,
    struct kern_nexus_domain_provider *, struct nxprov_reg *,
    const struct kern_nexus_provider_init *init, int *);
static void nxprov_detach(struct kern_nexus_provider *, boolean_t);
static void nxprov_retain_locked(struct kern_nexus_provider *);
static int nxprov_release_locked(struct kern_nexus_provider *);
static struct kern_nexus_provider *nxprov_alloc(
	struct kern_nexus_domain_provider *, zalloc_flags_t);
static void nxprov_free(struct kern_nexus_provider *);

/* nexus instance ring/slot setup and teardown */
static int nx_init_rings(struct kern_nexus *, struct kern_channel *);
static void nx_fini_rings(struct kern_nexus *, struct kern_channel *);
static int nx_init_slots(struct kern_nexus *, struct __kern_channel_ring *);
static void nx_fini_slots(struct kern_nexus *, struct __kern_channel_ring *);
static struct kern_nexus *nx_alloc(zalloc_flags_t);
static void nx_free(struct kern_nexus *);

/* zones backing the fixed-size nexus objects */
static SKMEM_TYPE_DEFINE(nxctl_zone, struct nxctl);

static SKMEM_TYPE_DEFINE(nxbind_zone, struct nxbind);

static SKMEM_TYPE_DEFINE(nxprov_zone, struct kern_nexus_provider);

static SKMEM_TYPE_DEFINE(nxprov_params_zone, struct nxprov_params);

static SKMEM_TYPE_DEFINE(nx_zone, struct kern_nexus);

/* set once nexus_init() completes; cleared by nexus_fini() */
static int __nx_inited = 0;

/* allocation tags for variable-size nexus data (keys, MIB, ports) */
#define SKMEM_TAG_NX_KEY "com.apple.skywalk.nexus.key"
SKMEM_TAG_DEFINE(skmem_tag_nx_key, SKMEM_TAG_NX_KEY);

#define SKMEM_TAG_NX_MIB "com.apple.skywalk.nexus.mib"
static SKMEM_TAG_DEFINE(skmem_tag_nx_mib, SKMEM_TAG_NX_MIB);

#define SKMEM_TAG_NX_PORT "com.apple.skywalk.nexus.port"
SKMEM_TAG_DEFINE(skmem_tag_nx_port, SKMEM_TAG_NX_PORT);

#define SKMEM_TAG_NX_PORT_INFO "com.apple.skywalk.nexus.port.info"
SKMEM_TAG_DEFINE(skmem_tag_nx_port_info, SKMEM_TAG_NX_PORT_INFO);

/*
 * Special nexus controller handle for Skywalk internal use. Unlike all
 * other nexus controller handles that are created by userland or kernel
 * clients, this one never gets closed or freed. It is also not part of
 * the global nxctl_head list.
 */
static struct nxctl _kernnxctl;
static struct nxctl _usernxctl;
struct nexus_controller kernnxctl = { .ncd_nxctl = &_kernnxctl };
struct nexus_controller usernxctl = { .ncd_nxctl = &_usernxctl };

/*
 * -fbounds-safety: For static functions where additional size variables are
 * added, we need to mark them __unused if this file is being built without
 * -fbounds-safety.
 */
#if !__has_ptrcheck
#define NX_FB_ARG __unused
#else
#define NX_FB_ARG
#endif
131
/*
 * One-time initialization of the nexus subsystem: sets up the nexus
 * instance tree, the nexus adapter layer, the built-in domains, and
 * the two special (kernel/user) controller handles.  Must be called
 * with SK_LOCK held, exactly once (asserted via __nx_inited).
 * Always returns 0.
 */
int
nexus_init(void)
{
	SK_LOCK_ASSERT_HELD();
	ASSERT(!__nx_inited);

	RB_INIT(&nx_head);

	/* bring up the nexus adapter layer before any domain attaches */
	na_init();

	/* attach system built-in domains and domain providers */
	nxdom_attach_all();

	/*
	 * Initialize private kernel and shared user nexus controller handle;
	 *
	 * Shared Kernel controller is used internally for creating nexus providers
	 * and nexus instances from within the Skywalk code (e.g. netif_compat).
	 *
	 * Shared User controller is used userspace by clients(e.g. libnetcore)
	 * that would like to call nexus instances for use cases like
	 * configuring flow entry that they own indirectly (e.g. via NECP), so
	 * that the nexus would perform permission check based on other info
	 * (e.g. PID, UUID) and bypass nxctl check (this nxctl has no
	 * credentials).
	 */
	nxctl_init(&_kernnxctl, kernproc, NULL);
	nxctl_retain_locked(&_kernnxctl); /* one for us */
	nxctl_init(&_usernxctl, kernproc, NULL);
	nxctl_retain_locked(&_usernxctl); /* one for us */
	nxctl_traffic_rule_init();

	__nx_inited = 1;

	return 0;
}
168
169 void
nexus_fini(void)170 nexus_fini(void)
171 {
172 SK_LOCK_ASSERT_HELD();
173
174 if (__nx_inited) {
175 nxctl_traffic_rule_fini();
176 nxctl_release_locked(&_kernnxctl);
177 nxctl_release_locked(&_usernxctl);
178
179 /* tell all domains they're going away */
180 nxdom_detach_all();
181
182 ASSERT(RB_EMPTY(&nx_head));
183
184 na_fini();
185
186 __nx_inited = 0;
187 }
188 }
189
/*
 * Allocate a new nexus controller for process `p', stamp it with
 * `nxctl_uuid', and insert it on the global nxctl_head list.  On
 * success the controller is returned holding two references: one for
 * its list membership and one for the caller.
 *
 * NOTE(review): `*err' is only read here (at the bottom), never
 * written by this function — the outcome depends on the value the
 * caller stored in it before calling.  Confirm that all call sites
 * initialize *err to 0 (or intend to feed a prior error in).
 */
struct nxctl *
nxctl_create(struct proc *p, struct fileproc *fp, const uuid_t nxctl_uuid,
    int *err)
{
	struct nxctl *nxctl = NULL;

	ASSERT(!uuid_is_null(nxctl_uuid));

	/* privilege checks would be done when performing nxctl operations */

	SK_LOCK();

	nxctl = nxctl_alloc(p, fp, Z_WAITOK);

	STAILQ_INSERT_TAIL(&nxctl_head, nxctl, nxctl_link);
	nxctl->nxctl_flags |= NEXUSCTLF_ATTACHED;
	uuid_copy(nxctl->nxctl_uuid, nxctl_uuid);

	nxctl_retain_locked(nxctl); /* one for being in the list */
	nxctl_retain_locked(nxctl); /* one for the caller */

#if SK_LOG
	uuid_string_t uuidstr;
	SK_D("nxctl 0x%llx UUID %s", SK_KVA(nxctl),
	    sk_uuid_unparse(nxctl->nxctl_uuid, uuidstr));
#endif /* SK_LOG */

	SK_UNLOCK();

	if (*err != 0) {
		nxctl_free(nxctl);
		nxctl = NULL;
	}
	return nxctl;
}
225
/*
 * Close a nexus controller: sever its file-descriptor linkage, detach
 * it from the global list, and close every nexus provider that it
 * owns.  May be invoked as part of failure cleanup, so partially
 * initialized controllers are tolerated.
 *
 * Lock ordering: nxctl_lock is taken before SK_LOCK.  The special
 * kernel controller (_kernnxctl) is never closed via this path.
 */
void
nxctl_close(struct nxctl *nxctl)
{
	struct kern_nexus_provider *nxprov = NULL, *tnxprov;

	lck_mtx_lock(&nxctl->nxctl_lock);
	SK_LOCK();

	ASSERT(!(nxctl->nxctl_flags & NEXUSCTLF_KERNEL));

#if SK_LOG
	uuid_string_t uuidstr;
	SK_D("nxctl 0x%llx UUID %s flags 0x%b", SK_KVA(nxctl),
	    sk_uuid_unparse(nxctl->nxctl_uuid, uuidstr),
	    nxctl->nxctl_flags, NEXUSCTLF_BITS);
#endif /* SK_LOG */

	/* drop the fileproc linkage exactly once */
	if (!(nxctl->nxctl_flags & NEXUSCTLF_NOFDREF)) {
		nxctl->nxctl_flags |= NEXUSCTLF_NOFDREF;
		nxctl->nxctl_fp = NULL;
	}

	/* may be called as part of failure cleanup, so check */
	if (nxctl->nxctl_flags & NEXUSCTLF_ATTACHED) {
		/* caller must hold an extra ref */
		ASSERT(nxctl->nxctl_refcnt > 1);
		(void) nxctl_release_locked(nxctl);

		STAILQ_REMOVE(&nxctl_head, nxctl, nxctl, nxctl_link);
		nxctl->nxctl_flags &= ~NEXUSCTLF_ATTACHED;
	}

repeat:
	STAILQ_FOREACH_SAFE(nxprov, &nxprov_head, nxprov_link, tnxprov) {
		/*
		 * Close provider only for those which are owned by
		 * this control instance. Note that if we close the
		 * provider, we need to repeat this search as the
		 * list might have been changed by another thread.
		 * That's possible since SK_UNLOCK() may be called
		 * as a result of calling nxprov_close().
		 */
		if (!(nxprov->nxprov_flags & NXPROVF_CLOSED) &&
		    nxprov->nxprov_ctl == nxctl) {
			/* hold a ref across the close, which may drop SK_LOCK */
			nxprov_retain_locked(nxprov);
			(void) nxprov_close(nxprov, TRUE);
			(void) nxprov_release_locked(nxprov);
			goto repeat;
		}
	}

	SK_UNLOCK();
	lck_mtx_unlock(&nxctl->nxctl_lock);
	nxctl_traffic_rule_clean(nxctl);
}
281
282 int
nxctl_set_opt(struct nxctl * nxctl,struct sockopt * sopt)283 nxctl_set_opt(struct nxctl *nxctl, struct sockopt *sopt)
284 {
285 #pragma unused(nxctl)
286 int err = 0;
287
288 NXCTL_LOCK_ASSERT_HELD(nxctl);
289
290 if (sopt->sopt_dir != SOPT_SET) {
291 sopt->sopt_dir = SOPT_SET;
292 }
293
294 switch (sopt->sopt_name) {
295 case NXOPT_NEXUS_BIND:
296 err = nxctl_nexus_bind(nxctl, sopt);
297 break;
298
299 case NXOPT_NEXUS_UNBIND:
300 err = nxctl_nexus_unbind(nxctl, sopt);
301 break;
302
303 case NXOPT_NEXUS_CONFIG:
304 err = nxctl_nexus_config(nxctl, sopt);
305 break;
306
307 default:
308 err = ENOPROTOOPT;
309 break;
310 }
311
312 return err;
313 }
314
315 int
nxctl_get_opt(struct nxctl * nxctl,struct sockopt * sopt)316 nxctl_get_opt(struct nxctl *nxctl, struct sockopt *sopt)
317 {
318 #pragma unused(nxctl)
319 int err = 0;
320
321 NXCTL_LOCK_ASSERT_HELD(nxctl);
322
323 if (sopt->sopt_dir != SOPT_GET) {
324 sopt->sopt_dir = SOPT_GET;
325 }
326
327 switch (sopt->sopt_name) {
328 case NXOPT_NEXUS_PROV_LIST:
329 err = nxctl_get_nexus_prov_list(nxctl, sopt);
330 break;
331
332 case NXOPT_NEXUS_PROV_ENTRY:
333 err = nxctl_get_nexus_prov_entry(nxctl, sopt);
334 break;
335
336 case NXOPT_NEXUS_LIST:
337 err = nxctl_get_nexus_list(nxctl, sopt);
338 break;
339
340 case NXOPT_CHANNEL_LIST:
341 err = nxctl_get_channel_list(nxctl, sopt);
342 break;
343
344 default:
345 err = ENOPROTOOPT;
346 break;
347 }
348
349 return err;
350 }
351
/* Upper bound on # of nrl_num_regs that we'd return to user space */
#define MAX_NUM_REG_ENTRIES 256

/* Hoisted out of line to reduce kernel stack footprint */
SK_NO_INLINE_ATTRIBUTE
static int
nxctl_get_nexus_prov_list(struct nxctl *nxctl, struct sockopt *sopt)
{
	user_addr_t tmp_ptr = USER_ADDR_NULL;
	struct nxprov_reg_ent *pnre, *nres = NULL;
	struct nxprov_list_req nrlr;
	struct kern_nexus_provider *nxprov = NULL;
	uint32_t nregs = 0, ncregs = 0;
	int err = 0, observeall;
	size_t nres_sz;

	NXCTL_LOCK_ASSERT_HELD(nxctl);

	ASSERT(sopt->sopt_p != NULL);
	if (sopt->sopt_val == USER_ADDR_NULL) {
		return EINVAL;
	}

	err = sooptcopyin(sopt, &nrlr, sizeof(nrlr), sizeof(nrlr));
	if (err != 0) {
		return err;
	}

	/* clamp the request to a sane upper bound */
	if ((size_t)nrlr.nrl_num_regs > MAX_NUM_REG_ENTRIES) {
		nrlr.nrl_num_regs = MAX_NUM_REG_ENTRIES;
	}

	/*
	 * If the caller specified a buffer, copy out the Nexus provider
	 * entries to caller gracefully. We only copy out the number of
	 * entries which caller has asked for, but we always tell caller
	 * how big the buffer really needs to be.
	 *
	 * NOTE(review): unlike nxctl_get_nexus_list(), ncregs below is
	 * only incremented while copy space remains, so a caller that
	 * passes no buffer (or too small a buffer) is not told the true
	 * total and a buffer-less size probe yields ENOENT — confirm
	 * whether this matches the intent stated above.
	 */
	tmp_ptr = nrlr.nrl_regs;
	if (tmp_ptr != USER_ADDR_NULL && nrlr.nrl_num_regs > 0) {
		nres_sz = (size_t)nrlr.nrl_num_regs * sizeof(*nres);
		nres = sk_alloc_data(nres_sz, Z_WAITOK, skmem_tag_sysctl_buf);
		if (__improbable(nres == NULL)) {
			return ENOBUFS;
		}
	}

	/* cache the observe-all privilege check before taking SK_LOCK */
	observeall = (skywalk_priv_check_cred(sopt->sopt_p, nxctl->nxctl_cred,
	    PRIV_SKYWALK_OBSERVE_ALL) == 0);

	SK_LOCK();
	/*
	 * Count number of providers.  If buffer space exists and
	 * remains, copy out provider entries.
	 */
	nregs = nrlr.nrl_num_regs;
	pnre = nres;

	STAILQ_FOREACH(nxprov, &nxprov_head, nxprov_link) {
		/*
		 * Return only entries that are visible to the caller,
		 * unless it has PRIV_SKYWALK_OBSERVE_ALL.
		 */
		if (nxprov->nxprov_ctl != nxctl && !observeall) {
			continue;
		}

		if (nres != NULL && nregs > 0) {
			uuid_copy(pnre->npre_prov_uuid, nxprov->nxprov_uuid);
			bcopy(nxprov->nxprov_params, &pnre->npre_prov_params,
			    sizeof(struct nxprov_params));
			--nregs;
			++pnre;
			++ncregs;
		}
	}
	SK_UNLOCK();

	if (ncregs == 0) {
		err = ENOENT;
	}

	if (nres != NULL) {
		if (err == 0 && tmp_ptr != USER_ADDR_NULL) {
			if (sopt->sopt_p != kernproc) {
				err = copyout(nres, tmp_ptr,
				    ncregs * sizeof(*nres));
			} else {
				/* kernel caller: tmp_ptr is a kernel address */
				caddr_t tmp;
				tmp = __unsafe_forge_bidi_indexable(caddr_t,
				    CAST_DOWN(caddr_t, tmp_ptr),
				    ncregs * sizeof(*nres));
				bcopy(nres, tmp, ncregs * sizeof(*nres));
			}
		}
		sk_free_data(nres, nres_sz);
		nres = NULL;
	}

	if (err == 0) {
		nrlr.nrl_num_regs = ncregs;
		err = sooptcopyout(sopt, &nrlr, sizeof(nrlr));
	}

	return err;
}
458
/* Hoisted out of line to reduce kernel stack footprint */
SK_NO_INLINE_ATTRIBUTE
static int
nxctl_get_nexus_prov_entry(struct nxctl *nxctl, struct sockopt *sopt)
{
	struct nxprov_reg_ent nre;
	struct kern_nexus_provider *nxprov = NULL;
	int err = 0;

	NXCTL_LOCK_ASSERT_HELD(nxctl);

	ASSERT(sopt->sopt_p != NULL);
	if (sopt->sopt_val == USER_ADDR_NULL) {
		return EINVAL;
	}

	/* copy in the fixed-size request keyed by provider UUID */
	bzero(&nre, sizeof(nre));
	err = sooptcopyin(sopt, &nre, sizeof(nre), sizeof(nre));
	if (err != 0) {
		return err;
	}

	if (uuid_is_null(nre.npre_prov_uuid)) {
		return EINVAL;
	}

	/* linear scan of the global provider list under SK_LOCK */
	SK_LOCK();
	STAILQ_FOREACH(nxprov, &nxprov_head, nxprov_link) {
		if (uuid_compare(nxprov->nxprov_uuid,
		    nre.npre_prov_uuid) == 0) {
			/*
			 * Return only entries that are visible to the caller,
			 * unless it has PRIV_SKYWALK_OBSERVE_ALL.
			 */
			if (nxprov->nxprov_ctl != nxctl) {
				if (skywalk_priv_check_cred(sopt->sopt_p,
				    nxctl->nxctl_cred,
				    PRIV_SKYWALK_OBSERVE_ALL) != 0) {
					/* hide the entry: report not-found */
					nxprov = NULL;
					break;
				}
			}

			bcopy(nxprov->nxprov_params, &nre.npre_prov_params,
			    sizeof(struct nxprov_params));
			break;
		}
	}
	SK_UNLOCK();

	/* nxprov != NULL means a visible match was found and copied */
	if (nxprov != NULL) {
		err = sooptcopyout(sopt, &nre, sizeof(nre));
	} else {
		err = ENOENT;
	}

	return err;
}
517
/* Upper bound on # of nl_num_nx_uuids that we'd return to user space */
#define MAX_NUM_NX_UUIDS 4096

/* Hoisted out of line to reduce kernel stack footprint */
SK_NO_INLINE_ATTRIBUTE
static int
nxctl_get_nexus_list(struct nxctl *nxctl, struct sockopt *sopt)
{
	user_addr_t tmp_ptr = USER_ADDR_NULL;
	uint32_t nuuids = 0, ncuuids = 0;
	uuid_t *puuid, *uuids = NULL;
	size_t uuids_sz;
	struct nx_list_req nlr;
	struct kern_nexus_provider *nxprov = NULL;
	struct kern_nexus *nx = NULL;
	int err = 0, observeall;

	NXCTL_LOCK_ASSERT_HELD(nxctl);

	ASSERT(sopt->sopt_p != NULL);
	if (sopt->sopt_val == USER_ADDR_NULL) {
		return EINVAL;
	}

	err = sooptcopyin(sopt, &nlr, sizeof(nlr), sizeof(nlr));
	if (err != 0) {
		return err;
	}

	/* a provider UUID is mandatory; clamp the requested count */
	if (uuid_is_null(nlr.nl_prov_uuid)) {
		return EINVAL;
	} else if ((size_t)nlr.nl_num_nx_uuids > MAX_NUM_NX_UUIDS) {
		nlr.nl_num_nx_uuids = MAX_NUM_NX_UUIDS;
	}

	/*
	 * If the caller specified a buffer, copy out the Nexus UUIDs to
	 * caller gracefully. We only copy out the number of UUIDs which
	 * caller has asked for, but we always tell caller how big the
	 * buffer really needs to be.
	 */
	tmp_ptr = nlr.nl_nx_uuids;
	if (tmp_ptr != USER_ADDR_NULL && nlr.nl_num_nx_uuids > 0) {
		uuids_sz = (size_t)nlr.nl_num_nx_uuids * sizeof(uuid_t);
		uuids = sk_alloc_data(uuids_sz, Z_WAITOK, skmem_tag_sysctl_buf);
		if (__improbable(uuids == NULL)) {
			return ENOBUFS;
		}
	}

	/* cache the observe-all privilege check before taking SK_LOCK */
	observeall = (skywalk_priv_check_cred(sopt->sopt_p, nxctl->nxctl_cred,
	    PRIV_SKYWALK_OBSERVE_ALL) == 0);

	SK_LOCK();
	/* locate the (visible) provider that owns the requested UUID */
	STAILQ_FOREACH(nxprov, &nxprov_head, nxprov_link) {
		/*
		 * Return only entries that are visible to the caller,
		 * unless it has PRIV_SKYWALK_OBSERVE_ALL.
		 */
		if (nxprov->nxprov_ctl != nxctl && !observeall) {
			continue;
		}

		if (uuid_compare(nxprov->nxprov_uuid, nlr.nl_prov_uuid) == 0) {
			break;
		}
	}

	if (nxprov != NULL) {
		/*
		 * Count number of Nexus.  If buffer space exists
		 * and remains, copy out the Nexus UUIDs.  ncuuids
		 * always reflects the full count, regardless of the
		 * caller-supplied buffer size.
		 */
		nuuids = nlr.nl_num_nx_uuids;
		puuid = uuids;

		STAILQ_FOREACH(nx, &nxprov->nxprov_nx_head, nx_prov_link) {
			++ncuuids;
			if (uuids != NULL && nuuids > 0) {
				uuid_copy(*puuid, nx->nx_uuid);
				--nuuids;
				++puuid;
			}
		}
	} else {
		err = ENOENT;
	}
	SK_UNLOCK();

	if (uuids != NULL) {
		if (err == 0 && nxprov != NULL && tmp_ptr != USER_ADDR_NULL) {
			uintptr_t cnt_uuid;

			/* Note: Pointer arithmetic */
			cnt_uuid = (uintptr_t)(puuid - uuids);
			if (cnt_uuid > 0) {
				if (sopt->sopt_p != kernproc) {
					err = copyout(uuids, tmp_ptr,
					    cnt_uuid * sizeof(uuid_t));
				} else {
					/* kernel caller: direct copy */
					caddr_t tmp;
					tmp = __unsafe_forge_bidi_indexable(caddr_t,
					    CAST_DOWN(caddr_t, tmp_ptr),
					    cnt_uuid * sizeof(uuid_t));
					bcopy(uuids, tmp,
					    cnt_uuid * sizeof(uuid_t));
				}
			}
		}
		sk_free_data(uuids, uuids_sz);
		uuids = NULL;
	}

	if (err == 0) {
		/* report the true total so callers can size a retry */
		nlr.nl_num_nx_uuids = ncuuids;
		err = sooptcopyout(sopt, &nlr, sizeof(nlr));
	}

	return err;
}
638
/*
 * Bind client credentials (PID, executable UUID, and/or a secret key)
 * to a port of the nexus identified by nb_nx_uuid.  On success the
 * assigned port is copied back to the caller.  The nxbind instance
 * built here may have its contents (including the key) moved into the
 * nexus by nxdom_bind_port(); cleanup below accounts for that.
 */
/* Hoisted out of line to reduce kernel stack footprint */
SK_NO_INLINE_ATTRIBUTE
static int
nxctl_nexus_bind(struct nxctl *nxctl, struct sockopt *sopt)
{
	boolean_t m_pid, m_exec_uuid, m_key;
	struct nx_bind_req nbr;
	struct proc *p = PROC_NULL;
	struct nxbind *nxb = NULL;
	uint64_t p_uniqueid = -1;
	pid_t p_pid = -1;
	struct kern_nexus *nx = NULL;
#if SK_LOG
	uuid_string_t exec_uuidstr;
#endif /* SK_LOG */
	uuid_t p_uuid;
	void *key = NULL;
	int err = 0;

	NXCTL_LOCK_ASSERT_HELD(nxctl);

	if (sopt->sopt_val == USER_ADDR_NULL) {
		return EINVAL;
	}

	uuid_clear(p_uuid);
	bzero(&nbr, sizeof(nbr));
	err = sooptcopyin(sopt, &nbr, sizeof(nbr), sizeof(nbr));
	if (err != 0) {
		return err;
	}

	if (uuid_is_null(nbr.nb_nx_uuid)) {
		err = EINVAL;
		goto done_unlocked;
	}

	nbr.nb_flags &= NBR_MATCH_MASK;
	if (nbr.nb_flags == 0) {
		/* must choose one of the match criteria */
		err = EINVAL;
		goto done_unlocked;
	}
	m_pid = !!(nbr.nb_flags & NBR_MATCH_PID);
	m_exec_uuid = !!(nbr.nb_flags & NBR_MATCH_EXEC_UUID);
	m_key = !!(nbr.nb_flags & NBR_MATCH_KEY);

	if (m_pid || m_exec_uuid) {
		/*
		 * Validate process ID. A valid PID is needed when we're
		 * asked to match by PID, or if asked to match by executable
		 * UUID with a NULL nb_exec_uuid supplied. The latter is
		 * to support the case when a userland Nexus provider isn't
		 * able to acquire its client's executable UUID, but is
		 * able to identify it via PID.
		 */
		if ((m_pid || uuid_is_null(nbr.nb_exec_uuid)) &&
		    (p = proc_find(nbr.nb_pid)) == PROC_NULL) {
			err = ESRCH;
			goto done_unlocked;
		}
		/* exclude kernel from the match criteria */
		if (p == kernproc) {
			err = EACCES;
			goto done_unlocked;
		} else if (p != PROC_NULL) {
			/* derive identity from the found process */
			proc_getexecutableuuid(p, p_uuid, sizeof(p_uuid));
			p_uniqueid = proc_uniqueid(p);
			p_pid = proc_pid(p);
		} else {
			/* caller supplied the executable UUID directly */
			uuid_copy(p_uuid, nbr.nb_exec_uuid);
		}
	}

	if (m_key) {
		/* key length must be sane and the key must be readable */
		if (nbr.nb_key_len == 0 || nbr.nb_key_len > NEXUS_MAX_KEY_LEN ||
		    nbr.nb_key == USER_ADDR_NULL) {
			err = EINVAL;
			goto done_unlocked;
		}

		key = sk_alloc_data(nbr.nb_key_len, Z_WAITOK, skmem_tag_nx_key);
		if (__improbable(key == NULL)) {
			err = ENOMEM;
			goto done_unlocked;
		}

		if (sopt->sopt_p != kernproc) {
			err = copyin(nbr.nb_key, key, nbr.nb_key_len);
			if (err != 0) {
				goto done_unlocked;
			}
		} else {
			/*
			 * -fbounds-safety: nbr.nb_key is user_addr_t. Changing
			 * it to a pointer type is risky, so we just forge it
			 * here instead.
			 */
			void *nb_key = __unsafe_forge_bidi_indexable(void *,
			    nbr.nb_key, nbr.nb_key_len);
			bcopy(nb_key, key, nbr.nb_key_len);
		}
	}

	SK_LOCK();
	/* nx_find() returns the nexus with a reference held */
	nx = nx_find(nbr.nb_nx_uuid, TRUE);
	if (nx == NULL || (disable_nxctl_check == 0 &&
	    nx->nx_prov->nxprov_ctl != nxctl &&
	    nxctl != &_kernnxctl)) {        /* make exception for kernnxctl */
		err = ENOENT;
		goto done;
	}

	/* bind isn't applicable on anonymous nexus provider */
	if (NX_ANONYMOUS_PROV(nx)) {
		err = ENXIO;
		goto done;
	}

	/* port must be within the domain's range */
	if (nbr.nb_port != NEXUS_PORT_ANY &&
	    nbr.nb_port >= NXDOM_MAX(NX_DOM(nx), ports)) {
		err = EDOM;
		goto done;
	} else if (nbr.nb_port == NEXUS_PORT_ANY) {
		/* for now, this is allowed only for kernel clients */
		if (sopt->sopt_p != kernproc) {
			err = EPERM;
			goto done;
		}
	}

	nxb = nxb_alloc(Z_WAITOK);

	if (m_pid) {
		nxb->nxb_flags |= NXBF_MATCH_UNIQUEID;
		nxb->nxb_uniqueid = p_uniqueid;
		nxb->nxb_pid = p_pid;
	}
	if (m_exec_uuid) {
		nxb->nxb_flags |= NXBF_MATCH_EXEC_UUID;
		ASSERT(!uuid_is_null(p_uuid));
		uuid_copy(nxb->nxb_exec_uuid, p_uuid);
	}
	if (m_key) {
		nxb->nxb_flags |= NXBF_MATCH_KEY;
		ASSERT(key != NULL);
		ASSERT(nbr.nb_key_len != 0 &&
		    nbr.nb_key_len <= NEXUS_MAX_KEY_LEN);
		/*
		 * -fbounds-safety: since nxb_key is __sized_by(nxb_key_len),
		 * its assignment needs to be done side-by-side to nxb_key_len.
		 */
		nxb->nxb_key = key;
		key = NULL;             /* let nxb_free() free it */
		nxb->nxb_key_len = nbr.nb_key_len;
	}

	/*
	 * Bind the creds to the nexus port. If client doesn't have a port,
	 * find one, claim it, and associate the creds to it. Upon success,
	 * the nexus may move the nxbind contents (including the key) to
	 * its own nxbind instance; in that case, nxb_free() below will not
	 * be freeing the key within.
	 */
	err = NX_DOM(nx)->nxdom_bind_port(nx, &nbr.nb_port, nxb, NULL);
	if (err != 0) {
		goto done;
	}

	/* return the (possibly newly assigned) port to the caller */
	ASSERT(nbr.nb_port != NEXUS_PORT_ANY);
	(void) sooptcopyout(sopt, &nbr, sizeof(nbr));

	SK_D("nexus 0x%llx nxb 0x%llx port %u flags 0x%b pid %d "
	    "(uniqueid %llu) exec_uuid %s key 0x%llx key_len %u",
	    SK_KVA(nx), SK_KVA(nxb), nbr.nb_port, nxb->nxb_flags,
	    NXBF_BITS, nxb->nxb_pid, nxb->nxb_uniqueid,
	    sk_uuid_unparse(nxb->nxb_exec_uuid, exec_uuidstr),
	    (nxb->nxb_key != NULL) ? SK_KVA(nxb->nxb_key) : 0,
	    nxb->nxb_key_len);

done:
	if (nx != NULL) {
		(void) nx_release_locked(nx);
		nx = NULL;
	}
	SK_UNLOCK();

done_unlocked:
	ASSERT(nx == NULL);

	/* free whatever ownership was not transferred to the nexus */
	if (nxb != NULL) {
		nxb_free(nxb);
		nxb = NULL;
	}
	if (key != NULL) {
		sk_free_data(key, nbr.nb_key_len);
		key = NULL;
	}
	if (p != PROC_NULL) {
		proc_rele(p);
	}

	return err;
}
844
845 /* Hoisted out of line to reduce kernel stack footprint */
846 SK_NO_INLINE_ATTRIBUTE
847 static int
nxctl_nexus_unbind(struct nxctl * nxctl,struct sockopt * sopt)848 nxctl_nexus_unbind(struct nxctl *nxctl, struct sockopt *sopt)
849 {
850 struct nx_unbind_req nur;
851 struct kern_nexus *nx = NULL;
852 int err = 0;
853
854 NXCTL_LOCK_ASSERT_HELD(nxctl);
855
856 if (sopt->sopt_val == USER_ADDR_NULL) {
857 return EINVAL;
858 }
859
860 bzero(&nur, sizeof(nur));
861 err = sooptcopyin(sopt, &nur, sizeof(nur), sizeof(nur));
862 if (err != 0) {
863 return err;
864 }
865
866 if (uuid_is_null(nur.nu_nx_uuid)) {
867 return EINVAL;
868 }
869
870 SK_LOCK();
871 nx = nx_find(nur.nu_nx_uuid, TRUE);
872 if (nx == NULL || (nx->nx_prov->nxprov_ctl != nxctl &&
873 nxctl != &_kernnxctl)) { /* make exception for kernnxctl */
874 err = ENOENT;
875 goto done;
876 }
877
878 /* unbind isn't applicable on anonymous nexus provider */
879 if (NX_ANONYMOUS_PROV(nx)) {
880 err = ENXIO;
881 goto done;
882 }
883
884 if (nur.nu_port == NEXUS_PORT_ANY) {
885 err = EINVAL;
886 goto done;
887 }
888
889 err = NX_DOM(nx)->nxdom_unbind_port(nx, nur.nu_port);
890
891 done:
892 if (nx != NULL) {
893 (void) nx_release_locked(nx);
894 nx = NULL;
895 }
896 SK_UNLOCK();
897
898 return err;
899 }
900
/*
 * Forward a configuration request to the domain provider of the nexus
 * identified by nc_nx_uuid.  The kernel and shared-user controllers
 * are always allowed; otherwise the caller must own the nexus (unless
 * disable_nxctl_check is set on DEVELOPMENT/DEBUG kernels).
 */
/* Hoisted out of line to reduce kernel stack footprint */
SK_NO_INLINE_ATTRIBUTE
static int
nxctl_nexus_config(struct nxctl *nxctl, struct sockopt *sopt)
{
	struct kern_nexus *nx = NULL;
	struct nx_cfg_req ncr;
	int err = 0;

	NXCTL_LOCK_ASSERT_HELD(nxctl);

	if (sopt->sopt_val == USER_ADDR_NULL) {
		return EINVAL;
	}

	/* fixed-size copyin of the config request */
	bzero(&ncr, sizeof(ncr));
	err = sooptcopyin(sopt, &ncr, sizeof(ncr), sizeof(ncr));
	if (err != 0) {
		return err;
	}

	if (uuid_is_null(ncr.nc_nx_uuid)) {
		return EINVAL;
	}

	SK_LOCK();
	/* nx_find() returns the nexus with a reference held */
	nx = nx_find(ncr.nc_nx_uuid, TRUE);
	if (nx == NULL || (disable_nxctl_check == 0 &&
	    nx->nx_prov->nxprov_ctl != nxctl &&
	    nxctl != &_kernnxctl &&     /* allow kernel/shared user nxctl */
	    nxctl != &_usernxctl)) {
		err = ENOENT;
	goto done;
	}

	/* domains without a config hook reject all requests */
	if (NX_DOM_PROV(nx)->nxdom_prov_config != NULL) {
		err = NX_DOM_PROV(nx)->nxdom_prov_config(NX_DOM_PROV(nx),
		    nx, &ncr, sopt->sopt_dir, sopt->sopt_p, nxctl->nxctl_cred);
	} else {
		err = EPERM;
	}

	if (err == 0) {
		/* return any results the provider wrote into ncr */
		(void) sooptcopyout(sopt, &ncr, sizeof(ncr));
	}
done:
	if (nx != NULL) {
		(void) nx_release_locked(nx);
		nx = NULL;
	}
	SK_UNLOCK();

	return err;
}
955
956 struct nxbind *
nxb_alloc(zalloc_flags_t how)957 nxb_alloc(zalloc_flags_t how)
958 {
959 struct nxbind *nxb = zalloc_flags(nxbind_zone, how | Z_ZERO);
960
961 if (nxb) {
962 SK_DF(SK_VERB_MEM, "nxb 0x%llx ALLOC", SK_KVA(nxb));
963 }
964 return nxb;
965 }
966
/*
 * Release an nxbind structure and the key buffer it owns (if any)
 * back to their respective allocators.
 */
void
nxb_free(struct nxbind *nxb)
{
	SK_DF(SK_VERB_MEM, "nxb 0x%llx key 0x%llx FREE", SK_KVA(nxb),
	    (nxb->nxb_key != NULL) ? SK_KVA(nxb->nxb_key) : 0);

	/* nxb_key/nxb_key_len form a __sized_by pair; clear both */
	if (nxb->nxb_key != NULL) {
		sk_free_data_sized_by(nxb->nxb_key, nxb->nxb_key_len);
		nxb->nxb_key = NULL;
		nxb->nxb_key_len = 0;
	}
	zfree(nxbind_zone, nxb);
}
980
981 /*
982 * nxb0 is assumed to possess the truth, compare nxb1 against it.
983 */
984 boolean_t
nxb_is_equal(struct nxbind * nxb0,struct nxbind * nxb1)985 nxb_is_equal(struct nxbind *nxb0, struct nxbind *nxb1)
986 {
987 ASSERT(nxb0 != NULL && nxb1 != NULL);
988 ASSERT(nxb0 != nxb1);
989
990 /* we always compare using uniqueid and not pid */
991 if ((nxb0->nxb_flags & NXBF_MATCH_UNIQUEID) &&
992 nxb1->nxb_uniqueid != nxb0->nxb_uniqueid) {
993 return FALSE;
994 }
995
996 if ((nxb0->nxb_flags & NXBF_MATCH_EXEC_UUID) &&
997 uuid_compare(nxb1->nxb_exec_uuid, nxb0->nxb_exec_uuid) != 0) {
998 return FALSE;
999 }
1000
1001 ASSERT(!(nxb0->nxb_flags & NXBF_MATCH_KEY) ||
1002 (nxb0->nxb_key_len != 0 && nxb0->nxb_key != NULL));
1003
1004 if ((nxb0->nxb_flags & NXBF_MATCH_KEY) &&
1005 (nxb0->nxb_key_len != nxb1->nxb_key_len ||
1006 nxb1->nxb_key == NULL || timingsafe_bcmp(nxb1->nxb_key, nxb0->nxb_key,
1007 nxb1->nxb_key_len) != 0)) {
1008 return FALSE;
1009 }
1010
1011 return TRUE;
1012 }
1013
/*
 * Transfer the entire contents of `snxb' (including ownership of its
 * key buffer) into `dnxb', leaving the source zeroed so a later
 * nxb_free() on it will not double-free the key.
 */
void
nxb_move(struct nxbind *snxb, struct nxbind *dnxb)
{
	/* a key-matching source must actually carry a key */
	ASSERT(!(snxb->nxb_flags & NXBF_MATCH_KEY) ||
	    (snxb->nxb_key_len != 0 && snxb->nxb_key != NULL));

	/* in case the destination has a key attached, free it first */
	if (dnxb->nxb_key != NULL) {
		sk_free_data_sized_by(dnxb->nxb_key, dnxb->nxb_key_len);
		dnxb->nxb_key = NULL;
		dnxb->nxb_key_len = 0;
	}

	/* move everything from src to dst, and then wipe out src */
	bcopy(snxb, dnxb, sizeof(*dnxb));
	bzero(snxb, sizeof(*snxb));
}
1031
/* Upper bound on # of cl_num_ch_uuids that we'd return to user space */
#define MAX_NUM_CH_UUIDS 4096

/* Hoisted out of line to reduce kernel stack footprint */
SK_NO_INLINE_ATTRIBUTE
static int
nxctl_get_channel_list(struct nxctl *nxctl, struct sockopt *sopt)
{
	user_addr_t tmp_ptr = USER_ADDR_NULL;
	uint32_t nuuids = 0, ncuuids = 0;
	uuid_t *puuid, *uuids = NULL;
	size_t uuids_sz;
	struct ch_list_req clr;
	struct kern_channel *ch = NULL;
	struct kern_nexus *nx = NULL;
	struct kern_nexus find;
	int err = 0, observeall;

	NXCTL_LOCK_ASSERT_HELD(nxctl);

	ASSERT(sopt->sopt_p != NULL);
	if (sopt->sopt_val == USER_ADDR_NULL) {
		return EINVAL;
	}

	err = sooptcopyin(sopt, &clr, sizeof(clr), sizeof(clr));
	if (err != 0) {
		return err;
	}

	/* a nexus UUID is mandatory; clamp the requested count */
	if (uuid_is_null(clr.cl_nx_uuid)) {
		return EINVAL;
	} else if ((size_t)clr.cl_num_ch_uuids > MAX_NUM_CH_UUIDS) {
		clr.cl_num_ch_uuids = MAX_NUM_CH_UUIDS;
	}

	/*
	 * If the caller specified a buffer, copy out the Channel UUIDs to
	 * caller gracefully. We only copy out the number of UUIDs which
	 * caller has asked for, but we always tell caller how big the
	 * buffer really needs to be.
	 */
	tmp_ptr = clr.cl_ch_uuids;
	if (tmp_ptr != USER_ADDR_NULL && clr.cl_num_ch_uuids > 0) {
		uuids_sz = (size_t)clr.cl_num_ch_uuids * sizeof(uuid_t);
		uuids = sk_alloc_data(uuids_sz, Z_WAITOK, skmem_tag_sysctl_buf);
		if (uuids == NULL) {
			return ENOBUFS;
		}
	}

	/* cache the observe-all privilege check before taking SK_LOCK */
	observeall = (skywalk_priv_check_cred(sopt->sopt_p, nxctl->nxctl_cred,
	    PRIV_SKYWALK_OBSERVE_ALL) == 0);

	SK_LOCK();
	/* look up the nexus by UUID in the global RB tree */
	uuid_copy(find.nx_uuid, clr.cl_nx_uuid);
	nx = RB_FIND(kern_nexus_tree, &nx_head, &find);
	if (nx != NULL && NX_PROV(nx)->nxprov_ctl != nxctl && !observeall) {
		/*
		 * Return only entries that are visible to the caller,
		 * unless it has PRIV_SKYWALK_OBSERVE_ALL.
		 */
		nx = NULL;
	}
	if (nx != NULL) {
		/*
		 * Count number of Channels. If buffer space exists
		 * and remains, copy out the Channel UUIDs.
		 */
		nuuids = clr.cl_num_ch_uuids;
		puuid = uuids;

		STAILQ_FOREACH(ch, &nx->nx_ch_head, ch_link) {
			++ncuuids;
			if (uuids != NULL && nuuids > 0) {
				uuid_copy(*puuid, ch->ch_info->cinfo_ch_id);
				--nuuids;
				++puuid;
			}
		}
	} else {
		err = ENOENT;
	}
	SK_UNLOCK();

	if (uuids != NULL) {
		if (err == 0 && nx != NULL && tmp_ptr != USER_ADDR_NULL) {
			uintptr_t cnt_uuid;

			/* Note: Pointer arithmetic */
			cnt_uuid = (uintptr_t)(puuid - uuids);
			/*
			 * NOTE(review): if the nexus' channel list can be
			 * empty, cnt_uuid is 0 here and this assertion
			 * fires on DEVELOPMENT/DEBUG kernels; the sibling
			 * nxctl_get_nexus_list() guards the equivalent
			 * case with `if (cnt_uuid > 0)' — confirm.
			 */
			ASSERT(cnt_uuid > 0);

			if (sopt->sopt_p != kernproc) {
				err = copyout(uuids, tmp_ptr,
				    cnt_uuid * sizeof(uuid_t));
			} else {
				/* kernel caller: direct copy */
				caddr_t tmp;
				tmp = __unsafe_forge_bidi_indexable(caddr_t,
				    CAST_DOWN(caddr_t, tmp_ptr),
				    cnt_uuid * sizeof(uuid_t));
				bcopy(uuids, tmp, cnt_uuid * sizeof(uuid_t));
			}
		}
		sk_free_data(uuids, uuids_sz);
		uuids = NULL;
	}

	if (err == 0) {
		/* report the true total so callers can size a retry */
		clr.cl_num_ch_uuids = ncuuids;
		err = sooptcopyout(sopt, &clr, sizeof(clr));
	}

	return err;
}
1147
/*
 * Common initializer for an nxctl instance, used both for the static
 * kernel instances (_kernnxctl/_usernxctl, where p == kernproc) and
 * for zone-allocated instances created on behalf of a process.
 * Records the owning process identity, takes a credential reference,
 * and sets the flags that control teardown behavior.
 */
static void
nxctl_init(struct nxctl *nxctl, struct proc *p, struct fileproc *fp)
{
	uuid_t p_uuid;

	bzero(nxctl, sizeof(*nxctl));

	proc_getexecutableuuid(p, p_uuid, sizeof(p_uuid));

	lck_mtx_init(&nxctl->nxctl_lock, &nexus_lock_group, &nexus_lock_attr);
	uuid_copy(nxctl->nxctl_proc_uuid, p_uuid);
	nxctl->nxctl_proc_uniqueid = proc_uniqueid(p);
	nxctl->nxctl_cred = kauth_cred_proc_ref(p);
	nxctl->nxctl_fp = fp;
	if (nxctl == &_kernnxctl) {
		ASSERT(p == kernproc);
		nxctl->nxctl_flags |= NEXUSCTLF_KERNEL;
	}
	if (nxctl == &_usernxctl) {
		ASSERT(p == kernproc);
		/*
		 * NOTE(review): the credential reference taken above is
		 * discarded here by overwriting the pointer — confirm the
		 * reference itself is released (or intentionally pinned)
		 * for the static _usernxctl instance.
		 */
		nxctl->nxctl_cred = NULL;
	}
	if (fp == NULL) {
		/* no file descriptor backs this nxctl */
		nxctl->nxctl_flags |= NEXUSCTLF_NOFDREF;
	}
}
1174
1175 static struct nxctl *
nxctl_alloc(struct proc * p,struct fileproc * fp,zalloc_flags_t how)1176 nxctl_alloc(struct proc *p, struct fileproc *fp, zalloc_flags_t how)
1177 {
1178 struct nxctl *nxctl = zalloc_flags(nxctl_zone, how);
1179
1180 if (nxctl != NULL) {
1181 nxctl_init(nxctl, p, fp);
1182 }
1183 return nxctl;
1184 }
1185
/*
 * Final teardown of an nxctl once its refcount has reached zero and it
 * has been detached from nxctl_head: drop the credential reference,
 * destroy the mutex, and return the memory to the zone — except for the
 * statically-allocated kernel instances (NEXUSCTLF_KERNEL), which are
 * never zfree'd.
 */
static void
nxctl_free(struct nxctl *nxctl)
{
	ASSERT(nxctl->nxctl_refcnt == 0);
	ASSERT(!(nxctl->nxctl_flags & NEXUSCTLF_ATTACHED));
	/*
	 * NOTE(review): nxctl_init() leaves nxctl_cred NULL for the static
	 * _usernxctl — confirm this path is never reached for it, or that
	 * kauth_cred_unref() tolerates a NULL credential.
	 */
	kauth_cred_unref(&nxctl->nxctl_cred);
	lck_mtx_destroy(&nxctl->nxctl_lock, &nexus_lock_group);
	SK_D("nxctl 0x%llx FREE", SK_KVA(nxctl));
	if (!(nxctl->nxctl_flags & NEXUSCTLF_KERNEL)) {
		zfree(nxctl_zone, nxctl);
	}
}
1198
/*
 * Take an additional reference on nxctl; caller must hold SK_LOCK.
 * The post-increment assert catches refcount wraparound.
 */
static void
nxctl_retain_locked(struct nxctl *nxctl)
{
	SK_LOCK_ASSERT_HELD();

	nxctl->nxctl_refcnt++;
	ASSERT(nxctl->nxctl_refcnt != 0);
}
1207
/*
 * Unlocked wrapper around nxctl_retain_locked(); acquires SK_LOCK
 * for the duration of the refcount bump.
 */
void
nxctl_retain(struct nxctl *nxctl)
{
	SK_LOCK();
	nxctl_retain_locked(nxctl);
	SK_UNLOCK();
}
1215
1216 static int
nxctl_release_locked(struct nxctl * nxctl)1217 nxctl_release_locked(struct nxctl *nxctl)
1218 {
1219 int oldref = nxctl->nxctl_refcnt;
1220
1221 SK_LOCK_ASSERT_HELD();
1222
1223 ASSERT(nxctl->nxctl_refcnt != 0);
1224 if (--nxctl->nxctl_refcnt == 0) {
1225 nxctl_free(nxctl);
1226 }
1227
1228 return oldref == 1;
1229 }
1230
/*
 * Unlocked wrapper around nxctl_release_locked(); acquires SK_LOCK
 * around the drop.  Returns nonzero iff this was the last reference.
 */
int
nxctl_release(struct nxctl *nxctl)
{
	int lastref;

	SK_LOCK();
	lastref = nxctl_release_locked(nxctl);
	SK_UNLOCK();

	return lastref;
}
1242
/* XXX
 * -fbounds-safety: Why is this taking a void *? All callers are passing nxctl.
 * How come there's no nxctl_ctor?
 */
/*
 * Destructor path for an nxctl: close it (tearing down what it owns)
 * and then drop the caller's reference under SK_LOCK.
 */
void
nxctl_dtor(struct nxctl *arg)
{
	struct nxctl *nxctl = arg;

	nxctl_close(nxctl);
	SK_LOCK();
	(void) nxctl_release_locked(nxctl);
	SK_UNLOCK();
}
1257
/*
 * Notify an external nexus provider that a channel is connecting.
 * Invokes the provider's pre-connect and connected callbacks (with the
 * ring/slot initialization in between), tracking progress via the
 * CHANF_EXT_PRECONNECT / CHANF_EXT_CONNECTED flags so that a failure at
 * any stage can be unwound by nxprov_advise_disconnect().
 *
 * Entered and exited with both SK_LOCK and the channel lock held; the
 * SK_LOCK is dropped (channel lock retained) across the external
 * callbacks, which is why the channel is retained for the duration.
 * Returns 0 on success or the callback/ring-init error.
 */
int
nxprov_advise_connect(struct kern_nexus *nx, struct kern_channel *ch,
    struct proc *p)
{
	struct kern_nexus_provider *nxprov = NX_PROV(nx);
	int err = 0;

	ASSERT(!(ch->ch_flags & (CHANF_EXT_PRECONNECT | CHANF_EXT_CONNECTED)));
	ASSERT(ch->ch_ctx == NULL);

	SK_LOCK_ASSERT_HELD();
	LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);

	/* monitor channels aren't externally visible/usable, so ignore */
	if ((ch->ch_info->cinfo_ch_mode & CHMODE_MONITOR) ||
	    (ch->ch_flags & CHANF_EXT_SKIP) ||
	    (nxprov->nxprov_ext.nxpi_pre_connect == NULL ||
	    nxprov->nxprov_ext.nxpi_connected == NULL)) {
		return 0;
	}

	/*
	 * Keep the channel alive and drop SK_LOCK (re-taking only the
	 * channel lock) before calling out to the external provider.
	 */
	ch_retain_locked(ch);
	lck_mtx_unlock(&ch->ch_lock);
	SK_UNLOCK();
	lck_mtx_lock(&ch->ch_lock);

	err = nxprov->nxprov_ext.nxpi_pre_connect(nxprov, p, nx,
	    ch->ch_info->cinfo_nx_port, ch, &ch->ch_ctx);
	if (err != 0) {
		SK_D("ch 0x%llx flags %b nx 0x%llx pre_connect "
		    "error %d", SK_KVA(ch), ch->ch_flags,
		    CHANF_BITS, SK_KVA(nx), err);
		/* don't keep a context from a failed pre-connect */
		ch->ch_ctx = NULL;
		goto done;
	}
	/*
	 * Upon ring/slot init failure, this is cleared
	 * by nxprov_advise_disconnect() below.
	 */
	os_atomic_or(&ch->ch_flags, CHANF_EXT_PRECONNECT, relaxed);
	if (NXPROV_LLINK(nxprov)) {
		err = nx_netif_llink_ext_init_default_queues(nx);
	} else {
		err = nx_init_rings(nx, ch);
	}
	if (err != 0) {
		goto done;
	}
	ASSERT(err == 0);
	ASSERT((ch->ch_flags & (CHANF_EXT_PRECONNECT |
	    CHANF_EXT_CONNECTED)) == CHANF_EXT_PRECONNECT);

	err = nxprov->nxprov_ext.nxpi_connected(nxprov, nx, ch);
	if (err != 0) {
		SK_D("ch 0x%llx flags %b nx 0x%llx connected error %d",
		    SK_KVA(ch), ch->ch_flags, CHANF_BITS, SK_KVA(nx), err);
		goto done;
	}
	os_atomic_or(&ch->ch_flags, CHANF_EXT_CONNECTED, relaxed);
	SK_D("ch 0x%llx flags %b nx 0x%llx connected",
	    SK_KVA(ch), ch->ch_flags, CHANF_BITS, SK_KVA(nx));


done:
	/* re-acquire in SK_LOCK-before-channel-lock order */
	lck_mtx_unlock(&ch->ch_lock);
	SK_LOCK();
	lck_mtx_lock(&ch->ch_lock);
	if ((err != 0) &&
	    (ch->ch_flags & (CHANF_EXT_CONNECTED | CHANF_EXT_PRECONNECT))) {
		/* unwind whatever stage we reached before the failure */
		nxprov_advise_disconnect(nx, ch);
	}
	/* caller is expected to hold one, in addition to ourselves */
	VERIFY(ch->ch_refcnt >= 2);
	ch_release_locked(ch);

	return err;
}
1335
/*
 * Notify an external nexus provider that a channel is disconnecting.
 * Mirror of nxprov_advise_connect(): invokes pre-disconnect (only if
 * fully connected), tears down rings/queues, then invokes disconnected
 * (if pre-connect had succeeded), clearing the CHANF_EXT_* flags as
 * each stage is undone.  Safe to call from the connect error path.
 *
 * Entered and exited with SK_LOCK and the channel lock held; SK_LOCK
 * is dropped across the external callbacks, with the channel retained.
 */
void
nxprov_advise_disconnect(struct kern_nexus *nx, struct kern_channel *ch)
{
	struct kern_nexus_provider *nxprov = NX_PROV(nx);

	SK_LOCK_ASSERT_HELD();
	LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);

	/* check as we might be called in the error handling path */
	if (ch->ch_flags & (CHANF_EXT_CONNECTED | CHANF_EXT_PRECONNECT)) {
		/* keep ch alive while SK_LOCK is dropped for the callbacks */
		ch_retain_locked(ch);
		lck_mtx_unlock(&ch->ch_lock);
		SK_UNLOCK();
		lck_mtx_lock(&ch->ch_lock);

		ASSERT(!(ch->ch_flags & CHANF_EXT_SKIP));
		if (ch->ch_flags & CHANF_EXT_CONNECTED) {
			nxprov->nxprov_ext.nxpi_pre_disconnect(nxprov, nx, ch);
			os_atomic_andnot(&ch->ch_flags, CHANF_EXT_CONNECTED, relaxed);
		}

		/*
		 * Inform the external domain provider that the rings
		 * and slots for this channel are no longer valid.
		 */
		if (NXPROV_LLINK(nxprov)) {
			nx_netif_llink_ext_fini_default_queues(nx);
		} else {
			nx_fini_rings(nx, ch);
		}

		ASSERT(ch->ch_flags & CHANF_EXT_PRECONNECT);
		nxprov->nxprov_ext.nxpi_disconnected(nxprov, nx, ch);
		os_atomic_andnot(&ch->ch_flags, CHANF_EXT_PRECONNECT, relaxed);

		SK_D("ch 0x%llx flags %b nx 0x%llx disconnected",
		    SK_KVA(ch), ch->ch_flags, CHANF_BITS, SK_KVA(nx));

		/* We're done with this channel */
		ch->ch_ctx = NULL;

		/* re-acquire in SK_LOCK-before-channel-lock order */
		lck_mtx_unlock(&ch->ch_lock);
		SK_LOCK();
		lck_mtx_lock(&ch->ch_lock);
		/* caller is expected to hold one, in addition to ourselves */
		VERIFY(ch->ch_refcnt >= 2);
		ch_release_locked(ch);
	}
	ASSERT(!(ch->ch_flags & (CHANF_EXT_CONNECTED | CHANF_EXT_PRECONNECT)));
	ASSERT(ch->ch_ctx == NULL);
}
1387
/*
 * Shared back end for nxprov_create() and nxprov_create_kern():
 * validates the registration parameters against the domain provider,
 * allocates a kern_nexus_provider, links it into nxprov_head, and
 * records the (optional) external callback vector.  Called with
 * SK_LOCK held.  On success returns the provider with two references
 * (one for the list, one for the caller); on failure returns NULL
 * with *err set by nxdom_prov_validate_params().
 */
static struct kern_nexus_provider *
nxprov_create_common(struct nxctl *nxctl,
    struct kern_nexus_domain_provider *nxdom_prov, struct nxprov_reg *reg,
    const struct kern_nexus_provider_init *init, int *err)
{
	struct skmem_region_params srp[SKMEM_REGIONS];
	struct kern_nexus_provider *nxprov = NULL;
	struct nxprov_params nxp;
	uint32_t override = 0;
	uint32_t pp_region_config_flags;
	int i;

	/* nxprov_ext must be able to hold either init-vector layout */
	_CASSERT(sizeof(*init) == sizeof(nxprov->nxprov_ext));
	_CASSERT(sizeof(*init) >=
	    sizeof(struct kern_nexus_netif_provider_init));

	SK_LOCK_ASSERT_HELD();
	ASSERT(nxctl != NULL && reg != NULL && nxdom_prov != NULL);

	pp_region_config_flags = PP_REGION_CONFIG_MD_MAGAZINE_ENABLE |
	    PP_REGION_CONFIG_BUF_IODIR_BIDIR;
	/*
	 * Special handling for external nexus providers; similar
	 * logic to what's done in kern_pbufpool_create().
	 */
	if (init != NULL) {
		if (init->nxpi_flags & NXPIF_MONOLITHIC) {
			pp_region_config_flags |=
			    PP_REGION_CONFIG_BUF_MONOLITHIC;
		}

		if (init->nxpi_flags & NXPIF_INHIBIT_CACHE) {
			pp_region_config_flags |=
			    PP_REGION_CONFIG_BUF_NOCACHE;
		}
	}

	/*
	 * For network devices, set the packet metadata memory as persistent
	 * so that it is wired at segment creation. This allows us to access
	 * it with preemption disabled, as well as for rdar://problem/46511741.
	 */
	if (nxdom_prov->nxdom_prov_dom->nxdom_type == NEXUS_TYPE_NET_IF) {
		pp_region_config_flags |= PP_REGION_CONFIG_MD_PERSISTENT;
	}

	/* process and validate provider parameters */
	if ((*err = nxdom_prov_validate_params(nxdom_prov, reg,
	    &nxp, srp, override, pp_region_config_flags)) != 0) {
		goto done;
	}

	nxprov = nxprov_alloc(nxdom_prov, Z_WAITOK);
	ASSERT(nxprov->nxprov_dom_prov == nxdom_prov);

	STAILQ_INIT(&nxprov->nxprov_nx_head);
	STAILQ_INSERT_TAIL(&nxprov_head, nxprov, nxprov_link);
	nxprov->nxprov_flags |= NXPROVF_ATTACHED;
	nxprov->nxprov_ctl = nxctl;
	uuid_generate_random(nxprov->nxprov_uuid);
	bcopy(&nxp, nxprov->nxprov_params, sizeof(struct nxprov_params));

	if (init != NULL) {
		/* select the callback-vector layout by init version */
		if (init->nxpi_version == KERN_NEXUS_PROVIDER_VERSION_NETIF) {
			ASSERT(NXPROV_LLINK(nxprov));
			bcopy(init, &nxprov->nxprov_netif_ext,
			    sizeof(nxprov->nxprov_netif_ext));
		} else {
			ASSERT(!NXPROV_LLINK(nxprov));
			ASSERT(init->nxpi_version ==
			    KERN_NEXUS_PROVIDER_CURRENT_VERSION);
			bcopy(init, &nxprov->nxprov_ext, sizeof(*init));
		}
		nxprov->nxprov_flags |= NXPROVF_EXTERNAL;
	}

	/* store validated region parameters to the provider */
	for (i = 0; i < SKMEM_REGIONS; i++) {
		nxprov->nxprov_region_params[i] = srp[i];
	}

	if (nxprov->nxprov_flags & NXPROVF_EXTERNAL) {
		uint32_t nxpi_flags = nxprov->nxprov_ext.nxpi_flags;

		if (nxpi_flags & NXPIF_VIRTUAL_DEVICE) {
			nxprov->nxprov_flags |= NXPROVF_VIRTUAL_DEVICE;
		}
	} else if (nxdom_prov->nxdom_prov_dom->nxdom_type !=
	    NEXUS_TYPE_NET_IF) {
		/*
		 * Treat non-netif built-in nexus providers as those
		 * meant for inter-process communications, i.e. there
		 * is no actual networking hardware involved.
		 */
		nxprov->nxprov_flags |= NXPROVF_VIRTUAL_DEVICE;
	}

	nxprov_retain_locked(nxprov);   /* one for being in the list */
	nxprov_retain_locked(nxprov);   /* one for the caller */

#if SK_LOG
	uuid_string_t uuidstr;
	SK_D("nxprov 0x%llx UUID %s", SK_KVA(nxprov),
	    sk_uuid_unparse(nxprov->nxprov_uuid, uuidstr));
#endif /* SK_LOG */

done:
	return nxprov;
}
1497
1498 struct kern_nexus_provider *
nxprov_create(struct proc * p,struct nxctl * nxctl,struct nxprov_reg * reg,int * err)1499 nxprov_create(struct proc *p, struct nxctl *nxctl, struct nxprov_reg *reg,
1500 int *err)
1501 {
1502 struct nxprov_params *nxp = ®->nxpreg_params;
1503 struct kern_nexus_domain_provider *nxdom_prov = NULL;
1504 struct kern_nexus_provider *nxprov = NULL;
1505
1506 NXCTL_LOCK_ASSERT_HELD(nxctl);
1507
1508 ASSERT(nxctl->nxctl_cred != proc_ucred_unsafe(kernproc));
1509 *err = 0;
1510
1511 switch (nxp->nxp_type) {
1512 case NEXUS_TYPE_USER_PIPE: /* only for userland */
1513 *err = skywalk_priv_check_cred(p, nxctl->nxctl_cred,
1514 PRIV_SKYWALK_REGISTER_USER_PIPE);
1515 break;
1516
1517 case NEXUS_TYPE_FLOW_SWITCH: /* allowed for userland */
1518 *err = skywalk_priv_check_cred(p, nxctl->nxctl_cred,
1519 PRIV_SKYWALK_REGISTER_FLOW_SWITCH);
1520 break;
1521
1522 case NEXUS_TYPE_NET_IF: /* allowed for userland */
1523 *err = skywalk_priv_check_cred(p, nxctl->nxctl_cred,
1524 PRIV_SKYWALK_REGISTER_NET_IF);
1525 break;
1526
1527 case NEXUS_TYPE_KERNEL_PIPE: /* only for kernel */
1528 case NEXUS_TYPE_MONITOR: /* invalid */
1529 default:
1530 *err = EINVAL;
1531 goto done;
1532 }
1533
1534 if (*err != 0) {
1535 goto done;
1536 }
1537
1538 ASSERT(nxp->nxp_type < NEXUS_TYPE_MAX);
1539 if ((nxdom_prov = nxdom_prov_default[nxp->nxp_type]) == NULL) {
1540 *err = ENXIO;
1541 goto done;
1542 }
1543
1544 #if CONFIG_NEXUS_NETIF
1545 /* make sure netif_compat is the default here */
1546 ASSERT(nxp->nxp_type != NEXUS_TYPE_NET_IF ||
1547 strbufcmp(nxdom_prov->nxdom_prov_name, sizeof(nxdom_prov->nxdom_prov_name),
1548 NEXUS_PROVIDER_NET_IF_COMPAT, sizeof(NEXUS_PROVIDER_NET_IF_COMPAT)) == 0);
1549 #endif /* CONFIG_NEXUS_NETIF */
1550
1551 SK_LOCK();
1552 /* callee holds a reference for our caller upon success */
1553 nxprov = nxprov_create_common(nxctl, nxdom_prov, reg, NULL, err);
1554 SK_UNLOCK();
1555 done:
1556 return nxprov;
1557 }
1558
1559 struct kern_nexus_provider *
nxprov_create_kern(struct nxctl * nxctl,struct kern_nexus_domain_provider * nxdom_prov,struct nxprov_reg * reg,const struct kern_nexus_provider_init * init,int * err)1560 nxprov_create_kern(struct nxctl *nxctl,
1561 struct kern_nexus_domain_provider *nxdom_prov, struct nxprov_reg *reg,
1562 const struct kern_nexus_provider_init *init, int *err)
1563 {
1564 struct nxprov_params *nxp = ®->nxpreg_params;
1565 struct kern_nexus_provider *nxprov = NULL;
1566
1567 NXCTL_LOCK_ASSERT_HELD(nxctl);
1568 SK_LOCK_ASSERT_HELD();
1569
1570 ASSERT(nxctl->nxctl_cred == proc_ucred_unsafe(kernproc));
1571 ASSERT(nxp->nxp_type == nxdom_prov->nxdom_prov_dom->nxdom_type);
1572 ASSERT(init == NULL ||
1573 init->nxpi_version == KERN_NEXUS_PROVIDER_CURRENT_VERSION ||
1574 init->nxpi_version == KERN_NEXUS_PROVIDER_VERSION_NETIF);
1575
1576 *err = 0;
1577
1578 switch (nxp->nxp_type) {
1579 case NEXUS_TYPE_NET_IF:
1580 break;
1581 case NEXUS_TYPE_KERNEL_PIPE:
1582 if (init == NULL) {
1583 *err = EINVAL;
1584 goto done;
1585 }
1586 break;
1587 case NEXUS_TYPE_FLOW_SWITCH:
1588 if (init != NULL) {
1589 *err = EINVAL;
1590 goto done;
1591 }
1592 break;
1593
1594 case NEXUS_TYPE_USER_PIPE: /* only for userland */
1595 case NEXUS_TYPE_MONITOR: /* invalid */
1596 default:
1597 *err = EINVAL;
1598 goto done;
1599 }
1600
1601 /* callee holds a reference for our caller upon success */
1602 nxprov = nxprov_create_common(nxctl, nxdom_prov, reg, init, err);
1603
1604 done:
1605 return nxprov;
1606 }
1607
1608 int
nxprov_destroy(struct nxctl * nxctl,const uuid_t nxprov_uuid)1609 nxprov_destroy(struct nxctl *nxctl, const uuid_t nxprov_uuid)
1610 {
1611 struct kern_nexus_provider *nxprov = NULL;
1612 int err = 0;
1613
1614 NXCTL_LOCK_ASSERT_HELD(nxctl);
1615
1616 SK_LOCK();
1617
1618 STAILQ_FOREACH(nxprov, &nxprov_head, nxprov_link) {
1619 if (nxctl == nxprov->nxprov_ctl &&
1620 uuid_compare(nxprov_uuid, nxprov->nxprov_uuid) == 0) {
1621 nxprov_retain_locked(nxprov);
1622 break;
1623 }
1624 }
1625
1626 if (nxprov == NULL) {
1627 err = ENOENT;
1628 } else {
1629 err = nxprov_close(nxprov, TRUE);
1630 }
1631
1632 if (nxprov != NULL) {
1633 (void) nxprov_release_locked(nxprov);
1634 }
1635
1636 SK_UNLOCK();
1637
1638 return err;
1639 }
1640
/*
 * Close a nexus provider: close every nexus created on it, then either
 * detach it immediately (if no nexus remains) or mark it NXPROVF_CLOSED
 * so the last nexus teardown detaches it (see nx_detach()).  Returns
 * EALREADY if the provider was already closed.  "locked" indicates
 * whether the caller already holds SK_LOCK.
 */
int
nxprov_close(struct kern_nexus_provider *nxprov, boolean_t locked)
{
	int err = 0;

	if (!locked) {
		SK_LOCK();
	}

	SK_LOCK_ASSERT_HELD();

#if SK_LOG
	uuid_string_t uuidstr;
	SK_D("nxprov 0x%llx UUID %s flags 0x%b", SK_KVA(nxprov),
	    sk_uuid_unparse(nxprov->nxprov_uuid, uuidstr),
	    nxprov->nxprov_flags, NXPROVF_BITS);
#endif /* SK_LOG */

	if (nxprov->nxprov_flags & NXPROVF_CLOSED) {
		err = EALREADY;
	} else {
		struct kern_nexus *nx, *tnx;

		/* no new nexus may be created against this provider */
		nxprov->nxprov_ctl = NULL;

		/* SAFE variant: nx_close() may unlink nx from this list */
		STAILQ_FOREACH_SAFE(nx, &nxprov->nxprov_nx_head,
		    nx_prov_link, tnx) {
			nx_retain_locked(nx);
			(void) nx_close(nx, TRUE);
			(void) nx_release_locked(nx);
		}

		if (STAILQ_EMPTY(&nxprov->nxprov_nx_head)) {
			/* no nexus created on this, so detach now */
			nxprov_detach(nxprov, TRUE);
		} else {
			/* detach when last nexus is destroyed */
			ASSERT(nxprov->nxprov_refcnt > 1);
			nxprov->nxprov_flags |= NXPROVF_CLOSED;
		}
	}

	if (!locked) {
		SK_UNLOCK();
	}

	return err;
}
1689
/*
 * Remove a provider from the global nxprov_head list and drop the
 * list's reference.  The caller must hold an extra reference so the
 * provider survives the release here.  "locked" indicates whether the
 * caller already holds SK_LOCK.
 */
static void
nxprov_detach(struct kern_nexus_provider *nxprov, boolean_t locked)
{
	if (!locked) {
		SK_LOCK();
	}

	SK_LOCK_ASSERT_HELD();

#if SK_LOG
	uuid_string_t uuidstr;
	SK_D("nxprov 0x%llx UUID %s flags 0x%b", SK_KVA(nxprov),
	    sk_uuid_unparse(nxprov->nxprov_uuid, uuidstr),
	    nxprov->nxprov_flags, NXPROVF_BITS);
#endif /* SK_LOG */

	ASSERT(nxprov->nxprov_flags & NXPROVF_ATTACHED);
	STAILQ_REMOVE(&nxprov_head, nxprov, kern_nexus_provider, nxprov_link);
	nxprov->nxprov_flags &= ~NXPROVF_ATTACHED;

	/* caller must hold an extra ref */
	ASSERT(nxprov->nxprov_refcnt > 1);
	(void) nxprov_release_locked(nxprov);

	if (!locked) {
		SK_UNLOCK();
	}
}
1718
1719 static struct kern_nexus_provider *
nxprov_alloc(struct kern_nexus_domain_provider * nxdom_prov,zalloc_flags_t how)1720 nxprov_alloc(struct kern_nexus_domain_provider *nxdom_prov, zalloc_flags_t how)
1721 {
1722 struct kern_nexus_provider *nxprov;
1723 struct nxprov_params *nxp;
1724
1725 ASSERT(nxdom_prov != NULL);
1726
1727 nxp = nxprov_params_alloc(how);
1728 if (nxp == NULL) {
1729 SK_ERR("Failed to allocate nxprov_params");
1730 return NULL;
1731 }
1732
1733 nxprov = zalloc_flags(nxprov_zone, how | Z_ZERO);
1734 if (nxprov == NULL) {
1735 SK_ERR("Failed to allocate nxprov");
1736 nxprov_params_free(nxp);
1737 return NULL;
1738 }
1739
1740 nxprov->nxprov_dom_prov = nxdom_prov;
1741 nxprov->nxprov_params = nxp;
1742 /* hold a reference for nxprov */
1743 nxdom_prov_retain_locked(nxdom_prov);
1744
1745 return nxprov;
1746 }
1747
/*
 * Final teardown of a provider once its refcount hits zero: drop the
 * domain-provider reference taken in nxprov_alloc(), free the params,
 * and return the structure to its zone.  Must be detached already.
 */
static void
nxprov_free(struct kern_nexus_provider *nxprov)
{
	struct kern_nexus_domain_provider *nxdom_prov =
	    nxprov->nxprov_dom_prov;

	SK_LOCK_ASSERT_HELD();

	ASSERT(nxdom_prov != NULL);
	(void) nxdom_prov_release_locked(nxdom_prov);
	nxprov->nxprov_dom_prov = NULL;
	ASSERT(nxprov->nxprov_params != NULL);
	nxprov_params_free(nxprov->nxprov_params);
	nxprov->nxprov_params = NULL;
	ASSERT(!(nxprov->nxprov_flags & NXPROVF_ATTACHED));
	SK_DF(SK_VERB_MEM, "nxprov 0x%llx FREE", SK_KVA(nxprov));
	zfree(nxprov_zone, nxprov);
}
1766
/*
 * Take an additional reference on a provider; caller must hold SK_LOCK.
 * The post-increment assert catches refcount wraparound.
 */
static void
nxprov_retain_locked(struct kern_nexus_provider *nxprov)
{
	SK_LOCK_ASSERT_HELD();

	nxprov->nxprov_refcnt++;
	ASSERT(nxprov->nxprov_refcnt != 0);
}
1775
/*
 * Unlocked wrapper around nxprov_retain_locked(); acquires SK_LOCK
 * for the duration of the refcount bump.
 */
void
nxprov_retain(struct kern_nexus_provider *nxprov)
{
	SK_LOCK();
	nxprov_retain_locked(nxprov);
	SK_UNLOCK();
}
1783
1784 static int
nxprov_release_locked(struct kern_nexus_provider * nxprov)1785 nxprov_release_locked(struct kern_nexus_provider *nxprov)
1786 {
1787 int oldref = nxprov->nxprov_refcnt;
1788
1789 SK_LOCK_ASSERT_HELD();
1790
1791 ASSERT(nxprov->nxprov_refcnt != 0);
1792 if (--nxprov->nxprov_refcnt == 0) {
1793 nxprov_free(nxprov);
1794 }
1795
1796 return oldref == 1;
1797 }
1798
/*
 * Unlocked wrapper around nxprov_release_locked(); acquires SK_LOCK
 * around the drop.  Returns nonzero iff this was the last reference.
 */
int
nxprov_release(struct kern_nexus_provider *nxprov)
{
	int lastref;

	SK_LOCK();
	lastref = nxprov_release_locked(nxprov);
	SK_UNLOCK();

	return lastref;
}
1810
/*
 * Allocate a zeroed nxprov_params from its zone; may return NULL when
 * "how" permits failure.
 */
struct nxprov_params *
nxprov_params_alloc(zalloc_flags_t how)
{
	return zalloc_flags(nxprov_params_zone, how | Z_ZERO);
}
1816
/*
 * Return an nxprov_params to its zone.
 */
void
nxprov_params_free(struct nxprov_params *nxp)
{
	SK_DF(SK_VERB_MEM, "nxp 0x%llx FREE", SK_KVA(nxp));
	zfree(nxprov_params_zone, nxp);
}
1823
1824 static int
nx_check_pp(struct kern_nexus_provider * nxprov,struct kern_pbufpool * pp)1825 nx_check_pp(struct kern_nexus_provider *nxprov, struct kern_pbufpool *pp)
1826 {
1827 struct kern_nexus_domain_provider *nxdom_prov = nxprov->nxprov_dom_prov;
1828
1829 if ((pp->pp_flags & (PPF_EXTERNAL | PPF_CLOSED)) != PPF_EXTERNAL) {
1830 SK_ERR("Rejecting \"%s\" built-in pp", pp->pp_name);
1831 return ENOTSUP;
1832 }
1833
1834 /*
1835 * Require that the nexus domain metadata type and the
1836 * metadata type of the caller-provided pbufpool match.
1837 */
1838 if (nxdom_prov->nxdom_prov_dom->nxdom_md_type !=
1839 pp->pp_md_type ||
1840 nxdom_prov->nxdom_prov_dom->nxdom_md_subtype !=
1841 pp->pp_md_subtype) {
1842 SK_ERR("Mismatch in metadata type/subtype "
1843 "(%u/%u != %u/%u)", pp->pp_md_type,
1844 nxdom_prov->nxdom_prov_dom->nxdom_md_type,
1845 pp->pp_md_subtype,
1846 nxdom_prov->nxdom_prov_dom->nxdom_md_subtype);
1847 return EINVAL;
1848 }
1849
1850 /*
1851 * Require that the nexus provider memory configuration
1852 * has the same impedance as the caller-provided one.
1853 * Both need to be lacking or present; if one of them
1854 * is set and the other isn't, then we bail.
1855 */
1856 if (!!(PP_BUF_REGION_DEF(pp)->skr_mode & SKR_MODE_MONOLITHIC) ^
1857 !!(nxprov->nxprov_ext.nxpi_flags & NXPIF_MONOLITHIC)) {
1858 SK_ERR("Memory config mismatch: monolithic mode");
1859 return EINVAL;
1860 }
1861
1862 return 0;
1863 }
1864
1865 struct kern_nexus *
nx_create(struct nxctl * nxctl,const uuid_t nxprov_uuid,const nexus_type_t dom_type,const void * nx_ctx,nexus_ctx_release_fn_t nx_ctx_release,struct kern_pbufpool * tx_pp,struct kern_pbufpool * rx_pp,int * err)1866 nx_create(struct nxctl *nxctl, const uuid_t nxprov_uuid,
1867 const nexus_type_t dom_type, const void *nx_ctx,
1868 nexus_ctx_release_fn_t nx_ctx_release, struct kern_pbufpool *tx_pp,
1869 struct kern_pbufpool *rx_pp, int *err)
1870 {
1871 struct kern_nexus_domain_provider *nxdom_prov;
1872 struct kern_nexus_provider *nxprov = NULL;
1873 struct kern_nexus *nx = NULL;
1874 #if SK_LOG
1875 uuid_string_t uuidstr;
1876 #endif /* SK_LOG */
1877
1878 NXCTL_LOCK_ASSERT_HELD(nxctl);
1879
1880 ASSERT(dom_type < NEXUS_TYPE_MAX);
1881 ASSERT(!uuid_is_null(nxprov_uuid));
1882 *err = 0;
1883
1884 SK_LOCK();
1885
1886 STAILQ_FOREACH(nxprov, &nxprov_head, nxprov_link) {
1887 if (nxctl == nxprov->nxprov_ctl &&
1888 uuid_compare(nxprov_uuid, nxprov->nxprov_uuid) == 0) {
1889 break;
1890 }
1891 }
1892
1893 if (nxprov == NULL || (nxprov->nxprov_flags & NXPROVF_CLOSED)) {
1894 SK_ERR("Provider not found or has been closed");
1895 *err = ENOENT;
1896 goto done;
1897 }
1898
1899 nxdom_prov = nxprov->nxprov_dom_prov;
1900 if (dom_type != NEXUS_TYPE_UNDEFINED &&
1901 (nxdom_prov->nxdom_prov_dom->nxdom_type != dom_type)) {
1902 SK_ERR("Mismatch in domain type (0x%u != 0x%u)",
1903 dom_type, nxdom_prov->nxdom_prov_dom->nxdom_type);
1904 nxdom_prov = NULL;
1905 nxprov = NULL;
1906 *err = ENODEV;
1907 goto done;
1908 }
1909
1910 if ((dom_type == NEXUS_TYPE_NET_IF) && NXPROV_LLINK(nxprov) &&
1911 (!tx_pp || !rx_pp)) {
1912 #if SK_LOG
1913 SK_ERR("TX/RX packet pool is required for netif logical link "
1914 "nexus provider UUID: %s",
1915 sk_uuid_unparse(nxprov_uuid, uuidstr));
1916 #endif /* SK_LOG */
1917 nxdom_prov = NULL;
1918 nxprov = NULL;
1919 *err = EINVAL;
1920 goto done;
1921 }
1922
1923 if ((tx_pp != NULL && (*err = nx_check_pp(nxprov, tx_pp)) != 0) ||
1924 (rx_pp != NULL && (*err = nx_check_pp(nxprov, rx_pp)) != 0)) {
1925 goto done;
1926 }
1927
1928 nx = nx_alloc(Z_WAITOK);
1929
1930 STAILQ_INIT(&nx->nx_ch_head);
1931 STAILQ_INIT(&nx->nx_ch_nonxref_head);
1932 lck_rw_init(&nx->nx_ch_if_adv_lock, &nexus_lock_group,
1933 &nexus_lock_attr);
1934 STAILQ_INIT(&nx->nx_ch_if_adv_head);
1935 uuid_generate_random(nx->nx_uuid);
1936 nx->nx_prov = nxprov;
1937 nx->nx_ctx = __DECONST(void *, nx_ctx);
1938 nx->nx_ctx_release = nx_ctx_release;
1939 nx->nx_id = nxdom_prov->nxdom_prov_gencnt++;
1940
1941 if (tx_pp != NULL) {
1942 nx->nx_tx_pp = tx_pp;
1943 pp_retain(tx_pp); /* released by nx_free */
1944 }
1945
1946 if (rx_pp != NULL) {
1947 nx->nx_rx_pp = rx_pp;
1948 pp_retain(rx_pp); /* released by nx_free */
1949 }
1950
1951 /* this nexus is alive; tell the nexus constructor to set it up */
1952 if (nxprov->nxprov_dom_prov->nxdom_prov_nx_ctor != NULL) {
1953 *err = nxprov->nxprov_dom_prov->nxdom_prov_nx_ctor(nx);
1954 if (*err != 0) {
1955 nx->nx_prov = NULL;
1956 goto done;
1957 }
1958 }
1959
1960 nxprov_retain_locked(nxprov); /* hold a ref on the nexus reg */
1961
1962 STAILQ_INSERT_TAIL(&nxprov->nxprov_nx_head, nx, nx_prov_link);
1963 nxprov->nxprov_nx_count++;
1964 RB_INSERT(kern_nexus_tree, &nx_head, nx);
1965 os_atomic_or(&nx->nx_flags, NXF_ATTACHED, relaxed);
1966
1967 nx_retain_locked(nx); /* one for the provider list */
1968 nx_retain_locked(nx); /* one for the global list */
1969 nx_retain_locked(nx); /* one for the caller */
1970
1971 #if SK_LOG
1972 SK_D("nexus 0x%llx (%s:%s) UUID %s", SK_KVA(nx),
1973 nxdom_prov->nxdom_prov_dom->nxdom_name,
1974 nxdom_prov->nxdom_prov_name, sk_uuid_unparse(nx->nx_uuid, uuidstr));
1975 #endif /* SK_LOG */
1976 done:
1977 SK_UNLOCK();
1978
1979 if (*err != 0) {
1980 if (nx != NULL) {
1981 nx_free(nx);
1982 nx = NULL;
1983 }
1984 }
1985 return nx;
1986 }
1987
1988 int
nx_destroy(struct nxctl * nxctl,const uuid_t nx_uuid)1989 nx_destroy(struct nxctl *nxctl, const uuid_t nx_uuid)
1990 {
1991 struct kern_nexus *nx = NULL;
1992 struct kern_nexus find;
1993 int err = 0;
1994
1995 NXCTL_LOCK_ASSERT_HELD(nxctl);
1996
1997 SK_LOCK();
1998
1999 uuid_copy(find.nx_uuid, nx_uuid);
2000 nx = RB_FIND(kern_nexus_tree, &nx_head, &find);
2001 if (nx != NULL && nxctl != NX_PROV(nx)->nxprov_ctl) {
2002 nx = NULL;
2003 }
2004
2005 if (nx != NULL) {
2006 nx_retain_locked(nx);
2007 }
2008
2009 if (nx == NULL) {
2010 err = ENOENT;
2011 } else {
2012 err = nx_close(nx, TRUE);
2013 (void) nx_release_locked(nx);
2014 }
2015
2016 SK_UNLOCK();
2017
2018 return err;
2019 }
2020
/*
 * RB-tree comparator for kern_nexus nodes: total order by nexus UUID
 * as defined by uuid_compare().
 */
static inline int
nx_cmp(const struct kern_nexus *a, const struct kern_nexus *b)
{
	return uuid_compare(a->nx_uuid, b->nx_uuid);
}
2026
2027 struct kern_nexus *
nx_find(const uuid_t nx_uuid,boolean_t locked)2028 nx_find(const uuid_t nx_uuid, boolean_t locked)
2029 {
2030 struct kern_nexus *nx = NULL;
2031 struct kern_nexus find;
2032
2033 if (!locked) {
2034 SK_LOCK();
2035 }
2036
2037 SK_LOCK_ASSERT_HELD();
2038
2039 uuid_copy(find.nx_uuid, nx_uuid);
2040 nx = RB_FIND(kern_nexus_tree, &nx_head, &find);
2041 if (nx != NULL && (nx->nx_flags & NXF_CLOSED)) {
2042 nx = NULL;
2043 }
2044
2045 /* return reference to caller */
2046 if (nx != NULL) {
2047 nx_retain_locked(nx);
2048 }
2049
2050 if (!locked) {
2051 SK_UNLOCK();
2052 }
2053
2054 return nx;
2055 }
2056
/*
 * Close a nexus: detach it immediately if no regular channels remain
 * open to it, otherwise mark it NXF_CLOSED so the final channel close
 * triggers the detach.  Returns EALREADY if already closed.  "locked"
 * indicates whether the caller already holds SK_LOCK.
 */
int
nx_close(struct kern_nexus *nx, boolean_t locked)
{
	int err = 0;

	if (!locked) {
		SK_LOCK();
	}

	SK_LOCK_ASSERT_HELD();


	if (nx->nx_flags & NXF_CLOSED) {
		err = EALREADY;
	} else {
#if SK_LOG
		uuid_string_t uuidstr;
		SK_D("nexus 0x%llx (%s:%s) UUID %s flags 0x%b", SK_KVA(nx),
		    NX_DOM(nx)->nxdom_name, NX_DOM_PROV(nx)->nxdom_prov_name,
		    sk_uuid_unparse(nx->nx_uuid, uuidstr), nx->nx_flags,
		    NXF_BITS);
#endif /* SK_LOG */

		if (STAILQ_EMPTY(&nx->nx_ch_head)) {
			/* no regular channels open to it, so detach now */
			nx_detach(nx);
		} else {
			/* detach when the last channel closes */
			ASSERT(nx->nx_refcnt > 3);
			os_atomic_or(&nx->nx_flags, NXF_CLOSED, relaxed);
		}
	}

	if (!locked) {
		SK_UNLOCK();
	}

	return err;
}
2096
/*
 * Ask the nexus's domain provider to quiesce the nexus, if it
 * implements a stop callback.  Caller must hold SK_LOCK.
 */
void
nx_stop(struct kern_nexus *nx)
{
	struct kern_nexus_provider *nxprov = nx->nx_prov;

	SK_LOCK_ASSERT_HELD();

	/* send a stop message */
	if (nxprov->nxprov_dom_prov->nxdom_prov_nx_stop != NULL) {
		nxprov->nxprov_dom_prov->nxdom_prov_nx_stop(nx);
	}
}
2109
/*
 * Final unlinking of a nexus: run the domain provider's destructor,
 * remove the nexus from its provider's list and from the global tree,
 * release its context, and drop the two list references.  If this was
 * the last nexus on a provider already marked NXPROVF_CLOSED, finish
 * the provider detach that nxprov_close() postponed.  Caller must hold
 * SK_LOCK and extra references on the nexus.
 */
void
nx_detach(struct kern_nexus *nx)
{
	struct kern_nexus_provider *nxprov = nx->nx_prov;

	SK_LOCK_ASSERT_HELD();

#if SK_LOG
	uuid_string_t uuidstr;
	SK_D("nexus 0x%llx UUID %s flags 0x%b", SK_KVA(nx),
	    sk_uuid_unparse(nx->nx_uuid, uuidstr), nx->nx_flags, NXF_BITS);
#endif /* SK_LOG */

	/* Caller must hold extra refs, on top of the two in reg/global lists */
	ASSERT(nx->nx_refcnt >= 3);
	ASSERT(nx->nx_flags & NXF_ATTACHED);

	/* this nexus is done; let the nexus destructor do final cleanups */
	if (nxprov->nxprov_dom_prov->nxdom_prov_nx_dtor != NULL) {
		nxprov->nxprov_dom_prov->nxdom_prov_nx_dtor(nx);
	}

	ASSERT(STAILQ_EMPTY(&nx->nx_ch_head));
	ASSERT(STAILQ_EMPTY(&nx->nx_ch_nonxref_head));

	STAILQ_REMOVE(&nxprov->nxprov_nx_head, nx, kern_nexus, nx_prov_link);
	nxprov->nxprov_nx_count--;
	RB_REMOVE(kern_nexus_tree, &nx_head, nx);
	os_atomic_andnot(&nx->nx_flags, NXF_ATTACHED, relaxed);
	nx->nx_prov = NULL;
	if (nx->nx_ctx_release != NULL) {
		nx->nx_ctx_release(nx->nx_ctx);
	}
	nx->nx_ctx = NULL;

	(void) nx_release_locked(nx);   /* one for the reg list */
	(void) nx_release_locked(nx);   /* one for the global list */

	/*
	 * If this was the last nexus and the provider has been closed,
	 * detach the provider and finish up the postponed job.
	 */
	if (STAILQ_EMPTY(&nxprov->nxprov_nx_head) &&
	    (nxprov->nxprov_flags & NXPROVF_CLOSED)) {
		nxprov_detach(nxprov, TRUE);
	}
	(void) nxprov_release_locked(nxprov);
}
2158
2159 int
nx_advisory_alloc(struct kern_nexus * nx,const char * name,struct skmem_region_params * srp_nexusadv,nexus_advisory_type_t type)2160 nx_advisory_alloc(struct kern_nexus *nx, const char *name,
2161 struct skmem_region_params *srp_nexusadv, nexus_advisory_type_t type)
2162 {
2163 struct __kern_nexus_adv_metadata *adv_md;
2164 uint32_t msize = 0;
2165 /* -fbounds-safety: why do we need maddr? */
2166 void *__sized_by(msize) maddr = NULL;
2167
2168 _CASSERT(sizeof(struct __kern_nexus_adv_metadata) == sizeof(uint64_t));
2169 _CASSERT((sizeof(struct sk_nexusadv) +
2170 sizeof(struct __kern_nexus_adv_metadata)) <= NX_NEXUSADV_MAX_SZ);
2171 _CASSERT((sizeof(struct netif_nexus_advisory) +
2172 sizeof(struct __kern_nexus_adv_metadata)) <= NX_NEXUSADV_MAX_SZ);
2173 ASSERT(nx->nx_adv.nxv_reg == NULL);
2174 ASSERT(nx->nx_adv.nxv_adv == NULL);
2175 ASSERT(type == NEXUS_ADVISORY_TYPE_FLOWSWITCH ||
2176 type == NEXUS_ADVISORY_TYPE_NETIF);
2177
2178 if ((nx->nx_adv.nxv_reg = skmem_region_create(name, srp_nexusadv,
2179 NULL, NULL, NULL)) == NULL) {
2180 return ENOMEM;
2181 }
2182
2183 nx->nx_adv.nxv_adv = skmem_region_alloc(nx->nx_adv.nxv_reg, &maddr,
2184 NULL, NULL, (SKMEM_NOSLEEP | SKMEM_PANIC),
2185 nx->nx_adv.nxv_reg->skr_c_obj_size, &msize);
2186 nx->nx_adv.nxv_adv_size = nx->nx_adv.nxv_reg->skr_c_obj_size;
2187 adv_md = nx->nx_adv.nxv_adv;
2188 adv_md->knam_version = NX_ADVISORY_MD_CURRENT_VERSION;
2189 adv_md->knam_type = type;
2190 adv_md->__reserved = 0;
2191 nx->nx_adv.nxv_adv_type = type;
2192 nx->nx_adv.flowswitch_nxv_adv = (void *)(adv_md + 1);
2193 if (type == NEXUS_ADVISORY_TYPE_FLOWSWITCH) {
2194 nx->nx_adv.flowswitch_nxv_adv->nxadv_ver =
2195 NX_FLOWSWITCH_ADVISORY_CURRENT_VERSION;
2196 } else {
2197 nx->nx_adv.netif_nxv_adv->nna_version =
2198 NX_NETIF_ADVISORY_CURRENT_VERSION;
2199 }
2200 return 0;
2201 }
2202
/*
 * Release the nexus advisory object and its backing region, undoing
 * nx_advisory_alloc().  Safe to call when no advisory was allocated.
 */
void
nx_advisory_free(struct kern_nexus *nx)
{
	/* nothing to tear down if no advisory region was ever created */
	if (nx->nx_adv.nxv_reg != NULL) {
		ASSERT(nx->nx_adv.nxv_adv != NULL);
		/* return the advisory object to its region first ... */
		skmem_region_free(nx->nx_adv.nxv_reg,
		    nx->nx_adv.nxv_adv, NULL);
		nx->nx_adv.nxv_adv = NULL;
		nx->nx_adv.nxv_adv_size = 0;
		nx->nx_adv.nxv_adv_type = NEXUS_ADVISORY_TYPE_INVALID;
		nx->nx_adv.flowswitch_nxv_adv = NULL;
		/* ... then drop the region itself */
		skmem_region_release(nx->nx_adv.nxv_reg);
		nx->nx_adv.nxv_reg = NULL;
	}

	/* advisory state must be fully quiesced on every path */
	ASSERT(nx->nx_adv.nxv_reg == NULL);
	ASSERT(nx->nx_adv.nxv_adv == NULL);
	ASSERT(nx->nx_adv.nxv_adv_type == NEXUS_ADVISORY_TYPE_INVALID);
	ASSERT(nx->nx_adv.flowswitch_nxv_adv == NULL);
}
2223
2224 static struct kern_nexus *
nx_alloc(zalloc_flags_t how)2225 nx_alloc(zalloc_flags_t how)
2226 {
2227 SK_LOCK_ASSERT_HELD();
2228
2229 return zalloc_flags(nx_zone, how | Z_ZERO);
2230 }
2231
/*
 * Final destructor for a fully-detached nexus (no provider, no
 * channels): releases all nexus ports, drops the TX/RX packet-pool
 * references, destroys the interface-advisory lock, and returns the
 * memory to nx_zone.
 */
static void
nx_free(struct kern_nexus *nx)
{
	/* caller guarantees the nexus is detached and channel-free */
	ASSERT(!(nx->nx_flags & NXF_ATTACHED) && nx->nx_prov == NULL);
	ASSERT(STAILQ_EMPTY(&nx->nx_ch_head));
	ASSERT(STAILQ_EMPTY(&nx->nx_ch_nonxref_head));

	nx_port_free_all(nx);

	if (nx->nx_tx_pp != NULL) {
		pp_release(nx->nx_tx_pp);
		nx->nx_tx_pp = NULL;
	}
	if (nx->nx_rx_pp != NULL) {
		pp_release(nx->nx_rx_pp);
		nx->nx_rx_pp = NULL;
	}

	ASSERT(STAILQ_EMPTY(&nx->nx_ch_if_adv_head));
	lck_rw_destroy(&nx->nx_ch_if_adv_lock, &nexus_lock_group);

	SK_DF(SK_VERB_MEM, "nexus 0x%llx FREE", SK_KVA(nx));
	zfree(nx_zone, nx);
}
2256
2257 void
nx_retain_locked(struct kern_nexus * nx)2258 nx_retain_locked(struct kern_nexus *nx)
2259 {
2260 SK_LOCK_ASSERT_HELD();
2261
2262 nx->nx_refcnt++;
2263 VERIFY(nx->nx_refcnt > 0);
2264 }
2265
/*
 * Take a reference on 'nx' for callers that do not already hold
 * SK_LOCK; simply wraps nx_retain_locked() in the lock.
 */
void
nx_retain(struct kern_nexus *nx)
{
	SK_LOCK();
	nx_retain_locked(nx);
	SK_UNLOCK();
}
2273
2274 int
nx_release_locked(struct kern_nexus * nx)2275 nx_release_locked(struct kern_nexus *nx)
2276 {
2277 int oldref = nx->nx_refcnt;
2278
2279 SK_LOCK_ASSERT_HELD();
2280
2281 VERIFY(nx->nx_refcnt > 0);
2282 if (--nx->nx_refcnt == 0) {
2283 nx_free(nx);
2284 }
2285
2286 return oldref == 1;
2287 }
2288
/*
 * Unlocked variant of nx_release_locked(): acquires SK_LOCK, drops a
 * reference, and reports whether it was the last one.
 */
int
nx_release(struct kern_nexus *nx)
{
	SK_LOCK_ASSERT_NOTHELD();

	SK_LOCK();
	int lastref = nx_release_locked(nx);
	SK_UNLOCK();

	return lastref;
}
2302
/*
 * Invoke the external provider's ring_init callback on every non-host
 * TX/RX ring of the channel's adapter, then initialize per-slot state
 * via nx_init_slots().  On any failure the already-initialized rings
 * are unwound with nx_fini_rings().  Returns 0 or the callback error.
 */
static int
nx_init_rings(struct kern_nexus *nx, struct kern_channel *ch)
{
	struct kern_nexus_provider *nxprov = NX_PROV(nx);
	struct nexus_adapter *na = ch->ch_na;
	boolean_t undo = FALSE;
	int ksd_retains = 0;
	enum txrx t;
	int err = 0;

	/* channel must be pre-connected but not yet connected */
	ASSERT((ch->ch_flags & (CHANF_EXT_PRECONNECT | CHANF_EXT_CONNECTED)) ==
	    CHANF_EXT_PRECONNECT);

	/* nothing to do if the provider supplies no ring init callback */
	if (nxprov->nxprov_ext.nxpi_ring_init == NULL) {
		return 0;
	}

	for_rx_tx(t) {
		uint32_t i;

		for (i = 0; i < na_get_nrings(na, t); i++) {
			struct __kern_channel_ring *kring = &NAKR(na, t)[i];

			/* skip host rings */
			if (kring->ckr_flags & CKRF_HOST) {
				continue;
			}

			if ((err = nxprov->nxprov_ext.nxpi_ring_init(
			    nxprov, nx, ch, kring, (kring->ckr_tx == NR_TX),
			    &kring->ckr_ctx)) != 0) {
				SK_D("ch 0x%llx flags %b nx 0x%llx kr \"%s\" "
				    "(0x%llx) krflags %b ring_init error %d",
				    SK_KVA(ch), ch->ch_flags, CHANF_BITS,
				    SK_KVA(nx), kring->ckr_name, SK_KVA(kring),
				    kring->ckr_flags, CKRF_BITS, err);
				kring->ckr_ctx = NULL;
				undo = TRUE;
				break;
			}
			kring->ckr_flags |= CKRF_EXT_RING_INITED;

			if ((err = nx_init_slots(nx, kring)) != 0) {
				undo = TRUE;
				break;
			}

			/* count rings whose slots will need a KSD busy ref */
			if (kring->ckr_flags & CKRF_EXT_SLOTS_INITED) {
				++ksd_retains;
			}
		}
		if (undo) {
			break;
		}
	}

	/*
	 * Note: retain KSD even in case of error, as we have set
	 * CKRF_EXT_SLOTS_INITED flag for some of the rings
	 * nx_fini_rings would take care of release based on it.
	 */
	if (ksd_retains != 0) {
		/*
		 * Mark the kernel slot descriptor region as busy; this
		 * prevents it from being torn-down at channel defunct
		 * time, as we need to invoke the slot_fini() callback
		 * for each slot and we need the descriptors until then.
		 */
		skmem_arena_nexus_sd_set_noidle(skmem_arena_nexus(na->na_arena),
		    ksd_retains);
	}

	if (err != 0) {
		ASSERT(undo);
		nx_fini_rings(nx, ch);
	}

	return err;
}
2382
/*
 * Inverse of nx_init_rings(): invoke the provider's ring_fini callback
 * on every ring that was ring-initialized, tear down per-slot state,
 * and drop the KSD busy references taken during init.  Also used as
 * the error-unwind path of nx_init_rings(), so rings that never got
 * CKRF_EXT_RING_INITED are skipped.
 */
static void
nx_fini_rings(struct kern_nexus *nx, struct kern_channel *ch)
{
	struct kern_nexus_provider *nxprov = NX_PROV(nx);
	struct nexus_adapter *na = ch->ch_na;
	int ksd_releases = 0;
	enum txrx t;

	for_rx_tx(t) {
		uint32_t i;

		for (i = 0; i < na_get_nrings(na, t); i++) {
			struct __kern_channel_ring *kring = &NAKR(na, t)[i];

			if (!(kring->ckr_flags & CKRF_EXT_RING_INITED)) {
				continue;
			}

			/* host rings are never ring-initialized */
			ASSERT(!(kring->ckr_flags & CKRF_HOST));
			ASSERT(nxprov->nxprov_ext.nxpi_ring_fini != NULL);
			nxprov->nxprov_ext.nxpi_ring_fini(nxprov, nx, kring);
			kring->ckr_flags &= ~CKRF_EXT_RING_INITED;

			/* one KSD busy ref was taken per slots-inited ring */
			if (kring->ckr_flags & CKRF_EXT_SLOTS_INITED) {
				++ksd_releases;
			}

			/*
			 * Undo the work done in nx_init_slots() and inform
			 * the external domain provider, if applicable, that
			 * the slots for this ring are no longer valid.
			 */
			nx_fini_slots(nx, kring);
			kring->ckr_ctx = NULL;
		}
	}

	if (ksd_releases != 0) {
		/*
		 * Now that we've finished invoking the slot_fini()
		 * callbacks, release the busy retain counts held
		 * earlier in nx_init_rings(). This will allow the
		 * kernel slot descriptor region to be torn down.
		 */
		skmem_arena_nexus_sd_set_noidle(
			skmem_arena_nexus(na->na_arena), -ksd_releases);
	}
}
2431
/*
 * Invoke the external provider's slot_init callback for each slot of
 * 'kring', recording the returned per-slot context argument.  On
 * failure, the already-initialized slots are unwound with
 * nx_fini_slots().  Sets CKRF_EXT_SLOTS_INITED on full success.
 * Returns 0 or the callback error.
 */
static int
nx_init_slots(struct kern_nexus *nx, struct __kern_channel_ring *kring)
{
	struct kern_nexus_provider *nxprov = NX_PROV(nx);
	struct __slot_desc *slot = kring->ckr_ksds;
	int err = 0;
	uint32_t i;

	/*
	 * If the slot init callback was not provided, or if the
	 * kring was not created to hold any slot contexts, don't
	 * go any further.
	 */
	if (nxprov->nxprov_ext.nxpi_slot_init == NULL ||
	    kring->ckr_slot_ctxs == NULL) {
		return 0;
	}

	ASSERT(kring->ckr_slot_ctxs_set == 0);
	ASSERT(slot != NULL);

	for (i = 0; i < kring->ckr_num_slots; i++) {
		struct kern_slot_prop *__single slot_ctx_prop = NULL;
		/* -fbounds-safety: slot_ctx is unsafe anyway (mach_vmaddr_t) */
		void *__single slot_ctx_arg = NULL;

		ASSERT(&slot[i] <= kring->ckr_ksds_last);
		if ((err = nxprov->nxprov_ext.nxpi_slot_init(nxprov, nx, kring,
		    &slot[i], i, &slot_ctx_prop, &slot_ctx_arg)) != 0) {
			SK_D("nx 0x%llx kr \"%s\" (0x%llx) krflags %b slot %u "
			    "slot_init error %d", SK_KVA(nx), kring->ckr_name,
			    SK_KVA(kring), kring->ckr_flags, CKRF_BITS, i, err);
			break;
		}
		/* we don't want this to be used by client, so verify here */
		ASSERT(slot_ctx_prop == NULL);
		kring->ckr_slot_ctxs[i].slot_ctx_arg = slot_ctx_arg;
		/* ckr_slot_ctxs_set tracks how many slots need unwinding */
		kring->ckr_slot_ctxs_set++;
	}

	if (err != 0) {
		nx_fini_slots(nx, kring);
	} else {
		kring->ckr_flags |= CKRF_EXT_SLOTS_INITED;
	}

	return err;
}
2480
/*
 * Inverse of nx_init_slots(): invoke the provider's slot_fini callback
 * for each slot that was initialized (the first ckr_slot_ctxs_set
 * slots), clear the recorded per-slot context arguments, and drop
 * CKRF_EXT_SLOTS_INITED.  Also serves as nx_init_slots()'s own
 * partial-failure unwind path.
 */
static void
nx_fini_slots(struct kern_nexus *nx, struct __kern_channel_ring *kring)
{
	struct kern_nexus_provider *nxprov = NX_PROV(nx);
	struct __slot_desc *slot = kring->ckr_ksds;
	uint32_t i;

	/* a fully slots-inited ring implies a slot_fini callback exists */
	ASSERT(!(kring->ckr_flags & CKRF_EXT_SLOTS_INITED) ||
	    nxprov->nxprov_ext.nxpi_slot_fini != NULL);
	ASSERT(slot != NULL || !(kring->ckr_flags & CKRF_EXT_SLOTS_INITED));

	for (i = 0; i < kring->ckr_slot_ctxs_set; i++) {
		ASSERT(slot != NULL && &slot[i] <= kring->ckr_ksds_last);
		if (nxprov->nxprov_ext.nxpi_slot_fini != NULL) {
			nxprov->nxprov_ext.nxpi_slot_fini(nxprov, nx,
			    kring, &slot[i], i);
		}
		if (kring->ckr_slot_ctxs != NULL) {
			kring->ckr_slot_ctxs[i].slot_ctx_arg = 0;
		}
	}
	kring->ckr_slot_ctxs_set = 0;

	/* We're done with this kring */
	kring->ckr_flags &= ~CKRF_EXT_SLOTS_INITED;
}
2507
2508
/*
 * 64-bit mask with range: yields a bitmap with bits [_beg, _end]
 * (inclusive, 0-based) set and all others clear.  Built by shifting
 * the all-ones chunk pattern down to bit _end, then stripping the
 * bits below _beg.
 */
#define BMASK64(_beg, _end) \
	((NX_PORT_CHUNK_FREE >> (63 - (_end))) & ~((1ULL << (_beg)) - 1))
2512
/*
 * Find the first available nexus port in the half-open range
 * [first, last).  A set bit in nx_ports_bmap means the port is free.
 * If the range lies (partly) beyond the current port map, return the
 * index at which nx_port_alloc() should later grow the map.  Returns
 * 0 with *nx_port filled in, or EBUSY if the in-map portion of the
 * range is fully occupied.
 */
int
nx_port_find(struct kern_nexus *nx, nexus_port_t first,
    nexus_port_t last, nexus_port_t *nx_port)
{
	int err = 0;

	ASSERT(first < last);
	*nx_port = NEXUS_PORT_ANY;

	if (nx->nx_num_ports == 0 || (first + 1) >= nx->nx_num_ports) {
		/*
		 * Left edge of the range is beyond the current map;
		 * let nx_port_alloc() handle the growing later.
		 */
		*nx_port = first;
	} else {
		/* first/last/limit chunk indices into the bitmap */
		nexus_port_size_t fc = (first / NX_PORT_CHUNK);
		nexus_port_size_t lc = (MIN(last, nx->nx_num_ports) / NX_PORT_CHUNK);
		nexus_port_size_t lim = (nx->nx_num_ports / NX_PORT_CHUNK);
		nexus_port_size_t i, j;
		bitmap_t *bmap;

		/*
		 * The right edge of the range is either within or
		 * beyond the current map; scan thru the current
		 * map and find the first available port.
		 */
		for (i = fc; i <= lc; i++) {
			bitmap_t mask;
			nexus_port_size_t beg = 0, end = 63;

			/* clip the mask at the edges of [first, last] */
			if (i == fc) {
				beg = (first % NX_PORT_CHUNK);
			}
			if (i == (last / NX_PORT_CHUNK)) {
				end = (last % NX_PORT_CHUNK);
			}

			if (i < lim) {
				bmap = &nx->nx_ports_bmap[i];
				mask = BMASK64(beg, end);

				/* ffsll: 1-based index of lowest set (free) bit, 0 if none */
				j = (nexus_port_size_t)ffsll((*bmap) & mask);
				if (j == 0) {
					/* no free port in this chunk; try next */
					continue;
				}

				--j;
				*nx_port = (i * NX_PORT_CHUNK) + j;
			}
			/* found one, or walked past the map: stop scanning */
			break;
		}

		/*
		 * If the requested range is within the current map and we
		 * couldn't find a port, return an err.  Otherwise, return
		 * the next port index to trigger growing later.
		 */
		if (*nx_port == NEXUS_PORT_ANY) {
			if (lc == (last / NX_PORT_CHUNK)) {
				err = EBUSY;
				SK_ERR("port unavail in [%u, %u)", first, last);
			} else {
				*nx_port = nx->nx_num_ports;
			}
		}
	}

	SK_DF(SK_VERB_NXPORT, "nx 0x%llx nx_port %d (err %d)", SK_KVA(nx),
	    (int)*nx_port, err);

	return err;
}
2586
/*
 * Grow the nexus port table and free-port bitmap by 'grow' ports
 * (a multiple of NX_PORT_CHUNK), up to the domain's port maximum.
 * Returns 0 on success, EDOM if the request exceeds the domain
 * limit, or ENOMEM on allocation failure.
 */
static int
nx_port_grow(struct kern_nexus *nx, nexus_port_size_t grow)
{
	ASSERT(NXDOM_MAX(NX_DOM(nx), ports) <= NEXUS_PORT_MAX);
	nexus_port_t dom_port_max = (nexus_port_size_t)NXDOM_MAX(NX_DOM(nx), ports);
	struct nx_port_info *ports;
	size_t limit;
	nexus_port_size_t i, num_ports, old_num_ports;
	bitmap_t *bmap;

	/* growth happens in whole chunks; a bitmap word covers one chunk */
	ASSERT(grow > 0 && (grow % NX_PORT_CHUNK) == 0);
	ASSERT((nx->nx_num_ports % NX_PORT_CHUNK) == 0);
	_CASSERT((sizeof(*bmap) * 8) == NX_PORT_CHUNK);
	ASSERT(powerof2(dom_port_max));
	ASSERT(dom_port_max % NX_PORT_CHUNK == 0);

	old_num_ports = nx->nx_num_ports;
	num_ports = nx->nx_num_ports + grow;
	limit = P2ROUNDUP(dom_port_max, NX_PORT_CHUNK);
	if (num_ports > limit) {
		SK_ERR("can't grow, total %u grow %u (new %u > dom_max %u)",
		    nx->nx_num_ports, grow, num_ports, limit);
		return EDOM;
	}

	/* grow the free-port bitmap first */
	if ((bmap = sk_realloc_data(nx->nx_ports_bmap,
	    (old_num_ports / NX_PORT_CHUNK) * sizeof(*bmap),
	    (num_ports / NX_PORT_CHUNK) * sizeof(*bmap),
	    Z_WAITOK, skmem_tag_nx_port)) == NULL) {
		SK_ERR("bmap alloc failed, num_port %u", num_ports);
		return ENOMEM;
	}
	nx->nx_ports_bmap = bmap;
	nx->nx_ports_bmap_size = (num_ports / NX_PORT_CHUNK) * sizeof(*bmap);

	if ((ports = sk_realloc_type_array(struct nx_port_info, old_num_ports,
	    num_ports, nx->nx_ports, Z_WAITOK, skmem_tag_nx_port)) == NULL) {
		/* can't free bmap here, otherwise nexus won't work */
		SK_ERR("nx_ports alloc failed, num_port %u", num_ports);
		return ENOMEM;
	}

	/* initialize the additional new ports */
	bzero(&ports[nx->nx_num_ports], (grow * sizeof(*ports)));

	/* initialize new bitmaps (set all bits); set bit == port free */
	for (i = (nx->nx_num_ports / NX_PORT_CHUNK);
	    i < (num_ports / NX_PORT_CHUNK); i++) {
		bmap[i] = NX_PORT_CHUNK_FREE;
	}

	/*
	 * -fbounds-safety: Not sure if moving nx_ports assignment down here
	 * would cause a regression.
	 */
	nx->nx_ports = ports;
	nx->nx_num_ports = num_ports;

	SK_DF(SK_VERB_NXPORT, "!!! nx 0x%llx ports %u/%u, %u ports added",
	    SK_KVA(nx), nx->nx_active_ports, nx->nx_num_ports, grow);

	return 0;
}
2650
2651 int
nx_port_alloc(struct kern_nexus * nx,nexus_port_t nx_port,struct nxbind * nxb,struct nexus_adapter ** na,struct proc * p)2652 nx_port_alloc(struct kern_nexus *nx, nexus_port_t nx_port, struct nxbind *nxb,
2653 struct nexus_adapter **na, struct proc *p)
2654 {
2655 struct nx_port_info *npi = NULL;
2656 struct nxbind *nxb0;
2657 size_t g;
2658 uint32_t i, j;
2659 bitmap_t *bmap;
2660 bool refonly = false;
2661 int err = 0;
2662
2663 ASSERT(nx_port != NEXUS_PORT_ANY);
2664 ASSERT((nx->nx_num_ports % NX_PORT_CHUNK) == 0);
2665
2666 /* port is zero-based, so adjust here */
2667 if ((nx_port + 1) > nx->nx_num_ports) {
2668 g = P2ROUNDUP((nx_port + 1) - nx->nx_num_ports, NX_PORT_CHUNK);
2669 VERIFY(g <= NEXUS_PORT_MAX);
2670 if ((err = nx_port_grow(nx, (nexus_port_size_t)g)) != 0) {
2671 goto done;
2672 }
2673 }
2674 ASSERT(err == 0);
2675 ASSERT(nx_port < nx->nx_num_ports);
2676 npi = &nx->nx_ports[nx_port];
2677 nxb0 = npi->npi_nxb;
2678 i = nx_port / NX_PORT_CHUNK;
2679 j = nx_port % NX_PORT_CHUNK;
2680 bmap = &nx->nx_ports_bmap[i];
2681
2682 if (bit_test(*bmap, j)) {
2683 /* port is not (yet) bound or allocated */
2684 ASSERT(npi->npi_nah == 0 && npi->npi_nxb == NULL);
2685 if (p != kernproc && !NX_ANONYMOUS_PROV(nx)) {
2686 /*
2687 * If the port allocation is requested by userland
2688 * and the nexus is non-anonymous, then fail the
2689 * request.
2690 */
2691 err = EACCES;
2692 SK_ERR("user proc alloc on named nexus needs binding");
2693 } else if (na != NULL && *na != NULL) {
2694 /*
2695 * Otherwise claim it (clear bit) if the caller
2696 * supplied an adapter for this port; else, it
2697 * is just an existential check and so there's
2698 * no action needed at this point (we'll skip
2699 * the init below since vpna is NULL).
2700 */
2701 bit_clear(*bmap, j);
2702 }
2703 } else {
2704 /* if port is bound, check if credentials match */
2705 if (nxb0 != NULL && p != kernproc && !NX_ANONYMOUS_PROV(nx) &&
2706 (nxb == NULL || !nxb_is_equal(nxb0, nxb))) {
2707 SK_ERR("nexus binding mismatch");
2708 err = EACCES;
2709 } else {
2710 /*
2711 * If port is already occupied by an adapter,
2712 * see if the client is requesting a reference
2713 * to it; if so, return the adapter. Otherwise,
2714 * if unoccupied and vpna is non-NULL, associate
2715 * it with this nexus port via the below init.
2716 */
2717 if (NPI_NA(npi) != NULL) {
2718 if (na != NULL && *na == NULL) {
2719 *na = NPI_NA(npi);
2720 na_retain_locked(*na);
2721 /* skip the init below */
2722 refonly = true;
2723 } else {
2724 /*
2725 * If the client supplied an adapter
2726 * (regardless of its value) for a
2727 * nexus port that's already occupied,
2728 * then we fail the request.
2729 */
2730 SK_ERR("nexus adapted exits");
2731 err = EEXIST;
2732 }
2733 }
2734 }
2735 }
2736
2737 done:
2738 /* initialize the nexus port and the adapter occupying it */
2739 if (err == 0 && na != NULL && *na != NULL && !refonly) {
2740 ASSERT(nx_port < nx->nx_num_ports);
2741 ASSERT(npi->npi_nah == 0);
2742 ASSERT(nx->nx_active_ports < nx->nx_num_ports);
2743 ASSERT(!bit_test(nx->nx_ports_bmap[nx_port / NX_PORT_CHUNK],
2744 (nx_port % NX_PORT_CHUNK)));
2745
2746 nx->nx_active_ports++;
2747 npi->npi_nah = NPI_NA_ENCODE(*na, NEXUS_PORT_STATE_WORKING);
2748 (*na)->na_nx_port = nx_port;
2749 }
2750
2751 SK_DF(SK_VERB_NXPORT, "nx 0x%llx nx_port %d, ports %u/%u (err %d)",
2752 SK_KVA(nx), (int)nx_port, nx->nx_active_ports, nx->nx_num_ports,
2753 err);
2754
2755 return err;
2756 }
2757
2758 void
nx_port_defunct(struct kern_nexus * nx,nexus_port_t nx_port)2759 nx_port_defunct(struct kern_nexus *nx, nexus_port_t nx_port)
2760 {
2761 struct nx_port_info *npi = &nx->nx_ports[nx_port];
2762
2763 npi->npi_nah = NPI_NA_ENCODE(npi->npi_nah,
2764 NEXUS_PORT_STATE_DEFUNCT);
2765 }
2766
/*
 * Release an allocated nexus port: clear its adapter handle and, if
 * no binding remains on the port, mark it free again in the bitmap
 * (set bit == free).  A still-bound port keeps its bit cleared so
 * the binding survives until nx_port_unbind().
 */
void
nx_port_free(struct kern_nexus *nx, nexus_port_t nx_port)
{
	struct nx_port_info *npi = NULL;
	bitmap_t *bmap;
	uint32_t i, j;

	ASSERT((nx->nx_num_ports % NX_PORT_CHUNK) == 0);
	ASSERT(nx_port != NEXUS_PORT_ANY && nx_port < nx->nx_num_ports);
	ASSERT(nx->nx_active_ports != 0);

	i = nx_port / NX_PORT_CHUNK;
	j = nx_port % NX_PORT_CHUNK;
	bmap = &nx->nx_ports_bmap[i];
	/* an allocated port must have its bit cleared */
	ASSERT(!bit_test(*bmap, j));

	npi = &nx->nx_ports[nx_port];
	npi->npi_nah = 0;
	if (npi->npi_nxb == NULL) {
		/* it's vacant, release it (set bit) */
		bit_set(*bmap, j);
	}

	nx->nx_active_ports--;

	//XXX [email protected] --- try to shrink bitmap & nx_ports ???

	SK_DF(SK_VERB_NXPORT, "--- nx 0x%llx nx_port %d, ports %u/%u",
	    SK_KVA(nx), (int)nx_port, nx->nx_active_ports, nx->nx_num_ports);
}
2797
2798 int
nx_port_bind_info(struct kern_nexus * nx,nexus_port_t nx_port,struct nxbind * nxb0,void * info)2799 nx_port_bind_info(struct kern_nexus *nx, nexus_port_t nx_port,
2800 struct nxbind *nxb0, void *info)
2801 {
2802 struct nx_port_info *npi = NULL;
2803 size_t g;
2804 uint32_t i, j;
2805 bitmap_t *bmap;
2806 int err = 0;
2807
2808 ASSERT(nx_port != NEXUS_PORT_ANY);
2809 ASSERT(nx_port < NXDOM_MAX(NX_DOM(nx), ports));
2810 ASSERT((nx->nx_num_ports % NX_PORT_CHUNK) == 0);
2811 ASSERT(nxb0 != NULL);
2812
2813 if ((nx_port) + 1 > nx->nx_num_ports) {
2814 g = P2ROUNDUP((nx_port + 1) - nx->nx_num_ports, NX_PORT_CHUNK);
2815 VERIFY(g <= NEXUS_PORT_MAX);
2816 if ((err = nx_port_grow(nx, (nexus_port_size_t)g)) != 0) {
2817 goto done;
2818 }
2819 }
2820 ASSERT(err == 0);
2821
2822 npi = &nx->nx_ports[nx_port];
2823 i = nx_port / NX_PORT_CHUNK;
2824 j = nx_port % NX_PORT_CHUNK;
2825 bmap = &nx->nx_ports_bmap[i];
2826 if (bit_test(*bmap, j)) {
2827 /* port is not (yet) bound or allocated */
2828 ASSERT(npi->npi_nah == 0 && npi->npi_nxb == NULL);
2829
2830 bit_clear(*bmap, j);
2831 struct nxbind *nxb = nxb_alloc(Z_WAITOK);
2832 nxb_move(nxb0, nxb);
2833 npi->npi_nxb = nxb;
2834 npi->npi_info = info;
2835 /* claim it (clear bit) */
2836 bit_clear(*bmap, j);
2837 ASSERT(err == 0);
2838 } else {
2839 /* port is already taken */
2840 ASSERT(NPI_NA(npi) != NULL || npi->npi_nxb != NULL);
2841 err = EEXIST;
2842 }
2843 done:
2844
2845 SK_DF(err ? SK_VERB_ERROR : SK_VERB_NXPORT,
2846 "+++ nx 0x%llx nx_port %d, ports %u/%u (err %d)", SK_KVA(nx),
2847 (int)nx_port, nx->nx_active_ports, nx->nx_num_ports, err);
2848
2849 return err;
2850 }
2851
/*
 * Bind credentials to a nexus port with no associated info blob;
 * thin wrapper around nx_port_bind_info().
 */
int
nx_port_bind(struct kern_nexus *nx, nexus_port_t nx_port, struct nxbind *nxb0)
{
	return nx_port_bind_info(nx, nx_port, nxb0, NULL);
}
2857
2858 /*
2859 * -fbounds-safety: all callers pass npi_info. Why don't we just change the
2860 * input type to nx_port_info_header *?
2861 */
2862 static int
nx_port_info_size(struct nx_port_info_header * info,size_t * sz)2863 nx_port_info_size(struct nx_port_info_header *info, size_t *sz)
2864 {
2865 struct nx_port_info_header *hdr = info;
2866
2867 switch (hdr->ih_type) {
2868 case NX_PORT_INFO_TYPE_NETIF:
2869 break;
2870 default:
2871 return EINVAL;
2872 }
2873 *sz = hdr->ih_size;
2874 return 0;
2875 }
2876
/*
 * Remove the binding (and any attached info blob) from a nexus port.
 * If no adapter still occupies the port, it is returned to the free
 * bitmap (set bit == free).  Returns 0, EDOM if the port is out of
 * range, or ENOENT if the port was not bound.
 */
int
nx_port_unbind(struct kern_nexus *nx, nexus_port_t nx_port)
{
	struct nx_port_info *npi = NULL;
	struct nxbind *nxb;
	uint32_t i, j;
	bitmap_t *bmap;
	int err = 0;

	ASSERT(nx_port != NEXUS_PORT_ANY);

	if (nx_port >= nx->nx_num_ports) {
		err = EDOM;
		goto done;
	}

	npi = &nx->nx_ports[nx_port];
	i = nx_port / NX_PORT_CHUNK;
	j = nx_port % NX_PORT_CHUNK;
	bmap = &nx->nx_ports_bmap[i];

	if ((nxb = npi->npi_nxb) == NULL) {
		/* must be either free or allocated */
		ASSERT(NPI_NA(npi) == NULL ||
		    (!bit_test(*bmap, j) && nx->nx_active_ports > 0));
		err = ENOENT;
	} else {
		nxb_free(nxb);
		npi->npi_nxb = NULL;
		if (npi->npi_info != NULL) {
			size_t sz;

			/* blob size is recovered from its own header */
			VERIFY(nx_port_info_size(npi->npi_info, &sz) == 0);
			sk_free_data(npi->npi_info, sz);
			npi->npi_info = NULL;
		}
		/* a bound port always has its bit cleared */
		ASSERT(!bit_test(*bmap, j));
		if (NPI_NA(npi) == NULL) {
			/* it's vacant, release it (set bit) */
			bit_set(*bmap, j);
		}
	}

done:
	SK_DF(err ? SK_VERB_ERROR : SK_VERB_NXPORT,
	    "--- nx 0x%llx nx_port %d, ports %u/%u (err %d)", SK_KVA(nx),
	    (int)nx_port, nx->nx_active_ports, nx->nx_num_ports, err);

	return err;
}
2927
2928 struct nexus_adapter *
nx_port_get_na(struct kern_nexus * nx,nexus_port_t nx_port)2929 nx_port_get_na(struct kern_nexus *nx, nexus_port_t nx_port)
2930 {
2931 if (nx->nx_ports != NULL && nx->nx_num_ports > nx_port) {
2932 return NPI_NA(&nx->nx_ports[nx_port]);
2933 } else {
2934 return NULL;
2935 }
2936 }
2937
/*
 * Copy 'len' bytes of the port's info blob (starting at its header)
 * into the caller's buffer, after checking that the stored header's
 * type matches 'type'.  Returns 0, ENXIO for a bad port, ENOENT if
 * the port has no info blob, or EINVAL on a type mismatch.
 */
int
nx_port_get_info(struct kern_nexus *nx, nexus_port_t port,
    nx_port_info_type_t type, void *__sized_by(len)info, uint32_t len)
{
	struct nx_port_info *npi;
	struct nx_port_info_header *hdr;

	if (nx->nx_ports == NULL || port >= nx->nx_num_ports) {
		return ENXIO;
	}
	npi = &nx->nx_ports[port];
	/*
	 * -fbounds-safety: Changing npi_info to be __sized_by is a major
	 * surgery. Just forge it here for now.
	 */
	hdr = __unsafe_forge_bidi_indexable(struct nx_port_info_header *,
	    npi->npi_info, len);
	if (hdr == NULL) {
		return ENOENT;
	}

	if (hdr->ih_type != type) {
		return EINVAL;
	}

	bcopy(hdr, info, len);
	return 0;
}
2966
/*
 * True if 'nx_port' falls within the current port map.  nexus_port_t
 * is unsigned, so only the upper bound needs checking.
 */
bool
nx_port_is_valid(struct kern_nexus *nx, nexus_port_t nx_port)
{
	return nx_port < nx->nx_num_ports;
}
2972
/*
 * True if the (valid) port has been transitioned to the DEFUNCT
 * state via nx_port_defunct().
 */
bool
nx_port_is_defunct(struct kern_nexus *nx, nexus_port_t nx_port)
{
	ASSERT(nx_port_is_valid(nx, nx_port));

	return NPI_IS_DEFUNCT(&nx->nx_ports[nx_port]);
}
2980
/*
 * Release every claimed port's binding and info blob, then free the
 * port table and the free-port bitmap entirely, resetting the nexus
 * to zero ports.  Called from nx_free() during final teardown.
 */
void
nx_port_free_all(struct kern_nexus *nx)
{
	/* uncrustify doesn't handle C blocks properly */
	/* BEGIN IGNORE CODESTYLE */
	nx_port_foreach(nx, ^(nexus_port_t p) {
		struct nxbind *nxb;
		/*
		 * XXX -fbounds-safety: Come back to this after fixing npi_info
		 */
		void *__single info;
		nxb = nx->nx_ports[p].npi_nxb;
		info = nx->nx_ports[p].npi_info;
		if (nxb != NULL) {
			nxb_free(nxb);
			nx->nx_ports[p].npi_nxb = NULL;
		}
		if (info != NULL) {
			size_t sz;

			/* blob size is recovered from its own header */
			VERIFY(nx_port_info_size(info, &sz) == 0);
			skn_free_data(info, info, sz);
			nx->nx_ports[p].npi_info = NULL;
		}
	});
	/* END IGNORE CODESTYLE */

	nx->nx_active_ports = 0;
	sk_free_data_sized_by(nx->nx_ports_bmap, nx->nx_ports_bmap_size);
	nx->nx_ports_bmap = NULL;
	nx->nx_ports_bmap_size = 0;
	sk_free_type_array_counted_by(struct nx_port_info, nx->nx_num_ports, nx->nx_ports);
	nx->nx_ports = NULL;
	nx->nx_num_ports = 0;
}
3016
3017 void
3018 nx_port_foreach(struct kern_nexus *nx,
3019 void (^port_handle)(nexus_port_t nx_port))
3020 {
3021 for (nexus_port_size_t i = 0; i < (nx->nx_num_ports / NX_PORT_CHUNK); i++) {
3022 bitmap_t bmap = nx->nx_ports_bmap[i];
3023
3024 if (bmap == NX_PORT_CHUNK_FREE) {
3025 continue;
3026 }
3027
3028 for (nexus_port_size_t j = 0; j < NX_PORT_CHUNK; j++) {
3029 if (bit_test(bmap, j)) {
3030 continue;
3031 }
3032 port_handle((i * NX_PORT_CHUNK) + j);
3033 }
3034 }
3035 }
3036
3037 /*
3038 * sysctl interfaces
3039 */
/* forward declarations for the sysctl handlers defined below */
static int nexus_provider_list_sysctl SYSCTL_HANDLER_ARGS;
static int nexus_channel_list_sysctl SYSCTL_HANDLER_ARGS;
static int nexus_mib_get_sysctl SYSCTL_HANDLER_ARGS;

SYSCTL_PROC(_kern_skywalk, OID_AUTO, nexus_provider_list,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, nexus_provider_list_sysctl, "S,nexus_provider_info_t", "");

SYSCTL_PROC(_kern_skywalk, OID_AUTO, nexus_channel_list,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, nexus_channel_list_sysctl, "S,nexus_channel_entry_t", "");

/*
 * The remaining nodes all share nexus_mib_get_sysctl; arg2 (NXMIB_*)
 * selects which MIB table the handler reports.
 */
SYSCTL_PROC(_kern_skywalk, OID_AUTO, llink_list,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, NXMIB_LLINK_LIST, nexus_mib_get_sysctl, "S,nx_llink_info",
    "A list of logical links");

SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, flow,
    CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY | CTLFLAG_KERN,
    0, NXMIB_FLOW, nexus_mib_get_sysctl, "S,sk_stats_flow",
    "Nexus inet flows with stats collected in kernel");

SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, flow_owner,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, NXMIB_FLOW_OWNER, nexus_mib_get_sysctl, "S,sk_stats_flow_owner",
    "Nexus flow owners");

SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, flow_route,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, NXMIB_FLOW_ROUTE, nexus_mib_get_sysctl, "S,sk_stats_flow_route",
    "Nexus flow routes");

SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, net_if,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, NXMIB_NETIF_STATS, nexus_mib_get_sysctl, "S,sk_stats_net_if",
    "Nexus netif statistics collected in kernel");

SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, flow_switch,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, NXMIB_FSW_STATS, nexus_mib_get_sysctl, "S,sk_stats_flow_switch",
    "Nexus flowswitch statistics collected in kernel");

SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, userstack,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, NXMIB_USERSTACK_STATS, nexus_mib_get_sysctl, "S,sk_stats_userstack",
    "Nexus userstack statistics counter");

SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, flow_adv,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, NXMIB_FLOW_ADV, nexus_mib_get_sysctl, "S,sk_stats_flow_adv",
    "Nexus flow advisory dump");

SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, netif_queue,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, NXMIB_NETIF_QUEUE_STATS, nexus_mib_get_sysctl, "S,netif_qstats_info",
    "A list of netif queue stats entries");
3096
3097 /*
3098 * Provider list sysctl
3099 */
/*
 * Fill one nexus_provider_info record: the provider's UUID and
 * parameters, followed by one UUID per attached nexus instance.
 * The caller supplies a buffer sized via NEXUS_PROVIDER_INFO_SIZE()
 * for the provider's current instance count (see the list sysctl
 * handler below).
 */
static void
nexus_provider_info_populate(struct kern_nexus_provider *nxprov,
    nexus_provider_info_t info)
{
	struct kern_nexus *nx;
	uuid_t *uuids;

	SK_LOCK_ASSERT_HELD();

	/* provider UUID + params */
	uuid_copy(info->npi_prov_uuid, nxprov->nxprov_uuid);
	bcopy(nxprov->nxprov_params, &info->npi_prov_params,
	    sizeof(struct nxprov_params));
	info->npi_instance_uuids_count = nxprov->nxprov_nx_count;

	/* instance UUID list */
	/* -fbounds-safety: forge a bounded view over the trailing array */
	uuids = __unsafe_forge_bidi_indexable(uuid_t *,
	    info->npi_instance_uuids, sizeof(uuid_t) * info->npi_instance_uuids_count);
	STAILQ_FOREACH(nx, &nxprov->nxprov_nx_head, nx_prov_link) {
		uuid_copy(*uuids, nx->nx_uuid);
		uuids++;
	}
}
3123
/*
 * kern.skywalk.nexus_provider_list handler: serialize one
 * variable-sized nexus_provider_info record per registered provider.
 * Root only.  Follows the standard sysctl size-probe protocol: a
 * NULL oldptr request copies nothing and just reports the total
 * required size via SYSCTL_OUT.
 */
static int
nexus_provider_list_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2, oidp)
	size_t actual_space;
	caddr_t buffer = NULL;
	size_t buffer_space;
	size_t allocated_space;
	int out_error;
	int error = 0;
	struct kern_nexus_provider *nxprov;
	caddr_t scan;

	if (!kauth_cred_issuser(kauth_cred_get())) {
		return EPERM;
	}

	net_update_uptime();
	buffer_space = req->oldlen;
	if (req->oldptr != USER_ADDR_NULL && buffer_space != 0) {
		/* cap the temporary kernel-side staging buffer */
		if (buffer_space > SK_SYSCTL_ALLOC_MAX) {
			buffer_space = SK_SYSCTL_ALLOC_MAX;
		}
		allocated_space = buffer_space;
		buffer = sk_alloc_data(allocated_space, Z_WAITOK, skmem_tag_sysctl_buf);
		if (__improbable(buffer == NULL)) {
			return ENOBUFS;
		}
	} else if (req->oldptr == USER_ADDR_NULL) {
		/* size probe: count records without copying */
		buffer_space = 0;
	}
	actual_space = 0;
	scan = buffer;
	SK_LOCK();
	STAILQ_FOREACH(nxprov, &nxprov_head, nxprov_link) {
		size_t info_size;

		/* record size varies: one uuid per nexus instance */
		info_size
		        = NEXUS_PROVIDER_INFO_SIZE(nxprov->nxprov_nx_count);
		if (scan != NULL) {
			if (buffer_space < info_size) {
				/* supplied buffer too small, stop copying */
				error = ENOMEM;
				break;
			}
			nexus_provider_info_populate(nxprov, (void *)scan);
			scan += info_size;
			buffer_space -= info_size;
		}
		actual_space += info_size;
	}
	SK_UNLOCK();

	/* copy out what was staged (or just report the size) */
	out_error = SYSCTL_OUT(req, buffer, actual_space);
	if (out_error != 0) {
		error = out_error;
	}

	if (buffer != NULL) {
		sk_free_data(buffer, allocated_space);
	}

	return error;
}
3188
3189 /*
3190 * Channel list sysctl
3191 */
/*
 * Number of rings of type 'which' owned by the channel: the ring
 * index range is [ch_first, ch_last).
 */
static uint32_t
channel_ring_count(struct kern_channel *ch, enum txrx which)
{
	return ch->ch_last[which] - ch->ch_first[which];
}
3197
3198 /*
3199 * -fbounds-safety: kring's range is [first..last]. Marking it
3200 * __counted_by(last) means range is [0..first..last]. The [0..first) might be
3201 * problematic. However, the for loop in this function starts indexing from
3202 * 'first', not 0, so that should be okay.
3203 * XXX Until BATS starts using uncrustify-7 (rdar://90709826), having a space
3204 * between __counted_by(entry_count) entries will be considered invalid code
3205 * style and build will fail. Until rdar://117811249 is resolved, either stick
3206 * to what makes BATS happy, or wrap IGNORE CODESTYLE around.
3207 */
3208 static void
populate_ring_entries(struct __kern_channel_ring * __counted_by (last)kring,ring_id_t first,ring_id_t last,nexus_channel_ring_entry * __counted_by (entry_count)entries,uint32_t NX_FB_ARG entry_count)3209 populate_ring_entries(struct __kern_channel_ring *__counted_by(last)kring,
3210 ring_id_t first, ring_id_t last,
3211 nexus_channel_ring_entry *__counted_by(entry_count)entries,
3212 uint32_t NX_FB_ARG entry_count)
3213 {
3214 ring_id_t i;
3215 nexus_channel_ring_entry_t scan;
3216 struct __kern_channel_ring *ring;
3217
3218 scan = entries;
3219 for (i = first; i < last; i++, scan++) {
3220 ring = &kring[i];
3221
3222 DTRACE_SKYWALK1(populate__ring, struct __kern_channel_ring *,
3223 ring);
3224 if (kr_stat_enable == 0) {
3225 bzero(&scan->ncre_stats, sizeof(scan->ncre_stats));
3226 bzero(&scan->ncre_user_stats,
3227 sizeof(scan->ncre_user_stats));
3228 } else {
3229 scan->ncre_stats = ring->ckr_stats;
3230 scan->ncre_user_stats = ring->ckr_usr_stats;
3231 }
3232 scan->ncre_error_stats = ring->ckr_err_stats;
3233 scan->ncre_ring_id = i;
3234 }
3235 }
3236
3237 /* combine/convert ch_mode/ch_flags into nexus_channel_entry flags */
3238 static uint32_t
nexus_channel_get_flags(uint32_t ch_mode,uint32_t ch_flags)3239 nexus_channel_get_flags(uint32_t ch_mode, uint32_t ch_flags)
3240 {
3241 uint32_t flags = 0;
3242
3243 flags |= (ch_mode & CHMODE_MONITOR_TX) ? SCHF_MONITOR_TX : 0;
3244 flags |= (ch_mode & CHMODE_MONITOR_RX) ? SCHF_MONITOR_RX : 0;
3245 flags |= (ch_mode & CHMODE_MONITOR_NO_COPY) ? SCHF_MONITOR_NO_COPY : 0;
3246 flags |= (ch_mode & CHMODE_USER_PACKET_POOL) ? SCHF_USER_PACKET_POOL : 0;
3247 flags |= (ch_mode & CHMODE_DEFUNCT_OK) ? SCHF_DEFUNCT_OK : 0;
3248 flags |= (ch_mode & CHMODE_FILTER) ? SCHF_FILTER : 0;
3249 flags |= (ch_mode & CHMODE_EVENT_RING) ? SCHF_EVENT_RING : 0;
3250 flags |= (ch_mode & CHMODE_EXCLUSIVE) ? SCHF_EXCLUSIVE : 0;
3251 flags |= (ch_flags & CHANF_IF_ADV) ? SCHF_IF_ADV : 0;
3252 flags |= (ch_flags & CHANF_DEFUNCT_SKIP) ? SCHF_DEFUNCT_SKIP : 0;
3253 flags |= (ch_flags & CHANF_CLOSING) ? SCHF_CLOSING : 0;
3254 flags |= (ch_flags & CHANF_DEFUNCT) ? SCHF_DEFUNCT : 0;
3255 flags |= (ch_mode & CHMODE_LOW_LATENCY) ? SCHF_LOW_LATENCY : 0;
3256
3257 return flags;
3258 }
3259
/*
 * Fill one nexus_channel_entry from 'ch': channel UUID, exported flags,
 * nexus port, owning pid/fd, and ring counts, then the per-ring stats —
 * TX ring entries first, followed by RX ring entries, in the same
 * nce_ring_entries flexible array.  Caller must have sized 'entry' with
 * NEXUS_CHANNEL_ENTRY_SIZE() and set entry->nce_ring_count.
 */
SK_NO_INLINE_ATTRIBUTE
static void
nexus_channel_entry_populate(struct kern_channel *ch,
    nexus_channel_entry_t entry)
{
	uint32_t ch_mode = ch->ch_info->cinfo_ch_mode;
	uint32_t ch_flags = ch->ch_flags;
	ring_id_t rx_first = ch->ch_first[NR_RX];
	ring_id_t rx_last = ch->ch_last[NR_RX];
	ring_id_t tx_last = ch->ch_last[NR_TX];
	ring_id_t tx_first = ch->ch_first[NR_TX];

	uuid_copy(entry->nce_uuid, ch->ch_info->cinfo_ch_id);
	entry->nce_flags = nexus_channel_get_flags(ch_mode, ch_flags);
	entry->nce_port = ch->ch_info->cinfo_nx_port;
	entry->nce_pid = ch->ch_pid;
	entry->nce_fd = ch->ch_fd;
	entry->nce_tx_rings = tx_last - tx_first;
	entry->nce_rx_rings = rx_last - rx_first;
	populate_ring_entries(ch->ch_na->na_tx_rings, tx_first, tx_last,
	    entry->nce_ring_entries, entry->nce_tx_rings);

	/*
	 * -fbounds-safety: If entry->nce_tx_rings > 0 and
	 * entry->nce_rx_rings == 0 (i.e. entry->nce_ring_count ==
	 * entry->nce_tx_rings), simply passing
	 * entry->nce_ring_entries + entry->nce_tx_rings to populate_ring_entries
	 * will fail bounds check, because it is equivalent to assigning
	 * nce_ring_entries + nce_tx_rings to a __single variable, and in this
	 * case it goes out of bounds. It's same thing as having:
	 *   int a[1];
	 *   some_func(a + 1); <-- bounds check will fail
	 * Hence the guard below: only compute the offset pointer when there
	 * is at least one RX ring to populate.
	 */
	if (rx_first < rx_last) {
		populate_ring_entries(ch->ch_na->na_rx_rings, rx_first, rx_last,
		    entry->nce_ring_entries + entry->nce_tx_rings,
		    entry->nce_rx_rings);
	}
}
3299
/*
 * Compute (and optionally fill) the channel-list report for one nexus.
 *
 * Returns the number of bytes required to describe 'nx' and all of its
 * open channels.  When 'info' is non-NULL the report is written into it as
 * long as it fits in 'buffer_size'; if the buffer becomes too small part
 * way through, the (larger) required size is returned and the caller
 * treats the output as truncated.  Caller must hold the SK lock.
 */
SK_NO_INLINE_ATTRIBUTE
static size_t
nexus_channel_info_populate(struct kern_nexus *nx,
    nexus_channel_info *__sized_by(buffer_size) info, size_t buffer_size)
{
	struct kern_channel *ch = NULL;
	size_t info_size;
	caddr_t scan = NULL;
	nexus_channel_entry *entry;

	SK_LOCK_ASSERT_HELD();

	/* fixed-size header precedes the variable-length entries */
	info_size = sizeof(nexus_channel_info);

	/* channel list */
	if (info != NULL) {
		if (buffer_size < info_size) {
			return info_size;
		}

		/* instance UUID */
		uuid_copy(info->nci_instance_uuid, nx->nx_uuid);
		info->nci_channel_entries_count = nx->nx_ch_count;
		scan = (caddr_t __bidi_indexable)info->nci_channel_entries;
	}
	STAILQ_FOREACH(ch, &nx->nx_ch_head, ch_link) {
		size_t entry_size;
		uint32_t ring_count;

		/* each entry is variable-sized: one slot per TX + RX ring */
		ring_count = channel_ring_count(ch, NR_TX) +
		    channel_ring_count(ch, NR_RX);
		entry_size = NEXUS_CHANNEL_ENTRY_SIZE(ring_count);
		info_size += entry_size;
		if (scan != NULL) {
			if (buffer_size < info_size) {
				/* out of room: report required size instead */
				return info_size;
			}
			entry = (nexus_channel_entry *)(void *)scan;
			entry->nce_ring_count = ring_count;

			nexus_channel_entry_populate(ch, entry);
			scan += entry_size;
		}
	}
	return info_size;
}
3346
/*
 * sysctl handler: export, for every nexus instance, a variable-sized
 * nexus_channel_info record describing its open channels and rings.
 * Root only.  Standard two-phase sysctl pattern: a request with
 * oldptr == NULL only sizes the output; otherwise records are staged in a
 * bounded kernel buffer and copied out in one SYSCTL_OUT.
 */
static int
nexus_channel_list_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2, oidp)
	size_t actual_space;
	caddr_t buffer = NULL;
	size_t buffer_space;
	size_t allocated_space;
	int out_error;
	struct kern_nexus *nx;
	int error = 0;
	caddr_t scan;

	/* only privileged (root) callers may enumerate channels */
	if (!kauth_cred_issuser(kauth_cred_get())) {
		return EPERM;
	}

	net_update_uptime();
	buffer_space = req->oldlen;
	if (req->oldptr != USER_ADDR_NULL && buffer_space != 0) {
		/* cap the kernel staging allocation regardless of oldlen */
		if (buffer_space > SK_SYSCTL_ALLOC_MAX) {
			buffer_space = SK_SYSCTL_ALLOC_MAX;
		}
		allocated_space = buffer_space;
		buffer = sk_alloc_data(allocated_space, Z_WAITOK, skmem_tag_sysctl_buf);
		if (__improbable(buffer == NULL)) {
			return ENOBUFS;
		}
	} else if (req->oldptr == USER_ADDR_NULL) {
		/* size probe: walk the tree without copying anything */
		buffer_space = 0;
	}
	actual_space = 0;
	scan = buffer;
	SK_LOCK();
	RB_FOREACH(nx, kern_nexus_tree, &nx_head) {
		size_t info_size;

		/* returns the required size; fills 'scan' only if it fits */
		info_size = nexus_channel_info_populate(nx, (void *)scan,
		    buffer_space);
		if (scan != NULL) {
			if (buffer_space < info_size) {
				/* supplied buffer too small, stop copying */
				error = ENOMEM;
				break;
			}
			scan += info_size;
			buffer_space -= info_size;
		}
		/* always accumulate so a probe reports the required size */
		actual_space += info_size;
	}
	SK_UNLOCK();

	if (actual_space != 0) {
		out_error = SYSCTL_OUT(req, buffer, actual_space);
		if (out_error != 0) {
			error = out_error;
		}
	}
	if (buffer != NULL) {
		sk_free_data(buffer, allocated_space);
	}

	return error;
}
3411
/*
 * sysctl handler: dispatch a MIB query (type selected by oidp->oid_arg2)
 * to every nexus whose domain provider implements nxdom_prov_nx_mib_get,
 * concatenating the per-nexus results.  An optional nexus_mib_filter may
 * be passed via newptr to narrow the query; privilege checks gate the
 * stats types that expose other users' data.  Two-phase sysctl pattern:
 * oldptr == NULL sizes the output, otherwise results are staged in a
 * bounded kernel buffer and copied out in one SYSCTL_OUT.
 */
static int
nexus_mib_get_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)
	struct proc *p = req->p;
	struct nexus_mib_filter filter;
	int error = 0;
	size_t actual_space;
	size_t allocated_space = 0;
	caddr_t __sized_by(allocated_space) buffer = NULL;
	size_t buffer_space;
	int out_error;
	struct kern_nexus *nx;
	caddr_t scan;

	/* Restrict protocol stats access to root user only (like netstat). */
	if (oidp->oid_arg2 == NXMIB_USERSTACK_STATS &&
	    !kauth_cred_issuser(kauth_cred_get())) {
		SK_ERR("mib request rejected, EPERM");
		return EPERM;
	}

	if (req->newptr == USER_ADDR_NULL) {
		/*
		 * For flow stats requests, non-root users need to provide a
		 * 5-tuple. Otherwise, we do not grant access.
		 */
		if (oidp->oid_arg2 == NXMIB_FLOW &&
		    !kauth_cred_issuser(kauth_cred_get())) {
			SK_ERR("mib request rejected: tuple not provided");
			return EPERM;
		}
		/* use subcommand for multiple nodes */
		filter.nmf_type = oidp->oid_arg2;
		filter.nmf_bitmap = 0x0;
	} else if (req->newlen != sizeof(struct nexus_mib_filter)) {
		/* caller-supplied filter must be exactly one struct */
		SK_ERR("mis-matching newlen");
		return EINVAL;
	} else {
		error = SYSCTL_IN(req, &filter, sizeof(struct nexus_mib_filter));
		if (error != 0) {
			SK_ERR("SYSCTL_IN err %d", error);
			return error;
		}
		/* filter type must agree with the OID it was sent to */
		if (filter.nmf_type != oidp->oid_arg2) {
			SK_ERR("mis-matching nmf_type");
			return EINVAL;
		}
		/*
		 * For flow stats requests, non-root users need to set the nexus
		 * mib filter to NXMIB_FILTER_INFO_TUPLE. Otherwise, we do not
		 * grant access. This ensures that fsw_mib_get_flow looks for a
		 * flow entry that matches the given tuple of the non-root user.
		 */
		if (filter.nmf_type == NXMIB_FLOW &&
		    (filter.nmf_bitmap & NXMIB_FILTER_INFO_TUPLE) == 0 &&
		    !kauth_cred_issuser(kauth_cred_get())) {
			SK_ERR("mib request rejected: tuple filter not set");
			return EPERM;
		}
	}

	net_update_uptime();
	buffer_space = req->oldlen;
	if (req->oldptr != USER_ADDR_NULL && buffer_space != 0) {
		/* cap the kernel staging allocation regardless of oldlen */
		if (buffer_space > SK_SYSCTL_ALLOC_MAX) {
			buffer_space = SK_SYSCTL_ALLOC_MAX;
		}
		buffer = sk_alloc_data(buffer_space, Z_WAITOK, skmem_tag_sysctl_buf);
		allocated_space = buffer_space;
		if (__improbable(buffer == NULL)) {
			return ENOBUFS;
		}
	} else if (req->oldptr == USER_ADDR_NULL) {
		/* size probe: walk the tree without copying anything */
		buffer_space = 0;
	}
	actual_space = 0;
	scan = buffer;

	SK_LOCK();
	RB_FOREACH(nx, kern_nexus_tree, &nx_head) {
		if (NX_DOM_PROV(nx)->nxdom_prov_nx_mib_get == NULL) {
			continue;
		}

		size_t size = 0;
		struct kern_nexus_domain_provider *nx_dp = NX_DOM_PROV(nx);

		/*
		 * -fbounds-safety: Because scan takes the bounds of buffer
		 * (which is __sized_by(allocated_space)), at some point scan
		 * will reach its bounds (because of scan += size). When it
		 * does, it won't pass the bounds check when scan is passed to
		 * nxdom_prov_nx_mib_get function. We need to avoid passing scan
		 * to nxdom_prov_nx_mib_get when it reaches its upper bound,
		 * i.e. when buffer_space reaches 0 (see buffer_space -= size).
		 */
		if (req->oldptr == USER_ADDR_NULL || buffer_space) {
			size = nx_dp->nxdom_prov_nx_mib_get(nx, &filter, scan,
			    buffer_space, p);
		}

		if (scan != NULL) {
			if (buffer_space < size) {
				/* supplied buffer too small, stop copying */
				error = ENOMEM;
				break;
			}
			scan += size;
			buffer_space -= size;
		}
		/* always accumulate so a probe reports the required size */
		actual_space += size;
	}
	SK_UNLOCK();

	if (actual_space != 0) {
		out_error = SYSCTL_OUT(req, buffer, actual_space);
		if (out_error != 0) {
			error = out_error;
		}
	}
	if (buffer != NULL) {
		sk_free_data_sized_by(buffer, allocated_space);
	}

	return error;
}
3539
3540 void
kern_nexus_walktree(kern_nexus_walktree_f_t * f,void * arg0,boolean_t is_sk_locked)3541 kern_nexus_walktree(kern_nexus_walktree_f_t *f, void *arg0,
3542 boolean_t is_sk_locked)
3543 {
3544 struct kern_nexus *nx = NULL;
3545
3546 if (!is_sk_locked) {
3547 SK_LOCK();
3548 } else {
3549 SK_LOCK_ASSERT_HELD();
3550 }
3551
3552 RB_FOREACH(nx, kern_nexus_tree, &nx_head) {
3553 (*f)(nx, arg0);
3554 }
3555
3556 if (!is_sk_locked) {
3557 SK_UNLOCK();
3558 }
3559 }
3560
3561 errno_t
kern_nexus_get_pbufpool_info(const uuid_t nx_uuid,struct kern_pbufpool_memory_info * rx_pool_info,struct kern_pbufpool_memory_info * tx_pool_info)3562 kern_nexus_get_pbufpool_info(const uuid_t nx_uuid,
3563 struct kern_pbufpool_memory_info *rx_pool_info,
3564 struct kern_pbufpool_memory_info *tx_pool_info)
3565 {
3566 struct kern_pbufpool *__single tpp, *__single rpp;
3567 struct kern_nexus *nx;
3568 errno_t err = 0;
3569
3570 nx = nx_find(nx_uuid, FALSE);
3571 if (nx == NULL) {
3572 err = ENOENT;
3573 goto done;
3574 }
3575
3576 if (nx->nx_prov->nxprov_params->nxp_type != NEXUS_TYPE_NET_IF) {
3577 err = ENOTSUP;
3578 goto done;
3579 }
3580
3581 err = nx_netif_prov_nx_mem_info(nx, &tpp, &rpp);
3582 if (err != 0) {
3583 goto done;
3584 }
3585
3586 if ((tpp == NULL) && (rpp == NULL)) {
3587 err = ENOENT;
3588 goto done;
3589 }
3590
3591 if (tx_pool_info != NULL) {
3592 bzero(tx_pool_info, sizeof(*tx_pool_info));
3593 }
3594 if (rx_pool_info != NULL) {
3595 bzero(rx_pool_info, sizeof(*rx_pool_info));
3596 }
3597
3598 if ((tx_pool_info != NULL) && (tpp != NULL)) {
3599 err = kern_pbufpool_get_memory_info(tpp, tx_pool_info);
3600 if (err != 0) {
3601 goto done;
3602 }
3603 }
3604
3605 if ((rx_pool_info != NULL) && (rpp != NULL)) {
3606 err = kern_pbufpool_get_memory_info(rpp, rx_pool_info);
3607 }
3608
3609 done:
3610 if (nx != NULL) {
3611 (void) nx_release(nx);
3612 nx = NULL;
3613 }
3614 return err;
3615 }
3616
/*
 * Post an interface-advisory update event to every channel registered on
 * this nexus' advisory list.  Supported only for netif and flowswitch
 * nexuses (VERIFY otherwise).  The advisory lock is taken with try-lock
 * semantics: if it is contended, the update is dropped (and counted)
 * rather than blocking the caller.
 */
void
nx_interface_advisory_notify(struct kern_nexus *nx)
{
	struct kern_channel *ch;
	struct netif_stats *nifs;
	struct fsw_stats *fsw_stats;
	nexus_type_t nxdom_type = NX_DOM(nx)->nxdom_type;

	/* pick the stats block matching the domain; only one is valid */
	if (nxdom_type == NEXUS_TYPE_NET_IF) {
		nifs = &NX_NETIF_PRIVATE(nx)->nif_stats;
	} else if (nxdom_type == NEXUS_TYPE_FLOW_SWITCH) {
		fsw_stats = &NX_FSW_PRIVATE(nx)->fsw_stats;
	} else {
		VERIFY(0);
		__builtin_unreachable();
	}
	/* best-effort: drop (and count) the update if the lock is busy */
	if (!lck_rw_try_lock_shared(&nx->nx_ch_if_adv_lock)) {
		if (nxdom_type == NEXUS_TYPE_NET_IF) {
			STATS_INC(nifs, NETIF_STATS_IF_ADV_UPD_DROP);
		} else {
			STATS_INC(fsw_stats, FSW_STATS_IF_ADV_UPD_DROP);
		}
		return;
	}
	/*
	 * if the channel is in "nx_ch_if_adv_head" list, then we can
	 * safely assume that the channel is not closed yet.
	 * In ch_close_common(), the channel is removed from the
	 * "nx_ch_if_adv_head" list holding the "nx_ch_if_adv_lock" in
	 * exclusive mode, prior to closing the channel.
	 */
	STAILQ_FOREACH(ch, &nx->nx_ch_if_adv_head, ch_link_if_adv) {
		struct nexus_adapter *na = ch->ch_na;

		ASSERT(na != NULL);
		/* wake the channel's first TX ring with the advisory hint */
		na_post_event(&na->na_tx_rings[ch->ch_first[NR_TX]],
		    TRUE, FALSE, FALSE, CHAN_FILT_HINT_IF_ADV_UPD);
		if (nxdom_type == NEXUS_TYPE_NET_IF) {
			STATS_INC(nifs, NETIF_STATS_IF_ADV_UPD_SENT);
		} else {
			STATS_INC(fsw_stats, FSW_STATS_IF_ADV_UPD_SENT);
		}
	}
	lck_rw_done(&nx->nx_ch_if_adv_lock);
}
3662