xref: /xnu-12377.81.4/bsd/skywalk/channel/channel.c (revision 043036a2b3718f7f0be807e2870f8f47d3fa0796)
1 /*
2  * Copyright (c) 2015-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 /*
30  * Copyright (C) 2012-2014 Matteo Landi, Luigi Rizzo, Giuseppe Lettieri.
31  * All rights reserved.
32  * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
33  *
34  * Redistribution and use in source and binary forms, with or without
35  * modification, are permitted provided that the following conditions
36  * are met:
37  *   1. Redistributions of source code must retain the above copyright
38  *      notice, this list of conditions and the following disclaimer.
39  *   2. Redistributions in binary form must reproduce the above copyright
40  *      notice, this list of conditions and the following disclaimer in the
41  *      documentation and/or other materials provided with the distribution.
42  *
43  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
44  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
45  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
46  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
47  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
48  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
49  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
50  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
51  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
52  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
53  * SUCH DAMAGE.
54  */
55 
56 #include <sys/eventvar.h>
57 #include <sys/kdebug.h>
58 #include <sys/sdt.h>
59 #include <skywalk/os_skywalk_private.h>
60 #include <skywalk/nexus/netif/nx_netif.h>
61 
62 #include <kern/uipc_domain.h>
63 
64 #define KEV_EVTID(code) BSDDBG_CODE(DBG_BSD_KEVENT, (code))
65 
/*
 * Per-direction data counts filled in by ch_event(); the kevent filter
 * process callbacks compare these against the low-watermark thresholds
 * and report them as the knote's data value.
 */
struct ch_event_result {
	uint32_t tx_data;
	uint32_t rx_data;
};
70 
/* lock groups/attributes for the channel and channel-knote mutexes */
static LCK_GRP_DECLARE(channel_lock_group, "sk_ch_lock");
static LCK_GRP_DECLARE(channel_kn_lock_group, "sk_ch_kn_lock");
LCK_ATTR_DECLARE(channel_lock_attr, 0, 0);

/* ch_selinfo (select/kevent wakeup + mitigation thread-call) helpers */
static void csi_selrecord(struct ch_selinfo *, struct proc *, void *);
static void csi_selwakeup(struct ch_selinfo *, boolean_t, boolean_t, uint32_t);
static inline void csi_selwakeup_delayed(struct ch_selinfo *);
static inline void csi_selwakeup_common(struct ch_selinfo *, boolean_t,
    boolean_t, boolean_t, uint32_t);
static boolean_t csi_tcall_start(struct ch_selinfo *);
static void csi_tcall(thread_call_param_t, thread_call_param_t);
static uint64_t csi_tcall_update_interval(struct ch_selinfo *);

/* channel lifecycle and configuration helpers */
static void ch_redzone_init(void);
static void ch_close_common(struct kern_channel *, boolean_t, boolean_t);
static struct kern_channel *ch_find(struct kern_nexus *, nexus_port_t,
    ring_id_t);
static int ch_ev_thresh_validate(struct kern_nexus *, enum txrx,
    struct ch_ev_thresh *);
static struct kern_channel *ch_connect(struct kern_nexus *, struct chreq *,
    struct nxbind *, struct proc *, int, int *);
static void ch_disconnect(struct kern_channel *);
static int ch_set_lowat_thresh(struct kern_channel *, enum txrx,
    struct sockopt *);
static int ch_get_lowat_thresh(struct kern_channel *, enum txrx,
    struct sockopt *);
static struct kern_channel *ch_alloc(zalloc_flags_t);
static void ch_free(struct kern_channel *);
static int ch_configure_interface_advisory_event(struct kern_channel *ch,
    struct sockopt *sopt);

/* EVFILT_READ / EVFILT_WRITE knote filter callbacks */
static int filt_chrwattach(struct knote *, struct kevent_qos_s *kev);
static void filt_chrwdetach(struct knote *, boolean_t);
static void filt_chrdetach(struct knote *);
static void filt_chwdetach(struct knote *);
static int filt_chrw(struct knote *, long, int);
static int filt_chread(struct knote *, long);
static int filt_chwrite(struct knote *, long);

static int filt_chtouch(struct knote *, struct kevent_qos_s *, int);
static int filt_chrtouch(struct knote *, struct kevent_qos_s *);
static int filt_chwtouch(struct knote *, struct kevent_qos_s *);
static int filt_chprocess(struct knote *, struct kevent_qos_s *, int);
static int filt_chrprocess(struct knote *, struct kevent_qos_s *);
static int filt_chwprocess(struct knote *, struct kevent_qos_s *);
/* EVFILT_NW_CHANNEL ("extended") knote filter callbacks */
static int filt_che_attach(struct knote *, struct kevent_qos_s *kev);
static void filt_che_detach(struct knote *);
static int filt_che_event(struct knote *, long);
static int filt_che_touch(struct knote *, struct kevent_qos_s *);
static int filt_che_process(struct knote *, struct kevent_qos_s *);
static int filt_chan_extended_common(struct knote *, long);

/* core poll/select/kevent state machine for a channel */
static int ch_event(struct kern_channel *ch, int events,
    void *wql, struct proc *p, struct ch_event_result *,
    const boolean_t is_kevent, int *errno, const boolean_t);
126 
/*
 * EVFILT_READ ops: attach is shared with the write filter; detach,
 * event, touch and process are read-specific.
 */
const struct filterops skywalk_channel_rfiltops = {
	.f_isfd =       1,
	.f_attach =     filt_chrwattach,
	.f_detach =     filt_chrdetach,
	.f_event =      filt_chread,
	.f_touch =      filt_chrtouch,
	.f_process =    filt_chrprocess,
};

/* EVFILT_WRITE ops: same shared attach, write-specific callbacks. */
const struct filterops skywalk_channel_wfiltops = {
	.f_isfd =       1,
	.f_attach =     filt_chrwattach,
	.f_detach =     filt_chwdetach,
	.f_event =      filt_chwrite,
	.f_touch =      filt_chwtouch,
	.f_process =    filt_chwprocess,
};

/* EVFILT_NW_CHANNEL ops: channel-extended events (flow adv, if adv, ...). */
const struct filterops skywalk_channel_efiltops = {
	.f_isfd =       1,
	.f_attach =     filt_che_attach,
	.f_detach =     filt_che_detach,
	.f_event =      filt_che_event,
	.f_touch =      filt_che_touch,
	.f_process =    filt_che_process,
};
153 
/* mitigation intervals in ns */
#define CH_MIT_IVAL_MIN         NSEC_PER_USEC

/*
 * Global mitigation interval override; csi_tcall_update_interval()
 * propagates changes to each ch_selinfo's local copy.
 */
static uint64_t ch_mit_ival = CH_MIT_IVAL_DEFAULT;

#if (DEVELOPMENT || DEBUG)
SYSCTL_NODE(_kern_skywalk, OID_AUTO, channel,
    CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Skywalk channel parameters");
SYSCTL_QUAD(_kern_skywalk_channel, OID_AUTO, mit_ival,
    CTLFLAG_RW | CTLFLAG_LOCKED, &ch_mit_ival, "");
#endif /* DEVELOPMENT || DEBUG */

/* zones for channel instances and channel info snapshots */
static SKMEM_TYPE_DEFINE(ch_zone, struct kern_channel);

static SKMEM_TYPE_DEFINE(ch_info_zone, struct ch_info);

/* set once by channel_init(), cleared by channel_fini() */
static int __ch_inited = 0;

/*
 * Global cookies to hold the random numbers used for verifying
 * user metadata red zone violations.
 */
uint64_t __ch_umd_redzone_cookie = 0;

#define SKMEM_TAG_CH_KEY        "com.apple.skywalk.channel.key"
SKMEM_TAG_DEFINE(skmem_tag_ch_key, SKMEM_TAG_CH_KEY);
180 
/*
 * Seed the user-metadata red zone cookie with a non-zero random value.
 * Called once from channel_init().
 */
static void
ch_redzone_init(void)
{
	/* layout invariants the red zone checks depend on */
	static_assert(sizeof(__ch_umd_redzone_cookie) == sizeof(((struct __metadata_preamble *)0)->mdp_redzone));
	static_assert(METADATA_PREAMBLE_SZ == sizeof(struct __metadata_preamble));
	static_assert(sizeof(struct __slot_desc) == 8);

	/* Initialize random user red zone cookie values; 0 is reserved */
	do {
		read_random(&__ch_umd_redzone_cookie,
		    sizeof(__ch_umd_redzone_cookie));
	} while (__ch_umd_redzone_cookie == 0);

	SK_D("__ch_umd_redzone_cookie: 0x%llx", __ch_umd_redzone_cookie);
}
196 
/*
 * One-time channel subsystem initialization; caller must hold SK_LOCK.
 * Returns 0 (the error path is currently unused).
 */
int
channel_init(void)
{
	int error = 0;

	SK_LOCK_ASSERT_HELD();
	ASSERT(!__ch_inited);

	/* pkt_qum is expected at offset 0 of both packet layouts */
	static_assert(offsetof(struct __user_packet, pkt_qum) == 0);
	static_assert(offsetof(struct __kern_packet, pkt_qum) == 0);

	ch_redzone_init();

	__ch_inited = 1;

	return error;
}
214 
215 void
channel_fini(void)216 channel_fini(void)
217 {
218 	SK_LOCK_ASSERT_HELD();
219 
220 	if (__ch_inited) {
221 		__ch_umd_redzone_cookie = 0;
222 		__ch_inited = 0;
223 	}
224 }
225 
/*
 * Initialize a ch_selinfo.  When mitigation is requested, allocate a
 * one-shot thread call used to coalesce wakeups; otherwise wakeups are
 * always delivered immediately.
 */
void
csi_init(struct ch_selinfo *csi, boolean_t mitigation, uint64_t mit_ival)
{
	csi->csi_flags = 0;
	csi->csi_pending = 0;
	if (mitigation) {
		csi->csi_interval = mit_ival;
		csi->csi_eff_interval = ch_mit_ival;    /* global override */
		os_atomic_or(&csi->csi_flags, CSI_MITIGATION, relaxed);
		csi->csi_tcall = thread_call_allocate_with_options(csi_tcall,
		    csi, THREAD_CALL_PRIORITY_KERNEL, THREAD_CALL_OPTIONS_ONCE);
		/* this must not fail */
		VERIFY(csi->csi_tcall != NULL);
	} else {
		csi->csi_interval = 0;
		csi->csi_eff_interval = 0;
		csi->csi_tcall = NULL;
	}
	lck_mtx_init(&csi->csi_lock, &channel_kn_lock_group, &channel_lock_attr);
	klist_init(&csi->csi_si.si_note);
}
247 
/*
 * Destroy a ch_selinfo.  The atomic test-and-set of CSI_DESTROYED makes
 * teardown run exactly once even if called concurrently.
 */
void
csi_destroy(struct ch_selinfo *csi)
{
	/* check if not already destroyed, else do it now */
	if ((os_atomic_or_orig(&csi->csi_flags, CSI_DESTROYED, relaxed) &
	    CSI_DESTROYED) == 0) {
		CSI_LOCK(csi);
		/* must have been set by above atomic op */
		VERIFY(csi->csi_flags & CSI_DESTROYED);
		if (csi->csi_flags & CSI_MITIGATION) {
			thread_call_t __single tcall = csi->csi_tcall;
			VERIFY(tcall != NULL);
			/*
			 * Drop the lock while cancelling: csi_tcall() takes
			 * CSI_LOCK itself, so waiting with it held would
			 * deadlock.
			 */
			CSI_UNLOCK(csi);

			(void) thread_call_cancel_wait(tcall);
			if (!thread_call_free(tcall)) {
				/*
				 * A racing enter re-armed the call; cancel
				 * once more, after which free must succeed.
				 */
				boolean_t freed;
				(void) thread_call_cancel_wait(tcall);
				freed = thread_call_free(tcall);
				VERIFY(freed);
			}

			CSI_LOCK(csi);
			csi->csi_tcall = NULL;
			os_atomic_andnot(&csi->csi_flags, CSI_MITIGATION,
			    relaxed);
		}
		csi->csi_pending = 0;
		CSI_UNLOCK(csi);

		selthreadclear(&csi->csi_si);
		/* now we don't need the mutex anymore */
		lck_mtx_destroy(&csi->csi_lock, &channel_kn_lock_group);
	}
}
283 
/*
 * Called only for select(2).  Records the calling thread against the
 * selinfo so a later csi_selwakeup() can find it; caller holds CSI_LOCK.
 */
__attribute__((always_inline))
static inline void
csi_selrecord(struct ch_selinfo *csi, struct proc *p, void *wql)
{
	struct selinfo *si = &csi->csi_si;

	CSI_LOCK_ASSERT_HELD(csi);
	selrecord(p, si, wql);
}
296 
/*
 * Record a select(2) waiter on a single ring's selinfo.
 */
void
csi_selrecord_one(struct __kern_channel_ring *kring, struct proc *p, void *wql)
{
	struct ch_selinfo *csi = &kring->ckr_si;

	CSI_LOCK(csi);
	SK_DF(SK_VERB_EVENTS, "[%s] na \"%s\" (%p) kr %s (%p) "
	    "si %p si_flags 0x%x", (kring->ckr_tx == NR_TX) ? "W" : "R",
	    KRNA(kring)->na_name, SK_KVA(KRNA(kring)), kring->ckr_name,
	    SK_KVA(kring), SK_KVA(&csi->csi_si), csi->csi_si.si_flags);

	csi_selrecord(csi, p, wql);
	CSI_UNLOCK(csi);
}
311 
/*
 * Record a select(2) waiter on the adapter-wide selinfo for direction t.
 */
void
csi_selrecord_all(struct nexus_adapter *na, enum txrx t, struct proc *p,
    void *wql)
{
	struct ch_selinfo *csi = &na->na_si[t];

	CSI_LOCK(csi);
	SK_DF(SK_VERB_EVENTS, "[%s] na \"%s\" (%p) si %p si_flags 0x%x",
	    (t == NR_TX) ? "W" : "R", na->na_name, SK_KVA(na),
	    SK_KVA(&csi->csi_si), csi->csi_si.si_flags);

	csi_selrecord(csi, p, wql);
	CSI_UNLOCK(csi);
}
326 
/*
 * Called from na_post_event().  Immediate wakeup: clears the pending
 * count, wakes select(2) waiters when selwake is set, and posts to the
 * knote list unless we are already running inside a kevent callback.
 * Caller holds CSI_LOCK.
 */
__attribute__((always_inline))
static inline void
csi_selwakeup(struct ch_selinfo *csi, boolean_t within_kevent,
    boolean_t selwake, uint32_t hint)
{
	struct selinfo *si = &csi->csi_si;

	CSI_LOCK_ASSERT_HELD(csi);
	csi->csi_pending = 0;
	if (selwake) {
		selwakeup(si);
	}
	if ((csi->csi_flags & CSI_KNOTE) && !within_kevent) {
		KNOTE(&si->si_note, hint);
	}
}
346 
/*
 * Mitigated wakeup path: if the thread call is already armed, just count
 * the event as pending (csi_tcall() will re-arm for it); otherwise try
 * to arm it, falling back to an immediate wakeup if mitigation is
 * currently disabled (interval of 0).  Caller holds CSI_LOCK.
 */
__attribute__((always_inline))
static inline void
csi_selwakeup_delayed(struct ch_selinfo *csi)
{
	CSI_LOCK_ASSERT_HELD(csi);
	ASSERT(csi->csi_flags & CSI_MITIGATION);
	ASSERT(csi->csi_tcall != NULL);

	if (thread_call_isactive(csi->csi_tcall)) {
		csi->csi_pending++;
	} else if (!csi_tcall_start(csi)) {
		csi_selwakeup(csi, FALSE, FALSE, 0);
	}
}
361 
362 __attribute__((always_inline))
363 static inline void
csi_selwakeup_common(struct ch_selinfo * csi,boolean_t nodelay,boolean_t within_kevent,boolean_t selwake,uint32_t hint)364 csi_selwakeup_common(struct ch_selinfo *csi, boolean_t nodelay,
365     boolean_t within_kevent, boolean_t selwake, uint32_t hint)
366 {
367 	CSI_LOCK_ASSERT_HELD(csi);
368 
369 	if (nodelay || within_kevent || !selwake || hint != 0 ||
370 	    !(csi->csi_flags & CSI_MITIGATION)) {
371 		csi_selwakeup(csi, within_kevent, selwake, hint);
372 	} else {
373 		csi_selwakeup_delayed(csi);
374 	}
375 }
376 
/*
 * Wake up waiters on a single ring's selinfo.
 */
void
csi_selwakeup_one(struct __kern_channel_ring *kring, boolean_t nodelay,
    boolean_t within_kevent, boolean_t selwake, uint32_t hint)
{
	struct ch_selinfo *csi = &kring->ckr_si;

	CSI_LOCK(csi);
	SK_DF(SK_VERB_EVENTS, "[%s] na \"%s\" (%p) kr %s (%p) "
	    "si %p si_flags 0x%x nodelay %u kev %u sel %u hint 0x%x",
	    (kring->ckr_tx == NR_TX) ? "W" : "R", KRNA(kring)->na_name,
	    SK_KVA(KRNA(kring)), kring->ckr_name, SK_KVA(kring),
	    SK_KVA(&csi->csi_si), csi->csi_si.si_flags, nodelay,
	    within_kevent, selwake, hint);

	csi_selwakeup_common(csi, nodelay, within_kevent, selwake, hint);
	CSI_UNLOCK(csi);
}
394 
/*
 * Wake up waiters on the adapter-wide selinfo for direction t.  If the
 * adapter has not opted into mitigation for that direction, force an
 * immediate (nodelay) wakeup.
 */
void
csi_selwakeup_all(struct nexus_adapter *na, enum txrx t, boolean_t nodelay,
    boolean_t within_kevent, boolean_t selwake, uint32_t hint)
{
	struct ch_selinfo *csi = &na->na_si[t];

	CSI_LOCK(csi);
	SK_DF(SK_VERB_EVENTS, "[%s] na \"%s\" (%p) si %p "
	    "si_flags 0x%x nodelay %u kev %u sel %u hint 0x%x",
	    (t == NR_TX) ? "W" : "R", na->na_name, SK_KVA(na),
	    SK_KVA(&csi->csi_si), csi->csi_si.si_flags, nodelay,
	    within_kevent, selwake, hint);

	switch (t) {
	case NR_RX:
		if (!(na->na_flags & NAF_RX_MITIGATION)) {
			nodelay = TRUE;
		}
		break;

	case NR_TX:
		if (!(na->na_flags & NAF_TX_MITIGATION)) {
			nodelay = TRUE;
		}
		break;

	default:
		/* non-data directions are never mitigated */
		nodelay = TRUE;
		break;
	}
	csi_selwakeup_common(csi, nodelay, within_kevent, selwake, hint);
	CSI_UNLOCK(csi);
}
428 
/*
 * Arm the mitigation thread call for one interval from now.  Returns
 * FALSE when the effective interval is 0 (mitigation disabled via the
 * global override), in which case the caller should wake up immediately.
 * Caller holds CSI_LOCK.
 */
static boolean_t
csi_tcall_start(struct ch_selinfo *csi)
{
	uint64_t now, ival, deadline;

	CSI_LOCK_ASSERT_HELD(csi);
	ASSERT(csi->csi_flags & CSI_MITIGATION);
	ASSERT(csi->csi_tcall != NULL);

	/* pick up latest value */
	ival = csi_tcall_update_interval(csi);

	/* if no mitigation, pass notification up now */
	if (__improbable(ival == 0)) {
		return FALSE;
	}

	deadline = now = mach_absolute_time();
	clock_deadline_for_periodic_event(ival, now, &deadline);
	(void) thread_call_enter_delayed(csi->csi_tcall, deadline);

	return TRUE;
}
452 
/*
 * Mitigation thread-call handler: deliver the coalesced wakeup, then
 * re-arm if more events arrived while the call was pending (unless the
 * selinfo has been destroyed in the meantime).
 */
static void
csi_tcall(thread_call_param_t arg0, thread_call_param_t arg1)
{
#pragma unused(arg1)
	struct ch_selinfo *csi = (struct ch_selinfo *__single)arg0;

	CSI_LOCK(csi);
	csi_selwakeup(csi, FALSE, FALSE, 0);
	CSI_UNLOCK(csi);

	/*
	 * NOTE(review): the lock is deliberately dropped and re-acquired
	 * between the wakeup and the pending re-check — presumably to give
	 * blocked threads (e.g. csi_destroy()) a window to run; confirm.
	 */
	CSI_LOCK(csi);
	if (__improbable((csi->csi_flags & CSI_DESTROYED) == 0 &&
	    csi->csi_pending != 0 && !csi_tcall_start(csi))) {
		csi_selwakeup(csi, FALSE, FALSE, 0);
	}
	CSI_UNLOCK(csi);
}
470 
/*
 * Refresh this selinfo's mitigation interval from the global override
 * (ch_mit_ival) and return the effective interval.  A non-zero override
 * is clamped to at least CH_MIT_IVAL_MIN; 0 disables mitigation.
 */
__attribute__((always_inline))
static inline uint64_t
csi_tcall_update_interval(struct ch_selinfo *csi)
{
	uint64_t i = ch_mit_ival;

	/* if global override was adjusted, update local copies */
	if (__improbable(csi->csi_eff_interval != i)) {
		ASSERT(csi->csi_flags & CSI_MITIGATION);
		csi->csi_interval = csi->csi_eff_interval =
		    ((i == 0) ? 0 : MAX(i, CH_MIT_IVAL_MIN));
	}

	return csi->csi_interval;
}
486 
487 /* return EV_EOF if the channel is defunct */
488 static inline boolean_t
ch_filt_check_defunct(struct kern_channel * ch,struct knote * kn)489 ch_filt_check_defunct(struct kern_channel *ch, struct knote *kn)
490 {
491 	if (__improbable((ch->ch_flags & CHANF_DEFUNCT) != 0)) {
492 		if (kn) {
493 			kn->kn_flags |= EV_EOF;
494 		}
495 		return TRUE;
496 	}
497 	return FALSE;
498 }
499 
/*
 * Common detach for read/write knotes: remove the knote from the
 * selinfo of the indicated direction and clear CSI_KNOTE once the
 * klist becomes empty.
 */
static void
filt_chrwdetach(struct knote *kn, boolean_t write)
{
	struct kern_channel *ch = (struct kern_channel *)knote_kn_hook_get_raw(kn);
	struct ch_selinfo *csi;
	struct selinfo *si;

	lck_mtx_lock(&ch->ch_lock);
	csi = ch->ch_si[write ? NR_TX : NR_RX];
	si = &csi->csi_si;

	CSI_LOCK(csi);
	SK_DF(SK_VERB_EVENTS, "na \"%s\" (%p) ch %p kn %p (%s%s) "
	    "si_flags 0x%x", ch->ch_na->na_name, SK_KVA(ch->ch_na),
	    SK_KVA(ch), SK_KVA(kn), (kn->kn_flags & EV_POLL) ? "poll," : "",
	    write ? "write" : "read", si->si_flags);

	if (KNOTE_DETACH(&si->si_note, kn)) {
		os_atomic_andnot(&csi->csi_flags, CSI_KNOTE, relaxed);
	}

	CSI_UNLOCK(csi);
	lck_mtx_unlock(&ch->ch_lock);
}
524 
/* EVFILT_READ detach: delegate to the shared read/write detach. */
static void
filt_chrdetach(struct knote *kn)
{
	ASSERT(kn->kn_filter == EVFILT_READ);
	filt_chrwdetach(kn, FALSE);
}
531 
/* EVFILT_WRITE detach: delegate to the shared read/write detach. */
static void
filt_chwdetach(struct knote *kn)
{
	ASSERT(kn->kn_filter == EVFILT_WRITE);
	filt_chrwdetach(kn, TRUE);
}
538 
/*
 * callback from notifies (generated externally).
 * This always marks the knote activated, so always
 * return 1.  The actual readiness test happens later in
 * filt_chprocess().
 */
static int
filt_chrw(struct knote *kn, long hint, int events)
{
#if SK_LOG
	struct kern_channel *ch = (struct kern_channel *__single)
	    knote_kn_hook_get_raw(kn);
#else
#pragma unused(kn)
#pragma unused(hint)
#pragma unused(events)
#endif
	SK_DF(SK_VERB_EVENTS, "na \"%s\" (%p) ch %p "
	    "kn %p (%s%s) hint 0x%x", ch->ch_na->na_name,
	    SK_KVA(ch->ch_na), SK_KVA(ch), SK_KVA(kn),
	    (kn->kn_flags & EV_POLL) ? "poll," : "",
	    (events == POLLOUT) ?  "write" : "read",
	    (uint32_t)hint);

	/* assume we are ready */
	return 1;
}
565 
566 static int
filt_chread(struct knote * kn,long hint)567 filt_chread(struct knote *kn, long hint)
568 {
569 	ASSERT(kn->kn_filter == EVFILT_READ);
570 	/* There is no hint for read/write event */
571 	if (hint != 0) {
572 		return 0;
573 	}
574 	return filt_chrw(kn, hint, POLLIN);
575 }
576 
577 static int
filt_chwrite(struct knote * kn,long hint)578 filt_chwrite(struct knote *kn, long hint)
579 {
580 	ASSERT(kn->kn_filter == EVFILT_WRITE);
581 	/* There is no hint for read/write event */
582 	if (hint != 0) {
583 		return 0;
584 	}
585 	return filt_chrw(kn, hint, POLLOUT);
586 }
587 
/*
 * Common f_touch for read/write knotes: absorb the new kevent
 * parameters, validate any NOTE_LOWAT threshold, then re-evaluate
 * channel state via ch_event().  Returns nonzero when the knote
 * should fire.
 */
static int
filt_chtouch(struct knote *kn, struct kevent_qos_s *kev, int events)
{
#pragma unused(kev)
	/*
	 * -fbounds-safety: This seems like an example of interop with code that
	 * has -fbounds-safety disabled, which means we can use __unsafe_forge_*
	 */
	struct kern_channel *ch = __unsafe_forge_single(struct kern_channel *,
	    knote_kn_hook_get_raw(kn));
	int ev = kn->kn_filter;
	enum txrx dir = (ev == EVFILT_WRITE) ? NR_TX : NR_RX;
	int event_error = 0;
	int revents;

	/* save off the new input fflags and data */
	kn->kn_sfflags = kev->fflags;
	kn->kn_sdata = kev->data;

	lck_mtx_lock(&ch->ch_lock);
	/* defunct channel: report EV_EOF and fire immediately */
	if (__improbable(ch_filt_check_defunct(ch, kn))) {
		lck_mtx_unlock(&ch->ch_lock);
		return 1;
	}

	/* if a note-specific low watermark is given, validate it */
	if (kn->kn_sfflags & NOTE_LOWAT) {
		struct ch_ev_thresh note_thresh = {
			.cet_unit = (dir == NR_TX) ?
		    ch->ch_info->cinfo_tx_lowat.cet_unit :
		    ch->ch_info->cinfo_rx_lowat.cet_unit,
			.cet_value = (uint32_t)kn->kn_sdata
		};
		if (ch_ev_thresh_validate(ch->ch_na->na_nx, dir,
		    &note_thresh) != 0) {
			SK_ERR("invalid NOTE_LOWAT threshold %u",
			    note_thresh.cet_value);
			knote_set_error(kn, EINVAL);
			lck_mtx_unlock(&ch->ch_lock);
			return 1;
		}
	}

	/* capture new state just so we can return it */
	revents = ch_event(ch, events, NULL, knote_get_kq(kn)->kq_p, NULL, TRUE,
	    &event_error, FALSE);
	lck_mtx_unlock(&ch->ch_lock);

	if (revents & POLLERR) {
		ASSERT(event_error != 0);
		/*
		 * Setting a knote error here will confuse libdispatch, so we
		 * use EV_EOF instead.
		 */
		kn->kn_flags |= EV_EOF;
		return 1;
	} else {
		return (events & revents) != 0;
	}
}
648 
/*
 * EVFILT_READ f_touch: trace re-enables for debugging, then delegate
 * to the common touch handler with POLLIN.
 */
static int
filt_chrtouch(struct knote *kn, struct kevent_qos_s *kev)
{
	ASSERT(kn->kn_filter == EVFILT_READ);

	if (kev->flags & EV_ENABLE) {
		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KNOTE_ENABLE),
		    kn->kn_udata, kn->kn_status | (kn->kn_id << 32),
		    kn->kn_filtid, VM_KERNEL_UNSLIDE_OR_PERM(
			    ((struct kern_channel *)knote_kn_hook_get_raw(kn))->ch_na));
	}

	return filt_chtouch(kn, kev, POLLIN);
}
663 
/* EVFILT_WRITE f_touch: delegate to the common touch handler with POLLOUT. */
static int
filt_chwtouch(struct knote *kn, struct kevent_qos_s *kev)
{
	ASSERT(kn->kn_filter == EVFILT_WRITE);
	return filt_chtouch(kn, kev, POLLOUT);
}
670 
671 
/*
 * Called from kevent.  We call ch_event(POLL[IN|OUT]) and
 * return 0/1 accordingly.  On delivery, the kevent data field carries
 * the direction's count from ch_event_result, and a NOTE_LOWAT value
 * (if larger than the channel's configured low watermark) gates
 * whether the event fires at all.
 */
static int
filt_chprocess(struct knote *kn, struct kevent_qos_s *kev, int events)
{
	/*
	 * -fbounds-safety: This seems like an example of interop with code that
	 * has -fbounds-safety disabled, which means we can use __unsafe_forge_*
	 */
	struct kern_channel *ch = __unsafe_forge_single(struct kern_channel *,
	    knote_kn_hook_get_raw(kn));
	struct ch_event_result result;
	uint32_t lowat;
	int trigger_event = 1;
	int revents;
	int event_error;
	int64_t data;

	lck_mtx_lock(&ch->ch_lock);
	/* defunct channel: deliver EV_EOF with zero data */
	if (__improbable(ch_filt_check_defunct(ch, kn))) {
		knote_fill_kevent(kn, kev, 0);
		lck_mtx_unlock(&ch->ch_lock);
		return 1;
	}

	revents = ch_event(ch, events, NULL, knote_get_kq(kn)->kq_p, &result,
	    TRUE, &event_error, FALSE);

	if (revents & POLLERR) {
		ASSERT(event_error != 0);
		lck_mtx_unlock(&ch->ch_lock);
		/*
		 * Setting a knote error here will confuse libdispatch, so we
		 * use EV_EOF instead.
		 */
		kn->kn_flags |= EV_EOF;
		knote_fill_kevent_with_sdata(kn, kev);
		return 1;
	}

	trigger_event = (events & revents) != 0;

	if (events == POLLOUT) {
		/* effective low watermark: max of channel's and NOTE_LOWAT */
		lowat = ch->ch_info->cinfo_tx_lowat.cet_value;
		if ((kn->kn_sfflags & NOTE_LOWAT) &&
		    kn->kn_sdata > lowat) {
			lowat = (uint32_t)kn->kn_sdata;
		}

		data = result.tx_data;

		if (result.tx_data < lowat) {
			trigger_event = 0;
		}
	} else {
		lowat = ch->ch_info->cinfo_rx_lowat.cet_value;
		if ((kn->kn_sfflags & NOTE_LOWAT) &&
		    kn->kn_sdata > lowat) {
			lowat = (uint32_t)kn->kn_sdata;
		}

		data = result.rx_data;

		if (result.rx_data < lowat) {
			trigger_event = 0;
		}
	}

	if (trigger_event) {
		knote_fill_kevent(kn, kev, data);
	}

	lck_mtx_unlock(&ch->ch_lock);

	return trigger_event;
}
750 
/* EVFILT_READ f_process: delegate to the common process handler. */
static int
filt_chrprocess(struct knote *kn, struct kevent_qos_s *kev)
{
	ASSERT(kn->kn_filter == EVFILT_READ);
	return filt_chprocess(kn, kev, POLLIN);
}
757 
/* EVFILT_WRITE f_process: delegate to the common process handler. */
static int
filt_chwprocess(struct knote *kn, struct kevent_qos_s *kev)
{
	ASSERT(kn->kn_filter == EVFILT_WRITE);
	return filt_chprocess(kn, kev, POLLOUT);
}
764 
/*
 * Common f_attach for read/write knotes: validate NOTE_LOWAT (if any),
 * hook the knote onto the direction's selinfo, and report the current
 * readiness so an already-ready channel fires immediately.
 */
static int
filt_chrwattach(struct knote *kn, __unused struct kevent_qos_s *kev)
{
	struct kern_channel *ch = (struct kern_channel *)knote_kn_hook_get_raw(kn);
	struct nexus_adapter *na;
	struct ch_selinfo *csi;
	int ev = kn->kn_filter;
	enum txrx dir = (ev == EVFILT_WRITE) ? NR_TX : NR_RX;
	int revents;
	int events;
	int event_error = 0;

	ASSERT((kn->kn_filter == EVFILT_READ) ||
	    (kn->kn_filter == EVFILT_WRITE));

	/* ch_kqfilter() should have acquired the lock */
	LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);

	na = ch->ch_na;
	/* if a note-specific low watermark is given, validate it */
	if (kn->kn_sfflags & NOTE_LOWAT) {
		struct ch_ev_thresh note_thresh = {
			.cet_unit = (dir == NR_TX) ?
		    ch->ch_info->cinfo_tx_lowat.cet_unit :
		    ch->ch_info->cinfo_rx_lowat.cet_unit,
			.cet_value = (uint32_t)kn->kn_sdata
		};
		if (ch_ev_thresh_validate(ch->ch_na->na_nx, dir,
		    &note_thresh) != 0) {
			SK_ERR("invalid NOTE_LOWAT threshold %u",
			    note_thresh.cet_value);
			knote_set_error(kn, EINVAL);
			return 0;
		}
	}

	/* the si is indicated in the channel */
	csi = ch->ch_si[dir];
	CSI_LOCK(csi);

	/* mark the selinfo as having knotes when the first one attaches */
	if (KNOTE_ATTACH(&csi->csi_si.si_note, kn)) {
		os_atomic_or(&csi->csi_flags, CSI_KNOTE, relaxed);
	}

	CSI_UNLOCK(csi);

	SK_DF(SK_VERB_EVENTS, "na \"%s\" (%p) ch %p kn %p (%s%s)",
	    na->na_name, SK_KVA(na), SK_KVA(ch), SK_KVA(kn),
	    (kn->kn_flags & EV_POLL) ? "poll," : "",
	    (ev == EVFILT_WRITE) ?  "write" : "read");

	/* capture current state */
	events = (ev == EVFILT_WRITE) ? POLLOUT : POLLIN;

	if (__improbable(ch_filt_check_defunct(ch, kn))) {
		/* defunct: report as ready so EV_EOF gets delivered */
		revents = events;
	} else {
		/* filt_chprocess() will fill in the kn_sdata field */
		revents = ch_event(ch, events, NULL, knote_get_kq(kn)->kq_p,
		    NULL, TRUE, &event_error, FALSE);
	}

	if (revents & POLLERR) {
		ASSERT(event_error != 0);
		kn->kn_flags |= EV_EOF;
		return 1;
	} else {
		return (events & revents) != 0;
	}
}
835 
/*
 * Translate CHAN_FILT_HINT_* bits into NOTE_* fflags on an
 * EVFILT_NW_CHANNEL knote, restricted to the events the knote
 * subscribed to; returns nonzero when the knote should fire.
 */
static int
filt_chan_extended_common(struct knote *kn, long ev_hint)
{
	/*
	 * This function is not always called with the same set of locks held,
	 * hence it is only allowed to manipulate kn_fflags, with atomics.
	 *
	 * the f_event / f_process functions may run concurrently.
	 */
	uint32_t add_fflags = 0;

	if ((ev_hint & CHAN_FILT_HINT_FLOW_ADV_UPD) != 0) {
		add_fflags |= NOTE_FLOW_ADV_UPDATE;
	}
	if ((ev_hint & CHAN_FILT_HINT_CHANNEL_EVENT) != 0) {
		add_fflags |= NOTE_CHANNEL_EVENT;
	}
	if ((ev_hint & CHAN_FILT_HINT_IF_ADV_UPD) != 0) {
		add_fflags |= NOTE_IF_ADV_UPD;
	}
	if (add_fflags) {
		/* Reset any events that are not requested on this knote */
		add_fflags &= (kn->kn_sfflags & EVFILT_NW_CHANNEL_ALL_MASK);
		os_atomic_or(&kn->kn_fflags, add_fflags, relaxed);
		return add_fflags != 0;
	}
	/* no new hints: fire only if something is already pending */
	return os_atomic_load(&kn->kn_fflags, relaxed) != 0;
}
864 
/*
 * Re-evaluate the event ring for a NOTE_CHANNEL_EVENT subscriber and
 * update *hint accordingly: set CHAN_FILT_HINT_CHANNEL_EVENT when the
 * sync found events, otherwise clear the corresponding bit from the
 * knote's fflags.  Caller holds ch_lock.
 */
static inline void
che_process_channel_event(struct kern_channel *ch, struct knote *kn,
    uint32_t fflags, long *hint)
{
	int revents, event_error = 0;

	LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
	*hint &= ~CHAN_FILT_HINT_CHANNEL_EVENT;

	if (((ch->ch_flags & CHANF_EVENT_RING) != 0) &&
	    ((fflags & NOTE_CHANNEL_EVENT) != 0)) {
		/* capture new state to return */
		revents = ch_event(ch, POLLIN, NULL, knote_get_kq(kn)->kq_p,
		    NULL, TRUE, &event_error, TRUE);
		if (revents & POLLERR) {
			ASSERT(event_error != 0);
			/*
			 * Setting a knote error here will confuse libdispatch,
			 * so we use EV_EOF instead.
			 */
			kn->kn_flags |= EV_EOF;
		} else if ((revents & POLLIN) != 0) {
			*hint |= CHAN_FILT_HINT_CHANNEL_EVENT;
		}
	}
	/*
	 * if the sync operation on event ring didn't find any events
	 * then indicate that the channel event is not active.
	 */
	if ((*hint & CHAN_FILT_HINT_CHANNEL_EVENT) == 0) {
		/*
		 * Avoid a costly atomic when the bit is already cleared.
		 */
		uint32_t knfflags = os_atomic_load(&kn->kn_fflags, relaxed);
		if (knfflags & CHAN_FILT_HINT_CHANNEL_EVENT) {
			os_atomic_andnot(&kn->kn_fflags,
			    CHAN_FILT_HINT_CHANNEL_EVENT, relaxed);
		}
	}
}
905 
/*
 * EVFILT_NW_CHANNEL f_attach: hook the knote onto the TX selinfo,
 * evaluate any already-pending channel events, and force an initial
 * flow-advisory event if subscribed.  ch_kqfilter() holds ch_lock.
 */
static int
filt_che_attach(struct knote *kn, __unused struct kevent_qos_s *kev)
{
	struct kern_channel *ch = (struct kern_channel *)knote_kn_hook_get_raw(kn);
	struct ch_selinfo *csi;
	long hint = 0;

	/* hint bits must alias the NOTE_* values 1:1 */
	static_assert(CHAN_FILT_HINT_FLOW_ADV_UPD == NOTE_FLOW_ADV_UPDATE);
	static_assert(CHAN_FILT_HINT_CHANNEL_EVENT == NOTE_CHANNEL_EVENT);
	static_assert(CHAN_FILT_HINT_IF_ADV_UPD == NOTE_IF_ADV_UPD);

	ASSERT(kn->kn_filter == EVFILT_NW_CHANNEL);

	/* ch_kqfilter() should have acquired the lock */
	LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);

	csi = ch->ch_si[NR_TX];
	CSI_LOCK(csi);
	if (KNOTE_ATTACH(&csi->csi_si.si_note, kn)) {
		os_atomic_or(&csi->csi_flags, CSI_KNOTE, relaxed);
	}
	CSI_UNLOCK(csi);

	/* defunct channel: fire immediately with EV_EOF set */
	if (__improbable(ch_filt_check_defunct(ch, kn))) {
		return 1;
	}
	if ((kn->kn_sfflags & NOTE_CHANNEL_EVENT) != 0) {
		os_atomic_or(&ch->ch_na->na_flags, NAF_CHANNEL_EVENT_ATTACHED, relaxed);
	}
	che_process_channel_event(ch, kn, kn->kn_sfflags, &hint);
	if ((kn->kn_sfflags & NOTE_FLOW_ADV_UPDATE) != 0) {
		/* on registration force an event */
		hint |= CHAN_FILT_HINT_FLOW_ADV_UPD;
	}
	SK_DF(SK_VERB_EVENTS, "na \"%s\" (%p) ch %p kn %p (%s)",
	    ch->ch_na->na_name, SK_KVA(ch->ch_na), SK_KVA(ch), SK_KVA(kn),
	    "EVFILT_NW_CHANNEL");
	return filt_chan_extended_common(kn, hint);
}
945 
/*
 * EVFILT_NW_CHANNEL f_detach: undo filt_che_attach() — clear the
 * event-listener flag on the adapter and unhook the knote from the
 * TX selinfo.  Takes and drops ch_lock itself.
 */
static void
filt_che_detach(struct knote *kn)
{
	struct kern_channel *ch = (struct kern_channel *)knote_kn_hook_get_raw(kn);
	struct ch_selinfo *csi;

	ASSERT(kn->kn_filter == EVFILT_NW_CHANNEL);

	lck_mtx_lock(&ch->ch_lock);
	if ((kn->kn_sfflags & NOTE_CHANNEL_EVENT) != 0) {
		os_atomic_andnot(&ch->ch_na->na_flags,
		    NAF_CHANNEL_EVENT_ATTACHED, relaxed);
	}
	csi = ch->ch_si[NR_TX];
	CSI_LOCK(csi);
	/* last knote leaving the selinfo clears CSI_KNOTE */
	if (KNOTE_DETACH(&csi->csi_si.si_note, kn)) {
		os_atomic_andnot(&csi->csi_flags, CSI_KNOTE, relaxed);
	}
	CSI_UNLOCK(csi);
	lck_mtx_unlock(&ch->ch_lock);

	SK_DF(SK_VERB_EVENTS, "na \"%s\" (%p) ch %p kn %p (%s)",
	    ch->ch_na->na_name, SK_KVA(ch->ch_na), SK_KVA(ch), SK_KVA(kn),
	    "EVFILT_NW_CHANNEL");
}
971 
/*
 * EVFILT_NW_CHANNEL f_event: called by knote() with a CHAN_FILT_HINT_*
 * bitmask in `hint'.  Returns non-zero to activate the knote.
 * NOTE(review): a zero hint is treated as "nothing to report" and
 * short-circuits before the defunct check.
 */
static int
filt_che_event(struct knote *kn, long hint)
{
	struct kern_channel *ch = (struct kern_channel *)knote_kn_hook_get_raw(kn);

	ASSERT(kn->kn_filter == EVFILT_NW_CHANNEL);
	if (hint == 0) {
		return 0;
	}
	if (__improbable(ch_filt_check_defunct(ch, NULL))) {
		return 1;
	}
	/* a channel-event hint is only valid for channels with an event ring */
	if ((hint & CHAN_FILT_HINT_CHANNEL_EVENT) != 0) {
		VERIFY((ch->ch_flags & CHANF_EVENT_RING) != 0);
	}
	SK_DF(SK_VERB_EVENTS, "na \"%s\" (%p) ch %p hint 0x%lx)",
	    ch->ch_na->na_name, SK_KVA(ch->ch_na), SK_KVA(ch), hint);
	return filt_chan_extended_common(kn, hint);
}
991 
/*
 * EVFILT_NW_CHANNEL f_touch: refresh the knote with new registration
 * parameters from userspace, track EV_ENABLE/EV_DISABLE of the
 * channel-event listener, and re-evaluate the event state.
 */
static int
filt_che_touch(struct knote *kn, struct kevent_qos_s *kev)
{
	int ret;
	long hint = 0;
	struct kern_channel *ch = (struct kern_channel *)knote_kn_hook_get_raw(kn);

	ASSERT(kn->kn_filter == EVFILT_NW_CHANNEL);
	/* save off the new input fflags and data */
	kn->kn_sfflags = kev->fflags;
	kn->kn_sdata = kev->data;

	lck_mtx_lock(&ch->ch_lock);
	if (__improbable(ch_filt_check_defunct(ch, kn))) {
		ret = 1;
		goto done;
	}
	/* mirror enable/disable of the channel-event note onto the adapter */
	if ((kn->kn_sfflags & NOTE_CHANNEL_EVENT) != 0) {
		if (kev->flags & EV_ENABLE) {
			os_atomic_or(&ch->ch_na->na_flags,
			    NAF_CHANNEL_EVENT_ATTACHED, relaxed);
		} else if (kev->flags & EV_DISABLE) {
			os_atomic_andnot(&ch->ch_na->na_flags,
			    NAF_CHANNEL_EVENT_ATTACHED, relaxed);
		}
	}
	che_process_channel_event(ch, kn, kn->kn_sfflags, &hint);
	ret = filt_chan_extended_common(kn, hint);
done:
	lck_mtx_unlock(&ch->ch_lock);
	return ret;
}
1024 
/*
 * EVFILT_NW_CHANNEL f_process: deliver the event to userspace.  On
 * delivery the accumulated fflags are consumed (EV_CLEAR-like), which
 * matches this filter's historical behavior.
 */
static int
filt_che_process(struct knote *kn, struct kevent_qos_s *kev)
{
	int ret;
	long hint = 0;

	/*
	 * -fbounds-safety: This seems like an example of interop with code that
	 * has -fbounds-safety disabled, which means we can use __unsafe_forge_*
	 */
	struct kern_channel *ch = __unsafe_forge_single(struct kern_channel *,
	    knote_kn_hook_get_raw(kn));

	ASSERT(kn->kn_filter == EVFILT_NW_CHANNEL);
	lck_mtx_lock(&ch->ch_lock);
	if (__improbable(ch_filt_check_defunct(ch, kn))) {
		ret = 1;
		goto done;
	}
	che_process_channel_event(ch, kn, kn->kn_sfflags, &hint);
	ret = filt_chan_extended_common(kn, hint);
done:
	lck_mtx_unlock(&ch->ch_lock);
	if (ret != 0) {
		/*
		 * This filter historically behaves like EV_CLEAR,
		 * even when EV_CLEAR wasn't set.
		 */
		knote_fill_kevent(kn, kev, 0);
		kn->kn_fflags = 0;
	}
	return ret;
}
1058 
/*
 * kqueue attach entry point for user channels: validate the channel,
 * dispatch to the read/write/extended sub-filter, and invoke its
 * f_attach with ch_lock held.  Returns the sub-filter's result, or 0
 * with a knote error set (ENXIO/EINVAL) on failure.
 */
int
ch_kqfilter(struct kern_channel *ch, struct knote *kn,
    struct kevent_qos_s *kev)
{
	SK_LOG_VAR(char dbgbuf[CH_DBGBUF_SIZE]);
	int result;

	lck_mtx_lock(&ch->ch_lock);
	/* kernel-owned channels never go through kqueue */
	VERIFY(!(ch->ch_flags & CHANF_KERNEL));

	if (__improbable(ch->ch_na == NULL || !NA_IS_ACTIVE(ch->ch_na) ||
	    na_reject_channel(ch, ch->ch_na))) {
		SK_ERR("channel is non-permissive %s",
		    ch2str(ch, dbgbuf, sizeof(dbgbuf)));
		knote_set_error(kn, ENXIO);
		lck_mtx_unlock(&ch->ch_lock);
		return 0;
	}

	/* route the generic filter to the matching Skywalk sub-filter */
	switch (kn->kn_filter) {
	case EVFILT_READ:
		kn->kn_filtid = EVFILTID_SKYWALK_CHANNEL_R;
		break;

	case EVFILT_WRITE:
		kn->kn_filtid = EVFILTID_SKYWALK_CHANNEL_W;
		break;

	case EVFILT_NW_CHANNEL:
		kn->kn_filtid = EVFILTID_SKYWALK_CHANNEL_E;
		break;

	default:
		lck_mtx_unlock(&ch->ch_lock);
		SK_ERR("%s(%d): bad filter request %d", ch->ch_name,
		    ch->ch_pid, kn->kn_filter);
		knote_set_error(kn, EINVAL);
		return 0;
	}

	/* stash the channel pointer for the sub-filters to retrieve */
	knote_kn_hook_set_raw(kn, ch);
	/* call the appropriate sub-filter attach with the channel lock held */
	result = knote_fops(kn)->f_attach(kn, kev);
	lck_mtx_unlock(&ch->ch_lock);
	return result;
}
1105 
1106 boolean_t
ch_is_multiplex(struct kern_channel * ch,enum txrx t)1107 ch_is_multiplex(struct kern_channel *ch, enum txrx t)
1108 {
1109 	return ch->ch_na != NULL && (ch->ch_last[t] - ch->ch_first[t] > 1);
1110 }
1111 
1112 int
ch_select(struct kern_channel * ch,int events,void * wql,struct proc * p)1113 ch_select(struct kern_channel *ch, int events, void *wql, struct proc *p)
1114 {
1115 	int revents;
1116 	int event_error = 0;
1117 
1118 	lck_mtx_lock(&ch->ch_lock);
1119 	revents = ch_event(ch, events, wql, p, NULL, FALSE, &event_error,
1120 	    FALSE);
1121 	lck_mtx_unlock(&ch->ch_lock);
1122 
1123 	ASSERT((revents & POLLERR) == 0 || event_error != 0);
1124 
1125 	return revents;
1126 }
1127 
1128 #if SK_LOG
1129 /* Hoisted out of line to reduce kernel stack footprint */
SK_LOG_ATTRIBUTE
static void
ch_event_log(const char *prefix, const struct kern_channel *ch,
    struct proc *p, const struct nexus_adapter *na,
    int events, int revents)
{
	/* one-line debug trace of an ch_event() entry/exit */
	SK_DF(SK_VERB_EVENTS, "%s: na \"%s\" (%p) ch %p %s(%d) "
	    "th %p ev 0x%x rev 0x%x", prefix, na->na_name, SK_KVA(na),
	    SK_KVA(ch), sk_proc_name(p), sk_proc_pid(p),
	    SK_KVA(current_thread()), events, revents);
}
1141 #endif /* SK_LOG */
1142 
1143 /*
1144  * select(2), poll(2) and kevent(2) handlers for channels.
1145  *
1146  * Can be called for one or more rings.  Return true the event mask
1147  * corresponding to ready events.  If there are no ready events, do
1148  * a selrecord on either individual selinfo or on the global one.
1149  * Device-dependent parts (locking and sync of tx/rx rings)
1150  * are done through callbacks.
1151  */
static int
ch_event(struct kern_channel *ch, int events, void *wql,
    struct proc *p, struct ch_event_result *result,
    const boolean_t is_kevent, int *errno, const boolean_t is_ch_event)
{
	struct nexus_adapter *na;
	struct __kern_channel_ring *kring;
	uint32_t i, check_all_tx, check_all_rx, want[NR_TXRX], revents = 0;
	uint32_t ready_tx_data = 0, ready_rx_data = 0;
	sk_protect_t protect = NULL;

#define want_tx want[NR_TX]
#define want_rx want[NR_RX]
	/*
	 * In order to avoid nested locks, we need to "double check"
	 * txsync and rxsync if we decide to do a selrecord().
	 * retry_tx (and retry_rx, later) prevent looping forever.
	 */
	boolean_t retry_tx = TRUE, retry_rx = TRUE;
	int found, error = 0;
	int s;

	net_update_uptime();

	LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
	ASSERT(!(ch->ch_flags & CHANF_KERNEL));

	*errno = 0;

	/* bail out early if the channel was defunct or its schema unmapped */
	if (__improbable((ch->ch_flags & CHANF_DEFUNCT) ||
	    ch->ch_schema == NULL)) {
		SK_ERR("%s(%d): channel is defunct or no longer bound",
		    ch->ch_name, ch->ch_pid);
		revents = POLLERR;
		*errno = ENXIO;
		goto done;
	}

	/* clear CHANF_DEFUNCT_SKIP if it was set during defunct last time */
	if (__improbable(ch->ch_flags & CHANF_DEFUNCT_SKIP)) {
		os_atomic_andnot(&ch->ch_flags, CHANF_DEFUNCT_SKIP, relaxed);
	}

	na = ch->ch_na;
	if (__improbable(na == NULL ||
	    !NA_IS_ACTIVE(na) || na_reject_channel(ch, na))) {
		SK_ERR("%s(%d): channel is non-permissive",
		    ch->ch_name, ch->ch_pid);
		revents = POLLERR;
		*errno = ENXIO;
		goto done;
	}

	/* mark thread with sync-in-progress flag */
	protect = sk_sync_protect();

	/* update our work timestamp */
	na->na_work_ts = net_uptime();

	/* and make this channel eligible for draining again */
	if (na->na_flags & NAF_DRAINING) {
		os_atomic_andnot(&na->na_flags, NAF_DRAINING, relaxed);
	}

#if SK_LOG
	if (__improbable((sk_verbose & SK_VERB_EVENTS) != 0)) {
		ch_event_log("enter", ch, p, na, events, revents);
	}
#endif
	/* event-ring syncs take a dedicated path below */
	if (is_ch_event) {
		goto process_channel_event;
	}

	want_tx = (events & (POLLOUT | POLLWRNORM));
	want_rx = (events & (POLLIN | POLLRDNORM));

	/*
	 * check_all_{tx|rx} are set if the channel has more than one ring
	 * AND the file descriptor is bound to all of them.  If so, we sleep
	 * on the "global" selinfo, otherwise we sleep on individual selinfo
	 * The interrupt routine in the driver wake one or the other (or both)
	 * depending on which clients are active.
	 *
	 * rxsync() is only called if we run out of buffers on a POLLIN.
	 * txsync() is called if we run out of buffers on POLLOUT.
	 */
	check_all_tx = ch_is_multiplex(ch, NR_TX);
	check_all_rx = ch_is_multiplex(ch, NR_RX);

	/*
	 * If want_tx is still set, we must issue txsync calls
	 * (on all rings, to avoid that the tx rings stall).
	 * XXX should also check head != khead on the tx rings.
	 */
	if (want_tx) {
		ring_id_t first_tx = ch->ch_first[NR_TX];
		ring_id_t last_tx = ch->ch_last[NR_TX];

		channel_threshold_unit_t tx_unit =
		    ch->ch_info->cinfo_tx_lowat.cet_unit;

		/*
		 * The first round checks if anyone is ready, if not
		 * do a selrecord and another round to handle races.
		 * want_tx goes to 0 if any space is found, and is
		 * used to skip rings with no pending transmissions.
		 */
flush_tx:
		for (i = first_tx, ready_tx_data = 0; i < last_tx; i++) {
			kring = &na->na_tx_rings[i];
			/* skip quiescent rings once space has been found */
			if (!want_tx &&
			    kring->ckr_ring->ring_head == kring->ckr_khead) {
				continue;
			}

			/* only one thread does txsync */
			s = kr_enter(kring, TRUE);
			ASSERT(s == 0);

			error = 0;
			DTRACE_SKYWALK2(pretxprologue, struct kern_channel *,
			    ch, struct __kern_channel_ring *, kring);
			/* a prologue return >= num_slots means a corrupt ring */
			if (kr_txsync_prologue(ch, kring, p) >=
			    kring->ckr_num_slots) {
				kr_log_bad_ring(kring);
				revents |= POLLERR;
				error = EFAULT;
				/* report only the first error encountered */
				if (*errno == 0) {
					*errno = EFAULT;
				}
			} else {
				if (kring->ckr_na_sync(kring, p, 0)) {
					revents |= POLLERR;
					error = EIO;
					if (*errno == 0) {
						*errno = EIO;
					}
				} else {
					kr_txsync_finalize(ch, kring, p);
				}
			}
			DTRACE_SKYWALK3(posttxfinalize, struct kern_channel *,
			    ch, struct __kern_channel_ring *, kring, int,
			    error);

			/*
			 * If we found new slots, notify potential listeners on
			 * the same ring. Since we just did a txsync, look at
			 * the copies of cur,tail in the kring.
			 */
			found = kring->ckr_rhead != kring->ckr_rtail;
			kr_exit(kring);
			if (found) { /* notify other listeners */
				revents |= want_tx;
				want_tx = 0;
				(void) kring->ckr_na_notify(kring, p,
				    (is_kevent ? NA_NOTEF_IN_KEVENT : 0));
			}

			/*
			 * Add this ring's free data to our running
			 * tally for userspace.
			 */
			if (result != NULL) {
				switch (tx_unit) {
				case CHANNEL_THRESHOLD_UNIT_BYTES:
					ready_tx_data += kring->ckr_ready_bytes;
					break;
				case CHANNEL_THRESHOLD_UNIT_SLOTS:
					ready_tx_data += kring->ckr_ready_slots;
					break;
				}
			}
		}
		/*
		 * Nothing ready: selrecord before the second pass so a
		 * wakeup racing with the first pass isn't lost.
		 */
		if (want_tx && retry_tx && !is_kevent) {
			if (check_all_tx) {
				csi_selrecord_all(na, NR_TX, p, wql);
			} else {
				csi_selrecord_one(&na->na_tx_rings[first_tx],
				    p, wql);
			}
			retry_tx = FALSE;
			goto flush_tx;
		}
	}

	/*
	 * If want_rx is still set scan receive rings.
	 * Do it on all rings because otherwise we starve.
	 */
	if (want_rx) {
		ring_id_t first_rx = ch->ch_first[NR_RX];
		ring_id_t last_rx = ch->ch_last[NR_RX];
		channel_threshold_unit_t rx_unit =
		    ch->ch_info->cinfo_rx_lowat.cet_unit;

		/* two rounds here for race avoidance */
do_retry_rx:
		for (i = first_rx, ready_rx_data = 0; i < last_rx; i++) {
			kring = &na->na_rx_rings[i];

			/* only one thread does rxsync */
			s = kr_enter(kring, TRUE);
			ASSERT(s == 0);

			error = 0;
			DTRACE_SKYWALK2(prerxprologue, struct kern_channel *,
			    ch, struct __kern_channel_ring *, kring);
			/* a prologue return >= num_slots means a corrupt ring */
			if (kr_rxsync_prologue(ch, kring, p) >=
			    kring->ckr_num_slots) {
				kr_log_bad_ring(kring);
				revents |= POLLERR;
				error = EFAULT;
				if (*errno == 0) {
					*errno = EFAULT;
				}
			} else {
				/* now we can use kring->rhead, rtail */
				if (kring->ckr_na_sync(kring, p, 0)) {
					revents |= POLLERR;
					error = EIO;
					if (*errno == 0) {
						*errno = EIO;
					}
				} else {
					kr_rxsync_finalize(ch, kring, p);
				}
			}

			DTRACE_SKYWALK3(postrxfinalize, struct kern_channel *,
			    ch, struct __kern_channel_ring *, kring, int,
			    error);

			found = kring->ckr_rhead != kring->ckr_rtail;
			kr_exit(kring);
			if (found) {
				revents |= want_rx;
				retry_rx = FALSE;
				(void) kring->ckr_na_notify(kring, p,
				    (is_kevent ? NA_NOTEF_IN_KEVENT : 0));
			}

			/*
			 * Add this ring's readable data to our running
			 * tally for userspace.
			 */
			if (result != NULL) {
				switch (rx_unit) {
				case CHANNEL_THRESHOLD_UNIT_BYTES:
					ready_rx_data += kring->ckr_ready_bytes;
					break;
				case CHANNEL_THRESHOLD_UNIT_SLOTS:
					ready_rx_data += kring->ckr_ready_slots;
					break;
				}
			}
		}

		if (retry_rx && !is_kevent) {
			if (check_all_rx) {
				csi_selrecord_all(na, NR_RX, p, wql);
			} else {
				csi_selrecord_one(&na->na_rx_rings[first_rx],
				    p, wql);
			}
		}
		if (retry_rx) {
			retry_rx = FALSE;
			goto do_retry_rx;
		}
	}

	if (result != NULL) {
		result->tx_data = ready_tx_data;
		result->rx_data = ready_rx_data;
	}
	goto skip_channel_event;

process_channel_event:
	/*
	 * perform sync operation on the event ring to make the channel
	 * events enqueued in the ring visible to user-space.
	 */

	/* select() and poll() not supported for event ring */
	ASSERT(is_kevent);
	VERIFY((ch->ch_last[NR_EV] - ch->ch_first[NR_EV]) == 1);
	kring = &na->na_event_rings[ch->ch_first[NR_EV]];

	/* only one thread does the sync */
	s = kr_enter(kring, TRUE);
	ASSERT(s == 0);
	if (kr_event_sync_prologue(kring, p) >= kring->ckr_num_slots) {
		kr_log_bad_ring(kring);
		revents |= POLLERR;
		if (*errno == 0) {
			*errno = EFAULT;
		}
	} else {
		if (kring->ckr_na_sync(kring, p, 0)) {
			revents |= POLLERR;
			if (*errno == 0) {
				*errno = EIO;
			}
		} else {
			kr_event_sync_finalize(ch, kring, p);
		}
	}
	/* readable events in the ring show up as POLLIN */
	found = (kring->ckr_rhead != kring->ckr_rtail);
	kr_exit(kring);
	if (found) {
		revents |= (events & POLLIN);
	}

skip_channel_event:
#if SK_LOG
	if (__improbable((sk_verbose & SK_VERB_EVENTS) != 0)) {
		ch_event_log("exit", ch, p, na, events, revents);
	}
#endif /* SK_LOG */

	/* unmark thread with sync-in-progress flag */
	sk_sync_unprotect(protect);

done:
	ASSERT(!sk_is_sync_protected());

	return revents;
#undef want_tx
#undef want_rx
}
1483 
1484 static struct kern_channel *
ch_find(struct kern_nexus * nx,nexus_port_t port,ring_id_t ring_id)1485 ch_find(struct kern_nexus *nx, nexus_port_t port, ring_id_t ring_id)
1486 {
1487 	struct kern_channel *ch;
1488 
1489 	SK_LOCK_ASSERT_HELD();
1490 
1491 	STAILQ_FOREACH(ch, &nx->nx_ch_head, ch_link) {
1492 		struct ch_info *cinfo = ch->ch_info;
1493 
1494 		/* see comments in ch_open() */
1495 		if (cinfo->cinfo_nx_port != port) {
1496 			continue;
1497 		} else if (cinfo->cinfo_ch_ring_id != CHANNEL_RING_ID_ANY &&
1498 		    ring_id != cinfo->cinfo_ch_ring_id &&
1499 		    ring_id != CHANNEL_RING_ID_ANY) {
1500 			continue;
1501 		}
1502 
1503 		/* found a match */
1504 		break;
1505 	}
1506 
1507 	if (ch != NULL) {
1508 		ch_retain_locked(ch);
1509 	}
1510 
1511 	return ch;
1512 }
1513 
1514 #if SK_LOG
1515 /* Hoisted out of line to reduce kernel stack footprint */
SK_LOG_ATTRIBUTE
static void
ch_open_log1(const uuid_t p_uuid, struct proc *p, nexus_port_t port)
{
	uuid_string_t uuidstr;

	/* debug trace of the opening process's identity and target port */
	SK_D("%s(%d) uniqueid %llu exec_uuid %s port %u",
	    sk_proc_name(p), sk_proc_pid(p), proc_uniqueid(p),
	    sk_uuid_unparse(p_uuid, uuidstr), port);
}
1526 
SK_LOG_ATTRIBUTE
static void
ch_open_log2(struct proc *p, nexus_port_t port, ring_id_t ring,
    uint32_t mode, int err)
{
	/* debug trace of the final ch_open() outcome */
	SK_D("%s(%d) port %u ring %d mode 0x%x err %d",
	    sk_proc_name(p), sk_proc_pid(p), port, (int)ring, mode, err);
}
1535 #endif /* SK_LOG */
1536 
/*
 * Userland channel open: locate the nexus by UUID, validate the port and
 * entitlements, prepare an nxbind for non-anonymous providers, reject
 * conflicting owners of the {port, ring} tuple, then connect.  Returns a
 * referenced channel on success, NULL with *err set on failure.
 */
struct kern_channel *
ch_open(struct ch_init *init, struct proc *p, int fd, int *err)
{
	uint32_t mode = init->ci_ch_mode;
	nexus_port_t port = init->ci_nx_port;
	ring_id_t ring = init->ci_ch_ring_id;
	struct kern_channel *ch = NULL, *ch0 = NULL;
	struct nxbind *nxb = NULL;
	struct kern_nexus *nx;
	struct chreq chr;
	uuid_t p_uuid;
	kauth_cred_t cred;

	cred = kauth_cred_get();
	ASSERT(!uuid_is_null(init->ci_nx_uuid));
	proc_getexecutableuuid(p, p_uuid, sizeof(p_uuid));
	*err = 0;

	/* make sure we don't allow userland to set kernel-only flags */
	mode &= CHMODE_MASK;

	SK_LOCK();

	/* nx_find() returns the nexus with a reference held */
	nx = nx_find(init->ci_nx_uuid, TRUE);
	if (nx == NULL) {
		*err = ENOENT;
		goto done;
	}
	if ((nx->nx_flags & NXF_INVALIDATED) != 0) {
		*err = EBUSY;
		goto done;
	}

	/* port (zero-based) must be within the domain's range */
	if (port >= NXDOM_MAX(NX_DOM(nx), ports)) {
		*err = EDOM;
		goto done;
	}
	VERIFY(port != NEXUS_PORT_ANY);

	/* low-latency channels require a special entitlement */
	if (mode & CHMODE_LOW_LATENCY) {
		if ((*err = skywalk_priv_check_cred(p, cred,
		    PRIV_SKYWALK_LOW_LATENCY_CHANNEL)) != 0) {
			goto done;
		}
	}

	/*
	 * Check with the nexus to see if the port is bound; if so, prepare
	 * our nxbind structure that we'll need to pass down to the nexus
	 * for it compare.  If the caller provides a key, we take it over
	 * and will free it ourselves (as part of freeing nxbind.)
	 */
	if (!NX_ANONYMOUS_PROV(nx)) {
		/*
		 * -fbounds-safety: ci_key is user_addr_t (aka uint64_t), so
		 * can't mark it as __sized_by. Forge it instead.
		 */
		void *key = __unsafe_forge_bidi_indexable(void *, init->ci_key,
		    init->ci_key_len);

#if SK_LOG
		if (__improbable(sk_verbose != 0)) {
			ch_open_log1(p_uuid, p, port);
		}
#endif /* SK_LOG */

		/* match on the caller's unique id and executable UUID */
		nxb = nxb_alloc(Z_WAITOK);
		nxb->nxb_flags |= NXBF_MATCH_UNIQUEID;
		nxb->nxb_uniqueid = proc_uniqueid(p);
		nxb->nxb_pid = proc_pid(p);
		nxb->nxb_flags |= NXBF_MATCH_EXEC_UUID;
		uuid_copy(nxb->nxb_exec_uuid, p_uuid);
		if (key != NULL) {
			nxb->nxb_flags |= NXBF_MATCH_KEY;
			nxb->nxb_key_len = init->ci_key_len;
			nxb->nxb_key = key;
			init->ci_key = USER_ADDR_NULL;  /* take over */
		}
	}

	/*
	 * There can only be one owner of {port,ring_id} tuple.
	 * CHANNEL_RING_ID_ANY (-1) ring_id gives exclusive rights over
	 * all rings.  Further attempts to own any or all of the rings
	 * will be declined.
	 *
	 * For example, assuming a 2-rings setup for port 'p':
	 *
	 * owner{p,-1}
	 *      will not allow:
	 *              owner{p,-1}, owner{p,0}, owner{p,1}
	 *
	 * owner{p,0}
	 *      will allow:
	 *		owner{p,1}
	 *	will not allow:
	 *		owner{p,-1}, owner{p,0}
	 */
	if ((ch0 = ch_find(nx, port, ring)) != NULL) {
		SK_D("found ch0 %p", SK_KVA(ch0));
#if SK_LOG
		uuid_string_t uuidstr;
		char *na_name = (ch0->ch_na != NULL) ?
		    ch0->ch_na->na_name : "";

		SK_PERR(p, "ch %s flags (0x%x) exists on port %d on "
		    "nx %s, owner %s(%d)", na_name, ch0->ch_flags, port,
		    sk_uuid_unparse(nx->nx_uuid, uuidstr),
		    ch0->ch_name, ch0->ch_pid);
#endif /* SK_LOG */
		*err = EBUSY;
		goto done;
	}

	/* build the connect request from the sanitized parameters */
	bzero(&chr, sizeof(chr));
	chr.cr_tx_lowat = init->ci_tx_lowat;
	chr.cr_rx_lowat = init->ci_rx_lowat;
	chr.cr_port = port;
	chr.cr_mode = mode;
	chr.cr_ring_id = ring;

	/* upon success, returns a channel with reference held */
	ch = ch_connect(nx, &chr, nxb, p, fd, err);

done:

#if SK_LOG
	if (__improbable(sk_verbose != 0)) {
		ch_open_log2(p, port, ring, mode, *err);
	}
#endif /* SK_LOG */

	/* drop the references taken by ch_find() and nx_find() */
	if (ch0 != NULL) {
		(void) ch_release_locked(ch0);
	}

	if (nx != NULL) {
		(void) nx_release_locked(nx);
	}

	/* ch_connect() makes its own copy; ours is no longer needed */
	if (nxb != NULL) {
		nxb_free(nxb);
	}

	SK_UNLOCK();

	return ch;
}
1686 
/*
 * Kernel-internal channel open (no file descriptor, no userland mapping).
 * With `nonxref' set the channel holds no reference on the nexus; see the
 * comment below for why.  Called with the Skywalk lock held.
 */
struct kern_channel *
ch_open_special(struct kern_nexus *nx, struct chreq *chr, boolean_t nonxref,
    int *err)
{
	struct kern_channel *ch = NULL;

	SK_LOCK_ASSERT_HELD();
	if ((nx->nx_flags & NXF_INVALIDATED) != 0) {
		*err = EBUSY;
		goto done;
	}
	*err = 0;

	/* these user-oriented modes are meaningless for kernel channels */
	ASSERT((chr->cr_mode & CHMODE_USER_PACKET_POOL) == 0);
	ASSERT((chr->cr_mode & CHMODE_EVENT_RING) == 0);
	ASSERT((chr->cr_mode & CHMODE_LOW_LATENCY) == 0);
	ASSERT(!uuid_is_null(chr->cr_spec_uuid));
	chr->cr_mode |= CHMODE_KERNEL;
	if (nonxref) {
		chr->cr_mode |= CHMODE_NO_NXREF;
	} else {
		chr->cr_mode &= ~CHMODE_NO_NXREF;
	}

	/* upon success, returns a channel with reference held */
	ch = ch_connect(nx, chr, NULL, kernproc, -1, err);
	if (ch != NULL) {
		/*
		 * nonxref channels don't hold any reference to the nexus,
		 * since otherwise we'll never be able to close them when
		 * the last regular channel of the nexus is closed, as part
		 * of the nexus's destructor operation.  Release the nonxref
		 * channel reference now, but make sure the nexus has at
		 * least 3 refs: global list, provider list and the nonxref
		 * channel itself, before doing that.
		 */
		if (nonxref) {
			ASSERT(ch->ch_flags & (CHANF_KERNEL | CHANF_NONXREF));
			ASSERT(nx->nx_refcnt > 3);
			(void) nx_release_locked(nx);
		}
	}

#if SK_LOG
	uuid_string_t uuidstr;
	const char * na_name = NULL;
	const char * nxdom_prov_name = NULL;

	if (ch != NULL && ch->ch_na != NULL) {
		na_name = ch->ch_na->na_name;
	}
	if (nx->nx_prov != NULL) {
		nxdom_prov_name = NX_DOM_PROV(nx)->nxdom_prov_name;
	}
	SK_D("nx %p (%s:\"%s\":%d:%d) spec_uuid \"%s\" mode 0x%x err %d",
	    SK_KVA(nx),
	    (nxdom_prov_name != NULL) ? nxdom_prov_name : "",
	    (na_name != NULL) ? na_name : "",
	    (int)chr->cr_port, (int)chr->cr_ring_id,
	    sk_uuid_unparse(chr->cr_spec_uuid, uuidstr), chr->cr_mode, *err);
#endif /* SK_LOG */

done:
	return ch;
}
1752 
1753 static void
ch_close_common(struct kern_channel * ch,boolean_t locked,boolean_t special)1754 ch_close_common(struct kern_channel *ch, boolean_t locked, boolean_t special)
1755 {
1756 #pragma unused(special)
1757 #if SK_LOG
1758 	uuid_string_t uuidstr;
1759 	const char *na_name = (ch->ch_na != NULL) ?
1760 	    ch->ch_na->na_name : "";
1761 	const char *__null_terminated nxdom_name = "";
1762 	if (ch->ch_nexus != NULL) {
1763 		nxdom_name = NX_DOM(ch->ch_nexus)->nxdom_name;
1764 	}
1765 	const char *nxdom_prov_name = (ch->ch_nexus != NULL) ?
1766 	    NX_DOM_PROV(ch->ch_nexus)->nxdom_prov_name : "";
1767 
1768 	SK_D("ch %p (%s:%s:\"%s\":%u:%d) uuid %s flags 0x%x",
1769 	    SK_KVA(ch), nxdom_name, nxdom_prov_name, na_name,
1770 	    ch->ch_info->cinfo_nx_port, (int)ch->ch_info->cinfo_ch_ring_id,
1771 	    sk_uuid_unparse(ch->ch_info->cinfo_ch_id, uuidstr),
1772 	    ch->ch_flags);
1773 #endif /* SK_LOG */
1774 	struct kern_nexus *nx = ch->ch_nexus;
1775 
1776 	if (!locked) {
1777 		SK_LOCK();
1778 	}
1779 
1780 	SK_LOCK_ASSERT_HELD();
1781 	/*
1782 	 * If the channel is participating in the interface advisory
1783 	 * notification, remove it from the nexus.
1784 	 * CHANF_IF_ADV is set and cleared only when nx_ch_if_adv_lock
1785 	 * is held in exclusive mode.
1786 	 */
1787 	lck_rw_lock_exclusive(&nx->nx_ch_if_adv_lock);
1788 	if ((ch->ch_flags & CHANF_IF_ADV) != 0) {
1789 		STAILQ_REMOVE(&nx->nx_ch_if_adv_head, ch,
1790 		    kern_channel, ch_link_if_adv);
1791 		os_atomic_andnot(&ch->ch_flags, CHANF_IF_ADV, relaxed);
1792 		if (STAILQ_EMPTY(&nx->nx_ch_if_adv_head)) {
1793 			nx_netif_config_interface_advisory(nx, false);
1794 		}
1795 		lck_rw_done(&nx->nx_ch_if_adv_lock);
1796 		lck_mtx_lock(&ch->ch_lock);
1797 		(void) ch_release_locked(ch);
1798 	} else {
1799 		lck_rw_done(&nx->nx_ch_if_adv_lock);
1800 		lck_mtx_lock(&ch->ch_lock);
1801 	}
1802 	LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
1803 	/*
1804 	 * Mark the channel as closing to prevent further setopt requests;
1805 	 * this flag is set once here and never gets cleared.
1806 	 */
1807 	ASSERT(!(ch->ch_flags & CHANF_CLOSING));
1808 	os_atomic_or(&ch->ch_flags, CHANF_CLOSING, relaxed);
1809 
1810 	if (special) {
1811 		VERIFY(ch->ch_flags & CHANF_KERNEL);
1812 	} else {
1813 		VERIFY(!(ch->ch_flags & CHANF_KERNEL));
1814 	}
1815 
1816 	ch->ch_fd = -1;
1817 
1818 	/* may be called as part of failure cleanup, so check */
1819 	if (ch->ch_flags & CHANF_ATTACHED) {
1820 		boolean_t nonxref = !!(ch->ch_flags & CHANF_NONXREF);
1821 
1822 		/* caller must hold an extra ref */
1823 		ASSERT(ch->ch_refcnt > 1);
1824 
1825 		/* disconnect from nexus */
1826 		ch_disconnect(ch);
1827 
1828 		/*
1829 		 * If this was the last regular channel and the nexus
1830 		 * has been closed, detach it and finish up the job.
1831 		 * If this was a nonxref channel, there is nothing
1832 		 * left to do; see comments in ch_open_special().
1833 		 */
1834 		if (!nonxref) {
1835 			STAILQ_REMOVE(&nx->nx_ch_head, ch,
1836 			    kern_channel, ch_link);
1837 			nx->nx_ch_count--;
1838 			if (STAILQ_EMPTY(&nx->nx_ch_head) &&
1839 			    (nx->nx_flags & NXF_CLOSED)) {
1840 				ASSERT(STAILQ_EMPTY(&nx->nx_ch_if_adv_head));
1841 				nx_detach(nx);
1842 			}
1843 			(void) nx_release_locked(nx);
1844 		} else {
1845 			ASSERT(ch->ch_flags & CHANF_KERNEL);
1846 			STAILQ_REMOVE(&nx->nx_ch_nonxref_head, ch,
1847 			    kern_channel, ch_link);
1848 		}
1849 
1850 		os_atomic_andnot(&ch->ch_flags, CHANF_ATTACHED, relaxed);
1851 		ch->ch_nexus = NULL;
1852 
1853 		(void) ch_release_locked(ch);   /* for the list */
1854 	}
1855 
1856 	lck_mtx_unlock(&ch->ch_lock);
1857 	if (!locked) {
1858 		SK_UNLOCK();
1859 	}
1860 }
1861 
/* Close a user channel; `locked' tells whether SK_LOCK is already held. */
void
ch_close(struct kern_channel *ch, boolean_t locked)
{
	ch_close_common(ch, locked, FALSE);
}
1867 
/* Close a kernel channel opened via ch_open_special(); SK_LOCK must be held. */
void
ch_close_special(struct kern_channel *ch)
{
	ch_close_common(ch, TRUE, TRUE);
}
1873 
1874 static int
ch_ev_thresh_validate(struct kern_nexus * nx,enum txrx t,struct ch_ev_thresh * cet)1875 ch_ev_thresh_validate(struct kern_nexus *nx, enum txrx t,
1876     struct ch_ev_thresh *cet)
1877 {
1878 	struct nxprov_params *nxp = NX_PROV(nx)->nxprov_params;
1879 	uint32_t bmin, bmax, smin, smax;
1880 	int err = 0;
1881 
1882 	if (cet->cet_unit != CHANNEL_THRESHOLD_UNIT_BYTES &&
1883 	    cet->cet_unit != CHANNEL_THRESHOLD_UNIT_SLOTS) {
1884 		err = EINVAL;
1885 		goto done;
1886 	}
1887 
1888 	smin = 1;       /* minimum 1 slot */
1889 	bmin = 1;       /* minimum 1 byte */
1890 
1891 	if (t == NR_TX) {
1892 		ASSERT(nxp->nxp_tx_slots > 0);
1893 		smax = (nxp->nxp_tx_slots - 1);
1894 	} else {
1895 		ASSERT(nxp->nxp_rx_slots > 0);
1896 		smax = (nxp->nxp_rx_slots - 1);
1897 	}
1898 	bmax = (smax * nxp->nxp_buf_size);
1899 
1900 	switch (cet->cet_unit) {
1901 	case CHANNEL_THRESHOLD_UNIT_BYTES:
1902 		if (cet->cet_value < bmin) {
1903 			cet->cet_value = bmin;
1904 		} else if (cet->cet_value > bmax) {
1905 			cet->cet_value = bmax;
1906 		}
1907 		break;
1908 
1909 	case CHANNEL_THRESHOLD_UNIT_SLOTS:
1910 		if (cet->cet_value < smin) {
1911 			cet->cet_value = smin;
1912 		} else if (cet->cet_value > smax) {
1913 			cet->cet_value = smax;
1914 		}
1915 		break;
1916 	}
1917 
1918 done:
1919 	return err;
1920 }
1921 
1922 #if SK_LOG
1923 /* Hoisted out of line to reduce kernel stack footprint */
/*
 * Dump the state of a freshly connected channel: the user-visible
 * schema (if any), ring/slot-descriptor offsets, and the channel's
 * identity, thresholds and memory-map geometry.  Debug logging only.
 */
SK_LOG_ATTRIBUTE
static void
ch_connect_log1(const struct kern_nexus *nx, const struct ch_info *cinfo,
    const struct chreq *chr, const struct kern_channel *ch,
    const struct kern_nexus_domain_provider *nxdom_prov,
    struct proc *p)
{
	struct __user_channel_schema *ch_schema = ch->ch_schema;
	uuid_string_t uuidstr;
	unsigned int n;
	ring_id_t i, j;

	/* a NULL schema is only expected for kernel-owned channels */
	ASSERT(ch_schema != NULL || (ch->ch_flags & CHANF_KERNEL));
	if (ch_schema != NULL) {
		SK_D("channel_schema at %p", SK_KVA(ch_schema));
		SK_D("  kern_name:     \"%s\"", ch_schema->csm_kern_name);
		SK_D("  kern_uuid:     %s",
		    sk_uuid_unparse(ch_schema->csm_kern_uuid, uuidstr));
		SK_D("  flags:         0x%x", ch_schema->csm_flags);
		SK_D("  tx_rings:      %u [%u,%u]", ch_schema->csm_tx_rings,
		    cinfo->cinfo_first_tx_ring, cinfo->cinfo_last_tx_ring);
		SK_D("  rx_rings:      %u [%u,%u]", ch_schema->csm_rx_rings,
		    cinfo->cinfo_first_rx_ring, cinfo->cinfo_last_rx_ring);

		/* TX entries occupy csm_ring_ofs[0 .. n-1] */
		j = ch->ch_last[NR_TX];
		for (n = 0, i = ch->ch_first[NR_TX]; i < j; n++, i++) {
			SK_D("  tx_ring_%u_off: 0x%llx", i,
			    (uint64_t)ch_schema->csm_ring_ofs[n].ring_off);
			SK_D("  tx_sd_%u_off:   0x%llx", i,
			    (uint64_t)ch_schema->csm_ring_ofs[n].sd_off);
		}
		/*
		 * j now holds the count of TX entries just dumped; the RX
		 * entries follow them in csm_ring_ofs[], hence the n + j
		 * indexing below.
		 */
		j = n;
		for (n = 0, i = ch->ch_first[NR_RX];
		    i < ch->ch_last[NR_RX]; n++, i++) {
			SK_D("  rx_ring_%u_off: 0x%llx", i,
			    (uint64_t)ch_schema->csm_ring_ofs[n + j].ring_off);
			SK_D("  rx_sd_%u_off:   0x%llx", i,
			    (uint64_t)ch_schema->csm_ring_ofs[n + j].sd_off);
		}
		SK_D("  md_type:       %u", ch_schema->csm_md_type);
		SK_D("  md_subtype:    %u", ch_schema->csm_md_subtype);
		SK_D("  stats_ofs:     0x%llx", ch_schema->csm_stats_ofs);
		SK_D("  stats_type:    %u", ch_schema->csm_stats_type);
		SK_D("  flowadv_ofs:   0x%llx", ch_schema->csm_flowadv_ofs);
		SK_D("  flowadv_max:   %u", ch_schema->csm_flowadv_max);
		SK_D("  nexusadv_ofs:  0x%llx", ch_schema->csm_nexusadv_ofs);
	}

	SK_D("ch %p (%s:%s:\"%s\":%u:%d)",
	    SK_KVA(ch), nxdom_prov->nxdom_prov_dom->nxdom_name,
	    nxdom_prov->nxdom_prov_name, ch->ch_na->na_name,
	    cinfo->cinfo_nx_port, (int)cinfo->cinfo_ch_ring_id);
	SK_D("  ch UUID: %s", sk_uuid_unparse(cinfo->cinfo_ch_id, uuidstr));
	SK_D("  nx UUID: %s", sk_uuid_unparse(nx->nx_uuid, uuidstr));
	SK_D("  flags:   0x%x", ch->ch_flags);
	SK_D("  task:    %p %s(%d)", SK_KVA(ch->ch_mmap.ami_maptask),
	    sk_proc_name(p), sk_proc_pid(p));
	SK_D("  txlowat: %u (%s)", cinfo->cinfo_tx_lowat.cet_value,
	    ((cinfo->cinfo_tx_lowat.cet_unit == CHANNEL_THRESHOLD_UNIT_BYTES) ?
	    "bytes" : "slots"));
	SK_D("  rxlowat: %u (%s)", cinfo->cinfo_rx_lowat.cet_value,
	    ((cinfo->cinfo_rx_lowat.cet_unit == CHANNEL_THRESHOLD_UNIT_BYTES) ?
	    "bytes" : "slots"));
	SK_D("  mmapref: %p", SK_KVA(ch->ch_mmap.ami_mapref));
	SK_D("  mapaddr: 0x%llx", (uint64_t)cinfo->cinfo_mem_base);
	SK_D("  mapsize: %llu (%llu KB)",
	    (uint64_t)cinfo->cinfo_mem_map_size,
	    (uint64_t)cinfo->cinfo_mem_map_size >> 10);
	SK_D("  memsize: %llu (%llu KB)",
	    (uint64_t)chr->cr_memsize, (uint64_t)chr->cr_memsize >> 10);
	SK_D("  offset:  0x%llx", (uint64_t)cinfo->cinfo_schema_offset);
}
1996 
/* Log a failed nexus-connect attempt; hoisted out of line (stack size) */
SK_LOG_ATTRIBUTE
static void
ch_connect_log2(const struct kern_nexus *nx, int err)
{
	uuid_string_t nx_uuidstr;

	SK_ERR("Error connecting to nexus UUID %s: %d",
	    sk_uuid_unparse(nx->nx_uuid, nx_uuidstr), err);
}
2006 #endif /* SK_LOG */
2007 
/*
 * Allocate a channel and connect it to a port on the given nexus.
 *
 * Called with SK_LOCK held.  On success, returns the new channel with
 * two references taken: one for its membership on the nexus' channel
 * list and one for the caller.  On failure, returns NULL with *err set.
 */
static struct kern_channel *
ch_connect(struct kern_nexus *nx, struct chreq *chr, struct nxbind *nxb,
    struct proc *p, int fd, int *err)
{
	struct kern_nexus_domain_provider *nxdom_prov;
	struct kern_channel *ch = NULL;
	struct ch_info *cinfo = NULL;
	uint32_t ch_mode = chr->cr_mode;
	boolean_t config = FALSE;
	struct nxdom *nxdom;
	boolean_t reserved_port = FALSE;

	/* CHMODE_KERNEL implies a kernel-originated open (ch_open_special) */
	ASSERT(!(ch_mode & CHMODE_KERNEL) || p == kernproc);
	ASSERT(chr->cr_port != NEXUS_PORT_ANY || (ch_mode & CHMODE_KERNEL));
	SK_LOCK_ASSERT_HELD();

	/* validate thresholds before we proceed any further */
	if ((*err = ch_ev_thresh_validate(nx, NR_TX, &chr->cr_tx_lowat)) != 0 ||
	    (*err = ch_ev_thresh_validate(nx, NR_RX, &chr->cr_rx_lowat)) != 0) {
		goto done;
	}

	/* user-mode opens require a provider that allows user channels */
	if (!(ch_mode & CHMODE_KERNEL) && !NX_USER_CHANNEL_PROV(nx)) {
		*err = ENOTSUP;
		goto done;
	}

	ch = ch_alloc(Z_WAITOK);

	lck_mtx_lock(&ch->ch_lock);

	uuid_generate_random(ch->ch_info->cinfo_ch_id);
	ch->ch_fd = fd;
	ch->ch_pid = proc_pid(p);
	(void) snprintf(ch->ch_name, sizeof(ch->ch_name), "%s",
	    proc_name_address(p));

	nxdom_prov = NX_DOM_PROV(nx);
	nxdom = NX_DOM(nx);

	if (ch_mode & (CHMODE_KERNEL | CHMODE_NO_NXREF)) {
		/*
		 * CHANF_KERNEL implies a channel opened by a kernel
		 * subsystem, and is triggered by the CHMODE_KERNEL
		 * flag which (only ever) set by ch_open_special().
		 *
		 * CHANF_NONXREF can be optionally set based on the
		 * CHMODE_NO_NXREF request flag.  This must only be
		 * set by ch_open_special() as well, hence we verify.
		 */
		ASSERT(p == kernproc);
		ASSERT(ch_mode & CHMODE_KERNEL);
		os_atomic_or(&ch->ch_flags, CHANF_KERNEL, relaxed);
		if (ch_mode & CHMODE_NO_NXREF) {
			os_atomic_or(&ch->ch_flags, CHANF_NONXREF, relaxed);
		}

		config = (ch_mode & CHMODE_CONFIG) != 0;
		if (chr->cr_port == NEXUS_PORT_ANY) {
			if (nxdom->nxdom_find_port == NULL) {
				*err = ENOTSUP;
				goto done;
			}

			/*
			 * If ephemeral port request, find one for client;
			 * we ask for the reserved port range if this is
			 * a configuration request (CHMODE_CONFIG).
			 */
			if ((*err = nxdom->nxdom_find_port(nx,
			    config, &chr->cr_port)) != 0) {
				goto done;
			}
		}
	}

	if (skywalk_check_platform_binary(p)) {
		os_atomic_or(&ch->ch_flags, CHANF_PLATFORM, relaxed);
	}

	ASSERT(chr->cr_port != NEXUS_PORT_ANY);

	/* reserved ports may only be bound by config (CHMODE_CONFIG) opens */
	reserved_port = (nxdom->nxdom_port_is_reserved != NULL &&
	    (*nxdom->nxdom_port_is_reserved)(nx, chr->cr_port));
	if (!config && reserved_port) {
		*err = EDOM;
		goto done;
	}

	SK_PDF(SK_VERB_CHANNEL, p, "%snexus port %u requested",
	    reserved_port ? "[reserved] " : "", chr->cr_port);

	/* let the domain provider bind the channel to the nexus port */
	if ((*err = nxdom_prov->nxdom_prov_dom->nxdom_connect(nxdom_prov,
	    nx, ch, chr, nxb, p)) != 0) {
		goto done;
	}

	cinfo = ch->ch_info;
	uuid_copy(cinfo->cinfo_nx_uuid, nx->nx_uuid);
	/* for easy access to immutables */
	bcopy(nx->nx_prov->nxprov_params, &cinfo->cinfo_nxprov_params,
	    sizeof(struct nxprov_params));
	cinfo->cinfo_ch_mode = ch_mode;
	cinfo->cinfo_ch_ring_id = chr->cr_ring_id;
	cinfo->cinfo_nx_port = chr->cr_port;
	cinfo->cinfo_mem_base = ch->ch_mmap.ami_mapaddr;
	cinfo->cinfo_mem_map_size = ch->ch_mmap.ami_mapsize;
	cinfo->cinfo_schema_offset = chr->cr_memoffset;
	cinfo->cinfo_num_bufs =
	    PP_BUF_REGION_DEF(skmem_arena_nexus(ch->ch_na->na_arena)->arn_rx_pp)->skr_params.srp_c_obj_cnt;
	/*
	 * ch_last is really the number of rings, but we need to return
	 * the actual zero-based ring ID to the client.  Make sure that
	 * is the case here and adjust last_{tx,rx}_ring accordingly.
	 */
	ASSERT((ch->ch_last[NR_TX] > 0) ||
	    (ch->ch_na->na_type == NA_NETIF_COMPAT_DEV));
	ASSERT((ch->ch_last[NR_RX] > 0) ||
	    (ch->ch_na->na_type == NA_NETIF_COMPAT_HOST));
	cinfo->cinfo_first_tx_ring = ch->ch_first[NR_TX];
	cinfo->cinfo_last_tx_ring = ch->ch_last[NR_TX] - 1;
	cinfo->cinfo_first_rx_ring = ch->ch_first[NR_RX];
	cinfo->cinfo_last_rx_ring = ch->ch_last[NR_RX] - 1;
	cinfo->cinfo_tx_lowat = chr->cr_tx_lowat;
	cinfo->cinfo_rx_lowat = chr->cr_rx_lowat;

	/*
	 * NONXREF channels go on a separate list and are excluded from
	 * nx_ch_count; only regular channels pin the nexus' channel count.
	 */
	if (ch_mode & CHMODE_NO_NXREF) {
		ASSERT(ch_mode & CHMODE_KERNEL);
		STAILQ_INSERT_TAIL(&nx->nx_ch_nonxref_head, ch, ch_link);
	} else {
		STAILQ_INSERT_TAIL(&nx->nx_ch_head, ch, ch_link);
		nx->nx_ch_count++;
	}
	os_atomic_or(&ch->ch_flags, CHANF_ATTACHED, relaxed);
	ch->ch_nexus = nx;
	nx_retain_locked(nx);   /* hold a ref on the nexus */

	ch_retain_locked(ch);   /* one for being in the list */
	ch_retain_locked(ch);   /* one for the caller */

	/*
	 * Now that we've successfully created the nexus adapter, inform the
	 * nexus provider about the rings and the slots within each ring.
	 * This is a no-op for internal nexus providers.
	 */
	if ((*err = nxprov_advise_connect(nx, ch, p)) != 0) {
		lck_mtx_unlock(&ch->ch_lock);

		/* gracefully close this fully-formed channel */
		if (ch->ch_flags & CHANF_KERNEL) {
			ch_close_special(ch);
		} else {
			ch_close(ch, TRUE);
		}
		/* drop the caller's reference; close dropped the list's */
		(void) ch_release_locked(ch);
		/* ch is now NULL so the done path won't unlock or free it */
		ch = NULL;
		goto done;
	}

	ASSERT(ch->ch_schema == NULL ||
	    (ch->ch_schema->csm_flags & CSM_ACTIVE));

#if SK_LOG
	if (__improbable(sk_verbose != 0)) {
		ch_connect_log1(nx, cinfo, chr, ch, nxdom_prov, p);
	}
#endif /* SK_LOG */

done:
	if (ch != NULL) {
		lck_mtx_unlock(&ch->ch_lock);
	}
	if (*err != 0) {
#if SK_LOG
		if (__improbable(sk_verbose != 0)) {
			ch_connect_log2(nx, *err);
		}
#endif /* SK_LOG */
		/* only reached for pre-attach failures: refcnt is still 0 */
		if (ch != NULL) {
			ch_free(ch);
			ch = NULL;
		}
	}
	return ch;
}
2193 
/*
 * Detach a channel from its nexus: notify the external provider first,
 * then let the domain provider tear down the port instance.  Called
 * with both SK_LOCK and the channel lock held.
 */
static void
ch_disconnect(struct kern_channel *ch)
{
	struct kern_nexus *nx = ch->ch_nexus;
	struct kern_nexus_domain_provider *nxdom_prov = NX_DOM_PROV(nx);

	SK_LOCK_ASSERT_HELD();
	LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);

	/*
	 * Inform the nexus provider that the channel has been quiesced
	 * and disconnected from the nexus port.  This is a no-op for
	 * internal nexus providers.
	 */
	nxprov_advise_disconnect(nx, ch);

	/* Finally, let the domain provider tear down the instance */
	nxdom_prov->nxdom_prov_dom->nxdom_disconnect(nxdom_prov, nx, ch);
}
2213 
/*
 * Permanently mark a channel's user-visible schema as inactive,
 * causing userland to treat the channel as defunct.
 */
void
ch_deactivate(struct kern_channel *ch)
{
	/*
	 * This is a trapdoor flag; once CSM_ACTIVE is cleared,
	 * it will never be set again.  Doing this will cause
	 * os_channel_is_defunct() to indicate that the channel
	 * is defunct and is no longer usable (thus should be
	 * immediately closed).
	 */
	if (ch->ch_schema != NULL &&
	    (ch->ch_schema->csm_flags & CSM_ACTIVE)) {
		/* __DECONST: csm_flags is const-qualified in the schema */
		os_atomic_andnot(__DECONST(uint32_t *, &ch->ch_schema->csm_flags),
		    CSM_ACTIVE, relaxed);
		/* make this globally visible */
		os_atomic_thread_fence(seq_cst);
	}
}
2232 
2233 int
ch_set_opt(struct kern_channel * ch,struct sockopt * sopt)2234 ch_set_opt(struct kern_channel *ch, struct sockopt *sopt)
2235 {
2236 #pragma unused(ch)
2237 	int err = 0;
2238 
2239 	if (sopt->sopt_dir != SOPT_SET) {
2240 		sopt->sopt_dir = SOPT_SET;
2241 	}
2242 
2243 	switch (sopt->sopt_name) {
2244 	case CHOPT_TX_LOWAT_THRESH:
2245 		err = ch_set_lowat_thresh(ch, NR_TX, sopt);
2246 		break;
2247 
2248 	case CHOPT_RX_LOWAT_THRESH:
2249 		err = ch_set_lowat_thresh(ch, NR_RX, sopt);
2250 		break;
2251 
2252 	case CHOPT_IF_ADV_CONF:
2253 		err = ch_configure_interface_advisory_event(ch, sopt);
2254 		break;
2255 
2256 	default:
2257 		err = ENOPROTOOPT;
2258 		break;
2259 	}
2260 
2261 	return err;
2262 }
2263 
2264 int
ch_get_opt(struct kern_channel * ch,struct sockopt * sopt)2265 ch_get_opt(struct kern_channel *ch, struct sockopt *sopt)
2266 {
2267 #pragma unused(ch)
2268 	int err = 0;
2269 
2270 	if (sopt->sopt_dir != SOPT_GET) {
2271 		sopt->sopt_dir = SOPT_GET;
2272 	}
2273 
2274 	switch (sopt->sopt_name) {
2275 	case CHOPT_TX_LOWAT_THRESH:
2276 		err = ch_get_lowat_thresh(ch, NR_TX, sopt);
2277 		break;
2278 
2279 	case CHOPT_RX_LOWAT_THRESH:
2280 		err = ch_get_lowat_thresh(ch, NR_RX, sopt);
2281 		break;
2282 
2283 	default:
2284 		err = ENOPROTOOPT;
2285 		break;
2286 	}
2287 
2288 	return err;
2289 }
2290 
/*
 * Enable or disable interface-advisory event delivery on a channel.
 * The boolean option value is copied in from userland; enabling adds
 * the channel to the nexus' advisory list (taking a reference for the
 * list), disabling removes it (dropping that reference).  Advisory
 * generation on the nexus itself is toggled when the list transitions
 * between empty and non-empty.
 *
 * Entered and exited with ch_lock held, but the lock is dropped and
 * reacquired internally to respect lock ordering (see below).
 */
static int
ch_configure_interface_advisory_event(struct kern_channel *ch,
    struct sockopt *sopt)
{
	int err = 0;
	boolean_t enable = 0;
	struct kern_nexus *nx = ch->ch_nexus;

	LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
	SK_LOCK_ASSERT_NOTHELD();

	if (sopt->sopt_val == USER_ADDR_NULL) {
		return EINVAL;
	}
	/* no-op nexus: advisory region not present */
	if (nx->nx_adv.nxv_adv == NULL) {
		return ENOTSUP;
	}
	err = sooptcopyin(sopt, &enable, sizeof(enable), sizeof(enable));
	if (err != 0) {
		return err;
	}

	/*
	 * Drop ch_lock to acquire sk_lock and nx_ch_if_adv_lock due to lock
	 * ordering requirement; check if the channel is closing once ch_lock
	 * is reacquired and bail if so.
	 */
	lck_mtx_unlock(&ch->ch_lock);
	SK_LOCK();
	lck_rw_lock_exclusive(&nx->nx_ch_if_adv_lock);
	lck_mtx_lock(&ch->ch_lock);
	if (ch->ch_flags & CHANF_CLOSING) {
		err = ENXIO;
		goto done;
	}

	/*
	 * if interface advisory reporting is enabled on the channel then
	 * add the channel to the list of channels eligible for interface
	 * advisory update on the nexus. If disabled, remove from the list.
	 */
	if (enable) {
		/* already enabled: nothing to do */
		if ((ch->ch_flags & CHANF_IF_ADV) != 0) {
			ASSERT(err == 0);
			goto done;
		}
		/* first member turns advisory generation on for the nexus */
		bool enable_adv = STAILQ_EMPTY(&nx->nx_ch_if_adv_head);
		os_atomic_or(&ch->ch_flags, CHANF_IF_ADV, relaxed);
		STAILQ_INSERT_TAIL(&nx->nx_ch_if_adv_head, ch, ch_link_if_adv);
		if (enable_adv) {
			nx_netif_config_interface_advisory(nx, true);
		}
		ch_retain_locked(ch);   /* for being in the IF ADV list */
	} else {
		/* already disabled: nothing to do */
		if ((ch->ch_flags & CHANF_IF_ADV) == 0) {
			ASSERT(err == 0);
			goto done;
		}
		STAILQ_REMOVE(&nx->nx_ch_if_adv_head, ch, kern_channel,
		    ch_link_if_adv);
		os_atomic_andnot(&ch->ch_flags, CHANF_IF_ADV, relaxed);
		/* last member turns advisory generation off for the nexus */
		if (STAILQ_EMPTY(&nx->nx_ch_if_adv_head)) {
			nx_netif_config_interface_advisory(nx, false);
		}
		(void) ch_release_locked(ch);
	}

done:
	/* release in lock order, then restore caller's ch_lock expectation */
	lck_mtx_unlock(&ch->ch_lock);
	lck_rw_done(&nx->nx_ch_if_adv_lock);
	SK_UNLOCK();
	lck_mtx_lock(&ch->ch_lock);

	return err;
}
2366 
2367 static int
ch_set_lowat_thresh(struct kern_channel * ch,enum txrx t,struct sockopt * sopt)2368 ch_set_lowat_thresh(struct kern_channel *ch, enum txrx t,
2369     struct sockopt *sopt)
2370 {
2371 	struct ch_ev_thresh cet, *ocet;
2372 	int err = 0;
2373 
2374 	LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
2375 
2376 	if (sopt->sopt_val == USER_ADDR_NULL) {
2377 		return EINVAL;
2378 	}
2379 
2380 	bzero(&cet, sizeof(cet));
2381 	err = sooptcopyin(sopt, &cet, sizeof(cet), sizeof(cet));
2382 	if (err == 0) {
2383 		err = ch_ev_thresh_validate(ch->ch_nexus, t, &cet);
2384 		if (err == 0) {
2385 			if (t == NR_TX) {
2386 				ocet = &ch->ch_info->cinfo_tx_lowat;
2387 			} else {
2388 				ocet = &ch->ch_info->cinfo_rx_lowat;
2389 			}
2390 
2391 			/* if there is no change, we're done */
2392 			if (ocet->cet_unit == cet.cet_unit &&
2393 			    ocet->cet_value == cet.cet_value) {
2394 				return 0;
2395 			}
2396 
2397 			*ocet = cet;
2398 
2399 			for_rx_tx(t) {
2400 				ring_id_t qfirst = ch->ch_first[t];
2401 				ring_id_t qlast = ch->ch_last[t];
2402 				uint32_t i;
2403 
2404 				for (i = qfirst; i < qlast; i++) {
2405 					struct __kern_channel_ring *kring =
2406 					    &NAKR(ch->ch_na, t)[i];
2407 
2408 					(void) kring->ckr_na_notify(kring,
2409 					    sopt->sopt_p, 0);
2410 				}
2411 			}
2412 
2413 			(void) sooptcopyout(sopt, &cet, sizeof(cet));
2414 		}
2415 	}
2416 
2417 	return err;
2418 }
2419 
2420 static int
ch_get_lowat_thresh(struct kern_channel * ch,enum txrx t,struct sockopt * sopt)2421 ch_get_lowat_thresh(struct kern_channel *ch, enum txrx t,
2422     struct sockopt *sopt)
2423 {
2424 	struct ch_ev_thresh cet;
2425 
2426 	LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
2427 
2428 	if (sopt->sopt_val == USER_ADDR_NULL) {
2429 		return EINVAL;
2430 	}
2431 
2432 	if (t == NR_TX) {
2433 		cet = ch->ch_info->cinfo_tx_lowat;
2434 	} else {
2435 		cet = ch->ch_info->cinfo_rx_lowat;
2436 	}
2437 
2438 	return sooptcopyout(sopt, &cet, sizeof(cet));
2439 }
2440 
2441 static struct kern_channel *
ch_alloc(zalloc_flags_t how)2442 ch_alloc(zalloc_flags_t how)
2443 {
2444 	struct kern_channel *ch;
2445 
2446 	ch = zalloc_flags(ch_zone, how | Z_ZERO);
2447 	if (ch) {
2448 		lck_mtx_init(&ch->ch_lock, &channel_lock_group, &channel_lock_attr);
2449 		ch->ch_info = zalloc_flags(ch_info_zone, how | Z_ZERO);
2450 	}
2451 	return ch;
2452 }
2453 
/*
 * Free a channel object and its ch_info.  Must only be called once the
 * refcount has dropped to zero and the channel is fully detached (no
 * ATTACHED/EXT_CONNECTED/EXT_PRECONNECT/IF_ADV flags remain).
 */
static void
ch_free(struct kern_channel *ch)
{
	ASSERT(ch->ch_refcnt == 0);
	ASSERT(ch->ch_pp == NULL);
	ASSERT(!(ch->ch_flags & (CHANF_ATTACHED | CHANF_EXT_CONNECTED |
	    CHANF_EXT_PRECONNECT | CHANF_IF_ADV)));
	lck_mtx_destroy(&ch->ch_lock, &channel_lock_group);
	SK_DF(SK_VERB_MEM, "ch %p FREE", SK_KVA(ch));
	ASSERT(ch->ch_info != NULL);
	zfree(ch_info_zone, ch->ch_info);
	ch->ch_info = NULL;
	zfree(ch_zone, ch);
}
2468 
/*
 * Take a reference on the channel; caller must hold SK_LOCK, which
 * is what serializes ch_refcnt (no atomics needed here).
 */
void
ch_retain_locked(struct kern_channel *ch)
{
	SK_LOCK_ASSERT_HELD();

	ch->ch_refcnt++;
	VERIFY(ch->ch_refcnt != 0);     /* guard against wraparound */
}
2477 
/* Take a reference on the channel, acquiring SK_LOCK around the bump. */
void
ch_retain(struct kern_channel *ch)
{
	SK_LOCK();
	ch_retain_locked(ch);
	SK_UNLOCK();
}
2485 
/*
 * Drop a reference on the channel; frees it when the count reaches
 * zero.  Returns nonzero iff this call dropped the last reference
 * (i.e. the pre-decrement count was 1).  Caller must hold SK_LOCK.
 */
int
ch_release_locked(struct kern_channel *ch)
{
	/* capture pre-decrement count; ch may be freed below */
	int oldref = ch->ch_refcnt;

	SK_LOCK_ASSERT_HELD();

	VERIFY(ch->ch_refcnt != 0);
	if (--ch->ch_refcnt == 0) {
		ch_free(ch);
	}

	return oldref == 1;
}
2500 
/*
 * Drop a reference on the channel, acquiring SK_LOCK around the drop.
 * Returns nonzero iff this was the last reference.
 */
int
ch_release(struct kern_channel *ch)
{
	int lastref;

	SK_LOCK();
	lastref = ch_release_locked(ch);
	SK_UNLOCK();

	return lastref;
}
2512 
/*
 * Channel destructor: close the channel (dropping its list reference)
 * and then release the caller's reference, all under SK_LOCK.
 */
void
ch_dtor(struct kern_channel *ch)
{
	SK_LOCK();
	ch_close(ch, TRUE);
	(void) ch_release_locked(ch);
	SK_UNLOCK();
}
2521 
/*
 * Publish the packet pool's buffers-in-use count into the channel's
 * user-visible schema so userland can observe it.
 * NOTE(review): no NULL check on ch_schema here — presumably callers
 * only invoke this on channels with a mapped schema; confirm.
 */
void
ch_update_upp_buf_stats(struct kern_channel *ch, struct kern_pbufpool *pp)
{
	uint64_t buf_inuse = pp->pp_u_bufinuse;
	struct __user_channel_schema *csm = ch->ch_schema;
	/* relaxed store: statistic only, no ordering required */
	os_atomic_store(&csm->csm_upp_buf_inuse, buf_inuse, relaxed);
}
2529 
2530 #if SK_LOG
/*
 * Format a one-line description of the channel (address, name, flags)
 * into dst for logging; returns dst.  Debug (SK_LOG) builds only.
 */
SK_NO_INLINE_ATTRIBUTE
char *
ch2str(const struct kern_channel *ch, char *__counted_by(dsz)dst, size_t dsz)
{
	(void) sk_snprintf(dst, dsz, "%p %s flags 0x%b",
	    SK_KVA(ch), ch->ch_name, ch->ch_flags, CHANF_BITS);

	return dst;
}
2540 #endif
2541