/*
 * Copyright (c) 2015-2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/*
 * Copyright (C) 2012-2014 Matteo Landi, Luigi Rizzo, Giuseppe Lettieri.
 * All rights reserved.
 * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   1. Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *   2. Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in the
 *      documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/eventvar.h>
#include <sys/kdebug.h>
#include <sys/sdt.h>
#include <skywalk/os_skywalk_private.h>
#include <skywalk/nexus/netif/nx_netif.h>

#define KEV_EVTID(code) BSDDBG_CODE(DBG_BSD_KEVENT, (code))

struct ch_event_result {
	uint32_t tx_data;
	uint32_t rx_data;
};

static LCK_GRP_DECLARE(channel_lock_group, "sk_ch_lock");
static LCK_GRP_DECLARE(channel_kn_lock_group, "sk_ch_kn_lock");
LCK_ATTR_DECLARE(channel_lock_attr, 0, 0);

static void csi_selrecord(struct ch_selinfo *, struct proc *, void *);
static void csi_selwakeup(struct ch_selinfo *, boolean_t, boolean_t, uint32_t);
static inline void csi_selwakeup_delayed(struct ch_selinfo *);
static inline void csi_selwakeup_common(struct ch_selinfo *, boolean_t,
    boolean_t, boolean_t, uint32_t);
static boolean_t csi_tcall_start(struct ch_selinfo *);
static void csi_tcall(thread_call_param_t, thread_call_param_t);
static uint64_t csi_tcall_update_interval(struct ch_selinfo *);

static void ch_redzone_init(void);
static void ch_close_common(struct kern_channel *, boolean_t, boolean_t);
static struct kern_channel *ch_find(struct kern_nexus *, nexus_port_t,
    ring_id_t);
static int ch_ev_thresh_validate(struct kern_nexus *, enum txrx,
    struct ch_ev_thresh *);
static struct kern_channel *ch_connect(struct kern_nexus *, struct chreq *,
    struct kern_channel *, struct nxbind *, struct proc *, int, int *);
static void ch_disconnect(struct kern_channel *);
static int ch_set_lowat_thresh(struct kern_channel *, enum txrx,
    struct sockopt *);
static int ch_get_lowat_thresh(struct kern_channel *, enum txrx,
    struct sockopt *);
static struct kern_channel *ch_alloc(zalloc_flags_t);
static void ch_free(struct kern_channel *);
static int ch_configure_interface_advisory_event(struct kern_channel *ch,
    struct sockopt *sopt);

static int filt_chrwattach(struct knote *, struct kevent_qos_s *kev);
static void filt_chrwdetach(struct knote *, boolean_t);
static void filt_chrdetach(struct knote *);
static void filt_chwdetach(struct knote *);
static int filt_chrw(struct knote *, long, int);
static int filt_chread(struct knote *, long);
static int filt_chwrite(struct knote *, long);

static int filt_chtouch(struct knote *, struct kevent_qos_s *, int);
static int filt_chrtouch(struct knote *, struct kevent_qos_s *);
static int filt_chwtouch(struct knote *, struct kevent_qos_s *);
static int filt_chprocess(struct knote *, struct kevent_qos_s *, int);
static int filt_chrprocess(struct knote *, struct kevent_qos_s *);
static int filt_chwprocess(struct knote *, struct kevent_qos_s *);
static int filt_che_attach(struct knote *, struct kevent_qos_s *kev);
static void filt_che_detach(struct knote *);
static int filt_che_event(struct knote *, long);
static int filt_che_touch(struct knote *, struct kevent_qos_s *);
static int filt_che_process(struct knote *, struct kevent_qos_s *);
static int filt_chan_extended_common(struct knote *, long);

static int ch_event(struct kern_channel *ch, int events,
    void *wql, struct proc *p, struct ch_event_result *,
    const boolean_t is_kevent, int *errno, const boolean_t);

const struct filterops skywalk_channel_rfiltops = {
	.f_isfd =       1,
	.f_attach =     filt_chrwattach,
	.f_detach =     filt_chrdetach,
	.f_event =      filt_chread,
	.f_touch =      filt_chrtouch,
	.f_process =    filt_chrprocess,
};

const struct filterops skywalk_channel_wfiltops = {
	.f_isfd =       1,
	.f_attach =     filt_chrwattach,
	.f_detach =     filt_chwdetach,
	.f_event =      filt_chwrite,
	.f_touch =      filt_chwtouch,
	.f_process =    filt_chwprocess,
};

const struct filterops skywalk_channel_efiltops = {
	.f_isfd =       1,
	.f_attach =     filt_che_attach,
	.f_detach =     filt_che_detach,
	.f_event =      filt_che_event,
	.f_touch =      filt_che_touch,
	.f_process =    filt_che_process,
};

/* mitigation intervals in ns */
#define CH_MIT_IVAL_MIN         NSEC_PER_USEC

static uint64_t ch_mit_ival = CH_MIT_IVAL_DEFAULT;

#if (DEVELOPMENT || DEBUG)
SYSCTL_NODE(_kern_skywalk, OID_AUTO, channel,
    CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Skywalk channel parameters");
SYSCTL_QUAD(_kern_skywalk_channel, OID_AUTO, mit_ival,
    CTLFLAG_RW | CTLFLAG_LOCKED, &ch_mit_ival, "");
#endif /* !DEVELOPMENT && !DEBUG */
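
/*
 * Illustrative sketch (not part of the original source): on DEVELOPMENT
 * or DEBUG kernels the mitigation interval above is exported as a sysctl,
 * so it can be inspected and tuned from userspace; the OID name below
 * assumes the parent kern.skywalk node is registered elsewhere:
 *
 *	#include <sys/sysctl.h>
 *
 *	uint64_t ival;
 *	size_t len = sizeof(ival);
 *	if (sysctlbyname("kern.skywalk.channel.mit_ival", &ival, &len,
 *	    NULL, 0) == 0) {
 *		ival *= 2;      // e.g. double the mitigation interval
 *		(void) sysctlbyname("kern.skywalk.channel.mit_ival",
 *		    NULL, NULL, &ival, sizeof(ival));
 *	}
 */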

static SKMEM_TYPE_DEFINE(ch_zone, struct kern_channel);

static SKMEM_TYPE_DEFINE(ch_info_zone, struct ch_info);

static int __ch_inited = 0;

/*
 * Global cookie to hold the random number used for verifying
 * user metadata red zone violations.
 */
uint64_t __ch_umd_redzone_cookie = 0;

#define SKMEM_TAG_CH_KEY        "com.apple.skywalk.channel.key"
SKMEM_TAG_DEFINE(skmem_tag_ch_key, SKMEM_TAG_CH_KEY);

static void
ch_redzone_init(void)
{
	_CASSERT(sizeof(__ch_umd_redzone_cookie) ==
	    sizeof(((struct __metadata_preamble *)0)->mdp_redzone));
	_CASSERT(METADATA_PREAMBLE_SZ == sizeof(struct __metadata_preamble));
	_CASSERT(sizeof(struct __slot_desc) == 8);

	/* Initialize random user red zone cookie value */
	do {
		read_random(&__ch_umd_redzone_cookie,
		    sizeof(__ch_umd_redzone_cookie));
	} while (__ch_umd_redzone_cookie == 0);

	SK_D("__ch_umd_redzone_cookie: 0x%llx", __ch_umd_redzone_cookie);
}
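
/*
 * Illustrative sketch (not from the original source): the cookie above is
 * stamped into each user metadata object's red zone when it is exposed to
 * userspace, and verified when the metadata comes back in.  A hypothetical
 * validator would look roughly like the following; the real check lives
 * with the metadata code and may mix in additional per-object state:
 *
 *	static boolean_t
 *	umd_redzone_ok(const struct __metadata_preamble *mdp)
 *	{
 *		// any mismatch means userspace wrote past its area
 *		return mdp->mdp_redzone == __ch_umd_redzone_cookie;
 *	}
 */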

int
channel_init(void)
{
	int error = 0;

	SK_LOCK_ASSERT_HELD();
	ASSERT(!__ch_inited);

	_CASSERT(offsetof(struct __user_packet, pkt_qum) == 0);
	_CASSERT(offsetof(struct __kern_packet, pkt_qum) == 0);

	ch_redzone_init();

	__ch_inited = 1;

	return error;
}

void
channel_fini(void)
{
	SK_LOCK_ASSERT_HELD();

	if (__ch_inited) {
		__ch_umd_redzone_cookie = 0;
		__ch_inited = 0;
	}
}

void
csi_init(struct ch_selinfo *csi, boolean_t mitigation, uint64_t mit_ival)
{
	csi->csi_flags = 0;
	csi->csi_pending = 0;
	if (mitigation) {
		csi->csi_interval = mit_ival;
		csi->csi_eff_interval = ch_mit_ival;    /* global override */
		os_atomic_or(&csi->csi_flags, CSI_MITIGATION, relaxed);
		csi->csi_tcall = thread_call_allocate_with_options(csi_tcall,
		    csi, THREAD_CALL_PRIORITY_KERNEL, THREAD_CALL_OPTIONS_ONCE);
		/* this must not fail */
		VERIFY(csi->csi_tcall != NULL);
	} else {
		csi->csi_interval = 0;
		csi->csi_eff_interval = 0;
		csi->csi_tcall = NULL;
	}
	lck_mtx_init(&csi->csi_lock, &channel_kn_lock_group, &channel_lock_attr);
	klist_init(&csi->csi_si.si_note);
}

void
csi_destroy(struct ch_selinfo *csi)
{
	/* check if not already destroyed, else do it now */
	if ((os_atomic_or_orig(&csi->csi_flags, CSI_DESTROYED, relaxed) &
	    CSI_DESTROYED) == 0) {
		CSI_LOCK(csi);
		/* must have been set by above atomic op */
		VERIFY(csi->csi_flags & CSI_DESTROYED);
		if (csi->csi_flags & CSI_MITIGATION) {
			thread_call_t tcall = csi->csi_tcall;
			VERIFY(tcall != NULL);
			CSI_UNLOCK(csi);

			(void) thread_call_cancel_wait(tcall);
			if (!thread_call_free(tcall)) {
				boolean_t freed;
				(void) thread_call_cancel_wait(tcall);
				freed = thread_call_free(tcall);
				VERIFY(freed);
			}

			CSI_LOCK(csi);
			csi->csi_tcall = NULL;
			os_atomic_andnot(&csi->csi_flags, CSI_MITIGATION,
			    relaxed);
		}
		csi->csi_pending = 0;
		CSI_UNLOCK(csi);

		selthreadclear(&csi->csi_si);
		/* now we don't need the mutex anymore */
		lck_mtx_destroy(&csi->csi_lock, &channel_kn_lock_group);
	}
}
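
/*
 * Illustrative sketch (call sites and interval are hypothetical): the
 * csi_init()/csi_destroy() pair above is used by the ring setup and
 * teardown paths roughly as follows:
 *
 *	struct ch_selinfo csi;
 *
 *	// enable wakeup mitigation with a 100us interval
 *	csi_init(&csi, TRUE, 100 * NSEC_PER_USEC);
 *
 *	// ... csi_selrecord_*() / csi_selwakeup_*() activity ...
 *
 *	// safe to call once; the CSI_DESTROYED flag guards re-entry
 *	csi_destroy(&csi);
 */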

/*
 * Called only for select(2).
 */
__attribute__((always_inline))
static inline void
csi_selrecord(struct ch_selinfo *csi, struct proc *p, void *wql)
{
	struct selinfo *si = &csi->csi_si;

	CSI_LOCK_ASSERT_HELD(csi);
	selrecord(p, si, wql);
}

void
csi_selrecord_one(struct __kern_channel_ring *kring, struct proc *p, void *wql)
{
	struct ch_selinfo *csi = &kring->ckr_si;

	CSI_LOCK(csi);
	SK_DF(SK_VERB_EVENTS, "[%s] na \"%s\" (0x%llx) kr %s (0x%llx) "
	    "si 0x%llx si_flags 0x%x", (kring->ckr_tx == NR_TX) ? "W" : "R",
	    KRNA(kring)->na_name, SK_KVA(KRNA(kring)), kring->ckr_name,
	    SK_KVA(kring), SK_KVA(&csi->csi_si), csi->csi_si.si_flags);

	csi_selrecord(csi, p, wql);
	CSI_UNLOCK(csi);
}

void
csi_selrecord_all(struct nexus_adapter *na, enum txrx t, struct proc *p,
    void *wql)
{
	struct ch_selinfo *csi = &na->na_si[t];

	CSI_LOCK(csi);
	SK_DF(SK_VERB_EVENTS, "[%s] na \"%s\" (0x%llx) si 0x%llx si_flags 0x%x",
	    (t == NR_TX) ? "W" : "R", na->na_name, SK_KVA(na),
	    SK_KVA(&csi->csi_si), csi->csi_si.si_flags);

	csi_selrecord(csi, p, wql);
	CSI_UNLOCK(csi);
}

/*
 * Called from na_post_event().
 */
__attribute__((always_inline))
static inline void
csi_selwakeup(struct ch_selinfo *csi, boolean_t within_kevent,
    boolean_t selwake, uint32_t hint)
{
	struct selinfo *si = &csi->csi_si;

	CSI_LOCK_ASSERT_HELD(csi);
	csi->csi_pending = 0;
	if (selwake) {
		selwakeup(si);
	}
	if ((csi->csi_flags & CSI_KNOTE) && !within_kevent) {
		KNOTE(&si->si_note, hint);
	}
}

__attribute__((always_inline))
static inline void
csi_selwakeup_delayed(struct ch_selinfo *csi)
{
	CSI_LOCK_ASSERT_HELD(csi);
	ASSERT(csi->csi_flags & CSI_MITIGATION);
	ASSERT(csi->csi_tcall != NULL);

	if (thread_call_isactive(csi->csi_tcall)) {
		csi->csi_pending++;
	} else if (!csi_tcall_start(csi)) {
		csi_selwakeup(csi, FALSE, FALSE, 0);
	}
}

__attribute__((always_inline))
static inline void
csi_selwakeup_common(struct ch_selinfo *csi, boolean_t nodelay,
    boolean_t within_kevent, boolean_t selwake, uint32_t hint)
{
	CSI_LOCK_ASSERT_HELD(csi);

	if (nodelay || within_kevent || !selwake || hint != 0 ||
	    !(csi->csi_flags & CSI_MITIGATION)) {
		csi_selwakeup(csi, within_kevent, selwake, hint);
	} else {
		csi_selwakeup_delayed(csi);
	}
}

void
csi_selwakeup_one(struct __kern_channel_ring *kring, boolean_t nodelay,
    boolean_t within_kevent, boolean_t selwake, uint32_t hint)
{
	struct ch_selinfo *csi = &kring->ckr_si;

	CSI_LOCK(csi);
	SK_DF(SK_VERB_EVENTS, "[%s] na \"%s\" (0x%llx) kr %s (0x%llx) "
	    "si 0x%llx si_flags 0x%x nodelay %u kev %u sel %u hint 0x%b",
	    (kring->ckr_tx == NR_TX) ? "W" : "R", KRNA(kring)->na_name,
	    SK_KVA(KRNA(kring)), kring->ckr_name, SK_KVA(kring),
	    SK_KVA(&csi->csi_si), csi->csi_si.si_flags, nodelay,
	    within_kevent, selwake, hint, CHAN_FILT_HINT_BITS);

	csi_selwakeup_common(csi, nodelay, within_kevent, selwake, hint);
	CSI_UNLOCK(csi);
}

void
csi_selwakeup_all(struct nexus_adapter *na, enum txrx t, boolean_t nodelay,
    boolean_t within_kevent, boolean_t selwake, uint32_t hint)
{
	struct ch_selinfo *csi = &na->na_si[t];

	CSI_LOCK(csi);
	SK_DF(SK_VERB_EVENTS, "[%s] na \"%s\" (0x%llx) si 0x%llx "
	    "si_flags 0x%x nodelay %u kev %u sel %u hint 0x%b",
	    (t == NR_TX) ? "W" : "R", na->na_name, SK_KVA(na),
	    SK_KVA(&csi->csi_si), csi->csi_si.si_flags, nodelay,
	    within_kevent, selwake, hint, CHAN_FILT_HINT_BITS);

	switch (t) {
	case NR_RX:
		if (!(na->na_flags & NAF_RX_MITIGATION)) {
			nodelay = TRUE;
		}
		break;

	case NR_TX:
		if (!(na->na_flags & NAF_TX_MITIGATION)) {
			nodelay = TRUE;
		}
		break;

	default:
		nodelay = TRUE;
		break;
	}
	csi_selwakeup_common(csi, nodelay, within_kevent, selwake, hint);
	CSI_UNLOCK(csi);
}

static boolean_t
csi_tcall_start(struct ch_selinfo *csi)
{
	uint64_t now, ival, deadline;

	CSI_LOCK_ASSERT_HELD(csi);
	ASSERT(csi->csi_flags & CSI_MITIGATION);
	ASSERT(csi->csi_tcall != NULL);

	/* pick up latest value */
	ival = csi_tcall_update_interval(csi);

	/* if no mitigation, pass notification up now */
	if (__improbable(ival == 0)) {
		return FALSE;
	}

	deadline = now = mach_absolute_time();
	clock_deadline_for_periodic_event(ival, now, &deadline);
	(void) thread_call_enter_delayed(csi->csi_tcall, deadline);

	return TRUE;
}

static void
csi_tcall(thread_call_param_t arg0, thread_call_param_t arg1)
{
#pragma unused(arg1)
	struct ch_selinfo *csi = arg0;

	CSI_LOCK(csi);
	csi_selwakeup(csi, FALSE, FALSE, 0);
	CSI_UNLOCK(csi);

	CSI_LOCK(csi);
	if (__improbable((csi->csi_flags & CSI_DESTROYED) == 0 &&
	    csi->csi_pending != 0 && !csi_tcall_start(csi))) {
		csi_selwakeup(csi, FALSE, FALSE, 0);
	}
	CSI_UNLOCK(csi);
}

__attribute__((always_inline))
static inline uint64_t
csi_tcall_update_interval(struct ch_selinfo *csi)
{
	uint64_t i = ch_mit_ival;

	/* if global override was adjusted, update local copies */
	if (__improbable(csi->csi_eff_interval != i)) {
		ASSERT(csi->csi_flags & CSI_MITIGATION);
		csi->csi_interval = csi->csi_eff_interval =
		    ((i == 0) ? 0 : MAX(i, CH_MIT_IVAL_MIN));
	}

	return csi->csi_interval;
}
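
/*
 * Worked example (illustrative): with CH_MIT_IVAL_MIN == NSEC_PER_USEC,
 * lowering the global override ch_mit_ival to 200 yields
 *
 *	csi_interval = MAX(200, NSEC_PER_USEC) = 1000ns
 *
 * on the next call, i.e. the clamp bounds how often the thread call can
 * fire; setting ch_mit_ival to 0 yields csi_interval = 0, which makes
 * csi_tcall_start() return FALSE so the caller delivers the wakeup
 * immediately instead of arming the thread call.
 */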

/* return EV_EOF if the channel is defunct */
static inline boolean_t
ch_filt_check_defunct(struct kern_channel *ch, struct knote *kn)
{
	if (__improbable((ch->ch_flags & CHANF_DEFUNCT) != 0)) {
		if (kn) {
			kn->kn_flags |= EV_EOF;
		}
		return TRUE;
	}
	return FALSE;
}

static void
filt_chrwdetach(struct knote *kn, boolean_t write)
{
	struct kern_channel *ch = (struct kern_channel *)knote_kn_hook_get_raw(kn);
	struct ch_selinfo *csi;
	struct selinfo *si;

	lck_mtx_lock(&ch->ch_lock);
	csi = ch->ch_si[write ? NR_TX : NR_RX];
	si = &csi->csi_si;

	CSI_LOCK(csi);
	SK_DF(SK_VERB_EVENTS, "na \"%s\" (0x%llx) ch 0x%llx kn 0x%llx (%s%s) "
	    "si_flags 0x%x", ch->ch_na->na_name, SK_KVA(ch->ch_na),
	    SK_KVA(ch), SK_KVA(kn), (kn->kn_flags & EV_POLL) ? "poll," : "",
	    write ? "write" : "read", si->si_flags);

	if (KNOTE_DETACH(&si->si_note, kn)) {
		os_atomic_andnot(&csi->csi_flags, CSI_KNOTE, relaxed);
	}

	CSI_UNLOCK(csi);
	lck_mtx_unlock(&ch->ch_lock);
}

static void
filt_chrdetach(struct knote *kn)
{
	ASSERT(kn->kn_filter == EVFILT_READ);
	filt_chrwdetach(kn, FALSE);
}

static void
filt_chwdetach(struct knote *kn)
{
	ASSERT(kn->kn_filter == EVFILT_WRITE);
	filt_chrwdetach(kn, TRUE);
}

/*
 * Callback from notifies (generated externally).  This always marks
 * the knote as activated, so always return 1.
 */
static int
filt_chrw(struct knote *kn, long hint, int events)
{
#if SK_LOG
	struct kern_channel *ch = knote_kn_hook_get_raw(kn);
#else
#pragma unused(kn)
#pragma unused(hint)
#pragma unused(events)
#endif
	SK_DF(SK_VERB_EVENTS, "na \"%s\" (0x%llx) ch 0x%llx "
	    "kn 0x%llx (%s%s) hint 0x%x", ch->ch_na->na_name,
	    SK_KVA(ch->ch_na), SK_KVA(ch), SK_KVA(kn),
	    (kn->kn_flags & EV_POLL) ? "poll," : "",
	    (events == POLLOUT) ?  "write" : "read",
	    (uint32_t)hint);

	/* assume we are ready */
	return 1;
}

static int
filt_chread(struct knote *kn, long hint)
{
	ASSERT(kn->kn_filter == EVFILT_READ);
	/* There is no hint for read/write event */
	if (hint != 0) {
		return 0;
	}
	return filt_chrw(kn, hint, POLLIN);
}

static int
filt_chwrite(struct knote *kn, long hint)
{
	ASSERT(kn->kn_filter == EVFILT_WRITE);
	/* There is no hint for read/write event */
	if (hint != 0) {
		return 0;
	}
	return filt_chrw(kn, hint, POLLOUT);
}

static int
filt_chtouch(struct knote *kn, struct kevent_qos_s *kev, int events)
{
#pragma unused(kev)
	struct kern_channel *ch = knote_kn_hook_get_raw(kn);
	int ev = kn->kn_filter;
	enum txrx dir = (ev == EVFILT_WRITE) ? NR_TX : NR_RX;
	int event_error = 0;
	int revents;

	/* save off the new input fflags and data */
	kn->kn_sfflags = kev->fflags;
	kn->kn_sdata = kev->data;

	lck_mtx_lock(&ch->ch_lock);
	if (__improbable(ch_filt_check_defunct(ch, kn))) {
		lck_mtx_unlock(&ch->ch_lock);
		return 1;
	}

	/* if a note-specific low watermark is given, validate it */
	if (kn->kn_sfflags & NOTE_LOWAT) {
		struct ch_ev_thresh note_thresh = {
			.cet_unit = (dir == NR_TX) ?
		    ch->ch_info->cinfo_tx_lowat.cet_unit :
		    ch->ch_info->cinfo_rx_lowat.cet_unit,
			.cet_value = (uint32_t)kn->kn_sdata
		};
		if (ch_ev_thresh_validate(ch->ch_na->na_nx, dir,
		    &note_thresh) != 0) {
			SK_ERR("invalid NOTE_LOWAT threshold %u",
			    note_thresh.cet_value);
			knote_set_error(kn, EINVAL);
			lck_mtx_unlock(&ch->ch_lock);
			return 1;
		}
	}

	/* capture new state just so we can return it */
	revents = ch_event(ch, events, NULL, knote_get_kq(kn)->kq_p, NULL, TRUE,
	    &event_error, FALSE);
	lck_mtx_unlock(&ch->ch_lock);

	if (revents & POLLERR) {
		ASSERT(event_error != 0);
		/*
		 * Setting a knote error here will confuse libdispatch, so we
		 * use EV_EOF instead.
		 */
		kn->kn_flags |= EV_EOF;
		return 1;
	} else {
		return (events & revents) != 0;
	}
}

static int
filt_chrtouch(struct knote *kn, struct kevent_qos_s *kev)
{
	ASSERT(kn->kn_filter == EVFILT_READ);

	if (kev->flags & EV_ENABLE) {
		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KNOTE_ENABLE),
		    kn->kn_udata, kn->kn_status | (kn->kn_id << 32),
		    kn->kn_filtid, VM_KERNEL_UNSLIDE_OR_PERM(
			    ((struct kern_channel *)knote_kn_hook_get_raw(kn))->ch_na));
	}

	return filt_chtouch(kn, kev, POLLIN);
}

static int
filt_chwtouch(struct knote *kn, struct kevent_qos_s *kev)
{
	ASSERT(kn->kn_filter == EVFILT_WRITE);
	return filt_chtouch(kn, kev, POLLOUT);
}


/*
 * Called from kevent.  We call ch_event(POLL[IN|OUT]) and
 * return 0/1 accordingly.
 */
static int
filt_chprocess(struct knote *kn, struct kevent_qos_s *kev, int events)
{
	struct kern_channel *ch = knote_kn_hook_get_raw(kn);
	struct ch_event_result result;
	uint32_t lowat;
	int trigger_event = 1;
	int revents;
	int event_error;
	int64_t data;

	lck_mtx_lock(&ch->ch_lock);
	if (__improbable(ch_filt_check_defunct(ch, kn))) {
		knote_fill_kevent(kn, kev, 0);
		lck_mtx_unlock(&ch->ch_lock);
		return 1;
	}

	revents = ch_event(ch, events, NULL, knote_get_kq(kn)->kq_p, &result,
	    TRUE, &event_error, FALSE);

	if (revents & POLLERR) {
		ASSERT(event_error != 0);
		lck_mtx_unlock(&ch->ch_lock);
		/*
		 * Setting a knote error here will confuse libdispatch, so we
		 * use EV_EOF instead.
		 */
		kn->kn_flags |= EV_EOF;
		knote_fill_kevent_with_sdata(kn, kev);
		return 1;
	}

	trigger_event = (events & revents) != 0;

	if (events == POLLOUT) {
		lowat = ch->ch_info->cinfo_tx_lowat.cet_value;
		if ((kn->kn_sfflags & NOTE_LOWAT) &&
		    kn->kn_sdata > lowat) {
			lowat = (uint32_t)kn->kn_sdata;
		}

		data = result.tx_data;

		if (result.tx_data < lowat) {
			trigger_event = 0;
		}
	} else {
		lowat = ch->ch_info->cinfo_rx_lowat.cet_value;
		if ((kn->kn_sfflags & NOTE_LOWAT) &&
		    kn->kn_sdata > lowat) {
			lowat = (uint32_t)kn->kn_sdata;
		}

		data = result.rx_data;

		if (result.rx_data < lowat) {
			trigger_event = 0;
		}
	}

	if (trigger_event) {
		knote_fill_kevent(kn, kev, data);
	}

	lck_mtx_unlock(&ch->ch_lock);

	return trigger_event;
}

static int
filt_chrprocess(struct knote *kn, struct kevent_qos_s *kev)
{
	ASSERT(kn->kn_filter == EVFILT_READ);
	return filt_chprocess(kn, kev, POLLIN);
}

static int
filt_chwprocess(struct knote *kn, struct kevent_qos_s *kev)
{
	ASSERT(kn->kn_filter == EVFILT_WRITE);
	return filt_chprocess(kn, kev, POLLOUT);
}
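
/*
 * Illustrative sketch (not part of the original source): a userspace view
 * of the read-side filter above.  With NOTE_LOWAT, filt_chprocess()
 * suppresses delivery until the ready count reaches the given threshold
 * (in bytes or slots, per the channel's cinfo_rx_lowat.cet_unit), and the
 * ready count is returned in the event's data field.  ch_fd is assumed to
 * be a previously opened channel descriptor:
 *
 *	#include <sys/event.h>
 *
 *	struct kevent kev;
 *	int kq = kqueue();
 *
 *	EV_SET(&kev, ch_fd, EVFILT_READ, EV_ADD | EV_ENABLE,
 *	    NOTE_LOWAT, 4096, NULL);    // fire once >= 4096 units ready
 *	(void) kevent(kq, &kev, 1, NULL, 0, NULL);
 *
 *	// a later kevent(kq, NULL, 0, &kev, 1, NULL) wakeup carries the
 *	// ready count in kev.data (see struct ch_event_result above)
 */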

static int
filt_chrwattach(struct knote *kn, __unused struct kevent_qos_s *kev)
{
	struct kern_channel *ch = (struct kern_channel *)knote_kn_hook_get_raw(kn);
	struct nexus_adapter *na;
	struct ch_selinfo *csi;
	int ev = kn->kn_filter;
	enum txrx dir = (ev == EVFILT_WRITE) ? NR_TX : NR_RX;
	int revents;
	int events;
	int event_error = 0;

	ASSERT((kn->kn_filter == EVFILT_READ) ||
	    (kn->kn_filter == EVFILT_WRITE));

	/* ch_kqfilter() should have acquired the lock */
	LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);

	na = ch->ch_na;
	/* if a note-specific low watermark is given, validate it */
	if (kn->kn_sfflags & NOTE_LOWAT) {
		struct ch_ev_thresh note_thresh = {
			.cet_unit = (dir == NR_TX) ?
		    ch->ch_info->cinfo_tx_lowat.cet_unit :
		    ch->ch_info->cinfo_rx_lowat.cet_unit,
			.cet_value = (uint32_t)kn->kn_sdata
		};
		if (ch_ev_thresh_validate(ch->ch_na->na_nx, dir,
		    &note_thresh) != 0) {
			SK_ERR("invalid NOTE_LOWAT threshold %u",
			    note_thresh.cet_value);
			knote_set_error(kn, EINVAL);
			return 0;
		}
	}

	/* the si is indicated in the channel */
	csi = ch->ch_si[dir];
	CSI_LOCK(csi);

	if (KNOTE_ATTACH(&csi->csi_si.si_note, kn)) {
		os_atomic_or(&csi->csi_flags, CSI_KNOTE, relaxed);
	}

	CSI_UNLOCK(csi);

	SK_DF(SK_VERB_EVENTS, "na \"%s\" (0x%llx) ch 0x%llx kn 0x%llx (%s%s)",
	    na->na_name, SK_KVA(na), SK_KVA(ch), SK_KVA(kn),
	    (kn->kn_flags & EV_POLL) ? "poll," : "",
	    (ev == EVFILT_WRITE) ?  "write" : "read");

	/* capture current state */
	events = (ev == EVFILT_WRITE) ? POLLOUT : POLLIN;

	if (__improbable(ch_filt_check_defunct(ch, kn))) {
		revents = events;
	} else {
		/* filt_chprocess() will fill in the kn_sdata field */
		revents = ch_event(ch, events, NULL, knote_get_kq(kn)->kq_p,
		    NULL, TRUE, &event_error, FALSE);
	}

	if (revents & POLLERR) {
		ASSERT(event_error != 0);
		kn->kn_flags |= EV_EOF;
		return 1;
	} else {
		return (events & revents) != 0;
	}
}

static int
filt_chan_extended_common(struct knote *kn, long ev_hint)
{
	/*
	 * This function is not always called with the same set of locks held,
	 * hence it is only allowed to manipulate kn_fflags, with atomics.
	 *
	 * The f_event / f_process functions may run concurrently.
	 */
	uint32_t add_fflags = 0;

	if ((ev_hint & CHAN_FILT_HINT_FLOW_ADV_UPD) != 0) {
		add_fflags |= NOTE_FLOW_ADV_UPDATE;
	}
	if ((ev_hint & CHAN_FILT_HINT_CHANNEL_EVENT) != 0) {
		add_fflags |= NOTE_CHANNEL_EVENT;
	}
	if ((ev_hint & CHAN_FILT_HINT_IF_ADV_UPD) != 0) {
		add_fflags |= NOTE_IF_ADV_UPD;
	}
	if (add_fflags) {
		/* Reset any events that are not requested on this knote */
		add_fflags &= (kn->kn_sfflags & EVFILT_NW_CHANNEL_ALL_MASK);
		os_atomic_or(&kn->kn_fflags, add_fflags, relaxed);
		return add_fflags != 0;
	}
	return os_atomic_load(&kn->kn_fflags, relaxed) != 0;
}
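
/*
 * Illustrative sketch (not part of the original source): registering for
 * the extended channel events handled above.  EVFILT_NW_CHANNEL and its
 * NOTE_* flags are private interfaces; the fflags requested at EV_SET time
 * gate which hints filt_chan_extended_common() will surface, and the
 * filter auto-clears as if EV_CLEAR were set.  kq and ch_fd are assumed
 * to exist as in the earlier sketch:
 *
 *	struct kevent kev;
 *
 *	EV_SET(&kev, ch_fd, EVFILT_NW_CHANNEL, EV_ADD | EV_ENABLE,
 *	    NOTE_FLOW_ADV_UPDATE | NOTE_CHANNEL_EVENT, 0, NULL);
 *	(void) kevent(kq, &kev, 1, NULL, 0, NULL);
 *
 *	// on wakeup, kev.fflags reports which requested events fired
 */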

static inline void
che_process_channel_event(struct kern_channel *ch, struct knote *kn,
    uint32_t fflags, long *hint)
{
	int revents, event_error = 0;

	LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
	*hint &= ~CHAN_FILT_HINT_CHANNEL_EVENT;

	if (((ch->ch_flags & CHANF_EVENT_RING) != 0) &&
	    ((fflags & NOTE_CHANNEL_EVENT) != 0)) {
		/* capture new state to return */
		revents = ch_event(ch, POLLIN, NULL, knote_get_kq(kn)->kq_p,
		    NULL, TRUE, &event_error, TRUE);
		if (revents & POLLERR) {
			ASSERT(event_error != 0);
			/*
			 * Setting a knote error here will confuse libdispatch,
			 * so we use EV_EOF instead.
			 */
			kn->kn_flags |= EV_EOF;
		} else if ((revents & POLLIN) != 0) {
			*hint |= CHAN_FILT_HINT_CHANNEL_EVENT;
		}
	}
	/*
	 * If the sync operation on the event ring didn't find any events,
	 * indicate that the channel event is not active.
	 */
	if ((*hint & CHAN_FILT_HINT_CHANNEL_EVENT) == 0) {
		/*
		 * Avoid a costly atomic when the bit is already cleared.
		 */
		uint32_t knfflags = os_atomic_load(&kn->kn_fflags, relaxed);
		if (knfflags & CHAN_FILT_HINT_CHANNEL_EVENT) {
			os_atomic_andnot(&kn->kn_fflags,
			    CHAN_FILT_HINT_CHANNEL_EVENT, relaxed);
		}
	}
}

static int
filt_che_attach(struct knote *kn, __unused struct kevent_qos_s *kev)
{
	struct kern_channel *ch = (struct kern_channel *)knote_kn_hook_get_raw(kn);
	struct ch_selinfo *csi;
	long hint = 0;

	_CASSERT(CHAN_FILT_HINT_FLOW_ADV_UPD == NOTE_FLOW_ADV_UPDATE);
	_CASSERT(CHAN_FILT_HINT_CHANNEL_EVENT == NOTE_CHANNEL_EVENT);
	_CASSERT(CHAN_FILT_HINT_IF_ADV_UPD == NOTE_IF_ADV_UPD);

	ASSERT(kn->kn_filter == EVFILT_NW_CHANNEL);

	/* ch_kqfilter() should have acquired the lock */
	LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);

	csi = ch->ch_si[NR_TX];
	CSI_LOCK(csi);
	if (KNOTE_ATTACH(&csi->csi_si.si_note, kn)) {
		os_atomic_or(&csi->csi_flags, CSI_KNOTE, relaxed);
	}
	CSI_UNLOCK(csi);

	if (__improbable(ch_filt_check_defunct(ch, kn))) {
		return 1;
	}
	if ((kn->kn_sfflags & NOTE_CHANNEL_EVENT) != 0) {
		os_atomic_or(&ch->ch_na->na_flags, NAF_CHANNEL_EVENT_ATTACHED, relaxed);
	}
	che_process_channel_event(ch, kn, kn->kn_sfflags, &hint);
	if ((kn->kn_sfflags & NOTE_FLOW_ADV_UPDATE) != 0) {
		/* on registration force an event */
		hint |= CHAN_FILT_HINT_FLOW_ADV_UPD;
	}
	SK_DF(SK_VERB_EVENTS, "na \"%s\" (0x%llx) ch 0x%llx kn 0x%llx (%s)",
	    ch->ch_na->na_name, SK_KVA(ch->ch_na), SK_KVA(ch), SK_KVA(kn),
	    "EVFILT_NW_CHANNEL");
	return filt_chan_extended_common(kn, hint);
}

static void
filt_che_detach(struct knote *kn)
{
	struct kern_channel *ch = (struct kern_channel *)knote_kn_hook_get_raw(kn);
	struct ch_selinfo *csi;

	ASSERT(kn->kn_filter == EVFILT_NW_CHANNEL);

	lck_mtx_lock(&ch->ch_lock);
	if ((kn->kn_sfflags & NOTE_CHANNEL_EVENT) != 0) {
		os_atomic_andnot(&ch->ch_na->na_flags,
		    NAF_CHANNEL_EVENT_ATTACHED, relaxed);
	}
	csi = ch->ch_si[NR_TX];
	CSI_LOCK(csi);
	if (KNOTE_DETACH(&csi->csi_si.si_note, kn)) {
		os_atomic_andnot(&csi->csi_flags, CSI_KNOTE, relaxed);
	}
	CSI_UNLOCK(csi);
	lck_mtx_unlock(&ch->ch_lock);

	SK_DF(SK_VERB_EVENTS, "na \"%s\" (0x%llx) ch 0x%llx kn 0x%llx (%s)",
	    ch->ch_na->na_name, SK_KVA(ch->ch_na), SK_KVA(ch), SK_KVA(kn),
	    "EVFILT_NW_CHANNEL");
}

static int
filt_che_event(struct knote *kn, long hint)
{
	struct kern_channel *ch = (struct kern_channel *)knote_kn_hook_get_raw(kn);

	ASSERT(kn->kn_filter == EVFILT_NW_CHANNEL);
	if (hint == 0) {
		return 0;
	}
	if (__improbable(ch_filt_check_defunct(ch, NULL))) {
		return 1;
	}
	if ((hint & CHAN_FILT_HINT_CHANNEL_EVENT) != 0) {
		VERIFY((ch->ch_flags & CHANF_EVENT_RING) != 0);
	}
	SK_DF(SK_VERB_EVENTS, "na \"%s\" (0x%llx) ch 0x%llx hint 0x%b)",
	    ch->ch_na->na_name, SK_KVA(ch->ch_na), SK_KVA(ch), hint,
	    CHAN_FILT_HINT_BITS);
	return filt_chan_extended_common(kn, hint);
}

static int
filt_che_touch(struct knote *kn, struct kevent_qos_s *kev)
{
	int ret;
	long hint = 0;
	struct kern_channel *ch = (struct kern_channel *)knote_kn_hook_get_raw(kn);

	ASSERT(kn->kn_filter == EVFILT_NW_CHANNEL);
	/* save off the new input fflags and data */
	kn->kn_sfflags = kev->fflags;
	kn->kn_sdata = kev->data;

	lck_mtx_lock(&ch->ch_lock);
	if (__improbable(ch_filt_check_defunct(ch, kn))) {
		ret = 1;
		goto done;
	}
	if ((kn->kn_sfflags & NOTE_CHANNEL_EVENT) != 0) {
		if (kev->flags & EV_ENABLE) {
			os_atomic_or(&ch->ch_na->na_flags,
			    NAF_CHANNEL_EVENT_ATTACHED, relaxed);
		} else if (kev->flags & EV_DISABLE) {
			os_atomic_andnot(&ch->ch_na->na_flags,
			    NAF_CHANNEL_EVENT_ATTACHED, relaxed);
		}
	}
	che_process_channel_event(ch, kn, kn->kn_sfflags, &hint);
	ret = filt_chan_extended_common(kn, hint);
done:
	lck_mtx_unlock(&ch->ch_lock);
	return ret;
}

static int
filt_che_process(struct knote *kn, struct kevent_qos_s *kev)
{
	int ret;
	long hint = 0;
	struct kern_channel *ch = knote_kn_hook_get_raw(kn);

	ASSERT(kn->kn_filter == EVFILT_NW_CHANNEL);
	lck_mtx_lock(&ch->ch_lock);
	if (__improbable(ch_filt_check_defunct(ch, kn))) {
		ret = 1;
		goto done;
	}
	che_process_channel_event(ch, kn, kn->kn_sfflags, &hint);
	ret = filt_chan_extended_common(kn, hint);
done:
	lck_mtx_unlock(&ch->ch_lock);
	if (ret != 0) {
		/*
		 * This filter historically behaves like EV_CLEAR,
		 * even when EV_CLEAR wasn't set.
		 */
		knote_fill_kevent(kn, kev, 0);
		kn->kn_fflags = 0;
	}
	return ret;
}

int
ch_kqfilter(struct kern_channel *ch, struct knote *kn,
    struct kevent_qos_s *kev)
{
	int result;

	lck_mtx_lock(&ch->ch_lock);
	VERIFY(!(ch->ch_flags & CHANF_KERNEL));

	if (__improbable(ch->ch_na == NULL || !NA_IS_ACTIVE(ch->ch_na) ||
	    na_reject_channel(ch, ch->ch_na))) {
		SK_ERR("%s(%d): channel is non-permissive, flags 0x%b", ch->ch_name,
		    ch->ch_pid, ch->ch_flags, CHANF_BITS);
		knote_set_error(kn, ENXIO);
		lck_mtx_unlock(&ch->ch_lock);
		return 0;
	}

	switch (kn->kn_filter) {
	case EVFILT_READ:
		kn->kn_filtid = EVFILTID_SKYWALK_CHANNEL_R;
		break;

	case EVFILT_WRITE:
		kn->kn_filtid = EVFILTID_SKYWALK_CHANNEL_W;
		break;

	case EVFILT_NW_CHANNEL:
		kn->kn_filtid = EVFILTID_SKYWALK_CHANNEL_E;
		break;

	default:
		lck_mtx_unlock(&ch->ch_lock);
		SK_ERR("%s(%d): bad filter request %d", ch->ch_name,
		    ch->ch_pid, kn->kn_filter);
		knote_set_error(kn, EINVAL);
		return 0;
	}

	knote_kn_hook_set_raw(kn, ch);
	/* call the appropriate sub-filter attach with the channel lock held */
	result = knote_fops(kn)->f_attach(kn, kev);
	lck_mtx_unlock(&ch->ch_lock);
	return result;
}

boolean_t
ch_is_multiplex(struct kern_channel *ch, enum txrx t)
{
	return ch->ch_na != NULL && (ch->ch_last[t] - ch->ch_first[t] > 1);
}

int
ch_select(struct kern_channel *ch, int events, void *wql, struct proc *p)
{
	int revents;
	int event_error = 0;

	lck_mtx_lock(&ch->ch_lock);
	revents = ch_event(ch, events, wql, p, NULL, FALSE, &event_error,
	    FALSE);
	lck_mtx_unlock(&ch->ch_lock);

	ASSERT((revents & POLLERR) == 0 || event_error != 0);

	return revents;
}
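
/*
 * Illustrative sketch (not part of the original source): ch_select() above
 * backs select(2) and poll(2) on a channel descriptor, so the usual
 * userspace idiom applies; ch_fd is assumed to be a previously opened
 * channel descriptor:
 *
 *	#include <poll.h>
 *
 *	struct pollfd pfd = {
 *		.fd = ch_fd,
 *		.events = POLLIN | POLLOUT,
 *	};
 *
 *	if (poll(&pfd, 1, -1) > 0) {
 *		// pfd.revents carries the mask computed by ch_event()
 *	}
 */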

#if SK_LOG
/* Hoisted out of line to reduce kernel stack footprint */
SK_LOG_ATTRIBUTE
static void
ch_event_log(const char *prefix, const struct kern_channel *ch,
    struct proc *p, const struct nexus_adapter *na,
    int events, int revents)
{
	SK_DF(SK_VERB_EVENTS, "%s: na \"%s\" (0x%llx) ch 0x%llx %s(%d) "
	    "th 0x%llx ev 0x%x rev 0x%x", prefix, na->na_name, SK_KVA(na),
	    SK_KVA(ch), sk_proc_name_address(p), sk_proc_pid(p),
	    SK_KVA(current_thread()), events, revents);
}
#endif /* SK_LOG */

/*
 * select(2), poll(2) and kevent(2) handlers for channels.
 *
 * Can be called for one or more rings.  Returns the event mask
 * corresponding to ready events.  If there are no ready events, do
 * a selrecord on either the individual selinfo or on the global one.
 * Device-dependent parts (locking and sync of tx/rx rings)
 * are done through callbacks.
 */
static int
ch_event(struct kern_channel *ch, int events, void *wql,
    struct proc *p, struct ch_event_result *result,
    const boolean_t is_kevent, int *errno, const boolean_t is_ch_event)
{
	struct nexus_adapter *na;
	struct __kern_channel_ring *kring;
	uint32_t i, check_all_tx, check_all_rx, want[NR_TXRX], revents = 0;
	uint32_t ready_tx_data = 0, ready_rx_data = 0;
	sk_protect_t protect = NULL;

#define want_tx want[NR_TX]
#define want_rx want[NR_RX]
	/*
	 * In order to avoid nested locks, we need to "double check"
	 * txsync and rxsync if we decide to do a selrecord().
	 * retry_tx (and retry_rx, later) prevent looping forever.
	 */
	boolean_t retry_tx = TRUE, retry_rx = TRUE;
	int found, error = 0;
	int s;

	net_update_uptime();

	LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
	ASSERT(!(ch->ch_flags & CHANF_KERNEL));

	*errno = 0;

	if (__improbable((ch->ch_flags & CHANF_DEFUNCT) ||
	    ch->ch_schema == NULL)) {
		SK_ERR("%s(%d): channel is defunct or no longer bound",
		    ch->ch_name, ch->ch_pid);
		revents = POLLERR;
		*errno = ENXIO;
		goto done;
	}

	/* clear CHANF_DEFUNCT_SKIP if it was set during defunct last time */
	if (__improbable(ch->ch_flags & CHANF_DEFUNCT_SKIP)) {
		os_atomic_andnot(&ch->ch_flags, CHANF_DEFUNCT_SKIP, relaxed);
	}

	na = ch->ch_na;
	if (__improbable(na == NULL ||
	    !NA_IS_ACTIVE(na) || na_reject_channel(ch, na))) {
		SK_ERR("%s(%d): channel is non-permissive",
		    ch->ch_name, ch->ch_pid);
		revents = POLLERR;
		*errno = ENXIO;
		goto done;
	}

	/* mark thread with sync-in-progress flag */
	protect = sk_sync_protect();

	/* update our work timestamp */
	na->na_work_ts = _net_uptime;

	/* and make this channel eligible for draining again */
	if (na->na_flags & NAF_DRAINING) {
		os_atomic_andnot(&na->na_flags, NAF_DRAINING, relaxed);
	}

#if SK_LOG
	if (__improbable((sk_verbose & SK_VERB_EVENTS) != 0)) {
		ch_event_log("enter", ch, p, na, events, revents);
	}
#endif
	if (is_ch_event) {
		goto process_channel_event;
	}

	want_tx = (events & (POLLOUT | POLLWRNORM));
	want_rx = (events & (POLLIN | POLLRDNORM));

	/*
	 * check_all_{tx|rx} are set if the channel has more than one ring
	 * AND the file descriptor is bound to all of them.  If so, we sleep
	 * on the "global" selinfo, otherwise we sleep on the individual
	 * selinfo.  The interrupt routine in the driver wakes one or the
	 * other (or both), depending on which clients are active.
	 *
	 * rxsync() is only called if we run out of buffers on a POLLIN.
	 * txsync() is called if we run out of buffers on POLLOUT.
	 */
	check_all_tx = ch_is_multiplex(ch, NR_TX);
	check_all_rx = ch_is_multiplex(ch, NR_RX);

	/*
	 * If want_tx is still set, we must issue txsync calls
	 * (on all rings, so that the tx rings do not stall).
	 * XXX should also check head != khead on the tx rings.
	 */
	if (want_tx) {
		ring_id_t first_tx = ch->ch_first[NR_TX];
		ring_id_t last_tx = ch->ch_last[NR_TX];

		channel_threshold_unit_t tx_unit =
		    ch->ch_info->cinfo_tx_lowat.cet_unit;

		/*
		 * The first round checks if anyone is ready; if not,
		 * do a selrecord and another round to handle races.
		 * want_tx goes to 0 if any space is found, and is
		 * used to skip rings with no pending transmissions.
		 */
flush_tx:
		for (i = first_tx, ready_tx_data = 0; i < last_tx; i++) {
			kring = &na->na_tx_rings[i];
			if (!want_tx &&
			    kring->ckr_ring->ring_head == kring->ckr_khead) {
				continue;
			}

			/* only one thread does txsync */
			s = kr_enter(kring, TRUE);
			ASSERT(s == 0);

			error = 0;
			DTRACE_SKYWALK2(pretxprologue, struct kern_channel *,
			    ch, struct __kern_channel_ring *, kring);
			if (kr_txsync_prologue(ch, kring, p) >=
			    kring->ckr_num_slots) {
				kr_log_bad_ring(kring);
				revents |= POLLERR;
				error = EFAULT;
				if (*errno == 0) {
					*errno = EFAULT;
				}
			} else {
				if (kring->ckr_na_sync(kring, p, 0)) {
					revents |= POLLERR;
					error = EIO;
					if (*errno == 0) {
						*errno = EIO;
					}
				} else {
					kr_txsync_finalize(ch, kring, p);
				}
			}
			DTRACE_SKYWALK3(posttxfinalize, struct kern_channel *,
			    ch, struct __kern_channel_ring *, kring, int,
			    error);

			/*
			 * If we found new slots, notify potential listeners on
			 * the same ring. Since we just did a txsync, look at
			 * the copies of cur,tail in the kring.
			 */
			found = kring->ckr_rhead != kring->ckr_rtail;
			kr_exit(kring);
			if (found) { /* notify other listeners */
				revents |= want_tx;
				want_tx = 0;
				(void) kring->ckr_na_notify(kring, p,
				    (is_kevent ? NA_NOTEF_IN_KEVENT : 0));
			}

			/*
			 * Add this ring's free data to our running
			 * tally for userspace.
			 */
			if (result != NULL) {
				switch (tx_unit) {
				case CHANNEL_THRESHOLD_UNIT_BYTES:
					ready_tx_data += kring->ckr_ready_bytes;
					break;
				case CHANNEL_THRESHOLD_UNIT_SLOTS:
					ready_tx_data += kring->ckr_ready_slots;
					break;
				}
			}
		}
		if (want_tx && retry_tx && !is_kevent) {
			if (check_all_tx) {
				csi_selrecord_all(na, NR_TX, p, wql);
			} else {
				csi_selrecord_one(&na->na_tx_rings[first_tx],
				    p, wql);
			}
			retry_tx = FALSE;
			goto flush_tx;
		}
	}

	/*
	 * If want_rx is still set, scan the receive rings.
	 * Do it on all rings because otherwise we starve.
	 */
	if (want_rx) {
		ring_id_t first_rx = ch->ch_first[NR_RX];
		ring_id_t last_rx = ch->ch_last[NR_RX];
		channel_threshold_unit_t rx_unit =
		    ch->ch_info->cinfo_rx_lowat.cet_unit;

		/* two rounds here for race avoidance */
do_retry_rx:
		for (i = first_rx, ready_rx_data = 0; i < last_rx; i++) {
			kring = &na->na_rx_rings[i];

			/* only one thread does rxsync */
			s = kr_enter(kring, TRUE);
			ASSERT(s == 0);

			error = 0;
			DTRACE_SKYWALK2(prerxprologue, struct kern_channel *,
			    ch, struct __kern_channel_ring *, kring);
			if (kr_rxsync_prologue(ch, kring, p) >=
			    kring->ckr_num_slots) {
				kr_log_bad_ring(kring);
				revents |= POLLERR;
				error = EFAULT;
				if (*errno == 0) {
					*errno = EFAULT;
				}
			} else {
				/* now we can use kring->rhead, rtail */
				if (kring->ckr_na_sync(kring, p, 0)) {
					revents |= POLLERR;
					error = EIO;
					if (*errno == 0) {
						*errno = EIO;
					}
				} else {
					kr_rxsync_finalize(ch, kring, p);
				}
			}

			DTRACE_SKYWALK3(postrxfinalize, struct kern_channel *,
			    ch, struct __kern_channel_ring *, kring, int,
			    error);

			found = kring->ckr_rhead != kring->ckr_rtail;
			kr_exit(kring);
			if (found) {
				revents |= want_rx;
				retry_rx = FALSE;
				(void) kring->ckr_na_notify(kring, p,
				    (is_kevent ? NA_NOTEF_IN_KEVENT : 0));
			}

			/*
			 * Add this ring's readable data to our running
			 * tally for userspace.
			 */
			if (result != NULL) {
				switch (rx_unit) {
				case CHANNEL_THRESHOLD_UNIT_BYTES:
					ready_rx_data += kring->ckr_ready_bytes;
					break;
				case CHANNEL_THRESHOLD_UNIT_SLOTS:
					ready_rx_data += kring->ckr_ready_slots;
					break;
				}
			}
		}

		if (retry_rx && !is_kevent) {
			if (check_all_rx) {
				csi_selrecord_all(na, NR_RX, p, wql);
			} else {
				csi_selrecord_one(&na->na_rx_rings[first_rx],
				    p, wql);
			}
		}
		if (retry_rx) {
			retry_rx = FALSE;
			goto do_retry_rx;
		}
	}

	if (result != NULL) {
		result->tx_data = ready_tx_data;
		result->rx_data = ready_rx_data;
	}
	goto skip_channel_event;

process_channel_event:
	/*
	 * Perform the sync operation on the event ring to make the channel
	 * events enqueued in the ring visible to user-space.
	 */

	/* select() and poll() not supported for event ring */
	ASSERT(is_kevent);
	VERIFY((ch->ch_last[NR_EV] - ch->ch_first[NR_EV]) == 1);
	kring = &na->na_event_rings[ch->ch_first[NR_EV]];

	/* only one thread does the sync */
	s = kr_enter(kring, TRUE);
	ASSERT(s == 0);
	if (kr_event_sync_prologue(kring, p) >= kring->ckr_num_slots) {
		kr_log_bad_ring(kring);
		revents |= POLLERR;
		if (*errno == 0) {
			*errno = EFAULT;
		}
	} else {
		if (kring->ckr_na_sync(kring, p, 0)) {
			revents |= POLLERR;
			if (*errno == 0) {
				*errno = EIO;
			}
		} else {
			kr_event_sync_finalize(ch, kring, p);
		}
	}
	found = (kring->ckr_rhead != kring->ckr_rtail);
	kr_exit(kring);
	if (found) {
		revents |= (events & POLLIN);
	}

skip_channel_event:
#if SK_LOG
	if (__improbable((sk_verbose & SK_VERB_EVENTS) != 0)) {
		ch_event_log("exit", ch, p, na, events, revents);
	}
#endif /* SK_LOG */

	/* unmark thread with sync-in-progress flag */
	sk_sync_unprotect(protect);

done:
	ASSERT(!sk_is_sync_protected());

	return revents;
#undef want_tx
#undef want_rx
}
1465 
1466 static struct kern_channel *
ch_find(struct kern_nexus * nx,nexus_port_t port,ring_id_t ring_id)1467 ch_find(struct kern_nexus *nx, nexus_port_t port, ring_id_t ring_id)
1468 {
1469 	struct kern_channel *ch;
1470 
1471 	SK_LOCK_ASSERT_HELD();
1472 
1473 	STAILQ_FOREACH(ch, &nx->nx_ch_head, ch_link) {
1474 		struct ch_info *cinfo = ch->ch_info;
1475 
1476 		/* see comments in ch_open() */
1477 		if (cinfo->cinfo_nx_port != port) {
1478 			continue;
1479 		} else if (cinfo->cinfo_ch_mode & CHMODE_MONITOR) {
1480 			continue;
1481 		} else if (cinfo->cinfo_ch_ring_id != CHANNEL_RING_ID_ANY &&
1482 		    ring_id != cinfo->cinfo_ch_ring_id &&
1483 		    ring_id != CHANNEL_RING_ID_ANY) {
1484 			continue;
1485 		}
1486 
1487 		/* found a match */
1488 		break;
1489 	}
1490 
1491 	if (ch != NULL) {
1492 		ch_retain_locked(ch);
1493 	}
1494 
1495 	return ch;
1496 }
1497 
1498 #if SK_LOG
1499 /* Hoisted out of line to reduce kernel stack footprint */
1500 SK_LOG_ATTRIBUTE
1501 static void
ch_open_log1(const uuid_t p_uuid,struct proc * p,nexus_port_t port)1502 ch_open_log1(const uuid_t p_uuid, struct proc *p, nexus_port_t port)
1503 {
1504 	uuid_string_t uuidstr;
1505 
1506 	SK_D("%s(%d) uniqueid %llu exec_uuid %s port %u",
1507 	    sk_proc_name_address(p), sk_proc_pid(p), proc_uniqueid(p),
1508 	    sk_uuid_unparse(p_uuid, uuidstr), port);
1509 }
1510 
1511 SK_LOG_ATTRIBUTE
1512 static void
ch_open_log2(struct proc * p,nexus_port_t port,ring_id_t ring,uint32_t mode,const char * mode_bits,int err)1513 ch_open_log2(struct proc *p, nexus_port_t port, ring_id_t ring,
1514     uint32_t mode, const char *mode_bits, int err)
1515 {
1516 	SK_D("%s(%d) port %u ring %d mode 0x%b err %d",
1517 	    sk_proc_name_address(p), sk_proc_pid(p), port, (int)ring,
1518 	    mode, mode_bits, err);
1519 }
1520 #endif /* SK_LOG */
1521 
1522 struct kern_channel *
ch_open(struct ch_init * init,struct proc * p,int fd,int * err)1523 ch_open(struct ch_init *init, struct proc *p, int fd, int *err)
1524 {
1525 	uint32_t mode = init->ci_ch_mode;
1526 	nexus_port_t port = init->ci_nx_port;
1527 	ring_id_t ring = init->ci_ch_ring_id;
1528 	struct kern_channel *ch = NULL, *ch0 = NULL;
1529 	struct nxbind *nxb = NULL;
1530 	struct kern_nexus *nx;
1531 	struct chreq chr;
1532 	uuid_t p_uuid;
1533 	kauth_cred_t cred;
1534 
1535 	cred = kauth_cred_get();
1536 	ASSERT(!uuid_is_null(init->ci_nx_uuid));
1537 	proc_getexecutableuuid(p, p_uuid, sizeof(p_uuid));
1538 	*err = 0;
1539 
1540 	/* make sure we don't allow userland to set kernel-only flags */
1541 	mode &= CHMODE_MASK;
1542 
1543 	SK_LOCK();
1544 
1545 	nx = nx_find(init->ci_nx_uuid, TRUE);
1546 	if (nx == NULL) {
1547 		*err = ENOENT;
1548 		goto done;
1549 	}
1550 
1551 	/* port (zero-based) must be within the domain's range */
1552 	if (port >= NXDOM_MAX(NX_DOM(nx), ports)) {
1553 		*err = EDOM;
1554 		goto done;
1555 	}
1556 	VERIFY(port != NEXUS_PORT_ANY);
1557 
1558 	if (mode & CHMODE_LOW_LATENCY) {
1559 		if ((*err = skywalk_priv_check_cred(p, cred,
1560 		    PRIV_SKYWALK_LOW_LATENCY_CHANNEL)) != 0) {
1561 			goto done;
1562 		}
1563 	}
1564 
1565 	/* "no copy" is valid only when at least one tx/rx mon flag is set */
1566 	if (!(mode & CHMODE_MONITOR) && (mode & CHMODE_MONITOR_NO_COPY)) {
1567 		mode &= ~CHMODE_MONITOR_NO_COPY;
1568 	}
1569 
1570 	if (mode & CHMODE_MONITOR) {
1571 		if ((*err = skywalk_priv_check_cred(p, cred,
1572 		    PRIV_SKYWALK_OBSERVE_ALL)) != 0) {
1573 			goto done;
1574 		}
1575 		/* Don't allow non-root processes to monitor channels. */
1576 		if (kauth_cred_issuser(cred) == 0) {
1577 			*err = EPERM;
1578 			goto done;
1579 		}
1580 	}
1581 
1582 	/*
1583 	 * Check with the nexus to see if the port is bound; if so, prepare
1584 	 * our nxbind structure that we'll need to pass down to the nexus
1585 	 * for it compare.  If the caller provides a key, we take it over
1586 	 * and will free it ourselves (as part of freeing nxbind.)
1587 	 *
1588 	 * If this is a monitor channel, skip this altogether since the check
1589 	 * for PRIV_SKYWALK_OBSERVE_ALL privilege has been done above.
1590 	 */
1591 	if (!(mode & CHMODE_MONITOR) && !NX_ANONYMOUS_PROV(nx)) {
1592 		void *key = (void *)(init->ci_key);
1593 
1594 #if SK_LOG
1595 		if (__improbable(sk_verbose != 0)) {
1596 			ch_open_log1(p_uuid, p, port);
1597 		}
1598 #endif /* SK_LOG */
1599 
1600 		nxb = nxb_alloc(Z_WAITOK);
1601 		nxb->nxb_flags |= NXBF_MATCH_UNIQUEID;
1602 		nxb->nxb_uniqueid = proc_uniqueid(p);
1603 		nxb->nxb_pid = proc_pid(p);
1604 		nxb->nxb_flags |= NXBF_MATCH_EXEC_UUID;
1605 		uuid_copy(nxb->nxb_exec_uuid, p_uuid);
1606 		if (key != NULL) {
1607 			nxb->nxb_flags |= NXBF_MATCH_KEY;
1608 			nxb->nxb_key_len = init->ci_key_len;
1609 			nxb->nxb_key = key;
1610 			init->ci_key = USER_ADDR_NULL;  /* take over */
1611 		}
1612 	}
1613 
1614 	/*
1615 	 * There can only be one owner of {port,ring_id} tuple.  Once
1616 	 * owned, this can be made available among multiple monitors.
1617 	 * CHANNEL_RING_ID_ANY (-1) ring_id gives exclusive rights over
1618 	 * all rings.  Further attempts to own any or all of the rings
1619 	 * will be declined.
1620 	 *
1621 	 * Multiple monitors are allowed to exist.  If a channel has been
1622 	 * bound to CHANNEL_RING_ID_ANY, any or all of its rings can be
1623 	 * monitored.  If an owning channel has been bound to an individual
1624 	 * ring, only that ring can be monitored, either by specifying the
1625 	 * equivalent ring_id or CHANNEL_RING_ID_ANY at monitor open time.
1626 	 *
1627 	 * For example, assuming a 2-rings setup for port 'p':
1628 	 *
1629 	 * owner{p,-1}
1630 	 *      will allow:
1631 	 *              monitor{p,-1}, monitor{p,0}, monitor{p,1}
1632 	 *      will not allow:
1633 	 *              owner{p,-1}, owner{p,0}, owner{p,1}
1634 	 *
1635 	 * owner{p,0}
1636 	 *      will allow:
1637 	 *		owner{p,1}, monitor{p,-1}, monitor{p,0}
1638 	 *	will not allow:
1639 	 *		owner{p,-1}, owner{p,0}, monitor{p,1}
1640 	 */
1641 	if ((ch0 = ch_find(nx, port, ring)) != NULL) {
1642 		SK_D("found ch0 0x%llx", SK_KVA(ch0));
1643 		/*
1644 		 * Unless this is a monitor channel, allow only at
1645 		 * most one owner of the {port,ring_id} tuple.
1646 		 */
1647 		if (!(mode & CHMODE_MONITOR)) {
1648 #if SK_LOG
1649 			uuid_string_t uuidstr;
1650 			char *na_name = (ch0->ch_na != NULL) ?
1651 			    ch0->ch_na->na_name : "";
1652 
1653 			SK_DSC(p, "ch %s flags (0x%x) exists on port %d on "
1654 			    "nx %s, owner %s(%d)", na_name, ch0->ch_flags, port,
1655 			    sk_uuid_unparse(nx->nx_uuid, uuidstr),
1656 			    ch0->ch_name, ch0->ch_pid);
1657 #endif /* SK_LOG */
1658 			*err = EBUSY;
1659 			goto done;
1660 		}
1661 	} else if (mode & CHMODE_MONITOR) {
1662 		*err = ENXIO;
1663 		goto done;
1664 	}
1665 
1666 	bzero(&chr, sizeof(chr));
1667 	chr.cr_tx_lowat = init->ci_tx_lowat;
1668 	chr.cr_rx_lowat = init->ci_rx_lowat;
1669 	chr.cr_port = port;
1670 	chr.cr_mode = mode;
1671 	chr.cr_ring_id = ring;
1672 
1673 	/* upon success, returns a channel with reference held */
1674 	ch = ch_connect(nx, &chr, ch0, nxb, p, fd, err);
1675 
1676 done:
1677 
1678 #if SK_LOG
1679 	if (__improbable(sk_verbose != 0)) {
1680 		ch_open_log2(p, port, ring, mode, CHMODE_BITS, *err);
1681 	}
1682 #endif /* SK_LOG */
1683 
1684 	if (ch0 != NULL) {
1685 		(void) ch_release_locked(ch0);
1686 	}
1687 
1688 	if (nx != NULL) {
1689 		(void) nx_release_locked(nx);
1690 	}
1691 
1692 	if (nxb != NULL) {
1693 		nxb_free(nxb);
1694 	}
1695 
1696 	SK_UNLOCK();
1697 
1698 	return ch;
1699 }
1700 
1701 struct kern_channel *
1702 ch_open_special(struct kern_nexus *nx, struct chreq *chr, boolean_t nonxref,
1703     int *err)
1704 {
1705 	struct kern_channel *ch = NULL;
1706 
1707 	SK_LOCK_ASSERT_HELD();
1708 	*err = 0;
1709 
1710 	ASSERT((chr->cr_mode & CHMODE_USER_PACKET_POOL) == 0);
1711 	ASSERT((chr->cr_mode & CHMODE_EVENT_RING) == 0);
1712 	ASSERT((chr->cr_mode & CHMODE_LOW_LATENCY) == 0);
1713 	ASSERT(!uuid_is_null(chr->cr_spec_uuid));
1714 	chr->cr_mode |= CHMODE_KERNEL;
1715 	if (nonxref) {
1716 		chr->cr_mode |= CHMODE_NO_NXREF;
1717 	} else {
1718 		chr->cr_mode &= ~CHMODE_NO_NXREF;
1719 	}
1720 
1721 	/* upon success, returns a channel with reference held */
1722 	ch = ch_connect(nx, chr, NULL, NULL, kernproc, -1, err);
1723 	if (ch != NULL) {
1724 		/*
1725 		 * nonxref channels don't hold any reference to the nexus,
1726 		 * since otherwise we'll never be able to close them when
1727 		 * the last regular channel of the nexus is closed, as part
1728 		 * of the nexus's destructor operation.  Release the nonxref
1729 		 * channel reference now, but make sure the nexus has at
1730 		 * least 3 refs: global list, provider list and the nonxref
1731 		 * channel itself, before doing that.
1732 		 */
1733 		if (nonxref) {
1734 			ASSERT(ch->ch_flags & (CHANF_KERNEL | CHANF_NONXREF));
1735 			ASSERT(nx->nx_refcnt > 3);
1736 			(void) nx_release_locked(nx);
1737 		}
1738 	}
1739 
1740 #if SK_LOG
1741 	uuid_string_t uuidstr;
1742 	SK_D("nx 0x%llx (%s:\"%s\":%d:%d) spec_uuid \"%s\" mode 0x%b err %d",
1743 	    SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name, (ch != NULL ?
1744 	    ch->ch_na->na_name : ""), (int)chr->cr_port, (int)chr->cr_ring_id,
1745 	    sk_uuid_unparse(chr->cr_spec_uuid, uuidstr), chr->cr_mode,
1746 	    CHMODE_BITS, *err);
1747 #endif /* SK_LOG */
1748 
1749 	return ch;
1750 }
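
/*
 * Illustrative sketch (editorial addition, not part of the original
 * source): how a kernel subsystem holding a reference on nexus `nx'
 * might open and later close a nonxref special channel.  `port' and
 * `spec_uuid' are assumed to be supplied by the caller, and error
 * handling is elided.
 *
 *	struct chreq chr;
 *	struct kern_channel *ch;
 *	int err;
 *
 *	bzero(&chr, sizeof(chr));
 *	chr.cr_port = port;
 *	chr.cr_ring_id = CHANNEL_RING_ID_ANY;
 *	uuid_copy(chr.cr_spec_uuid, spec_uuid);
 *
 *	SK_LOCK();
 *	ch = ch_open_special(nx, &chr, TRUE, &err);
 *	SK_UNLOCK();
 *
 *	...
 *
 *	SK_LOCK();
 *	ch_close_special(ch);
 *	(void) ch_release_locked(ch);
 *	SK_UNLOCK();
 */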
1751 
1752 static void
1753 ch_close_common(struct kern_channel *ch, boolean_t locked, boolean_t special)
1754 {
1755 #pragma unused(special)
1756 #if SK_LOG
1757 	uuid_string_t uuidstr;
1758 	const char *na_name = (ch->ch_na != NULL) ?
1759 	    ch->ch_na->na_name : "";
1760 	const char *nxdom_name = (ch->ch_nexus != NULL) ?
1761 	    NX_DOM(ch->ch_nexus)->nxdom_name : "";
1762 	const char *nxdom_prov_name = (ch->ch_nexus != NULL) ?
1763 	    NX_DOM_PROV(ch->ch_nexus)->nxdom_prov_name : "";
1764 
1765 	SK_D("ch 0x%llx (%s:%s:\"%s\":%u:%d)",
1766 	    SK_KVA(ch), nxdom_name, nxdom_prov_name, na_name,
1767 	    ch->ch_info->cinfo_nx_port, (int)ch->ch_info->cinfo_ch_ring_id);
1768 	SK_D("  UUID:    %s", sk_uuid_unparse(ch->ch_info->cinfo_ch_id,
1769 	    uuidstr));
1770 	SK_D("  flags:   0x%b", ch->ch_flags, CHANF_BITS);
1771 #endif /* SK_LOG */
1772 	struct kern_nexus *nx = ch->ch_nexus;
1773 
1774 	if (!locked) {
1775 		SK_LOCK();
1776 	}
1777 
1778 	SK_LOCK_ASSERT_HELD();
1779 	/*
1780 	 * If the channel is participating in the interface advisory
1781 	 * notification, remove it from the nexus.
1782 	 * CHANF_IF_ADV is set and cleared only when nx_ch_if_adv_lock
1783 	 * is held in exclusive mode.
1784 	 */
1785 	lck_rw_lock_exclusive(&nx->nx_ch_if_adv_lock);
1786 	if ((ch->ch_flags & CHANF_IF_ADV) != 0) {
1787 		STAILQ_REMOVE(&nx->nx_ch_if_adv_head, ch,
1788 		    kern_channel, ch_link_if_adv);
1789 		os_atomic_andnot(&ch->ch_flags, CHANF_IF_ADV, relaxed);
1790 		if (STAILQ_EMPTY(&nx->nx_ch_if_adv_head)) {
1791 			nx_netif_config_interface_advisory(nx, false);
1792 		}
1793 		lck_rw_done(&nx->nx_ch_if_adv_lock);
1794 		lck_mtx_lock(&ch->ch_lock);
1795 		(void) ch_release_locked(ch);
1796 	} else {
1797 		lck_rw_done(&nx->nx_ch_if_adv_lock);
1798 		lck_mtx_lock(&ch->ch_lock);
1799 	}
1800 	LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
1801 	/*
1802 	 * Mark the channel as closing to prevent further setopt requests;
1803 	 * this flag is set once here and never gets cleared.
1804 	 */
1805 	ASSERT(!(ch->ch_flags & CHANF_CLOSING));
1806 	os_atomic_or(&ch->ch_flags, CHANF_CLOSING, relaxed);
1807 
1808 	if (special) {
1809 		VERIFY(ch->ch_flags & CHANF_KERNEL);
1810 	} else {
1811 		VERIFY(!(ch->ch_flags & CHANF_KERNEL));
1812 	}
1813 
1814 	ch->ch_fd = -1;
1815 
1816 	/* may be called as part of failure cleanup, so check */
1817 	if (ch->ch_flags & CHANF_ATTACHED) {
1818 		boolean_t nonxref = !!(ch->ch_flags & CHANF_NONXREF);
1819 
1820 		/* caller must hold an extra ref */
1821 		ASSERT(ch->ch_refcnt > 1);
1822 
1823 		/* disconnect from nexus */
1824 		ch_disconnect(ch);
1825 
1826 		/*
1827 		 * If this was the last regular channel and the nexus
1828 		 * has been closed, detach it and finish up the job.
1829 		 * If this was a nonxref channel, there is nothing
1830 		 * left to do; see comments in ch_open_special().
1831 		 */
1832 		if (!nonxref) {
1833 			STAILQ_REMOVE(&nx->nx_ch_head, ch,
1834 			    kern_channel, ch_link);
1835 			nx->nx_ch_count--;
1836 			if (STAILQ_EMPTY(&nx->nx_ch_head) &&
1837 			    (nx->nx_flags & NXF_CLOSED)) {
1838 				ASSERT(STAILQ_EMPTY(&nx->nx_ch_if_adv_head));
1839 				nx_detach(nx);
1840 			}
1841 			(void) nx_release_locked(nx);
1842 		} else {
1843 			ASSERT(ch->ch_flags & CHANF_KERNEL);
1844 			STAILQ_REMOVE(&nx->nx_ch_nonxref_head, ch,
1845 			    kern_channel, ch_link);
1846 		}
1847 
1848 		os_atomic_andnot(&ch->ch_flags, CHANF_ATTACHED, relaxed);
1849 		ch->ch_nexus = NULL;
1850 
1851 		(void) ch_release_locked(ch);   /* for the list */
1852 	}
1853 
1854 	lck_mtx_unlock(&ch->ch_lock);
1855 	if (!locked) {
1856 		SK_UNLOCK();
1857 	}
1858 }
1859 
1860 void
1861 ch_close(struct kern_channel *ch, boolean_t locked)
1862 {
1863 	ch_close_common(ch, locked, FALSE);
1864 }
1865 
1866 void
1867 ch_close_special(struct kern_channel *ch)
1868 {
1869 	ch_close_common(ch, TRUE, TRUE);
1870 }
1871 
1872 static int
1873 ch_ev_thresh_validate(struct kern_nexus *nx, enum txrx t,
1874     struct ch_ev_thresh *cet)
1875 {
1876 	struct nxprov_params *nxp = NX_PROV(nx)->nxprov_params;
1877 	uint32_t bmin, bmax, smin, smax;
1878 	int err = 0;
1879 
1880 	if (cet->cet_unit != CHANNEL_THRESHOLD_UNIT_BYTES &&
1881 	    cet->cet_unit != CHANNEL_THRESHOLD_UNIT_SLOTS) {
1882 		err = EINVAL;
1883 		goto done;
1884 	}
1885 
1886 	smin = 1;       /* minimum 1 slot */
1887 	bmin = 1;       /* minimum 1 byte */
1888 
1889 	if (t == NR_TX) {
1890 		ASSERT(nxp->nxp_tx_slots > 0);
1891 		smax = (nxp->nxp_tx_slots - 1);
1892 	} else {
1893 		ASSERT(nxp->nxp_rx_slots > 0);
1894 		smax = (nxp->nxp_rx_slots - 1);
1895 	}
1896 	bmax = (smax * nxp->nxp_buf_size);
1897 
1898 	switch (cet->cet_unit) {
1899 	case CHANNEL_THRESHOLD_UNIT_BYTES:
1900 		if (cet->cet_value < bmin) {
1901 			cet->cet_value = bmin;
1902 		} else if (cet->cet_value > bmax) {
1903 			cet->cet_value = bmax;
1904 		}
1905 		break;
1906 
1907 	case CHANNEL_THRESHOLD_UNIT_SLOTS:
1908 		if (cet->cet_value < smin) {
1909 			cet->cet_value = smin;
1910 		} else if (cet->cet_value > smax) {
1911 			cet->cet_value = smax;
1912 		}
1913 		break;
1914 	}
1915 
1916 done:
1917 	return err;
1918 }
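
/*
 * Worked example (editorial addition): for a nexus provisioned with,
 * say, nxp_tx_slots == 128 and nxp_buf_size == 2048, the TX limits
 * are [1, 127] slots, or [1, 127 * 2048] == [1, 260096] bytes.  A
 * CHANNEL_THRESHOLD_UNIT_BYTES request with cet_value == 1048576
 * would thus be clamped to 260096 rather than rejected; only an
 * unrecognized cet_unit yields EINVAL.
 */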
1919 
1920 #if SK_LOG
1921 /* Hoisted out of line to reduce kernel stack footprint */
1922 SK_LOG_ATTRIBUTE
1923 static void
1924 ch_connect_log1(const struct kern_nexus *nx, const struct ch_info *cinfo,
1925     const struct chreq *chr, const struct kern_channel *ch,
1926     const struct kern_nexus_domain_provider *nxdom_prov,
1927     struct proc *p)
1928 {
1929 	struct __user_channel_schema *ch_schema = ch->ch_schema;
1930 	uuid_string_t uuidstr;
1931 	unsigned int n;
1932 	ring_id_t i, j;
1933 
1934 	ASSERT(ch_schema != NULL || (ch->ch_flags & CHANF_KERNEL));
1935 	if (ch_schema != NULL) {
1936 		SK_D("channel_schema at 0x%llx", SK_KVA(ch_schema));
1937 		SK_D("  kern_name:     \"%s\"", ch_schema->csm_kern_name);
1938 		SK_D("  kern_uuid:     %s",
1939 		    sk_uuid_unparse(ch_schema->csm_kern_uuid, uuidstr));
1940 		SK_D("  flags:         0x%b", ch_schema->csm_flags, CSM_BITS);
1941 		SK_D("  tx_rings:      %u [%u,%u]", ch_schema->csm_tx_rings,
1942 		    cinfo->cinfo_first_tx_ring, cinfo->cinfo_last_tx_ring);
1943 		SK_D("  rx_rings:      %u [%u,%u]", ch_schema->csm_rx_rings,
1944 		    cinfo->cinfo_first_rx_ring, cinfo->cinfo_last_rx_ring);
1945 
1946 		j = ch->ch_last[NR_TX];
1947 		for (n = 0, i = ch->ch_first[NR_TX]; i < j; n++, i++) {
1948 			SK_D("  tx_ring_%u_off: 0x%llx", i,
1949 			    (uint64_t)ch_schema->csm_ring_ofs[n].ring_off);
1950 			SK_D("  tx_sd_%u_off:   0x%llx", i,
1951 			    (uint64_t)ch_schema->csm_ring_ofs[n].sd_off);
1952 		}
1953 		j = n;
1954 		for (n = 0, i = ch->ch_first[NR_RX];
1955 		    i < ch->ch_last[NR_RX]; n++, i++) {
1956 			SK_D("  rx_ring_%u_off: 0x%llx", i,
1957 			    (uint64_t)ch_schema->csm_ring_ofs[n + j].ring_off);
1958 			SK_D("  rx_sd_%u_off:   0x%llx", i,
1959 			    (uint64_t)ch_schema->csm_ring_ofs[n + j].sd_off);
1960 		}
1961 		SK_D("  md_type:       %u", ch_schema->csm_md_type);
1962 		SK_D("  md_subtype:    %u", ch_schema->csm_md_subtype);
1963 		SK_D("  stats_ofs:     0x%llx", ch_schema->csm_stats_ofs);
1964 		SK_D("  stats_type:    %u", ch_schema->csm_stats_type);
1965 		SK_D("  flowadv_ofs:   0x%llx", ch_schema->csm_flowadv_ofs);
1966 		SK_D("  flowadv_max:   %u", ch_schema->csm_flowadv_max);
1967 		SK_D("  nexusadv_ofs:  0x%llx", ch_schema->csm_nexusadv_ofs);
1968 	}
1969 
1970 	SK_D("ch 0x%llx (%s:%s:\"%s\":%u:%d)",
1971 	    SK_KVA(ch), nxdom_prov->nxdom_prov_dom->nxdom_name,
1972 	    nxdom_prov->nxdom_prov_name, ch->ch_na->na_name,
1973 	    cinfo->cinfo_nx_port, (int)cinfo->cinfo_ch_ring_id);
1974 	SK_D("  ch UUID: %s", sk_uuid_unparse(cinfo->cinfo_ch_id, uuidstr));
1975 	SK_D("  nx UUID: %s", sk_uuid_unparse(nx->nx_uuid, uuidstr));
1976 	SK_D("  flags:   0x%b", ch->ch_flags, CHANF_BITS);
1977 	SK_D("  task:    0x%llx %s(%d)", SK_KVA(ch->ch_mmap.ami_maptask),
1978 	    sk_proc_name_address(p), sk_proc_pid(p));
1979 	SK_D("  txlowat: %u (%s)", cinfo->cinfo_tx_lowat.cet_value,
1980 	    ((cinfo->cinfo_tx_lowat.cet_unit == CHANNEL_THRESHOLD_UNIT_BYTES) ?
1981 	    "bytes" : "slots"));
1982 	SK_D("  rxlowat: %u (%s)", cinfo->cinfo_rx_lowat.cet_value,
1983 	    ((cinfo->cinfo_rx_lowat.cet_unit == CHANNEL_THRESHOLD_UNIT_BYTES) ?
1984 	    "bytes" : "slots"));
1985 	SK_D("  mmapref: 0x%llx", SK_KVA(ch->ch_mmap.ami_mapref));
1986 	SK_D("  mapaddr: 0x%llx", (uint64_t)cinfo->cinfo_mem_base);
1987 	SK_D("  mapsize: 0x%llx (%llu KB)",
1988 	    (uint64_t)cinfo->cinfo_mem_map_size,
1989 	    (uint64_t)cinfo->cinfo_mem_map_size >> 10);
1990 	SK_D("  memsize: 0x%llx (%llu KB)",
1991 	    (uint64_t)chr->cr_memsize, (uint64_t)chr->cr_memsize >> 10);
1992 	SK_D("  offset:  0x%llx", (uint64_t)cinfo->cinfo_schema_offset);
1993 }
1994 
1995 SK_LOG_ATTRIBUTE
1996 static void
1997 ch_connect_log2(const struct kern_nexus *nx, int err)
1998 {
1999 	uuid_string_t nx_uuidstr;
2000 
2001 	SK_ERR("Error connecting to nexus UUID %s: %d",
2002 	    sk_uuid_unparse(nx->nx_uuid, nx_uuidstr), err);
2003 }
2004 #endif /* SK_LOG */
2005 
2006 static struct kern_channel *
2007 ch_connect(struct kern_nexus *nx, struct chreq *chr, struct kern_channel *ch0,
2008     struct nxbind *nxb, struct proc *p, int fd, int *err)
2009 {
2010 	struct kern_nexus_domain_provider *nxdom_prov;
2011 	struct kern_channel *ch = NULL;
2012 	struct ch_info *cinfo = NULL;
2013 	uint32_t ch_mode = chr->cr_mode;
2014 	boolean_t config = FALSE;
2015 	struct nxdom *nxdom;
2016 	boolean_t reserved_port = FALSE;
2017 
2018 	ASSERT(!(ch_mode & CHMODE_KERNEL) || p == kernproc);
2019 	ASSERT(chr->cr_port != NEXUS_PORT_ANY || (ch_mode & CHMODE_KERNEL));
2020 	SK_LOCK_ASSERT_HELD();
2021 
2022 	/* validate thresholds before we proceed any further */
2023 	if ((*err = ch_ev_thresh_validate(nx, NR_TX, &chr->cr_tx_lowat)) != 0 ||
2024 	    (*err = ch_ev_thresh_validate(nx, NR_RX, &chr->cr_rx_lowat)) != 0) {
2025 		goto done;
2026 	}
2027 
2028 	if (!(ch_mode & CHMODE_KERNEL) && !NX_USER_CHANNEL_PROV(nx)) {
2029 		*err = ENOTSUP;
2030 		goto done;
2031 	}
2032 
2033 	ch = ch_alloc(Z_WAITOK);
2034 
2035 	lck_mtx_lock(&ch->ch_lock);
2036 
2037 	uuid_generate_random(ch->ch_info->cinfo_ch_id);
2038 	ch->ch_fd = fd;
2039 	ch->ch_pid = proc_pid(p);
2040 	(void) snprintf(ch->ch_name, sizeof(ch->ch_name), "%s",
2041 	    proc_name_address(p));
2042 
2043 	nxdom_prov = NX_DOM_PROV(nx);
2044 	nxdom = NX_DOM(nx);
2045 
2046 	if (ch_mode & (CHMODE_KERNEL | CHMODE_NO_NXREF)) {
2047 		/*
2048 		 * CHANF_KERNEL implies a channel opened by a kernel
2049 		 * subsystem, and is triggered by the CHMODE_KERNEL
2050 		 * flag which is (only ever) set by ch_open_special().
2051 		 *
2052 		 * CHANF_NONXREF can be optionally set based on the
2053 		 * CHMODE_NO_NXREF request flag.  This must only be
2054 		 * set by ch_open_special() as well, hence we verify.
2055 		 */
2056 		ASSERT(p == kernproc);
2057 		ASSERT(ch_mode & CHMODE_KERNEL);
2058 		os_atomic_or(&ch->ch_flags, CHANF_KERNEL, relaxed);
2059 		if (ch_mode & CHMODE_NO_NXREF) {
2060 			os_atomic_or(&ch->ch_flags, CHANF_NONXREF, relaxed);
2061 		}
2062 
2063 		config = (ch_mode & CHMODE_CONFIG) != 0;
2064 		if (chr->cr_port == NEXUS_PORT_ANY) {
2065 			if (nxdom->nxdom_find_port == NULL) {
2066 				*err = ENOTSUP;
2067 				goto done;
2068 			}
2069 
2070 			/*
2071 			 * If this is an ephemeral port request, find one for
2072 			 * the client; we ask for the reserved port range if
2073 			 * this is a configuration request (CHMODE_CONFIG).
2074 			 */
2075 			if ((*err = nxdom->nxdom_find_port(nx,
2076 			    config, &chr->cr_port)) != 0) {
2077 				goto done;
2078 			}
2079 		}
2080 	}
2081 
2082 	if (skywalk_check_platform_binary(p)) {
2083 		os_atomic_or(&ch->ch_flags, CHANF_PLATFORM, relaxed);
2084 	}
2085 
2086 	ASSERT(chr->cr_port != NEXUS_PORT_ANY);
2087 
2088 	reserved_port = (nxdom->nxdom_port_is_reserved != NULL &&
2089 	    (*nxdom->nxdom_port_is_reserved)(nx, chr->cr_port));
2090 	if (!config && reserved_port) {
2091 		*err = EDOM;
2092 		goto done;
2093 	}
2094 
2095 	SK_D("%s(%d) %snexus port %u requested", sk_proc_name_address(p),
2096 	    sk_proc_pid(p), reserved_port ? "[reserved] " : "", chr->cr_port);
2097 
2098 	if ((*err = nxdom_prov->nxdom_prov_dom->nxdom_connect(nxdom_prov,
2099 	    nx, ch, chr, ch0, nxb, p)) != 0) {
2100 		goto done;
2101 	}
2102 
2103 	cinfo = ch->ch_info;
2104 	uuid_copy(cinfo->cinfo_nx_uuid, nx->nx_uuid);
2105 	/* for easy access to immutables */
2106 	bcopy((void *)nx->nx_prov->nxprov_params,
2107 	    (void *)&cinfo->cinfo_nxprov_params, sizeof(struct nxprov_params));
2108 	cinfo->cinfo_ch_mode = ch_mode;
2109 	cinfo->cinfo_ch_ring_id = chr->cr_ring_id;
2110 	cinfo->cinfo_nx_port = chr->cr_port;
2111 	cinfo->cinfo_mem_base = ch->ch_mmap.ami_mapaddr;
2112 	cinfo->cinfo_mem_map_size = ch->ch_mmap.ami_mapsize;
2113 	cinfo->cinfo_schema_offset = chr->cr_memoffset;
2114 	cinfo->cinfo_num_bufs =
2115 	    PP_BUF_REGION_DEF(skmem_arena_nexus(ch->ch_na->na_arena)->arn_rx_pp)->skr_params.srp_c_obj_cnt;
2116 	/*
2117 	 * ch_last is really the number of rings, but we need to return
2118 	 * the actual zero-based ring ID to the client.  Make sure that
2119 	 * is the case here and adjust last_{tx,rx}_ring accordingly.
2120 	 */
2121 	ASSERT((ch->ch_last[NR_TX] > 0) ||
2122 	    (ch->ch_na->na_type == NA_NETIF_COMPAT_DEV));
2123 	ASSERT((ch->ch_last[NR_RX] > 0) ||
2124 	    (ch->ch_na->na_type == NA_NETIF_COMPAT_HOST));
2125 	cinfo->cinfo_first_tx_ring = ch->ch_first[NR_TX];
2126 	cinfo->cinfo_last_tx_ring = ch->ch_last[NR_TX] - 1;
2127 	cinfo->cinfo_first_rx_ring = ch->ch_first[NR_RX];
2128 	cinfo->cinfo_last_rx_ring = ch->ch_last[NR_RX] - 1;
2129 	cinfo->cinfo_tx_lowat = chr->cr_tx_lowat;
2130 	cinfo->cinfo_rx_lowat = chr->cr_rx_lowat;
2131 
2132 	if (ch_mode & CHMODE_NO_NXREF) {
2133 		ASSERT(ch_mode & CHMODE_KERNEL);
2134 		STAILQ_INSERT_TAIL(&nx->nx_ch_nonxref_head, ch, ch_link);
2135 	} else {
2136 		STAILQ_INSERT_TAIL(&nx->nx_ch_head, ch, ch_link);
2137 		nx->nx_ch_count++;
2138 	}
2139 	os_atomic_or(&ch->ch_flags, CHANF_ATTACHED, relaxed);
2140 	ch->ch_nexus = nx;
2141 	nx_retain_locked(nx);   /* hold a ref on the nexus */
2142 
2143 	ch_retain_locked(ch);   /* one for being in the list */
2144 	ch_retain_locked(ch);   /* one for the caller */
2145 
2146 	/*
2147 	 * Now that we've successfully created the nexus adapter, inform the
2148 	 * nexus provider about the rings and the slots within each ring.
2149 	 * This is a no-op for internal nexus providers.
2150 	 */
2151 	if ((*err = nxprov_advise_connect(nx, ch, p)) != 0) {
2152 		lck_mtx_unlock(&ch->ch_lock);
2153 
2154 		/* gracefully close this fully-formed channel */
2155 		if (ch->ch_flags & CHANF_KERNEL) {
2156 			ch_close_special(ch);
2157 		} else {
2158 			ch_close(ch, TRUE);
2159 		}
2160 		(void) ch_release_locked(ch);
2161 		ch = NULL;
2162 		goto done;
2163 	}
2164 
2165 	ASSERT(ch->ch_schema == NULL ||
2166 	    (ch->ch_schema->csm_flags & CSM_ACTIVE));
2167 
2168 #if SK_LOG
2169 	if (__improbable(sk_verbose != 0)) {
2170 		ch_connect_log1(nx, cinfo, chr, ch, nxdom_prov, p);
2171 	}
2172 #endif /* SK_LOG */
2173 
2174 done:
2175 	if (ch != NULL) {
2176 		lck_mtx_unlock(&ch->ch_lock);
2177 	}
2178 	if (*err != 0) {
2179 #if SK_LOG
2180 		if (__improbable(sk_verbose != 0)) {
2181 			ch_connect_log2(nx, *err);
2182 		}
2183 #endif /* SK_LOG */
2184 		if (ch != NULL) {
2185 			ch_free(ch);
2186 			ch = NULL;
2187 		}
2188 	}
2189 	return ch;
2190 }
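
/*
 * Editorial note: ch_connect() above has two distinct failure modes.
 * If nxprov_advise_connect() fails, the channel is already attached
 * and fully formed, so it is unwound via ch_close_special()/ch_close()
 * followed by a release; any earlier failure leaves an unattached
 * channel that is simply ch_free()'d at `done'.
 */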
2191 
2192 static void
2193 ch_disconnect(struct kern_channel *ch)
2194 {
2195 	struct kern_nexus *nx = ch->ch_nexus;
2196 	struct kern_nexus_domain_provider *nxdom_prov = NX_DOM_PROV(nx);
2197 
2198 	SK_LOCK_ASSERT_HELD();
2199 	LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
2200 
2201 	/*
2202 	 * Inform the nexus provider that the channel has been quiesced
2203 	 * and disconnected from the nexus port.  This is a no-op for
2204 	 * internal nexus providers.
2205 	 */
2206 	nxprov_advise_disconnect(nx, ch);
2207 
2208 	/* Finally, let the domain provider tear down the instance */
2209 	nxdom_prov->nxdom_prov_dom->nxdom_disconnect(nxdom_prov, nx, ch);
2210 }
2211 
2212 void
2213 ch_deactivate(struct kern_channel *ch)
2214 {
2215 	/*
2216 	 * This is a trapdoor flag; once CSM_ACTIVE is cleared,
2217 	 * it will never be set again.  Doing this will cause
2218 	 * os_channel_is_defunct() to indicate that the channel
2219 	 * is defunct and is no longer usable (thus should be
2220 	 * immediately closed).
2221 	 */
2222 	if (ch->ch_schema != NULL &&
2223 	    (ch->ch_schema->csm_flags & CSM_ACTIVE)) {
2224 		os_atomic_andnot(__DECONST(uint32_t *, &ch->ch_schema->csm_flags),
2225 		    CSM_ACTIVE, relaxed);
2226 		/* make this globally visible */
2227 		os_atomic_thread_fence(seq_cst);
2228 	}
2229 }
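
/*
 * Illustrative sketch (editorial addition): once ch_deactivate() has
 * cleared CSM_ACTIVE, a user-space client would observe the change
 * through the shared schema, e.g. (assuming the os_channel API):
 *
 *	if (os_channel_is_defunct(chd)) {
 *		os_channel_destroy(chd);
 *	}
 */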
2230 
2231 int
2232 ch_set_opt(struct kern_channel *ch, struct sockopt *sopt)
2233 {
2234 #pragma unused(ch)
2235 	int err = 0;
2236 
2237 	if (sopt->sopt_dir != SOPT_SET) {
2238 		sopt->sopt_dir = SOPT_SET;
2239 	}
2240 
2241 	switch (sopt->sopt_name) {
2242 	case CHOPT_TX_LOWAT_THRESH:
2243 		err = ch_set_lowat_thresh(ch, NR_TX, sopt);
2244 		break;
2245 
2246 	case CHOPT_RX_LOWAT_THRESH:
2247 		err = ch_set_lowat_thresh(ch, NR_RX, sopt);
2248 		break;
2249 
2250 	case CHOPT_IF_ADV_CONF:
2251 		err = ch_configure_interface_advisory_event(ch, sopt);
2252 		break;
2253 
2254 	default:
2255 		err = ENOPROTOOPT;
2256 		break;
2257 	}
2258 
2259 	return err;
2260 }
2261 
2262 int
2263 ch_get_opt(struct kern_channel *ch, struct sockopt *sopt)
2264 {
2265 #pragma unused(ch)
2266 	int err = 0;
2267 
2268 	if (sopt->sopt_dir != SOPT_GET) {
2269 		sopt->sopt_dir = SOPT_GET;
2270 	}
2271 
2272 	switch (sopt->sopt_name) {
2273 	case CHOPT_TX_LOWAT_THRESH:
2274 		err = ch_get_lowat_thresh(ch, NR_TX, sopt);
2275 		break;
2276 
2277 	case CHOPT_RX_LOWAT_THRESH:
2278 		err = ch_get_lowat_thresh(ch, NR_RX, sopt);
2279 		break;
2280 
2281 	default:
2282 		err = ENOPROTOOPT;
2283 		break;
2284 	}
2285 
2286 	return err;
2287 }
2288 
2289 static int
2290 ch_configure_interface_advisory_event(struct kern_channel *ch,
2291     struct sockopt *sopt)
2292 {
2293 	int err = 0;
2294 	boolean_t enable = FALSE;
2295 	struct kern_nexus *nx = ch->ch_nexus;
2296 
2297 	LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
2298 	SK_LOCK_ASSERT_NOTHELD();
2299 
2300 	if (sopt->sopt_val == USER_ADDR_NULL) {
2301 		return EINVAL;
2302 	}
2303 	if (nx->nx_adv.nxv_adv == NULL) {
2304 		return ENOTSUP;
2305 	}
2306 	err = sooptcopyin(sopt, &enable, sizeof(enable), sizeof(enable));
2307 	if (err != 0) {
2308 		return err;
2309 	}
2310 
2311 	/*
2312 	 * Drop ch_lock to acquire sk_lock and nx_ch_if_adv_lock, per the
2313 	 * lock ordering requirement; once ch_lock is reacquired, check
2314 	 * whether the channel is closing and bail if so.
2315 	 */
2316 	lck_mtx_unlock(&ch->ch_lock);
2317 	SK_LOCK();
2318 	lck_rw_lock_exclusive(&nx->nx_ch_if_adv_lock);
2319 	lck_mtx_lock(&ch->ch_lock);
2320 	if (ch->ch_flags & CHANF_CLOSING) {
2321 		err = ENXIO;
2322 		goto done;
2323 	}
2324 
2325 	/*
2326 	 * If interface advisory reporting is enabled on the channel, add
2327 	 * it to the list of channels eligible for interface advisory
2328 	 * updates on the nexus.  If disabled, remove it from the list.
2329 	 */
2330 	if (enable) {
2331 		if ((ch->ch_flags & CHANF_IF_ADV) != 0) {
2332 			ASSERT(err == 0);
2333 			goto done;
2334 		}
2335 		bool enable_adv = STAILQ_EMPTY(&nx->nx_ch_if_adv_head);
2336 		os_atomic_or(&ch->ch_flags, CHANF_IF_ADV, relaxed);
2337 		STAILQ_INSERT_TAIL(&nx->nx_ch_if_adv_head, ch, ch_link_if_adv);
2338 		if (enable_adv) {
2339 			nx_netif_config_interface_advisory(nx, true);
2340 		}
2341 		ch_retain_locked(ch);   /* for being in the IF ADV list */
2342 	} else {
2343 		if ((ch->ch_flags & CHANF_IF_ADV) == 0) {
2344 			ASSERT(err == 0);
2345 			goto done;
2346 		}
2347 		STAILQ_REMOVE(&nx->nx_ch_if_adv_head, ch, kern_channel,
2348 		    ch_link_if_adv);
2349 		os_atomic_andnot(&ch->ch_flags, CHANF_IF_ADV, relaxed);
2350 		if (STAILQ_EMPTY(&nx->nx_ch_if_adv_head)) {
2351 			nx_netif_config_interface_advisory(nx, false);
2352 		}
2353 		(void) ch_release_locked(ch);
2354 	}
2355 
2356 done:
2357 	lck_mtx_unlock(&ch->ch_lock);
2358 	lck_rw_done(&nx->nx_ch_if_adv_lock);
2359 	SK_UNLOCK();
2360 	lck_mtx_lock(&ch->ch_lock);
2361 
2362 	return err;
2363 }
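
/*
 * Editorial note: the `done' path above deliberately drops ch_lock
 * before releasing nx_ch_if_adv_lock and SK_LOCK (mirroring the
 * acquisition order used on entry), then reacquires ch_lock, since
 * the caller expects it to remain held across this call.
 */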
2364 
2365 static int
2366 ch_set_lowat_thresh(struct kern_channel *ch, enum txrx t,
2367     struct sockopt *sopt)
2368 {
2369 	struct ch_ev_thresh cet, *ocet;
2370 	int err = 0;
2371 
2372 	LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
2373 
2374 	if (sopt->sopt_val == USER_ADDR_NULL) {
2375 		return EINVAL;
2376 	}
2377 
2378 	bzero(&cet, sizeof(cet));
2379 	err = sooptcopyin(sopt, &cet, sizeof(cet), sizeof(cet));
2380 	if (err == 0) {
2381 		err = ch_ev_thresh_validate(ch->ch_nexus, t, &cet);
2382 		if (err == 0) {
2383 			if (t == NR_TX) {
2384 				ocet = &ch->ch_info->cinfo_tx_lowat;
2385 			} else {
2386 				ocet = &ch->ch_info->cinfo_rx_lowat;
2387 			}
2388 
2389 			/* if there is no change, we're done */
2390 			if (ocet->cet_unit == cet.cet_unit &&
2391 			    ocet->cet_value == cet.cet_value) {
2392 				return 0;
2393 			}
2394 
2395 			*ocet = cet;
2396 
2397 			for_rx_tx(t) {
2398 				ring_id_t qfirst = ch->ch_first[t];
2399 				ring_id_t qlast = ch->ch_last[t];
2400 				uint32_t i;
2401 
2402 				for (i = qfirst; i < qlast; i++) {
2403 					struct __kern_channel_ring *kring =
2404 					    &NAKR(ch->ch_na, t)[i];
2405 
2406 					(void) kring->ckr_na_notify(kring,
2407 					    sopt->sopt_p, 0);
2408 				}
2409 			}
2410 
2411 			(void) sooptcopyout(sopt, &cet, sizeof(cet));
2412 		}
2413 	}
2414 
2415 	return err;
2416 }
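
/*
 * Illustrative example (editorial addition): a SOPT_SET request that
 * lowers the TX low watermark to 4 slots would carry the payload
 *
 *	struct ch_ev_thresh cet = {
 *		.cet_unit  = CHANNEL_THRESHOLD_UNIT_SLOTS,
 *		.cet_value = 4,
 *	};
 *
 * which ch_set_lowat_thresh() validates (clamping into [1, slots - 1]),
 * stores into ch_info, advertises by kicking ckr_na_notify on each of
 * the channel's rings, and finally copies back out, possibly clamped.
 */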
2417 
2418 static int
2419 ch_get_lowat_thresh(struct kern_channel *ch, enum txrx t,
2420     struct sockopt *sopt)
2421 {
2422 	struct ch_ev_thresh cet;
2423 
2424 	LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
2425 
2426 	if (sopt->sopt_val == USER_ADDR_NULL) {
2427 		return EINVAL;
2428 	}
2429 
2430 	if (t == NR_TX) {
2431 		cet = ch->ch_info->cinfo_tx_lowat;
2432 	} else {
2433 		cet = ch->ch_info->cinfo_rx_lowat;
2434 	}
2435 
2436 	return sooptcopyout(sopt, &cet, sizeof(cet));
2437 }
2438 
2439 static struct kern_channel *
2440 ch_alloc(zalloc_flags_t how)
2441 {
2442 	struct kern_channel *ch;
2443 
2444 	ch = zalloc_flags(ch_zone, how | Z_ZERO);
2445 	if (ch) {
2446 		lck_mtx_init(&ch->ch_lock, &channel_lock_group, &channel_lock_attr);
2447 		ch->ch_info = zalloc_flags(ch_info_zone, how | Z_ZERO);
2448 	}
2449 	return ch;
2450 }
2451 
2452 static void
2453 ch_free(struct kern_channel *ch)
2454 {
2455 	ASSERT(ch->ch_refcnt == 0);
2456 	ASSERT(ch->ch_pp == NULL);
2457 	ASSERT(!(ch->ch_flags & (CHANF_ATTACHED | CHANF_EXT_CONNECTED |
2458 	    CHANF_EXT_PRECONNECT | CHANF_IF_ADV)));
2459 	lck_mtx_destroy(&ch->ch_lock, &channel_lock_group);
2460 	SK_DF(SK_VERB_MEM, "ch 0x%llx FREE", SK_KVA(ch));
2461 	ASSERT(ch->ch_info != NULL);
2462 	zfree(ch_info_zone, ch->ch_info);
2463 	ch->ch_info = NULL;
2464 	zfree(ch_zone, ch);
2465 }
2466 
2467 void
2468 ch_retain_locked(struct kern_channel *ch)
2469 {
2470 	SK_LOCK_ASSERT_HELD();
2471 
2472 	ch->ch_refcnt++;
2473 	VERIFY(ch->ch_refcnt != 0);
2474 }
2475 
2476 void
2477 ch_retain(struct kern_channel *ch)
2478 {
2479 	SK_LOCK();
2480 	ch_retain_locked(ch);
2481 	SK_UNLOCK();
2482 }
2483 
2484 int
2485 ch_release_locked(struct kern_channel *ch)
2486 {
2487 	int oldref = ch->ch_refcnt;
2488 
2489 	SK_LOCK_ASSERT_HELD();
2490 
2491 	VERIFY(ch->ch_refcnt != 0);
2492 	if (--ch->ch_refcnt == 0) {
2493 		ch_free(ch);
2494 	}
2495 
2496 	return oldref == 1;
2497 }
2498 
2499 int
2500 ch_release(struct kern_channel *ch)
2501 {
2502 	int lastref;
2503 
2504 	SK_LOCK();
2505 	lastref = ch_release_locked(ch);
2506 	SK_UNLOCK();
2507 
2508 	return lastref;
2509 }
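
/*
 * Editorial note: the retain/release pairs above implement a simple
 * reference count protected by SK_LOCK; ch_dtor() below shows the
 * typical final-owner sequence, closing the channel and then dropping
 * its reference so that the last release ends in ch_free().
 */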
2510 
2511 void
2512 ch_dtor(void *arg)
2513 {
2514 	struct kern_channel *ch = arg;
2515 
2516 	SK_LOCK();
2517 	ch_close(ch, TRUE);
2518 	(void) ch_release_locked(ch);
2519 	SK_UNLOCK();
2520 }
2521