xref: /xnu-8019.80.24/bsd/skywalk/channel/channel.c (revision a325d9c4a84054e40bbe985afedcb50ab80993ea)
1 /*
2  * Copyright (c) 2015-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 /*
30  * Copyright (C) 2012-2014 Matteo Landi, Luigi Rizzo, Giuseppe Lettieri.
31  * All rights reserved.
32  * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
33  *
34  * Redistribution and use in source and binary forms, with or without
35  * modification, are permitted provided that the following conditions
36  * are met:
37  *   1. Redistributions of source code must retain the above copyright
38  *      notice, this list of conditions and the following disclaimer.
39  *   2. Redistributions in binary form must reproduce the above copyright
40  *      notice, this list of conditions and the following disclaimer in the
41  *      documentation and/or other materials provided with the distribution.
42  *
43  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
44  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
45  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
46  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
47  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
48  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
49  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
50  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
51  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
52  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
53  * SUCH DAMAGE.
54  */
55 
56 #include <sys/eventvar.h>
57 #include <sys/kdebug.h>
58 #include <sys/sdt.h>
59 #include <skywalk/os_skywalk_private.h>
60 #include <skywalk/nexus/netif/nx_netif.h>
61 
62 #define KEV_EVTID(code) BSDDBG_CODE(DBG_BSD_KEVENT, (code))
63 
64 struct ch_event_result {
65 	uint32_t tx_data;
66 	uint32_t rx_data;
67 };
68 
69 static LCK_GRP_DECLARE(channel_lock_group, "sk_ch_lock");
70 static LCK_GRP_DECLARE(channel_kn_lock_group, "sk_ch_kn_lock");
71 LCK_ATTR_DECLARE(channel_lock_attr, 0, 0);
72 
73 static void csi_selrecord(struct ch_selinfo *, struct proc *, void *);
74 static void csi_selwakeup(struct ch_selinfo *, boolean_t, boolean_t, uint32_t);
75 static inline void csi_selwakeup_delayed(struct ch_selinfo *);
76 static inline void csi_selwakeup_common(struct ch_selinfo *, boolean_t,
77     boolean_t, boolean_t, uint32_t);
78 static boolean_t csi_tcall_start(struct ch_selinfo *);
79 static void csi_tcall(thread_call_param_t, thread_call_param_t);
80 static uint64_t csi_tcall_update_interval(struct ch_selinfo *);
81 
82 static void ch_redzone_init(void);
83 static void ch_close_common(struct kern_channel *, boolean_t, boolean_t);
84 static struct kern_channel *ch_find(struct kern_nexus *, nexus_port_t,
85     ring_id_t);
86 static int ch_ev_thresh_validate(struct kern_nexus *, enum txrx,
87     struct ch_ev_thresh *);
88 static struct kern_channel *ch_connect(struct kern_nexus *, struct chreq *,
89     struct kern_channel *, struct nxbind *, struct proc *, int, int *);
90 static void ch_disconnect(struct kern_channel *);
91 static int ch_set_lowat_thresh(struct kern_channel *, enum txrx,
92     struct sockopt *);
93 static int ch_get_lowat_thresh(struct kern_channel *, enum txrx,
94     struct sockopt *);
95 static struct kern_channel *ch_alloc(zalloc_flags_t);
96 static void ch_free(struct kern_channel *);
97 static int ch_configure_interface_advisory_event(struct kern_channel *ch,
98     struct sockopt *sopt);
99 
100 static int filt_chrwattach(struct knote *, struct kevent_qos_s *kev);
101 static void filt_chrwdetach(struct knote *, boolean_t);
102 static void filt_chrdetach(struct knote *);
103 static void filt_chwdetach(struct knote *);
104 static int filt_chrw(struct knote *, long, int);
105 static int filt_chread(struct knote *, long);
106 static int filt_chwrite(struct knote *, long);
107 
108 static int filt_chtouch(struct knote *, struct kevent_qos_s *, int);
109 static int filt_chrtouch(struct knote *, struct kevent_qos_s *);
110 static int filt_chwtouch(struct knote *, struct kevent_qos_s *);
111 static int filt_chprocess(struct knote *, struct kevent_qos_s *, int);
112 static int filt_chrprocess(struct knote *, struct kevent_qos_s *);
113 static int filt_chwprocess(struct knote *, struct kevent_qos_s *);
114 static int filt_che_attach(struct knote *, struct kevent_qos_s *kev);
115 static void filt_che_detach(struct knote *);
116 static int filt_che_event(struct knote *, long);
117 static int filt_che_touch(struct knote *, struct kevent_qos_s *);
118 static int filt_che_process(struct knote *, struct kevent_qos_s *);
119 static int filt_chan_extended_common(struct knote *, long);
120 
121 static int ch_event(struct kern_channel *ch, int events,
122     void *wql, struct proc *p, struct ch_event_result *,
123     const boolean_t is_kevent, int *errno, const boolean_t);
124 
125 const struct filterops skywalk_channel_rfiltops = {
126 	.f_isfd =       1,
127 	.f_attach =     filt_chrwattach,
128 	.f_detach =     filt_chrdetach,
129 	.f_event =      filt_chread,
130 	.f_touch =      filt_chrtouch,
131 	.f_process =    filt_chrprocess,
132 };
133 
134 const struct filterops skywalk_channel_wfiltops = {
135 	.f_isfd =       1,
136 	.f_attach =     filt_chrwattach,
137 	.f_detach =     filt_chwdetach,
138 	.f_event =      filt_chwrite,
139 	.f_touch =      filt_chwtouch,
140 	.f_process =    filt_chwprocess,
141 };
142 
143 const struct filterops skywalk_channel_efiltops = {
144 	.f_isfd =       1,
145 	.f_attach =     filt_che_attach,
146 	.f_detach =     filt_che_detach,
147 	.f_event =      filt_che_event,
148 	.f_touch =      filt_che_touch,
149 	.f_process =    filt_che_process,
150 };
151 
152 /* mitigation intervals in ns */
153 #define CH_MIT_IVAL_MIN         NSEC_PER_USEC
154 
155 static uint64_t ch_mit_ival = CH_MIT_IVAL_DEFAULT;
156 
157 #if (DEVELOPMENT || DEBUG)
158 SYSCTL_NODE(_kern_skywalk, OID_AUTO, channel,
159     CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Skywalk channel parameters");
160 SYSCTL_QUAD(_kern_skywalk_channel, OID_AUTO, mit_ival,
161     CTLFLAG_RW | CTLFLAG_LOCKED, &ch_mit_ival, "");
162 #endif /* !DEVELOPMENT && !DEBUG */
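/*
 * ch_mit_ival above acts as a global override for the per-channel
 * notification mitigation interval (see csi_init() and
 * csi_tcall_update_interval() below).  On DEVELOPMENT/DEBUG kernels it
 * is exposed as the "kern.skywalk.channel.mit_ival" sysctl declared
 * above.  A minimal userspace sketch for reading it via
 * sysctlbyname(3), assuming such a kernel:
 *
 *	#include <stdint.h>
 *	#include <stdio.h>
 *	#include <sys/sysctl.h>
 *
 *	uint64_t ival = 0;
 *	size_t len = sizeof(ival);
 *	if (sysctlbyname("kern.skywalk.channel.mit_ival", &ival, &len,
 *	    NULL, 0) == 0) {
 *		printf("channel mitigation interval: %llu ns\n", ival);
 *	}
 */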
163 
164 static ZONE_DECLARE(ch_zone, SKMEM_ZONE_PREFIX ".ch",
165     sizeof(struct kern_channel), ZC_ZFREE_CLEARMEM);
166 
167 static ZONE_DECLARE(ch_info_zone, SKMEM_ZONE_PREFIX ".ch.info",
168     sizeof(struct ch_info), ZC_ZFREE_CLEARMEM);
169 
170 static int __ch_inited = 0;
171 
172 /*
173  * Global cookie to hold the random number used for detecting
174  * user metadata red zone violations.
175  */
176 uint64_t __ch_umd_redzone_cookie = 0;
177 
178 #define SKMEM_TAG_CH_KEY        "com.apple.skywalk.channel.key"
179 kern_allocation_name_t skmem_tag_ch_key;
180 
181 static void
182 ch_redzone_init(void)
183 {
184 	_CASSERT(sizeof(__ch_umd_redzone_cookie) ==
185 	    sizeof(((struct __metadata_preamble *)0)->mdp_redzone));
186 	_CASSERT(METADATA_PREAMBLE_SZ == sizeof(struct __metadata_preamble));
187 	_CASSERT(sizeof(struct __slot_desc) == 8);
188 
189 	/* Initialize random user red zone cookie values */
190 	do {
191 		read_random(&__ch_umd_redzone_cookie,
192 		    sizeof(__ch_umd_redzone_cookie));
193 	} while (__ch_umd_redzone_cookie == 0);
194 
195 	SK_D("__ch_umd_redzone_cookie: 0x%llx", __ch_umd_redzone_cookie);
196 }
197 
198 int
199 channel_init(void)
200 {
201 	int error = 0;
202 
203 	SK_LOCK_ASSERT_HELD();
204 	ASSERT(!__ch_inited);
205 
206 	_CASSERT(offsetof(struct __user_packet, pkt_qum) == 0);
207 	_CASSERT(offsetof(struct __kern_packet, pkt_qum) == 0);
208 
209 	ch_redzone_init();
210 
211 	ASSERT(skmem_tag_ch_key == NULL);
212 	skmem_tag_ch_key = kern_allocation_name_allocate(SKMEM_TAG_CH_KEY, 0);
213 	ASSERT(skmem_tag_ch_key != NULL);
214 
215 	__ch_inited = 1;
216 
217 	return error;
218 }
219 
220 void
221 channel_fini(void)
222 {
223 	SK_LOCK_ASSERT_HELD();
224 
225 	if (__ch_inited) {
226 		if (skmem_tag_ch_key != NULL) {
227 			kern_allocation_name_release(skmem_tag_ch_key);
228 			skmem_tag_ch_key = NULL;
229 		}
230 
231 		__ch_umd_redzone_cookie = 0;
232 		__ch_inited = 0;
233 	}
234 }
235 
236 void
237 csi_init(struct ch_selinfo *csi, boolean_t mitigation, uint64_t mit_ival)
238 {
239 	csi->csi_flags = 0;
240 	csi->csi_pending = 0;
241 	if (mitigation) {
242 		csi->csi_interval = mit_ival;
243 		csi->csi_eff_interval = ch_mit_ival;    /* global override */
244 		atomic_bitset_32(&csi->csi_flags, CSI_MITIGATION);
245 		csi->csi_tcall = thread_call_allocate_with_options(csi_tcall,
246 		    csi, THREAD_CALL_PRIORITY_KERNEL, THREAD_CALL_OPTIONS_ONCE);
247 		/* this must not fail */
248 		VERIFY(csi->csi_tcall != NULL);
249 	} else {
250 		csi->csi_interval = 0;
251 		csi->csi_eff_interval = 0;
252 		csi->csi_tcall = NULL;
253 	}
254 	lck_mtx_init(&csi->csi_lock, &channel_kn_lock_group, &channel_lock_attr);
255 	klist_init(&csi->csi_si.si_note);
256 }
257 
258 void
259 csi_destroy(struct ch_selinfo *csi)
260 {
261 	/* if not already destroyed, do it now */
262 	if ((atomic_bitset_32_ov(&csi->csi_flags, CSI_DESTROYED) &
263 	    CSI_DESTROYED) == 0) {
264 		CSI_LOCK(csi);
265 		/* must have been set by above atomic op */
266 		VERIFY(csi->csi_flags & CSI_DESTROYED);
267 		if (csi->csi_flags & CSI_MITIGATION) {
268 			thread_call_t tcall = csi->csi_tcall;
269 			VERIFY(tcall != NULL);
270 			CSI_UNLOCK(csi);
271 
272 			(void) thread_call_cancel_wait(tcall);
273 			if (!thread_call_free(tcall)) {
274 				boolean_t freed;
275 				(void) thread_call_cancel_wait(tcall);
276 				freed = thread_call_free(tcall);
277 				VERIFY(freed);
278 			}
279 
280 			CSI_LOCK(csi);
281 			csi->csi_tcall = NULL;
282 			atomic_bitclear_32(&csi->csi_flags, CSI_MITIGATION);
283 		}
284 		csi->csi_pending = 0;
285 		CSI_UNLOCK(csi);
286 
287 		selthreadclear(&csi->csi_si);
288 		/* now we don't need the mutex anymore */
289 		lck_mtx_destroy(&csi->csi_lock, &channel_kn_lock_group);
290 	}
291 }
292 
293 /*
294  * Called only for select(2).
295  */
296 __attribute__((always_inline))
297 static inline void
298 csi_selrecord(struct ch_selinfo *csi, struct proc *p, void *wql)
299 {
300 	struct selinfo *si = &csi->csi_si;
301 
302 	CSI_LOCK_ASSERT_HELD(csi);
303 	selrecord(p, si, wql);
304 }
305 
306 void
307 csi_selrecord_one(struct __kern_channel_ring *kring, struct proc *p, void *wql)
308 {
309 	struct ch_selinfo *csi = &kring->ckr_si;
310 
311 	CSI_LOCK(csi);
312 	SK_DF(SK_VERB_EVENTS, "[%s] na \"%s\" (0x%llx) kr %s (0x%llx) "
313 	    "si 0x%llx si_flags 0x%x", (kring->ckr_tx == NR_TX) ? "W" : "R",
314 	    KRNA(kring)->na_name, SK_KVA(KRNA(kring)), kring->ckr_name,
315 	    SK_KVA(kring), SK_KVA(&csi->csi_si), csi->csi_si.si_flags);
316 
317 	csi_selrecord(csi, p, wql);
318 	CSI_UNLOCK(csi);
319 }
320 
321 void
322 csi_selrecord_all(struct nexus_adapter *na, enum txrx t, struct proc *p,
323     void *wql)
324 {
325 	struct ch_selinfo *csi = &na->na_si[t];
326 
327 	CSI_LOCK(csi);
328 	SK_DF(SK_VERB_EVENTS, "[%s] na \"%s\" (0x%llx) si 0x%llx si_flags 0x%x",
329 	    (t == NR_TX) ? "W" : "R", na->na_name, SK_KVA(na),
330 	    SK_KVA(&csi->csi_si), csi->csi_si.si_flags);
331 
332 	csi_selrecord(csi, p, wql);
333 	CSI_UNLOCK(csi);
334 }
335 
336 /*
337  * Called from na_post_event().
338  */
339 __attribute__((always_inline))
340 static inline void
341 csi_selwakeup(struct ch_selinfo *csi, boolean_t within_kevent,
342     boolean_t selwake, uint32_t hint)
343 {
344 	struct selinfo *si = &csi->csi_si;
345 
346 	CSI_LOCK_ASSERT_HELD(csi);
347 	csi->csi_pending = 0;
348 	if (selwake) {
349 		selwakeup(si);
350 	}
351 	if ((csi->csi_flags & CSI_KNOTE) && !within_kevent) {
352 		KNOTE(&si->si_note, hint);
353 	}
354 }
355 
356 __attribute__((always_inline))
357 static inline void
358 csi_selwakeup_delayed(struct ch_selinfo *csi)
359 {
360 	CSI_LOCK_ASSERT_HELD(csi);
361 	ASSERT(csi->csi_flags & CSI_MITIGATION);
362 	ASSERT(csi->csi_tcall != NULL);
363 
364 	if (thread_call_isactive(csi->csi_tcall)) {
365 		csi->csi_pending++;
366 	} else if (!csi_tcall_start(csi)) {
367 		csi_selwakeup(csi, FALSE, FALSE, 0);
368 	}
369 }
370 
371 __attribute__((always_inline))
372 static inline void
373 csi_selwakeup_common(struct ch_selinfo *csi, boolean_t nodelay,
374     boolean_t within_kevent, boolean_t selwake, uint32_t hint)
375 {
376 	CSI_LOCK_ASSERT_HELD(csi);
377 
378 	if (nodelay || within_kevent || !selwake || hint != 0 ||
379 	    !(csi->csi_flags & CSI_MITIGATION)) {
380 		csi_selwakeup(csi, within_kevent, selwake, hint);
381 	} else {
382 		csi_selwakeup_delayed(csi);
383 	}
384 }
385 
386 void
387 csi_selwakeup_one(struct __kern_channel_ring *kring, boolean_t nodelay,
388     boolean_t within_kevent, boolean_t selwake, uint32_t hint)
389 {
390 	struct ch_selinfo *csi = &kring->ckr_si;
391 
392 	CSI_LOCK(csi);
393 	SK_DF(SK_VERB_EVENTS, "[%s] na \"%s\" (0x%llx) kr %s (0x%llx) "
394 	    "si 0x%llx si_flags 0x%x nodelay %u kev %u sel %u hint 0x%b",
395 	    (kring->ckr_tx == NR_TX) ? "W" : "R", KRNA(kring)->na_name,
396 	    SK_KVA(KRNA(kring)), kring->ckr_name, SK_KVA(kring),
397 	    SK_KVA(&csi->csi_si), csi->csi_si.si_flags, nodelay,
398 	    within_kevent, selwake, hint, CHAN_FILT_HINT_BITS);
399 
400 	csi_selwakeup_common(csi, nodelay, within_kevent, selwake, hint);
401 	CSI_UNLOCK(csi);
402 }
403 
404 void
405 csi_selwakeup_all(struct nexus_adapter *na, enum txrx t, boolean_t nodelay,
406     boolean_t within_kevent, boolean_t selwake, uint32_t hint)
407 {
408 	struct ch_selinfo *csi = &na->na_si[t];
409 
410 	CSI_LOCK(csi);
411 	SK_DF(SK_VERB_EVENTS, "[%s] na \"%s\" (0x%llx) si 0x%llx "
412 	    "si_flags 0x%x nodelay %u kev %u sel %u hint 0x%b",
413 	    (t == NR_TX) ? "W" : "R", na->na_name, SK_KVA(na),
414 	    SK_KVA(&csi->csi_si), csi->csi_si.si_flags, nodelay,
415 	    within_kevent, selwake, hint, CHAN_FILT_HINT_BITS);
416 
417 	switch (t) {
418 	case NR_RX:
419 		if (!(na->na_flags & NAF_RX_MITIGATION)) {
420 			nodelay = TRUE;
421 		}
422 		break;
423 
424 	case NR_TX:
425 		if (!(na->na_flags & NAF_TX_MITIGATION)) {
426 			nodelay = TRUE;
427 		}
428 		break;
429 
430 	default:
431 		nodelay = TRUE;
432 		break;
433 	}
434 	csi_selwakeup_common(csi, nodelay, within_kevent, selwake, hint);
435 	CSI_UNLOCK(csi);
436 }
437 
438 static boolean_t
439 csi_tcall_start(struct ch_selinfo *csi)
440 {
441 	uint64_t now, ival, deadline;
442 
443 	CSI_LOCK_ASSERT_HELD(csi);
444 	ASSERT(csi->csi_flags & CSI_MITIGATION);
445 	ASSERT(csi->csi_tcall != NULL);
446 
447 	/* pick up latest value */
448 	ival = csi_tcall_update_interval(csi);
449 
450 	/* if no mitigation, pass notification up now */
451 	if (__improbable(ival == 0)) {
452 		return FALSE;
453 	}
454 
455 	deadline = now = mach_absolute_time();
456 	clock_deadline_for_periodic_event(ival, now, &deadline);
457 	(void) thread_call_enter_delayed(csi->csi_tcall, deadline);
458 
459 	return TRUE;
460 }
461 
462 static void
463 csi_tcall(thread_call_param_t arg0, thread_call_param_t arg1)
464 {
465 #pragma unused(arg1)
466 	struct ch_selinfo *csi = arg0;
467 
468 	CSI_LOCK(csi);
469 	csi_selwakeup(csi, FALSE, FALSE, 0);
470 	CSI_UNLOCK(csi);
471 
472 	CSI_LOCK(csi);
473 	if (__improbable((csi->csi_flags & CSI_DESTROYED) == 0 &&
474 	    csi->csi_pending != 0 && !csi_tcall_start(csi))) {
475 		csi_selwakeup(csi, FALSE, FALSE, 0);
476 	}
477 	CSI_UNLOCK(csi);
478 }
479 
480 __attribute__((always_inline))
481 static inline uint64_t
482 csi_tcall_update_interval(struct ch_selinfo *csi)
483 {
484 	uint64_t i = ch_mit_ival;
485 
486 	/* if global override was adjusted, update local copies */
487 	if (__improbable(csi->csi_eff_interval != i)) {
488 		ASSERT(csi->csi_flags & CSI_MITIGATION);
489 		csi->csi_interval = csi->csi_eff_interval =
490 		    ((i == 0) ? 0 : MAX(i, CH_MIT_IVAL_MIN));
491 	}
492 
493 	return csi->csi_interval;
494 }
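/*
 * Taken together, csi_selwakeup_delayed(), csi_tcall_start() and
 * csi_tcall() coalesce wakeups on mitigation-enabled selinfos: while
 * the thread call is pending, additional notifications only bump
 * csi_pending, and a single selwakeup/KNOTE is delivered when the call
 * fires.  As a rough illustration (interval value assumed, not taken
 * from this file): with ch_mit_ival set to 200 * NSEC_PER_USEC, a burst
 * of notifications landing within a 200us window collapses into one
 * deferred wakeup at the end of the window; with ch_mit_ival set to 0,
 * csi_tcall_start() returns FALSE and the wakeup is delivered inline
 * instead.
 */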
495 
496 /* set EV_EOF on the knote and return TRUE if the channel is defunct */
497 static inline boolean_t
498 ch_filt_check_defunct(struct kern_channel *ch, struct knote *kn)
499 {
500 	if (__improbable((ch->ch_flags & CHANF_DEFUNCT) != 0)) {
501 		if (kn) {
502 			kn->kn_flags |= EV_EOF;
503 		}
504 		return TRUE;
505 	}
506 	return FALSE;
507 }
508 
509 static void
510 filt_chrwdetach(struct knote *kn, boolean_t write)
511 {
512 	struct kern_channel *ch = (struct kern_channel *)kn->kn_hook;
513 	struct ch_selinfo *csi;
514 	struct selinfo *si;
515 
516 	lck_mtx_lock(&ch->ch_lock);
517 	csi = ch->ch_si[write ? NR_TX : NR_RX];
518 	si = &csi->csi_si;
519 
520 	CSI_LOCK(csi);
521 	SK_DF(SK_VERB_EVENTS, "na \"%s\" (0x%llx) ch 0x%llx kn 0x%llx (%s%s) "
522 	    "si_flags 0x%x", ch->ch_na->na_name, SK_KVA(ch->ch_na),
523 	    SK_KVA(ch), SK_KVA(kn), (kn->kn_flags & EV_POLL) ? "poll," : "",
524 	    write ? "write" : "read", si->si_flags);
525 
526 	if (KNOTE_DETACH(&si->si_note, kn)) {
527 		atomic_bitclear_32(&csi->csi_flags, CSI_KNOTE);
528 	}
529 
530 	CSI_UNLOCK(csi);
531 	lck_mtx_unlock(&ch->ch_lock);
532 }
533 
534 static void
535 filt_chrdetach(struct knote *kn)
536 {
537 	ASSERT(kn->kn_filter == EVFILT_READ);
538 	filt_chrwdetach(kn, FALSE);
539 }
540 
541 static void
542 filt_chwdetach(struct knote *kn)
543 {
544 	ASSERT(kn->kn_filter == EVFILT_WRITE);
545 	filt_chrwdetach(kn, TRUE);
546 }
547 
548 /*
549  * Callback from notify routines (generated externally).
550  * This always marks the knote as activated, so always
551  * return 1.
552  */
553 static int
554 filt_chrw(struct knote *kn, long hint, int events)
555 {
556 #if SK_LOG
557 	struct kern_channel *ch = kn->kn_hook;
558 #else
559 #pragma unused(kn)
560 #pragma unused(hint)
561 #pragma unused(events)
562 #endif
563 	SK_DF(SK_VERB_EVENTS, "na \"%s\" (0x%llx) ch 0x%llx "
564 	    "kn 0x%llx (%s%s) hint 0x%x", ch->ch_na->na_name,
565 	    SK_KVA(ch->ch_na), SK_KVA(ch), SK_KVA(kn),
566 	    (kn->kn_flags & EV_POLL) ? "poll," : "",
567 	    (events == POLLOUT) ?  "write" : "read",
568 	    (uint32_t)hint);
569 
570 	/* assume we are ready */
571 	return 1;
572 }
573 
574 static int
575 filt_chread(struct knote *kn, long hint)
576 {
577 	ASSERT(kn->kn_filter == EVFILT_READ);
578 	/* There is no hint for read/write event */
579 	if (hint != 0) {
580 		return 0;
581 	}
582 	return filt_chrw(kn, hint, POLLIN);
583 }
584 
585 static int
586 filt_chwrite(struct knote *kn, long hint)
587 {
588 	ASSERT(kn->kn_filter == EVFILT_WRITE);
589 	/* There is no hint for read/write event */
590 	if (hint != 0) {
591 		return 0;
592 	}
593 	return filt_chrw(kn, hint, POLLOUT);
594 }
595 
596 static int
597 filt_chtouch(struct knote *kn, struct kevent_qos_s *kev, int events)
598 {
599 #pragma unused(kev)
600 	struct kern_channel *ch = kn->kn_hook;
601 	int ev = kn->kn_filter;
602 	enum txrx dir = (ev == EVFILT_WRITE) ? NR_TX : NR_RX;
603 	int event_error = 0;
604 	int revents;
605 
606 	/* save off the new input fflags and data */
607 	kn->kn_sfflags = kev->fflags;
608 	kn->kn_sdata = kev->data;
609 
610 	lck_mtx_lock(&ch->ch_lock);
611 	if (__improbable(ch_filt_check_defunct(ch, kn))) {
612 		lck_mtx_unlock(&ch->ch_lock);
613 		return 1;
614 	}
615 
616 	/* if a note-specific low watermark is given, validate it */
617 	if (kn->kn_sfflags & NOTE_LOWAT) {
618 		struct ch_ev_thresh note_thresh = {
619 			.cet_unit = (dir == NR_TX) ?
620 		    ch->ch_info->cinfo_tx_lowat.cet_unit :
621 		    ch->ch_info->cinfo_rx_lowat.cet_unit,
622 			.cet_value = (uint32_t)kn->kn_sdata
623 		};
624 		if (ch_ev_thresh_validate(ch->ch_na->na_nx, dir,
625 		    &note_thresh) != 0) {
626 			SK_ERR("invalid NOTE_LOWAT threshold %u",
627 			    note_thresh.cet_value);
628 			knote_set_error(kn, EINVAL);
629 			lck_mtx_unlock(&ch->ch_lock);
630 			return 1;
631 		}
632 	}
633 
634 	/* capture new state just so we can return it */
635 	revents = ch_event(ch, events, NULL, knote_get_kq(kn)->kq_p, NULL, TRUE,
636 	    &event_error, FALSE);
637 	lck_mtx_unlock(&ch->ch_lock);
638 
639 	if (revents & POLLERR) {
640 		ASSERT(event_error != 0);
641 		/*
642 		 * Setting a knote error here will confuse libdispatch, so we
643 		 * use EV_EOF instead.
644 		 */
645 		kn->kn_flags |= EV_EOF;
646 		return 1;
647 	} else {
648 		return (events & revents) != 0;
649 	}
650 }
651 
652 static int
653 filt_chrtouch(struct knote *kn, struct kevent_qos_s *kev)
654 {
655 	ASSERT(kn->kn_filter == EVFILT_READ);
656 
657 	if (kev->flags & EV_ENABLE) {
658 		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KNOTE_ENABLE),
659 		    kn->kn_udata, kn->kn_status | (kn->kn_id << 32),
660 		    kn->kn_filtid, VM_KERNEL_UNSLIDE_OR_PERM(
661 			    ((struct kern_channel *)kn->kn_hook)->ch_na));
662 	}
663 
664 	return filt_chtouch(kn, kev, POLLIN);
665 }
666 
667 static int
668 filt_chwtouch(struct knote *kn, struct kevent_qos_s *kev)
669 {
670 	ASSERT(kn->kn_filter == EVFILT_WRITE);
671 	return filt_chtouch(kn, kev, POLLOUT);
672 }
673 
674 
675 /*
676  * Called from kevent.  We call ch_event(POLL[IN|OUT]) and
677  * return 0/1 accordingly.
678  */
679 static int
680 filt_chprocess(struct knote *kn, struct kevent_qos_s *kev, int events)
681 {
682 	struct kern_channel *ch = kn->kn_hook;
683 	struct ch_event_result result;
684 	uint32_t lowat;
685 	int trigger_event = 1;
686 	int revents;
687 	int event_error;
688 	int64_t data;
689 
690 	lck_mtx_lock(&ch->ch_lock);
691 	if (__improbable(ch_filt_check_defunct(ch, kn))) {
692 		knote_fill_kevent(kn, kev, 0);
693 		lck_mtx_unlock(&ch->ch_lock);
694 		return 1;
695 	}
696 
697 	revents = ch_event(ch, events, NULL, knote_get_kq(kn)->kq_p, &result,
698 	    TRUE, &event_error, FALSE);
699 
700 	if (revents & POLLERR) {
701 		ASSERT(event_error != 0);
702 		lck_mtx_unlock(&ch->ch_lock);
703 		/*
704 		 * Setting a knote error here will confuse libdispatch, so we
705 		 * use EV_EOF instead.
706 		 */
707 		kn->kn_flags |= EV_EOF;
708 		knote_fill_kevent_with_sdata(kn, kev);
709 		return 1;
710 	}
711 
712 	trigger_event = (events & revents) != 0;
713 
714 	if (events == POLLOUT) {
715 		lowat = ch->ch_info->cinfo_tx_lowat.cet_value;
716 		if ((kn->kn_sfflags & NOTE_LOWAT) &&
717 		    kn->kn_sdata > lowat) {
718 			lowat = (uint32_t)kn->kn_sdata;
719 		}
720 
721 		data = result.tx_data;
722 
723 		if (result.tx_data < lowat) {
724 			trigger_event = 0;
725 		}
726 	} else {
727 		lowat = ch->ch_info->cinfo_rx_lowat.cet_value;
728 		if ((kn->kn_sfflags & NOTE_LOWAT) &&
729 		    kn->kn_sdata > lowat) {
730 			lowat = (uint32_t)kn->kn_sdata;
731 		}
732 
733 		data = result.rx_data;
734 
735 		if (result.rx_data < lowat) {
736 			trigger_event = 0;
737 		}
738 	}
739 
740 	if (trigger_event) {
741 		knote_fill_kevent(kn, kev, data);
742 	}
743 
744 	lck_mtx_unlock(&ch->ch_lock);
745 
746 	return trigger_event;
747 }
748 
749 static int
750 filt_chrprocess(struct knote *kn, struct kevent_qos_s *kev)
751 {
752 	ASSERT(kn->kn_filter == EVFILT_READ);
753 	return filt_chprocess(kn, kev, POLLIN);
754 }
755 
756 static int
757 filt_chwprocess(struct knote *kn, struct kevent_qos_s *kev)
758 {
759 	ASSERT(kn->kn_filter == EVFILT_WRITE);
760 	return filt_chprocess(kn, kev, POLLOUT);
761 }
762 
763 static int
764 filt_chrwattach(struct knote *kn, __unused struct kevent_qos_s *kev)
765 {
766 	struct kern_channel *ch = (struct kern_channel *)kn->kn_hook;
767 	struct nexus_adapter *na;
768 	struct ch_selinfo *csi;
769 	int ev = kn->kn_filter;
770 	enum txrx dir = (ev == EVFILT_WRITE) ? NR_TX : NR_RX;
771 	int revents;
772 	int events;
773 	int event_error = 0;
774 
775 	ASSERT((kn->kn_filter == EVFILT_READ) ||
776 	    (kn->kn_filter == EVFILT_WRITE));
777 
778 	/* ch_kqfilter() should have acquired the lock */
779 	LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
780 
781 	na = ch->ch_na;
782 	/* if a note-specific low watermark is given, validate it */
783 	if (kn->kn_sfflags & NOTE_LOWAT) {
784 		struct ch_ev_thresh note_thresh = {
785 			.cet_unit = (dir == NR_TX) ?
786 		    ch->ch_info->cinfo_tx_lowat.cet_unit :
787 		    ch->ch_info->cinfo_rx_lowat.cet_unit,
788 			.cet_value = (uint32_t)kn->kn_sdata
789 		};
790 		if (ch_ev_thresh_validate(ch->ch_na->na_nx, dir,
791 		    &note_thresh) != 0) {
792 			SK_ERR("invalid NOTE_LOWAT threshold %u",
793 			    note_thresh.cet_value);
794 			knote_set_error(kn, EINVAL);
795 			return 0;
796 		}
797 	}
798 
799 	/* the si is indicated in the channel */
800 	csi = ch->ch_si[dir];
801 	CSI_LOCK(csi);
802 
803 	if (KNOTE_ATTACH(&csi->csi_si.si_note, kn)) {
804 		atomic_bitset_32(&csi->csi_flags, CSI_KNOTE);
805 	}
806 
807 	CSI_UNLOCK(csi);
808 
809 	SK_DF(SK_VERB_EVENTS, "na \"%s\" (0x%llx) ch 0x%llx kn 0x%llx (%s%s)",
810 	    na->na_name, SK_KVA(na), SK_KVA(ch), SK_KVA(kn),
811 	    (kn->kn_flags & EV_POLL) ? "poll," : "",
812 	    (ev == EVFILT_WRITE) ?  "write" : "read");
813 
814 	/* capture current state */
815 	events = (ev == EVFILT_WRITE) ? POLLOUT : POLLIN;
816 
817 	if (__improbable(ch_filt_check_defunct(ch, kn))) {
818 		revents = events;
819 	} else {
820 		/* filt_chprocess() will fill in the kn_sdata field */
821 		revents = ch_event(ch, events, NULL, knote_get_kq(kn)->kq_p,
822 		    NULL, TRUE, &event_error, FALSE);
823 	}
824 
825 	if (revents & POLLERR) {
826 		ASSERT(event_error != 0);
827 		kn->kn_flags |= EV_EOF;
828 		return 1;
829 	} else {
830 		return (events & revents) != 0;
831 	}
832 }
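/*
 * The read/write filters above are reached through kevent(2) on a
 * channel file descriptor.  A minimal userspace sketch of arming an
 * EVFILT_READ knote with a NOTE_LOWAT threshold (ch_fd and the
 * threshold value 128 are assumed for illustration); the threshold is
 * validated against the ring by ch_ev_thresh_validate() in the
 * attach/touch paths, and filt_chprocess() reports the ready amount in
 * the returned kevent's data field:
 *
 *	#include <sys/event.h>
 *
 *	int kq = kqueue();
 *	struct kevent kev;
 *	EV_SET(&kev, ch_fd, EVFILT_READ, EV_ADD | EV_ENABLE,
 *	    NOTE_LOWAT, 128, NULL);
 *	(void) kevent(kq, &kev, 1, NULL, 0, NULL);
 */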
833 
834 static int
835 filt_chan_extended_common(struct knote *kn, long ev_hint)
836 {
837 	/*
838 	 * This function is not always called with the same set of locks held,
839 	 * hence it is only allowed to manipulate kn_fflags, and only with atomics.
840 	 *
841 	 * The f_event / f_process functions may run concurrently.
842 	 */
843 	uint32_t add_fflags = 0;
844 
845 	if ((ev_hint & CHAN_FILT_HINT_FLOW_ADV_UPD) != 0) {
846 		add_fflags |= NOTE_FLOW_ADV_UPDATE;
847 	}
848 	if ((ev_hint & CHAN_FILT_HINT_CHANNEL_EVENT) != 0) {
849 		add_fflags |= NOTE_CHANNEL_EVENT;
850 	}
851 	if ((ev_hint & CHAN_FILT_HINT_IF_ADV_UPD) != 0) {
852 		add_fflags |= NOTE_IF_ADV_UPD;
853 	}
854 	if (add_fflags) {
855 		/* Reset any events that are not requested on this knote */
856 		add_fflags &= (kn->kn_sfflags & EVFILT_NW_CHANNEL_ALL_MASK);
857 		os_atomic_or(&kn->kn_fflags, add_fflags, relaxed);
858 		return add_fflags != 0;
859 	}
860 	return os_atomic_load(&kn->kn_fflags, relaxed) != 0;
861 }
862 
863 static inline void
864 che_process_channel_event(struct kern_channel *ch, struct knote *kn,
865     uint32_t fflags, long *hint)
866 {
867 	int revents, event_error = 0;
868 
869 	LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
870 	*hint &= ~CHAN_FILT_HINT_CHANNEL_EVENT;
871 
872 	if (((ch->ch_flags & CHANF_EVENT_RING) != 0) &&
873 	    ((fflags & NOTE_CHANNEL_EVENT) != 0)) {
874 		/* capture new state to return */
875 		revents = ch_event(ch, POLLIN, NULL, knote_get_kq(kn)->kq_p,
876 		    NULL, TRUE, &event_error, TRUE);
877 		if (revents & POLLERR) {
878 			ASSERT(event_error != 0);
879 			/*
880 			 * Setting a knote error here will confuse libdispatch,
881 			 * so we use EV_EOF instead.
882 			 */
883 			kn->kn_flags |= EV_EOF;
884 		} else if ((revents & POLLIN) != 0) {
885 			*hint |= CHAN_FILT_HINT_CHANNEL_EVENT;
886 		}
887 	}
888 	/*
889 	 * If the sync operation on the event ring didn't find any events,
890 	 * then indicate that the channel event is not active.
891 	 */
892 	if ((*hint & CHAN_FILT_HINT_CHANNEL_EVENT) == 0) {
893 		/*
894 		 * Avoid a costly atomic when the bit is already cleared.
895 		 */
896 		uint32_t knfflags = os_atomic_load(&kn->kn_fflags, relaxed);
897 		if (knfflags & CHAN_FILT_HINT_CHANNEL_EVENT) {
898 			os_atomic_andnot(&kn->kn_fflags,
899 			    CHAN_FILT_HINT_CHANNEL_EVENT, relaxed);
900 		}
901 	}
902 }
903 
904 static int
905 filt_che_attach(struct knote *kn, __unused struct kevent_qos_s *kev)
906 {
907 	struct kern_channel *ch = (struct kern_channel *)kn->kn_hook;
908 	struct ch_selinfo *csi;
909 	long hint = 0;
910 
911 	_CASSERT(CHAN_FILT_HINT_FLOW_ADV_UPD == NOTE_FLOW_ADV_UPDATE);
912 	_CASSERT(CHAN_FILT_HINT_CHANNEL_EVENT == NOTE_CHANNEL_EVENT);
913 	_CASSERT(CHAN_FILT_HINT_IF_ADV_UPD == NOTE_IF_ADV_UPD);
914 
915 	ASSERT(kn->kn_filter == EVFILT_NW_CHANNEL);
916 
917 	/* ch_kqfilter() should have acquired the lock */
918 	LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
919 
920 	csi = ch->ch_si[NR_TX];
921 	CSI_LOCK(csi);
922 	if (KNOTE_ATTACH(&csi->csi_si.si_note, kn)) {
923 		atomic_bitset_32(&csi->csi_flags, CSI_KNOTE);
924 	}
925 	CSI_UNLOCK(csi);
926 
927 	if (__improbable(ch_filt_check_defunct(ch, kn))) {
928 		return 1;
929 	}
930 	if ((kn->kn_sfflags & NOTE_CHANNEL_EVENT) != 0) {
931 		atomic_bitset_32(&ch->ch_na->na_flags,
932 		    NAF_CHANNEL_EVENT_ATTACHED);
933 	}
934 	che_process_channel_event(ch, kn, kn->kn_sfflags, &hint);
935 	if ((kn->kn_sfflags & NOTE_FLOW_ADV_UPDATE) != 0) {
936 		/* on registration force an event */
937 		hint |= CHAN_FILT_HINT_FLOW_ADV_UPD;
938 	}
939 	SK_DF(SK_VERB_EVENTS, "na \"%s\" (0x%llx) ch 0x%llx kn 0x%llx (%s)",
940 	    ch->ch_na->na_name, SK_KVA(ch->ch_na), SK_KVA(ch), SK_KVA(kn),
941 	    "EVFILT_NW_CHANNEL");
942 	return filt_chan_extended_common(kn, hint);
943 }
944 
945 static void
946 filt_che_detach(struct knote *kn)
947 {
948 	struct kern_channel *ch = (struct kern_channel *)kn->kn_hook;
949 	struct ch_selinfo *csi;
950 
951 	ASSERT(kn->kn_filter == EVFILT_NW_CHANNEL);
952 
953 	lck_mtx_lock(&ch->ch_lock);
954 	if ((kn->kn_sfflags & NOTE_CHANNEL_EVENT) != 0) {
955 		atomic_bitclear_32(&ch->ch_na->na_flags,
956 		    NAF_CHANNEL_EVENT_ATTACHED);
957 	}
958 	csi = ch->ch_si[NR_TX];
959 	CSI_LOCK(csi);
960 	if (KNOTE_DETACH(&csi->csi_si.si_note, kn)) {
961 		atomic_bitclear_32(&csi->csi_flags, CSI_KNOTE);
962 	}
963 	CSI_UNLOCK(csi);
964 	lck_mtx_unlock(&ch->ch_lock);
965 
966 	SK_DF(SK_VERB_EVENTS, "na \"%s\" (0x%llx) ch 0x%llx kn 0x%llx (%s)",
967 	    ch->ch_na->na_name, SK_KVA(ch->ch_na), SK_KVA(ch), SK_KVA(kn),
968 	    "EVFILT_NW_CHANNEL");
969 }
970 
971 static int
972 filt_che_event(struct knote *kn, long hint)
973 {
974 	struct kern_channel *ch = (struct kern_channel *)kn->kn_hook;
975 
976 	ASSERT(kn->kn_filter == EVFILT_NW_CHANNEL);
977 	if (hint == 0) {
978 		return 0;
979 	}
980 	if (__improbable(ch_filt_check_defunct(ch, NULL))) {
981 		return 1;
982 	}
983 	if ((hint & CHAN_FILT_HINT_CHANNEL_EVENT) != 0) {
984 		VERIFY((ch->ch_flags & CHANF_EVENT_RING) != 0);
985 	}
986 	SK_DF(SK_VERB_EVENTS, "na \"%s\" (0x%llx) ch 0x%llx hint 0x%b)",
987 	    ch->ch_na->na_name, SK_KVA(ch->ch_na), SK_KVA(ch), hint,
988 	    CHAN_FILT_HINT_BITS);
989 	return filt_chan_extended_common(kn, hint);
990 }
991 
992 static int
993 filt_che_touch(struct knote *kn, struct kevent_qos_s *kev)
994 {
995 	int ret;
996 	long hint = 0;
997 	struct kern_channel *ch = (struct kern_channel *)kn->kn_hook;
998 
999 	ASSERT(kn->kn_filter == EVFILT_NW_CHANNEL);
1000 	/* save off the new input fflags and data */
1001 	kn->kn_sfflags = kev->fflags;
1002 	kn->kn_sdata = kev->data;
1003 
1004 	lck_mtx_lock(&ch->ch_lock);
1005 	if (__improbable(ch_filt_check_defunct(ch, kn))) {
1006 		ret = 1;
1007 		goto done;
1008 	}
1009 	if ((kn->kn_sfflags & NOTE_CHANNEL_EVENT) != 0) {
1010 		if (kev->flags & EV_ENABLE) {
1011 			atomic_bitset_32(&ch->ch_na->na_flags,
1012 			    NAF_CHANNEL_EVENT_ATTACHED);
1013 		} else if (kev->flags & EV_DISABLE) {
1014 			atomic_bitclear_32(&ch->ch_na->na_flags,
1015 			    NAF_CHANNEL_EVENT_ATTACHED);
1016 		}
1017 	}
1018 	che_process_channel_event(ch, kn, kn->kn_sfflags, &hint);
1019 	ret = filt_chan_extended_common(kn, hint);
1020 done:
1021 	lck_mtx_unlock(&ch->ch_lock);
1022 	return ret;
1023 }
1024 
1025 static int
1026 filt_che_process(struct knote *kn, struct kevent_qos_s *kev)
1027 {
1028 	int ret;
1029 	long hint = 0;
1030 	struct kern_channel *ch = kn->kn_hook;
1031 
1032 	ASSERT(kn->kn_filter == EVFILT_NW_CHANNEL);
1033 	lck_mtx_lock(&ch->ch_lock);
1034 	if (__improbable(ch_filt_check_defunct(ch, kn))) {
1035 		ret = 1;
1036 		goto done;
1037 	}
1038 	che_process_channel_event(ch, kn, kn->kn_sfflags, &hint);
1039 	ret = filt_chan_extended_common(kn, hint);
1040 done:
1041 	lck_mtx_unlock(&ch->ch_lock);
1042 	if (ret != 0) {
1043 		/*
1044 		 * This filter historically behaves like EV_CLEAR,
1045 		 * even when EV_CLEAR wasn't set.
1046 		 */
1047 		knote_fill_kevent(kn, kev, 0);
1048 		kn->kn_fflags = 0;
1049 	}
1050 	return ret;
1051 }
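/*
 * A corresponding sketch for the extended filter: userspace registers
 * for flow-advisory and channel-event notifications with
 * EVFILT_NW_CHANNEL, and the triggered bits come back in the returned
 * kevent's fflags (ch_fd and kq are assumed to exist; the NOTE_*
 * values map 1:1 to the CHAN_FILT_HINT_* bits, as asserted in
 * filt_che_attach() above):
 *
 *	struct kevent kev;
 *	EV_SET(&kev, ch_fd, EVFILT_NW_CHANNEL, EV_ADD | EV_ENABLE,
 *	    NOTE_FLOW_ADV_UPDATE | NOTE_CHANNEL_EVENT, 0, NULL);
 *	(void) kevent(kq, &kev, 1, NULL, 0, NULL);
 */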
1052 
1053 int
1054 ch_kqfilter(struct kern_channel *ch, struct knote *kn,
1055     struct kevent_qos_s *kev)
1056 {
1057 	int result;
1058 
1059 	lck_mtx_lock(&ch->ch_lock);
1060 	VERIFY(!(ch->ch_flags & CHANF_KERNEL));
1061 
1062 	if (__improbable(ch->ch_na == NULL || !NA_IS_ACTIVE(ch->ch_na) ||
1063 	    na_reject_channel(ch, ch->ch_na))) {
1064 		SK_ERR("%s(%d): channel is non-permissive, flags 0x%b", ch->ch_name,
1065 		    ch->ch_pid, ch->ch_flags, CHANF_BITS);
1066 		knote_set_error(kn, ENXIO);
1067 		lck_mtx_unlock(&ch->ch_lock);
1068 		return 0;
1069 	}
1070 
1071 	switch (kn->kn_filter) {
1072 	case EVFILT_READ:
1073 		kn->kn_filtid = EVFILTID_SKYWALK_CHANNEL_R;
1074 		break;
1075 
1076 	case EVFILT_WRITE:
1077 		kn->kn_filtid = EVFILTID_SKYWALK_CHANNEL_W;
1078 		break;
1079 
1080 	case EVFILT_NW_CHANNEL:
1081 		kn->kn_filtid = EVFILTID_SKYWALK_CHANNEL_E;
1082 		break;
1083 
1084 	default:
1085 		lck_mtx_unlock(&ch->ch_lock);
1086 		SK_ERR("%s(%d): bad filter request %d", ch->ch_name,
1087 		    ch->ch_pid, kn->kn_filter);
1088 		knote_set_error(kn, EINVAL);
1089 		return 0;
1090 	}
1091 
1092 	kn->kn_hook = ch;
1093 	/* call the appropriate sub-filter attach with the channel lock held */
1094 	result = knote_fops(kn)->f_attach(kn, kev);
1095 	lck_mtx_unlock(&ch->ch_lock);
1096 	return result;
1097 }
1098 
1099 boolean_t
1100 ch_is_multiplex(struct kern_channel *ch, enum txrx t)
1101 {
1102 	return ch->ch_na != NULL && (ch->ch_last[t] - ch->ch_first[t] > 1);
1103 }
1104 
1105 int
1106 ch_select(struct kern_channel *ch, int events, void *wql, struct proc *p)
1107 {
1108 	int revents;
1109 	int event_error = 0;
1110 
1111 	lck_mtx_lock(&ch->ch_lock);
1112 	revents = ch_event(ch, events, wql, p, NULL, FALSE, &event_error,
1113 	    FALSE);
1114 	lck_mtx_unlock(&ch->ch_lock);
1115 
1116 	ASSERT((revents & POLLERR) == 0 || event_error != 0);
1117 
1118 	return revents;
1119 }
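/*
 * Channels can also be waited on with plain select(2); ch_select()
 * above drives ch_event() for that path.  A minimal sketch (ch_fd is
 * an assumed channel descriptor); a positive return with ch_fd set in
 * rfds means at least one bound RX ring is ready:
 *
 *	#include <sys/select.h>
 *
 *	fd_set rfds;
 *	FD_ZERO(&rfds);
 *	FD_SET(ch_fd, &rfds);
 *	(void) select(ch_fd + 1, &rfds, NULL, NULL, NULL);
 */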
1120 
1121 #if SK_LOG
1122 /* Hoisted out of line to reduce kernel stack footprint */
1123 SK_LOG_ATTRIBUTE
1124 static void
1125 ch_event_log(const char *prefix, const struct kern_channel *ch,
1126     struct proc *p, const struct nexus_adapter *na,
1127     int events, int revents)
1128 {
1129 	SK_DF(SK_VERB_EVENTS, "%s: na \"%s\" (0x%llx) ch 0x%llx %s(%d) "
1130 	    "th 0x%llx ev 0x%x rev 0x%x", prefix, na->na_name, SK_KVA(na),
1131 	    SK_KVA(ch), sk_proc_name_address(p), sk_proc_pid(p),
1132 	    SK_KVA(current_thread()), events, revents);
1133 }
1134 #endif /* SK_LOG */
1135 
1136 /*
1137  * select(2), poll(2) and kevent(2) handlers for channels.
1138  *
1139  * Can be called for one or more rings.  Returns the event mask
1140  * corresponding to ready events.  If there are no ready events, do
1141  * a selrecord on either the individual selinfo or on the global one.
1142  * Device-dependent parts (locking and sync of tx/rx rings)
1143  * are done through callbacks.
1144  */
1145 static int
1146 ch_event(struct kern_channel *ch, int events, void *wql,
1147     struct proc *p, struct ch_event_result *result,
1148     const boolean_t is_kevent, int *errno, const boolean_t is_ch_event)
1149 {
1150 	struct nexus_adapter *na;
1151 	struct __kern_channel_ring *kring;
1152 	uint32_t i, check_all_tx, check_all_rx, want[NR_TXRX], revents = 0;
1153 	uint32_t ready_tx_data = 0, ready_rx_data = 0;
1154 	sk_protect_t protect = NULL;
1155 
1156 #define want_tx want[NR_TX]
1157 #define want_rx want[NR_RX]
1158 	/*
1159 	 * In order to avoid nested locks, we need to "double check"
1160 	 * txsync and rxsync if we decide to do a selrecord().
1161 	 * retry_tx (and retry_rx, later) prevent looping forever.
1162 	 */
1163 	boolean_t retry_tx = TRUE, retry_rx = TRUE;
1164 	int found, error = 0;
1165 	int s;
1166 
1167 	net_update_uptime();
1168 
1169 	LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
1170 	ASSERT(!(ch->ch_flags & CHANF_KERNEL));
1171 
1172 	*errno = 0;
1173 
1174 	if (__improbable((ch->ch_flags & CHANF_DEFUNCT) ||
1175 	    ch->ch_schema == NULL)) {
1176 		SK_ERR("%s(%d): channel is defunct or no longer bound",
1177 		    ch->ch_name, ch->ch_pid);
1178 		revents = POLLERR;
1179 		*errno = ENXIO;
1180 		goto done;
1181 	}
1182 
1183 	/* clear CHANF_DEFUNCT_SKIP if it was set during defunct last time */
1184 	if (__improbable(ch->ch_flags & CHANF_DEFUNCT_SKIP)) {
1185 		atomic_bitclear_32(&ch->ch_flags, CHANF_DEFUNCT_SKIP);
1186 	}
1187 
1188 	na = ch->ch_na;
1189 	if (__improbable(na == NULL ||
1190 	    !NA_IS_ACTIVE(na) || na_reject_channel(ch, na))) {
1191 		SK_ERR("%s(%d): channel is non-permissive",
1192 		    ch->ch_name, ch->ch_pid);
1193 		revents = POLLERR;
1194 		*errno = ENXIO;
1195 		goto done;
1196 	}
1197 
1198 	/* mark thread with sync-in-progress flag */
1199 	protect = sk_sync_protect();
1200 
1201 	/* update our work timestamp */
1202 	na->na_work_ts = _net_uptime;
1203 
1204 	/* and make this channel eligible for draining again */
1205 	if (na->na_flags & NAF_DRAINING) {
1206 		atomic_bitclear_32(&na->na_flags, NAF_DRAINING);
1207 	}
1208 
1209 #if SK_LOG
1210 	if (__improbable((sk_verbose & SK_VERB_EVENTS) != 0)) {
1211 		ch_event_log("enter", ch, p, na, events, revents);
1212 	}
1213 #endif
1214 	if (is_ch_event) {
1215 		goto process_channel_event;
1216 	}
1217 
1218 	want_tx = (events & (POLLOUT | POLLWRNORM));
1219 	want_rx = (events & (POLLIN | POLLRDNORM));
1220 
1221 	/*
1222 	 * check_all_{tx|rx} are set if the channel has more than one ring
1223 	 * AND the file descriptor is bound to all of them.  If so, we sleep
1224 	 * on the "global" selinfo, otherwise we sleep on the individual selinfo.
1225 	 * The interrupt routine in the driver wakes one or the other (or both)
1226 	 * depending on which clients are active.
1227 	 *
1228 	 * rxsync() is only called if we run out of buffers on a POLLIN.
1229 	 * txsync() is called if we run out of buffers on POLLOUT.
1230 	 */
1231 	check_all_tx = ch_is_multiplex(ch, NR_TX);
1232 	check_all_rx = ch_is_multiplex(ch, NR_RX);
1233 
1234 	/*
1235 	 * If want_tx is still set, we must issue txsync calls
1236 	 * (on all rings, to keep the tx rings from stalling).
1237 	 * XXX should also check head != khead on the tx rings.
1238 	 */
1239 	if (want_tx) {
1240 		ring_id_t first_tx = ch->ch_first[NR_TX];
1241 		ring_id_t last_tx = ch->ch_last[NR_TX];
1242 
1243 		channel_threshold_unit_t tx_unit =
1244 		    ch->ch_info->cinfo_tx_lowat.cet_unit;
1245 
1246 		/*
1247 		 * The first round checks if anyone is ready; if not,
1248 		 * do a selrecord and another round to handle races.
1249 		 * want_tx goes to 0 if any space is found, and is
1250 		 * used to skip rings with no pending transmissions.
1251 		 */
1252 flush_tx:
1253 		for (i = first_tx, ready_tx_data = 0; i < last_tx; i++) {
1254 			kring = &na->na_tx_rings[i];
1255 			if (!want_tx &&
1256 			    kring->ckr_ring->ring_head == kring->ckr_khead) {
1257 				continue;
1258 			}
1259 
1260 			/* only one thread does txsync */
1261 			s = kr_enter(kring, TRUE);
1262 			ASSERT(s == 0);
1263 
1264 			error = 0;
1265 			DTRACE_SKYWALK2(pretxprologue, struct kern_channel *,
1266 			    ch, struct __kern_channel_ring *, kring);
1267 			if (kr_txsync_prologue(ch, kring, p) >=
1268 			    kring->ckr_num_slots) {
1269 				kr_log_bad_ring(kring);
1270 				revents |= POLLERR;
1271 				error = EFAULT;
1272 				if (*errno == 0) {
1273 					*errno = EFAULT;
1274 				}
1275 			} else {
1276 				if (kring->ckr_na_sync(kring, p, 0)) {
1277 					revents |= POLLERR;
1278 					error = EIO;
1279 					if (*errno == 0) {
1280 						*errno = EIO;
1281 					}
1282 				} else {
1283 					kr_txsync_finalize(ch, kring, p);
1284 				}
1285 			}
1286 			DTRACE_SKYWALK3(posttxfinalize, struct kern_channel *,
1287 			    ch, struct __kern_channel_ring *, kring, int,
1288 			    error);
1289 
1290 			/*
1291 			 * If we found new slots, notify potential listeners on
1292 			 * the same ring. Since we just did a txsync, look at
1293 			 * the copies of cur,tail in the kring.
1294 			 */
1295 			found = kring->ckr_rhead != kring->ckr_rtail;
1296 			kr_exit(kring);
1297 			if (found) { /* notify other listeners */
1298 				revents |= want_tx;
1299 				want_tx = 0;
1300 				(void) kring->ckr_na_notify(kring, p,
1301 				    (is_kevent ? NA_NOTEF_IN_KEVENT : 0));
1302 			}
1303 
1304 			/*
1305 			 * Add this ring's free data to our running
1306 			 * tally for userspace.
1307 			 */
1308 			if (result != NULL) {
1309 				switch (tx_unit) {
1310 				case CHANNEL_THRESHOLD_UNIT_BYTES:
1311 					ready_tx_data += kring->ckr_ready_bytes;
1312 					break;
1313 				case CHANNEL_THRESHOLD_UNIT_SLOTS:
1314 					ready_tx_data += kring->ckr_ready_slots;
1315 					break;
1316 				}
1317 			}
1318 		}
1319 		if (want_tx && retry_tx && !is_kevent) {
1320 			if (check_all_tx) {
1321 				csi_selrecord_all(na, NR_TX, p, wql);
1322 			} else {
1323 				csi_selrecord_one(&na->na_tx_rings[first_tx],
1324 				    p, wql);
1325 			}
1326 			retry_tx = FALSE;
1327 			goto flush_tx;
1328 		}
1329 	}
1330 
1331 	/*
1332 	 * If want_rx is still set, scan the receive rings.
1333 	 * Do it on all rings because otherwise we starve.
1334 	 */
1335 	if (want_rx) {
1336 		ring_id_t first_rx = ch->ch_first[NR_RX];
1337 		ring_id_t last_rx = ch->ch_last[NR_RX];
1338 		channel_threshold_unit_t rx_unit =
1339 		    ch->ch_info->cinfo_rx_lowat.cet_unit;
1340 
1341 		/* two rounds here for race avoidance */
1342 do_retry_rx:
1343 		for (i = first_rx, ready_rx_data = 0; i < last_rx; i++) {
1344 			kring = &na->na_rx_rings[i];
1345 
1346 			/* only one thread does rxsync */
1347 			s = kr_enter(kring, TRUE);
1348 			ASSERT(s == 0);
1349 
1350 			error = 0;
1351 			DTRACE_SKYWALK2(prerxprologue, struct kern_channel *,
1352 			    ch, struct __kern_channel_ring *, kring);
1353 			if (kr_rxsync_prologue(ch, kring, p) >=
1354 			    kring->ckr_num_slots) {
1355 				kr_log_bad_ring(kring);
1356 				revents |= POLLERR;
1357 				error = EFAULT;
1358 				if (*errno == 0) {
1359 					*errno = EFAULT;
1360 				}
1361 			} else {
1362 				/* now we can use kring->rhead, rtail */
1363 				if (kring->ckr_na_sync(kring, p, 0)) {
1364 					revents |= POLLERR;
1365 					error = EIO;
1366 					if (*errno == 0) {
1367 						*errno = EIO;
1368 					}
1369 				} else {
1370 					kr_rxsync_finalize(ch, kring, p);
1371 				}
1372 			}
1373 
1374 			DTRACE_SKYWALK3(postrxfinalize, struct kern_channel *,
1375 			    ch, struct __kern_channel_ring *, kring, int,
1376 			    error);
1377 
1378 			found = kring->ckr_rhead != kring->ckr_rtail;
1379 			kr_exit(kring);
1380 			if (found) {
1381 				revents |= want_rx;
1382 				retry_rx = FALSE;
1383 				(void) kring->ckr_na_notify(kring, p,
1384 				    (is_kevent ? NA_NOTEF_IN_KEVENT : 0));
1385 			}
1386 
1387 			/*
1388 			 * Add this ring's readable data to our running
1389 			 * tally for userspace.
1390 			 */
1391 			if (result != NULL) {
1392 				switch (rx_unit) {
1393 				case CHANNEL_THRESHOLD_UNIT_BYTES:
1394 					ready_rx_data += kring->ckr_ready_bytes;
1395 					break;
1396 				case CHANNEL_THRESHOLD_UNIT_SLOTS:
1397 					ready_rx_data += kring->ckr_ready_slots;
1398 					break;
1399 				}
1400 			}
1401 		}
1402 
1403 		if (retry_rx && !is_kevent) {
1404 			if (check_all_rx) {
1405 				csi_selrecord_all(na, NR_RX, p, wql);
1406 			} else {
1407 				csi_selrecord_one(&na->na_rx_rings[first_rx],
1408 				    p, wql);
1409 			}
1410 		}
1411 		if (retry_rx) {
1412 			retry_rx = FALSE;
1413 			goto do_retry_rx;
1414 		}
1415 	}
1416 
1417 	if (result != NULL) {
1418 		result->tx_data = ready_tx_data;
1419 		result->rx_data = ready_rx_data;
1420 	}
1421 	goto skip_channel_event;
1422 
1423 process_channel_event:
1424 	/*
1425 	 * Perform a sync operation on the event ring to make the channel
1426 	 * events enqueued in the ring visible to user-space.
1427 	 */
1428 
1429 	/* select() and poll() not supported for event ring */
1430 	ASSERT(is_kevent);
1431 	VERIFY((ch->ch_last[NR_EV] - ch->ch_first[NR_EV]) == 1);
1432 	kring = &na->na_event_rings[ch->ch_first[NR_EV]];
1433 
1434 	/* only one thread does the sync */
1435 	s = kr_enter(kring, TRUE);
1436 	ASSERT(s == 0);
1437 	if (kr_event_sync_prologue(kring, p) >= kring->ckr_num_slots) {
1438 		kr_log_bad_ring(kring);
1439 		revents |= POLLERR;
1440 		if (*errno == 0) {
1441 			*errno = EFAULT;
1442 		}
1443 	} else {
1444 		if (kring->ckr_na_sync(kring, p, 0)) {
1445 			revents |= POLLERR;
1446 			if (*errno == 0) {
1447 				*errno = EIO;
1448 			}
1449 		} else {
1450 			kr_event_sync_finalize(ch, kring, p);
1451 		}
1452 	}
1453 	found = (kring->ckr_rhead != kring->ckr_rtail);
1454 	kr_exit(kring);
1455 	if (found) {
1456 		revents |= (events & POLLIN);
1457 	}
1458 
1459 skip_channel_event:
1460 #if SK_LOG
1461 	if (__improbable((sk_verbose & SK_VERB_EVENTS) != 0)) {
1462 		ch_event_log("exit", ch, p, na, events, revents);
1463 	}
1464 #endif /* SK_LOG */
1465 
1466 	/* unmark thread with sync-in-progress flag */
1467 	sk_sync_unprotect(protect);
1468 
1469 done:
1470 	ASSERT(!sk_is_sync_protected());
1471 
1472 	return revents;
1473 #undef want_tx
1474 #undef want_rx
1475 }
1476 
1477 static struct kern_channel *
1478 ch_find(struct kern_nexus *nx, nexus_port_t port, ring_id_t ring_id)
1479 {
1480 	struct kern_channel *ch;
1481 
1482 	SK_LOCK_ASSERT_HELD();
1483 
1484 	STAILQ_FOREACH(ch, &nx->nx_ch_head, ch_link) {
1485 		struct ch_info *cinfo = ch->ch_info;
1486 
1487 		/* see comments in ch_open() */
1488 		if (cinfo->cinfo_nx_port != port) {
1489 			continue;
1490 		} else if (cinfo->cinfo_ch_mode & CHMODE_MONITOR) {
1491 			continue;
1492 		} else if (cinfo->cinfo_ch_ring_id != CHANNEL_RING_ID_ANY &&
1493 		    ring_id != cinfo->cinfo_ch_ring_id &&
1494 		    ring_id != CHANNEL_RING_ID_ANY) {
1495 			continue;
1496 		}
1497 
1498 		/* found a match */
1499 		break;
1500 	}
1501 
1502 	if (ch != NULL) {
1503 		ch_retain_locked(ch);
1504 	}
1505 
1506 	return ch;
1507 }
1508 
1509 #if SK_LOG
1510 /* Hoisted out of line to reduce kernel stack footprint */
1511 SK_LOG_ATTRIBUTE
1512 static void
1513 ch_open_log1(const uuid_t p_uuid, struct proc *p, nexus_port_t port)
1514 {
1515 	uuid_string_t uuidstr;
1516 
1517 	SK_D("%s(%d) uniqueid %llu exec_uuid %s port %u",
1518 	    sk_proc_name_address(p), sk_proc_pid(p), proc_uniqueid(p),
1519 	    sk_uuid_unparse(p_uuid, uuidstr), port);
1520 }
1521 
1522 SK_LOG_ATTRIBUTE
1523 static void
1524 ch_open_log2(struct proc *p, nexus_port_t port, ring_id_t ring,
1525     uint32_t mode, const char *mode_bits, int err)
1526 {
1527 	SK_D("%s(%d) port %u ring %d mode 0x%b err %d",
1528 	    sk_proc_name_address(p), sk_proc_pid(p), port, (int)ring,
1529 	    mode, mode_bits, err);
1530 }
1531 #endif /* SK_LOG */
1532 
1533 struct kern_channel *
1534 ch_open(struct ch_init *init, struct proc *p, int fd, int *err)
1535 {
1536 	uint32_t mode = init->ci_ch_mode;
1537 	nexus_port_t port = init->ci_nx_port;
1538 	ring_id_t ring = init->ci_ch_ring_id;
1539 	struct kern_channel *ch = NULL, *ch0 = NULL;
1540 	struct nxbind *nxb = NULL;
1541 	struct kern_nexus *nx;
1542 	struct chreq chr;
1543 	uuid_t p_uuid;
1544 	kauth_cred_t cred;
1545 
1546 	cred = kauth_cred_get();
1547 	ASSERT(!uuid_is_null(init->ci_nx_uuid));
1548 	proc_getexecutableuuid(p, p_uuid, sizeof(p_uuid));
1549 	*err = 0;
1550 
1551 	/* make sure we don't allow userland to set kernel-only flags */
1552 	mode &= CHMODE_MASK;
1553 
1554 	SK_LOCK();
1555 
1556 	nx = nx_find(init->ci_nx_uuid, TRUE);
1557 	if (nx == NULL) {
1558 		*err = ENOENT;
1559 		goto done;
1560 	}
1561 
1562 	/* port (zero-based) must be within the domain's range */
1563 	if (port >= NXDOM_MAX(NX_DOM(nx), ports)) {
1564 		*err = EDOM;
1565 		goto done;
1566 	}
1567 	VERIFY(port != NEXUS_PORT_ANY);
1568 
1569 	if (mode & CHMODE_LOW_LATENCY) {
1570 		if ((*err = skywalk_priv_check_cred(p, cred,
1571 		    PRIV_SKYWALK_LOW_LATENCY_CHANNEL)) != 0) {
1572 			goto done;
1573 		}
1574 	}
1575 
1576 	/* "no copy" is valid only when at least one tx/rx mon flag is set */
1577 	if (!(mode & CHMODE_MONITOR) && (mode & CHMODE_MONITOR_NO_COPY)) {
1578 		mode &= ~CHMODE_MONITOR_NO_COPY;
1579 	}
1580 
1581 	if (mode & CHMODE_MONITOR) {
1582 		if ((*err = skywalk_priv_check_cred(p, cred,
1583 		    PRIV_SKYWALK_OBSERVE_ALL)) != 0) {
1584 			goto done;
1585 		}
1586 		/* Don't allow non-root processes to monitor channels. */
1587 		if (kauth_cred_issuser(cred) == 0) {
1588 			*err = EPERM;
1589 			goto done;
1590 		}
1591 	}
1592 
1593 	/*
1594 	 * Check with the nexus to see if the port is bound; if so, prepare
1595 	 * our nxbind structure that we'll need to pass down to the nexus
1596 	 * for it to compare.  If the caller provides a key, we take it over
1597 	 * and will free it ourselves (as part of freeing nxbind.)
1598 	 *
1599 	 * If this is a monitor channel, skip this altogether since the check
1600 	 * for PRIV_SKYWALK_OBSERVE_ALL privilege has been done above.
1601 	 */
1602 	if (!(mode & CHMODE_MONITOR) && !NX_ANONYMOUS_PROV(nx)) {
1603 		void *key = (void *)(init->ci_key);
1604 
1605 #if SK_LOG
1606 		if (__improbable(sk_verbose != 0)) {
1607 			ch_open_log1(p_uuid, p, port);
1608 		}
1609 #endif /* SK_LOG */
1610 
1611 		nxb = nxb_alloc(Z_WAITOK);
1612 		nxb->nxb_flags |= NXBF_MATCH_UNIQUEID;
1613 		nxb->nxb_uniqueid = proc_uniqueid(p);
1614 		nxb->nxb_pid = proc_pid(p);
1615 		nxb->nxb_flags |= NXBF_MATCH_EXEC_UUID;
1616 		uuid_copy(nxb->nxb_exec_uuid, p_uuid);
1617 		if (key != NULL) {
1618 			nxb->nxb_flags |= NXBF_MATCH_KEY;
1619 			nxb->nxb_key_len = init->ci_key_len;
1620 			nxb->nxb_key = key;
1621 			init->ci_key = USER_ADDR_NULL;  /* take over */
1622 		}
1623 	}
1624 
1625 	/*
1626 	 * There can only be one owner of {port,ring_id} tuple.  Once
1627 	 * owned, this can be made available among multiple monitors.
1628 	 * CHANNEL_RING_ID_ANY (-1) ring_id gives exclusive rights over
1629 	 * all rings.  Further attempts to own any or all of the rings
1630 	 * will be declined.
1631 	 *
1632 	 * Multiple monitors are allowed to exist.  If a channel has been
1633 	 * bound to CHANNEL_RING_ID_ANY, any or all of its rings can be
1634 	 * monitored.  If an owning channel has been bound to an individual
1635 	 * ring, only that ring can be monitored, either by specifying the
1636 	 * equivalent ring_id or CHANNEL_RING_ID_ANY at monitor open time.
1637 	 *
1638 	 * For example, assuming a 2-rings setup for port 'p':
1639 	 *
1640 	 * owner{p,-1}
1641 	 *      will allow:
1642 	 *              monitor{p,-1}, monitor{p,0}, monitor{p,1}
1643 	 *      will not allow:
1644 	 *              owner{p,-1}, owner{p,0}, owner{p,1}
1645 	 *
1646 	 * owner{p,0}
1647 	 *      will allow:
1648 	 *		owner{p,1}, monitor{p,-1}, monitor{p,0}
1649 	 *	will not allow:
1650 	 *		owner{p,-1}, owner{p,0}, monitor{p,1}
1651 	 */
1652 	if ((ch0 = ch_find(nx, port, ring)) != NULL) {
1653 		SK_D("found ch0 0x%llx", SK_KVA(ch0));
1654 		/*
1655 		 * Unless this is a monitor channel, allow only at
1656 		 * most one owner of the {port,ring_id} tuple.
1657 		 */
1658 		if (!(mode & CHMODE_MONITOR)) {
1659 #if SK_LOG
1660 			uuid_string_t uuidstr;
1661 			char *na_name = (ch0->ch_na != NULL) ?
1662 			    ch0->ch_na->na_name : "";
1663 
1664 			SK_DSC(p, "ch %s flags (0x%x) exists on port %d on "
1665 			    "nx %s, owner %s(%d)", na_name, ch0->ch_flags, port,
1666 			    sk_uuid_unparse(nx->nx_uuid, uuidstr),
1667 			    ch0->ch_name, ch0->ch_pid);
1668 #endif /* SK_LOG */
1669 			*err = EBUSY;
1670 			goto done;
1671 		}
1672 	} else if (mode & CHMODE_MONITOR) {
1673 		*err = ENXIO;
1674 		goto done;
1675 	}
1676 
1677 	bzero(&chr, sizeof(chr));
1678 	chr.cr_tx_lowat = init->ci_tx_lowat;
1679 	chr.cr_rx_lowat = init->ci_rx_lowat;
1680 	chr.cr_port = port;
1681 	chr.cr_mode = mode;
1682 	chr.cr_ring_id = ring;
1683 
1684 	/* upon success, returns a channel with reference held */
1685 	ch = ch_connect(nx, &chr, ch0, nxb, p, fd, err);
1686 
1687 done:
1688 
1689 #if SK_LOG
1690 	if (__improbable(sk_verbose != 0)) {
1691 		ch_open_log2(p, port, ring, mode, CHMODE_BITS, *err);
1692 	}
1693 #endif /* SK_LOG */
1694 
1695 	if (ch0 != NULL) {
1696 		(void) ch_release_locked(ch0);
1697 	}
1698 
1699 	if (nx != NULL) {
1700 		(void) nx_release_locked(nx);
1701 	}
1702 
1703 	if (nxb != NULL) {
1704 		nxb_free(nxb);
1705 	}
1706 
1707 	SK_UNLOCK();
1708 
1709 	return ch;
1710 }
1711 
1712 struct kern_channel *
1713 ch_open_special(struct kern_nexus *nx, struct chreq *chr, boolean_t nonxref,
1714     int *err)
1715 {
1716 	struct kern_channel *ch = NULL;
1717 
1718 	SK_LOCK_ASSERT_HELD();
1719 	*err = 0;
1720 
1721 	ASSERT((chr->cr_mode & CHMODE_USER_PACKET_POOL) == 0);
1722 	ASSERT((chr->cr_mode & CHMODE_EVENT_RING) == 0);
1723 	ASSERT((chr->cr_mode & CHMODE_LOW_LATENCY) == 0);
1724 	ASSERT(!uuid_is_null(chr->cr_spec_uuid));
1725 	chr->cr_mode |= CHMODE_KERNEL;
1726 	if (nonxref) {
1727 		chr->cr_mode |= CHMODE_NO_NXREF;
1728 	} else {
1729 		chr->cr_mode &= ~CHMODE_NO_NXREF;
1730 	}
1731 
1732 	/* upon success, returns a channel with reference held */
1733 	ch = ch_connect(nx, chr, NULL, NULL, kernproc, -1, err);
1734 	if (ch != NULL) {
1735 		/*
1736 		 * nonxref channels don't hold any reference to the nexus,
1737 		 * since otherwise we'll never be able to close them when
1738 		 * the last regular channel of the nexus is closed, as part
1739 		 * of the nexus's destructor operation.  Release the nonxref
1740 		 * channel reference now, but make sure the nexus has at
1741 		 * least 3 refs: global list, provider list and the nonxref
1742 		 * channel itself, before doing that.
1743 		 */
1744 		if (nonxref) {
1745 			ASSERT(ch->ch_flags & (CHANF_KERNEL | CHANF_NONXREF));
1746 			ASSERT(nx->nx_refcnt > 3);
1747 			(void) nx_release_locked(nx);
1748 		}
1749 	}
1750 
1751 #if SK_LOG
1752 	uuid_string_t uuidstr;
1753 	SK_D("nx 0x%llx (%s:\"%s\":%d:%d) spec_uuid \"%s\" mode 0x%b err %d",
1754 	    SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name, (ch != NULL ?
1755 	    ch->ch_na->na_name : ""), (int)chr->cr_port, (int)chr->cr_ring_id,
1756 	    sk_uuid_unparse(chr->cr_spec_uuid, uuidstr), chr->cr_mode,
1757 	    CHMODE_BITS, *err);
1758 #endif /* SK_LOG */
1759 
1760 	return ch;
1761 }
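/*
 * Illustrative sketch (not from this file): a kernel subsystem opening a
 * special channel would fill out a chreq and call ch_open_special() with
 * sk_lock held.  The port and UUID below are hypothetical placeholders.
 *
 *	struct chreq chr;
 *	struct kern_channel *ch;
 *	int error;
 *
 *	bzero(&chr, sizeof(chr));
 *	chr.cr_port = my_port;                  // hypothetical nexus port
 *	chr.cr_ring_id = CHANNEL_RING_ID_ANY;
 *	uuid_copy(chr.cr_spec_uuid, my_uuid);   // hypothetical UUID
 *	SK_LOCK();
 *	ch = ch_open_special(nx, &chr, FALSE, &error);
 *	SK_UNLOCK();
 */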
1762 
1763 static void
1764 ch_close_common(struct kern_channel *ch, boolean_t locked, boolean_t special)
1765 {
1766 #pragma unused(special)
1767 #if SK_LOG
1768 	uuid_string_t uuidstr;
1769 	const char *na_name = (ch->ch_na != NULL) ?
1770 	    ch->ch_na->na_name : "";
1771 	const char *nxdom_name = (ch->ch_nexus != NULL) ?
1772 	    NX_DOM(ch->ch_nexus)->nxdom_name : "";
1773 	const char *nxdom_prov_name = (ch->ch_nexus != NULL) ?
1774 	    NX_DOM_PROV(ch->ch_nexus)->nxdom_prov_name : "";
1775 
1776 	SK_D("ch 0x%llx (%s:%s:\"%s\":%u:%d)",
1777 	    SK_KVA(ch), nxdom_name, nxdom_prov_name, na_name,
1778 	    ch->ch_info->cinfo_nx_port, (int)ch->ch_info->cinfo_ch_ring_id);
1779 	SK_D("  UUID:    %s", sk_uuid_unparse(ch->ch_info->cinfo_ch_id,
1780 	    uuidstr));
1781 	SK_D("  flags:   0x%b", ch->ch_flags, CHANF_BITS);
1782 #endif /* SK_LOG */
1783 	struct kern_nexus *nx = ch->ch_nexus;
1784 
1785 	if (!locked) {
1786 		SK_LOCK();
1787 	}
1788 
1789 	SK_LOCK_ASSERT_HELD();
1790 	/*
1791 	 * If the channel is participating in the interface advisory
1792 	 * notification, remove it from the nexus.
1793 	 * CHANF_IF_ADV is set and cleared only when nx_ch_if_adv_lock
1794 	 * is held in exclusive mode.
1795 	 */
1796 	lck_rw_lock_exclusive(&nx->nx_ch_if_adv_lock);
1797 	if ((ch->ch_flags & CHANF_IF_ADV) != 0) {
1798 		STAILQ_REMOVE(&nx->nx_ch_if_adv_head, ch,
1799 		    kern_channel, ch_link_if_adv);
1800 		atomic_bitclear_32(&ch->ch_flags, CHANF_IF_ADV);
1801 		if (STAILQ_EMPTY(&nx->nx_ch_if_adv_head)) {
1802 			nx_netif_config_interface_advisory(nx, false);
1803 		}
1804 		lck_rw_done(&nx->nx_ch_if_adv_lock);
1805 		lck_mtx_lock(&ch->ch_lock);
1806 		(void) ch_release_locked(ch);
1807 	} else {
1808 		lck_rw_done(&nx->nx_ch_if_adv_lock);
1809 		lck_mtx_lock(&ch->ch_lock);
1810 	}
1811 	LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
1812 	/*
1813 	 * Mark the channel as closing to prevent further setopt requests;
1814 	 * this flag is set once here and never gets cleared.
1815 	 */
1816 	ASSERT(!(ch->ch_flags & CHANF_CLOSING));
1817 	atomic_bitset_32(&ch->ch_flags, CHANF_CLOSING);
1818 
1819 	if (special) {
1820 		VERIFY(ch->ch_flags & CHANF_KERNEL);
1821 	} else {
1822 		VERIFY(!(ch->ch_flags & CHANF_KERNEL));
1823 	}
1824 
1825 	ch->ch_fd = -1;
1826 
1827 	/* may be called as part of failure cleanup, so check */
1828 	if (ch->ch_flags & CHANF_ATTACHED) {
1829 		boolean_t nonxref = !!(ch->ch_flags & CHANF_NONXREF);
1830 
1831 		/* caller must hold an extra ref */
1832 		ASSERT(ch->ch_refcnt > 1);
1833 
1834 		/* disconnect from nexus */
1835 		ch_disconnect(ch);
1836 
1837 		/*
1838 		 * If this was the last regular channel and the nexus
1839 		 * has been closed, detach it and finish up the job.
1840 		 * If this was a nonxref channel, there is nothing
1841 		 * left to do; see comments in ch_open_special().
1842 		 */
1843 		if (!nonxref) {
1844 			STAILQ_REMOVE(&nx->nx_ch_head, ch,
1845 			    kern_channel, ch_link);
1846 			nx->nx_ch_count--;
1847 			if (STAILQ_EMPTY(&nx->nx_ch_head) &&
1848 			    (nx->nx_flags & NXF_CLOSED)) {
1849 				ASSERT(STAILQ_EMPTY(&nx->nx_ch_if_adv_head));
1850 				nx_detach(nx);
1851 			}
1852 			(void) nx_release_locked(nx);
1853 		} else {
1854 			ASSERT(ch->ch_flags & CHANF_KERNEL);
1855 			STAILQ_REMOVE(&nx->nx_ch_nonxref_head, ch,
1856 			    kern_channel, ch_link);
1857 		}
1858 
1859 		atomic_bitclear_32(&ch->ch_flags, CHANF_ATTACHED);
1860 		ch->ch_nexus = NULL;
1861 
1862 		(void) ch_release_locked(ch);   /* for the list */
1863 	}
1864 
1865 	lck_mtx_unlock(&ch->ch_lock);
1866 	if (!locked) {
1867 		SK_UNLOCK();
1868 	}
1869 }
1870 
1871 void
1872 ch_close(struct kern_channel *ch, boolean_t locked)
1873 {
1874 	ch_close_common(ch, locked, FALSE);
1875 }
1876 
1877 void
1878 ch_close_special(struct kern_channel *ch)
1879 {
1880 	ch_close_common(ch, TRUE, TRUE);
1881 }
1882 
1883 static int
1884 ch_ev_thresh_validate(struct kern_nexus *nx, enum txrx t,
1885     struct ch_ev_thresh *cet)
1886 {
1887 	struct nxprov_params *nxp = NX_PROV(nx)->nxprov_params;
1888 	uint32_t bmin, bmax, smin, smax;
1889 	int err = 0;
1890 
1891 	if (cet->cet_unit != CHANNEL_THRESHOLD_UNIT_BYTES &&
1892 	    cet->cet_unit != CHANNEL_THRESHOLD_UNIT_SLOTS) {
1893 		err = EINVAL;
1894 		goto done;
1895 	}
1896 
1897 	smin = 1;       /* minimum 1 slot */
1898 	bmin = 1;       /* minimum 1 byte */
1899 
1900 	if (t == NR_TX) {
1901 		ASSERT(nxp->nxp_tx_slots > 0);
1902 		smax = (nxp->nxp_tx_slots - 1);
1903 	} else {
1904 		ASSERT(nxp->nxp_rx_slots > 0);
1905 		smax = (nxp->nxp_rx_slots - 1);
1906 	}
1907 	bmax = (smax * nxp->nxp_buf_size);
1908 
1909 	switch (cet->cet_unit) {
1910 	case CHANNEL_THRESHOLD_UNIT_BYTES:
1911 		if (cet->cet_value < bmin) {
1912 			cet->cet_value = bmin;
1913 		} else if (cet->cet_value > bmax) {
1914 			cet->cet_value = bmax;
1915 		}
1916 		break;
1917 
1918 	case CHANNEL_THRESHOLD_UNIT_SLOTS:
1919 		if (cet->cet_value < smin) {
1920 			cet->cet_value = smin;
1921 		} else if (cet->cet_value > smax) {
1922 			cet->cet_value = smax;
1923 		}
1924 		break;
1925 	}
1926 
1927 done:
1928 	return err;
1929 }
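/*
 * Worked example (illustrative, using hypothetical provider parameters):
 * with nxp_tx_slots = 128 and nxp_buf_size = 2048, the TX bounds computed
 * above are smin = 1, smax = 127, bmin = 1 and bmax = 127 * 2048 = 260096.
 * A request of { CHANNEL_THRESHOLD_UNIT_SLOTS, 512 } would be clamped down
 * to 127, and { CHANNEL_THRESHOLD_UNIT_BYTES, 0 } would be raised to 1.
 */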
1930 
1931 #if SK_LOG
1932 /* Hoisted out of line to reduce kernel stack footprint */
1933 SK_LOG_ATTRIBUTE
1934 static void
1935 ch_connect_log1(const struct kern_nexus *nx, const struct ch_info *cinfo,
1936     const struct chreq *chr, const struct kern_channel *ch,
1937     const struct kern_nexus_domain_provider *nxdom_prov,
1938     struct proc *p)
1939 {
1940 	struct __user_channel_schema *ch_schema = ch->ch_schema;
1941 	uuid_string_t uuidstr;
1942 	unsigned int n;
1943 	ring_id_t i, j;
1944 
1945 	ASSERT(ch_schema != NULL || (ch->ch_flags & CHANF_KERNEL));
1946 	if (ch_schema != NULL) {
1947 		SK_D("channel_schema at 0x%llx", SK_KVA(ch_schema));
1948 		SK_D("  kern_name:     \"%s\"", ch_schema->csm_kern_name);
1949 		SK_D("  kern_uuid:     %s",
1950 		    sk_uuid_unparse(ch_schema->csm_kern_uuid, uuidstr));
1951 		SK_D("  flags:         0x%b", ch_schema->csm_flags, CSM_BITS);
1952 		SK_D("  tx_rings:      %u [%u,%u]", ch_schema->csm_tx_rings,
1953 		    cinfo->cinfo_first_tx_ring, cinfo->cinfo_last_tx_ring);
1954 		SK_D("  rx_rings:      %u [%u,%u]", ch_schema->csm_rx_rings,
1955 		    cinfo->cinfo_first_rx_ring, cinfo->cinfo_last_rx_ring);
1956 
1957 		j = ch->ch_last[NR_TX];
1958 		for (n = 0, i = ch->ch_first[NR_TX]; i < j; n++, i++) {
1959 			SK_D("  tx_ring_%u_off: 0x%llx", i,
1960 			    (uint64_t)ch_schema->csm_ring_ofs[n].ring_off);
1961 			SK_D("  tx_sd_%u_off:   0x%llx", i,
1962 			    (uint64_t)ch_schema->csm_ring_ofs[n].sd_off);
1963 		}
1964 		j = n;
1965 		for (n = 0, i = ch->ch_first[NR_RX];
1966 		    i < ch->ch_last[NR_RX]; n++, i++) {
1967 			SK_D("  rx_ring_%u_off: 0x%llx", i,
1968 			    (uint64_t)ch_schema->csm_ring_ofs[n + j].ring_off);
1969 			SK_D("  rx_sd_%u_off:   0x%llx", i,
1970 			    (uint64_t)ch_schema->csm_ring_ofs[n + j].sd_off);
1971 		}
1972 		SK_D("  md_type:       %u", ch_schema->csm_md_type);
1973 		SK_D("  md_subtype:    %u", ch_schema->csm_md_subtype);
1974 		SK_D("  stats_ofs:     0x%llx", ch_schema->csm_stats_ofs);
1975 		SK_D("  stats_type:    %u", ch_schema->csm_stats_type);
1976 		SK_D("  flowadv_ofs:   0x%llx", ch_schema->csm_flowadv_ofs);
1977 		SK_D("  flowadv_max:   %u", ch_schema->csm_flowadv_max);
1978 		SK_D("  nexusadv_ofs:  0x%llx", ch_schema->csm_nexusadv_ofs);
1979 	}
1980 
1981 	SK_D("ch 0x%llx (%s:%s:\"%s\":%u:%d)",
1982 	    SK_KVA(ch), nxdom_prov->nxdom_prov_dom->nxdom_name,
1983 	    nxdom_prov->nxdom_prov_name, ch->ch_na->na_name,
1984 	    cinfo->cinfo_nx_port, (int)cinfo->cinfo_ch_ring_id);
1985 	SK_D("  ch UUID: %s", sk_uuid_unparse(cinfo->cinfo_ch_id, uuidstr));
1986 	SK_D("  nx UUID: %s", sk_uuid_unparse(nx->nx_uuid, uuidstr));
1987 	SK_D("  flags:   0x%b", ch->ch_flags, CHANF_BITS);
1988 	SK_D("  task:    0x%llx %s(%d)", SK_KVA(ch->ch_mmap.ami_maptask),
1989 	    sk_proc_name_address(p), sk_proc_pid(p));
1990 	SK_D("  txlowat: %u (%s)", cinfo->cinfo_tx_lowat.cet_value,
1991 	    ((cinfo->cinfo_tx_lowat.cet_unit == CHANNEL_THRESHOLD_UNIT_BYTES) ?
1992 	    "bytes" : "slots"));
1993 	SK_D("  rxlowat: %u (%s)", cinfo->cinfo_rx_lowat.cet_value,
1994 	    ((cinfo->cinfo_rx_lowat.cet_unit == CHANNEL_THRESHOLD_UNIT_BYTES) ?
1995 	    "bytes" : "slots"));
1996 	SK_D("  mmapref: 0x%llx", SK_KVA(ch->ch_mmap.ami_mapref));
1997 	SK_D("  mapaddr: 0x%llx", (uint64_t)cinfo->cinfo_mem_base);
1998 	SK_D("  mapsize: 0x%llx (%llu KB)",
1999 	    (uint64_t)cinfo->cinfo_mem_map_size,
2000 	    (uint64_t)cinfo->cinfo_mem_map_size >> 10);
2001 	SK_D("  memsize: 0x%llx (%llu KB)",
2002 	    (uint64_t)chr->cr_memsize, (uint64_t)chr->cr_memsize >> 10);
2003 	SK_D("  offset:  0x%llx", (uint64_t)cinfo->cinfo_schema_offset);
2004 }
2005 
2006 SK_LOG_ATTRIBUTE
2007 static void
2008 ch_connect_log2(const struct kern_nexus *nx, int err)
2009 {
2010 	uuid_string_t nx_uuidstr;
2011 
2012 	SK_ERR("Error connecting to nexus UUID %s: %d",
2013 	    sk_uuid_unparse(nx->nx_uuid, nx_uuidstr), err);
2014 }
2015 #endif /* SK_LOG */
2016 
2017 static struct kern_channel *
2018 ch_connect(struct kern_nexus *nx, struct chreq *chr, struct kern_channel *ch0,
2019     struct nxbind *nxb, struct proc *p, int fd, int *err)
2020 {
2021 	struct kern_nexus_domain_provider *nxdom_prov;
2022 	struct kern_channel *ch = NULL;
2023 	struct ch_info *cinfo = NULL;
2024 	uint32_t ch_mode = chr->cr_mode;
2025 	boolean_t config = FALSE;
2026 	struct nxdom *nxdom;
2027 	boolean_t reserved_port = FALSE;
2028 
2029 	ASSERT(!(ch_mode & CHMODE_KERNEL) || p == kernproc);
2030 	ASSERT(chr->cr_port != NEXUS_PORT_ANY || (ch_mode & CHMODE_KERNEL));
2031 	SK_LOCK_ASSERT_HELD();
2032 
2033 	/* validate thresholds before we proceed any further */
2034 	if ((*err = ch_ev_thresh_validate(nx, NR_TX, &chr->cr_tx_lowat)) != 0 ||
2035 	    (*err = ch_ev_thresh_validate(nx, NR_RX, &chr->cr_rx_lowat)) != 0) {
2036 		goto done;
2037 	}
2038 
2039 	if (!(ch_mode & CHMODE_KERNEL) && !NX_USER_CHANNEL_PROV(nx)) {
2040 		*err = ENOTSUP;
2041 		goto done;
2042 	}
2043 
2044 	ch = ch_alloc(Z_WAITOK);
2045 
2046 	lck_mtx_lock(&ch->ch_lock);
2047 
2048 	uuid_generate_random(ch->ch_info->cinfo_ch_id);
2049 	ch->ch_fd = fd;
2050 	ch->ch_pid = proc_pid(p);
2051 	(void) snprintf(ch->ch_name, sizeof(ch->ch_name), "%s",
2052 	    proc_name_address(p));
2053 
2054 	nxdom_prov = NX_DOM_PROV(nx);
2055 	nxdom = NX_DOM(nx);
2056 
2057 	if (ch_mode & (CHMODE_KERNEL | CHMODE_NO_NXREF)) {
2058 		/*
2059 		 * CHANF_KERNEL implies a channel opened by a kernel
2060 		 * subsystem, and is triggered by the CHMODE_KERNEL
2061 		 * flag which (only ever) set by ch_open_special().
2062 		 * flag which is (only ever) set by ch_open_special().
2063 		 * CHANF_NONXREF can be optionally set based on the
2064 		 * CHMODE_NO_NXREF request flag.  This must only be
2065 		 * set by ch_open_special() as well, hence we verify.
2066 		 */
2067 		ASSERT(p == kernproc);
2068 		ASSERT(ch_mode & CHMODE_KERNEL);
2069 		atomic_bitset_32(&ch->ch_flags, CHANF_KERNEL);
2070 		if (ch_mode & CHMODE_NO_NXREF) {
2071 			atomic_bitset_32(&ch->ch_flags, CHANF_NONXREF);
2072 		}
2073 
2074 		config = (ch_mode & CHMODE_CONFIG) != 0;
2075 		if (chr->cr_port == NEXUS_PORT_ANY) {
2076 			if (nxdom->nxdom_find_port == NULL) {
2077 				*err = ENOTSUP;
2078 				goto done;
2079 			}
2080 
2081 			/*
2082 			 * If this is an ephemeral port request, find one for
2083 			 * the client; we ask for the reserved port range if
2084 			 * this is a configuration request (CHMODE_CONFIG).
2085 			 */
2086 			if ((*err = nxdom->nxdom_find_port(nx,
2087 			    config, &chr->cr_port)) != 0) {
2088 				goto done;
2089 			}
2090 		}
2091 	}
2092 
2093 	if (skywalk_check_platform_binary(p)) {
2094 		atomic_bitset_32(&ch->ch_flags, CHANF_PLATFORM);
2095 	}
2096 
2097 	ASSERT(chr->cr_port != NEXUS_PORT_ANY);
2098 
2099 	reserved_port = (nxdom->nxdom_port_is_reserved != NULL &&
2100 	    (*nxdom->nxdom_port_is_reserved)(nx, chr->cr_port));
2101 	if (!config && reserved_port) {
2102 		*err = EDOM;
2103 		goto done;
2104 	}
2105 
2106 	SK_D("%s(%d) %snexus port %u requested", sk_proc_name_address(p),
2107 	    sk_proc_pid(p), reserved_port ? "[reserved] " : "", chr->cr_port);
2108 
2109 	if ((*err = nxdom_prov->nxdom_prov_dom->nxdom_connect(nxdom_prov,
2110 	    nx, ch, chr, ch0, nxb, p)) != 0) {
2111 		goto done;
2112 	}
2113 
2114 	cinfo = ch->ch_info;
2115 	uuid_copy(cinfo->cinfo_nx_uuid, nx->nx_uuid);
2116 	/* for easy access to immutables */
2117 	bcopy((void *)nx->nx_prov->nxprov_params,
2118 	    (void *)&cinfo->cinfo_nxprov_params, sizeof(struct nxprov_params));
2119 	cinfo->cinfo_ch_mode = ch_mode;
2120 	cinfo->cinfo_ch_ring_id = chr->cr_ring_id;
2121 	cinfo->cinfo_nx_port = chr->cr_port;
2122 	cinfo->cinfo_mem_base = ch->ch_mmap.ami_mapaddr;
2123 	cinfo->cinfo_mem_map_size = ch->ch_mmap.ami_mapsize;
2124 	cinfo->cinfo_schema_offset = chr->cr_memoffset;
2125 	cinfo->cinfo_num_bufs =
2126 	    skmem_arena_nexus(ch->ch_na->na_arena)->
2127 	    arn_rx_pp->pp_buf_region->skr_params.srp_c_obj_cnt;
2128 	/*
2129 	 * ch_last is really the number of rings, but we need to return
2130 	 * the actual zero-based ring ID to the client.  Make sure that
2131 	 * is the case here and adjust last_{tx,rx}_ring accordingly.
2132 	 */
2133 	ASSERT((ch->ch_last[NR_TX] > 0) ||
2134 	    (ch->ch_na->na_type == NA_NETIF_COMPAT_DEV));
2135 	ASSERT((ch->ch_last[NR_RX] > 0) ||
2136 	    (ch->ch_na->na_type == NA_NETIF_COMPAT_HOST));
2137 	cinfo->cinfo_first_tx_ring = ch->ch_first[NR_TX];
2138 	cinfo->cinfo_last_tx_ring = ch->ch_last[NR_TX] - 1;
2139 	cinfo->cinfo_first_rx_ring = ch->ch_first[NR_RX];
2140 	cinfo->cinfo_last_rx_ring = ch->ch_last[NR_RX] - 1;
2141 	cinfo->cinfo_tx_lowat = chr->cr_tx_lowat;
2142 	cinfo->cinfo_rx_lowat = chr->cr_rx_lowat;
2143 
2144 	if (ch_mode & CHMODE_NO_NXREF) {
2145 		ASSERT(ch_mode & CHMODE_KERNEL);
2146 		STAILQ_INSERT_TAIL(&nx->nx_ch_nonxref_head, ch, ch_link);
2147 	} else {
2148 		STAILQ_INSERT_TAIL(&nx->nx_ch_head, ch, ch_link);
2149 		nx->nx_ch_count++;
2150 	}
2151 	atomic_bitset_32(&ch->ch_flags, CHANF_ATTACHED);
2152 	ch->ch_nexus = nx;
2153 	nx_retain_locked(nx);   /* hold a ref on the nexus */
2154 
2155 	ch_retain_locked(ch);   /* one for being in the list */
2156 	ch_retain_locked(ch);   /* one for the caller */
2157 
2158 	/*
2159 	 * Now that we've successfully created the nexus adapter, inform the
2160 	 * nexus provider about the rings and the slots within each ring.
2161 	 * This is a no-op for internal nexus providers.
2162 	 */
2163 	if ((*err = nxprov_advise_connect(nx, ch, p)) != 0) {
2164 		lck_mtx_unlock(&ch->ch_lock);
2165 
2166 		/* gracefully close this fully-formed channel */
2167 		if (ch->ch_flags & CHANF_KERNEL) {
2168 			ch_close_special(ch);
2169 		} else {
2170 			ch_close(ch, TRUE);
2171 		}
2172 		(void) ch_release_locked(ch);
2173 		ch = NULL;
2174 		goto done;
2175 	}
2176 
2177 	ASSERT(ch->ch_schema == NULL ||
2178 	    (ch->ch_schema->csm_flags & CSM_ACTIVE));
2179 
2180 #if SK_LOG
2181 	if (__improbable(sk_verbose != 0)) {
2182 		ch_connect_log1(nx, cinfo, chr, ch, nxdom_prov, p);
2183 	}
2184 #endif /* SK_LOG */
2185 
2186 done:
2187 	if (ch != NULL) {
2188 		lck_mtx_unlock(&ch->ch_lock);
2189 	}
2190 	if (*err != 0) {
2191 #if SK_LOG
2192 		if (__improbable(sk_verbose != 0)) {
2193 			ch_connect_log2(nx, *err);
2194 		}
2195 #endif /* SK_LOG */
2196 		if (ch != NULL) {
2197 			ch_free(ch);
2198 			ch = NULL;
2199 		}
2200 	}
2201 	return ch;
2202 }
2203 
2204 static void
2205 ch_disconnect(struct kern_channel *ch)
2206 {
2207 	struct kern_nexus *nx = ch->ch_nexus;
2208 	struct kern_nexus_domain_provider *nxdom_prov = NX_DOM_PROV(nx);
2209 
2210 	SK_LOCK_ASSERT_HELD();
2211 	LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
2212 
2213 	/*
2214 	 * Inform the nexus provider that the channel has been quiesced
2215 	 * and disconnected from the nexus port.  This is a no-op for
2216 	 * internal nexus providers.
2217 	 */
2218 	nxprov_advise_disconnect(nx, ch);
2219 
2220 	/* Finally, let the domain provider tear down the instance */
2221 	nxdom_prov->nxdom_prov_dom->nxdom_disconnect(nxdom_prov, nx, ch);
2222 }
2223 
2224 void
2225 ch_deactivate(struct kern_channel *ch)
2226 {
2227 	/*
2228 	 * This is a trapdoor flag; once CSM_ACTIVE is cleared,
2229 	 * it will never be set again.  Doing this will cause
2230 	 * os_channel_is_defunct() to indicate that the channel
2231 	 * is defunct and is no longer usable (thus should be
2232 	 * immediately closed).
2233 	 */
2234 	if (ch->ch_schema != NULL &&
2235 	    (ch->ch_schema->csm_flags & CSM_ACTIVE)) {
2236 		atomic_bitclear_32(__DECONST(uint32_t *,
2237 		    &ch->ch_schema->csm_flags), CSM_ACTIVE);
2238 		/* make this globally visible */
2239 		membar_sync();
2240 	}
2241 }
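/*
 * Illustrative sketch (assumption, not from this file): based on the
 * comment above, the user-space os_channel_is_defunct() check presumably
 * reduces to reading the shared schema flag that was just cleared, e.g.
 *
 *	return !(schema->csm_flags & CSM_ACTIVE);
 *
 * where "schema" is the client's mapped __user_channel_schema.
 */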
2242 
2243 int
2244 ch_set_opt(struct kern_channel *ch, struct sockopt *sopt)
2245 {
2246 #pragma unused(ch)
2247 	int err = 0;
2248 
2249 	if (sopt->sopt_dir != SOPT_SET) {
2250 		sopt->sopt_dir = SOPT_SET;
2251 	}
2252 
2253 	switch (sopt->sopt_name) {
2254 	case CHOPT_TX_LOWAT_THRESH:
2255 		err = ch_set_lowat_thresh(ch, NR_TX, sopt);
2256 		break;
2257 
2258 	case CHOPT_RX_LOWAT_THRESH:
2259 		err = ch_set_lowat_thresh(ch, NR_RX, sopt);
2260 		break;
2261 
2262 	case CHOPT_IF_ADV_CONF:
2263 		err = ch_configure_interface_advisory_event(ch, sopt);
2264 		break;
2265 
2266 	default:
2267 		err = ENOPROTOOPT;
2268 		break;
2269 	}
2270 
2271 	return err;
2272 }
2273 
2274 int
2275 ch_get_opt(struct kern_channel *ch, struct sockopt *sopt)
2276 {
2277 #pragma unused(ch)
2278 	int err = 0;
2279 
2280 	if (sopt->sopt_dir != SOPT_GET) {
2281 		sopt->sopt_dir = SOPT_GET;
2282 	}
2283 
2284 	switch (sopt->sopt_name) {
2285 	case CHOPT_TX_LOWAT_THRESH:
2286 		err = ch_get_lowat_thresh(ch, NR_TX, sopt);
2287 		break;
2288 
2289 	case CHOPT_RX_LOWAT_THRESH:
2290 		err = ch_get_lowat_thresh(ch, NR_RX, sopt);
2291 		break;
2292 
2293 	default:
2294 		err = ENOPROTOOPT;
2295 		break;
2296 	}
2297 
2298 	return err;
2299 }
2300 
2301 static int
2302 ch_configure_interface_advisory_event(struct kern_channel *ch,
2303     struct sockopt *sopt)
2304 {
2305 	int err = 0;
2306 	boolean_t enable = 0;
2307 	struct kern_nexus *nx = ch->ch_nexus;
2308 
2309 	LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
2310 	SK_LOCK_ASSERT_NOTHELD();
2311 
2312 	if (sopt->sopt_val == USER_ADDR_NULL) {
2313 		return EINVAL;
2314 	}
2315 	if (nx->nx_adv.nxv_adv == NULL) {
2316 		return ENOTSUP;
2317 	}
2318 	err = sooptcopyin(sopt, &enable, sizeof(enable), sizeof(enable));
2319 	if (err != 0) {
2320 		return err;
2321 	}
2322 
2323 	/*
2324 	 * Drop ch_lock to acquire sk_lock and nx_ch_if_adv_lock due to lock
2325 	 * ordering requirement; check if the channel is closing once ch_lock
2326 	 * is reacquired and bail if so.
2327 	 */
2328 	lck_mtx_unlock(&ch->ch_lock);
2329 	SK_LOCK();
2330 	lck_rw_lock_exclusive(&nx->nx_ch_if_adv_lock);
2331 	lck_mtx_lock(&ch->ch_lock);
2332 	if (ch->ch_flags & CHANF_CLOSING) {
2333 		err = ENXIO;
2334 		goto done;
2335 	}
2336 
2337 	/*
2338 	 * If interface advisory reporting is enabled on the channel, then
2339 	 * add the channel to the list of channels eligible for interface
2340 	 * advisory update on the nexus. If disabled, remove from the list.
2341 	 */
2342 	if (enable) {
2343 		if ((ch->ch_flags & CHANF_IF_ADV) != 0) {
2344 			ASSERT(err == 0);
2345 			goto done;
2346 		}
2347 		bool enable_adv = STAILQ_EMPTY(&nx->nx_ch_if_adv_head);
2348 		atomic_bitset_32(&ch->ch_flags, CHANF_IF_ADV);
2349 		STAILQ_INSERT_TAIL(&nx->nx_ch_if_adv_head, ch, ch_link_if_adv);
2350 		if (enable_adv) {
2351 			nx_netif_config_interface_advisory(nx, true);
2352 		}
2353 		ch_retain_locked(ch);   /* for being in the IF ADV list */
2354 	} else {
2355 		if ((ch->ch_flags & CHANF_IF_ADV) == 0) {
2356 			ASSERT(err == 0);
2357 			goto done;
2358 		}
2359 		STAILQ_REMOVE(&nx->nx_ch_if_adv_head, ch, kern_channel,
2360 		    ch_link_if_adv);
2361 		atomic_bitclear_32(&ch->ch_flags, CHANF_IF_ADV);
2362 		if (STAILQ_EMPTY(&nx->nx_ch_if_adv_head)) {
2363 			nx_netif_config_interface_advisory(nx, false);
2364 		}
2365 		(void) ch_release_locked(ch);
2366 	}
2367 
2368 done:
2369 	lck_mtx_unlock(&ch->ch_lock);
2370 	lck_rw_done(&nx->nx_ch_if_adv_lock);
2371 	SK_UNLOCK();
2372 	lck_mtx_lock(&ch->ch_lock);
2373 
2374 	return err;
2375 }
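/*
 * Illustrative sketch (not from this file): a kernel-originated setopt
 * enabling interface advisory events would carry a boolean_t payload under
 * CHOPT_IF_ADV_CONF; the sockopt setup below is a hypothetical example.
 *
 *	boolean_t enable = TRUE;
 *	struct sockopt sopt = {
 *		.sopt_dir = SOPT_SET,
 *		.sopt_name = CHOPT_IF_ADV_CONF,
 *		.sopt_val = (user_addr_t)&enable,
 *		.sopt_valsize = sizeof(enable),
 *		.sopt_p = kernproc,
 *	};
 *	error = ch_set_opt(ch, &sopt);
 *
 * Note the lock juggling above: ch_lock is dropped so that sk_lock and
 * nx_ch_if_adv_lock can be taken first, then ch_lock is reacquired and
 * CHANF_CLOSING is rechecked.
 */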
2376 
2377 static int
2378 ch_set_lowat_thresh(struct kern_channel *ch, enum txrx t,
2379     struct sockopt *sopt)
2380 {
2381 	struct ch_ev_thresh cet, *ocet;
2382 	int err = 0;
2383 
2384 	LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
2385 
2386 	if (sopt->sopt_val == USER_ADDR_NULL) {
2387 		return EINVAL;
2388 	}
2389 
2390 	bzero(&cet, sizeof(cet));
2391 	err = sooptcopyin(sopt, &cet, sizeof(cet), sizeof(cet));
2392 	if (err == 0) {
2393 		err = ch_ev_thresh_validate(ch->ch_nexus, t, &cet);
2394 		if (err == 0) {
2395 			if (t == NR_TX) {
2396 				ocet = &ch->ch_info->cinfo_tx_lowat;
2397 			} else {
2398 				ocet = &ch->ch_info->cinfo_rx_lowat;
2399 			}
2400 
2401 			/* if there is no change, we're done */
2402 			if (ocet->cet_unit == cet.cet_unit &&
2403 			    ocet->cet_value == cet.cet_value) {
2404 				return 0;
2405 			}
2406 
2407 			*ocet = cet;
2408 
2409 			for_rx_tx(t) {
2410 				ring_id_t qfirst = ch->ch_first[t];
2411 				ring_id_t qlast = ch->ch_last[t];
2412 				uint32_t i;
2413 
2414 				for (i = qfirst; i < qlast; i++) {
2415 					struct __kern_channel_ring *kring =
2416 					    &NAKR(ch->ch_na, t)[i];
2417 
2418 					(void) kring->ckr_na_notify(kring,
2419 					    sopt->sopt_p, 0);
2420 				}
2421 			}
2422 
2423 			(void) sooptcopyout(sopt, &cet, sizeof(cet));
2424 		}
2425 	}
2426 
2427 	return err;
2428 }
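/*
 * Illustrative sketch (not from this file): the payload for
 * CHOPT_TX_LOWAT_THRESH / CHOPT_RX_LOWAT_THRESH is a struct ch_ev_thresh.
 * A caller requesting a low watermark of 4 slots might pass:
 *
 *	struct ch_ev_thresh cet = {
 *		.cet_unit  = CHANNEL_THRESHOLD_UNIT_SLOTS,
 *		.cet_value = 4,
 *	};
 *
 * On success the possibly-clamped value is copied back out, so the caller
 * can observe the effective threshold.
 */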
2429 
2430 static int
2431 ch_get_lowat_thresh(struct kern_channel *ch, enum txrx t,
2432     struct sockopt *sopt)
2433 {
2434 	struct ch_ev_thresh cet;
2435 
2436 	LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
2437 
2438 	if (sopt->sopt_val == USER_ADDR_NULL) {
2439 		return EINVAL;
2440 	}
2441 
2442 	if (t == NR_TX) {
2443 		cet = ch->ch_info->cinfo_tx_lowat;
2444 	} else {
2445 		cet = ch->ch_info->cinfo_rx_lowat;
2446 	}
2447 
2448 	return sooptcopyout(sopt, &cet, sizeof(cet));
2449 }
2450 
2451 static struct kern_channel *
2452 ch_alloc(zalloc_flags_t how)
2453 {
2454 	struct kern_channel *ch;
2455 
2456 	ch = zalloc_flags(ch_zone, how | Z_ZERO);
2457 	if (ch) {
2458 		lck_mtx_init(&ch->ch_lock, &channel_lock_group, &channel_lock_attr);
2459 		ch->ch_info = zalloc_flags(ch_info_zone, how | Z_ZERO);
2460 	}
2461 	return ch;
2462 }
2463 
2464 static void
2465 ch_free(struct kern_channel *ch)
2466 {
2467 	ASSERT(ch->ch_refcnt == 0);
2468 	ASSERT(ch->ch_pp == NULL);
2469 	ASSERT(!(ch->ch_flags & (CHANF_ATTACHED | CHANF_EXT_CONNECTED |
2470 	    CHANF_EXT_PRECONNECT | CHANF_IF_ADV)));
2471 	lck_mtx_destroy(&ch->ch_lock, &channel_lock_group);
2472 	SK_DF(SK_VERB_MEM, "ch 0x%llx FREE", SK_KVA(ch));
2473 	ASSERT(ch->ch_info != NULL);
2474 	zfree(ch_info_zone, ch->ch_info);
2475 	ch->ch_info = NULL;
2476 	zfree(ch_zone, ch);
2477 }
2478 
2479 void
2480 ch_retain_locked(struct kern_channel *ch)
2481 {
2482 	SK_LOCK_ASSERT_HELD();
2483 
2484 	ch->ch_refcnt++;
2485 	VERIFY(ch->ch_refcnt != 0);
2486 }
2487 
2488 void
2489 ch_retain(struct kern_channel *ch)
2490 {
2491 	SK_LOCK();
2492 	ch_retain_locked(ch);
2493 	SK_UNLOCK();
2494 }
2495 
2496 int
2497 ch_release_locked(struct kern_channel *ch)
2498 {
2499 	int oldref = ch->ch_refcnt;
2500 
2501 	SK_LOCK_ASSERT_HELD();
2502 
2503 	VERIFY(ch->ch_refcnt != 0);
2504 	if (--ch->ch_refcnt == 0) {
2505 		ch_free(ch);
2506 	}
2507 
2508 	return oldref == 1;
2509 }
2510 
2511 int
2512 ch_release(struct kern_channel *ch)
2513 {
2514 	int lastref;
2515 
2516 	SK_LOCK();
2517 	lastref = ch_release_locked(ch);
2518 	SK_UNLOCK();
2519 
2520 	return lastref;
2521 }
2522 
2523 void
2524 ch_dtor(void *arg)
2525 {
2526 	struct kern_channel *ch = arg;
2527 
2528 	SK_LOCK();
2529 	ch_close(ch, TRUE);
2530 	(void) ch_release_locked(ch);
2531 	SK_UNLOCK();
2532 }
2533