1 /*
2 * Copyright (c) 2015-2021 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 /*
30 * Copyright (C) 2012-2014 Matteo Landi, Luigi Rizzo, Giuseppe Lettieri.
31 * All rights reserved.
32 * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
33 *
34 * Redistribution and use in source and binary forms, with or without
35 * modification, are permitted provided that the following conditions
36 * are met:
37 * 1. Redistributions of source code must retain the above copyright
38 * notice, this list of conditions and the following disclaimer.
39 * 2. Redistributions in binary form must reproduce the above copyright
40 * notice, this list of conditions and the following disclaimer in the
41 * documentation and/or other materials provided with the distribution.
42 *
43 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
44 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
45 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
46 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
47 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
48 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
49 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
50 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
51 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
52 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
53 * SUCH DAMAGE.
54 */
55
56 #include <sys/eventvar.h>
57 #include <sys/kdebug.h>
58 #include <sys/sdt.h>
59 #include <skywalk/os_skywalk_private.h>
60 #include <skywalk/nexus/netif/nx_netif.h>
61
62 #define KEV_EVTID(code) BSDDBG_CODE(DBG_BSD_KEVENT, (code))
63
64 struct ch_event_result {
65 uint32_t tx_data;
66 uint32_t rx_data;
67 };
68
69 static LCK_GRP_DECLARE(channel_lock_group, "sk_ch_lock");
70 static LCK_GRP_DECLARE(channel_kn_lock_group, "sk_ch_kn_lock");
71 LCK_ATTR_DECLARE(channel_lock_attr, 0, 0);
72
73 static void csi_selrecord(struct ch_selinfo *, struct proc *, void *);
74 static void csi_selwakeup(struct ch_selinfo *, boolean_t, boolean_t, uint32_t);
75 static inline void csi_selwakeup_delayed(struct ch_selinfo *);
76 static inline void csi_selwakeup_common(struct ch_selinfo *, boolean_t,
77 boolean_t, boolean_t, uint32_t);
78 static boolean_t csi_tcall_start(struct ch_selinfo *);
79 static void csi_tcall(thread_call_param_t, thread_call_param_t);
80 static uint64_t csi_tcall_update_interval(struct ch_selinfo *);
81
82 static void ch_redzone_init(void);
83 static void ch_close_common(struct kern_channel *, boolean_t, boolean_t);
84 static struct kern_channel *ch_find(struct kern_nexus *, nexus_port_t,
85 ring_id_t);
86 static int ch_ev_thresh_validate(struct kern_nexus *, enum txrx,
87 struct ch_ev_thresh *);
88 static struct kern_channel *ch_connect(struct kern_nexus *, struct chreq *,
89 struct kern_channel *, struct nxbind *, struct proc *, int, int *);
90 static void ch_disconnect(struct kern_channel *);
91 static int ch_set_lowat_thresh(struct kern_channel *, enum txrx,
92 struct sockopt *);
93 static int ch_get_lowat_thresh(struct kern_channel *, enum txrx,
94 struct sockopt *);
95 static struct kern_channel *ch_alloc(zalloc_flags_t);
96 static void ch_free(struct kern_channel *);
97 static int ch_configure_interface_advisory_event(struct kern_channel *ch,
98 struct sockopt *sopt);
99
100 static int filt_chrwattach(struct knote *, struct kevent_qos_s *kev);
101 static void filt_chrwdetach(struct knote *, boolean_t);
102 static void filt_chrdetach(struct knote *);
103 static void filt_chwdetach(struct knote *);
104 static int filt_chrw(struct knote *, long, int);
105 static int filt_chread(struct knote *, long);
106 static int filt_chwrite(struct knote *, long);
107
108 static int filt_chtouch(struct knote *, struct kevent_qos_s *, int);
109 static int filt_chrtouch(struct knote *, struct kevent_qos_s *);
110 static int filt_chwtouch(struct knote *, struct kevent_qos_s *);
111 static int filt_chprocess(struct knote *, struct kevent_qos_s *, int);
112 static int filt_chrprocess(struct knote *, struct kevent_qos_s *);
113 static int filt_chwprocess(struct knote *, struct kevent_qos_s *);
114 static int filt_che_attach(struct knote *, struct kevent_qos_s *kev);
115 static void filt_che_detach(struct knote *);
116 static int filt_che_event(struct knote *, long);
117 static int filt_che_touch(struct knote *, struct kevent_qos_s *);
118 static int filt_che_process(struct knote *, struct kevent_qos_s *);
119 static int filt_chan_extended_common(struct knote *, long);
120
121 static int ch_event(struct kern_channel *ch, int events,
122 void *wql, struct proc *p, struct ch_event_result *,
123 const boolean_t is_kevent, int *errno, const boolean_t);
124
125 const struct filterops skywalk_channel_rfiltops = {
126 .f_isfd = 1,
127 .f_attach = filt_chrwattach,
128 .f_detach = filt_chrdetach,
129 .f_event = filt_chread,
130 .f_touch = filt_chrtouch,
131 .f_process = filt_chrprocess,
132 };
133
134 const struct filterops skywalk_channel_wfiltops = {
135 .f_isfd = 1,
136 .f_attach = filt_chrwattach,
137 .f_detach = filt_chwdetach,
138 .f_event = filt_chwrite,
139 .f_touch = filt_chwtouch,
140 .f_process = filt_chwprocess,
141 };
142
143 const struct filterops skywalk_channel_efiltops = {
144 .f_isfd = 1,
145 .f_attach = filt_che_attach,
146 .f_detach = filt_che_detach,
147 .f_event = filt_che_event,
148 .f_touch = filt_che_touch,
149 .f_process = filt_che_process,
150 };
151
152 /* mitigation intervals in ns */
153 #define CH_MIT_IVAL_MIN NSEC_PER_USEC
154
155 static uint64_t ch_mit_ival = CH_MIT_IVAL_DEFAULT;
156
157 #if (DEVELOPMENT || DEBUG)
158 SYSCTL_NODE(_kern_skywalk, OID_AUTO, channel,
159 CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Skywalk channel parameters");
160 SYSCTL_QUAD(_kern_skywalk_channel, OID_AUTO, mit_ival,
161 CTLFLAG_RW | CTLFLAG_LOCKED, &ch_mit_ival, "");
162 #endif /* !DEVELOPMENT && !DEBUG */
163
164 static SKMEM_TYPE_DEFINE(ch_zone, struct kern_channel);
165
166 static SKMEM_TYPE_DEFINE(ch_info_zone, struct ch_info);
167
168 static int __ch_inited = 0;
169
170 /*
171 * Global cookies to hold the random numbers used for verifying
172 * user metadata red zone violations.
173 */
174 uint64_t __ch_umd_redzone_cookie = 0;
175
176 #define SKMEM_TAG_CH_KEY "com.apple.skywalk.channel.key"
177 SKMEM_TAG_DEFINE(skmem_tag_ch_key, SKMEM_TAG_CH_KEY);
178
179 static void
180 ch_redzone_init(void)
181 {
182 _CASSERT(sizeof(__ch_umd_redzone_cookie) ==
183 sizeof(((struct __metadata_preamble *)0)->mdp_redzone));
184 _CASSERT(METADATA_PREAMBLE_SZ == sizeof(struct __metadata_preamble));
185 _CASSERT(sizeof(struct __slot_desc) == 8);
186
187 /* Initialize random user red zone cookie values */
188 do {
189 read_random(&__ch_umd_redzone_cookie,
190 sizeof(__ch_umd_redzone_cookie));
191 } while (__ch_umd_redzone_cookie == 0);
192
193 SK_D("__ch_umd_redzone_cookie: 0x%llx", __ch_umd_redzone_cookie);
194 }
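
/*
 * The cookie is stamped into the mdp_redzone field of each user
 * metadata preamble. A later integrity check (sketch only; the actual
 * verification lives elsewhere in Skywalk) would look like:
 *
 *	if (mdp->mdp_redzone != __ch_umd_redzone_cookie) {
 *		// red zone violation: user metadata was corrupted
 *	}
 */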
195
196 int
197 channel_init(void)
198 {
199 int error = 0;
200
201 SK_LOCK_ASSERT_HELD();
202 ASSERT(!__ch_inited);
203
204 _CASSERT(offsetof(struct __user_packet, pkt_qum) == 0);
205 _CASSERT(offsetof(struct __kern_packet, pkt_qum) == 0);
206
207 ch_redzone_init();
208
209 __ch_inited = 1;
210
211 return error;
212 }
213
214 void
215 channel_fini(void)
216 {
217 SK_LOCK_ASSERT_HELD();
218
219 if (__ch_inited) {
220 __ch_umd_redzone_cookie = 0;
221 __ch_inited = 0;
222 }
223 }
224
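/*
 * Initialize a channel selinfo. When mitigation is requested, a one-shot
 * thread call is allocated to coalesce wakeups, and the effective interval
 * tracks the global ch_mit_ival override. An illustrative call for a ring's
 * selinfo (mitigation_enabled is a hypothetical flag) would be:
 *
 *	csi_init(&kring->ckr_si, mitigation_enabled, ch_mit_ival);
 */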
225 void
226 csi_init(struct ch_selinfo *csi, boolean_t mitigation, uint64_t mit_ival)
227 {
228 csi->csi_flags = 0;
229 csi->csi_pending = 0;
230 if (mitigation) {
231 csi->csi_interval = mit_ival;
232 csi->csi_eff_interval = ch_mit_ival; /* global override */
233 os_atomic_or(&csi->csi_flags, CSI_MITIGATION, relaxed);
234 csi->csi_tcall = thread_call_allocate_with_options(csi_tcall,
235 csi, THREAD_CALL_PRIORITY_KERNEL, THREAD_CALL_OPTIONS_ONCE);
236 /* this must not fail */
237 VERIFY(csi->csi_tcall != NULL);
238 } else {
239 csi->csi_interval = 0;
240 csi->csi_eff_interval = 0;
241 csi->csi_tcall = NULL;
242 }
243 lck_mtx_init(&csi->csi_lock, &channel_kn_lock_group, &channel_lock_attr);
244 klist_init(&csi->csi_si.si_note);
245 }
246
247 void
248 csi_destroy(struct ch_selinfo *csi)
249 {
250 /* if not already destroyed, do so now */
251 if ((os_atomic_or_orig(&csi->csi_flags, CSI_DESTROYED, relaxed) &
252 CSI_DESTROYED) == 0) {
253 CSI_LOCK(csi);
254 /* must have been set by above atomic op */
255 VERIFY(csi->csi_flags & CSI_DESTROYED);
256 if (csi->csi_flags & CSI_MITIGATION) {
257 thread_call_t __single tcall = csi->csi_tcall;
258 VERIFY(tcall != NULL);
259 CSI_UNLOCK(csi);
260
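/*
 * Cancel any pending invocation before freeing the thread call; if the
 * first free attempt fails because a call is still in flight, cancel
 * and retry once, which must succeed.
 */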
261 (void) thread_call_cancel_wait(tcall);
262 if (!thread_call_free(tcall)) {
263 boolean_t freed;
264 (void) thread_call_cancel_wait(tcall);
265 freed = thread_call_free(tcall);
266 VERIFY(freed);
267 }
268
269 CSI_LOCK(csi);
270 csi->csi_tcall = NULL;
271 os_atomic_andnot(&csi->csi_flags, CSI_MITIGATION,
272 relaxed);
273 }
274 csi->csi_pending = 0;
275 CSI_UNLOCK(csi);
276
277 selthreadclear(&csi->csi_si);
278 /* now we don't need the mutex anymore */
279 lck_mtx_destroy(&csi->csi_lock, &channel_kn_lock_group);
280 }
281 }
282
283 /*
284 * Called only for select(2).
285 */
286 __attribute__((always_inline))
287 static inline void
288 csi_selrecord(struct ch_selinfo *csi, struct proc *p, void *wql)
289 {
290 struct selinfo *si = &csi->csi_si;
291
292 CSI_LOCK_ASSERT_HELD(csi);
293 selrecord(p, si, wql);
294 }
295
296 void
297 csi_selrecord_one(struct __kern_channel_ring *kring, struct proc *p, void *wql)
298 {
299 struct ch_selinfo *csi = &kring->ckr_si;
300
301 CSI_LOCK(csi);
302 SK_DF(SK_VERB_EVENTS, "[%s] na \"%s\" (0x%llx) kr %s (0x%llx) "
303 "si 0x%llx si_flags 0x%x", (kring->ckr_tx == NR_TX) ? "W" : "R",
304 KRNA(kring)->na_name, SK_KVA(KRNA(kring)), kring->ckr_name,
305 SK_KVA(kring), SK_KVA(&csi->csi_si), csi->csi_si.si_flags);
306
307 csi_selrecord(csi, p, wql);
308 CSI_UNLOCK(csi);
309 }
310
311 void
312 csi_selrecord_all(struct nexus_adapter *na, enum txrx t, struct proc *p,
313 void *wql)
314 {
315 struct ch_selinfo *csi = &na->na_si[t];
316
317 CSI_LOCK(csi);
318 SK_DF(SK_VERB_EVENTS, "[%s] na \"%s\" (0x%llx) si 0x%llx si_flags 0x%x",
319 (t == NR_TX) ? "W" : "R", na->na_name, SK_KVA(na),
320 SK_KVA(&csi->csi_si), csi->csi_si.si_flags);
321
322 csi_selrecord(csi, p, wql);
323 CSI_UNLOCK(csi);
324 }
325
326 /*
327 * Called from na_post_event().
328 */
329 __attribute__((always_inline))
330 static inline void
331 csi_selwakeup(struct ch_selinfo *csi, boolean_t within_kevent,
332 boolean_t selwake, uint32_t hint)
333 {
334 struct selinfo *si = &csi->csi_si;
335
336 CSI_LOCK_ASSERT_HELD(csi);
337 csi->csi_pending = 0;
338 if (selwake) {
339 selwakeup(si);
340 }
341 if ((csi->csi_flags & CSI_KNOTE) && !within_kevent) {
342 KNOTE(&si->si_note, hint);
343 }
344 }
345
346 __attribute__((always_inline))
347 static inline void
348 csi_selwakeup_delayed(struct ch_selinfo *csi)
349 {
350 CSI_LOCK_ASSERT_HELD(csi);
351 ASSERT(csi->csi_flags & CSI_MITIGATION);
352 ASSERT(csi->csi_tcall != NULL);
353
354 if (thread_call_isactive(csi->csi_tcall)) {
355 csi->csi_pending++;
356 } else if (!csi_tcall_start(csi)) {
357 csi_selwakeup(csi, FALSE, FALSE, 0);
358 }
359 }
360
361 __attribute__((always_inline))
362 static inline void
363 csi_selwakeup_common(struct ch_selinfo *csi, boolean_t nodelay,
364 boolean_t within_kevent, boolean_t selwake, uint32_t hint)
365 {
366 CSI_LOCK_ASSERT_HELD(csi);
367
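/*
 * Deliver immediately when mitigation is off or inapplicable: explicit
 * no-delay requests, calls made from within kevent, knote-only wakeups
 * and wakeups carrying a hint all bypass the delayed thread-call path.
 */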
368 if (nodelay || within_kevent || !selwake || hint != 0 ||
369 !(csi->csi_flags & CSI_MITIGATION)) {
370 csi_selwakeup(csi, within_kevent, selwake, hint);
371 } else {
372 csi_selwakeup_delayed(csi);
373 }
374 }
375
376 void
377 csi_selwakeup_one(struct __kern_channel_ring *kring, boolean_t nodelay,
378 boolean_t within_kevent, boolean_t selwake, uint32_t hint)
379 {
380 struct ch_selinfo *csi = &kring->ckr_si;
381
382 CSI_LOCK(csi);
383 SK_DF(SK_VERB_EVENTS, "[%s] na \"%s\" (0x%llx) kr %s (0x%llx) "
384 "si 0x%llx si_flags 0x%x nodelay %u kev %u sel %u hint 0x%b",
385 (kring->ckr_tx == NR_TX) ? "W" : "R", KRNA(kring)->na_name,
386 SK_KVA(KRNA(kring)), kring->ckr_name, SK_KVA(kring),
387 SK_KVA(&csi->csi_si), csi->csi_si.si_flags, nodelay,
388 within_kevent, selwake, hint, CHAN_FILT_HINT_BITS);
389
390 csi_selwakeup_common(csi, nodelay, within_kevent, selwake, hint);
391 CSI_UNLOCK(csi);
392 }
393
394 void
395 csi_selwakeup_all(struct nexus_adapter *na, enum txrx t, boolean_t nodelay,
396 boolean_t within_kevent, boolean_t selwake, uint32_t hint)
397 {
398 struct ch_selinfo *csi = &na->na_si[t];
399
400 CSI_LOCK(csi);
401 SK_DF(SK_VERB_EVENTS, "[%s] na \"%s\" (0x%llx) si 0x%llx "
402 "si_flags 0x%x nodelay %u kev %u sel %u hint 0x%b",
403 (t == NR_TX) ? "W" : "R", na->na_name, SK_KVA(na),
404 SK_KVA(&csi->csi_si), csi->csi_si.si_flags, nodelay,
405 within_kevent, selwake, hint, CHAN_FILT_HINT_BITS);
406
407 switch (t) {
408 case NR_RX:
409 if (!(na->na_flags & NAF_RX_MITIGATION)) {
410 nodelay = TRUE;
411 }
412 break;
413
414 case NR_TX:
415 if (!(na->na_flags & NAF_TX_MITIGATION)) {
416 nodelay = TRUE;
417 }
418 break;
419
420 default:
421 nodelay = TRUE;
422 break;
423 }
424 csi_selwakeup_common(csi, nodelay, within_kevent, selwake, hint);
425 CSI_UNLOCK(csi);
426 }
427
428 static boolean_t
429 csi_tcall_start(struct ch_selinfo *csi)
430 {
431 uint64_t now, ival, deadline;
432
433 CSI_LOCK_ASSERT_HELD(csi);
434 ASSERT(csi->csi_flags & CSI_MITIGATION);
435 ASSERT(csi->csi_tcall != NULL);
436
437 /* pick up latest value */
438 ival = csi_tcall_update_interval(csi);
439
440 /* if no mitigation, pass notification up now */
441 if (__improbable(ival == 0)) {
442 return FALSE;
443 }
444
445 deadline = now = mach_absolute_time();
446 clock_deadline_for_periodic_event(ival, now, &deadline);
447 (void) thread_call_enter_delayed(csi->csi_tcall, deadline);
448
449 return TRUE;
450 }
451
452 static void
453 csi_tcall(thread_call_param_t arg0, thread_call_param_t arg1)
454 {
455 #pragma unused(arg1)
456 struct ch_selinfo *csi = (struct ch_selinfo *__single)arg0;
457
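/*
 * First pass: deliver the deferred wakeup. Second pass: if more wakeups
 * arrived meanwhile (csi_pending) and the selinfo is still alive, re-arm
 * the thread call; if re-arming fails, deliver inline.
 */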
458 CSI_LOCK(csi);
459 csi_selwakeup(csi, FALSE, FALSE, 0);
460 CSI_UNLOCK(csi);
461
462 CSI_LOCK(csi);
463 if (__improbable((csi->csi_flags & CSI_DESTROYED) == 0 &&
464 csi->csi_pending != 0 && !csi_tcall_start(csi))) {
465 csi_selwakeup(csi, FALSE, FALSE, 0);
466 }
467 CSI_UNLOCK(csi);
468 }
469
470 __attribute__((always_inline))
471 static inline uint64_t
472 csi_tcall_update_interval(struct ch_selinfo *csi)
473 {
474 uint64_t i = ch_mit_ival;
475
476 /* if global override was adjusted, update local copies */
477 if (__improbable(csi->csi_eff_interval != i)) {
478 ASSERT(csi->csi_flags & CSI_MITIGATION);
479 csi->csi_interval = csi->csi_eff_interval =
480 ((i == 0) ? 0 : MAX(i, CH_MIT_IVAL_MIN));
481 }
482
483 return csi->csi_interval;
484 }
485
486 /* set EV_EOF on the knote and return TRUE if the channel is defunct */
487 static inline boolean_t
488 ch_filt_check_defunct(struct kern_channel *ch, struct knote *kn)
489 {
490 if (__improbable((ch->ch_flags & CHANF_DEFUNCT) != 0)) {
491 if (kn) {
492 kn->kn_flags |= EV_EOF;
493 }
494 return TRUE;
495 }
496 return FALSE;
497 }
498
499 static void
500 filt_chrwdetach(struct knote *kn, boolean_t write)
501 {
502 struct kern_channel *ch = (struct kern_channel *)knote_kn_hook_get_raw(kn);
503 struct ch_selinfo *csi;
504 struct selinfo *si;
505
506 lck_mtx_lock(&ch->ch_lock);
507 csi = ch->ch_si[write ? NR_TX : NR_RX];
508 si = &csi->csi_si;
509
510 CSI_LOCK(csi);
511 SK_DF(SK_VERB_EVENTS, "na \"%s\" (0x%llx) ch 0x%llx kn 0x%llx (%s%s) "
512 "si_flags 0x%x", ch->ch_na->na_name, SK_KVA(ch->ch_na),
513 SK_KVA(ch), SK_KVA(kn), (kn->kn_flags & EV_POLL) ? "poll," : "",
514 write ? "write" : "read", si->si_flags);
515
516 if (KNOTE_DETACH(&si->si_note, kn)) {
517 os_atomic_andnot(&csi->csi_flags, CSI_KNOTE, relaxed);
518 }
519
520 CSI_UNLOCK(csi);
521 lck_mtx_unlock(&ch->ch_lock);
522 }
523
524 static void
525 filt_chrdetach(struct knote *kn)
526 {
527 ASSERT(kn->kn_filter == EVFILT_READ);
528 filt_chrwdetach(kn, FALSE);
529 }
530
531 static void
532 filt_chwdetach(struct knote *kn)
533 {
534 ASSERT(kn->kn_filter == EVFILT_WRITE);
535 filt_chrwdetach(kn, TRUE);
536 }
537
538 /*
539 * Callback from notifications (generated externally).
540 * This always marks the knote as activated, so it always
541 * returns 1.
542 */
543 static int
544 filt_chrw(struct knote *kn, long hint, int events)
545 {
546 #if SK_LOG
547 struct kern_channel *ch = (struct kern_channel *__single)
548 knote_kn_hook_get_raw(kn);
549 #else
550 #pragma unused(kn)
551 #pragma unused(hint)
552 #pragma unused(events)
553 #endif
554 SK_DF(SK_VERB_EVENTS, "na \"%s\" (0x%llx) ch 0x%llx "
555 "kn 0x%llx (%s%s) hint 0x%x", ch->ch_na->na_name,
556 SK_KVA(ch->ch_na), SK_KVA(ch), SK_KVA(kn),
557 (kn->kn_flags & EV_POLL) ? "poll," : "",
558 (events == POLLOUT) ? "write" : "read",
559 (uint32_t)hint);
560
561 /* assume we are ready */
562 return 1;
563 }
564
565 static int
566 filt_chread(struct knote *kn, long hint)
567 {
568 ASSERT(kn->kn_filter == EVFILT_READ);
569 /* There is no hint for read/write event */
570 if (hint != 0) {
571 return 0;
572 }
573 return filt_chrw(kn, hint, POLLIN);
574 }
575
576 static int
577 filt_chwrite(struct knote *kn, long hint)
578 {
579 ASSERT(kn->kn_filter == EVFILT_WRITE);
580 /* There is no hint for read/write event */
581 if (hint != 0) {
582 return 0;
583 }
584 return filt_chrw(kn, hint, POLLOUT);
585 }
586
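/*
 * Common f_touch handler for the read/write filters: saves the new
 * kevent parameters, revalidates any NOTE_LOWAT threshold, then
 * re-polls the channel via ch_event() so the knote reflects the
 * current ring state.
 */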
587 static int
588 filt_chtouch(struct knote *kn, struct kevent_qos_s *kev, int events)
589 {
590 #pragma unused(kev)
591 /*
592 * -fbounds-safety: This seems like an example of interop with code that
593 * has -fbounds-safety disabled, which means we can use __unsafe_forge_*
594 */
595 struct kern_channel *ch = __unsafe_forge_single(struct kern_channel *,
596 knote_kn_hook_get_raw(kn));
597 int ev = kn->kn_filter;
598 enum txrx dir = (ev == EVFILT_WRITE) ? NR_TX : NR_RX;
599 int event_error = 0;
600 int revents;
601
602 /* save off the new input fflags and data */
603 kn->kn_sfflags = kev->fflags;
604 kn->kn_sdata = kev->data;
605
606 lck_mtx_lock(&ch->ch_lock);
607 if (__improbable(ch_filt_check_defunct(ch, kn))) {
608 lck_mtx_unlock(&ch->ch_lock);
609 return 1;
610 }
611
612 /* if a note-specific low watermark is given, validate it */
613 if (kn->kn_sfflags & NOTE_LOWAT) {
614 struct ch_ev_thresh note_thresh = {
615 .cet_unit = (dir == NR_TX) ?
616 ch->ch_info->cinfo_tx_lowat.cet_unit :
617 ch->ch_info->cinfo_rx_lowat.cet_unit,
618 .cet_value = (uint32_t)kn->kn_sdata
619 };
620 if (ch_ev_thresh_validate(ch->ch_na->na_nx, dir,
621 &note_thresh) != 0) {
622 SK_ERR("invalid NOTE_LOWAT threshold %u",
623 note_thresh.cet_value);
624 knote_set_error(kn, EINVAL);
625 lck_mtx_unlock(&ch->ch_lock);
626 return 1;
627 }
628 }
629
630 /* capture new state just so we can return it */
631 revents = ch_event(ch, events, NULL, knote_get_kq(kn)->kq_p, NULL, TRUE,
632 &event_error, FALSE);
633 lck_mtx_unlock(&ch->ch_lock);
634
635 if (revents & POLLERR) {
636 ASSERT(event_error != 0);
637 /*
638 * Setting a knote error here will confuse libdispatch, so we
639 * use EV_EOF instead.
640 */
641 kn->kn_flags |= EV_EOF;
642 return 1;
643 } else {
644 return (events & revents) != 0;
645 }
646 }
647
648 static int
649 filt_chrtouch(struct knote *kn, struct kevent_qos_s *kev)
650 {
651 ASSERT(kn->kn_filter == EVFILT_READ);
652
653 if (kev->flags & EV_ENABLE) {
654 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KNOTE_ENABLE),
655 kn->kn_udata, kn->kn_status | (kn->kn_id << 32),
656 kn->kn_filtid, VM_KERNEL_UNSLIDE_OR_PERM(
657 ((struct kern_channel *)knote_kn_hook_get_raw(kn))->ch_na));
658 }
659
660 return filt_chtouch(kn, kev, POLLIN);
661 }
662
663 static int
664 filt_chwtouch(struct knote *kn, struct kevent_qos_s *kev)
665 {
666 ASSERT(kn->kn_filter == EVFILT_WRITE);
667 return filt_chtouch(kn, kev, POLLOUT);
668 }
669
670
671 /*
672 * Called from kevent. We call ch_event(POLL[IN|OUT]) and
673 * return 0/1 accordingly.
674 */
675 static int
676 filt_chprocess(struct knote *kn, struct kevent_qos_s *kev, int events)
677 {
678 /*
679 * -fbounds-safety: This seems like an example of interop with code that
680 * has -fbounds-safety disabled, which means we can use __unsafe_forge_*
681 */
682 struct kern_channel *ch = __unsafe_forge_single(struct kern_channel *,
683 knote_kn_hook_get_raw(kn));
684 struct ch_event_result result;
685 uint32_t lowat;
686 int trigger_event = 1;
687 int revents;
688 int event_error;
689 int64_t data;
690
691 lck_mtx_lock(&ch->ch_lock);
692 if (__improbable(ch_filt_check_defunct(ch, kn))) {
693 knote_fill_kevent(kn, kev, 0);
694 lck_mtx_unlock(&ch->ch_lock);
695 return 1;
696 }
697
698 revents = ch_event(ch, events, NULL, knote_get_kq(kn)->kq_p, &result,
699 TRUE, &event_error, FALSE);
700
701 if (revents & POLLERR) {
702 ASSERT(event_error != 0);
703 lck_mtx_unlock(&ch->ch_lock);
704 /*
705 * Setting a knote error here will confuse libdispatch, so we
706 * use EV_EOF instead.
707 */
708 kn->kn_flags |= EV_EOF;
709 knote_fill_kevent_with_sdata(kn, kev);
710 return 1;
711 }
712
713 trigger_event = (events & revents) != 0;
714
715 if (events == POLLOUT) {
716 lowat = ch->ch_info->cinfo_tx_lowat.cet_value;
717 if ((kn->kn_sfflags & NOTE_LOWAT) &&
718 kn->kn_sdata > lowat) {
719 lowat = (uint32_t)kn->kn_sdata;
720 }
721
722 data = result.tx_data;
723
724 if (result.tx_data < lowat) {
725 trigger_event = 0;
726 }
727 } else {
728 lowat = ch->ch_info->cinfo_rx_lowat.cet_value;
729 if ((kn->kn_sfflags & NOTE_LOWAT) &&
730 kn->kn_sdata > lowat) {
731 lowat = (uint32_t)kn->kn_sdata;
732 }
733
734 data = result.rx_data;
735
736 if (result.rx_data < lowat) {
737 trigger_event = 0;
738 }
739 }
740
741 if (trigger_event) {
742 knote_fill_kevent(kn, kev, data);
743 }
744
745 lck_mtx_unlock(&ch->ch_lock);
746
747 return trigger_event;
748 }
749
750 static int
751 filt_chrprocess(struct knote *kn, struct kevent_qos_s *kev)
752 {
753 ASSERT(kn->kn_filter == EVFILT_READ);
754 return filt_chprocess(kn, kev, POLLIN);
755 }
756
757 static int
758 filt_chwprocess(struct knote *kn, struct kevent_qos_s *kev)
759 {
760 ASSERT(kn->kn_filter == EVFILT_WRITE);
761 return filt_chprocess(kn, kev, POLLOUT);
762 }
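
/*
 * Usage sketch (userspace, illustrative only; ch_fd and kq are
 * hypothetical): watch a channel for writability with a custom low
 * watermark of 32 slots:
 *
 *	struct kevent kev;
 *	EV_SET(&kev, ch_fd, EVFILT_WRITE, EV_ADD | EV_ENABLE,
 *	    NOTE_LOWAT, 32, NULL);
 *	(void) kevent(kq, &kev, 1, NULL, 0, NULL);
 *
 * filt_chwprocess() then suppresses delivery until at least 32 slots
 * (or cinfo_tx_lowat, whichever is larger) are ready.
 */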
763
764 static int
765 filt_chrwattach(struct knote *kn, __unused struct kevent_qos_s *kev)
766 {
767 struct kern_channel *ch = (struct kern_channel *)knote_kn_hook_get_raw(kn);
768 struct nexus_adapter *na;
769 struct ch_selinfo *csi;
770 int ev = kn->kn_filter;
771 enum txrx dir = (ev == EVFILT_WRITE) ? NR_TX : NR_RX;
772 int revents;
773 int events;
774 int event_error = 0;
775
776 ASSERT((kn->kn_filter == EVFILT_READ) ||
777 (kn->kn_filter == EVFILT_WRITE));
778
779 /* ch_kqfilter() should have acquired the lock */
780 LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
781
782 na = ch->ch_na;
783 /* if a note-specific low watermark is given, validate it */
784 if (kn->kn_sfflags & NOTE_LOWAT) {
785 struct ch_ev_thresh note_thresh = {
786 .cet_unit = (dir == NR_TX) ?
787 ch->ch_info->cinfo_tx_lowat.cet_unit :
788 ch->ch_info->cinfo_rx_lowat.cet_unit,
789 .cet_value = (uint32_t)kn->kn_sdata
790 };
791 if (ch_ev_thresh_validate(ch->ch_na->na_nx, dir,
792 &note_thresh) != 0) {
793 SK_ERR("invalid NOTE_LOWAT threshold %u",
794 note_thresh.cet_value);
795 knote_set_error(kn, EINVAL);
796 return 0;
797 }
798 }
799
800 /* the si is indicated in the channel */
801 csi = ch->ch_si[dir];
802 CSI_LOCK(csi);
803
804 if (KNOTE_ATTACH(&csi->csi_si.si_note, kn)) {
805 os_atomic_or(&csi->csi_flags, CSI_KNOTE, relaxed);
806 }
807
808 CSI_UNLOCK(csi);
809
810 SK_DF(SK_VERB_EVENTS, "na \"%s\" (0x%llx) ch 0x%llx kn 0x%llx (%s%s)",
811 na->na_name, SK_KVA(na), SK_KVA(ch), SK_KVA(kn),
812 (kn->kn_flags & EV_POLL) ? "poll," : "",
813 (ev == EVFILT_WRITE) ? "write" : "read");
814
815 /* capture current state */
816 events = (ev == EVFILT_WRITE) ? POLLOUT : POLLIN;
817
818 if (__improbable(ch_filt_check_defunct(ch, kn))) {
819 revents = events;
820 } else {
821 /* filt_chprocess() will fill in the kn_sdata field */
822 revents = ch_event(ch, events, NULL, knote_get_kq(kn)->kq_p,
823 NULL, TRUE, &event_error, FALSE);
824 }
825
826 if (revents & POLLERR) {
827 ASSERT(event_error != 0);
828 kn->kn_flags |= EV_EOF;
829 return 1;
830 } else {
831 return (events & revents) != 0;
832 }
833 }
834
835 static int
836 filt_chan_extended_common(struct knote *kn, long ev_hint)
837 {
838 /*
839 * This function is not always called with the same set of locks held,
840 * hence it is only allowed to manipulate kn_fflags, with atomics.
841 *
842 * the f_event / f_process functions may run concurrently.
843 */
844 uint32_t add_fflags = 0;
845
846 if ((ev_hint & CHAN_FILT_HINT_FLOW_ADV_UPD) != 0) {
847 add_fflags |= NOTE_FLOW_ADV_UPDATE;
848 }
849 if ((ev_hint & CHAN_FILT_HINT_CHANNEL_EVENT) != 0) {
850 add_fflags |= NOTE_CHANNEL_EVENT;
851 }
852 if ((ev_hint & CHAN_FILT_HINT_IF_ADV_UPD) != 0) {
853 add_fflags |= NOTE_IF_ADV_UPD;
854 }
855 if (add_fflags) {
856 /* Keep only the events requested on this knote */
857 add_fflags &= (kn->kn_sfflags & EVFILT_NW_CHANNEL_ALL_MASK);
858 os_atomic_or(&kn->kn_fflags, add_fflags, relaxed);
859 return add_fflags != 0;
860 }
861 return os_atomic_load(&kn->kn_fflags, relaxed) != 0;
862 }
863
864 static inline void
865 che_process_channel_event(struct kern_channel *ch, struct knote *kn,
866 uint32_t fflags, long *hint)
867 {
868 int revents, event_error = 0;
869
870 LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
871 *hint &= ~CHAN_FILT_HINT_CHANNEL_EVENT;
872
873 if (((ch->ch_flags & CHANF_EVENT_RING) != 0) &&
874 ((fflags & NOTE_CHANNEL_EVENT) != 0)) {
875 /* capture new state to return */
876 revents = ch_event(ch, POLLIN, NULL, knote_get_kq(kn)->kq_p,
877 NULL, TRUE, &event_error, TRUE);
878 if (revents & POLLERR) {
879 ASSERT(event_error != 0);
880 /*
881 * Setting a knote error here will confuse libdispatch,
882 * so we use EV_EOF instead.
883 */
884 kn->kn_flags |= EV_EOF;
885 } else if ((revents & POLLIN) != 0) {
886 *hint |= CHAN_FILT_HINT_CHANNEL_EVENT;
887 }
888 }
889 /*
890 * If the sync operation on the event ring didn't find any events,
891 * indicate that the channel event is not active.
892 */
893 if ((*hint & CHAN_FILT_HINT_CHANNEL_EVENT) == 0) {
894 /*
895 * Avoid a costly atomic when the bit is already cleared.
896 */
897 uint32_t knfflags = os_atomic_load(&kn->kn_fflags, relaxed);
898 if (knfflags & CHAN_FILT_HINT_CHANNEL_EVENT) {
899 os_atomic_andnot(&kn->kn_fflags,
900 CHAN_FILT_HINT_CHANNEL_EVENT, relaxed);
901 }
902 }
903 }
904
905 static int
906 filt_che_attach(struct knote *kn, __unused struct kevent_qos_s *kev)
907 {
908 struct kern_channel *ch = (struct kern_channel *)knote_kn_hook_get_raw(kn);
909 struct ch_selinfo *csi;
910 long hint = 0;
911
912 _CASSERT(CHAN_FILT_HINT_FLOW_ADV_UPD == NOTE_FLOW_ADV_UPDATE);
913 _CASSERT(CHAN_FILT_HINT_CHANNEL_EVENT == NOTE_CHANNEL_EVENT);
914 _CASSERT(CHAN_FILT_HINT_IF_ADV_UPD == NOTE_IF_ADV_UPD);
915
916 ASSERT(kn->kn_filter == EVFILT_NW_CHANNEL);
917
918 /* ch_kqfilter() should have acquired the lock */
919 LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
920
921 csi = ch->ch_si[NR_TX];
922 CSI_LOCK(csi);
923 if (KNOTE_ATTACH(&csi->csi_si.si_note, kn)) {
924 os_atomic_or(&csi->csi_flags, CSI_KNOTE, relaxed);
925 }
926 CSI_UNLOCK(csi);
927
928 if (__improbable(ch_filt_check_defunct(ch, kn))) {
929 return 1;
930 }
931 if ((kn->kn_sfflags & NOTE_CHANNEL_EVENT) != 0) {
932 os_atomic_or(&ch->ch_na->na_flags, NAF_CHANNEL_EVENT_ATTACHED, relaxed);
933 }
934 che_process_channel_event(ch, kn, kn->kn_sfflags, &hint);
935 if ((kn->kn_sfflags & NOTE_FLOW_ADV_UPDATE) != 0) {
936 /* on registration force an event */
937 hint |= CHAN_FILT_HINT_FLOW_ADV_UPD;
938 }
939 SK_DF(SK_VERB_EVENTS, "na \"%s\" (0x%llx) ch 0x%llx kn 0x%llx (%s)",
940 ch->ch_na->na_name, SK_KVA(ch->ch_na), SK_KVA(ch), SK_KVA(kn),
941 "EVFILT_NW_CHANNEL");
942 return filt_chan_extended_common(kn, hint);
943 }
944
945 static void
946 filt_che_detach(struct knote *kn)
947 {
948 struct kern_channel *ch = (struct kern_channel *)knote_kn_hook_get_raw(kn);
949 struct ch_selinfo *csi;
950
951 ASSERT(kn->kn_filter == EVFILT_NW_CHANNEL);
952
953 lck_mtx_lock(&ch->ch_lock);
954 if ((kn->kn_sfflags & NOTE_CHANNEL_EVENT) != 0) {
955 os_atomic_andnot(&ch->ch_na->na_flags,
956 NAF_CHANNEL_EVENT_ATTACHED, relaxed);
957 }
958 csi = ch->ch_si[NR_TX];
959 CSI_LOCK(csi);
960 if (KNOTE_DETACH(&csi->csi_si.si_note, kn)) {
961 os_atomic_andnot(&csi->csi_flags, CSI_KNOTE, relaxed);
962 }
963 CSI_UNLOCK(csi);
964 lck_mtx_unlock(&ch->ch_lock);
965
966 SK_DF(SK_VERB_EVENTS, "na \"%s\" (0x%llx) ch 0x%llx kn 0x%llx (%s)",
967 ch->ch_na->na_name, SK_KVA(ch->ch_na), SK_KVA(ch), SK_KVA(kn),
968 "EVFILT_NW_CHANNEL");
969 }
970
971 static int
972 filt_che_event(struct knote *kn, long hint)
973 {
974 struct kern_channel *ch = (struct kern_channel *)knote_kn_hook_get_raw(kn);
975
976 ASSERT(kn->kn_filter == EVFILT_NW_CHANNEL);
977 if (hint == 0) {
978 return 0;
979 }
980 if (__improbable(ch_filt_check_defunct(ch, NULL))) {
981 return 1;
982 }
983 if ((hint & CHAN_FILT_HINT_CHANNEL_EVENT) != 0) {
984 VERIFY((ch->ch_flags & CHANF_EVENT_RING) != 0);
985 }
986 SK_DF(SK_VERB_EVENTS, "na \"%s\" (0x%llx) ch 0x%llx hint 0x%b)",
987 ch->ch_na->na_name, SK_KVA(ch->ch_na), SK_KVA(ch), hint,
988 CHAN_FILT_HINT_BITS);
989 return filt_chan_extended_common(kn, hint);
990 }
991
992 static int
993 filt_che_touch(struct knote *kn, struct kevent_qos_s *kev)
994 {
995 int ret;
996 long hint = 0;
997 struct kern_channel *ch = (struct kern_channel *)knote_kn_hook_get_raw(kn);
998
999 ASSERT(kn->kn_filter == EVFILT_NW_CHANNEL);
1000 /* save off the new input fflags and data */
1001 kn->kn_sfflags = kev->fflags;
1002 kn->kn_sdata = kev->data;
1003
1004 lck_mtx_lock(&ch->ch_lock);
1005 if (__improbable(ch_filt_check_defunct(ch, kn))) {
1006 ret = 1;
1007 goto done;
1008 }
1009 if ((kn->kn_sfflags & NOTE_CHANNEL_EVENT) != 0) {
1010 if (kev->flags & EV_ENABLE) {
1011 os_atomic_or(&ch->ch_na->na_flags,
1012 NAF_CHANNEL_EVENT_ATTACHED, relaxed);
1013 } else if (kev->flags & EV_DISABLE) {
1014 os_atomic_andnot(&ch->ch_na->na_flags,
1015 NAF_CHANNEL_EVENT_ATTACHED, relaxed);
1016 }
1017 }
1018 che_process_channel_event(ch, kn, kn->kn_sfflags, &hint);
1019 ret = filt_chan_extended_common(kn, hint);
1020 done:
1021 lck_mtx_unlock(&ch->ch_lock);
1022 return ret;
1023 }
1024
1025 static int
1026 filt_che_process(struct knote *kn, struct kevent_qos_s *kev)
1027 {
1028 int ret;
1029 long hint = 0;
1030
1031 /*
1032 * -fbounds-safety: This seems like an example of interop with code that
1033 * has -fbounds-safety disabled, which means we can use __unsafe_forge_*
1034 */
1035 struct kern_channel *ch = __unsafe_forge_single(struct kern_channel *,
1036 knote_kn_hook_get_raw(kn));
1037
1038 ASSERT(kn->kn_filter == EVFILT_NW_CHANNEL);
1039 lck_mtx_lock(&ch->ch_lock);
1040 if (__improbable(ch_filt_check_defunct(ch, kn))) {
1041 ret = 1;
1042 goto done;
1043 }
1044 che_process_channel_event(ch, kn, kn->kn_sfflags, &hint);
1045 ret = filt_chan_extended_common(kn, hint);
1046 done:
1047 lck_mtx_unlock(&ch->ch_lock);
1048 if (ret != 0) {
1049 /*
1050 * This filter historically behaves like EV_CLEAR,
1051 * even when EV_CLEAR wasn't set.
1052 */
1053 knote_fill_kevent(kn, kev, 0);
1054 kn->kn_fflags = 0;
1055 }
1056 return ret;
1057 }
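
/*
 * Usage sketch (userspace, illustrative only; ch_fd is hypothetical):
 * register for the extended channel events served by this filter:
 *
 *	struct kevent kev;
 *	EV_SET(&kev, ch_fd, EVFILT_NW_CHANNEL, EV_ADD | EV_ENABLE,
 *	    NOTE_CHANNEL_EVENT | NOTE_FLOW_ADV_UPDATE, 0, NULL);
 *
 * Only the NOTE_* bits requested at registration are reported back,
 * since filt_chan_extended_common() masks kn_fflags accordingly.
 */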
1058
1059 int
1060 ch_kqfilter(struct kern_channel *ch, struct knote *kn,
1061 struct kevent_qos_s *kev)
1062 {
1063 int result;
1064
1065 lck_mtx_lock(&ch->ch_lock);
1066 VERIFY(!(ch->ch_flags & CHANF_KERNEL));
1067
1068 if (__improbable(ch->ch_na == NULL || !NA_IS_ACTIVE(ch->ch_na) ||
1069 na_reject_channel(ch, ch->ch_na))) {
1070 SK_ERR("%s(%d): channel is non-permissive, flags 0x%b", ch->ch_name,
1071 ch->ch_pid, ch->ch_flags, CHANF_BITS);
1072 knote_set_error(kn, ENXIO);
1073 lck_mtx_unlock(&ch->ch_lock);
1074 return 0;
1075 }
1076
1077 switch (kn->kn_filter) {
1078 case EVFILT_READ:
1079 kn->kn_filtid = EVFILTID_SKYWALK_CHANNEL_R;
1080 break;
1081
1082 case EVFILT_WRITE:
1083 kn->kn_filtid = EVFILTID_SKYWALK_CHANNEL_W;
1084 break;
1085
1086 case EVFILT_NW_CHANNEL:
1087 kn->kn_filtid = EVFILTID_SKYWALK_CHANNEL_E;
1088 break;
1089
1090 default:
1091 lck_mtx_unlock(&ch->ch_lock);
1092 SK_ERR("%s(%d): bad filter request %d", ch->ch_name,
1093 ch->ch_pid, kn->kn_filter);
1094 knote_set_error(kn, EINVAL);
1095 return 0;
1096 }
1097
1098 knote_kn_hook_set_raw(kn, ch);
1099 /* call the appropriate sub-filter attach with the channel lock held */
1100 result = knote_fops(kn)->f_attach(kn, kev);
1101 lck_mtx_unlock(&ch->ch_lock);
1102 return result;
1103 }
1104
1105 boolean_t
1106 ch_is_multiplex(struct kern_channel *ch, enum txrx t)
1107 {
1108 return ch->ch_na != NULL && (ch->ch_last[t] - ch->ch_first[t] > 1);
1109 }
1110
1111 int
1112 ch_select(struct kern_channel *ch, int events, void *wql, struct proc *p)
1113 {
1114 int revents;
1115 int event_error = 0;
1116
1117 lck_mtx_lock(&ch->ch_lock);
1118 revents = ch_event(ch, events, wql, p, NULL, FALSE, &event_error,
1119 FALSE);
1120 lck_mtx_unlock(&ch->ch_lock);
1121
1122 ASSERT((revents & POLLERR) == 0 || event_error != 0);
1123
1124 return revents;
1125 }
1126
1127 #if SK_LOG
1128 /* Hoisted out of line to reduce kernel stack footprint */
1129 SK_LOG_ATTRIBUTE
1130 static void
1131 ch_event_log(const char *prefix, const struct kern_channel *ch,
1132 struct proc *p, const struct nexus_adapter *na,
1133 int events, int revents)
1134 {
1135 SK_DF(SK_VERB_EVENTS, "%s: na \"%s\" (0x%llx) ch 0x%llx %s(%d) "
1136 "th 0x%llx ev 0x%x rev 0x%x", prefix, na->na_name, SK_KVA(na),
1137 SK_KVA(ch), sk_proc_name_address(p), sk_proc_pid(p),
1138 SK_KVA(current_thread()), events, revents);
1139 }
1140 #endif /* SK_LOG */
1141
1142 /*
1143 * select(2), poll(2) and kevent(2) handlers for channels.
1144 *
1145 * Can be called for one or more rings. Returns the event mask
1146 * corresponding to ready events. If there are no ready events, do
1147 * a selrecord on either individual selinfo or on the global one.
1148 * Device-dependent parts (locking and sync of tx/rx rings)
1149 * are done through callbacks.
1150 */
1151 static int
1152 ch_event(struct kern_channel *ch, int events, void *wql,
1153 struct proc *p, struct ch_event_result *result,
1154 const boolean_t is_kevent, int *errno, const boolean_t is_ch_event)
1155 {
1156 struct nexus_adapter *na;
1157 struct __kern_channel_ring *kring;
1158 uint32_t i, check_all_tx, check_all_rx, want[NR_TXRX], revents = 0;
1159 uint32_t ready_tx_data = 0, ready_rx_data = 0;
1160 sk_protect_t protect = NULL;
1161
1162 #define want_tx want[NR_TX]
1163 #define want_rx want[NR_RX]
1164 /*
1165 * In order to avoid nested locks, we need to "double check"
1166 * txsync and rxsync if we decide to do a selrecord().
1167 * retry_tx (and retry_rx, later) prevent looping forever.
1168 */
1169 boolean_t retry_tx = TRUE, retry_rx = TRUE;
1170 int found, error = 0;
1171 int s;
1172
1173 net_update_uptime();
1174
1175 LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
1176 ASSERT(!(ch->ch_flags & CHANF_KERNEL));
1177
1178 *errno = 0;
1179
1180 if (__improbable((ch->ch_flags & CHANF_DEFUNCT) ||
1181 ch->ch_schema == NULL)) {
1182 SK_ERR("%s(%d): channel is defunct or no longer bound",
1183 ch->ch_name, ch->ch_pid);
1184 revents = POLLERR;
1185 *errno = ENXIO;
1186 goto done;
1187 }
1188
1189 /* clear CHANF_DEFUNCT_SKIP if it was set during defunct last time */
1190 if (__improbable(ch->ch_flags & CHANF_DEFUNCT_SKIP)) {
1191 os_atomic_andnot(&ch->ch_flags, CHANF_DEFUNCT_SKIP, relaxed);
1192 }
1193
1194 na = ch->ch_na;
1195 if (__improbable(na == NULL ||
1196 !NA_IS_ACTIVE(na) || na_reject_channel(ch, na))) {
1197 SK_ERR("%s(%d): channel is non-permissive",
1198 ch->ch_name, ch->ch_pid);
1199 revents = POLLERR;
1200 *errno = ENXIO;
1201 goto done;
1202 }
1203
1204 /* mark thread with sync-in-progress flag */
1205 protect = sk_sync_protect();
1206
1207 /* update our work timestamp */
1208 na->na_work_ts = _net_uptime;
1209
1210 /* and make this channel eligible for draining again */
1211 if (na->na_flags & NAF_DRAINING) {
1212 os_atomic_andnot(&na->na_flags, NAF_DRAINING, relaxed);
1213 }
1214
1215 #if SK_LOG
1216 if (__improbable((sk_verbose & SK_VERB_EVENTS) != 0)) {
1217 ch_event_log("enter", ch, p, na, events, revents);
1218 }
1219 #endif
1220 if (is_ch_event) {
1221 goto process_channel_event;
1222 }
1223
1224 want_tx = (events & (POLLOUT | POLLWRNORM));
1225 want_rx = (events & (POLLIN | POLLRDNORM));
1226
1227 /*
1228 * check_all_{tx|rx} are set if the channel has more than one ring
1229 * AND the file descriptor is bound to all of them. If so, we sleep
1230 * on the "global" selinfo; otherwise we sleep on the individual selinfo.
1231 * The interrupt routine in the driver wakes one or the other (or both)
1232 * depending on which clients are active.
1233 *
1234 * rxsync() is only called if we run out of buffers on a POLLIN.
1235 * txsync() is called if we run out of buffers on POLLOUT.
1236 */
1237 check_all_tx = ch_is_multiplex(ch, NR_TX);
1238 check_all_rx = ch_is_multiplex(ch, NR_RX);
1239
1240 /*
1241 * If want_tx is still set, we must issue txsync calls
1242 * (on all rings, so that the tx rings do not stall).
1243 * XXX should also check head != khead on the tx rings.
1244 */
1245 if (want_tx) {
1246 ring_id_t first_tx = ch->ch_first[NR_TX];
1247 ring_id_t last_tx = ch->ch_last[NR_TX];
1248
1249 channel_threshold_unit_t tx_unit =
1250 ch->ch_info->cinfo_tx_lowat.cet_unit;
1251
1252 /*
1253 * The first round checks if anyone is ready, if not
1254 * do a selrecord and another round to handle races.
1255 * want_tx goes to 0 if any space is found, and is
1256 * used to skip rings with no pending transmissions.
1257 */
1258 flush_tx:
1259 for (i = first_tx, ready_tx_data = 0; i < last_tx; i++) {
1260 kring = &na->na_tx_rings[i];
1261 if (!want_tx &&
1262 kring->ckr_ring->ring_head == kring->ckr_khead) {
1263 continue;
1264 }
1265
1266 /* only one thread does txsync */
1267 s = kr_enter(kring, TRUE);
1268 ASSERT(s == 0);
1269
1270 error = 0;
1271 DTRACE_SKYWALK2(pretxprologue, struct kern_channel *,
1272 ch, struct __kern_channel_ring *, kring);
1273 if (kr_txsync_prologue(ch, kring, p) >=
1274 kring->ckr_num_slots) {
1275 kr_log_bad_ring(kring);
1276 revents |= POLLERR;
1277 error = EFAULT;
1278 if (*errno == 0) {
1279 *errno = EFAULT;
1280 }
1281 } else {
1282 if (kring->ckr_na_sync(kring, p, 0)) {
1283 revents |= POLLERR;
1284 error = EIO;
1285 if (*errno == 0) {
1286 *errno = EIO;
1287 }
1288 } else {
1289 kr_txsync_finalize(ch, kring, p);
1290 }
1291 }
1292 DTRACE_SKYWALK3(posttxfinalize, struct kern_channel *,
1293 ch, struct __kern_channel_ring *, kring, int,
1294 error);
1295
1296 /*
1297 * If we found new slots, notify potential listeners on
1298 * the same ring. Since we just did a txsync, look at
1299 * the copies of cur,tail in the kring.
1300 */
1301 found = kring->ckr_rhead != kring->ckr_rtail;
1302 kr_exit(kring);
1303 if (found) { /* notify other listeners */
1304 revents |= want_tx;
1305 want_tx = 0;
1306 (void) kring->ckr_na_notify(kring, p,
1307 (is_kevent ? NA_NOTEF_IN_KEVENT : 0));
1308 }
1309
1310 /*
1311 * Add this ring's free data to our running
1312 * tally for userspace.
1313 */
1314 if (result != NULL) {
1315 switch (tx_unit) {
1316 case CHANNEL_THRESHOLD_UNIT_BYTES:
1317 ready_tx_data += kring->ckr_ready_bytes;
1318 break;
1319 case CHANNEL_THRESHOLD_UNIT_SLOTS:
1320 ready_tx_data += kring->ckr_ready_slots;
1321 break;
1322 }
1323 }
1324 }
1325 if (want_tx && retry_tx && !is_kevent) {
1326 if (check_all_tx) {
1327 csi_selrecord_all(na, NR_TX, p, wql);
1328 } else {
1329 csi_selrecord_one(&na->na_tx_rings[first_tx],
1330 p, wql);
1331 }
1332 retry_tx = FALSE;
1333 goto flush_tx;
1334 }
1335 }
1336
1337 /*
1338 * If want_rx is still set scan receive rings.
1339 * Do it on all rings because otherwise we starve.
1340 */
1341 if (want_rx) {
1342 ring_id_t first_rx = ch->ch_first[NR_RX];
1343 ring_id_t last_rx = ch->ch_last[NR_RX];
1344 channel_threshold_unit_t rx_unit =
1345 ch->ch_info->cinfo_rx_lowat.cet_unit;
1346
1347 /* two rounds here for race avoidance */
1348 do_retry_rx:
1349 for (i = first_rx, ready_rx_data = 0; i < last_rx; i++) {
1350 kring = &na->na_rx_rings[i];
1351
1352 /* only one thread does rxsync */
1353 s = kr_enter(kring, TRUE);
1354 ASSERT(s == 0);
1355
1356 error = 0;
1357 DTRACE_SKYWALK2(prerxprologue, struct kern_channel *,
1358 ch, struct __kern_channel_ring *, kring);
1359 if (kr_rxsync_prologue(ch, kring, p) >=
1360 kring->ckr_num_slots) {
1361 kr_log_bad_ring(kring);
1362 revents |= POLLERR;
1363 error = EFAULT;
1364 if (*errno == 0) {
1365 *errno = EFAULT;
1366 }
1367 } else {
1368 /* now we can use kring->rhead, rtail */
1369 if (kring->ckr_na_sync(kring, p, 0)) {
1370 revents |= POLLERR;
1371 error = EIO;
1372 if (*errno == 0) {
1373 *errno = EIO;
1374 }
1375 } else {
1376 kr_rxsync_finalize(ch, kring, p);
1377 }
1378 }
1379
1380 DTRACE_SKYWALK3(postrxfinalize, struct kern_channel *,
1381 ch, struct __kern_channel_ring *, kring, int,
1382 error);
1383
1384 found = kring->ckr_rhead != kring->ckr_rtail;
1385 kr_exit(kring);
1386 if (found) {
1387 revents |= want_rx;
1388 retry_rx = FALSE;
1389 (void) kring->ckr_na_notify(kring, p,
1390 (is_kevent ? NA_NOTEF_IN_KEVENT : 0));
1391 }
1392
1393 /*
1394 * Add this ring's readable data to our running
1395 * tally for userspace.
1396 */
1397 if (result != NULL) {
1398 switch (rx_unit) {
1399 case CHANNEL_THRESHOLD_UNIT_BYTES:
1400 ready_rx_data += kring->ckr_ready_bytes;
1401 break;
1402 case CHANNEL_THRESHOLD_UNIT_SLOTS:
1403 ready_rx_data += kring->ckr_ready_slots;
1404 break;
1405 }
1406 }
1407 }
1408
1409 if (retry_rx && !is_kevent) {
1410 if (check_all_rx) {
1411 csi_selrecord_all(na, NR_RX, p, wql);
1412 } else {
1413 csi_selrecord_one(&na->na_rx_rings[first_rx],
1414 p, wql);
1415 }
1416 }
1417 if (retry_rx) {
1418 retry_rx = FALSE;
1419 goto do_retry_rx;
1420 }
1421 }
1422
1423 if (result != NULL) {
1424 result->tx_data = ready_tx_data;
1425 result->rx_data = ready_rx_data;
1426 }
1427 goto skip_channel_event;
1428
1429 process_channel_event:
1430 /*
1431 * perform sync operation on the event ring to make the channel
1432 * events enqueued in the ring visible to user-space.
1433 */
1434
1435 /* select() and poll() not supported for event ring */
1436 ASSERT(is_kevent);
1437 VERIFY((ch->ch_last[NR_EV] - ch->ch_first[NR_EV]) == 1);
1438 kring = &na->na_event_rings[ch->ch_first[NR_EV]];
1439
1440 /* only one thread does the sync */
1441 s = kr_enter(kring, TRUE);
1442 ASSERT(s == 0);
1443 if (kr_event_sync_prologue(kring, p) >= kring->ckr_num_slots) {
1444 kr_log_bad_ring(kring);
1445 revents |= POLLERR;
1446 if (*errno == 0) {
1447 *errno = EFAULT;
1448 }
1449 } else {
1450 if (kring->ckr_na_sync(kring, p, 0)) {
1451 revents |= POLLERR;
1452 if (*errno == 0) {
1453 *errno = EIO;
1454 }
1455 } else {
1456 kr_event_sync_finalize(ch, kring, p);
1457 }
1458 }
1459 found = (kring->ckr_rhead != kring->ckr_rtail);
1460 kr_exit(kring);
1461 if (found) {
1462 revents |= (events & POLLIN);
1463 }
1464
1465 skip_channel_event:
1466 #if SK_LOG
1467 if (__improbable((sk_verbose & SK_VERB_EVENTS) != 0)) {
1468 ch_event_log("exit", ch, p, na, events, revents);
1469 }
1470 #endif /* SK_LOG */
1471
1472 /* unmark thread with sync-in-progress flag */
1473 sk_sync_unprotect(protect);
1474
1475 done:
1476 ASSERT(!sk_is_sync_protected());
1477
1478 return revents;
1479 #undef want_tx
1480 #undef want_rx
1481 }
1482
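/*
 * Find an existing owning (non-monitor) channel on the nexus matching
 * the {port, ring_id} tuple; returns it with a reference held, or NULL.
 * ch_open() uses this to enforce single ownership of a tuple and to
 * locate the channel that a new monitor should observe.
 */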
1483 static struct kern_channel *
1484 ch_find(struct kern_nexus *nx, nexus_port_t port, ring_id_t ring_id)
1485 {
1486 struct kern_channel *ch;
1487
1488 SK_LOCK_ASSERT_HELD();
1489
1490 STAILQ_FOREACH(ch, &nx->nx_ch_head, ch_link) {
1491 struct ch_info *cinfo = ch->ch_info;
1492
1493 /* see comments in ch_open() */
1494 if (cinfo->cinfo_nx_port != port) {
1495 continue;
1496 } else if (cinfo->cinfo_ch_mode & CHMODE_MONITOR) {
1497 continue;
1498 } else if (cinfo->cinfo_ch_ring_id != CHANNEL_RING_ID_ANY &&
1499 ring_id != cinfo->cinfo_ch_ring_id &&
1500 ring_id != CHANNEL_RING_ID_ANY) {
1501 continue;
1502 }
1503
1504 /* found a match */
1505 break;
1506 }
1507
1508 if (ch != NULL) {
1509 ch_retain_locked(ch);
1510 }
1511
1512 return ch;
1513 }
1514
1515 #if SK_LOG
1516 /* Hoisted out of line to reduce kernel stack footprint */
1517 SK_LOG_ATTRIBUTE
1518 static void
1519 ch_open_log1(const uuid_t p_uuid, struct proc *p, nexus_port_t port)
1520 {
1521 uuid_string_t uuidstr;
1522
1523 SK_D("%s(%d) uniqueid %llu exec_uuid %s port %u",
1524 sk_proc_name_address(p), sk_proc_pid(p), proc_uniqueid(p),
1525 sk_uuid_unparse(p_uuid, uuidstr), port);
1526 }
1527
1528 SK_LOG_ATTRIBUTE
1529 static void
1530 ch_open_log2(struct proc *p, nexus_port_t port, ring_id_t ring,
1531 uint32_t mode, const char *mode_bits, int err)
1532 {
1533 SK_D("%s(%d) port %u ring %d mode 0x%b err %d",
1534 sk_proc_name_address(p), sk_proc_pid(p), port, (int)ring,
1535 mode, mode_bits, err);
1536 }
1537 #endif /* SK_LOG */
1538
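/*
 * Open a user channel to the nexus named in "init": validates
 * privileges, port range and {port, ring_id} ownership, prepares an
 * nxbind for bound ports, then connects. Returns a channel with a
 * reference held, or NULL with *err set.
 */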
1539 struct kern_channel *
1540 ch_open(struct ch_init *init, struct proc *p, int fd, int *err)
1541 {
1542 uint32_t mode = init->ci_ch_mode;
1543 nexus_port_t port = init->ci_nx_port;
1544 ring_id_t ring = init->ci_ch_ring_id;
1545 struct kern_channel *ch = NULL, *ch0 = NULL;
1546 struct nxbind *nxb = NULL;
1547 struct kern_nexus *nx;
1548 struct chreq chr;
1549 uuid_t p_uuid;
1550 kauth_cred_t cred;
1551
1552 cred = kauth_cred_get();
1553 ASSERT(!uuid_is_null(init->ci_nx_uuid));
1554 proc_getexecutableuuid(p, p_uuid, sizeof(p_uuid));
1555 *err = 0;
1556
1557 /* make sure we don't allow userland to set kernel-only flags */
1558 mode &= CHMODE_MASK;
1559
1560 SK_LOCK();
1561
1562 nx = nx_find(init->ci_nx_uuid, TRUE);
1563 if (nx == NULL) {
1564 *err = ENOENT;
1565 goto done;
1566 }
1567 if ((nx->nx_flags & NXF_INVALIDATED) != 0) {
1568 *err = EBUSY;
1569 goto done;
1570 }
1571
1572 /* port (zero-based) must be within the domain's range */
1573 if (port >= NXDOM_MAX(NX_DOM(nx), ports)) {
1574 *err = EDOM;
1575 goto done;
1576 }
1577 VERIFY(port != NEXUS_PORT_ANY);
1578
1579 if (mode & CHMODE_LOW_LATENCY) {
1580 if ((*err = skywalk_priv_check_cred(p, cred,
1581 PRIV_SKYWALK_LOW_LATENCY_CHANNEL)) != 0) {
1582 goto done;
1583 }
1584 }
1585
1586 /* "no copy" is valid only when at least one tx/rx mon flag is set */
1587 if (!(mode & CHMODE_MONITOR) && (mode & CHMODE_MONITOR_NO_COPY)) {
1588 mode &= ~CHMODE_MONITOR_NO_COPY;
1589 }
1590
1591 if (mode & CHMODE_MONITOR) {
1592 if ((*err = skywalk_priv_check_cred(p, cred,
1593 PRIV_SKYWALK_OBSERVE_ALL)) != 0) {
1594 goto done;
1595 }
1596 /* Don't allow non-root processes to monitor channels. */
1597 if (kauth_cred_issuser(cred) == 0) {
1598 *err = EPERM;
1599 goto done;
1600 }
1601 }
1602
1603 /*
1604 * Check with the nexus to see if the port is bound; if so, prepare
1605 * our nxbind structure that we'll need to pass down to the nexus
1606 * for it compare. If the caller provides a key, we take it over
1607 * and will free it ourselves (as part of freeing nxbind.)
1608 *
1609 * If this is a monitor channel, skip this altogether since the check
1610 * for PRIV_SKYWALK_OBSERVE_ALL privilege has been done above.
1611 */
1612 if (!(mode & CHMODE_MONITOR) && !NX_ANONYMOUS_PROV(nx)) {
1613 /*
1614 * -fbounds-safety: ci_key is user_addr_t (aka uint64_t), so
1615 * can't mark it as __sized_by. Forge it instead.
1616 */
1617 void *key = __unsafe_forge_bidi_indexable(void *, init->ci_key,
1618 init->ci_key_len);
1619
1620 #if SK_LOG
1621 if (__improbable(sk_verbose != 0)) {
1622 ch_open_log1(p_uuid, p, port);
1623 }
1624 #endif /* SK_LOG */
1625
1626 nxb = nxb_alloc(Z_WAITOK);
1627 nxb->nxb_flags |= NXBF_MATCH_UNIQUEID;
1628 nxb->nxb_uniqueid = proc_uniqueid(p);
1629 nxb->nxb_pid = proc_pid(p);
1630 nxb->nxb_flags |= NXBF_MATCH_EXEC_UUID;
1631 uuid_copy(nxb->nxb_exec_uuid, p_uuid);
1632 if (key != NULL) {
1633 nxb->nxb_flags |= NXBF_MATCH_KEY;
1634 nxb->nxb_key_len = init->ci_key_len;
1635 nxb->nxb_key = key;
1636 init->ci_key = USER_ADDR_NULL; /* take over */
1637 }
1638 }
1639
1640 /*
1641 * There can only be one owner of {port,ring_id} tuple. Once
1642 * owned, this can be made available among multiple monitors.
1643 * CHANNEL_RING_ID_ANY (-1) ring_id gives exclusive rights over
1644 * all rings. Further attempts to own any or all of the rings
1645 * will be declined.
1646 *
1647 * Multiple monitors are allowed to exist. If a channel has been
1648 * bound to CHANNEL_RING_ID_ANY, any or all of its rings can be
1649 * monitored. If an owning channel has been bound to an individual
1650 * ring, only that ring can be monitored, either by specifying the
1651 * equivalent ring_id or CHANNEL_RING_ID_ANY at monitor open time.
1652 *
1653 * For example, assuming a 2-rings setup for port 'p':
1654 *
1655 * owner{p,-1}
1656 * will allow:
1657 * monitor{p,-1}, monitor{p,0}, monitor{p,1}
1658 * will not allow:
1659 * owner{p,-1}, owner{p,0}, owner{p,1}
1660 *
1661 * owner{p,0}
1662 * will allow:
1663 * owner{p,1}, monitor{p,-1}, monitor{p,0}
1664 * will not allow:
1665 * owner{p,-1}, owner{p,0}, monitor{p,1}
1666 */
1667 if ((ch0 = ch_find(nx, port, ring)) != NULL) {
1668 SK_D("found ch0 0x%llx", SK_KVA(ch0));
1669 /*
1670 * Unless this is a monitor channel, allow only at
1671 * most one owner of the {port,ring_id} tuple.
1672 */
1673 if (!(mode & CHMODE_MONITOR)) {
1674 #if SK_LOG
1675 uuid_string_t uuidstr;
1676 char *na_name = (ch0->ch_na != NULL) ?
1677 ch0->ch_na->na_name : "";
1678
1679 SK_DSC(p, "ch %s flags (0x%x) exists on port %d on "
1680 "nx %s, owner %s(%d)", na_name, ch0->ch_flags, port,
1681 sk_uuid_unparse(nx->nx_uuid, uuidstr),
1682 ch0->ch_name, ch0->ch_pid);
1683 #endif /* SK_LOG */
1684 *err = EBUSY;
1685 goto done;
1686 }
1687 } else if (mode & CHMODE_MONITOR) {
1688 *err = ENXIO;
1689 goto done;
1690 }
1691
1692 bzero(&chr, sizeof(chr));
1693 chr.cr_tx_lowat = init->ci_tx_lowat;
1694 chr.cr_rx_lowat = init->ci_rx_lowat;
1695 chr.cr_port = port;
1696 chr.cr_mode = mode;
1697 chr.cr_ring_id = ring;
1698
1699 /* upon success, returns a channel with reference held */
1700 ch = ch_connect(nx, &chr, ch0, nxb, p, fd, err);
1701
1702 done:
1703
1704 #if SK_LOG
1705 if (__improbable(sk_verbose != 0)) {
1706 ch_open_log2(p, port, ring, mode, CHMODE_BITS, *err);
1707 }
1708 #endif /* SK_LOG */
1709
1710 if (ch0 != NULL) {
1711 (void) ch_release_locked(ch0);
1712 }
1713
1714 if (nx != NULL) {
1715 (void) nx_release_locked(nx);
1716 }
1717
1718 if (nxb != NULL) {
1719 nxb_free(nxb);
1720 }
1721
1722 SK_UNLOCK();
1723
1724 return ch;
1725 }
1726
1727 struct kern_channel *
1728 ch_open_special(struct kern_nexus *nx, struct chreq *chr, boolean_t nonxref,
1729 int *err)
1730 {
1731 struct kern_channel *ch = NULL;
1732
1733 SK_LOCK_ASSERT_HELD();
1734 if ((nx->nx_flags & NXF_INVALIDATED) != 0) {
1735 *err = EBUSY;
1736 goto done;
1737 }
1738 *err = 0;
1739
1740 ASSERT((chr->cr_mode & CHMODE_USER_PACKET_POOL) == 0);
1741 ASSERT((chr->cr_mode & CHMODE_EVENT_RING) == 0);
1742 ASSERT((chr->cr_mode & CHMODE_LOW_LATENCY) == 0);
1743 ASSERT(!uuid_is_null(chr->cr_spec_uuid));
1744 chr->cr_mode |= CHMODE_KERNEL;
1745 if (nonxref) {
1746 chr->cr_mode |= CHMODE_NO_NXREF;
1747 } else {
1748 chr->cr_mode &= ~CHMODE_NO_NXREF;
1749 }
1750
1751 /* upon success, returns a channel with reference held */
1752 ch = ch_connect(nx, chr, NULL, NULL, kernproc, -1, err);
1753 if (ch != NULL) {
1754 /*
1755 * nonxref channels don't hold any reference to the nexus,
1756 * since otherwise we'll never be able to close them when
1757 * the last regular channel of the nexus is closed, as part
1758 * of the nexus's destructor operation. Release the nonxref
1759 * channel reference now, but make sure the nexus has at
1760 * least 3 refs: global list, provider list and the nonxref
1761 * channel itself, before doing that.
1762 */
1763 if (nonxref) {
1764 ASSERT(ch->ch_flags & (CHANF_KERNEL | CHANF_NONXREF));
1765 ASSERT(nx->nx_refcnt > 3);
1766 (void) nx_release_locked(nx);
1767 }
1768 }
1769
1770 #if SK_LOG
1771 uuid_string_t uuidstr;
1772 const char * na_name = NULL;
1773 const char * nxdom_prov_name = NULL;
1774
1775 if (ch != NULL && ch->ch_na != NULL) {
1776 na_name = ch->ch_na->na_name;
1777 }
1778 if (nx->nx_prov != NULL) {
1779 nxdom_prov_name = NX_DOM_PROV(nx)->nxdom_prov_name;
1780 }
1781 SK_D("nx 0x%llx (%s:\"%s\":%d:%d) spec_uuid \"%s\" mode 0x%b err %d",
1782 SK_KVA(nx),
1783 (nxdom_prov_name != NULL) ? nxdom_prov_name : "",
1784 (na_name != NULL) ? na_name : "",
1785 (int)chr->cr_port, (int)chr->cr_ring_id,
1786 sk_uuid_unparse(chr->cr_spec_uuid, uuidstr), chr->cr_mode,
1787 CHMODE_BITS, *err);
1788 #endif /* SK_LOG */
1789
1790 done:
1791 return ch;
1792 }
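
/*
 * Illustrative kernel-client usage of ch_open_special() (a sketch
 * under assumed values; `nx' and `spec_uuid' come from the caller,
 * and SK_LOCK must be held as asserted above):
 *
 *	struct chreq chr;
 *	int err;
 *
 *	bzero(&chr, sizeof(chr));
 *	chr.cr_port = NEXUS_PORT_ANY;		// let the domain pick a port
 *	chr.cr_ring_id = CHANNEL_RING_ID_ANY;
 *	uuid_copy(chr.cr_spec_uuid, spec_uuid);
 *	ch = ch_open_special(nx, &chr, TRUE, &err);	// nonxref channel
 *	if (ch != NULL) {
 *		// ... use the channel ...
 *		ch_close_special(ch);
 *		(void) ch_release_locked(ch);
 *	}
 */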

static void
ch_close_common(struct kern_channel *ch, boolean_t locked, boolean_t special)
{
#if SK_LOG
	uuid_string_t uuidstr;
	const char *na_name = (ch->ch_na != NULL) ?
	    ch->ch_na->na_name : "";
	const char *__null_terminated nxdom_name = "";
	if (ch->ch_nexus != NULL) {
		nxdom_name = NX_DOM(ch->ch_nexus)->nxdom_name;
	}
	const char *nxdom_prov_name = (ch->ch_nexus != NULL) ?
	    NX_DOM_PROV(ch->ch_nexus)->nxdom_prov_name : "";

	SK_D("ch 0x%llx (%s:%s:\"%s\":%u:%d)",
	    SK_KVA(ch), nxdom_name, nxdom_prov_name, na_name,
	    ch->ch_info->cinfo_nx_port, (int)ch->ch_info->cinfo_ch_ring_id);
	SK_D(" UUID: %s", sk_uuid_unparse(ch->ch_info->cinfo_ch_id,
	    uuidstr));
	SK_D(" flags: 0x%b", ch->ch_flags, CHANF_BITS);
#endif /* SK_LOG */
	struct kern_nexus *nx = ch->ch_nexus;

	if (!locked) {
		SK_LOCK();
	}

	SK_LOCK_ASSERT_HELD();
	/*
	 * If the channel is participating in the interface advisory
	 * notification, remove it from the nexus.  CHANF_IF_ADV is
	 * set and cleared only with nx_ch_if_adv_lock held in
	 * exclusive mode.
	 */
	lck_rw_lock_exclusive(&nx->nx_ch_if_adv_lock);
	if ((ch->ch_flags & CHANF_IF_ADV) != 0) {
		STAILQ_REMOVE(&nx->nx_ch_if_adv_head, ch,
		    kern_channel, ch_link_if_adv);
		os_atomic_andnot(&ch->ch_flags, CHANF_IF_ADV, relaxed);
		if (STAILQ_EMPTY(&nx->nx_ch_if_adv_head)) {
			nx_netif_config_interface_advisory(nx, false);
		}
		lck_rw_done(&nx->nx_ch_if_adv_lock);
		lck_mtx_lock(&ch->ch_lock);
		(void) ch_release_locked(ch);
	} else {
		lck_rw_done(&nx->nx_ch_if_adv_lock);
		lck_mtx_lock(&ch->ch_lock);
	}
	LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
	/*
	 * Mark the channel as closing to prevent further setopt requests;
	 * this flag is set once here and never gets cleared.
	 */
	ASSERT(!(ch->ch_flags & CHANF_CLOSING));
	os_atomic_or(&ch->ch_flags, CHANF_CLOSING, relaxed);

	if (special) {
		VERIFY(ch->ch_flags & CHANF_KERNEL);
	} else {
		VERIFY(!(ch->ch_flags & CHANF_KERNEL));
	}

	ch->ch_fd = -1;

	/* may be called as part of failure cleanup, so check */
	if (ch->ch_flags & CHANF_ATTACHED) {
		boolean_t nonxref = !!(ch->ch_flags & CHANF_NONXREF);

		/* caller must hold an extra ref */
		ASSERT(ch->ch_refcnt > 1);

		/* disconnect from nexus */
		ch_disconnect(ch);

		/*
		 * If this was the last regular channel and the nexus
		 * has been closed, detach it and finish up the job.
		 * If this was a nonxref channel, there is nothing
		 * left to do; see comments in ch_open_special().
		 */
		if (!nonxref) {
			STAILQ_REMOVE(&nx->nx_ch_head, ch,
			    kern_channel, ch_link);
			nx->nx_ch_count--;
			if (STAILQ_EMPTY(&nx->nx_ch_head) &&
			    (nx->nx_flags & NXF_CLOSED)) {
				ASSERT(STAILQ_EMPTY(&nx->nx_ch_if_adv_head));
				nx_detach(nx);
			}
			(void) nx_release_locked(nx);
		} else {
			ASSERT(ch->ch_flags & CHANF_KERNEL);
			STAILQ_REMOVE(&nx->nx_ch_nonxref_head, ch,
			    kern_channel, ch_link);
		}

		os_atomic_andnot(&ch->ch_flags, CHANF_ATTACHED, relaxed);
		ch->ch_nexus = NULL;

		(void) ch_release_locked(ch);	/* for the list */
	}

	lck_mtx_unlock(&ch->ch_lock);
	if (!locked) {
		SK_UNLOCK();
	}
}

void
ch_close(struct kern_channel *ch, boolean_t locked)
{
	ch_close_common(ch, locked, FALSE);
}

void
ch_close_special(struct kern_channel *ch)
{
	ch_close_common(ch, TRUE, TRUE);
}

static int
ch_ev_thresh_validate(struct kern_nexus *nx, enum txrx t,
    struct ch_ev_thresh *cet)
{
	struct nxprov_params *nxp = NX_PROV(nx)->nxprov_params;
	uint32_t bmin, bmax, smin, smax;
	int err = 0;

	if (cet->cet_unit != CHANNEL_THRESHOLD_UNIT_BYTES &&
	    cet->cet_unit != CHANNEL_THRESHOLD_UNIT_SLOTS) {
		err = EINVAL;
		goto done;
	}

	smin = 1;	/* minimum 1 slot */
	bmin = 1;	/* minimum 1 byte */

	if (t == NR_TX) {
		ASSERT(nxp->nxp_tx_slots > 0);
		smax = (nxp->nxp_tx_slots - 1);
	} else {
		ASSERT(nxp->nxp_rx_slots > 0);
		smax = (nxp->nxp_rx_slots - 1);
	}
	bmax = (smax * nxp->nxp_buf_size);

	switch (cet->cet_unit) {
	case CHANNEL_THRESHOLD_UNIT_BYTES:
		if (cet->cet_value < bmin) {
			cet->cet_value = bmin;
		} else if (cet->cet_value > bmax) {
			cet->cet_value = bmax;
		}
		break;

	case CHANNEL_THRESHOLD_UNIT_SLOTS:
		if (cet->cet_value < smin) {
			cet->cet_value = smin;
		} else if (cet->cet_value > smax) {
			cet->cet_value = smax;
		}
		break;
	}

done:
	return err;
}
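
/*
 * Worked example of the clamping above (illustrative values, not
 * defaults): with nxp_tx_slots == 256 and nxp_buf_size == 2048,
 * smax == 255 slots and bmax == 255 * 2048 == 522240 bytes.  A
 * caller requesting a TX threshold of 1 MB in
 * CHANNEL_THRESHOLD_UNIT_BYTES would thus have cet_value clamped
 * down to 522240, while a request of 0 slots would be raised to 1.
 */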

#if SK_LOG
/* Hoisted out of line to reduce kernel stack footprint */
SK_LOG_ATTRIBUTE
static void
ch_connect_log1(const struct kern_nexus *nx, const struct ch_info *cinfo,
    const struct chreq *chr, const struct kern_channel *ch,
    const struct kern_nexus_domain_provider *nxdom_prov,
    struct proc *p)
{
	struct __user_channel_schema *ch_schema = ch->ch_schema;
	uuid_string_t uuidstr;
	unsigned int n;
	ring_id_t i, j;

	ASSERT(ch_schema != NULL || (ch->ch_flags & CHANF_KERNEL));
	if (ch_schema != NULL) {
		SK_D("channel_schema at 0x%llx", SK_KVA(ch_schema));
		SK_D(" kern_name: \"%s\"", ch_schema->csm_kern_name);
		SK_D(" kern_uuid: %s",
		    sk_uuid_unparse(ch_schema->csm_kern_uuid, uuidstr));
		SK_D(" flags: 0x%b", ch_schema->csm_flags, CSM_BITS);
		SK_D(" tx_rings: %u [%u,%u]", ch_schema->csm_tx_rings,
		    cinfo->cinfo_first_tx_ring, cinfo->cinfo_last_tx_ring);
		SK_D(" rx_rings: %u [%u,%u]", ch_schema->csm_rx_rings,
		    cinfo->cinfo_first_rx_ring, cinfo->cinfo_last_rx_ring);

		j = ch->ch_last[NR_TX];
		for (n = 0, i = ch->ch_first[NR_TX]; i < j; n++, i++) {
			SK_D(" tx_ring_%u_off: 0x%llx", i,
			    (uint64_t)ch_schema->csm_ring_ofs[n].ring_off);
			SK_D(" tx_sd_%u_off: 0x%llx", i,
			    (uint64_t)ch_schema->csm_ring_ofs[n].sd_off);
		}
		j = n;
		for (n = 0, i = ch->ch_first[NR_RX];
		    i < ch->ch_last[NR_RX]; n++, i++) {
			SK_D(" rx_ring_%u_off: 0x%llx", i,
			    (uint64_t)ch_schema->csm_ring_ofs[n + j].ring_off);
			SK_D(" rx_sd_%u_off: 0x%llx", i,
			    (uint64_t)ch_schema->csm_ring_ofs[n + j].sd_off);
		}
		SK_D(" md_type: %u", ch_schema->csm_md_type);
		SK_D(" md_subtype: %u", ch_schema->csm_md_subtype);
		SK_D(" stats_ofs: 0x%llx", ch_schema->csm_stats_ofs);
		SK_D(" stats_type: %u", ch_schema->csm_stats_type);
		SK_D(" flowadv_ofs: 0x%llx", ch_schema->csm_flowadv_ofs);
		SK_D(" flowadv_max: %u", ch_schema->csm_flowadv_max);
		SK_D(" nexusadv_ofs: 0x%llx", ch_schema->csm_nexusadv_ofs);
	}

	SK_D("ch 0x%llx (%s:%s:\"%s\":%u:%d)",
	    SK_KVA(ch), nxdom_prov->nxdom_prov_dom->nxdom_name,
	    nxdom_prov->nxdom_prov_name, ch->ch_na->na_name,
	    cinfo->cinfo_nx_port, (int)cinfo->cinfo_ch_ring_id);
	SK_D(" ch UUID: %s", sk_uuid_unparse(cinfo->cinfo_ch_id, uuidstr));
	SK_D(" nx UUID: %s", sk_uuid_unparse(nx->nx_uuid, uuidstr));
	SK_D(" flags: 0x%b", ch->ch_flags, CHANF_BITS);
	SK_D(" task: 0x%llx %s(%d)", SK_KVA(ch->ch_mmap.ami_maptask),
	    sk_proc_name_address(p), sk_proc_pid(p));
	SK_D(" txlowat: %u (%s)", cinfo->cinfo_tx_lowat.cet_value,
	    ((cinfo->cinfo_tx_lowat.cet_unit == CHANNEL_THRESHOLD_UNIT_BYTES) ?
	    "bytes" : "slots"));
	SK_D(" rxlowat: %u (%s)", cinfo->cinfo_rx_lowat.cet_value,
	    ((cinfo->cinfo_rx_lowat.cet_unit == CHANNEL_THRESHOLD_UNIT_BYTES) ?
	    "bytes" : "slots"));
	SK_D(" mmapref: 0x%llx", SK_KVA(ch->ch_mmap.ami_mapref));
	SK_D(" mapaddr: 0x%llx", (uint64_t)cinfo->cinfo_mem_base);
	SK_D(" mapsize: 0x%llx (%llu KB)",
	    (uint64_t)cinfo->cinfo_mem_map_size,
	    (uint64_t)cinfo->cinfo_mem_map_size >> 10);
	SK_D(" memsize: 0x%llx (%llu KB)",
	    (uint64_t)chr->cr_memsize, (uint64_t)chr->cr_memsize >> 10);
	SK_D(" offset: 0x%llx", (uint64_t)cinfo->cinfo_schema_offset);
}

SK_LOG_ATTRIBUTE
static void
ch_connect_log2(const struct kern_nexus *nx, int err)
{
	uuid_string_t nx_uuidstr;

	SK_ERR("Error connecting to nexus UUID %s: %d",
	    sk_uuid_unparse(nx->nx_uuid, nx_uuidstr), err);
}
#endif /* SK_LOG */

static struct kern_channel *
ch_connect(struct kern_nexus *nx, struct chreq *chr, struct kern_channel *ch0,
    struct nxbind *nxb, struct proc *p, int fd, int *err)
{
	struct kern_nexus_domain_provider *nxdom_prov;
	struct kern_channel *ch = NULL;
	struct ch_info *cinfo = NULL;
	uint32_t ch_mode = chr->cr_mode;
	boolean_t config = FALSE;
	struct nxdom *nxdom;
	boolean_t reserved_port = FALSE;

	ASSERT(!(ch_mode & CHMODE_KERNEL) || p == kernproc);
	ASSERT(chr->cr_port != NEXUS_PORT_ANY || (ch_mode & CHMODE_KERNEL));
	SK_LOCK_ASSERT_HELD();

	/* validate thresholds before we proceed any further */
	if ((*err = ch_ev_thresh_validate(nx, NR_TX, &chr->cr_tx_lowat)) != 0 ||
	    (*err = ch_ev_thresh_validate(nx, NR_RX, &chr->cr_rx_lowat)) != 0) {
		goto done;
	}

	if (!(ch_mode & CHMODE_KERNEL) && !NX_USER_CHANNEL_PROV(nx)) {
		*err = ENOTSUP;
		goto done;
	}

	ch = ch_alloc(Z_WAITOK);

	lck_mtx_lock(&ch->ch_lock);

	uuid_generate_random(ch->ch_info->cinfo_ch_id);
	ch->ch_fd = fd;
	ch->ch_pid = proc_pid(p);
	(void) snprintf(ch->ch_name, sizeof(ch->ch_name), "%s",
	    proc_name_address(p));

	nxdom_prov = NX_DOM_PROV(nx);
	nxdom = NX_DOM(nx);
	if (ch_mode & (CHMODE_KERNEL | CHMODE_NO_NXREF)) {
		/*
		 * CHANF_KERNEL implies a channel opened by a kernel
		 * subsystem, and is triggered by the CHMODE_KERNEL
		 * flag which is (only ever) set by ch_open_special().
		 *
		 * CHANF_NONXREF can be optionally set based on the
		 * CHMODE_NO_NXREF request flag.  This must only be
		 * set by ch_open_special() as well, hence we verify.
		 */
		ASSERT(p == kernproc);
		ASSERT(ch_mode & CHMODE_KERNEL);
		os_atomic_or(&ch->ch_flags, CHANF_KERNEL, relaxed);
		if (ch_mode & CHMODE_NO_NXREF) {
			os_atomic_or(&ch->ch_flags, CHANF_NONXREF, relaxed);
		}

		config = (ch_mode & CHMODE_CONFIG) != 0;
		if (chr->cr_port == NEXUS_PORT_ANY) {
			if (nxdom->nxdom_find_port == NULL) {
				*err = ENOTSUP;
				goto done;
			}

			/*
			 * If this is an ephemeral port request, find one
			 * for the client; we ask for the reserved port
			 * range if this is a configuration request
			 * (CHMODE_CONFIG).
			 */
			if ((*err = nxdom->nxdom_find_port(nx,
			    config, &chr->cr_port)) != 0) {
				goto done;
			}
		}
	}
2125
2126 if (skywalk_check_platform_binary(p)) {
2127 os_atomic_or(&ch->ch_flags, CHANF_PLATFORM, relaxed);
2128 }
2129
2130 ASSERT(chr->cr_port != NEXUS_PORT_ANY);
2131
2132 reserved_port = (nxdom->nxdom_port_is_reserved != NULL &&
2133 (*nxdom->nxdom_port_is_reserved)(nx, chr->cr_port));
2134 if (!config && reserved_port) {
2135 *err = EDOM;
2136 goto done;
2137 }
2138
2139 SK_D("%s(%d) %snexus port %u requested", sk_proc_name_address(p),
2140 sk_proc_pid(p), reserved_port ? "[reserved] " : "", chr->cr_port);
2141
2142 if ((*err = nxdom_prov->nxdom_prov_dom->nxdom_connect(nxdom_prov,
2143 nx, ch, chr, ch0, nxb, p)) != 0) {
2144 goto done;
2145 }
2146
2147 cinfo = ch->ch_info;
2148 uuid_copy(cinfo->cinfo_nx_uuid, nx->nx_uuid);
2149 /* for easy access to immutables */
2150 bcopy(nx->nx_prov->nxprov_params, &cinfo->cinfo_nxprov_params,
2151 sizeof(struct nxprov_params));
2152 cinfo->cinfo_ch_mode = ch_mode;
2153 cinfo->cinfo_ch_ring_id = chr->cr_ring_id;
2154 cinfo->cinfo_nx_port = chr->cr_port;
2155 cinfo->cinfo_mem_base = ch->ch_mmap.ami_mapaddr;
2156 cinfo->cinfo_mem_map_size = ch->ch_mmap.ami_mapsize;
2157 cinfo->cinfo_schema_offset = chr->cr_memoffset;
2158 cinfo->cinfo_num_bufs =
2159 PP_BUF_REGION_DEF(skmem_arena_nexus(ch->ch_na->na_arena)->arn_rx_pp)->skr_params.srp_c_obj_cnt;
2160 /*
2161 * ch_last is really the number of rings, but we need to return
2162 * the actual zero-based ring ID to the client. Make sure that
2163 * is the case here and adjust last_{tx,rx}_ring accordingly.
2164 */
2165 ASSERT((ch->ch_last[NR_TX] > 0) ||
2166 (ch->ch_na->na_type == NA_NETIF_COMPAT_DEV));
2167 ASSERT((ch->ch_last[NR_RX] > 0) ||
2168 (ch->ch_na->na_type == NA_NETIF_COMPAT_HOST));
2169 cinfo->cinfo_first_tx_ring = ch->ch_first[NR_TX];
2170 cinfo->cinfo_last_tx_ring = ch->ch_last[NR_TX] - 1;
2171 cinfo->cinfo_first_rx_ring = ch->ch_first[NR_RX];
2172 cinfo->cinfo_last_rx_ring = ch->ch_last[NR_RX] - 1;
2173 cinfo->cinfo_tx_lowat = chr->cr_tx_lowat;
2174 cinfo->cinfo_rx_lowat = chr->cr_rx_lowat;
2175
2176 if (ch_mode & CHMODE_NO_NXREF) {
2177 ASSERT(ch_mode & CHMODE_KERNEL);
2178 STAILQ_INSERT_TAIL(&nx->nx_ch_nonxref_head, ch, ch_link);
2179 } else {
2180 STAILQ_INSERT_TAIL(&nx->nx_ch_head, ch, ch_link);
2181 nx->nx_ch_count++;
2182 }
2183 os_atomic_or(&ch->ch_flags, CHANF_ATTACHED, relaxed);
2184 ch->ch_nexus = nx;
2185 nx_retain_locked(nx); /* hold a ref on the nexus */
2186
2187 ch_retain_locked(ch); /* one for being in the list */
2188 ch_retain_locked(ch); /* one for the caller */
2189
2190 /*
2191 * Now that we've successfully created the nexus adapter, inform the
2192 * nexus provider about the rings and the slots within each ring.
2193 * This is a no-op for internal nexus providers.
2194 */
2195 if ((*err = nxprov_advise_connect(nx, ch, p)) != 0) {
2196 lck_mtx_unlock(&ch->ch_lock);
2197
2198 /* gracefully close this fully-formed channel */
2199 if (ch->ch_flags & CHANF_KERNEL) {
2200 ch_close_special(ch);
2201 } else {
2202 ch_close(ch, TRUE);
2203 }
2204 (void) ch_release_locked(ch);
2205 ch = NULL;
2206 goto done;
2207 }
2208
2209 ASSERT(ch->ch_schema == NULL ||
2210 (ch->ch_schema->csm_flags & CSM_ACTIVE));
2211
2212 #if SK_LOG
2213 if (__improbable(sk_verbose != 0)) {
2214 ch_connect_log1(nx, cinfo, chr, ch, nxdom_prov, p);
2215 }
2216 #endif /* SK_LOG */
2217
2218 done:
2219 if (ch != NULL) {
2220 lck_mtx_unlock(&ch->ch_lock);
2221 }
2222 if (*err != 0) {
2223 #if SK_LOG
2224 if (__improbable(sk_verbose != 0)) {
2225 ch_connect_log2(nx, *err);
2226 }
2227 #endif /* SK_LOG */
2228 if (ch != NULL) {
2229 ch_free(ch);
2230 ch = NULL;
2231 }
2232 }
2233 return ch;
2234 }

static void
ch_disconnect(struct kern_channel *ch)
{
	struct kern_nexus *nx = ch->ch_nexus;
	struct kern_nexus_domain_provider *nxdom_prov = NX_DOM_PROV(nx);

	SK_LOCK_ASSERT_HELD();
	LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);

	/*
	 * Inform the nexus provider that the channel has been quiesced
	 * and disconnected from the nexus port.  This is a no-op for
	 * internal nexus providers.
	 */
	nxprov_advise_disconnect(nx, ch);

	/* Finally, let the domain provider tear down the instance */
	nxdom_prov->nxdom_prov_dom->nxdom_disconnect(nxdom_prov, nx, ch);
}

void
ch_deactivate(struct kern_channel *ch)
{
	/*
	 * This is a trapdoor flag; once CSM_ACTIVE is cleared,
	 * it will never be set again.  Doing this will cause
	 * os_channel_is_defunct() to indicate that the channel
	 * is defunct and is no longer usable (thus should be
	 * immediately closed).
	 */
	if (ch->ch_schema != NULL &&
	    (ch->ch_schema->csm_flags & CSM_ACTIVE)) {
		os_atomic_andnot(__DECONST(uint32_t *, &ch->ch_schema->csm_flags),
		    CSM_ACTIVE, relaxed);
		/* make this globally visible */
		os_atomic_thread_fence(seq_cst);
	}
}
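
/*
 * Illustrative user-space counterpart (a sketch; the exact client
 * code depends on the libsystem channel wrappers): once CSM_ACTIVE
 * has been cleared above, a client noticing I/O failures would do
 * something along the lines of:
 *
 *	if (os_channel_is_defunct(chd)) {
 *		// the channel can never become active again
 *		os_channel_destroy(chd);
 *	}
 */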

int
ch_set_opt(struct kern_channel *ch, struct sockopt *sopt)
{
	int err = 0;

	if (sopt->sopt_dir != SOPT_SET) {
		sopt->sopt_dir = SOPT_SET;
	}

	switch (sopt->sopt_name) {
	case CHOPT_TX_LOWAT_THRESH:
		err = ch_set_lowat_thresh(ch, NR_TX, sopt);
		break;

	case CHOPT_RX_LOWAT_THRESH:
		err = ch_set_lowat_thresh(ch, NR_RX, sopt);
		break;

	case CHOPT_IF_ADV_CONF:
		err = ch_configure_interface_advisory_event(ch, sopt);
		break;

	default:
		err = ENOPROTOOPT;
		break;
	}

	return err;
}
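
/*
 * Illustrative sketch of a set request: the option value for
 * CHOPT_{TX,RX}_LOWAT_THRESH is a struct ch_ev_thresh, e.g.
 *
 *	struct ch_ev_thresh cet = {
 *		.cet_unit  = CHANNEL_THRESHOLD_UNIT_SLOTS,
 *		.cet_value = 8,		// wake clients once 8+ slots are ready
 *	};
 *
 * which ch_set_lowat_thresh() below copies in, validates (clamping
 * via ch_ev_thresh_validate()), applies, uses to notify the rings,
 * and finally copies back out.
 */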

int
ch_get_opt(struct kern_channel *ch, struct sockopt *sopt)
{
	int err = 0;

	if (sopt->sopt_dir != SOPT_GET) {
		sopt->sopt_dir = SOPT_GET;
	}

	switch (sopt->sopt_name) {
	case CHOPT_TX_LOWAT_THRESH:
		err = ch_get_lowat_thresh(ch, NR_TX, sopt);
		break;

	case CHOPT_RX_LOWAT_THRESH:
		err = ch_get_lowat_thresh(ch, NR_RX, sopt);
		break;

	default:
		err = ENOPROTOOPT;
		break;
	}

	return err;
}

static int
ch_configure_interface_advisory_event(struct kern_channel *ch,
    struct sockopt *sopt)
{
	int err = 0;
	boolean_t enable = 0;
	struct kern_nexus *nx = ch->ch_nexus;

	LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
	SK_LOCK_ASSERT_NOTHELD();

	if (sopt->sopt_val == USER_ADDR_NULL) {
		return EINVAL;
	}
	if (nx->nx_adv.nxv_adv == NULL) {
		return ENOTSUP;
	}
	err = sooptcopyin(sopt, &enable, sizeof(enable), sizeof(enable));
	if (err != 0) {
		return err;
	}

	/*
	 * Drop ch_lock to acquire sk_lock and nx_ch_if_adv_lock due to lock
	 * ordering requirement; check if the channel is closing once ch_lock
	 * is reacquired and bail if so.
	 */
	lck_mtx_unlock(&ch->ch_lock);
	SK_LOCK();
	lck_rw_lock_exclusive(&nx->nx_ch_if_adv_lock);
	lck_mtx_lock(&ch->ch_lock);
	if (ch->ch_flags & CHANF_CLOSING) {
		err = ENXIO;
		goto done;
	}

	/*
	 * If interface advisory reporting is being enabled on the channel,
	 * add the channel to the list of channels eligible for interface
	 * advisory updates on the nexus; if it is being disabled, remove
	 * the channel from that list.
	 */
	if (enable) {
		if ((ch->ch_flags & CHANF_IF_ADV) != 0) {
			ASSERT(err == 0);
			goto done;
		}
		bool enable_adv = STAILQ_EMPTY(&nx->nx_ch_if_adv_head);
		os_atomic_or(&ch->ch_flags, CHANF_IF_ADV, relaxed);
		STAILQ_INSERT_TAIL(&nx->nx_ch_if_adv_head, ch, ch_link_if_adv);
		if (enable_adv) {
			nx_netif_config_interface_advisory(nx, true);
		}
		ch_retain_locked(ch);	/* for being in the IF ADV list */
	} else {
		if ((ch->ch_flags & CHANF_IF_ADV) == 0) {
			ASSERT(err == 0);
			goto done;
		}
		STAILQ_REMOVE(&nx->nx_ch_if_adv_head, ch, kern_channel,
		    ch_link_if_adv);
		os_atomic_andnot(&ch->ch_flags, CHANF_IF_ADV, relaxed);
		if (STAILQ_EMPTY(&nx->nx_ch_if_adv_head)) {
			nx_netif_config_interface_advisory(nx, false);
		}
		(void) ch_release_locked(ch);
	}

done:
	lck_mtx_unlock(&ch->ch_lock);
	lck_rw_done(&nx->nx_ch_if_adv_lock);
	SK_UNLOCK();
	lck_mtx_lock(&ch->ch_lock);

	return err;
}
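
/*
 * A sketch of the ordering contract relied upon above (outermost
 * lock first):
 *
 *	SK_LOCK() -> lck_rw_lock_exclusive(&nx->nx_ch_if_adv_lock)
 *	    -> lck_mtx_lock(&ch->ch_lock)
 *
 * Because ch_lock is innermost, any path that holds only ch_lock and
 * needs the outer locks must drop ch_lock, acquire in order, retake
 * ch_lock, and then re-validate whatever may have changed while
 * unlocked (here, CHANF_CLOSING) before proceeding.
 */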

static int
ch_set_lowat_thresh(struct kern_channel *ch, enum txrx t,
    struct sockopt *sopt)
{
	struct ch_ev_thresh cet, *ocet;
	int err = 0;

	LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);

	if (sopt->sopt_val == USER_ADDR_NULL) {
		return EINVAL;
	}

	bzero(&cet, sizeof(cet));
	err = sooptcopyin(sopt, &cet, sizeof(cet), sizeof(cet));
	if (err == 0) {
		err = ch_ev_thresh_validate(ch->ch_nexus, t, &cet);
		if (err == 0) {
			if (t == NR_TX) {
				ocet = &ch->ch_info->cinfo_tx_lowat;
			} else {
				ocet = &ch->ch_info->cinfo_rx_lowat;
			}

			/* if there is no change, we're done */
			if (ocet->cet_unit == cet.cet_unit &&
			    ocet->cet_value == cet.cet_value) {
				return 0;
			}

			*ocet = cet;

			for_rx_tx(t) {
				ring_id_t qfirst = ch->ch_first[t];
				ring_id_t qlast = ch->ch_last[t];
				uint32_t i;

				for (i = qfirst; i < qlast; i++) {
					struct __kern_channel_ring *kring =
					    &NAKR(ch->ch_na, t)[i];

					(void) kring->ckr_na_notify(kring,
					    sopt->sopt_p, 0);
				}
			}

			(void) sooptcopyout(sopt, &cet, sizeof(cet));
		}
	}

	return err;
}

static int
ch_get_lowat_thresh(struct kern_channel *ch, enum txrx t,
    struct sockopt *sopt)
{
	struct ch_ev_thresh cet;

	LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);

	if (sopt->sopt_val == USER_ADDR_NULL) {
		return EINVAL;
	}

	if (t == NR_TX) {
		cet = ch->ch_info->cinfo_tx_lowat;
	} else {
		cet = ch->ch_info->cinfo_rx_lowat;
	}

	return sooptcopyout(sopt, &cet, sizeof(cet));
}

static struct kern_channel *
ch_alloc(zalloc_flags_t how)
{
	struct kern_channel *ch;

	ch = zalloc_flags(ch_zone, how | Z_ZERO);
	if (ch != NULL) {
		lck_mtx_init(&ch->ch_lock, &channel_lock_group,
		    &channel_lock_attr);
		ch->ch_info = zalloc_flags(ch_info_zone, how | Z_ZERO);
	}
	return ch;
}

static void
ch_free(struct kern_channel *ch)
{
	ASSERT(ch->ch_refcnt == 0);
	ASSERT(ch->ch_pp == NULL);
	ASSERT(!(ch->ch_flags & (CHANF_ATTACHED | CHANF_EXT_CONNECTED |
	    CHANF_EXT_PRECONNECT | CHANF_IF_ADV)));
	lck_mtx_destroy(&ch->ch_lock, &channel_lock_group);
	SK_DF(SK_VERB_MEM, "ch 0x%llx FREE", SK_KVA(ch));
	ASSERT(ch->ch_info != NULL);
	zfree(ch_info_zone, ch->ch_info);
	ch->ch_info = NULL;
	zfree(ch_zone, ch);
}

void
ch_retain_locked(struct kern_channel *ch)
{
	SK_LOCK_ASSERT_HELD();

	ch->ch_refcnt++;
	VERIFY(ch->ch_refcnt != 0);
}

void
ch_retain(struct kern_channel *ch)
{
	SK_LOCK();
	ch_retain_locked(ch);
	SK_UNLOCK();
}

int
ch_release_locked(struct kern_channel *ch)
{
	int oldref = ch->ch_refcnt;

	SK_LOCK_ASSERT_HELD();

	VERIFY(ch->ch_refcnt != 0);
	if (--ch->ch_refcnt == 0) {
		ch_free(ch);
	}

	return oldref == 1;
}

int
ch_release(struct kern_channel *ch)
{
	int lastref;

	SK_LOCK();
	lastref = ch_release_locked(ch);
	SK_UNLOCK();

	return lastref;
}
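
/*
 * Typical reference pattern (a sketch): a lookup takes a reference on
 * a channel while SK_LOCK is held, and the matching release may turn
 * out to be the last one, in which case the channel has already been
 * freed by the time ch_release_locked() returns:
 *
 *	SK_LOCK();
 *	ch_retain_locked(ch);		// +1 while we use ch
 *	// ... use ch ...
 *	if (ch_release_locked(ch)) {
 *		// that was the final reference; ch is gone
 *	}
 *	SK_UNLOCK();
 */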

/*
 * -fbounds-safety: the argument is declared as struct kern_channel *
 * rather than void *, since all callers pass a struct kern_channel *
 * and the typed pointer lets bounds checking apply.
 */
void
ch_dtor(struct kern_channel *arg)
{
	struct kern_channel *ch = arg;

	SK_LOCK();
	ch_close(ch, TRUE);
	(void) ch_release_locked(ch);
	SK_UNLOCK();
}