1 /*
2 * Copyright (c) 2015-2021 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 /*
30 * Copyright (C) 2012-2014 Matteo Landi, Luigi Rizzo, Giuseppe Lettieri.
31 * All rights reserved.
32 * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
33 *
34 * Redistribution and use in source and binary forms, with or without
35 * modification, are permitted provided that the following conditions
36 * are met:
37 * 1. Redistributions of source code must retain the above copyright
38 * notice, this list of conditions and the following disclaimer.
39 * 2. Redistributions in binary form must reproduce the above copyright
40 * notice, this list of conditions and the following disclaimer in the
41 * documentation and/or other materials provided with the distribution.
42 *
43 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
44 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
45 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
46 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
47 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
48 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
49 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
50 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
51 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
52 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
53 * SUCH DAMAGE.
54 */
55
#include <sys/eventvar.h>
#include <sys/kdebug.h>
#include <sys/sdt.h>
#include <skywalk/os_skywalk_private.h>
#include <skywalk/nexus/netif/nx_netif.h>

#include <kern/uipc_domain.h>

/* kdebug tracepoint ID helper for BSD kevent-class trace events */
#define KEV_EVTID(code) BSDDBG_CODE(DBG_BSD_KEVENT, (code))
65
/*
 * Ring-occupancy snapshot filled in by ch_event(); the kevent filters
 * compare these counts against the (NOTE_LOWAT) low watermarks, in the
 * channel's configured threshold units (cet_unit).
 */
struct ch_event_result {
	uint32_t tx_data;	/* TX amount available for writing */
	uint32_t rx_data;	/* RX amount available for reading */
};
70
/* lock groups and shared attributes for skywalk channel mutexes */
static LCK_GRP_DECLARE(channel_lock_group, "sk_ch_lock");
/* group for the per-selinfo mutex (csi_lock; see csi_init) */
static LCK_GRP_DECLARE(channel_kn_lock_group, "sk_ch_kn_lock");
LCK_ATTR_DECLARE(channel_lock_attr, 0, 0);
74
/* ch_selinfo: select/kevent wakeup plumbing, with optional mitigation */
static void csi_selrecord(struct ch_selinfo *, struct proc *, void *);
static void csi_selwakeup(struct ch_selinfo *, boolean_t, boolean_t, uint32_t);
static inline void csi_selwakeup_delayed(struct ch_selinfo *);
static inline void csi_selwakeup_common(struct ch_selinfo *, boolean_t,
    boolean_t, boolean_t, uint32_t);
static boolean_t csi_tcall_start(struct ch_selinfo *);
static void csi_tcall(thread_call_param_t, thread_call_param_t);
static uint64_t csi_tcall_update_interval(struct ch_selinfo *);

/* channel lifecycle, lookup and configuration */
static void ch_redzone_init(void);
static void ch_close_common(struct kern_channel *, boolean_t, boolean_t);
static struct kern_channel *ch_find(struct kern_nexus *, nexus_port_t,
    ring_id_t);
static int ch_ev_thresh_validate(struct kern_nexus *, enum txrx,
    struct ch_ev_thresh *);
static struct kern_channel *ch_connect(struct kern_nexus *, struct chreq *,
    struct nxbind *, struct proc *, int, int *);
static void ch_disconnect(struct kern_channel *);
static int ch_set_lowat_thresh(struct kern_channel *, enum txrx,
    struct sockopt *);
static int ch_get_lowat_thresh(struct kern_channel *, enum txrx,
    struct sockopt *);
static struct kern_channel *ch_alloc(zalloc_flags_t);
static void ch_free(struct kern_channel *);
static int ch_configure_interface_advisory_event(struct kern_channel *ch,
    struct sockopt *sopt);

/* EVFILT_READ/EVFILT_WRITE filter callbacks */
static int filt_chrwattach(struct knote *, struct kevent_qos_s *kev);
static void filt_chrwdetach(struct knote *, boolean_t);
static void filt_chrdetach(struct knote *);
static void filt_chwdetach(struct knote *);
static int filt_chrw(struct knote *, long, int);
static int filt_chread(struct knote *, long);
static int filt_chwrite(struct knote *, long);

static int filt_chtouch(struct knote *, struct kevent_qos_s *, int);
static int filt_chrtouch(struct knote *, struct kevent_qos_s *);
static int filt_chwtouch(struct knote *, struct kevent_qos_s *);
static int filt_chprocess(struct knote *, struct kevent_qos_s *, int);
static int filt_chrprocess(struct knote *, struct kevent_qos_s *);
static int filt_chwprocess(struct knote *, struct kevent_qos_s *);

/* EVFILT_NW_CHANNEL (extended) filter callbacks */
static int filt_che_attach(struct knote *, struct kevent_qos_s *kev);
static void filt_che_detach(struct knote *);
static int filt_che_event(struct knote *, long);
static int filt_che_touch(struct knote *, struct kevent_qos_s *);
static int filt_che_process(struct knote *, struct kevent_qos_s *);
static int filt_chan_extended_common(struct knote *, long);

/* core readiness/sync entry point shared by all the filters above */
static int ch_event(struct kern_channel *ch, int events,
    void *wql, struct proc *p, struct ch_event_result *,
    const boolean_t is_kevent, int *errno, const boolean_t);
126
/* EVFILT_READ filter ops for channel file descriptors */
const struct filterops skywalk_channel_rfiltops = {
	.f_isfd = 1,
	.f_attach = filt_chrwattach,
	.f_detach = filt_chrdetach,
	.f_event = filt_chread,
	.f_touch = filt_chrtouch,
	.f_process = filt_chrprocess,
};
135
/* EVFILT_WRITE filter ops for channel file descriptors */
const struct filterops skywalk_channel_wfiltops = {
	.f_isfd = 1,
	.f_attach = filt_chrwattach,
	.f_detach = filt_chwdetach,
	.f_event = filt_chwrite,
	.f_touch = filt_chwtouch,
	.f_process = filt_chwprocess,
};
144
/* EVFILT_NW_CHANNEL (extended channel events) filter ops */
const struct filterops skywalk_channel_efiltops = {
	.f_isfd = 1,
	.f_attach = filt_che_attach,
	.f_detach = filt_che_detach,
	.f_event = filt_che_event,
	.f_touch = filt_che_touch,
	.f_process = filt_che_process,
};
153
/* mitigation intervals in ns */
#define CH_MIT_IVAL_MIN NSEC_PER_USEC

/*
 * Effective wakeup-mitigation interval (ns); acts as a global override
 * picked up by csi_tcall_update_interval().  Tunable via sysctl on
 * DEVELOPMENT/DEBUG kernels only.
 */
static uint64_t ch_mit_ival = CH_MIT_IVAL_DEFAULT;

#if (DEVELOPMENT || DEBUG)
SYSCTL_NODE(_kern_skywalk, OID_AUTO, channel,
    CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Skywalk channel parameters");
SYSCTL_QUAD(_kern_skywalk_channel, OID_AUTO, mit_ival,
    CTLFLAG_RW | CTLFLAG_LOCKED, &ch_mit_ival, "");
#endif /* DEVELOPMENT || DEBUG */

/* zones for struct kern_channel and struct ch_info allocations */
static SKMEM_TYPE_DEFINE(ch_zone, struct kern_channel);

static SKMEM_TYPE_DEFINE(ch_info_zone, struct ch_info);

/* set once channel_init() has run; cleared by channel_fini() */
static int __ch_inited = 0;

/*
 * Global cookies to hold the random numbers used for verifying
 * user metadata red zone violations.
 */
uint64_t __ch_umd_redzone_cookie = 0;

#define SKMEM_TAG_CH_KEY "com.apple.skywalk.channel.key"
SKMEM_TAG_DEFINE(skmem_tag_ch_key, SKMEM_TAG_CH_KEY);
180
/*
 * Draw the random cookie used to detect user-space corruption of the
 * metadata red zone; called once from channel_init().
 */
static void
ch_redzone_init(void)
{
	/* the cookie must exactly overlay the preamble's mdp_redzone field */
	static_assert(sizeof(__ch_umd_redzone_cookie) == sizeof(((struct __metadata_preamble *)0)->mdp_redzone));
	static_assert(METADATA_PREAMBLE_SZ == sizeof(struct __metadata_preamble));
	static_assert(sizeof(struct __slot_desc) == 8);

	/* Initialize random user red zone cookie values */
	do {
		read_random(&__ch_umd_redzone_cookie,
		    sizeof(__ch_umd_redzone_cookie));
	} while (__ch_umd_redzone_cookie == 0);	/* zero is reserved */

	SK_D("__ch_umd_redzone_cookie: 0x%llx", __ch_umd_redzone_cookie);
}
196
197 int
channel_init(void)198 channel_init(void)
199 {
200 int error = 0;
201
202 SK_LOCK_ASSERT_HELD();
203 ASSERT(!__ch_inited);
204
205 static_assert(offsetof(struct __user_packet, pkt_qum) == 0);
206 static_assert(offsetof(struct __kern_packet, pkt_qum) == 0);
207
208 ch_redzone_init();
209
210 __ch_inited = 1;
211
212 return error;
213 }
214
215 void
channel_fini(void)216 channel_fini(void)
217 {
218 SK_LOCK_ASSERT_HELD();
219
220 if (__ch_inited) {
221 __ch_umd_redzone_cookie = 0;
222 __ch_inited = 0;
223 }
224 }
225
226 void
csi_init(struct ch_selinfo * csi,boolean_t mitigation,uint64_t mit_ival)227 csi_init(struct ch_selinfo *csi, boolean_t mitigation, uint64_t mit_ival)
228 {
229 csi->csi_flags = 0;
230 csi->csi_pending = 0;
231 if (mitigation) {
232 csi->csi_interval = mit_ival;
233 csi->csi_eff_interval = ch_mit_ival; /* global override */
234 os_atomic_or(&csi->csi_flags, CSI_MITIGATION, relaxed);
235 csi->csi_tcall = thread_call_allocate_with_options(csi_tcall,
236 csi, THREAD_CALL_PRIORITY_KERNEL, THREAD_CALL_OPTIONS_ONCE);
237 /* this must not fail */
238 VERIFY(csi->csi_tcall != NULL);
239 } else {
240 csi->csi_interval = 0;
241 csi->csi_eff_interval = 0;
242 csi->csi_tcall = NULL;
243 }
244 lck_mtx_init(&csi->csi_lock, &channel_kn_lock_group, &channel_lock_attr);
245 klist_init(&csi->csi_si.si_note);
246 }
247
/*
 * Tear down a ch_selinfo.  Safe against concurrent callers: only the
 * first caller to atomically set CSI_DESTROYED performs the teardown.
 */
void
csi_destroy(struct ch_selinfo *csi)
{
	/* check if not already destroyed, else do it now */
	if ((os_atomic_or_orig(&csi->csi_flags, CSI_DESTROYED, relaxed) &
	    CSI_DESTROYED) == 0) {
		CSI_LOCK(csi);
		/* must have been set by above atomic op */
		VERIFY(csi->csi_flags & CSI_DESTROYED);
		if (csi->csi_flags & CSI_MITIGATION) {
			thread_call_t __single tcall = csi->csi_tcall;
			VERIFY(tcall != NULL);
			/*
			 * Drop the lock across the cancel/wait: the thread
			 * call itself takes CSI_LOCK (see csi_tcall), so
			 * waiting for it with the lock held would deadlock.
			 */
			CSI_UNLOCK(csi);

			(void) thread_call_cancel_wait(tcall);
			if (!thread_call_free(tcall)) {
				/*
				 * An invocation slipped in between the
				 * cancel and the free; cancel once more,
				 * after which the free must succeed.
				 */
				boolean_t freed;
				(void) thread_call_cancel_wait(tcall);
				freed = thread_call_free(tcall);
				VERIFY(freed);
			}

			CSI_LOCK(csi);
			csi->csi_tcall = NULL;
			os_atomic_andnot(&csi->csi_flags, CSI_MITIGATION,
			    relaxed);
		}
		csi->csi_pending = 0;
		CSI_UNLOCK(csi);

		/* clear out any select(2) waiters still parked on the si */
		selthreadclear(&csi->csi_si);
		/* now we don't need the mutex anymore */
		lck_mtx_destroy(&csi->csi_lock, &channel_kn_lock_group);
	}
}
283
284 /*
285 * Called only for select(2).
286 */
287 __attribute__((always_inline))
288 static inline void
csi_selrecord(struct ch_selinfo * csi,struct proc * p,void * wql)289 csi_selrecord(struct ch_selinfo *csi, struct proc *p, void *wql)
290 {
291 struct selinfo *si = &csi->csi_si;
292
293 CSI_LOCK_ASSERT_HELD(csi);
294 selrecord(p, si, wql);
295 }
296
/*
 * Record a select(2) waiter against a single ring's selinfo.
 */
void
csi_selrecord_one(struct __kern_channel_ring *kring, struct proc *p, void *wql)
{
	struct ch_selinfo *csi = &kring->ckr_si;

	CSI_LOCK(csi);
	SK_DF(SK_VERB_EVENTS, "[%s] na \"%s\" (%p) kr %s (%p) "
	    "si %p si_flags 0x%x", (kring->ckr_tx == NR_TX) ? "W" : "R",
	    KRNA(kring)->na_name, SK_KVA(KRNA(kring)), kring->ckr_name,
	    SK_KVA(kring), SK_KVA(&csi->csi_si), csi->csi_si.si_flags);

	csi_selrecord(csi, p, wql);
	CSI_UNLOCK(csi);
}
311
/*
 * Record a select(2) waiter against the adapter-wide selinfo for
 * direction `t' (covers all rings in that direction).
 */
void
csi_selrecord_all(struct nexus_adapter *na, enum txrx t, struct proc *p,
    void *wql)
{
	struct ch_selinfo *csi = &na->na_si[t];

	CSI_LOCK(csi);
	SK_DF(SK_VERB_EVENTS, "[%s] na \"%s\" (%p) si %p si_flags 0x%x",
	    (t == NR_TX) ? "W" : "R", na->na_name, SK_KVA(na),
	    SK_KVA(&csi->csi_si), csi->csi_si.si_flags);

	csi_selrecord(csi, p, wql);
	CSI_UNLOCK(csi);
}
326
327 /*
328 * Called from na_post_event().
329 */
330 __attribute__((always_inline))
331 static inline void
csi_selwakeup(struct ch_selinfo * csi,boolean_t within_kevent,boolean_t selwake,uint32_t hint)332 csi_selwakeup(struct ch_selinfo *csi, boolean_t within_kevent,
333 boolean_t selwake, uint32_t hint)
334 {
335 struct selinfo *si = &csi->csi_si;
336
337 CSI_LOCK_ASSERT_HELD(csi);
338 csi->csi_pending = 0;
339 if (selwake) {
340 selwakeup(si);
341 }
342 if ((csi->csi_flags & CSI_KNOTE) && !within_kevent) {
343 KNOTE(&si->si_note, hint);
344 }
345 }
346
347 __attribute__((always_inline))
348 static inline void
csi_selwakeup_delayed(struct ch_selinfo * csi)349 csi_selwakeup_delayed(struct ch_selinfo *csi)
350 {
351 CSI_LOCK_ASSERT_HELD(csi);
352 ASSERT(csi->csi_flags & CSI_MITIGATION);
353 ASSERT(csi->csi_tcall != NULL);
354
355 if (thread_call_isactive(csi->csi_tcall)) {
356 csi->csi_pending++;
357 } else if (!csi_tcall_start(csi)) {
358 csi_selwakeup(csi, FALSE, FALSE, 0);
359 }
360 }
361
362 __attribute__((always_inline))
363 static inline void
csi_selwakeup_common(struct ch_selinfo * csi,boolean_t nodelay,boolean_t within_kevent,boolean_t selwake,uint32_t hint)364 csi_selwakeup_common(struct ch_selinfo *csi, boolean_t nodelay,
365 boolean_t within_kevent, boolean_t selwake, uint32_t hint)
366 {
367 CSI_LOCK_ASSERT_HELD(csi);
368
369 if (nodelay || within_kevent || !selwake || hint != 0 ||
370 !(csi->csi_flags & CSI_MITIGATION)) {
371 csi_selwakeup(csi, within_kevent, selwake, hint);
372 } else {
373 csi_selwakeup_delayed(csi);
374 }
375 }
376
/*
 * Wake up select/kevent waiters parked on a single ring's selinfo.
 */
void
csi_selwakeup_one(struct __kern_channel_ring *kring, boolean_t nodelay,
    boolean_t within_kevent, boolean_t selwake, uint32_t hint)
{
	struct ch_selinfo *csi = &kring->ckr_si;

	CSI_LOCK(csi);
	SK_DF(SK_VERB_EVENTS, "[%s] na \"%s\" (%p) kr %s (%p) "
	    "si %p si_flags 0x%x nodelay %u kev %u sel %u hint 0x%x",
	    (kring->ckr_tx == NR_TX) ? "W" : "R", KRNA(kring)->na_name,
	    SK_KVA(KRNA(kring)), kring->ckr_name, SK_KVA(kring),
	    SK_KVA(&csi->csi_si), csi->csi_si.si_flags, nodelay,
	    within_kevent, selwake, hint);

	csi_selwakeup_common(csi, nodelay, within_kevent, selwake, hint);
	CSI_UNLOCK(csi);
}
394
/*
 * Wake up select/kevent waiters on the adapter-wide selinfo for
 * direction `t'.  Deferral (mitigation) is honored only when the
 * adapter carries the matching NAF_{RX,TX}_MITIGATION flag.
 */
void
csi_selwakeup_all(struct nexus_adapter *na, enum txrx t, boolean_t nodelay,
    boolean_t within_kevent, boolean_t selwake, uint32_t hint)
{
	struct ch_selinfo *csi = &na->na_si[t];

	CSI_LOCK(csi);
	SK_DF(SK_VERB_EVENTS, "[%s] na \"%s\" (%p) si %p "
	    "si_flags 0x%x nodelay %u kev %u sel %u hint 0x%x",
	    (t == NR_TX) ? "W" : "R", na->na_name, SK_KVA(na),
	    SK_KVA(&csi->csi_si), csi->csi_si.si_flags, nodelay,
	    within_kevent, selwake, hint);

	switch (t) {
	case NR_RX:
		if (!(na->na_flags & NAF_RX_MITIGATION)) {
			nodelay = TRUE;
		}
		break;

	case NR_TX:
		if (!(na->na_flags & NAF_TX_MITIGATION)) {
			nodelay = TRUE;
		}
		break;

	default:
		/* no mitigation for any other direction */
		nodelay = TRUE;
		break;
	}
	csi_selwakeup_common(csi, nodelay, within_kevent, selwake, hint);
	CSI_UNLOCK(csi);
}
428
429 static boolean_t
csi_tcall_start(struct ch_selinfo * csi)430 csi_tcall_start(struct ch_selinfo *csi)
431 {
432 uint64_t now, ival, deadline;
433
434 CSI_LOCK_ASSERT_HELD(csi);
435 ASSERT(csi->csi_flags & CSI_MITIGATION);
436 ASSERT(csi->csi_tcall != NULL);
437
438 /* pick up latest value */
439 ival = csi_tcall_update_interval(csi);
440
441 /* if no mitigation, pass notification up now */
442 if (__improbable(ival == 0)) {
443 return FALSE;
444 }
445
446 deadline = now = mach_absolute_time();
447 clock_deadline_for_periodic_event(ival, now, &deadline);
448 (void) thread_call_enter_delayed(csi->csi_tcall, deadline);
449
450 return TRUE;
451 }
452
/*
 * Mitigation thread-call work function: deliver the deferred wakeup,
 * then rearm the timer if more wakeups accumulated in the meantime.
 */
static void
csi_tcall(thread_call_param_t arg0, thread_call_param_t arg1)
{
#pragma unused(arg1)
	struct ch_selinfo *csi = (struct ch_selinfo *__single)arg0;

	CSI_LOCK(csi);
	csi_selwakeup(csi, FALSE, FALSE, 0);
	CSI_UNLOCK(csi);

	/*
	 * NOTE(review): the lock is deliberately dropped and reacquired
	 * between the wakeup and the rearm check, letting waiters (and
	 * new wakeup posters) run in between — presumably intentional;
	 * confirm before collapsing into a single critical section.
	 */
	CSI_LOCK(csi);
	if (__improbable((csi->csi_flags & CSI_DESTROYED) == 0 &&
	    csi->csi_pending != 0 && !csi_tcall_start(csi))) {
		/* could not rearm: deliver the pending wakeup now */
		csi_selwakeup(csi, FALSE, FALSE, 0);
	}
	CSI_UNLOCK(csi);
}
470
471 __attribute__((always_inline))
472 static inline uint64_t
csi_tcall_update_interval(struct ch_selinfo * csi)473 csi_tcall_update_interval(struct ch_selinfo *csi)
474 {
475 uint64_t i = ch_mit_ival;
476
477 /* if global override was adjusted, update local copies */
478 if (__improbable(csi->csi_eff_interval != i)) {
479 ASSERT(csi->csi_flags & CSI_MITIGATION);
480 csi->csi_interval = csi->csi_eff_interval =
481 ((i == 0) ? 0 : MAX(i, CH_MIT_IVAL_MIN));
482 }
483
484 return csi->csi_interval;
485 }
486
487 /* return EV_EOF if the channel is defunct */
488 static inline boolean_t
ch_filt_check_defunct(struct kern_channel * ch,struct knote * kn)489 ch_filt_check_defunct(struct kern_channel *ch, struct knote *kn)
490 {
491 if (__improbable((ch->ch_flags & CHANF_DEFUNCT) != 0)) {
492 if (kn) {
493 kn->kn_flags |= EV_EOF;
494 }
495 return TRUE;
496 }
497 return FALSE;
498 }
499
500 static void
filt_chrwdetach(struct knote * kn,boolean_t write)501 filt_chrwdetach(struct knote *kn, boolean_t write)
502 {
503 struct kern_channel *ch = (struct kern_channel *)knote_kn_hook_get_raw(kn);
504 struct ch_selinfo *csi;
505 struct selinfo *si;
506
507 lck_mtx_lock(&ch->ch_lock);
508 csi = ch->ch_si[write ? NR_TX : NR_RX];
509 si = &csi->csi_si;
510
511 CSI_LOCK(csi);
512 SK_DF(SK_VERB_EVENTS, "na \"%s\" (%p) ch %p kn %p (%s%s) "
513 "si_flags 0x%x", ch->ch_na->na_name, SK_KVA(ch->ch_na),
514 SK_KVA(ch), SK_KVA(kn), (kn->kn_flags & EV_POLL) ? "poll," : "",
515 write ? "write" : "read", si->si_flags);
516
517 if (KNOTE_DETACH(&si->si_note, kn)) {
518 os_atomic_andnot(&csi->csi_flags, CSI_KNOTE, relaxed);
519 }
520
521 CSI_UNLOCK(csi);
522 lck_mtx_unlock(&ch->ch_lock);
523 }
524
/* f_detach for EVFILT_READ: defer to the common detach handler */
static void
filt_chrdetach(struct knote *kn)
{
	ASSERT(kn->kn_filter == EVFILT_READ);
	filt_chrwdetach(kn, FALSE);
}
531
/* f_detach for EVFILT_WRITE: defer to the common detach handler */
static void
filt_chwdetach(struct knote *kn)
{
	ASSERT(kn->kn_filter == EVFILT_WRITE);
	filt_chrwdetach(kn, TRUE);
}
538
/*
 * callback from notifies (generated externally).
 * This always marks the knote activated, so always
 * return 1.
 */
static int
filt_chrw(struct knote *kn, long hint, int events)
{
#if SK_LOG
	struct kern_channel *ch = (struct kern_channel *__single)
	    knote_kn_hook_get_raw(kn);
#else
#pragma unused(kn)
#pragma unused(hint)
#pragma unused(events)
#endif
	/*
	 * NOTE(review): `ch' exists only under SK_LOG, yet SK_DF below
	 * references it unconditionally — presumably SK_DF compiles to
	 * nothing when !SK_LOG; confirm against its definition.
	 */
	SK_DF(SK_VERB_EVENTS, "na \"%s\" (%p) ch %p "
	    "kn %p (%s%s) hint 0x%x", ch->ch_na->na_name,
	    SK_KVA(ch->ch_na), SK_KVA(ch), SK_KVA(kn),
	    (kn->kn_flags & EV_POLL) ? "poll," : "",
	    (events == POLLOUT) ? "write" : "read",
	    (uint32_t)hint);

	/* assume we are ready */
	return 1;
}
565
566 static int
filt_chread(struct knote * kn,long hint)567 filt_chread(struct knote *kn, long hint)
568 {
569 ASSERT(kn->kn_filter == EVFILT_READ);
570 /* There is no hint for read/write event */
571 if (hint != 0) {
572 return 0;
573 }
574 return filt_chrw(kn, hint, POLLIN);
575 }
576
577 static int
filt_chwrite(struct knote * kn,long hint)578 filt_chwrite(struct knote *kn, long hint)
579 {
580 ASSERT(kn->kn_filter == EVFILT_WRITE);
581 /* There is no hint for read/write event */
582 if (hint != 0) {
583 return 0;
584 }
585 return filt_chrw(kn, hint, POLLOUT);
586 }
587
/*
 * Common f_touch for EVFILT_READ/EVFILT_WRITE: absorb the new kevent
 * parameters, validate any NOTE_LOWAT threshold, then re-evaluate the
 * channel's readiness for the direction implied by `events'.
 */
static int
filt_chtouch(struct knote *kn, struct kevent_qos_s *kev, int events)
{
#pragma unused(kev)
	/*
	 * -fbounds-safety: This seems like an example of interop with code that
	 * has -fbounds-safety disabled, which means we can use __unsafe_forge_*
	 */
	struct kern_channel *ch = __unsafe_forge_single(struct kern_channel *,
	    knote_kn_hook_get_raw(kn));
	int ev = kn->kn_filter;
	enum txrx dir = (ev == EVFILT_WRITE) ? NR_TX : NR_RX;
	int event_error = 0;
	int revents;

	/* save off the new input fflags and data */
	kn->kn_sfflags = kev->fflags;
	kn->kn_sdata = kev->data;

	lck_mtx_lock(&ch->ch_lock);
	if (__improbable(ch_filt_check_defunct(ch, kn))) {
		/* defunct: EV_EOF was set; report the knote active */
		lck_mtx_unlock(&ch->ch_lock);
		return 1;
	}

	/* if a note-specific low watermark is given, validate it */
	if (kn->kn_sfflags & NOTE_LOWAT) {
		struct ch_ev_thresh note_thresh = {
			.cet_unit = (dir == NR_TX) ?
			ch->ch_info->cinfo_tx_lowat.cet_unit :
			ch->ch_info->cinfo_rx_lowat.cet_unit,
			.cet_value = (uint32_t)kn->kn_sdata
		};
		if (ch_ev_thresh_validate(ch->ch_na->na_nx, dir,
		    &note_thresh) != 0) {
			SK_ERR("invalid NOTE_LOWAT threshold %u",
			    note_thresh.cet_value);
			knote_set_error(kn, EINVAL);
			lck_mtx_unlock(&ch->ch_lock);
			return 1;
		}
	}

	/* capture new state just so we can return it */
	revents = ch_event(ch, events, NULL, knote_get_kq(kn)->kq_p, NULL, TRUE,
	    &event_error, FALSE);
	lck_mtx_unlock(&ch->ch_lock);

	if (revents & POLLERR) {
		ASSERT(event_error != 0);
		/*
		 * Setting a knote error here will confuse libdispatch, so we
		 * use EV_EOF instead.
		 */
		kn->kn_flags |= EV_EOF;
		return 1;
	} else {
		return (events & revents) != 0;
	}
}
648
/*
 * f_touch for EVFILT_READ: trace EV_ENABLE transitions for kdebug,
 * then defer to the common read/write touch handler.
 */
static int
filt_chrtouch(struct knote *kn, struct kevent_qos_s *kev)
{
	ASSERT(kn->kn_filter == EVFILT_READ);

	if (kev->flags & EV_ENABLE) {
		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KNOTE_ENABLE),
		    kn->kn_udata, kn->kn_status | (kn->kn_id << 32),
		    kn->kn_filtid, VM_KERNEL_UNSLIDE_OR_PERM(
			    ((struct kern_channel *)knote_kn_hook_get_raw(kn))->ch_na));
	}

	return filt_chtouch(kn, kev, POLLIN);
}
663
/* f_touch for EVFILT_WRITE: defer to the common touch handler */
static int
filt_chwtouch(struct knote *kn, struct kevent_qos_s *kev)
{
	ASSERT(kn->kn_filter == EVFILT_WRITE);
	return filt_chtouch(kn, kev, POLLOUT);
}
670
671
/*
 * Called from kevent. We call ch_event(POLL[IN|OUT]) and
 * return 0/1 accordingly.
 */
static int
filt_chprocess(struct knote *kn, struct kevent_qos_s *kev, int events)
{
	/*
	 * -fbounds-safety: This seems like an example of interop with code that
	 * has -fbounds-safety disabled, which means we can use __unsafe_forge_*
	 */
	struct kern_channel *ch = __unsafe_forge_single(struct kern_channel *,
	    knote_kn_hook_get_raw(kn));
	struct ch_event_result result;
	uint32_t lowat;
	int trigger_event = 1;
	int revents;
	int event_error;
	int64_t data;

	lck_mtx_lock(&ch->ch_lock);
	if (__improbable(ch_filt_check_defunct(ch, kn))) {
		/* defunct: deliver EV_EOF with zero data */
		knote_fill_kevent(kn, kev, 0);
		lck_mtx_unlock(&ch->ch_lock);
		return 1;
	}

	revents = ch_event(ch, events, NULL, knote_get_kq(kn)->kq_p, &result,
	    TRUE, &event_error, FALSE);

	if (revents & POLLERR) {
		ASSERT(event_error != 0);
		lck_mtx_unlock(&ch->ch_lock);
		/*
		 * Setting a knote error here will confuse libdispatch, so we
		 * use EV_EOF instead.
		 */
		kn->kn_flags |= EV_EOF;
		knote_fill_kevent_with_sdata(kn, kev);
		return 1;
	}

	trigger_event = (events & revents) != 0;

	if (events == POLLOUT) {
		/*
		 * TX: the effective low watermark is the channel's, unless
		 * the knote requested a higher NOTE_LOWAT threshold.
		 */
		lowat = ch->ch_info->cinfo_tx_lowat.cet_value;
		if ((kn->kn_sfflags & NOTE_LOWAT) &&
		    kn->kn_sdata > lowat) {
			lowat = (uint32_t)kn->kn_sdata;
		}

		data = result.tx_data;

		/* below the watermark: suppress the event */
		if (result.tx_data < lowat) {
			trigger_event = 0;
		}
	} else {
		/* RX: same watermark logic as the TX leg above */
		lowat = ch->ch_info->cinfo_rx_lowat.cet_value;
		if ((kn->kn_sfflags & NOTE_LOWAT) &&
		    kn->kn_sdata > lowat) {
			lowat = (uint32_t)kn->kn_sdata;
		}

		data = result.rx_data;

		if (result.rx_data < lowat) {
			trigger_event = 0;
		}
	}

	if (trigger_event) {
		knote_fill_kevent(kn, kev, data);
	}

	lck_mtx_unlock(&ch->ch_lock);

	return trigger_event;
}
750
/* f_process for EVFILT_READ */
static int
filt_chrprocess(struct knote *kn, struct kevent_qos_s *kev)
{
	ASSERT(kn->kn_filter == EVFILT_READ);
	return filt_chprocess(kn, kev, POLLIN);
}
757
/* f_process for EVFILT_WRITE */
static int
filt_chwprocess(struct knote *kn, struct kevent_qos_s *kev)
{
	ASSERT(kn->kn_filter == EVFILT_WRITE);
	return filt_chprocess(kn, kev, POLLOUT);
}
764
/*
 * Common f_attach for EVFILT_READ/EVFILT_WRITE: validate NOTE_LOWAT,
 * hook the knote onto the per-direction selinfo, and return whether
 * the channel is already ready in that direction.
 */
static int
filt_chrwattach(struct knote *kn, __unused struct kevent_qos_s *kev)
{
	struct kern_channel *ch = (struct kern_channel *)knote_kn_hook_get_raw(kn);
	struct nexus_adapter *na;
	struct ch_selinfo *csi;
	int ev = kn->kn_filter;
	enum txrx dir = (ev == EVFILT_WRITE) ? NR_TX : NR_RX;
	int revents;
	int events;
	int event_error = 0;

	ASSERT((kn->kn_filter == EVFILT_READ) ||
	    (kn->kn_filter == EVFILT_WRITE));

	/* ch_kqfilter() should have acquired the lock */
	LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);

	na = ch->ch_na;
	/* if a note-specific low watermark is given, validate it */
	if (kn->kn_sfflags & NOTE_LOWAT) {
		struct ch_ev_thresh note_thresh = {
			.cet_unit = (dir == NR_TX) ?
			ch->ch_info->cinfo_tx_lowat.cet_unit :
			ch->ch_info->cinfo_rx_lowat.cet_unit,
			.cet_value = (uint32_t)kn->kn_sdata
		};
		if (ch_ev_thresh_validate(ch->ch_na->na_nx, dir,
		    &note_thresh) != 0) {
			SK_ERR("invalid NOTE_LOWAT threshold %u",
			    note_thresh.cet_value);
			knote_set_error(kn, EINVAL);
			return 0;
		}
	}

	/* the si is indicated in the channel */
	csi = ch->ch_si[dir];
	CSI_LOCK(csi);

	/* first knote on the list: mark the selinfo as having knotes */
	if (KNOTE_ATTACH(&csi->csi_si.si_note, kn)) {
		os_atomic_or(&csi->csi_flags, CSI_KNOTE, relaxed);
	}

	CSI_UNLOCK(csi);

	SK_DF(SK_VERB_EVENTS, "na \"%s\" (%p) ch %p kn %p (%s%s)",
	    na->na_name, SK_KVA(na), SK_KVA(ch), SK_KVA(kn),
	    (kn->kn_flags & EV_POLL) ? "poll," : "",
	    (ev == EVFILT_WRITE) ? "write" : "read");

	/* capture current state */
	events = (ev == EVFILT_WRITE) ? POLLOUT : POLLIN;

	if (__improbable(ch_filt_check_defunct(ch, kn))) {
		/* defunct: report ready so EV_EOF gets delivered */
		revents = events;
	} else {
		/* filt_chprocess() will fill in the kn_sdata field */
		revents = ch_event(ch, events, NULL, knote_get_kq(kn)->kq_p,
		    NULL, TRUE, &event_error, FALSE);
	}

	if (revents & POLLERR) {
		ASSERT(event_error != 0);
		kn->kn_flags |= EV_EOF;
		return 1;
	} else {
		return (events & revents) != 0;
	}
}
835
836 static int
filt_chan_extended_common(struct knote * kn,long ev_hint)837 filt_chan_extended_common(struct knote *kn, long ev_hint)
838 {
839 /*
840 * This function is not always called with the same set of locks held,
841 * hence it is only allowed to manipulate kn_fflags, with atomics.
842 *
843 * the f_event / f_process functions may run concurrently.
844 */
845 uint32_t add_fflags = 0;
846
847 if ((ev_hint & CHAN_FILT_HINT_FLOW_ADV_UPD) != 0) {
848 add_fflags |= NOTE_FLOW_ADV_UPDATE;
849 }
850 if ((ev_hint & CHAN_FILT_HINT_CHANNEL_EVENT) != 0) {
851 add_fflags |= NOTE_CHANNEL_EVENT;
852 }
853 if ((ev_hint & CHAN_FILT_HINT_IF_ADV_UPD) != 0) {
854 add_fflags |= NOTE_IF_ADV_UPD;
855 }
856 if (add_fflags) {
857 /* Reset any events that are not requested on this knote */
858 add_fflags &= (kn->kn_sfflags & EVFILT_NW_CHANNEL_ALL_MASK);
859 os_atomic_or(&kn->kn_fflags, add_fflags, relaxed);
860 return add_fflags != 0;
861 }
862 return os_atomic_load(&kn->kn_fflags, relaxed) != 0;
863 }
864
/*
 * Helper for the EVFILT_NW_CHANNEL filter: sync the event ring via
 * ch_event(POLLIN) and set or clear CHAN_FILT_HINT_CHANNEL_EVENT in
 * *hint (and in kn_fflags) to reflect whether events are pending.
 */
static inline void
che_process_channel_event(struct kern_channel *ch, struct knote *kn,
    uint32_t fflags, long *hint)
{
	int revents, event_error = 0;

	LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
	*hint &= ~CHAN_FILT_HINT_CHANNEL_EVENT;

	/* only channels with an event ring, and only when requested */
	if (((ch->ch_flags & CHANF_EVENT_RING) != 0) &&
	    ((fflags & NOTE_CHANNEL_EVENT) != 0)) {
		/* capture new state to return */
		revents = ch_event(ch, POLLIN, NULL, knote_get_kq(kn)->kq_p,
		    NULL, TRUE, &event_error, TRUE);
		if (revents & POLLERR) {
			ASSERT(event_error != 0);
			/*
			 * Setting a knote error here will confuse libdispatch,
			 * so we use EV_EOF instead.
			 */
			kn->kn_flags |= EV_EOF;
		} else if ((revents & POLLIN) != 0) {
			*hint |= CHAN_FILT_HINT_CHANNEL_EVENT;
		}
	}
	/*
	 * if the sync operation on event ring didn't find any events
	 * then indicate that the channel event is not active.
	 */
	if ((*hint & CHAN_FILT_HINT_CHANNEL_EVENT) == 0) {
		/*
		 * Avoid a costly atomic when the bit is already cleared.
		 */
		uint32_t knfflags = os_atomic_load(&kn->kn_fflags, relaxed);
		if (knfflags & CHAN_FILT_HINT_CHANNEL_EVENT) {
			os_atomic_andnot(&kn->kn_fflags,
			    CHAN_FILT_HINT_CHANNEL_EVENT, relaxed);
		}
	}
}
905
/*
 * f_attach for EVFILT_NW_CHANNEL: hook the knote onto the TX selinfo
 * and compute its initial activation state.
 */
static int
filt_che_attach(struct knote *kn, __unused struct kevent_qos_s *kev)
{
	struct kern_channel *ch = (struct kern_channel *)knote_kn_hook_get_raw(kn);
	struct ch_selinfo *csi;
	long hint = 0;

	/* internal hint bits must alias the user-visible NOTE_* values */
	static_assert(CHAN_FILT_HINT_FLOW_ADV_UPD == NOTE_FLOW_ADV_UPDATE);
	static_assert(CHAN_FILT_HINT_CHANNEL_EVENT == NOTE_CHANNEL_EVENT);
	static_assert(CHAN_FILT_HINT_IF_ADV_UPD == NOTE_IF_ADV_UPD);

	ASSERT(kn->kn_filter == EVFILT_NW_CHANNEL);

	/* ch_kqfilter() should have acquired the lock */
	LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);

	/* extended-filter knotes hang off the TX selinfo */
	csi = ch->ch_si[NR_TX];
	CSI_LOCK(csi);
	if (KNOTE_ATTACH(&csi->csi_si.si_note, kn)) {
		os_atomic_or(&csi->csi_flags, CSI_KNOTE, relaxed);
	}
	CSI_UNLOCK(csi);

	if (__improbable(ch_filt_check_defunct(ch, kn))) {
		return 1;
	}
	if ((kn->kn_sfflags & NOTE_CHANNEL_EVENT) != 0) {
		os_atomic_or(&ch->ch_na->na_flags, NAF_CHANNEL_EVENT_ATTACHED, relaxed);
	}
	che_process_channel_event(ch, kn, kn->kn_sfflags, &hint);
	if ((kn->kn_sfflags & NOTE_FLOW_ADV_UPDATE) != 0) {
		/* on registration force an event */
		hint |= CHAN_FILT_HINT_FLOW_ADV_UPD;
	}
	SK_DF(SK_VERB_EVENTS, "na \"%s\" (%p) ch %p kn %p (%s)",
	    ch->ch_na->na_name, SK_KVA(ch->ch_na), SK_KVA(ch), SK_KVA(kn),
	    "EVFILT_NW_CHANNEL");
	return filt_chan_extended_common(kn, hint);
}
945
/*
 * f_detach for EVFILT_NW_CHANNEL: drop the adapter's channel-event
 * registration and unhook the knote from the TX selinfo.
 */
static void
filt_che_detach(struct knote *kn)
{
	struct kern_channel *ch = (struct kern_channel *)knote_kn_hook_get_raw(kn);
	struct ch_selinfo *csi;

	ASSERT(kn->kn_filter == EVFILT_NW_CHANNEL);

	lck_mtx_lock(&ch->ch_lock);
	if ((kn->kn_sfflags & NOTE_CHANNEL_EVENT) != 0) {
		os_atomic_andnot(&ch->ch_na->na_flags,
		    NAF_CHANNEL_EVENT_ATTACHED, relaxed);
	}
	csi = ch->ch_si[NR_TX];
	CSI_LOCK(csi);
	/* on last detach, clear CSI_KNOTE so wakeups skip KNOTE() */
	if (KNOTE_DETACH(&csi->csi_si.si_note, kn)) {
		os_atomic_andnot(&csi->csi_flags, CSI_KNOTE, relaxed);
	}
	CSI_UNLOCK(csi);
	lck_mtx_unlock(&ch->ch_lock);

	SK_DF(SK_VERB_EVENTS, "na \"%s\" (%p) ch %p kn %p (%s)",
	    ch->ch_na->na_name, SK_KVA(ch->ch_na), SK_KVA(ch), SK_KVA(kn),
	    "EVFILT_NW_CHANNEL");
}
971
972 static int
filt_che_event(struct knote * kn,long hint)973 filt_che_event(struct knote *kn, long hint)
974 {
975 struct kern_channel *ch = (struct kern_channel *)knote_kn_hook_get_raw(kn);
976
977 ASSERT(kn->kn_filter == EVFILT_NW_CHANNEL);
978 if (hint == 0) {
979 return 0;
980 }
981 if (__improbable(ch_filt_check_defunct(ch, NULL))) {
982 return 1;
983 }
984 if ((hint & CHAN_FILT_HINT_CHANNEL_EVENT) != 0) {
985 VERIFY((ch->ch_flags & CHANF_EVENT_RING) != 0);
986 }
987 SK_DF(SK_VERB_EVENTS, "na \"%s\" (%p) ch %p hint 0x%lx)",
988 ch->ch_na->na_name, SK_KVA(ch->ch_na), SK_KVA(ch), hint);
989 return filt_chan_extended_common(kn, hint);
990 }
991
/*
 * Touch handler for EVFILT_NW_CHANNEL knotes: userspace re-registered
 * the event with new fflags/data.  Refreshes the saved filter state,
 * toggles the adapter's channel-event interest on EV_ENABLE/EV_DISABLE,
 * and re-evaluates whether the knote should fire.
 * Returns non-zero when the knote is active.
 */
static int
filt_che_touch(struct knote *kn, struct kevent_qos_s *kev)
{
	int ret;
	long hint = 0;
	/* channel pointer was stashed in the knote hook by ch_kqfilter() */
	struct kern_channel *ch = (struct kern_channel *)knote_kn_hook_get_raw(kn);

	ASSERT(kn->kn_filter == EVFILT_NW_CHANNEL);
	/* save off the new input fflags and data */
	kn->kn_sfflags = kev->fflags;
	kn->kn_sdata = kev->data;

	lck_mtx_lock(&ch->ch_lock);
	if (__improbable(ch_filt_check_defunct(ch, kn))) {
		/* defunct: report active so userspace observes it */
		ret = 1;
		goto done;
	}
	if ((kn->kn_sfflags & NOTE_CHANNEL_EVENT) != 0) {
		/* mirror the enable/disable state onto the adapter flags */
		if (kev->flags & EV_ENABLE) {
			os_atomic_or(&ch->ch_na->na_flags,
			    NAF_CHANNEL_EVENT_ATTACHED, relaxed);
		} else if (kev->flags & EV_DISABLE) {
			os_atomic_andnot(&ch->ch_na->na_flags,
			    NAF_CHANNEL_EVENT_ATTACHED, relaxed);
		}
	}
	che_process_channel_event(ch, kn, kn->kn_sfflags, &hint);
	ret = filt_chan_extended_common(kn, hint);
done:
	lck_mtx_unlock(&ch->ch_lock);
	return ret;
}
1024
/*
 * Process handler for EVFILT_NW_CHANNEL knotes: called when kevent(2)
 * is about to deliver the event.  Re-evaluates readiness under the
 * channel lock and, if active, fills in the outgoing kevent.
 * Returns non-zero when an event was delivered.
 */
static int
filt_che_process(struct knote *kn, struct kevent_qos_s *kev)
{
	int ret;
	long hint = 0;

	/*
	 * -fbounds-safety: This seems like an example of interop with code that
	 * has -fbounds-safety disabled, which means we can use __unsafe_forge_*
	 */
	struct kern_channel *ch = __unsafe_forge_single(struct kern_channel *,
	    knote_kn_hook_get_raw(kn));

	ASSERT(kn->kn_filter == EVFILT_NW_CHANNEL);
	lck_mtx_lock(&ch->ch_lock);
	if (__improbable(ch_filt_check_defunct(ch, kn))) {
		/* defunct: still deliver so userspace observes it */
		ret = 1;
		goto done;
	}
	che_process_channel_event(ch, kn, kn->kn_sfflags, &hint);
	ret = filt_chan_extended_common(kn, hint);
done:
	lck_mtx_unlock(&ch->ch_lock);
	if (ret != 0) {
		/*
		 * This filter historically behaves like EV_CLEAR,
		 * even when EV_CLEAR wasn't set.
		 */
		knote_fill_kevent(kn, kev, 0);
		kn->kn_fflags = 0;
	}
	return ret;
}
1058
/*
 * kqueue attach entry point for a channel file descriptor.  Validates
 * the channel/adapter state, routes the knote to the read, write or
 * extended (EVFILT_NW_CHANNEL) sub-filter, stashes the channel in the
 * knote hook, and invokes the sub-filter's f_attach with ch_lock held.
 * Returns the sub-filter's attach result, or 0 with a knote error set
 * (ENXIO/EINVAL) on failure.
 */
int
ch_kqfilter(struct kern_channel *ch, struct knote *kn,
    struct kevent_qos_s *kev)
{
	SK_LOG_VAR(char dbgbuf[CH_DBGBUF_SIZE]);
	int result;

	lck_mtx_lock(&ch->ch_lock);
	/* kernel-owned channels never reach the kqueue path */
	VERIFY(!(ch->ch_flags & CHANF_KERNEL));

	if (__improbable(ch->ch_na == NULL || !NA_IS_ACTIVE(ch->ch_na) ||
	    na_reject_channel(ch, ch->ch_na))) {
		SK_ERR("channel is non-permissive %s",
		    ch2str(ch, dbgbuf, sizeof(dbgbuf)));
		knote_set_error(kn, ENXIO);
		lck_mtx_unlock(&ch->ch_lock);
		return 0;
	}

	/* pick the sub-filter implementation for this filter type */
	switch (kn->kn_filter) {
	case EVFILT_READ:
		kn->kn_filtid = EVFILTID_SKYWALK_CHANNEL_R;
		break;

	case EVFILT_WRITE:
		kn->kn_filtid = EVFILTID_SKYWALK_CHANNEL_W;
		break;

	case EVFILT_NW_CHANNEL:
		kn->kn_filtid = EVFILTID_SKYWALK_CHANNEL_E;
		break;

	default:
		lck_mtx_unlock(&ch->ch_lock);
		SK_ERR("%s(%d): bad filter request %d", ch->ch_name,
		    ch->ch_pid, kn->kn_filter);
		knote_set_error(kn, EINVAL);
		return 0;
	}

	/* sub-filters retrieve the channel via knote_kn_hook_get_raw() */
	knote_kn_hook_set_raw(kn, ch);
	/* call the appropriate sub-filter attach with the channel lock held */
	result = knote_fops(kn)->f_attach(kn, kev);
	lck_mtx_unlock(&ch->ch_lock);
	return result;
}
1105
1106 boolean_t
ch_is_multiplex(struct kern_channel * ch,enum txrx t)1107 ch_is_multiplex(struct kern_channel *ch, enum txrx t)
1108 {
1109 return ch->ch_na != NULL && (ch->ch_last[t] - ch->ch_first[t] > 1);
1110 }
1111
1112 int
ch_select(struct kern_channel * ch,int events,void * wql,struct proc * p)1113 ch_select(struct kern_channel *ch, int events, void *wql, struct proc *p)
1114 {
1115 int revents;
1116 int event_error = 0;
1117
1118 lck_mtx_lock(&ch->ch_lock);
1119 revents = ch_event(ch, events, wql, p, NULL, FALSE, &event_error,
1120 FALSE);
1121 lck_mtx_unlock(&ch->ch_lock);
1122
1123 ASSERT((revents & POLLERR) == 0 || event_error != 0);
1124
1125 return revents;
1126 }
1127
#if SK_LOG
/* Hoisted out of line to reduce kernel stack footprint */
SK_LOG_ATTRIBUTE
static void
ch_event_log(const char *prefix, const struct kern_channel *ch,
    struct proc *p, const struct nexus_adapter *na,
    int events, int revents)
{
	/* one-line trace of a ch_event() entry/exit with its event masks */
	SK_DF(SK_VERB_EVENTS, "%s: na \"%s\" (%p) ch %p %s(%d) "
	    "th %p ev 0x%x rev 0x%x", prefix, na->na_name, SK_KVA(na),
	    SK_KVA(ch), sk_proc_name(p), sk_proc_pid(p),
	    SK_KVA(current_thread()), events, revents);
}
#endif /* SK_LOG */
1142
1143 /*
1144 * select(2), poll(2) and kevent(2) handlers for channels.
1145 *
1146 * Can be called for one or more rings. Return true the event mask
1147 * corresponding to ready events. If there are no ready events, do
1148 * a selrecord on either individual selinfo or on the global one.
1149 * Device-dependent parts (locking and sync of tx/rx rings)
1150 * are done through callbacks.
1151 */
1152 static int
ch_event(struct kern_channel * ch,int events,void * wql,struct proc * p,struct ch_event_result * result,const boolean_t is_kevent,int * errno,const boolean_t is_ch_event)1153 ch_event(struct kern_channel *ch, int events, void *wql,
1154 struct proc *p, struct ch_event_result *result,
1155 const boolean_t is_kevent, int *errno, const boolean_t is_ch_event)
1156 {
1157 struct nexus_adapter *na;
1158 struct __kern_channel_ring *kring;
1159 uint32_t i, check_all_tx, check_all_rx, want[NR_TXRX], revents = 0;
1160 uint32_t ready_tx_data = 0, ready_rx_data = 0;
1161 sk_protect_t protect = NULL;
1162
1163 #define want_tx want[NR_TX]
1164 #define want_rx want[NR_RX]
1165 /*
1166 * In order to avoid nested locks, we need to "double check"
1167 * txsync and rxsync if we decide to do a selrecord().
1168 * retry_tx (and retry_rx, later) prevent looping forever.
1169 */
1170 boolean_t retry_tx = TRUE, retry_rx = TRUE;
1171 int found, error = 0;
1172 int s;
1173
1174 net_update_uptime();
1175
1176 LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
1177 ASSERT(!(ch->ch_flags & CHANF_KERNEL));
1178
1179 *errno = 0;
1180
1181 if (__improbable((ch->ch_flags & CHANF_DEFUNCT) ||
1182 ch->ch_schema == NULL)) {
1183 SK_ERR("%s(%d): channel is defunct or no longer bound",
1184 ch->ch_name, ch->ch_pid);
1185 revents = POLLERR;
1186 *errno = ENXIO;
1187 goto done;
1188 }
1189
1190 /* clear CHANF_DEFUNCT_SKIP if it was set during defunct last time */
1191 if (__improbable(ch->ch_flags & CHANF_DEFUNCT_SKIP)) {
1192 os_atomic_andnot(&ch->ch_flags, CHANF_DEFUNCT_SKIP, relaxed);
1193 }
1194
1195 na = ch->ch_na;
1196 if (__improbable(na == NULL ||
1197 !NA_IS_ACTIVE(na) || na_reject_channel(ch, na))) {
1198 SK_ERR("%s(%d): channel is non-permissive",
1199 ch->ch_name, ch->ch_pid);
1200 revents = POLLERR;
1201 *errno = ENXIO;
1202 goto done;
1203 }
1204
1205 /* mark thread with sync-in-progress flag */
1206 protect = sk_sync_protect();
1207
1208 /* update our work timestamp */
1209 na->na_work_ts = net_uptime();
1210
1211 /* and make this channel eligible for draining again */
1212 if (na->na_flags & NAF_DRAINING) {
1213 os_atomic_andnot(&na->na_flags, NAF_DRAINING, relaxed);
1214 }
1215
1216 #if SK_LOG
1217 if (__improbable((sk_verbose & SK_VERB_EVENTS) != 0)) {
1218 ch_event_log("enter", ch, p, na, events, revents);
1219 }
1220 #endif
1221 if (is_ch_event) {
1222 goto process_channel_event;
1223 }
1224
1225 want_tx = (events & (POLLOUT | POLLWRNORM));
1226 want_rx = (events & (POLLIN | POLLRDNORM));
1227
1228 /*
1229 * check_all_{tx|rx} are set if the channel has more than one ring
1230 * AND the file descriptor is bound to all of them. If so, we sleep
1231 * on the "global" selinfo, otherwise we sleep on individual selinfo
1232 * The interrupt routine in the driver wake one or the other (or both)
1233 * depending on which clients are active.
1234 *
1235 * rxsync() is only called if we run out of buffers on a POLLIN.
1236 * txsync() is called if we run out of buffers on POLLOUT.
1237 */
1238 check_all_tx = ch_is_multiplex(ch, NR_TX);
1239 check_all_rx = ch_is_multiplex(ch, NR_RX);
1240
1241 /*
1242 * If want_tx is still set, we must issue txsync calls
1243 * (on all rings, to avoid that the tx rings stall).
1244 * XXX should also check head != khead on the tx rings.
1245 */
1246 if (want_tx) {
1247 ring_id_t first_tx = ch->ch_first[NR_TX];
1248 ring_id_t last_tx = ch->ch_last[NR_TX];
1249
1250 channel_threshold_unit_t tx_unit =
1251 ch->ch_info->cinfo_tx_lowat.cet_unit;
1252
1253 /*
1254 * The first round checks if anyone is ready, if not
1255 * do a selrecord and another round to handle races.
1256 * want_tx goes to 0 if any space is found, and is
1257 * used to skip rings with no pending transmissions.
1258 */
1259 flush_tx:
1260 for (i = first_tx, ready_tx_data = 0; i < last_tx; i++) {
1261 kring = &na->na_tx_rings[i];
1262 if (!want_tx &&
1263 kring->ckr_ring->ring_head == kring->ckr_khead) {
1264 continue;
1265 }
1266
1267 /* only one thread does txsync */
1268 s = kr_enter(kring, TRUE);
1269 ASSERT(s == 0);
1270
1271 error = 0;
1272 DTRACE_SKYWALK2(pretxprologue, struct kern_channel *,
1273 ch, struct __kern_channel_ring *, kring);
1274 if (kr_txsync_prologue(ch, kring, p) >=
1275 kring->ckr_num_slots) {
1276 kr_log_bad_ring(kring);
1277 revents |= POLLERR;
1278 error = EFAULT;
1279 if (*errno == 0) {
1280 *errno = EFAULT;
1281 }
1282 } else {
1283 if (kring->ckr_na_sync(kring, p, 0)) {
1284 revents |= POLLERR;
1285 error = EIO;
1286 if (*errno == 0) {
1287 *errno = EIO;
1288 }
1289 } else {
1290 kr_txsync_finalize(ch, kring, p);
1291 }
1292 }
1293 DTRACE_SKYWALK3(posttxfinalize, struct kern_channel *,
1294 ch, struct __kern_channel_ring *, kring, int,
1295 error);
1296
1297 /*
1298 * If we found new slots, notify potential listeners on
1299 * the same ring. Since we just did a txsync, look at
1300 * the copies of cur,tail in the kring.
1301 */
1302 found = kring->ckr_rhead != kring->ckr_rtail;
1303 kr_exit(kring);
1304 if (found) { /* notify other listeners */
1305 revents |= want_tx;
1306 want_tx = 0;
1307 (void) kring->ckr_na_notify(kring, p,
1308 (is_kevent ? NA_NOTEF_IN_KEVENT : 0));
1309 }
1310
1311 /*
1312 * Add this ring's free data to our running
1313 * tally for userspace.
1314 */
1315 if (result != NULL) {
1316 switch (tx_unit) {
1317 case CHANNEL_THRESHOLD_UNIT_BYTES:
1318 ready_tx_data += kring->ckr_ready_bytes;
1319 break;
1320 case CHANNEL_THRESHOLD_UNIT_SLOTS:
1321 ready_tx_data += kring->ckr_ready_slots;
1322 break;
1323 }
1324 }
1325 }
1326 if (want_tx && retry_tx && !is_kevent) {
1327 if (check_all_tx) {
1328 csi_selrecord_all(na, NR_TX, p, wql);
1329 } else {
1330 csi_selrecord_one(&na->na_tx_rings[first_tx],
1331 p, wql);
1332 }
1333 retry_tx = FALSE;
1334 goto flush_tx;
1335 }
1336 }
1337
1338 /*
1339 * If want_rx is still set scan receive rings.
1340 * Do it on all rings because otherwise we starve.
1341 */
1342 if (want_rx) {
1343 ring_id_t first_rx = ch->ch_first[NR_RX];
1344 ring_id_t last_rx = ch->ch_last[NR_RX];
1345 channel_threshold_unit_t rx_unit =
1346 ch->ch_info->cinfo_rx_lowat.cet_unit;
1347
1348 /* two rounds here for race avoidance */
1349 do_retry_rx:
1350 for (i = first_rx, ready_rx_data = 0; i < last_rx; i++) {
1351 kring = &na->na_rx_rings[i];
1352
1353 /* only one thread does rxsync */
1354 s = kr_enter(kring, TRUE);
1355 ASSERT(s == 0);
1356
1357 error = 0;
1358 DTRACE_SKYWALK2(prerxprologue, struct kern_channel *,
1359 ch, struct __kern_channel_ring *, kring);
1360 if (kr_rxsync_prologue(ch, kring, p) >=
1361 kring->ckr_num_slots) {
1362 kr_log_bad_ring(kring);
1363 revents |= POLLERR;
1364 error = EFAULT;
1365 if (*errno == 0) {
1366 *errno = EFAULT;
1367 }
1368 } else {
1369 /* now we can use kring->rhead, rtail */
1370 if (kring->ckr_na_sync(kring, p, 0)) {
1371 revents |= POLLERR;
1372 error = EIO;
1373 if (*errno == 0) {
1374 *errno = EIO;
1375 }
1376 } else {
1377 kr_rxsync_finalize(ch, kring, p);
1378 }
1379 }
1380
1381 DTRACE_SKYWALK3(postrxfinalize, struct kern_channel *,
1382 ch, struct __kern_channel_ring *, kring, int,
1383 error);
1384
1385 found = kring->ckr_rhead != kring->ckr_rtail;
1386 kr_exit(kring);
1387 if (found) {
1388 revents |= want_rx;
1389 retry_rx = FALSE;
1390 (void) kring->ckr_na_notify(kring, p,
1391 (is_kevent ? NA_NOTEF_IN_KEVENT : 0));
1392 }
1393
1394 /*
1395 * Add this ring's readable data to our running
1396 * tally for userspace.
1397 */
1398 if (result != NULL) {
1399 switch (rx_unit) {
1400 case CHANNEL_THRESHOLD_UNIT_BYTES:
1401 ready_rx_data += kring->ckr_ready_bytes;
1402 break;
1403 case CHANNEL_THRESHOLD_UNIT_SLOTS:
1404 ready_rx_data += kring->ckr_ready_slots;
1405 break;
1406 }
1407 }
1408 }
1409
1410 if (retry_rx && !is_kevent) {
1411 if (check_all_rx) {
1412 csi_selrecord_all(na, NR_RX, p, wql);
1413 } else {
1414 csi_selrecord_one(&na->na_rx_rings[first_rx],
1415 p, wql);
1416 }
1417 }
1418 if (retry_rx) {
1419 retry_rx = FALSE;
1420 goto do_retry_rx;
1421 }
1422 }
1423
1424 if (result != NULL) {
1425 result->tx_data = ready_tx_data;
1426 result->rx_data = ready_rx_data;
1427 }
1428 goto skip_channel_event;
1429
1430 process_channel_event:
1431 /*
1432 * perform sync operation on the event ring to make the channel
1433 * events enqueued in the ring visible to user-space.
1434 */
1435
1436 /* select() and poll() not supported for event ring */
1437 ASSERT(is_kevent);
1438 VERIFY((ch->ch_last[NR_EV] - ch->ch_first[NR_EV]) == 1);
1439 kring = &na->na_event_rings[ch->ch_first[NR_EV]];
1440
1441 /* only one thread does the sync */
1442 s = kr_enter(kring, TRUE);
1443 ASSERT(s == 0);
1444 if (kr_event_sync_prologue(kring, p) >= kring->ckr_num_slots) {
1445 kr_log_bad_ring(kring);
1446 revents |= POLLERR;
1447 if (*errno == 0) {
1448 *errno = EFAULT;
1449 }
1450 } else {
1451 if (kring->ckr_na_sync(kring, p, 0)) {
1452 revents |= POLLERR;
1453 if (*errno == 0) {
1454 *errno = EIO;
1455 }
1456 } else {
1457 kr_event_sync_finalize(ch, kring, p);
1458 }
1459 }
1460 found = (kring->ckr_rhead != kring->ckr_rtail);
1461 kr_exit(kring);
1462 if (found) {
1463 revents |= (events & POLLIN);
1464 }
1465
1466 skip_channel_event:
1467 #if SK_LOG
1468 if (__improbable((sk_verbose & SK_VERB_EVENTS) != 0)) {
1469 ch_event_log("exit", ch, p, na, events, revents);
1470 }
1471 #endif /* SK_LOG */
1472
1473 /* unmark thread with sync-in-progress flag */
1474 sk_sync_unprotect(protect);
1475
1476 done:
1477 ASSERT(!sk_is_sync_protected());
1478
1479 return revents;
1480 #undef want_tx
1481 #undef want_rx
1482 }
1483
1484 static struct kern_channel *
ch_find(struct kern_nexus * nx,nexus_port_t port,ring_id_t ring_id)1485 ch_find(struct kern_nexus *nx, nexus_port_t port, ring_id_t ring_id)
1486 {
1487 struct kern_channel *ch;
1488
1489 SK_LOCK_ASSERT_HELD();
1490
1491 STAILQ_FOREACH(ch, &nx->nx_ch_head, ch_link) {
1492 struct ch_info *cinfo = ch->ch_info;
1493
1494 /* see comments in ch_open() */
1495 if (cinfo->cinfo_nx_port != port) {
1496 continue;
1497 } else if (cinfo->cinfo_ch_ring_id != CHANNEL_RING_ID_ANY &&
1498 ring_id != cinfo->cinfo_ch_ring_id &&
1499 ring_id != CHANNEL_RING_ID_ANY) {
1500 continue;
1501 }
1502
1503 /* found a match */
1504 break;
1505 }
1506
1507 if (ch != NULL) {
1508 ch_retain_locked(ch);
1509 }
1510
1511 return ch;
1512 }
1513
#if SK_LOG
/* Hoisted out of line to reduce kernel stack footprint */
SK_LOG_ATTRIBUTE
static void
ch_open_log1(const uuid_t p_uuid, struct proc *p, nexus_port_t port)
{
	uuid_string_t uuidstr;

	/* trace the opening process' identity and requested nexus port */
	SK_D("%s(%d) uniqueid %llu exec_uuid %s port %u",
	    sk_proc_name(p), sk_proc_pid(p), proc_uniqueid(p),
	    sk_uuid_unparse(p_uuid, uuidstr), port);
}
1526
SK_LOG_ATTRIBUTE
static void
ch_open_log2(struct proc *p, nexus_port_t port, ring_id_t ring,
    uint32_t mode, int err)
{
	/* trace the outcome of a ch_open() attempt */
	SK_D("%s(%d) port %u ring %d mode 0x%x err %d",
	    sk_proc_name(p), sk_proc_pid(p), port, (int)ring, mode, err);
}
#endif /* SK_LOG */
1536
/*
 * Open a user channel to the nexus identified by init->ci_nx_uuid on
 * behalf of process `p' (file descriptor `fd').  Validates the nexus,
 * port range and entitlements, builds an nxbind for non-anonymous
 * providers, rejects requests that conflict with an existing owner of
 * the {port, ring_id} tuple, and finally connects via ch_connect().
 *
 * Returns a channel with a reference held on success, or NULL with
 * *err set (ENOENT/EBUSY/EDOM/priv-check errno/connect errno).
 */
struct kern_channel *
ch_open(struct ch_init *init, struct proc *p, int fd, int *err)
{
	uint32_t mode = init->ci_ch_mode;
	nexus_port_t port = init->ci_nx_port;
	ring_id_t ring = init->ci_ch_ring_id;
	struct kern_channel *ch = NULL, *ch0 = NULL;
	struct nxbind *nxb = NULL;
	struct kern_nexus *nx;
	struct chreq chr;
	uuid_t p_uuid;
	kauth_cred_t cred;

	cred = kauth_cred_get();
	ASSERT(!uuid_is_null(init->ci_nx_uuid));
	proc_getexecutableuuid(p, p_uuid, sizeof(p_uuid));
	*err = 0;

	/* make sure we don't allow userland to set kernel-only flags */
	mode &= CHMODE_MASK;

	SK_LOCK();

	/* nx_find() returns the nexus with a reference held */
	nx = nx_find(init->ci_nx_uuid, TRUE);
	if (nx == NULL) {
		*err = ENOENT;
		goto done;
	}
	if ((nx->nx_flags & NXF_INVALIDATED) != 0) {
		*err = EBUSY;
		goto done;
	}

	/* port (zero-based) must be within the domain's range */
	if (port >= NXDOM_MAX(NX_DOM(nx), ports)) {
		*err = EDOM;
		goto done;
	}
	VERIFY(port != NEXUS_PORT_ANY);

	/* low-latency channels require an entitlement */
	if (mode & CHMODE_LOW_LATENCY) {
		if ((*err = skywalk_priv_check_cred(p, cred,
		    PRIV_SKYWALK_LOW_LATENCY_CHANNEL)) != 0) {
			goto done;
		}
	}

	/*
	 * Check with the nexus to see if the port is bound; if so, prepare
	 * our nxbind structure that we'll need to pass down to the nexus
	 * for it compare. If the caller provides a key, we take it over
	 * and will free it ourselves (as part of freeing nxbind.)
	 */
	if (!NX_ANONYMOUS_PROV(nx)) {
		/*
		 * -fbounds-safety: ci_key is user_addr_t (aka uint64_t), so
		 * can't mark it as __sized_by. Forge it instead.
		 */
		void *key = __unsafe_forge_bidi_indexable(void *, init->ci_key,
		    init->ci_key_len);

#if SK_LOG
		if (__improbable(sk_verbose != 0)) {
			ch_open_log1(p_uuid, p, port);
		}
#endif /* SK_LOG */

		nxb = nxb_alloc(Z_WAITOK);
		nxb->nxb_flags |= NXBF_MATCH_UNIQUEID;
		nxb->nxb_uniqueid = proc_uniqueid(p);
		nxb->nxb_pid = proc_pid(p);
		nxb->nxb_flags |= NXBF_MATCH_EXEC_UUID;
		uuid_copy(nxb->nxb_exec_uuid, p_uuid);
		if (key != NULL) {
			nxb->nxb_flags |= NXBF_MATCH_KEY;
			nxb->nxb_key_len = init->ci_key_len;
			nxb->nxb_key = key;
			init->ci_key = USER_ADDR_NULL; /* take over */
		}
	}

	/*
	 * There can only be one owner of {port,ring_id} tuple.
	 * CHANNEL_RING_ID_ANY (-1) ring_id gives exclusive rights over
	 * all rings. Further attempts to own any or all of the rings
	 * will be declined.
	 *
	 * For example, assuming a 2-rings setup for port 'p':
	 *
	 *	owner{p,-1}
	 *	will not allow:
	 *	owner{p,-1}, owner{p,0}, owner{p,1}
	 *
	 *	owner{p,0}
	 *	will allow:
	 *	owner{p,1}
	 *	will not allow:
	 *	owner{p,-1}, owner{p,0}
	 */
	if ((ch0 = ch_find(nx, port, ring)) != NULL) {
		SK_D("found ch0 %p", SK_KVA(ch0));
#if SK_LOG
		uuid_string_t uuidstr;
		char *na_name = (ch0->ch_na != NULL) ?
		    ch0->ch_na->na_name : "";

		SK_PERR(p, "ch %s flags (0x%x) exists on port %d on "
		    "nx %s, owner %s(%d)", na_name, ch0->ch_flags, port,
		    sk_uuid_unparse(nx->nx_uuid, uuidstr),
		    ch0->ch_name, ch0->ch_pid);
#endif /* SK_LOG */
		*err = EBUSY;
		goto done;
	}

	/* assemble the connect request from the validated parameters */
	bzero(&chr, sizeof(chr));
	chr.cr_tx_lowat = init->ci_tx_lowat;
	chr.cr_rx_lowat = init->ci_rx_lowat;
	chr.cr_port = port;
	chr.cr_mode = mode;
	chr.cr_ring_id = ring;

	/* upon success, returns a channel with reference held */
	ch = ch_connect(nx, &chr, nxb, p, fd, err);

done:

#if SK_LOG
	if (__improbable(sk_verbose != 0)) {
		ch_open_log2(p, port, ring, mode, *err);
	}
#endif /* SK_LOG */

	/* drop the reference taken by ch_find() */
	if (ch0 != NULL) {
		(void) ch_release_locked(ch0);
	}

	/* drop the reference taken by nx_find() */
	if (nx != NULL) {
		(void) nx_release_locked(nx);
	}

	/* ch_connect() makes its own copy; ours is no longer needed */
	if (nxb != NULL) {
		nxb_free(nxb);
	}

	SK_UNLOCK();

	return ch;
}
1686
/*
 * Open a kernel-only channel to nexus `nx' (CHMODE_KERNEL is forced
 * on).  When `nonxref' is TRUE the channel does not hold a nexus
 * reference (CHMODE_NO_NXREF), so the nexus can still be torn down
 * while it exists; see the comment below for the refcount dance.
 *
 * Returns a channel with a reference held, or NULL with *err set.
 * Caller must hold the global SK lock.
 */
struct kern_channel *
ch_open_special(struct kern_nexus *nx, struct chreq *chr, boolean_t nonxref,
    int *err)
{
	struct kern_channel *ch = NULL;

	SK_LOCK_ASSERT_HELD();
	if ((nx->nx_flags & NXF_INVALIDATED) != 0) {
		*err = EBUSY;
		goto done;
	}
	*err = 0;

	/* these modes are only meaningful for user channels */
	ASSERT((chr->cr_mode & CHMODE_USER_PACKET_POOL) == 0);
	ASSERT((chr->cr_mode & CHMODE_EVENT_RING) == 0);
	ASSERT((chr->cr_mode & CHMODE_LOW_LATENCY) == 0);
	ASSERT(!uuid_is_null(chr->cr_spec_uuid));
	chr->cr_mode |= CHMODE_KERNEL;
	if (nonxref) {
		chr->cr_mode |= CHMODE_NO_NXREF;
	} else {
		chr->cr_mode &= ~CHMODE_NO_NXREF;
	}

	/* upon success, returns a channel with reference held */
	ch = ch_connect(nx, chr, NULL, kernproc, -1, err);
	if (ch != NULL) {
		/*
		 * nonxref channels don't hold any reference to the nexus,
		 * since otherwise we'll never be able to close them when
		 * the last regular channel of the nexus is closed, as part
		 * of the nexus's destructor operation. Release the nonxref
		 * channel reference now, but make sure the nexus has at
		 * least 3 refs: global list, provider list and the nonxref
		 * channel itself, before doing that.
		 */
		if (nonxref) {
			ASSERT(ch->ch_flags & (CHANF_KERNEL | CHANF_NONXREF));
			ASSERT(nx->nx_refcnt > 3);
			(void) nx_release_locked(nx);
		}
	}

#if SK_LOG
	uuid_string_t uuidstr;
	const char * na_name = NULL;
	const char * nxdom_prov_name = NULL;

	if (ch != NULL && ch->ch_na != NULL) {
		na_name = ch->ch_na->na_name;
	}
	if (nx->nx_prov != NULL) {
		nxdom_prov_name = NX_DOM_PROV(nx)->nxdom_prov_name;
	}
	SK_D("nx %p (%s:\"%s\":%d:%d) spec_uuid \"%s\" mode 0x%x err %d",
	    SK_KVA(nx),
	    (nxdom_prov_name != NULL) ? nxdom_prov_name : "",
	    (na_name != NULL) ? na_name : "",
	    (int)chr->cr_port, (int)chr->cr_ring_id,
	    sk_uuid_unparse(chr->cr_spec_uuid, uuidstr), chr->cr_mode, *err);
#endif /* SK_LOG */

done:
	return ch;
}
1752
/*
 * Common close path shared by ch_close() (user channels) and
 * ch_close_special() (kernel channels).  Removes the channel from the
 * interface-advisory list if needed, marks it CHANF_CLOSING, and if it
 * is still attached, disconnects it from the nexus and unlinks it from
 * the nexus channel list, dropping the list reference.
 *
 * `locked' indicates the caller already holds the global SK lock;
 * `special' selects kernel (TRUE) vs. user (FALSE) channel sanity checks.
 */
static void
ch_close_common(struct kern_channel *ch, boolean_t locked, boolean_t special)
{
/* NOTE(review): `special' is referenced below (VERIFY on CHANF_KERNEL); this pragma looks vestigial — confirm */
#pragma unused(special)
#if SK_LOG
	uuid_string_t uuidstr;
	const char *na_name = (ch->ch_na != NULL) ?
	    ch->ch_na->na_name : "";
	const char *__null_terminated nxdom_name = "";
	if (ch->ch_nexus != NULL) {
		nxdom_name = NX_DOM(ch->ch_nexus)->nxdom_name;
	}
	const char *nxdom_prov_name = (ch->ch_nexus != NULL) ?
	    NX_DOM_PROV(ch->ch_nexus)->nxdom_prov_name : "";

	SK_D("ch %p (%s:%s:\"%s\":%u:%d) uuid %s flags 0x%x",
	    SK_KVA(ch), nxdom_name, nxdom_prov_name, na_name,
	    ch->ch_info->cinfo_nx_port, (int)ch->ch_info->cinfo_ch_ring_id,
	    sk_uuid_unparse(ch->ch_info->cinfo_ch_id, uuidstr),
	    ch->ch_flags);
#endif /* SK_LOG */
	struct kern_nexus *nx = ch->ch_nexus;

	if (!locked) {
		SK_LOCK();
	}

	SK_LOCK_ASSERT_HELD();
	/*
	 * If the channel is participating in the interface advisory
	 * notification, remove it from the nexus.
	 * CHANF_IF_ADV is set and cleared only when nx_ch_if_adv_lock
	 * is held in exclusive mode.
	 */
	lck_rw_lock_exclusive(&nx->nx_ch_if_adv_lock);
	if ((ch->ch_flags & CHANF_IF_ADV) != 0) {
		STAILQ_REMOVE(&nx->nx_ch_if_adv_head, ch,
		    kern_channel, ch_link_if_adv);
		os_atomic_andnot(&ch->ch_flags, CHANF_IF_ADV, relaxed);
		/* last advisory subscriber gone: turn notifications off */
		if (STAILQ_EMPTY(&nx->nx_ch_if_adv_head)) {
			nx_netif_config_interface_advisory(nx, false);
		}
		lck_rw_done(&nx->nx_ch_if_adv_lock);
		lck_mtx_lock(&ch->ch_lock);
		/* drop the reference held by the advisory list */
		(void) ch_release_locked(ch);
	} else {
		lck_rw_done(&nx->nx_ch_if_adv_lock);
		lck_mtx_lock(&ch->ch_lock);
	}
	LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
	/*
	 * Mark the channel as closing to prevent further setopt requests;
	 * this flag is set once here and never gets cleared.
	 */
	ASSERT(!(ch->ch_flags & CHANF_CLOSING));
	os_atomic_or(&ch->ch_flags, CHANF_CLOSING, relaxed);

	if (special) {
		VERIFY(ch->ch_flags & CHANF_KERNEL);
	} else {
		VERIFY(!(ch->ch_flags & CHANF_KERNEL));
	}

	ch->ch_fd = -1;

	/* may be called as part of failure cleanup, so check */
	if (ch->ch_flags & CHANF_ATTACHED) {
		boolean_t nonxref = !!(ch->ch_flags & CHANF_NONXREF);

		/* caller must hold an extra ref */
		ASSERT(ch->ch_refcnt > 1);

		/* disconnect from nexus */
		ch_disconnect(ch);

		/*
		 * If this was the last regular channel and the nexus
		 * has been closed, detach it and finish up the job.
		 * If this was a nonxref channel, there is nothing
		 * left to do; see comments in ch_open_special().
		 */
		if (!nonxref) {
			STAILQ_REMOVE(&nx->nx_ch_head, ch,
			    kern_channel, ch_link);
			nx->nx_ch_count--;
			if (STAILQ_EMPTY(&nx->nx_ch_head) &&
			    (nx->nx_flags & NXF_CLOSED)) {
				ASSERT(STAILQ_EMPTY(&nx->nx_ch_if_adv_head));
				nx_detach(nx);
			}
			(void) nx_release_locked(nx);
		} else {
			ASSERT(ch->ch_flags & CHANF_KERNEL);
			STAILQ_REMOVE(&nx->nx_ch_nonxref_head, ch,
			    kern_channel, ch_link);
		}

		os_atomic_andnot(&ch->ch_flags, CHANF_ATTACHED, relaxed);
		ch->ch_nexus = NULL;

		(void) ch_release_locked(ch); /* for the list */
	}

	lck_mtx_unlock(&ch->ch_lock);
	if (!locked) {
		SK_UNLOCK();
	}
}
1861
/*
 * Close a regular (user) channel.  `locked' indicates whether the
 * caller already holds the global SK lock.
 */
void
ch_close(struct kern_channel *ch, boolean_t locked)
{
	ch_close_common(ch, locked, FALSE);
}
1867
/*
 * Close a kernel channel opened via ch_open_special().  The caller
 * must already hold the global SK lock.
 */
void
ch_close_special(struct kern_channel *ch)
{
	ch_close_common(ch, TRUE, TRUE);
}
1873
1874 static int
ch_ev_thresh_validate(struct kern_nexus * nx,enum txrx t,struct ch_ev_thresh * cet)1875 ch_ev_thresh_validate(struct kern_nexus *nx, enum txrx t,
1876 struct ch_ev_thresh *cet)
1877 {
1878 struct nxprov_params *nxp = NX_PROV(nx)->nxprov_params;
1879 uint32_t bmin, bmax, smin, smax;
1880 int err = 0;
1881
1882 if (cet->cet_unit != CHANNEL_THRESHOLD_UNIT_BYTES &&
1883 cet->cet_unit != CHANNEL_THRESHOLD_UNIT_SLOTS) {
1884 err = EINVAL;
1885 goto done;
1886 }
1887
1888 smin = 1; /* minimum 1 slot */
1889 bmin = 1; /* minimum 1 byte */
1890
1891 if (t == NR_TX) {
1892 ASSERT(nxp->nxp_tx_slots > 0);
1893 smax = (nxp->nxp_tx_slots - 1);
1894 } else {
1895 ASSERT(nxp->nxp_rx_slots > 0);
1896 smax = (nxp->nxp_rx_slots - 1);
1897 }
1898 bmax = (smax * nxp->nxp_buf_size);
1899
1900 switch (cet->cet_unit) {
1901 case CHANNEL_THRESHOLD_UNIT_BYTES:
1902 if (cet->cet_value < bmin) {
1903 cet->cet_value = bmin;
1904 } else if (cet->cet_value > bmax) {
1905 cet->cet_value = bmax;
1906 }
1907 break;
1908
1909 case CHANNEL_THRESHOLD_UNIT_SLOTS:
1910 if (cet->cet_value < smin) {
1911 cet->cet_value = smin;
1912 } else if (cet->cet_value > smax) {
1913 cet->cet_value = smax;
1914 }
1915 break;
1916 }
1917
1918 done:
1919 return err;
1920 }
1921
#if SK_LOG
/* Hoisted out of line to reduce kernel stack footprint */
SK_LOG_ATTRIBUTE
static void
ch_connect_log1(const struct kern_nexus *nx, const struct ch_info *cinfo,
    const struct chreq *chr, const struct kern_channel *ch,
    const struct kern_nexus_domain_provider *nxdom_prov,
    struct proc *p)
{
	struct __user_channel_schema *ch_schema = ch->ch_schema;
	uuid_string_t uuidstr;
	unsigned int n;
	ring_id_t i, j;

	/* kernel channels have no user schema; everything else must */
	ASSERT(ch_schema != NULL || (ch->ch_flags & CHANF_KERNEL));
	if (ch_schema != NULL) {
		/* dump the user-visible schema layout and ring offsets */
		SK_D("channel_schema at %p", SK_KVA(ch_schema));
		SK_D("  kern_name:    \"%s\"", ch_schema->csm_kern_name);
		SK_D("  kern_uuid:    %s",
		    sk_uuid_unparse(ch_schema->csm_kern_uuid, uuidstr));
		SK_D("  flags:        0x%x", ch_schema->csm_flags);
		SK_D("  tx_rings:     %u [%u,%u]", ch_schema->csm_tx_rings,
		    cinfo->cinfo_first_tx_ring, cinfo->cinfo_last_tx_ring);
		SK_D("  rx_rings:     %u [%u,%u]", ch_schema->csm_rx_rings,
		    cinfo->cinfo_first_rx_ring, cinfo->cinfo_last_rx_ring);

		j = ch->ch_last[NR_TX];
		for (n = 0, i = ch->ch_first[NR_TX]; i < j; n++, i++) {
			SK_D("  tx_ring_%u_off: 0x%llx", i,
			    (uint64_t)ch_schema->csm_ring_ofs[n].ring_off);
			SK_D("  tx_sd_%u_off: 0x%llx", i,
			    (uint64_t)ch_schema->csm_ring_ofs[n].sd_off);
		}
		/* rx ring offsets follow the tx entries in csm_ring_ofs[] */
		j = n;
		for (n = 0, i = ch->ch_first[NR_RX];
		    i < ch->ch_last[NR_RX]; n++, i++) {
			SK_D("  rx_ring_%u_off: 0x%llx", i,
			    (uint64_t)ch_schema->csm_ring_ofs[n + j].ring_off);
			SK_D("  rx_sd_%u_off: 0x%llx", i,
			    (uint64_t)ch_schema->csm_ring_ofs[n + j].sd_off);
		}
		SK_D("  md_type:      %u", ch_schema->csm_md_type);
		SK_D("  md_subtype:   %u", ch_schema->csm_md_subtype);
		SK_D("  stats_ofs:    0x%llx", ch_schema->csm_stats_ofs);
		SK_D("  stats_type:   %u", ch_schema->csm_stats_type);
		SK_D("  flowadv_ofs:  0x%llx", ch_schema->csm_flowadv_ofs);
		SK_D("  flowadv_max:  %u", ch_schema->csm_flowadv_max);
		SK_D("  nexusadv_ofs: 0x%llx", ch_schema->csm_nexusadv_ofs);
	}

	/* summarize the channel identity, thresholds and memory map */
	SK_D("ch %p (%s:%s:\"%s\":%u:%d)",
	    SK_KVA(ch), nxdom_prov->nxdom_prov_dom->nxdom_name,
	    nxdom_prov->nxdom_prov_name, ch->ch_na->na_name,
	    cinfo->cinfo_nx_port, (int)cinfo->cinfo_ch_ring_id);
	SK_D("  ch UUID:      %s", sk_uuid_unparse(cinfo->cinfo_ch_id, uuidstr));
	SK_D("  nx UUID:      %s", sk_uuid_unparse(nx->nx_uuid, uuidstr));
	SK_D("  flags:        0x%x", ch->ch_flags);
	SK_D("  task:         %p %s(%d)", SK_KVA(ch->ch_mmap.ami_maptask),
	    sk_proc_name(p), sk_proc_pid(p));
	SK_D("  txlowat:      %u (%s)", cinfo->cinfo_tx_lowat.cet_value,
	    ((cinfo->cinfo_tx_lowat.cet_unit == CHANNEL_THRESHOLD_UNIT_BYTES) ?
	    "bytes" : "slots"));
	SK_D("  rxlowat:      %u (%s)", cinfo->cinfo_rx_lowat.cet_value,
	    ((cinfo->cinfo_rx_lowat.cet_unit == CHANNEL_THRESHOLD_UNIT_BYTES) ?
	    "bytes" : "slots"));
	SK_D("  mmapref:      %p", SK_KVA(ch->ch_mmap.ami_mapref));
	SK_D("  mapaddr:      0x%llx", (uint64_t)cinfo->cinfo_mem_base);
	SK_D("  mapsize:      %llu (%llu KB)",
	    (uint64_t)cinfo->cinfo_mem_map_size,
	    (uint64_t)cinfo->cinfo_mem_map_size >> 10);
	SK_D("  memsize:      %llu (%llu KB)",
	    (uint64_t)chr->cr_memsize, (uint64_t)chr->cr_memsize >> 10);
	SK_D("  offset:       0x%llx", (uint64_t)cinfo->cinfo_schema_offset);
}
1996
SK_LOG_ATTRIBUTE
static void
ch_connect_log2(const struct kern_nexus *nx, int err)
{
	uuid_string_t nx_uuidstr;

	/* log the failed connect attempt along with the target nexus UUID */
	SK_ERR("Error connecting to nexus UUID %s: %d",
	    sk_uuid_unparse(nx->nx_uuid, nx_uuidstr), err);
}
2006 #endif /* SK_LOG */
2007
/*
 * Create a channel and connect it to nexus port chr->cr_port on behalf
 * of process p.  Called with sk_lock held.  On success, returns the new
 * channel -- holding one reference for the caller and one for its
 * linkage on the nexus' channel list -- with ch_lock dropped.  On
 * failure, returns NULL with *err set to a non-zero errno value.
 */
static struct kern_channel *
ch_connect(struct kern_nexus *nx, struct chreq *chr, struct nxbind *nxb,
    struct proc *p, int fd, int *err)
{
	struct kern_nexus_domain_provider *nxdom_prov;
	struct kern_channel *ch = NULL;
	struct ch_info *cinfo = NULL;
	uint32_t ch_mode = chr->cr_mode;
	boolean_t config = FALSE;
	struct nxdom *nxdom;
	boolean_t reserved_port = FALSE;

	/* CHMODE_KERNEL channels may only originate from the kernel */
	ASSERT(!(ch_mode & CHMODE_KERNEL) || p == kernproc);
	/* only kernel channels may request an ephemeral (any) port */
	ASSERT(chr->cr_port != NEXUS_PORT_ANY || (ch_mode & CHMODE_KERNEL));
	SK_LOCK_ASSERT_HELD();

	/* validate thresholds before we proceed any further */
	if ((*err = ch_ev_thresh_validate(nx, NR_TX, &chr->cr_tx_lowat)) != 0 ||
	    (*err = ch_ev_thresh_validate(nx, NR_RX, &chr->cr_rx_lowat)) != 0) {
		goto done;
	}

	/* user channels are permitted only on user-capable providers */
	if (!(ch_mode & CHMODE_KERNEL) && !NX_USER_CHANNEL_PROV(nx)) {
		*err = ENOTSUP;
		goto done;
	}

	ch = ch_alloc(Z_WAITOK);

	lck_mtx_lock(&ch->ch_lock);

	/* identify the channel: unique ID, owning fd, pid and proc name */
	uuid_generate_random(ch->ch_info->cinfo_ch_id);
	ch->ch_fd = fd;
	ch->ch_pid = proc_pid(p);
	(void) snprintf(ch->ch_name, sizeof(ch->ch_name), "%s",
	    proc_name_address(p));

	nxdom_prov = NX_DOM_PROV(nx);
	nxdom = NX_DOM(nx);

	if (ch_mode & (CHMODE_KERNEL | CHMODE_NO_NXREF)) {
		/*
		 * CHANF_KERNEL implies a channel opened by a kernel
		 * subsystem, and is triggered by the CHMODE_KERNEL
		 * flag which (only ever) set by ch_open_special().
		 *
		 * CHANF_NONXREF can be optionally set based on the
		 * CHMODE_NO_NXREF request flag. This must only be
		 * set by ch_open_special() as well, hence we verify.
		 */
		ASSERT(p == kernproc);
		ASSERT(ch_mode & CHMODE_KERNEL);
		os_atomic_or(&ch->ch_flags, CHANF_KERNEL, relaxed);
		if (ch_mode & CHMODE_NO_NXREF) {
			os_atomic_or(&ch->ch_flags, CHANF_NONXREF, relaxed);
		}

		config = (ch_mode & CHMODE_CONFIG) != 0;
		if (chr->cr_port == NEXUS_PORT_ANY) {
			if (nxdom->nxdom_find_port == NULL) {
				*err = ENOTSUP;
				goto done;
			}

			/*
			 * If ephemeral port request, find one for client;
			 * we ask for the reserved port range if this is
			 * a configuration request (CHMODE_CONFIG).
			 */
			if ((*err = nxdom->nxdom_find_port(nx,
			    config, &chr->cr_port)) != 0) {
				goto done;
			}
		}
	}

	/* mark the channel if it is owned by a platform binary */
	if (skywalk_check_platform_binary(p)) {
		os_atomic_or(&ch->ch_flags, CHANF_PLATFORM, relaxed);
	}

	ASSERT(chr->cr_port != NEXUS_PORT_ANY);

	/* reserved ports may only be bound by config-mode requests */
	reserved_port = (nxdom->nxdom_port_is_reserved != NULL &&
	    (*nxdom->nxdom_port_is_reserved)(nx, chr->cr_port));
	if (!config && reserved_port) {
		*err = EDOM;
		goto done;
	}

	SK_PDF(SK_VERB_CHANNEL, p, "%snexus port %u requested",
	    reserved_port ? "[reserved] " : "", chr->cr_port);

	/* let the domain provider establish the actual connection */
	if ((*err = nxdom_prov->nxdom_prov_dom->nxdom_connect(nxdom_prov,
	    nx, ch, chr, nxb, p)) != 0) {
		goto done;
	}

	/* populate the client-visible channel info */
	cinfo = ch->ch_info;
	uuid_copy(cinfo->cinfo_nx_uuid, nx->nx_uuid);
	/* for easy access to immutables */
	bcopy(nx->nx_prov->nxprov_params, &cinfo->cinfo_nxprov_params,
	    sizeof(struct nxprov_params));
	cinfo->cinfo_ch_mode = ch_mode;
	cinfo->cinfo_ch_ring_id = chr->cr_ring_id;
	cinfo->cinfo_nx_port = chr->cr_port;
	cinfo->cinfo_mem_base = ch->ch_mmap.ami_mapaddr;
	cinfo->cinfo_mem_map_size = ch->ch_mmap.ami_mapsize;
	cinfo->cinfo_schema_offset = chr->cr_memoffset;
	cinfo->cinfo_num_bufs =
	    PP_BUF_REGION_DEF(skmem_arena_nexus(ch->ch_na->na_arena)->arn_rx_pp)->skr_params.srp_c_obj_cnt;
	/*
	 * ch_last is really the number of rings, but we need to return
	 * the actual zero-based ring ID to the client. Make sure that
	 * is the case here and adjust last_{tx,rx}_ring accordingly.
	 */
	ASSERT((ch->ch_last[NR_TX] > 0) ||
	    (ch->ch_na->na_type == NA_NETIF_COMPAT_DEV));
	ASSERT((ch->ch_last[NR_RX] > 0) ||
	    (ch->ch_na->na_type == NA_NETIF_COMPAT_HOST));
	cinfo->cinfo_first_tx_ring = ch->ch_first[NR_TX];
	cinfo->cinfo_last_tx_ring = ch->ch_last[NR_TX] - 1;
	cinfo->cinfo_first_rx_ring = ch->ch_first[NR_RX];
	cinfo->cinfo_last_rx_ring = ch->ch_last[NR_RX] - 1;
	cinfo->cinfo_tx_lowat = chr->cr_tx_lowat;
	cinfo->cinfo_rx_lowat = chr->cr_rx_lowat;

	/* link the channel to the nexus (NONXREF channels on their own list) */
	if (ch_mode & CHMODE_NO_NXREF) {
		ASSERT(ch_mode & CHMODE_KERNEL);
		STAILQ_INSERT_TAIL(&nx->nx_ch_nonxref_head, ch, ch_link);
	} else {
		STAILQ_INSERT_TAIL(&nx->nx_ch_head, ch, ch_link);
		nx->nx_ch_count++;
	}
	os_atomic_or(&ch->ch_flags, CHANF_ATTACHED, relaxed);
	ch->ch_nexus = nx;
	nx_retain_locked(nx);   /* hold a ref on the nexus */

	ch_retain_locked(ch);   /* one for being in the list */
	ch_retain_locked(ch);   /* one for the caller */

	/*
	 * Now that we've successfully created the nexus adapter, inform the
	 * nexus provider about the rings and the slots within each ring.
	 * This is a no-op for internal nexus providers.
	 */
	if ((*err = nxprov_advise_connect(nx, ch, p)) != 0) {
		lck_mtx_unlock(&ch->ch_lock);

		/* gracefully close this fully-formed channel */
		if (ch->ch_flags & CHANF_KERNEL) {
			ch_close_special(ch);
		} else {
			ch_close(ch, TRUE);
		}
		(void) ch_release_locked(ch);   /* drop the caller's ref */
		ch = NULL;      /* keep done: from freeing it a second time */
		goto done;
	}

	ASSERT(ch->ch_schema == NULL ||
	    (ch->ch_schema->csm_flags & CSM_ACTIVE));

#if SK_LOG
	if (__improbable(sk_verbose != 0)) {
		ch_connect_log1(nx, cinfo, chr, ch, nxdom_prov, p);
	}
#endif /* SK_LOG */

done:
	if (ch != NULL) {
		lck_mtx_unlock(&ch->ch_lock);
	}
	if (*err != 0) {
#if SK_LOG
		if (__improbable(sk_verbose != 0)) {
			ch_connect_log2(nx, *err);
		}
#endif /* SK_LOG */
		/* free only if the channel never got attached above */
		if (ch != NULL) {
			ch_free(ch);
			ch = NULL;
		}
	}
	return ch;
}
2193
2194 static void
ch_disconnect(struct kern_channel * ch)2195 ch_disconnect(struct kern_channel *ch)
2196 {
2197 struct kern_nexus *nx = ch->ch_nexus;
2198 struct kern_nexus_domain_provider *nxdom_prov = NX_DOM_PROV(nx);
2199
2200 SK_LOCK_ASSERT_HELD();
2201 LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
2202
2203 /*
2204 * Inform the nexus provider that the channel has been quiesced
2205 * and disconnected from the nexus port. This is a no-op for
2206 * internal nexus providers.
2207 */
2208 nxprov_advise_disconnect(nx, ch);
2209
2210 /* Finally, let the domain provider tear down the instance */
2211 nxdom_prov->nxdom_prov_dom->nxdom_disconnect(nxdom_prov, nx, ch);
2212 }
2213
/*
 * Permanently mark the channel's shared schema as inactive so the
 * client sees it as defunct.
 */
void
ch_deactivate(struct kern_channel *ch)
{
	/*
	 * This is a trapdoor flag; once CSM_ACTIVE is cleared,
	 * it will never be set again. Doing this will cause
	 * os_channel_is_defunct() to indicate that the channel
	 * is defunct and is no longer usable (thus should be
	 * immediately closed).
	 */
	if (ch->ch_schema != NULL &&
	    (ch->ch_schema->csm_flags & CSM_ACTIVE)) {
		/* csm_flags is const-qualified here; __DECONST to clear it */
		os_atomic_andnot(__DECONST(uint32_t *, &ch->ch_schema->csm_flags),
		    CSM_ACTIVE, relaxed);
		/* make this globally visible */
		os_atomic_thread_fence(seq_cst);
	}
}
2232
2233 int
ch_set_opt(struct kern_channel * ch,struct sockopt * sopt)2234 ch_set_opt(struct kern_channel *ch, struct sockopt *sopt)
2235 {
2236 #pragma unused(ch)
2237 int err = 0;
2238
2239 if (sopt->sopt_dir != SOPT_SET) {
2240 sopt->sopt_dir = SOPT_SET;
2241 }
2242
2243 switch (sopt->sopt_name) {
2244 case CHOPT_TX_LOWAT_THRESH:
2245 err = ch_set_lowat_thresh(ch, NR_TX, sopt);
2246 break;
2247
2248 case CHOPT_RX_LOWAT_THRESH:
2249 err = ch_set_lowat_thresh(ch, NR_RX, sopt);
2250 break;
2251
2252 case CHOPT_IF_ADV_CONF:
2253 err = ch_configure_interface_advisory_event(ch, sopt);
2254 break;
2255
2256 default:
2257 err = ENOPROTOOPT;
2258 break;
2259 }
2260
2261 return err;
2262 }
2263
2264 int
ch_get_opt(struct kern_channel * ch,struct sockopt * sopt)2265 ch_get_opt(struct kern_channel *ch, struct sockopt *sopt)
2266 {
2267 #pragma unused(ch)
2268 int err = 0;
2269
2270 if (sopt->sopt_dir != SOPT_GET) {
2271 sopt->sopt_dir = SOPT_GET;
2272 }
2273
2274 switch (sopt->sopt_name) {
2275 case CHOPT_TX_LOWAT_THRESH:
2276 err = ch_get_lowat_thresh(ch, NR_TX, sopt);
2277 break;
2278
2279 case CHOPT_RX_LOWAT_THRESH:
2280 err = ch_get_lowat_thresh(ch, NR_RX, sopt);
2281 break;
2282
2283 default:
2284 err = ENOPROTOOPT;
2285 break;
2286 }
2287
2288 return err;
2289 }
2290
/*
 * Enable or disable interface advisory event delivery on this channel
 * (CHOPT_IF_ADV_CONF).  Copies in a boolean from the sockopt; when
 * enabling, links the channel onto the nexus' interface-advisory list
 * (turning the advisory machinery on at the first subscriber), and when
 * disabling, unlinks it (turning the machinery off at the last).
 * Called -- and returns -- with ch_lock held, but drops and reacquires
 * it internally to respect lock ordering.
 */
static int
ch_configure_interface_advisory_event(struct kern_channel *ch,
    struct sockopt *sopt)
{
	int err = 0;
	boolean_t enable = 0;
	struct kern_nexus *nx = ch->ch_nexus;

	LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
	SK_LOCK_ASSERT_NOTHELD();

	if (sopt->sopt_val == USER_ADDR_NULL) {
		return EINVAL;
	}
	/* the nexus must have an advisory region to report through */
	if (nx->nx_adv.nxv_adv == NULL) {
		return ENOTSUP;
	}
	err = sooptcopyin(sopt, &enable, sizeof(enable), sizeof(enable));
	if (err != 0) {
		return err;
	}

	/*
	 * Drop ch_lock to acquire sk_lock and nx_ch_if_adv_lock due to lock
	 * ordering requirement; check if the channel is closing once ch_lock
	 * is reacquired and bail if so.
	 */
	lck_mtx_unlock(&ch->ch_lock);
	SK_LOCK();
	lck_rw_lock_exclusive(&nx->nx_ch_if_adv_lock);
	lck_mtx_lock(&ch->ch_lock);
	if (ch->ch_flags & CHANF_CLOSING) {
		err = ENXIO;
		goto done;
	}

	/*
	 * if interface advisory reporting is enabled on the channel then
	 * add the channel to the list of channels eligible for interface
	 * advisory update on the nexus. If disabled, remove from the list.
	 */
	if (enable) {
		if ((ch->ch_flags & CHANF_IF_ADV) != 0) {
			/* already enabled; nothing to do */
			ASSERT(err == 0);
			goto done;
		}
		/* first subscriber turns the advisory machinery on */
		bool enable_adv = STAILQ_EMPTY(&nx->nx_ch_if_adv_head);
		os_atomic_or(&ch->ch_flags, CHANF_IF_ADV, relaxed);
		STAILQ_INSERT_TAIL(&nx->nx_ch_if_adv_head, ch, ch_link_if_adv);
		if (enable_adv) {
			nx_netif_config_interface_advisory(nx, true);
		}
		ch_retain_locked(ch); /* for being in the IF ADV list */
	} else {
		if ((ch->ch_flags & CHANF_IF_ADV) == 0) {
			/* already disabled; nothing to do */
			ASSERT(err == 0);
			goto done;
		}
		STAILQ_REMOVE(&nx->nx_ch_if_adv_head, ch, kern_channel,
		    ch_link_if_adv);
		os_atomic_andnot(&ch->ch_flags, CHANF_IF_ADV, relaxed);
		/* last subscriber turns the advisory machinery off */
		if (STAILQ_EMPTY(&nx->nx_ch_if_adv_head)) {
			nx_netif_config_interface_advisory(nx, false);
		}
		(void) ch_release_locked(ch); /* drop the list reference */
	}

done:
	/*
	 * Release all three locks in order, then reacquire ch_lock since
	 * the caller expects it to be held on return.
	 */
	lck_mtx_unlock(&ch->ch_lock);
	lck_rw_done(&nx->nx_ch_if_adv_lock);
	SK_UNLOCK();
	lck_mtx_lock(&ch->ch_lock);

	return err;
}
2366
/*
 * Set the low-watermark event threshold for direction t
 * (CHOPT_TX/RX_LOWAT_THRESH).  Copies in a struct ch_ev_thresh,
 * validates it against the nexus, records it in ch_info, invokes every
 * ring's notify callback so the new threshold takes effect, and echoes
 * the accepted value back to the caller.  Called with ch_lock held.
 */
static int
ch_set_lowat_thresh(struct kern_channel *ch, enum txrx t,
    struct sockopt *sopt)
{
	struct ch_ev_thresh cet, *ocet;
	int err = 0;

	LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);

	if (sopt->sopt_val == USER_ADDR_NULL) {
		return EINVAL;
	}

	bzero(&cet, sizeof(cet));
	err = sooptcopyin(sopt, &cet, sizeof(cet), sizeof(cet));
	if (err == 0) {
		err = ch_ev_thresh_validate(ch->ch_nexus, t, &cet);
		if (err == 0) {
			if (t == NR_TX) {
				ocet = &ch->ch_info->cinfo_tx_lowat;
			} else {
				ocet = &ch->ch_info->cinfo_rx_lowat;
			}

			/* if there is no change, we're done */
			if (ocet->cet_unit == cet.cet_unit &&
			    ocet->cet_value == cet.cet_value) {
				return 0;
			}

			*ocet = cet;

			/* note: for_rx_tx() reuses 't' to walk BOTH directions */
			for_rx_tx(t) {
				ring_id_t qfirst = ch->ch_first[t];
				ring_id_t qlast = ch->ch_last[t];
				uint32_t i;

				for (i = qfirst; i < qlast; i++) {
					struct __kern_channel_ring *kring =
					    &NAKR(ch->ch_na, t)[i];

					(void) kring->ckr_na_notify(kring,
					    sopt->sopt_p, 0);
				}
			}

			/* echo accepted value; result intentionally ignored */
			(void) sooptcopyout(sopt, &cet, sizeof(cet));
		}
	}

	return err;
}
2419
2420 static int
ch_get_lowat_thresh(struct kern_channel * ch,enum txrx t,struct sockopt * sopt)2421 ch_get_lowat_thresh(struct kern_channel *ch, enum txrx t,
2422 struct sockopt *sopt)
2423 {
2424 struct ch_ev_thresh cet;
2425
2426 LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
2427
2428 if (sopt->sopt_val == USER_ADDR_NULL) {
2429 return EINVAL;
2430 }
2431
2432 if (t == NR_TX) {
2433 cet = ch->ch_info->cinfo_tx_lowat;
2434 } else {
2435 cet = ch->ch_info->cinfo_rx_lowat;
2436 }
2437
2438 return sooptcopyout(sopt, &cet, sizeof(cet));
2439 }
2440
2441 static struct kern_channel *
ch_alloc(zalloc_flags_t how)2442 ch_alloc(zalloc_flags_t how)
2443 {
2444 struct kern_channel *ch;
2445
2446 ch = zalloc_flags(ch_zone, how | Z_ZERO);
2447 if (ch) {
2448 lck_mtx_init(&ch->ch_lock, &channel_lock_group, &channel_lock_attr);
2449 ch->ch_info = zalloc_flags(ch_info_zone, how | Z_ZERO);
2450 }
2451 return ch;
2452 }
2453
2454 static void
ch_free(struct kern_channel * ch)2455 ch_free(struct kern_channel *ch)
2456 {
2457 ASSERT(ch->ch_refcnt == 0);
2458 ASSERT(ch->ch_pp == NULL);
2459 ASSERT(!(ch->ch_flags & (CHANF_ATTACHED | CHANF_EXT_CONNECTED |
2460 CHANF_EXT_PRECONNECT | CHANF_IF_ADV)));
2461 lck_mtx_destroy(&ch->ch_lock, &channel_lock_group);
2462 SK_DF(SK_VERB_MEM, "ch %p FREE", SK_KVA(ch));
2463 ASSERT(ch->ch_info != NULL);
2464 zfree(ch_info_zone, ch->ch_info);
2465 ch->ch_info = NULL;
2466 zfree(ch_zone, ch);
2467 }
2468
/*
 * Take a reference on the channel; sk_lock must be held.
 */
void
ch_retain_locked(struct kern_channel *ch)
{
	SK_LOCK_ASSERT_HELD();

	ch->ch_refcnt++;
	VERIFY(ch->ch_refcnt != 0);     /* trap refcount wraparound */
}
2477
/*
 * Take a reference on the channel, acquiring sk_lock around the
 * locked variant.
 */
void
ch_retain(struct kern_channel *ch)
{
	SK_LOCK();
	ch_retain_locked(ch);
	SK_UNLOCK();
}
2485
2486 int
ch_release_locked(struct kern_channel * ch)2487 ch_release_locked(struct kern_channel *ch)
2488 {
2489 int oldref = ch->ch_refcnt;
2490
2491 SK_LOCK_ASSERT_HELD();
2492
2493 VERIFY(ch->ch_refcnt != 0);
2494 if (--ch->ch_refcnt == 0) {
2495 ch_free(ch);
2496 }
2497
2498 return oldref == 1;
2499 }
2500
/*
 * Drop a reference on the channel, acquiring sk_lock around the
 * locked variant.  Returns non-zero if the final reference was dropped.
 */
int
ch_release(struct kern_channel *ch)
{
	int last;

	SK_LOCK();
	last = ch_release_locked(ch);
	SK_UNLOCK();

	return last;
}
2512
/*
 * Channel destructor: close the channel and drop the caller's
 * reference, all under sk_lock.
 */
void
ch_dtor(struct kern_channel *ch)
{
	SK_LOCK();
	ch_close(ch, TRUE);
	(void) ch_release_locked(ch);
	SK_UNLOCK();
}
2521
2522 void
ch_update_upp_buf_stats(struct kern_channel * ch,struct kern_pbufpool * pp)2523 ch_update_upp_buf_stats(struct kern_channel *ch, struct kern_pbufpool *pp)
2524 {
2525 uint64_t buf_inuse = pp->pp_u_bufinuse;
2526 struct __user_channel_schema *csm = ch->ch_schema;
2527 os_atomic_store(&csm->csm_upp_buf_inuse, buf_inuse, relaxed);
2528 }
2529
2530 #if SK_LOG
/*
 * Format "<kva> <name> flags 0x<flags>" into dst for logging and
 * return dst so the call can be used inline as a printf argument.
 */
SK_NO_INLINE_ATTRIBUTE
char *
ch2str(const struct kern_channel *ch, char *__counted_by(dsz)dst, size_t dsz)
{
	(void) sk_snprintf(dst, dsz, "%p %s flags 0x%b",
	    SK_KVA(ch), ch->ch_name, ch->ch_flags, CHANF_BITS);

	return dst;
}
2540 #endif
2541