/*
 * Copyright (c) 2015-2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/*
 * Copyright (C) 2012-2014 Matteo Landi, Luigi Rizzo, Giuseppe Lettieri.
 * All rights reserved.
 * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/eventvar.h>
#include <sys/kdebug.h>
#include <sys/sdt.h>
#include <skywalk/os_skywalk_private.h>
#include <skywalk/nexus/netif/nx_netif.h>

#define KEV_EVTID(code) BSDDBG_CODE(DBG_BSD_KEVENT, (code))

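/*
 * Snapshot of per-direction "ready" data computed by ch_event(), in bytes
 * or slots depending on the channel's threshold unit; used by the kevent
 * filters to honor low-watermark thresholds.
 */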
struct ch_event_result {
	uint32_t tx_data;
	uint32_t rx_data;
};

static LCK_GRP_DECLARE(channel_lock_group, "sk_ch_lock");
static LCK_GRP_DECLARE(channel_kn_lock_group, "sk_ch_kn_lock");
LCK_ATTR_DECLARE(channel_lock_attr, 0, 0);

static void csi_selrecord(struct ch_selinfo *, struct proc *, void *);
static void csi_selwakeup(struct ch_selinfo *, boolean_t, boolean_t, uint32_t);
static inline void csi_selwakeup_delayed(struct ch_selinfo *);
static inline void csi_selwakeup_common(struct ch_selinfo *, boolean_t,
    boolean_t, boolean_t, uint32_t);
static boolean_t csi_tcall_start(struct ch_selinfo *);
static void csi_tcall(thread_call_param_t, thread_call_param_t);
static uint64_t csi_tcall_update_interval(struct ch_selinfo *);

static void ch_redzone_init(void);
static void ch_close_common(struct kern_channel *, boolean_t, boolean_t);
static struct kern_channel *ch_find(struct kern_nexus *, nexus_port_t,
    ring_id_t);
static int ch_ev_thresh_validate(struct kern_nexus *, enum txrx,
    struct ch_ev_thresh *);
static struct kern_channel *ch_connect(struct kern_nexus *, struct chreq *,
    struct kern_channel *, struct nxbind *, struct proc *, int, int *);
static void ch_disconnect(struct kern_channel *);
static int ch_set_lowat_thresh(struct kern_channel *, enum txrx,
    struct sockopt *);
static int ch_get_lowat_thresh(struct kern_channel *, enum txrx,
    struct sockopt *);
static struct kern_channel *ch_alloc(zalloc_flags_t);
static void ch_free(struct kern_channel *);
static int ch_configure_interface_advisory_event(struct kern_channel *ch,
    struct sockopt *sopt);

static int filt_chrwattach(struct knote *, struct kevent_qos_s *kev);
static void filt_chrwdetach(struct knote *, boolean_t);
static void filt_chrdetach(struct knote *);
static void filt_chwdetach(struct knote *);
static int filt_chrw(struct knote *, long, int);
static int filt_chread(struct knote *, long);
static int filt_chwrite(struct knote *, long);

static int filt_chtouch(struct knote *, struct kevent_qos_s *, int);
static int filt_chrtouch(struct knote *, struct kevent_qos_s *);
static int filt_chwtouch(struct knote *, struct kevent_qos_s *);
static int filt_chprocess(struct knote *, struct kevent_qos_s *, int);
static int filt_chrprocess(struct knote *, struct kevent_qos_s *);
static int filt_chwprocess(struct knote *, struct kevent_qos_s *);
static int filt_che_attach(struct knote *, struct kevent_qos_s *kev);
static void filt_che_detach(struct knote *);
static int filt_che_event(struct knote *, long);
static int filt_che_touch(struct knote *, struct kevent_qos_s *);
static int filt_che_process(struct knote *, struct kevent_qos_s *);
static int filt_chan_extended_common(struct knote *, long);

static int ch_event(struct kern_channel *ch, int events,
    void *wql, struct proc *p, struct ch_event_result *,
    const boolean_t is_kevent, int *errno, const boolean_t);

const struct filterops skywalk_channel_rfiltops = {
	.f_isfd = 1,
	.f_attach = filt_chrwattach,
	.f_detach = filt_chrdetach,
	.f_event = filt_chread,
	.f_touch = filt_chrtouch,
	.f_process = filt_chrprocess,
};

const struct filterops skywalk_channel_wfiltops = {
	.f_isfd = 1,
	.f_attach = filt_chrwattach,
	.f_detach = filt_chwdetach,
	.f_event = filt_chwrite,
	.f_touch = filt_chwtouch,
	.f_process = filt_chwprocess,
};

const struct filterops skywalk_channel_efiltops = {
	.f_isfd = 1,
	.f_attach = filt_che_attach,
	.f_detach = filt_che_detach,
	.f_event = filt_che_event,
	.f_touch = filt_che_touch,
	.f_process = filt_che_process,
};

/* mitigation intervals in ns */
#define CH_MIT_IVAL_MIN NSEC_PER_USEC

static uint64_t ch_mit_ival = CH_MIT_IVAL_DEFAULT;

#if (DEVELOPMENT || DEBUG)
SYSCTL_NODE(_kern_skywalk, OID_AUTO, channel,
    CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Skywalk channel parameters");
SYSCTL_QUAD(_kern_skywalk_channel, OID_AUTO, mit_ival,
    CTLFLAG_RW | CTLFLAG_LOCKED, &ch_mit_ival, "");
#endif /* DEVELOPMENT || DEBUG */

static ZONE_DEFINE(ch_zone, SKMEM_ZONE_PREFIX ".ch",
    sizeof(struct kern_channel), ZC_ZFREE_CLEARMEM);

static ZONE_DEFINE(ch_info_zone, SKMEM_ZONE_PREFIX ".ch.info",
    sizeof(struct ch_info), ZC_ZFREE_CLEARMEM);

static int __ch_inited = 0;

/*
 * Global cookie to hold the random number used for detecting
 * user metadata red zone violations.
 */
uint64_t __ch_umd_redzone_cookie = 0;

#define SKMEM_TAG_CH_KEY "com.apple.skywalk.channel.key"
SKMEM_TAG_DEFINE(skmem_tag_ch_key, SKMEM_TAG_CH_KEY);

static void
ch_redzone_init(void)
{
	_CASSERT(sizeof(__ch_umd_redzone_cookie) ==
	    sizeof(((struct __metadata_preamble *)0)->mdp_redzone));
	_CASSERT(METADATA_PREAMBLE_SZ == sizeof(struct __metadata_preamble));
	_CASSERT(sizeof(struct __slot_desc) == 8);

	/* Initialize random user red zone cookie values */
	do {
		read_random(&__ch_umd_redzone_cookie,
		    sizeof(__ch_umd_redzone_cookie));
	} while (__ch_umd_redzone_cookie == 0);

	SK_D("__ch_umd_redzone_cookie: 0x%llx", __ch_umd_redzone_cookie);
}

int
channel_init(void)
{
	int error = 0;

	SK_LOCK_ASSERT_HELD();
	ASSERT(!__ch_inited);

	_CASSERT(offsetof(struct __user_packet, pkt_qum) == 0);
	_CASSERT(offsetof(struct __kern_packet, pkt_qum) == 0);

	ch_redzone_init();

	__ch_inited = 1;

	return error;
}

void
channel_fini(void)
{
	SK_LOCK_ASSERT_HELD();

	if (__ch_inited) {
		__ch_umd_redzone_cookie = 0;
		__ch_inited = 0;
	}
}

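/*
 * Initialize a ch_selinfo. If wakeup mitigation is requested, a oneshot
 * thread call is allocated to defer and batch wakeups; otherwise wakeups
 * are delivered inline.
 */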
void
csi_init(struct ch_selinfo *csi, boolean_t mitigation, uint64_t mit_ival)
{
	csi->csi_flags = 0;
	csi->csi_pending = 0;
	if (mitigation) {
		csi->csi_interval = mit_ival;
		csi->csi_eff_interval = ch_mit_ival; /* global override */
		atomic_bitset_32(&csi->csi_flags, CSI_MITIGATION);
		csi->csi_tcall = thread_call_allocate_with_options(csi_tcall,
		    csi, THREAD_CALL_PRIORITY_KERNEL, THREAD_CALL_OPTIONS_ONCE);
		/* this must not fail */
		VERIFY(csi->csi_tcall != NULL);
	} else {
		csi->csi_interval = 0;
		csi->csi_eff_interval = 0;
		csi->csi_tcall = NULL;
	}
	lck_mtx_init(&csi->csi_lock, &channel_kn_lock_group, &channel_lock_attr);
	klist_init(&csi->csi_si.si_note);
}

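/*
 * Tear down a ch_selinfo exactly once: cancel and free the mitigation
 * thread call (if any), clear any select() waiters, then destroy the lock.
 */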
void
csi_destroy(struct ch_selinfo *csi)
{
	/* check if not already destroyed, else do it now */
	if ((atomic_bitset_32_ov(&csi->csi_flags, CSI_DESTROYED) &
	    CSI_DESTROYED) == 0) {
		CSI_LOCK(csi);
		/* must have been set by above atomic op */
		VERIFY(csi->csi_flags & CSI_DESTROYED);
		if (csi->csi_flags & CSI_MITIGATION) {
			thread_call_t tcall = csi->csi_tcall;
			VERIFY(tcall != NULL);
			CSI_UNLOCK(csi);

			(void) thread_call_cancel_wait(tcall);
			if (!thread_call_free(tcall)) {
				boolean_t freed;
				(void) thread_call_cancel_wait(tcall);
				freed = thread_call_free(tcall);
				VERIFY(freed);
			}

			CSI_LOCK(csi);
			csi->csi_tcall = NULL;
			atomic_bitclear_32(&csi->csi_flags, CSI_MITIGATION);
		}
		csi->csi_pending = 0;
		CSI_UNLOCK(csi);

		selthreadclear(&csi->csi_si);
		/* now we don't need the mutex anymore */
		lck_mtx_destroy(&csi->csi_lock, &channel_kn_lock_group);
	}
}

/*
 * Called only for select(2).
 */
__attribute__((always_inline))
static inline void
csi_selrecord(struct ch_selinfo *csi, struct proc *p, void *wql)
{
	struct selinfo *si = &csi->csi_si;

	CSI_LOCK_ASSERT_HELD(csi);
	selrecord(p, si, wql);
}

void
csi_selrecord_one(struct __kern_channel_ring *kring, struct proc *p, void *wql)
{
	struct ch_selinfo *csi = &kring->ckr_si;

	CSI_LOCK(csi);
	SK_DF(SK_VERB_EVENTS, "[%s] na \"%s\" (0x%llx) kr %s (0x%llx) "
	    "si 0x%llx si_flags 0x%x", (kring->ckr_tx == NR_TX) ? "W" : "R",
	    KRNA(kring)->na_name, SK_KVA(KRNA(kring)), kring->ckr_name,
	    SK_KVA(kring), SK_KVA(&csi->csi_si), csi->csi_si.si_flags);

	csi_selrecord(csi, p, wql);
	CSI_UNLOCK(csi);
}

void
csi_selrecord_all(struct nexus_adapter *na, enum txrx t, struct proc *p,
    void *wql)
{
	struct ch_selinfo *csi = &na->na_si[t];

	CSI_LOCK(csi);
	SK_DF(SK_VERB_EVENTS, "[%s] na \"%s\" (0x%llx) si 0x%llx si_flags 0x%x",
	    (t == NR_TX) ? "W" : "R", na->na_name, SK_KVA(na),
	    SK_KVA(&csi->csi_si), csi->csi_si.si_flags);

	csi_selrecord(csi, p, wql);
	CSI_UNLOCK(csi);
}

/*
 * Called from na_post_event().
 */
__attribute__((always_inline))
static inline void
csi_selwakeup(struct ch_selinfo *csi, boolean_t within_kevent,
    boolean_t selwake, uint32_t hint)
{
	struct selinfo *si = &csi->csi_si;

	CSI_LOCK_ASSERT_HELD(csi);
	csi->csi_pending = 0;
	if (selwake) {
		selwakeup(si);
	}
	if ((csi->csi_flags & CSI_KNOTE) && !within_kevent) {
		KNOTE(&si->si_note, hint);
	}
}

__attribute__((always_inline))
static inline void
csi_selwakeup_delayed(struct ch_selinfo *csi)
{
	CSI_LOCK_ASSERT_HELD(csi);
	ASSERT(csi->csi_flags & CSI_MITIGATION);
	ASSERT(csi->csi_tcall != NULL);

	if (thread_call_isactive(csi->csi_tcall)) {
		csi->csi_pending++;
	} else if (!csi_tcall_start(csi)) {
		csi_selwakeup(csi, FALSE, FALSE, 0);
	}
}

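/*
 * Deliver a wakeup immediately when mitigation is off, or when the caller
 * requires it (nodelay, within kevent, no selwake, or a nonzero hint);
 * otherwise defer it to the mitigation thread call.
 */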
__attribute__((always_inline))
static inline void
csi_selwakeup_common(struct ch_selinfo *csi, boolean_t nodelay,
    boolean_t within_kevent, boolean_t selwake, uint32_t hint)
{
	CSI_LOCK_ASSERT_HELD(csi);

	if (nodelay || within_kevent || !selwake || hint != 0 ||
	    !(csi->csi_flags & CSI_MITIGATION)) {
		csi_selwakeup(csi, within_kevent, selwake, hint);
	} else {
		csi_selwakeup_delayed(csi);
	}
}

void
csi_selwakeup_one(struct __kern_channel_ring *kring, boolean_t nodelay,
    boolean_t within_kevent, boolean_t selwake, uint32_t hint)
{
	struct ch_selinfo *csi = &kring->ckr_si;

	CSI_LOCK(csi);
	SK_DF(SK_VERB_EVENTS, "[%s] na \"%s\" (0x%llx) kr %s (0x%llx) "
	    "si 0x%llx si_flags 0x%x nodelay %u kev %u sel %u hint 0x%b",
	    (kring->ckr_tx == NR_TX) ? "W" : "R", KRNA(kring)->na_name,
	    SK_KVA(KRNA(kring)), kring->ckr_name, SK_KVA(kring),
	    SK_KVA(&csi->csi_si), csi->csi_si.si_flags, nodelay,
	    within_kevent, selwake, hint, CHAN_FILT_HINT_BITS);

	csi_selwakeup_common(csi, nodelay, within_kevent, selwake, hint);
	CSI_UNLOCK(csi);
}

void
csi_selwakeup_all(struct nexus_adapter *na, enum txrx t, boolean_t nodelay,
    boolean_t within_kevent, boolean_t selwake, uint32_t hint)
{
	struct ch_selinfo *csi = &na->na_si[t];

	CSI_LOCK(csi);
	SK_DF(SK_VERB_EVENTS, "[%s] na \"%s\" (0x%llx) si 0x%llx "
	    "si_flags 0x%x nodelay %u kev %u sel %u hint 0x%b",
	    (t == NR_TX) ? "W" : "R", na->na_name, SK_KVA(na),
	    SK_KVA(&csi->csi_si), csi->csi_si.si_flags, nodelay,
	    within_kevent, selwake, hint, CHAN_FILT_HINT_BITS);

	switch (t) {
	case NR_RX:
		if (!(na->na_flags & NAF_RX_MITIGATION)) {
			nodelay = TRUE;
		}
		break;

	case NR_TX:
		if (!(na->na_flags & NAF_TX_MITIGATION)) {
			nodelay = TRUE;
		}
		break;

	default:
		nodelay = TRUE;
		break;
	}
	csi_selwakeup_common(csi, nodelay, within_kevent, selwake, hint);
	CSI_UNLOCK(csi);
}

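/*
 * Arm the mitigation thread call to fire one interval from now.
 * Returns FALSE if the effective interval is 0 (mitigation disabled),
 * in which case the caller must deliver the wakeup itself.
 */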
static boolean_t
csi_tcall_start(struct ch_selinfo *csi)
{
	uint64_t now, ival, deadline;

	CSI_LOCK_ASSERT_HELD(csi);
	ASSERT(csi->csi_flags & CSI_MITIGATION);
	ASSERT(csi->csi_tcall != NULL);

	/* pick up latest value */
	ival = csi_tcall_update_interval(csi);

	/* if no mitigation, pass notification up now */
	if (__improbable(ival == 0)) {
		return FALSE;
	}

	deadline = now = mach_absolute_time();
	clock_deadline_for_periodic_event(ival, now, &deadline);
	(void) thread_call_enter_delayed(csi->csi_tcall, deadline);

	return TRUE;
}

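/*
 * Mitigation thread call handler: deliver the deferred wakeup, then
 * re-arm the thread call if more wakeups arrived while it ran; if
 * re-arming fails (or the selinfo was destroyed), deliver the pending
 * wakeup inline.
 */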
static void
csi_tcall(thread_call_param_t arg0, thread_call_param_t arg1)
{
#pragma unused(arg1)
	struct ch_selinfo *csi = arg0;

	CSI_LOCK(csi);
	csi_selwakeup(csi, FALSE, FALSE, 0);
	CSI_UNLOCK(csi);

	CSI_LOCK(csi);
	if (__improbable((csi->csi_flags & CSI_DESTROYED) == 0 &&
	    csi->csi_pending != 0 && !csi_tcall_start(csi))) {
		csi_selwakeup(csi, FALSE, FALSE, 0);
	}
	CSI_UNLOCK(csi);
}

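/*
 * Refresh the cached mitigation interval from the global ch_mit_ival
 * override, clamping nonzero values to at least CH_MIT_IVAL_MIN.
 */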
__attribute__((always_inline))
static inline uint64_t
csi_tcall_update_interval(struct ch_selinfo *csi)
{
	uint64_t i = ch_mit_ival;

	/* if global override was adjusted, update local copies */
	if (__improbable(csi->csi_eff_interval != i)) {
		ASSERT(csi->csi_flags & CSI_MITIGATION);
		csi->csi_interval = csi->csi_eff_interval =
		    ((i == 0) ? 0 : MAX(i, CH_MIT_IVAL_MIN));
	}

	return csi->csi_interval;
}

/* mark the knote with EV_EOF and return TRUE if the channel is defunct */
static inline boolean_t
ch_filt_check_defunct(struct kern_channel *ch, struct knote *kn)
{
	if (__improbable((ch->ch_flags & CHANF_DEFUNCT) != 0)) {
		if (kn) {
			kn->kn_flags |= EV_EOF;
		}
		return TRUE;
	}
	return FALSE;
}

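/*
 * Common detach for the read/write filters: remove the knote from the
 * per-direction selinfo klist and clear CSI_KNOTE once the list drains.
 */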
static void
filt_chrwdetach(struct knote *kn, boolean_t write)
{
	struct kern_channel *ch = (struct kern_channel *)kn->kn_hook;
	struct ch_selinfo *csi;
	struct selinfo *si;

	lck_mtx_lock(&ch->ch_lock);
	csi = ch->ch_si[write ? NR_TX : NR_RX];
	si = &csi->csi_si;

	CSI_LOCK(csi);
	SK_DF(SK_VERB_EVENTS, "na \"%s\" (0x%llx) ch 0x%llx kn 0x%llx (%s%s) "
	    "si_flags 0x%x", ch->ch_na->na_name, SK_KVA(ch->ch_na),
	    SK_KVA(ch), SK_KVA(kn), (kn->kn_flags & EV_POLL) ? "poll," : "",
	    write ? "write" : "read", si->si_flags);

	if (KNOTE_DETACH(&si->si_note, kn)) {
		atomic_bitclear_32(&csi->csi_flags, CSI_KNOTE);
	}

	CSI_UNLOCK(csi);
	lck_mtx_unlock(&ch->ch_lock);
}

static void
filt_chrdetach(struct knote *kn)
{
	ASSERT(kn->kn_filter == EVFILT_READ);
	filt_chrwdetach(kn, FALSE);
}

static void
filt_chwdetach(struct knote *kn)
{
	ASSERT(kn->kn_filter == EVFILT_WRITE);
	filt_chrwdetach(kn, TRUE);
}

/*
 * Callback for notifications generated externally. This always marks
 * the knote as activated, so it always returns 1.
 */
static int
filt_chrw(struct knote *kn, long hint, int events)
{
#if SK_LOG
	struct kern_channel *ch = kn->kn_hook;
#else
#pragma unused(kn)
#pragma unused(hint)
#pragma unused(events)
#endif
	SK_DF(SK_VERB_EVENTS, "na \"%s\" (0x%llx) ch 0x%llx "
	    "kn 0x%llx (%s%s) hint 0x%x", ch->ch_na->na_name,
	    SK_KVA(ch->ch_na), SK_KVA(ch), SK_KVA(kn),
	    (kn->kn_flags & EV_POLL) ? "poll," : "",
	    (events == POLLOUT) ? "write" : "read",
	    (uint32_t)hint);

	/* assume we are ready */
	return 1;
}

static int
filt_chread(struct knote *kn, long hint)
{
	ASSERT(kn->kn_filter == EVFILT_READ);
	/* There is no hint for read/write event */
	if (hint != 0) {
		return 0;
	}
	return filt_chrw(kn, hint, POLLIN);
}

static int
filt_chwrite(struct knote *kn, long hint)
{
	ASSERT(kn->kn_filter == EVFILT_WRITE);
	/* There is no hint for read/write event */
	if (hint != 0) {
		return 0;
	}
	return filt_chrw(kn, hint, POLLOUT);
}

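/*
 * Common f_touch for the read/write filters: accept the updated fflags
 * and data from userspace, validate any NOTE_LOWAT threshold, then
 * re-evaluate readiness via ch_event(). A defunct channel reports EV_EOF.
 */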
static int
filt_chtouch(struct knote *kn, struct kevent_qos_s *kev, int events)
{
#pragma unused(kev)
	struct kern_channel *ch = kn->kn_hook;
	int ev = kn->kn_filter;
	enum txrx dir = (ev == EVFILT_WRITE) ? NR_TX : NR_RX;
	int event_error = 0;
	int revents;

	/* save off the new input fflags and data */
	kn->kn_sfflags = kev->fflags;
	kn->kn_sdata = kev->data;

	lck_mtx_lock(&ch->ch_lock);
	if (__improbable(ch_filt_check_defunct(ch, kn))) {
		lck_mtx_unlock(&ch->ch_lock);
		return 1;
	}

	/* if a note-specific low watermark is given, validate it */
	if (kn->kn_sfflags & NOTE_LOWAT) {
		struct ch_ev_thresh note_thresh = {
			.cet_unit = (dir == NR_TX) ?
			    ch->ch_info->cinfo_tx_lowat.cet_unit :
			    ch->ch_info->cinfo_rx_lowat.cet_unit,
			.cet_value = (uint32_t)kn->kn_sdata
		};
		if (ch_ev_thresh_validate(ch->ch_na->na_nx, dir,
		    &note_thresh) != 0) {
			SK_ERR("invalid NOTE_LOWAT threshold %u",
			    note_thresh.cet_value);
			knote_set_error(kn, EINVAL);
			lck_mtx_unlock(&ch->ch_lock);
			return 1;
		}
	}

	/* capture new state just so we can return it */
	revents = ch_event(ch, events, NULL, knote_get_kq(kn)->kq_p, NULL, TRUE,
	    &event_error, FALSE);
	lck_mtx_unlock(&ch->ch_lock);

	if (revents & POLLERR) {
		ASSERT(event_error != 0);
		/*
		 * Setting a knote error here will confuse libdispatch, so we
		 * use EV_EOF instead.
		 */
		kn->kn_flags |= EV_EOF;
		return 1;
	} else {
		return (events & revents) != 0;
	}
}

static int
filt_chrtouch(struct knote *kn, struct kevent_qos_s *kev)
{
	ASSERT(kn->kn_filter == EVFILT_READ);

	if (kev->flags & EV_ENABLE) {
		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KNOTE_ENABLE),
		    kn->kn_udata, kn->kn_status | (kn->kn_id << 32),
		    kn->kn_filtid, VM_KERNEL_UNSLIDE_OR_PERM(
			((struct kern_channel *)kn->kn_hook)->ch_na));
	}

	return filt_chtouch(kn, kev, POLLIN);
}

static int
filt_chwtouch(struct knote *kn, struct kevent_qos_s *kev)
{
	ASSERT(kn->kn_filter == EVFILT_WRITE);
	return filt_chtouch(kn, kev, POLLOUT);
}


/*
 * Called from kevent. We call ch_event(POLL[IN|OUT]) and
 * return 0/1 accordingly.
 */
static int
filt_chprocess(struct knote *kn, struct kevent_qos_s *kev, int events)
{
	struct kern_channel *ch = kn->kn_hook;
	struct ch_event_result result;
	uint32_t lowat;
	int trigger_event = 1;
	int revents;
	int event_error;
	int64_t data;

	lck_mtx_lock(&ch->ch_lock);
	if (__improbable(ch_filt_check_defunct(ch, kn))) {
		knote_fill_kevent(kn, kev, 0);
		lck_mtx_unlock(&ch->ch_lock);
		return 1;
	}

	revents = ch_event(ch, events, NULL, knote_get_kq(kn)->kq_p, &result,
	    TRUE, &event_error, FALSE);

	if (revents & POLLERR) {
		ASSERT(event_error != 0);
		lck_mtx_unlock(&ch->ch_lock);
		/*
		 * Setting a knote error here will confuse libdispatch, so we
		 * use EV_EOF instead.
		 */
		kn->kn_flags |= EV_EOF;
		knote_fill_kevent_with_sdata(kn, kev);
		return 1;
	}

	trigger_event = (events & revents) != 0;

	if (events == POLLOUT) {
		lowat = ch->ch_info->cinfo_tx_lowat.cet_value;
		if ((kn->kn_sfflags & NOTE_LOWAT) &&
		    kn->kn_sdata > lowat) {
			lowat = (uint32_t)kn->kn_sdata;
		}

		data = result.tx_data;

		if (result.tx_data < lowat) {
			trigger_event = 0;
		}
	} else {
		lowat = ch->ch_info->cinfo_rx_lowat.cet_value;
		if ((kn->kn_sfflags & NOTE_LOWAT) &&
		    kn->kn_sdata > lowat) {
			lowat = (uint32_t)kn->kn_sdata;
		}

		data = result.rx_data;

		if (result.rx_data < lowat) {
			trigger_event = 0;
		}
	}

	if (trigger_event) {
		knote_fill_kevent(kn, kev, data);
	}

	lck_mtx_unlock(&ch->ch_lock);

	return trigger_event;
}

static int
filt_chrprocess(struct knote *kn, struct kevent_qos_s *kev)
{
	ASSERT(kn->kn_filter == EVFILT_READ);
	return filt_chprocess(kn, kev, POLLIN);
}

static int
filt_chwprocess(struct knote *kn, struct kevent_qos_s *kev)
{
	ASSERT(kn->kn_filter == EVFILT_WRITE);
	return filt_chprocess(kn, kev, POLLOUT);
}

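/*
 * Common f_attach for the read/write filters: validate any NOTE_LOWAT
 * threshold, hook the knote into the per-direction selinfo klist, and
 * return the initial readiness computed by ch_event().
 */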
static int
filt_chrwattach(struct knote *kn, __unused struct kevent_qos_s *kev)
{
	struct kern_channel *ch = (struct kern_channel *)kn->kn_hook;
	struct nexus_adapter *na;
	struct ch_selinfo *csi;
	int ev = kn->kn_filter;
	enum txrx dir = (ev == EVFILT_WRITE) ? NR_TX : NR_RX;
	int revents;
	int events;
	int event_error = 0;

	ASSERT((kn->kn_filter == EVFILT_READ) ||
	    (kn->kn_filter == EVFILT_WRITE));

	/* ch_kqfilter() should have acquired the lock */
	LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);

	na = ch->ch_na;
	/* if a note-specific low watermark is given, validate it */
	if (kn->kn_sfflags & NOTE_LOWAT) {
		struct ch_ev_thresh note_thresh = {
			.cet_unit = (dir == NR_TX) ?
			    ch->ch_info->cinfo_tx_lowat.cet_unit :
			    ch->ch_info->cinfo_rx_lowat.cet_unit,
			.cet_value = (uint32_t)kn->kn_sdata
		};
		if (ch_ev_thresh_validate(ch->ch_na->na_nx, dir,
		    &note_thresh) != 0) {
			SK_ERR("invalid NOTE_LOWAT threshold %u",
			    note_thresh.cet_value);
			knote_set_error(kn, EINVAL);
			return 0;
		}
	}

	/* the si is indicated in the channel */
	csi = ch->ch_si[dir];
	CSI_LOCK(csi);

	if (KNOTE_ATTACH(&csi->csi_si.si_note, kn)) {
		atomic_bitset_32(&csi->csi_flags, CSI_KNOTE);
	}

	CSI_UNLOCK(csi);

	SK_DF(SK_VERB_EVENTS, "na \"%s\" (0x%llx) ch 0x%llx kn 0x%llx (%s%s)",
	    na->na_name, SK_KVA(na), SK_KVA(ch), SK_KVA(kn),
	    (kn->kn_flags & EV_POLL) ? "poll," : "",
	    (ev == EVFILT_WRITE) ? "write" : "read");

	/* capture current state */
	events = (ev == EVFILT_WRITE) ? POLLOUT : POLLIN;

	if (__improbable(ch_filt_check_defunct(ch, kn))) {
		revents = events;
	} else {
		/* filt_chprocess() will fill in the kn_sdata field */
		revents = ch_event(ch, events, NULL, knote_get_kq(kn)->kq_p,
		    NULL, TRUE, &event_error, FALSE);
	}

	if (revents & POLLERR) {
		ASSERT(event_error != 0);
		kn->kn_flags |= EV_EOF;
		return 1;
	} else {
		return (events & revents) != 0;
	}
}

static int
filt_chan_extended_common(struct knote *kn, long ev_hint)
{
	/*
	 * This function is not always called with the same set of locks
	 * held, hence it is only allowed to manipulate kn_fflags, and only
	 * with atomics: the f_event / f_process functions may run
	 * concurrently.
	 */
	uint32_t add_fflags = 0;

	if ((ev_hint & CHAN_FILT_HINT_FLOW_ADV_UPD) != 0) {
		add_fflags |= NOTE_FLOW_ADV_UPDATE;
	}
	if ((ev_hint & CHAN_FILT_HINT_CHANNEL_EVENT) != 0) {
		add_fflags |= NOTE_CHANNEL_EVENT;
	}
	if ((ev_hint & CHAN_FILT_HINT_IF_ADV_UPD) != 0) {
		add_fflags |= NOTE_IF_ADV_UPD;
	}
	if (add_fflags) {
		/* Reset any events that are not requested on this knote */
		add_fflags &= (kn->kn_sfflags & EVFILT_NW_CHANNEL_ALL_MASK);
		os_atomic_or(&kn->kn_fflags, add_fflags, relaxed);
		return add_fflags != 0;
	}
	return os_atomic_load(&kn->kn_fflags, relaxed) != 0;
}

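/*
 * Sync the event ring so that enqueued channel events become visible to
 * userspace, then set or clear CHAN_FILT_HINT_CHANNEL_EVENT in *hint to
 * reflect whether any events were found.
 */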
static inline void
che_process_channel_event(struct kern_channel *ch, struct knote *kn,
    uint32_t fflags, long *hint)
{
	int revents, event_error = 0;

	LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
	*hint &= ~CHAN_FILT_HINT_CHANNEL_EVENT;

	if (((ch->ch_flags & CHANF_EVENT_RING) != 0) &&
	    ((fflags & NOTE_CHANNEL_EVENT) != 0)) {
		/* capture new state to return */
		revents = ch_event(ch, POLLIN, NULL, knote_get_kq(kn)->kq_p,
		    NULL, TRUE, &event_error, TRUE);
		if (revents & POLLERR) {
			ASSERT(event_error != 0);
			/*
			 * Setting a knote error here will confuse libdispatch,
			 * so we use EV_EOF instead.
			 */
			kn->kn_flags |= EV_EOF;
		} else if ((revents & POLLIN) != 0) {
			*hint |= CHAN_FILT_HINT_CHANNEL_EVENT;
		}
	}
	/*
	 * If the sync operation on the event ring didn't find any events,
	 * indicate that the channel event is not active.
	 */
	if ((*hint & CHAN_FILT_HINT_CHANNEL_EVENT) == 0) {
		/*
		 * Avoid a costly atomic when the bit is already cleared.
		 */
		uint32_t knfflags = os_atomic_load(&kn->kn_fflags, relaxed);
		if (knfflags & CHAN_FILT_HINT_CHANNEL_EVENT) {
			os_atomic_andnot(&kn->kn_fflags,
			    CHAN_FILT_HINT_CHANNEL_EVENT, relaxed);
		}
	}
}

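/*
 * f_attach for EVFILT_NW_CHANNEL: hook the knote into the TX selinfo,
 * mark the adapter when NOTE_CHANNEL_EVENT is requested, and compute the
 * initial set of active extended events (forcing a flow-advisory event
 * on registration).
 */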
static int
filt_che_attach(struct knote *kn, __unused struct kevent_qos_s *kev)
{
	struct kern_channel *ch = (struct kern_channel *)kn->kn_hook;
	struct ch_selinfo *csi;
	long hint = 0;

	_CASSERT(CHAN_FILT_HINT_FLOW_ADV_UPD == NOTE_FLOW_ADV_UPDATE);
	_CASSERT(CHAN_FILT_HINT_CHANNEL_EVENT == NOTE_CHANNEL_EVENT);
	_CASSERT(CHAN_FILT_HINT_IF_ADV_UPD == NOTE_IF_ADV_UPD);

	ASSERT(kn->kn_filter == EVFILT_NW_CHANNEL);

	/* ch_kqfilter() should have acquired the lock */
	LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);

	csi = ch->ch_si[NR_TX];
	CSI_LOCK(csi);
	if (KNOTE_ATTACH(&csi->csi_si.si_note, kn)) {
		atomic_bitset_32(&csi->csi_flags, CSI_KNOTE);
	}
	CSI_UNLOCK(csi);

	if (__improbable(ch_filt_check_defunct(ch, kn))) {
		return 1;
	}
	if ((kn->kn_sfflags & NOTE_CHANNEL_EVENT) != 0) {
		atomic_bitset_32(&ch->ch_na->na_flags,
		    NAF_CHANNEL_EVENT_ATTACHED);
	}
	che_process_channel_event(ch, kn, kn->kn_sfflags, &hint);
	if ((kn->kn_sfflags & NOTE_FLOW_ADV_UPDATE) != 0) {
		/* on registration force an event */
		hint |= CHAN_FILT_HINT_FLOW_ADV_UPD;
	}
	SK_DF(SK_VERB_EVENTS, "na \"%s\" (0x%llx) ch 0x%llx kn 0x%llx (%s)",
	    ch->ch_na->na_name, SK_KVA(ch->ch_na), SK_KVA(ch), SK_KVA(kn),
	    "EVFILT_NW_CHANNEL");
	return filt_chan_extended_common(kn, hint);
}

static void
filt_che_detach(struct knote *kn)
{
	struct kern_channel *ch = (struct kern_channel *)kn->kn_hook;
	struct ch_selinfo *csi;

	ASSERT(kn->kn_filter == EVFILT_NW_CHANNEL);

	lck_mtx_lock(&ch->ch_lock);
	if ((kn->kn_sfflags & NOTE_CHANNEL_EVENT) != 0) {
		atomic_bitclear_32(&ch->ch_na->na_flags,
		    NAF_CHANNEL_EVENT_ATTACHED);
	}
	csi = ch->ch_si[NR_TX];
	CSI_LOCK(csi);
	if (KNOTE_DETACH(&csi->csi_si.si_note, kn)) {
		atomic_bitclear_32(&csi->csi_flags, CSI_KNOTE);
	}
	CSI_UNLOCK(csi);
	lck_mtx_unlock(&ch->ch_lock);

	SK_DF(SK_VERB_EVENTS, "na \"%s\" (0x%llx) ch 0x%llx kn 0x%llx (%s)",
	    ch->ch_na->na_name, SK_KVA(ch->ch_na), SK_KVA(ch), SK_KVA(kn),
	    "EVFILT_NW_CHANNEL");
}

static int
filt_che_event(struct knote *kn, long hint)
{
	struct kern_channel *ch = (struct kern_channel *)kn->kn_hook;

	ASSERT(kn->kn_filter == EVFILT_NW_CHANNEL);
	if (hint == 0) {
		return 0;
	}
	if (__improbable(ch_filt_check_defunct(ch, NULL))) {
		return 1;
	}
	if ((hint & CHAN_FILT_HINT_CHANNEL_EVENT) != 0) {
		VERIFY((ch->ch_flags & CHANF_EVENT_RING) != 0);
	}
	SK_DF(SK_VERB_EVENTS, "na \"%s\" (0x%llx) ch 0x%llx hint 0x%b)",
	    ch->ch_na->na_name, SK_KVA(ch->ch_na), SK_KVA(ch), hint,
	    CHAN_FILT_HINT_BITS);
	return filt_chan_extended_common(kn, hint);
}

static int
filt_che_touch(struct knote *kn, struct kevent_qos_s *kev)
{
	int ret;
	long hint = 0;
	struct kern_channel *ch = (struct kern_channel *)kn->kn_hook;

	ASSERT(kn->kn_filter == EVFILT_NW_CHANNEL);
	/* save off the new input fflags and data */
	kn->kn_sfflags = kev->fflags;
	kn->kn_sdata = kev->data;

	lck_mtx_lock(&ch->ch_lock);
	if (__improbable(ch_filt_check_defunct(ch, kn))) {
		ret = 1;
		goto done;
	}
	if ((kn->kn_sfflags & NOTE_CHANNEL_EVENT) != 0) {
		if (kev->flags & EV_ENABLE) {
			atomic_bitset_32(&ch->ch_na->na_flags,
			    NAF_CHANNEL_EVENT_ATTACHED);
		} else if (kev->flags & EV_DISABLE) {
			atomic_bitclear_32(&ch->ch_na->na_flags,
			    NAF_CHANNEL_EVENT_ATTACHED);
		}
	}
	che_process_channel_event(ch, kn, kn->kn_sfflags, &hint);
	ret = filt_chan_extended_common(kn, hint);
done:
	lck_mtx_unlock(&ch->ch_lock);
	return ret;
}

static int
filt_che_process(struct knote *kn, struct kevent_qos_s *kev)
{
	int ret;
	long hint = 0;
	struct kern_channel *ch = kn->kn_hook;

	ASSERT(kn->kn_filter == EVFILT_NW_CHANNEL);
	lck_mtx_lock(&ch->ch_lock);
	if (__improbable(ch_filt_check_defunct(ch, kn))) {
		ret = 1;
		goto done;
	}
	che_process_channel_event(ch, kn, kn->kn_sfflags, &hint);
	ret = filt_chan_extended_common(kn, hint);
done:
	lck_mtx_unlock(&ch->ch_lock);
	if (ret != 0) {
		/*
		 * This filter historically behaves like EV_CLEAR,
		 * even when EV_CLEAR wasn't set.
		 */
		knote_fill_kevent(kn, kev, 0);
		kn->kn_fflags = 0;
	}
	return ret;
}

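/*
 * kqueue(2) attach entry point for channels: validate the channel and
 * the filter type, pick the matching sub-filter, and invoke its f_attach
 * with the channel lock held.
 */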
int
ch_kqfilter(struct kern_channel *ch, struct knote *kn,
    struct kevent_qos_s *kev)
{
	int result;

	lck_mtx_lock(&ch->ch_lock);
	VERIFY(!(ch->ch_flags & CHANF_KERNEL));

	if (__improbable(ch->ch_na == NULL || !NA_IS_ACTIVE(ch->ch_na) ||
	    na_reject_channel(ch, ch->ch_na))) {
		SK_ERR("%s(%d): channel is non-permissive, flags 0x%b",
		    ch->ch_name, ch->ch_pid, ch->ch_flags, CHANF_BITS);
		knote_set_error(kn, ENXIO);
		lck_mtx_unlock(&ch->ch_lock);
		return 0;
	}

	switch (kn->kn_filter) {
	case EVFILT_READ:
		kn->kn_filtid = EVFILTID_SKYWALK_CHANNEL_R;
		break;

	case EVFILT_WRITE:
		kn->kn_filtid = EVFILTID_SKYWALK_CHANNEL_W;
		break;

	case EVFILT_NW_CHANNEL:
		kn->kn_filtid = EVFILTID_SKYWALK_CHANNEL_E;
		break;

	default:
		lck_mtx_unlock(&ch->ch_lock);
		SK_ERR("%s(%d): bad filter request %d", ch->ch_name,
		    ch->ch_pid, kn->kn_filter);
		knote_set_error(kn, EINVAL);
		return 0;
	}

	kn->kn_hook = ch;
	/* call the appropriate sub-filter attach with the channel lock held */
	result = knote_fops(kn)->f_attach(kn, kev);
	lck_mtx_unlock(&ch->ch_lock);
	return result;
}

boolean_t
ch_is_multiplex(struct kern_channel *ch, enum txrx t)
{
	return ch->ch_na != NULL && (ch->ch_last[t] - ch->ch_first[t] > 1);
}

int
ch_select(struct kern_channel *ch, int events, void *wql, struct proc *p)
{
	int revents;
	int event_error = 0;

	lck_mtx_lock(&ch->ch_lock);
	revents = ch_event(ch, events, wql, p, NULL, FALSE, &event_error,
	    FALSE);
	lck_mtx_unlock(&ch->ch_lock);

	ASSERT((revents & POLLERR) == 0 || event_error != 0);

	return revents;
}

#if SK_LOG
/* Hoisted out of line to reduce kernel stack footprint */
SK_LOG_ATTRIBUTE
static void
ch_event_log(const char *prefix, const struct kern_channel *ch,
    struct proc *p, const struct nexus_adapter *na,
    int events, int revents)
{
	SK_DF(SK_VERB_EVENTS, "%s: na \"%s\" (0x%llx) ch 0x%llx %s(%d) "
	    "th 0x%llx ev 0x%x rev 0x%x", prefix, na->na_name, SK_KVA(na),
	    SK_KVA(ch), sk_proc_name_address(p), sk_proc_pid(p),
	    SK_KVA(current_thread()), events, revents);
}
#endif /* SK_LOG */

/*
 * select(2), poll(2) and kevent(2) handler for channels.
 *
 * Can be called for one or more rings. Returns the event mask
 * corresponding to ready events. If there are no ready events, it does
 * a selrecord on either the individual selinfo or on the global one.
 * Device-dependent parts (locking and sync of tx/rx rings)
 * are done through callbacks.
 */
static int
ch_event(struct kern_channel *ch, int events, void *wql,
    struct proc *p, struct ch_event_result *result,
    const boolean_t is_kevent, int *errno, const boolean_t is_ch_event)
{
	struct nexus_adapter *na;
	struct __kern_channel_ring *kring;
	uint32_t i, check_all_tx, check_all_rx, want[NR_TXRX], revents = 0;
	uint32_t ready_tx_data = 0, ready_rx_data = 0;
	sk_protect_t protect = NULL;

#define want_tx want[NR_TX]
#define want_rx want[NR_RX]
	/*
	 * In order to avoid nested locks, we need to "double check"
	 * txsync and rxsync if we decide to do a selrecord().
	 * retry_tx (and retry_rx, later) prevent looping forever.
	 */
	boolean_t retry_tx = TRUE, retry_rx = TRUE;
	int found, error = 0;
	int s;

	net_update_uptime();

	LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
	ASSERT(!(ch->ch_flags & CHANF_KERNEL));

	*errno = 0;

	if (__improbable((ch->ch_flags & CHANF_DEFUNCT) ||
	    ch->ch_schema == NULL)) {
		SK_ERR("%s(%d): channel is defunct or no longer bound",
		    ch->ch_name, ch->ch_pid);
		revents = POLLERR;
		*errno = ENXIO;
		goto done;
	}

	/* clear CHANF_DEFUNCT_SKIP if it was set during defunct last time */
	if (__improbable(ch->ch_flags & CHANF_DEFUNCT_SKIP)) {
		atomic_bitclear_32(&ch->ch_flags, CHANF_DEFUNCT_SKIP);
	}

	na = ch->ch_na;
	if (__improbable(na == NULL ||
	    !NA_IS_ACTIVE(na) || na_reject_channel(ch, na))) {
		SK_ERR("%s(%d): channel is non-permissive",
		    ch->ch_name, ch->ch_pid);
		revents = POLLERR;
		*errno = ENXIO;
		goto done;
	}

	/* mark thread with sync-in-progress flag */
	protect = sk_sync_protect();

	/* update our work timestamp */
	na->na_work_ts = _net_uptime;

	/* and make this channel eligible for draining again */
	if (na->na_flags & NAF_DRAINING) {
		atomic_bitclear_32(&na->na_flags, NAF_DRAINING);
	}

#if SK_LOG
	if (__improbable((sk_verbose & SK_VERB_EVENTS) != 0)) {
		ch_event_log("enter", ch, p, na, events, revents);
	}
#endif
	if (is_ch_event) {
		goto process_channel_event;
	}

	want_tx = (events & (POLLOUT | POLLWRNORM));
	want_rx = (events & (POLLIN | POLLRDNORM));

	/*
	 * check_all_{tx|rx} are set if the channel has more than one ring
	 * AND the file descriptor is bound to all of them. If so, we sleep
	 * on the "global" selinfo; otherwise we sleep on the individual
	 * selinfo. The interrupt routine in the driver wakes one or the
	 * other (or both) depending on which clients are active.
	 *
	 * rxsync() is only called if we run out of buffers on a POLLIN.
	 * txsync() is called if we run out of buffers on POLLOUT.
	 */
	check_all_tx = ch_is_multiplex(ch, NR_TX);
	check_all_rx = ch_is_multiplex(ch, NR_RX);

	/*
	 * If want_tx is still set, we must issue txsync calls
	 * (on all rings, to avoid that the tx rings stall).
	 * XXX should also check head != khead on the tx rings.
	 */
	if (want_tx) {
		ring_id_t first_tx = ch->ch_first[NR_TX];
		ring_id_t last_tx = ch->ch_last[NR_TX];

		channel_threshold_unit_t tx_unit =
		    ch->ch_info->cinfo_tx_lowat.cet_unit;

		/*
		 * The first round checks if anyone is ready, if not
		 * do a selrecord and another round to handle races.
		 * want_tx goes to 0 if any space is found, and is
		 * used to skip rings with no pending transmissions.
		 */
flush_tx:
		for (i = first_tx, ready_tx_data = 0; i < last_tx; i++) {
			kring = &na->na_tx_rings[i];
			if (!want_tx &&
			    kring->ckr_ring->ring_head == kring->ckr_khead) {
				continue;
			}

			/* only one thread does txsync */
			s = kr_enter(kring, TRUE);
			ASSERT(s == 0);

			error = 0;
			DTRACE_SKYWALK2(pretxprologue, struct kern_channel *,
			    ch, struct __kern_channel_ring *, kring);
			if (kr_txsync_prologue(ch, kring, p) >=
			    kring->ckr_num_slots) {
				kr_log_bad_ring(kring);
				revents |= POLLERR;
				error = EFAULT;
				if (*errno == 0) {
					*errno = EFAULT;
				}
			} else {
				if (kring->ckr_na_sync(kring, p, 0)) {
					revents |= POLLERR;
					error = EIO;
					if (*errno == 0) {
						*errno = EIO;
					}
				} else {
					kr_txsync_finalize(ch, kring, p);
				}
			}
			DTRACE_SKYWALK3(posttxfinalize, struct kern_channel *,
			    ch, struct __kern_channel_ring *, kring, int,
			    error);

			/*
			 * If we found new slots, notify potential listeners on
			 * the same ring. Since we just did a txsync, look at
			 * the copies of cur,tail in the kring.
			 */
			found = kring->ckr_rhead != kring->ckr_rtail;
			kr_exit(kring);
			if (found) { /* notify other listeners */
				revents |= want_tx;
				want_tx = 0;
				(void) kring->ckr_na_notify(kring, p,
				    (is_kevent ? NA_NOTEF_IN_KEVENT : 0));
			}

			/*
			 * Add this ring's free data to our running
			 * tally for userspace.
			 */
			if (result != NULL) {
				switch (tx_unit) {
				case CHANNEL_THRESHOLD_UNIT_BYTES:
					ready_tx_data += kring->ckr_ready_bytes;
					break;
				case CHANNEL_THRESHOLD_UNIT_SLOTS:
					ready_tx_data += kring->ckr_ready_slots;
					break;
				}
			}
		}
		if (want_tx && retry_tx && !is_kevent) {
			if (check_all_tx) {
				csi_selrecord_all(na, NR_TX, p, wql);
			} else {
				csi_selrecord_one(&na->na_tx_rings[first_tx],
				    p, wql);
			}
			retry_tx = FALSE;
			goto flush_tx;
		}
	}

	/*
	 * If want_rx is still set scan receive rings.
	 * Do it on all rings because otherwise we starve.
	 */
	if (want_rx) {
		ring_id_t first_rx = ch->ch_first[NR_RX];
		ring_id_t last_rx = ch->ch_last[NR_RX];
		channel_threshold_unit_t rx_unit =
		    ch->ch_info->cinfo_rx_lowat.cet_unit;

		/* two rounds here for race avoidance */
do_retry_rx:
		for (i = first_rx, ready_rx_data = 0; i < last_rx; i++) {
			kring = &na->na_rx_rings[i];

			/* only one thread does rxsync */
			s = kr_enter(kring, TRUE);
			ASSERT(s == 0);

			error = 0;
			DTRACE_SKYWALK2(prerxprologue, struct kern_channel *,
			    ch, struct __kern_channel_ring *, kring);
			if (kr_rxsync_prologue(ch, kring, p) >=
			    kring->ckr_num_slots) {
				kr_log_bad_ring(kring);
				revents |= POLLERR;
				error = EFAULT;
				if (*errno == 0) {
					*errno = EFAULT;
				}
			} else {
				/* now we can use kring->rhead, rtail */
				if (kring->ckr_na_sync(kring, p, 0)) {
					revents |= POLLERR;
					error = EIO;
					if (*errno == 0) {
						*errno = EIO;
					}
				} else {
					kr_rxsync_finalize(ch, kring, p);
				}
			}

			DTRACE_SKYWALK3(postrxfinalize, struct kern_channel *,
			    ch, struct __kern_channel_ring *, kring, int,
			    error);

			found = kring->ckr_rhead != kring->ckr_rtail;
			kr_exit(kring);
			if (found) {
				revents |= want_rx;
				retry_rx = FALSE;
				(void) kring->ckr_na_notify(kring, p,
				    (is_kevent ? NA_NOTEF_IN_KEVENT : 0));
			}

			/*
			 * Add this ring's readable data to our running
			 * tally for userspace.
			 */
			if (result != NULL) {
				switch (rx_unit) {
				case CHANNEL_THRESHOLD_UNIT_BYTES:
					ready_rx_data += kring->ckr_ready_bytes;
					break;
				case CHANNEL_THRESHOLD_UNIT_SLOTS:
					ready_rx_data += kring->ckr_ready_slots;
					break;
				}
			}
		}

		if (retry_rx && !is_kevent) {
			if (check_all_rx) {
				csi_selrecord_all(na, NR_RX, p, wql);
			} else {
				csi_selrecord_one(&na->na_rx_rings[first_rx],
				    p, wql);
			}
		}
		if (retry_rx) {
			retry_rx = FALSE;
			goto do_retry_rx;
		}
	}

	if (result != NULL) {
		result->tx_data = ready_tx_data;
		result->rx_data = ready_rx_data;
	}
	goto skip_channel_event;

process_channel_event:
	/*
	 * perform sync operation on the event ring to make the channel
	 * events enqueued in the ring visible to user-space.
	 */

	/* select() and poll() not supported for event ring */
	ASSERT(is_kevent);
	VERIFY((ch->ch_last[NR_EV] - ch->ch_first[NR_EV]) == 1);
	kring = &na->na_event_rings[ch->ch_first[NR_EV]];

	/* only one thread does the sync */
	s = kr_enter(kring, TRUE);
	ASSERT(s == 0);
	if (kr_event_sync_prologue(kring, p) >= kring->ckr_num_slots) {
		kr_log_bad_ring(kring);
		revents |= POLLERR;
		if (*errno == 0) {
			*errno = EFAULT;
		}
	} else {
		if (kring->ckr_na_sync(kring, p, 0)) {
			revents |= POLLERR;
			if (*errno == 0) {
				*errno = EIO;
			}
		} else {
			kr_event_sync_finalize(ch, kring, p);
		}
	}
	found = (kring->ckr_rhead != kring->ckr_rtail);
	kr_exit(kring);
	if (found) {
		revents |= (events & POLLIN);
	}

skip_channel_event:
#if SK_LOG
	if (__improbable((sk_verbose & SK_VERB_EVENTS) != 0)) {
		ch_event_log("exit", ch, p, na, events, revents);
	}
#endif /* SK_LOG */

	/* unmark thread with sync-in-progress flag */
	sk_sync_unprotect(protect);

done:
	ASSERT(!sk_is_sync_protected());

	return revents;
#undef want_tx
#undef want_rx
}

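/*
 * Look up the channel that owns {port, ring_id} on the given nexus,
 * skipping monitor channels. Returns the channel with a reference held,
 * or NULL if no owner matches.
 */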
static struct kern_channel *
ch_find(struct kern_nexus *nx, nexus_port_t port, ring_id_t ring_id)
{
	struct kern_channel *ch;

	SK_LOCK_ASSERT_HELD();

	STAILQ_FOREACH(ch, &nx->nx_ch_head, ch_link) {
		struct ch_info *cinfo = ch->ch_info;

		/* see comments in ch_open() */
		if (cinfo->cinfo_nx_port != port) {
			continue;
		} else if (cinfo->cinfo_ch_mode & CHMODE_MONITOR) {
			continue;
		} else if (cinfo->cinfo_ch_ring_id != CHANNEL_RING_ID_ANY &&
		    ring_id != cinfo->cinfo_ch_ring_id &&
		    ring_id != CHANNEL_RING_ID_ANY) {
			continue;
		}

		/* found a match */
		break;
	}

	if (ch != NULL) {
		ch_retain_locked(ch);
	}

	return ch;
}

#if SK_LOG
/* Hoisted out of line to reduce kernel stack footprint */
SK_LOG_ATTRIBUTE
static void
ch_open_log1(const uuid_t p_uuid, struct proc *p, nexus_port_t port)
{
	uuid_string_t uuidstr;

	SK_D("%s(%d) uniqueid %llu exec_uuid %s port %u",
	    sk_proc_name_address(p), sk_proc_pid(p), proc_uniqueid(p),
	    sk_uuid_unparse(p_uuid, uuidstr), port);
}

SK_LOG_ATTRIBUTE
static void
ch_open_log2(struct proc *p, nexus_port_t port, ring_id_t ring,
    uint32_t mode, const char *mode_bits, int err)
{
	SK_D("%s(%d) port %u ring %d mode 0x%b err %d",
	    sk_proc_name_address(p), sk_proc_pid(p), port, (int)ring,
	    mode, mode_bits, err);
}
#endif /* SK_LOG */

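/*
 * Open a channel to a nexus port on behalf of a user process: look up
 * the nexus, enforce entitlements and ownership rules (one owner per
 * {port, ring_id} tuple; monitors require an existing owner), then
 * connect. Returns a channel with a reference held, or NULL with *err set.
 */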
struct kern_channel *
ch_open(struct ch_init *init, struct proc *p, int fd, int *err)
{
	uint32_t mode = init->ci_ch_mode;
	nexus_port_t port = init->ci_nx_port;
	ring_id_t ring = init->ci_ch_ring_id;
	struct kern_channel *ch = NULL, *ch0 = NULL;
	struct nxbind *nxb = NULL;
	struct kern_nexus *nx;
	struct chreq chr;
	uuid_t p_uuid;
	kauth_cred_t cred;

	cred = kauth_cred_get();
	ASSERT(!uuid_is_null(init->ci_nx_uuid));
	proc_getexecutableuuid(p, p_uuid, sizeof(p_uuid));
	*err = 0;

	/* make sure we don't allow userland to set kernel-only flags */
	mode &= CHMODE_MASK;

	SK_LOCK();

	nx = nx_find(init->ci_nx_uuid, TRUE);
	if (nx == NULL) {
		*err = ENOENT;
		goto done;
	}

	/* port (zero-based) must be within the domain's range */
	if (port >= NXDOM_MAX(NX_DOM(nx), ports)) {
		*err = EDOM;
		goto done;
	}
	VERIFY(port != NEXUS_PORT_ANY);

	if (mode & CHMODE_LOW_LATENCY) {
		if ((*err = skywalk_priv_check_cred(p, cred,
		    PRIV_SKYWALK_LOW_LATENCY_CHANNEL)) != 0) {
			goto done;
		}
	}

	/* "no copy" is valid only when at least one tx/rx mon flag is set */
	if (!(mode & CHMODE_MONITOR) && (mode & CHMODE_MONITOR_NO_COPY)) {
		mode &= ~CHMODE_MONITOR_NO_COPY;
	}

	if (mode & CHMODE_MONITOR) {
		if ((*err = skywalk_priv_check_cred(p, cred,
		    PRIV_SKYWALK_OBSERVE_ALL)) != 0) {
			goto done;
		}
		/* Don't allow non-root processes to monitor channels. */
		if (kauth_cred_issuser(cred) == 0) {
			*err = EPERM;
			goto done;
		}
	}

	/*
	 * Check with the nexus to see if the port is bound; if so, prepare
	 * our nxbind structure that we'll need to pass down to the nexus
	 * for it to compare. If the caller provides a key, we take it over
	 * and will free it ourselves (as part of freeing the nxbind.)
	 *
	 * If this is a monitor channel, skip this altogether since the check
	 * for PRIV_SKYWALK_OBSERVE_ALL privilege has been done above.
	 */
	if (!(mode & CHMODE_MONITOR) && !NX_ANONYMOUS_PROV(nx)) {
		void *key = (void *)(init->ci_key);

#if SK_LOG
		if (__improbable(sk_verbose != 0)) {
			ch_open_log1(p_uuid, p, port);
		}
#endif /* SK_LOG */

		nxb = nxb_alloc(Z_WAITOK);
		nxb->nxb_flags |= NXBF_MATCH_UNIQUEID;
		nxb->nxb_uniqueid = proc_uniqueid(p);
		nxb->nxb_pid = proc_pid(p);
		nxb->nxb_flags |= NXBF_MATCH_EXEC_UUID;
		uuid_copy(nxb->nxb_exec_uuid, p_uuid);
		if (key != NULL) {
			nxb->nxb_flags |= NXBF_MATCH_KEY;
			nxb->nxb_key_len = init->ci_key_len;
			nxb->nxb_key = key;
			init->ci_key = USER_ADDR_NULL; /* take over */
		}
	}

	/*
	 * There can only be one owner of {port,ring_id} tuple. Once
	 * owned, this can be made available among multiple monitors.
	 * CHANNEL_RING_ID_ANY (-1) ring_id gives exclusive rights over
	 * all rings. Further attempts to own any or all of the rings
	 * will be declined.
	 *
	 * Multiple monitors are allowed to exist. If a channel has been
	 * bound to CHANNEL_RING_ID_ANY, any or all of its rings can be
	 * monitored. If an owning channel has been bound to an individual
	 * ring, only that ring can be monitored, either by specifying the
	 * equivalent ring_id or CHANNEL_RING_ID_ANY at monitor open time.
	 *
	 * For example, assuming a 2-rings setup for port 'p':
	 *
	 *	owner{p,-1}
	 *	will allow:
	 *	    monitor{p,-1}, monitor{p,0}, monitor{p,1}
	 *	will not allow:
	 *	    owner{p,-1}, owner{p,0}, owner{p,1}
	 *
	 *	owner{p,0}
	 *	will allow:
	 *	    owner{p,1}, monitor{p,-1}, monitor{p,0}
	 *	will not allow:
	 *	    owner{p,-1}, owner{p,0}, monitor{p,1}
	 */
	if ((ch0 = ch_find(nx, port, ring)) != NULL) {
		SK_D("found ch0 0x%llx", SK_KVA(ch0));
		/*
		 * Unless this is a monitor channel, allow only at
		 * most one owner of the {port,ring_id} tuple.
		 */
		if (!(mode & CHMODE_MONITOR)) {
#if SK_LOG
			uuid_string_t uuidstr;
			char *na_name = (ch0->ch_na != NULL) ?
			    ch0->ch_na->na_name : "";

			SK_DSC(p, "ch %s flags (0x%x) exists on port %d on "
			    "nx %s, owner %s(%d)", na_name, ch0->ch_flags, port,
			    sk_uuid_unparse(nx->nx_uuid, uuidstr),
			    ch0->ch_name, ch0->ch_pid);
#endif /* SK_LOG */
			*err = EBUSY;
			goto done;
		}
	} else if (mode & CHMODE_MONITOR) {
		*err = ENXIO;
		goto done;
	}

	bzero(&chr, sizeof(chr));
	chr.cr_tx_lowat = init->ci_tx_lowat;
	chr.cr_rx_lowat = init->ci_rx_lowat;
	chr.cr_port = port;
	chr.cr_mode = mode;
	chr.cr_ring_id = ring;

	/* upon success, returns a channel with reference held */
	ch = ch_connect(nx, &chr, ch0, nxb, p, fd, err);

done:

#if SK_LOG
	if (__improbable(sk_verbose != 0)) {
		ch_open_log2(p, port, ring, mode, CHMODE_BITS, *err);
	}
#endif /* SK_LOG */

	if (ch0 != NULL) {
		(void) ch_release_locked(ch0);
	}

	if (nx != NULL) {
		(void) nx_release_locked(nx);
	}

	if (nxb != NULL) {
		nxb_free(nxb);
	}

	SK_UNLOCK();

	return ch;
}

struct kern_channel *
ch_open_special(struct kern_nexus *nx, struct chreq *chr, boolean_t nonxref,
    int *err)
{
	struct kern_channel *ch = NULL;

	SK_LOCK_ASSERT_HELD();
	*err = 0;

	ASSERT((chr->cr_mode & CHMODE_USER_PACKET_POOL) == 0);
	ASSERT((chr->cr_mode & CHMODE_EVENT_RING) == 0);
	ASSERT((chr->cr_mode & CHMODE_LOW_LATENCY) == 0);
	ASSERT(!uuid_is_null(chr->cr_spec_uuid));
	chr->cr_mode |= CHMODE_KERNEL;
	if (nonxref) {
		chr->cr_mode |= CHMODE_NO_NXREF;
	} else {
		chr->cr_mode &= ~CHMODE_NO_NXREF;
	}

	/* upon success, returns a channel with reference held */
	ch = ch_connect(nx, chr, NULL, NULL, kernproc, -1, err);
	if (ch != NULL) {
		/*
		 * nonxref channels don't hold any reference to the nexus,
		 * since otherwise we'll never be able to close them when
		 * the last regular channel of the nexus is closed, as part
		 * of the nexus's destructor operation. Release the nonxref
		 * channel reference now, but make sure the nexus has at
		 * least 3 refs: global list, provider list and the nonxref
		 * channel itself, before doing that.
		 */
		if (nonxref) {
			ASSERT(ch->ch_flags & (CHANF_KERNEL | CHANF_NONXREF));
			ASSERT(nx->nx_refcnt > 3);
			(void) nx_release_locked(nx);
		}
	}

#if SK_LOG
	uuid_string_t uuidstr;
	SK_D("nx 0x%llx (%s:\"%s\":%d:%d) spec_uuid \"%s\" mode 0x%b err %d",
	    SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name, (ch != NULL ?
	    ch->ch_na->na_name : ""), (int)chr->cr_port, (int)chr->cr_ring_id,
	    sk_uuid_unparse(chr->cr_spec_uuid, uuidstr), chr->cr_mode,
	    CHMODE_BITS, *err);
#endif /* SK_LOG */

	return ch;
}

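/*
 * Common close path for regular and kernel (special) channels: detach
 * from interface advisory notifications, mark the channel closing,
 * disconnect it from the nexus, and drop the list references.
 */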
1754 static void
ch_close_common(struct kern_channel * ch,boolean_t locked,boolean_t special)1755 ch_close_common(struct kern_channel *ch, boolean_t locked, boolean_t special)
1756 {
1757 #pragma unused(special)
1758 #if SK_LOG
1759 uuid_string_t uuidstr;
1760 const char *na_name = (ch->ch_na != NULL) ?
1761 ch->ch_na->na_name : "";
1762 const char *nxdom_name = (ch->ch_nexus != NULL) ?
1763 NX_DOM(ch->ch_nexus)->nxdom_name : "";
1764 const char *nxdom_prov_name = (ch->ch_nexus != NULL) ?
1765 NX_DOM_PROV(ch->ch_nexus)->nxdom_prov_name : "";
1766
1767 SK_D("ch 0x%llx (%s:%s:\"%s\":%u:%d)",
1768 SK_KVA(ch), nxdom_name, nxdom_prov_name, na_name,
1769 ch->ch_info->cinfo_nx_port, (int)ch->ch_info->cinfo_ch_ring_id);
1770 SK_D(" UUID: %s", sk_uuid_unparse(ch->ch_info->cinfo_ch_id,
1771 uuidstr));
1772 SK_D(" flags: 0x%b", ch->ch_flags, CHANF_BITS);
1773 #endif /* SK_LOG */
1774 struct kern_nexus *nx = ch->ch_nexus;
1775
1776 if (!locked) {
1777 SK_LOCK();
1778 }
1779
1780 SK_LOCK_ASSERT_HELD();
1781 /*
1782 * If the channel is participating in the interface advisory
1783 * notification, remove it from the nexus.
1784 * CHANF_IF_ADV is set and cleared only when nx_ch_if_adv_lock
1785 * is held in exclusive mode.
1786 */
1787 lck_rw_lock_exclusive(&nx->nx_ch_if_adv_lock);
1788 if ((ch->ch_flags & CHANF_IF_ADV) != 0) {
1789 STAILQ_REMOVE(&nx->nx_ch_if_adv_head, ch,
1790 kern_channel, ch_link_if_adv);
1791 atomic_bitclear_32(&ch->ch_flags, CHANF_IF_ADV);
1792 if (STAILQ_EMPTY(&nx->nx_ch_if_adv_head)) {
1793 nx_netif_config_interface_advisory(nx, false);
1794 }
1795 lck_rw_done(&nx->nx_ch_if_adv_lock);
1796 lck_mtx_lock(&ch->ch_lock);
1797 (void) ch_release_locked(ch);
1798 } else {
1799 lck_rw_done(&nx->nx_ch_if_adv_lock);
1800 lck_mtx_lock(&ch->ch_lock);
1801 }
1802 LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
1803 /*
1804 * Mark the channel as closing to prevent further setopt requests;
1805 * this flag is set once here and never gets cleared.
1806 */
1807 ASSERT(!(ch->ch_flags & CHANF_CLOSING));
1808 atomic_bitset_32(&ch->ch_flags, CHANF_CLOSING);
1809
1810 if (special) {
1811 VERIFY(ch->ch_flags & CHANF_KERNEL);
1812 } else {
1813 VERIFY(!(ch->ch_flags & CHANF_KERNEL));
1814 }
1815
1816 ch->ch_fd = -1;
1817
1818 /* may be called as part of failure cleanup, so check */
1819 if (ch->ch_flags & CHANF_ATTACHED) {
1820 boolean_t nonxref = !!(ch->ch_flags & CHANF_NONXREF);
1821
1822 /* caller must hold an extra ref */
1823 ASSERT(ch->ch_refcnt > 1);
1824
1825 /* disconnect from nexus */
1826 ch_disconnect(ch);
1827
1828 /*
1829 * If this was the last regular channel and the nexus
1830 * has been closed, detach it and finish up the job.
1831 * If this was a nonxref channel, there is nothing
1832 * left to do; see comments in ch_open_special().
1833 */
1834 if (!nonxref) {
1835 STAILQ_REMOVE(&nx->nx_ch_head, ch,
1836 kern_channel, ch_link);
1837 nx->nx_ch_count--;
1838 if (STAILQ_EMPTY(&nx->nx_ch_head) &&
1839 (nx->nx_flags & NXF_CLOSED)) {
1840 ASSERT(STAILQ_EMPTY(&nx->nx_ch_if_adv_head));
1841 nx_detach(nx);
1842 }
1843 (void) nx_release_locked(nx);
1844 } else {
1845 ASSERT(ch->ch_flags & CHANF_KERNEL);
1846 STAILQ_REMOVE(&nx->nx_ch_nonxref_head, ch,
1847 kern_channel, ch_link);
1848 }
1849
1850 atomic_bitclear_32(&ch->ch_flags, CHANF_ATTACHED);
1851 ch->ch_nexus = NULL;
1852
1853 (void) ch_release_locked(ch); /* for the list */
1854 }
1855
1856 lck_mtx_unlock(&ch->ch_lock);
1857 if (!locked) {
1858 SK_UNLOCK();
1859 }
1860 }
1861
1862 void
1863 ch_close(struct kern_channel *ch, boolean_t locked)
1864 {
1865 ch_close_common(ch, locked, FALSE);
1866 }
1867
1868 void
1869 ch_close_special(struct kern_channel *ch)
1870 {
1871 ch_close_common(ch, TRUE, TRUE);
1872 }
1873
1874 static int
1875 ch_ev_thresh_validate(struct kern_nexus *nx, enum txrx t,
1876 struct ch_ev_thresh *cet)
1877 {
1878 struct nxprov_params *nxp = NX_PROV(nx)->nxprov_params;
1879 uint32_t bmin, bmax, smin, smax;
1880 int err = 0;
1881
1882 if (cet->cet_unit != CHANNEL_THRESHOLD_UNIT_BYTES &&
1883 cet->cet_unit != CHANNEL_THRESHOLD_UNIT_SLOTS) {
1884 err = EINVAL;
1885 goto done;
1886 }
1887
1888 smin = 1; /* minimum 1 slot */
1889 bmin = 1; /* minimum 1 byte */
1890
1891 if (t == NR_TX) {
1892 ASSERT(nxp->nxp_tx_slots > 0);
1893 smax = (nxp->nxp_tx_slots - 1);
1894 } else {
1895 ASSERT(nxp->nxp_rx_slots > 0);
1896 smax = (nxp->nxp_rx_slots - 1);
1897 }
1898 bmax = (smax * nxp->nxp_buf_size);
1899
1900 switch (cet->cet_unit) {
1901 case CHANNEL_THRESHOLD_UNIT_BYTES:
1902 if (cet->cet_value < bmin) {
1903 cet->cet_value = bmin;
1904 } else if (cet->cet_value > bmax) {
1905 cet->cet_value = bmax;
1906 }
1907 break;
1908
1909 case CHANNEL_THRESHOLD_UNIT_SLOTS:
1910 if (cet->cet_value < smin) {
1911 cet->cet_value = smin;
1912 } else if (cet->cet_value > smax) {
1913 cet->cet_value = smax;
1914 }
1915 break;
1916 }
1917
1918 done:
1919 return err;
1920 }
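
/*
 * Worked example of the clamping above (numbers assumed for
 * illustration, not taken from this file): with nxp_tx_slots == 256
 * and nxp_buf_size == 2048, a TX threshold is pinned to [1, 255]
 * slots, or [1, 255 * 2048] == [1, 522240] bytes. Out-of-range
 * values are silently clamped to the nearest bound rather than
 * rejected, so callers should read the threshold back to learn the
 * effective setting (ch_set_lowat_thresh() copies it back out).
 */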
1921
1922 #if SK_LOG
1923 /* Hoisted out of line to reduce kernel stack footprint */
1924 SK_LOG_ATTRIBUTE
1925 static void
1926 ch_connect_log1(const struct kern_nexus *nx, const struct ch_info *cinfo,
1927 const struct chreq *chr, const struct kern_channel *ch,
1928 const struct kern_nexus_domain_provider *nxdom_prov,
1929 struct proc *p)
1930 {
1931 struct __user_channel_schema *ch_schema = ch->ch_schema;
1932 uuid_string_t uuidstr;
1933 unsigned int n;
1934 ring_id_t i, j;
1935
1936 ASSERT(ch_schema != NULL || (ch->ch_flags & CHANF_KERNEL));
1937 if (ch_schema != NULL) {
1938 SK_D("channel_schema at 0x%llx", SK_KVA(ch_schema));
1939 SK_D(" kern_name: \"%s\"", ch_schema->csm_kern_name);
1940 SK_D(" kern_uuid: %s",
1941 sk_uuid_unparse(ch_schema->csm_kern_uuid, uuidstr));
1942 SK_D(" flags: 0x%b", ch_schema->csm_flags, CSM_BITS);
1943 SK_D(" tx_rings: %u [%u,%u]", ch_schema->csm_tx_rings,
1944 cinfo->cinfo_first_tx_ring, cinfo->cinfo_last_tx_ring);
1945 SK_D(" rx_rings: %u [%u,%u]", ch_schema->csm_rx_rings,
1946 cinfo->cinfo_first_rx_ring, cinfo->cinfo_last_rx_ring);
1947
1948 j = ch->ch_last[NR_TX];
1949 for (n = 0, i = ch->ch_first[NR_TX]; i < j; n++, i++) {
1950 SK_D(" tx_ring_%u_off: 0x%llx", i,
1951 (uint64_t)ch_schema->csm_ring_ofs[n].ring_off);
1952 SK_D(" tx_sd_%u_off: 0x%llx", i,
1953 (uint64_t)ch_schema->csm_ring_ofs[n].sd_off);
1954 }
1955 j = n;
1956 for (n = 0, i = ch->ch_first[NR_RX];
1957 i < ch->ch_last[NR_RX]; n++, i++) {
1958 SK_D(" rx_ring_%u_off: 0x%llx", i,
1959 (uint64_t)ch_schema->csm_ring_ofs[n + j].ring_off);
1960 SK_D(" rx_sd_%u_off: 0x%llx", i,
1961 (uint64_t)ch_schema->csm_ring_ofs[n + j].sd_off);
1962 }
1963 SK_D(" md_type: %u", ch_schema->csm_md_type);
1964 SK_D(" md_subtype: %u", ch_schema->csm_md_subtype);
1965 SK_D(" stats_ofs: 0x%llx", ch_schema->csm_stats_ofs);
1966 SK_D(" stats_type: %u", ch_schema->csm_stats_type);
1967 SK_D(" flowadv_ofs: 0x%llx", ch_schema->csm_flowadv_ofs);
1968 SK_D(" flowadv_max: %u", ch_schema->csm_flowadv_max);
1969 SK_D(" nexusadv_ofs: 0x%llx", ch_schema->csm_nexusadv_ofs);
1970 }
1971
1972 SK_D("ch 0x%llx (%s:%s:\"%s\":%u:%d)",
1973 SK_KVA(ch), nxdom_prov->nxdom_prov_dom->nxdom_name,
1974 nxdom_prov->nxdom_prov_name, ch->ch_na->na_name,
1975 cinfo->cinfo_nx_port, (int)cinfo->cinfo_ch_ring_id);
1976 SK_D(" ch UUID: %s", sk_uuid_unparse(cinfo->cinfo_ch_id, uuidstr));
1977 SK_D(" nx UUID: %s", sk_uuid_unparse(nx->nx_uuid, uuidstr));
1978 SK_D(" flags: 0x%b", ch->ch_flags, CHANF_BITS);
1979 SK_D(" task: 0x%llx %s(%d)", SK_KVA(ch->ch_mmap.ami_maptask),
1980 sk_proc_name_address(p), sk_proc_pid(p));
1981 SK_D(" txlowat: %u (%s)", cinfo->cinfo_tx_lowat.cet_value,
1982 ((cinfo->cinfo_tx_lowat.cet_unit == CHANNEL_THRESHOLD_UNIT_BYTES) ?
1983 "bytes" : "slots"));
1984 SK_D(" rxlowat: %u (%s)", cinfo->cinfo_rx_lowat.cet_value,
1985 ((cinfo->cinfo_rx_lowat.cet_unit == CHANNEL_THRESHOLD_UNIT_BYTES) ?
1986 "bytes" : "slots"));
1987 SK_D(" mmapref: 0x%llx", SK_KVA(ch->ch_mmap.ami_mapref));
1988 SK_D(" mapaddr: 0x%llx", (uint64_t)cinfo->cinfo_mem_base);
1989 SK_D(" mapsize: 0x%llx (%llu KB)",
1990 (uint64_t)cinfo->cinfo_mem_map_size,
1991 (uint64_t)cinfo->cinfo_mem_map_size >> 10);
1992 SK_D(" memsize: 0x%llx (%llu KB)",
1993 (uint64_t)chr->cr_memsize, (uint64_t)chr->cr_memsize >> 10);
1994 SK_D(" offset: 0x%llx", (uint64_t)cinfo->cinfo_schema_offset);
1995 }
1996
1997 SK_LOG_ATTRIBUTE
1998 static void
1999 ch_connect_log2(const struct kern_nexus *nx, int err)
2000 {
2001 uuid_string_t nx_uuidstr;
2002
2003 SK_ERR("Error connecting to nexus UUID %s: %d",
2004 sk_uuid_unparse(nx->nx_uuid, nx_uuidstr), err);
2005 }
2006 #endif /* SK_LOG */
2007
2008 static struct kern_channel *
2009 ch_connect(struct kern_nexus *nx, struct chreq *chr, struct kern_channel *ch0,
2010 struct nxbind *nxb, struct proc *p, int fd, int *err)
2011 {
2012 struct kern_nexus_domain_provider *nxdom_prov;
2013 struct kern_channel *ch = NULL;
2014 struct ch_info *cinfo = NULL;
2015 uint32_t ch_mode = chr->cr_mode;
2016 boolean_t config = FALSE;
2017 struct nxdom *nxdom;
2018 boolean_t reserved_port = FALSE;
2019
2020 ASSERT(!(ch_mode & CHMODE_KERNEL) || p == kernproc);
2021 ASSERT(chr->cr_port != NEXUS_PORT_ANY || (ch_mode & CHMODE_KERNEL));
2022 SK_LOCK_ASSERT_HELD();
2023
2024 /* validate thresholds before we proceed any further */
2025 if ((*err = ch_ev_thresh_validate(nx, NR_TX, &chr->cr_tx_lowat)) != 0 ||
2026 (*err = ch_ev_thresh_validate(nx, NR_RX, &chr->cr_rx_lowat)) != 0) {
2027 goto done;
2028 }
2029
2030 if (!(ch_mode & CHMODE_KERNEL) && !NX_USER_CHANNEL_PROV(nx)) {
2031 *err = ENOTSUP;
2032 goto done;
2033 }
2034
2035 ch = ch_alloc(Z_WAITOK);
2036
2037 lck_mtx_lock(&ch->ch_lock);
2038
2039 uuid_generate_random(ch->ch_info->cinfo_ch_id);
2040 ch->ch_fd = fd;
2041 ch->ch_pid = proc_pid(p);
2042 (void) snprintf(ch->ch_name, sizeof(ch->ch_name), "%s",
2043 proc_name_address(p));
2044
2045 nxdom_prov = NX_DOM_PROV(nx);
2046 nxdom = NX_DOM(nx);
2047
2048 if (ch_mode & (CHMODE_KERNEL | CHMODE_NO_NXREF)) {
2049 /*
2050 * CHANF_KERNEL implies a channel opened by a kernel
2051 * subsystem, and is triggered by the CHMODE_KERNEL
2052 * flag which is (only ever) set by ch_open_special().
2053 *
2054 * CHANF_NONXREF can be optionally set based on the
2055 * CHMODE_NO_NXREF request flag. This must only be
2056 * set by ch_open_special() as well, hence we verify.
2057 */
2058 ASSERT(p == kernproc);
2059 ASSERT(ch_mode & CHMODE_KERNEL);
2060 atomic_bitset_32(&ch->ch_flags, CHANF_KERNEL);
2061 if (ch_mode & CHMODE_NO_NXREF) {
2062 atomic_bitset_32(&ch->ch_flags, CHANF_NONXREF);
2063 }
2064
2065 config = (ch_mode & CHMODE_CONFIG) != 0;
2066 if (chr->cr_port == NEXUS_PORT_ANY) {
2067 if (nxdom->nxdom_find_port == NULL) {
2068 *err = ENOTSUP;
2069 goto done;
2070 }
2071
2072 /*
2073 * If ephemeral port request, find one for client;
2074 * we ask for the reserved port range if this is
2075 * a configuration request (CHMODE_CONFIG).
2076 */
2077 if ((*err = nxdom->nxdom_find_port(nx,
2078 config, &chr->cr_port)) != 0) {
2079 goto done;
2080 }
2081 }
2082 }
2083
2084 if (skywalk_check_platform_binary(p)) {
2085 atomic_bitset_32(&ch->ch_flags, CHANF_PLATFORM);
2086 }
2087
2088 ASSERT(chr->cr_port != NEXUS_PORT_ANY);
2089
2090 reserved_port = (nxdom->nxdom_port_is_reserved != NULL &&
2091 (*nxdom->nxdom_port_is_reserved)(nx, chr->cr_port));
2092 if (!config && reserved_port) {
2093 *err = EDOM;
2094 goto done;
2095 }
2096
2097 SK_D("%s(%d) %snexus port %u requested", sk_proc_name_address(p),
2098 sk_proc_pid(p), reserved_port ? "[reserved] " : "", chr->cr_port);
2099
2100 if ((*err = nxdom_prov->nxdom_prov_dom->nxdom_connect(nxdom_prov,
2101 nx, ch, chr, ch0, nxb, p)) != 0) {
2102 goto done;
2103 }
2104
2105 cinfo = ch->ch_info;
2106 uuid_copy(cinfo->cinfo_nx_uuid, nx->nx_uuid);
2107 /* for easy access to immutables */
2108 bcopy((void *)nx->nx_prov->nxprov_params,
2109 (void *)&cinfo->cinfo_nxprov_params, sizeof(struct nxprov_params));
2110 cinfo->cinfo_ch_mode = ch_mode;
2111 cinfo->cinfo_ch_ring_id = chr->cr_ring_id;
2112 cinfo->cinfo_nx_port = chr->cr_port;
2113 cinfo->cinfo_mem_base = ch->ch_mmap.ami_mapaddr;
2114 cinfo->cinfo_mem_map_size = ch->ch_mmap.ami_mapsize;
2115 cinfo->cinfo_schema_offset = chr->cr_memoffset;
2116 cinfo->cinfo_num_bufs =
2117 PP_BUF_REGION_DEF(skmem_arena_nexus(ch->ch_na->na_arena)->arn_rx_pp)->skr_params.srp_c_obj_cnt;
2118 /*
2119 * ch_last is really the number of rings, but we need to return
2120 * the actual zero-based ring ID to the client. Make sure that
2121 * is the case here and adjust last_{tx,rx}_ring accordingly.
2122 */
2123 ASSERT((ch->ch_last[NR_TX] > 0) ||
2124 (ch->ch_na->na_type == NA_NETIF_COMPAT_DEV));
2125 ASSERT((ch->ch_last[NR_RX] > 0) ||
2126 (ch->ch_na->na_type == NA_NETIF_COMPAT_HOST));
2127 cinfo->cinfo_first_tx_ring = ch->ch_first[NR_TX];
2128 cinfo->cinfo_last_tx_ring = ch->ch_last[NR_TX] - 1;
2129 cinfo->cinfo_first_rx_ring = ch->ch_first[NR_RX];
2130 cinfo->cinfo_last_rx_ring = ch->ch_last[NR_RX] - 1;
2131 cinfo->cinfo_tx_lowat = chr->cr_tx_lowat;
2132 cinfo->cinfo_rx_lowat = chr->cr_rx_lowat;
2133
2134 if (ch_mode & CHMODE_NO_NXREF) {
2135 ASSERT(ch_mode & CHMODE_KERNEL);
2136 STAILQ_INSERT_TAIL(&nx->nx_ch_nonxref_head, ch, ch_link);
2137 } else {
2138 STAILQ_INSERT_TAIL(&nx->nx_ch_head, ch, ch_link);
2139 nx->nx_ch_count++;
2140 }
2141 atomic_bitset_32(&ch->ch_flags, CHANF_ATTACHED);
2142 ch->ch_nexus = nx;
2143 nx_retain_locked(nx); /* hold a ref on the nexus */
2144
2145 ch_retain_locked(ch); /* one for being in the list */
2146 ch_retain_locked(ch); /* one for the caller */
2147
2148 /*
2149 * Now that we've successfully created the nexus adapter, inform the
2150 * nexus provider about the rings and the slots within each ring.
2151 * This is a no-op for internal nexus providers.
2152 */
2153 if ((*err = nxprov_advise_connect(nx, ch, p)) != 0) {
2154 lck_mtx_unlock(&ch->ch_lock);
2155
2156 /* gracefully close this fully-formed channel */
2157 if (ch->ch_flags & CHANF_KERNEL) {
2158 ch_close_special(ch);
2159 } else {
2160 ch_close(ch, TRUE);
2161 }
2162 (void) ch_release_locked(ch);
2163 ch = NULL;
2164 goto done;
2165 }
2166
2167 ASSERT(ch->ch_schema == NULL ||
2168 (ch->ch_schema->csm_flags & CSM_ACTIVE));
2169
2170 #if SK_LOG
2171 if (__improbable(sk_verbose != 0)) {
2172 ch_connect_log1(nx, cinfo, chr, ch, nxdom_prov, p);
2173 }
2174 #endif /* SK_LOG */
2175
2176 done:
2177 if (ch != NULL) {
2178 lck_mtx_unlock(&ch->ch_lock);
2179 }
2180 if (*err != 0) {
2181 #if SK_LOG
2182 if (__improbable(sk_verbose != 0)) {
2183 ch_connect_log2(nx, *err);
2184 }
2185 #endif /* SK_LOG */
2186 if (ch != NULL) {
2187 ch_free(ch);
2188 ch = NULL;
2189 }
2190 }
2191 return ch;
2192 }
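
/*
 * Reference/ownership summary for a successful ch_connect(), restating
 * the code above as a sketch: the new channel ends up with two refs
 * (one for sitting on the nexus channel list, one returned to the
 * caller), and the nexus gains one ref for the attached channel.
 * ch_close_common() later undoes the list ref and the nexus ref; the
 * caller's ref is dropped separately via ch_release_locked(), as
 * ch_dtor() does.
 */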
2193
2194 static void
2195 ch_disconnect(struct kern_channel *ch)
2196 {
2197 struct kern_nexus *nx = ch->ch_nexus;
2198 struct kern_nexus_domain_provider *nxdom_prov = NX_DOM_PROV(nx);
2199
2200 SK_LOCK_ASSERT_HELD();
2201 LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
2202
2203 /*
2204 * Inform the nexus provider that the channel has been quiesced
2205 * and disconnected from the nexus port. This is a no-op for
2206 * internal nexus providers.
2207 */
2208 nxprov_advise_disconnect(nx, ch);
2209
2210 /* Finally, let the domain provider tear down the instance */
2211 nxdom_prov->nxdom_prov_dom->nxdom_disconnect(nxdom_prov, nx, ch);
2212 }
2213
2214 void
2215 ch_deactivate(struct kern_channel *ch)
2216 {
2217 /*
2218 * This is a trapdoor flag; once CSM_ACTIVE is cleared,
2219 * it will never be set again. Doing this will cause
2220 * os_channel_is_defunct() to indicate that the channel
2221 * is defunct and is no longer usable (thus should be
2222 * immediately closed).
2223 */
2224 if (ch->ch_schema != NULL &&
2225 (ch->ch_schema->csm_flags & CSM_ACTIVE)) {
2226 atomic_bitclear_32(__DECONST(uint32_t *,
2227 &ch->ch_schema->csm_flags), CSM_ACTIVE);
2228 /* make this globally visible */
2229 membar_sync();
2230 }
2231 }
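
/*
 * Client-side effect of the trapdoor (illustrative sketch; the actual
 * check is implemented by os_channel_is_defunct() in userspace, whose
 * internals are outside this file). A reader of the shared schema
 * would observe something like:
 *
 *	if (!(csm->csm_flags & CSM_ACTIVE)) {
 *		// defunct: tear down state and close the channel
 *	}
 *
 * The membar_sync() above ensures the cleared bit is globally visible
 * before this thread proceeds.
 */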
2232
2233 int
2234 ch_set_opt(struct kern_channel *ch, struct sockopt *sopt)
2235 {
2237 int err = 0;
2238
2239 if (sopt->sopt_dir != SOPT_SET) {
2240 sopt->sopt_dir = SOPT_SET;
2241 }
2242
2243 switch (sopt->sopt_name) {
2244 case CHOPT_TX_LOWAT_THRESH:
2245 err = ch_set_lowat_thresh(ch, NR_TX, sopt);
2246 break;
2247
2248 case CHOPT_RX_LOWAT_THRESH:
2249 err = ch_set_lowat_thresh(ch, NR_RX, sopt);
2250 break;
2251
2252 case CHOPT_IF_ADV_CONF:
2253 err = ch_configure_interface_advisory_event(ch, sopt);
2254 break;
2255
2256 default:
2257 err = ENOPROTOOPT;
2258 break;
2259 }
2260
2261 return err;
2262 }
2263
2264 int
2265 ch_get_opt(struct kern_channel *ch, struct sockopt *sopt)
2266 {
2268 int err = 0;
2269
2270 if (sopt->sopt_dir != SOPT_GET) {
2271 sopt->sopt_dir = SOPT_GET;
2272 }
2273
2274 switch (sopt->sopt_name) {
2275 case CHOPT_TX_LOWAT_THRESH:
2276 err = ch_get_lowat_thresh(ch, NR_TX, sopt);
2277 break;
2278
2279 case CHOPT_RX_LOWAT_THRESH:
2280 err = ch_get_lowat_thresh(ch, NR_RX, sopt);
2281 break;
2282
2283 default:
2284 err = ENOPROTOOPT;
2285 break;
2286 }
2287
2288 return err;
2289 }
2290
2291 static int
2292 ch_configure_interface_advisory_event(struct kern_channel *ch,
2293 struct sockopt *sopt)
2294 {
2295 int err = 0;
2296 boolean_t enable = FALSE;
2297 struct kern_nexus *nx = ch->ch_nexus;
2298
2299 LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
2300 SK_LOCK_ASSERT_NOTHELD();
2301
2302 if (sopt->sopt_val == USER_ADDR_NULL) {
2303 return EINVAL;
2304 }
2305 if (nx->nx_adv.nxv_adv == NULL) {
2306 return ENOTSUP;
2307 }
2308 err = sooptcopyin(sopt, &enable, sizeof(enable), sizeof(enable));
2309 if (err != 0) {
2310 return err;
2311 }
2312
2313 /*
2314 * Drop ch_lock to acquire sk_lock and nx_ch_if_adv_lock due to lock
2315 * ordering requirement; check if the channel is closing once ch_lock
2316 * is reacquired and bail if so.
2317 */
2318 lck_mtx_unlock(&ch->ch_lock);
2319 SK_LOCK();
2320 lck_rw_lock_exclusive(&nx->nx_ch_if_adv_lock);
2321 lck_mtx_lock(&ch->ch_lock);
2322 if (ch->ch_flags & CHANF_CLOSING) {
2323 err = ENXIO;
2324 goto done;
2325 }
2326
2327 /*
2328 * If interface advisory reporting is enabled on the channel, add
2329 * the channel to the list of channels on the nexus that are eligible
2330 * for interface advisory updates; if disabled, remove it from the list.
2331 */
2332 if (enable) {
2333 if ((ch->ch_flags & CHANF_IF_ADV) != 0) {
2334 ASSERT(err == 0);
2335 goto done;
2336 }
2337 bool enable_adv = STAILQ_EMPTY(&nx->nx_ch_if_adv_head);
2338 atomic_bitset_32(&ch->ch_flags, CHANF_IF_ADV);
2339 STAILQ_INSERT_TAIL(&nx->nx_ch_if_adv_head, ch, ch_link_if_adv);
2340 if (enable_adv) {
2341 nx_netif_config_interface_advisory(nx, true);
2342 }
2343 ch_retain_locked(ch); /* for being in the IF ADV list */
2344 } else {
2345 if ((ch->ch_flags & CHANF_IF_ADV) == 0) {
2346 ASSERT(err == 0);
2347 goto done;
2348 }
2349 STAILQ_REMOVE(&nx->nx_ch_if_adv_head, ch, kern_channel,
2350 ch_link_if_adv);
2351 atomic_bitclear_32(&ch->ch_flags, CHANF_IF_ADV);
2352 if (STAILQ_EMPTY(&nx->nx_ch_if_adv_head)) {
2353 nx_netif_config_interface_advisory(nx, false);
2354 }
2355 (void) ch_release_locked(ch);
2356 }
2357
2358 done:
2359 lck_mtx_unlock(&ch->ch_lock);
2360 lck_rw_done(&nx->nx_ch_if_adv_lock);
2361 SK_UNLOCK();
2362 lck_mtx_lock(&ch->ch_lock);
2363
2364 return err;
2365 }
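
/*
 * Lock-ordering note for the drop/reacquire dance above (ordering
 * inferred from this function: sk_lock, then nx_ch_if_adv_lock, then
 * ch_lock). Because the routine is entered with only ch_lock held, it
 * must drop it before taking the two higher-order locks, then recheck
 * CHANF_CLOSING once ch_lock is back, since the channel may have begun
 * closing during the window. It also returns with ch_lock reacquired,
 * preserving the caller's locking expectation from ch_set_opt().
 */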
2366
2367 static int
2368 ch_set_lowat_thresh(struct kern_channel *ch, enum txrx t,
2369 struct sockopt *sopt)
2370 {
2371 struct ch_ev_thresh cet, *ocet;
2372 int err = 0;
2373
2374 LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
2375
2376 if (sopt->sopt_val == USER_ADDR_NULL) {
2377 return EINVAL;
2378 }
2379
2380 bzero(&cet, sizeof(cet));
2381 err = sooptcopyin(sopt, &cet, sizeof(cet), sizeof(cet));
2382 if (err == 0) {
2383 err = ch_ev_thresh_validate(ch->ch_nexus, t, &cet);
2384 if (err == 0) {
2385 if (t == NR_TX) {
2386 ocet = &ch->ch_info->cinfo_tx_lowat;
2387 } else {
2388 ocet = &ch->ch_info->cinfo_rx_lowat;
2389 }
2390
2391 /* if there is no change, we're done */
2392 if (ocet->cet_unit == cet.cet_unit &&
2393 ocet->cet_value == cet.cet_value) {
2394 return 0;
2395 }
2396
2397 *ocet = cet;
2398
2399 for_rx_tx(t) {
2400 ring_id_t qfirst = ch->ch_first[t];
2401 ring_id_t qlast = ch->ch_last[t];
2402 uint32_t i;
2403
2404 for (i = qfirst; i < qlast; i++) {
2405 struct __kern_channel_ring *kring =
2406 &NAKR(ch->ch_na, t)[i];
2407
2408 (void) kring->ckr_na_notify(kring,
2409 sopt->sopt_p, 0);
2410 }
2411 }
2412
2413 (void) sooptcopyout(sopt, &cet, sizeof(cet));
2414 }
2415 }
2416
2417 return err;
2418 }
2419
2420 static int
2421 ch_get_lowat_thresh(struct kern_channel *ch, enum txrx t,
2422 struct sockopt *sopt)
2423 {
2424 struct ch_ev_thresh cet;
2425
2426 LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
2427
2428 if (sopt->sopt_val == USER_ADDR_NULL) {
2429 return EINVAL;
2430 }
2431
2432 if (t == NR_TX) {
2433 cet = ch->ch_info->cinfo_tx_lowat;
2434 } else {
2435 cet = ch->ch_info->cinfo_rx_lowat;
2436 }
2437
2438 return sooptcopyout(sopt, &cet, sizeof(cet));
2439 }
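
/*
 * Round-trip sketch for the threshold sockopts (illustrative; assumes
 * a client driving CHOPT_TX_LOWAT_THRESH through the channel sockopt
 * path that lands in ch_set_opt()/ch_get_opt()):
 *
 *	struct ch_ev_thresh cet = {
 *		.cet_unit  = CHANNEL_THRESHOLD_UNIT_SLOTS,
 *		.cet_value = 100000,	// too large; clamped on the way in
 *	};
 *
 * On set: the value is validated by ch_ev_thresh_validate(), clamped
 * to at most nxp_tx_slots - 1, stored in cinfo_tx_lowat, and copied
 * back out. On get: the stored, effective value is returned.
 */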
2440
2441 static struct kern_channel *
2442 ch_alloc(zalloc_flags_t how)
2443 {
2444 struct kern_channel *ch;
2445
2446 ch = zalloc_flags(ch_zone, how | Z_ZERO);
2447 if (ch) {
2448 lck_mtx_init(&ch->ch_lock, &channel_lock_group, &channel_lock_attr);
2449 ch->ch_info = zalloc_flags(ch_info_zone, how | Z_ZERO);
2450 }
2451 return ch;
2452 }
2453
2454 static void
2455 ch_free(struct kern_channel *ch)
2456 {
2457 ASSERT(ch->ch_refcnt == 0);
2458 ASSERT(ch->ch_pp == NULL);
2459 ASSERT(!(ch->ch_flags & (CHANF_ATTACHED | CHANF_EXT_CONNECTED |
2460 CHANF_EXT_PRECONNECT | CHANF_IF_ADV)));
2461 lck_mtx_destroy(&ch->ch_lock, &channel_lock_group);
2462 SK_DF(SK_VERB_MEM, "ch 0x%llx FREE", SK_KVA(ch));
2463 ASSERT(ch->ch_info != NULL);
2464 zfree(ch_info_zone, ch->ch_info);
2465 ch->ch_info = NULL;
2466 zfree(ch_zone, ch);
2467 }
2468
2469 void
2470 ch_retain_locked(struct kern_channel *ch)
2471 {
2472 SK_LOCK_ASSERT_HELD();
2473
2474 ch->ch_refcnt++;
2475 VERIFY(ch->ch_refcnt != 0);
2476 }
2477
2478 void
2479 ch_retain(struct kern_channel *ch)
2480 {
2481 SK_LOCK();
2482 ch_retain_locked(ch);
2483 SK_UNLOCK();
2484 }
2485
2486 int
2487 ch_release_locked(struct kern_channel *ch)
2488 {
2489 int oldref = ch->ch_refcnt;
2490
2491 SK_LOCK_ASSERT_HELD();
2492
2493 VERIFY(ch->ch_refcnt != 0);
2494 if (--ch->ch_refcnt == 0) {
2495 ch_free(ch);
2496 }
2497
2498 return oldref == 1;
2499 }
2500
2501 int
2502 ch_release(struct kern_channel *ch)
2503 {
2504 int lastref;
2505
2506 SK_LOCK();
2507 lastref = ch_release_locked(ch);
2508 SK_UNLOCK();
2509
2510 return lastref;
2511 }
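
/*
 * Reference-counting contract (sketch): every ch_retain{,_locked}()
 * must be balanced by exactly one ch_release{,_locked}(). The final
 * release frees the channel via ch_free(), and the return value
 * reports that last-reference case:
 *
 *	ch_retain(ch);		// e.g. +1 while ch sits on a list
 *	...
 *	if (ch_release(ch)) {
 *		// last reference dropped; ch has been freed
 *	}
 */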
2512
2513 void
2514 ch_dtor(void *arg)
2515 {
2516 struct kern_channel *ch = arg;
2517
2518 SK_LOCK();
2519 ch_close(ch, TRUE);
2520 (void) ch_release_locked(ch);
2521 SK_UNLOCK();
2522 }
2523