1 /*
2 * Copyright (c) 2015-2021 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 /*
30 * Copyright (C) 2012-2014 Matteo Landi, Luigi Rizzo, Giuseppe Lettieri.
31 * All rights reserved.
32 * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
33 *
34 * Redistribution and use in source and binary forms, with or without
35 * modification, are permitted provided that the following conditions
36 * are met:
37 * 1. Redistributions of source code must retain the above copyright
38 * notice, this list of conditions and the following disclaimer.
39 * 2. Redistributions in binary form must reproduce the above copyright
40 * notice, this list of conditions and the following disclaimer in the
41 * documentation and/or other materials provided with the distribution.
42 *
43 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
44 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
45 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
46 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
47 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
48 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
49 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
50 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
51 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
52 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
53 * SUCH DAMAGE.
54 */
55
56 #include <sys/eventvar.h>
57 #include <sys/kdebug.h>
58 #include <sys/sdt.h>
59 #include <skywalk/os_skywalk_private.h>
60 #include <skywalk/nexus/netif/nx_netif.h>
61
62 #define KEV_EVTID(code) BSDDBG_CODE(DBG_BSD_KEVENT, (code))
63
64 struct ch_event_result {
65 uint32_t tx_data;
66 uint32_t rx_data;
67 };
68
69 static LCK_GRP_DECLARE(channel_lock_group, "sk_ch_lock");
70 static LCK_GRP_DECLARE(channel_kn_lock_group, "sk_ch_kn_lock");
71 LCK_ATTR_DECLARE(channel_lock_attr, 0, 0);
72
73 static void csi_selrecord(struct ch_selinfo *, struct proc *, void *);
74 static void csi_selwakeup(struct ch_selinfo *, boolean_t, boolean_t, uint32_t);
75 static inline void csi_selwakeup_delayed(struct ch_selinfo *);
76 static inline void csi_selwakeup_common(struct ch_selinfo *, boolean_t,
77 boolean_t, boolean_t, uint32_t);
78 static boolean_t csi_tcall_start(struct ch_selinfo *);
79 static void csi_tcall(thread_call_param_t, thread_call_param_t);
80 static uint64_t csi_tcall_update_interval(struct ch_selinfo *);
81
82 static void ch_redzone_init(void);
83 static void ch_close_common(struct kern_channel *, boolean_t, boolean_t);
84 static struct kern_channel *ch_find(struct kern_nexus *, nexus_port_t,
85 ring_id_t);
86 static int ch_ev_thresh_validate(struct kern_nexus *, enum txrx,
87 struct ch_ev_thresh *);
88 static struct kern_channel *ch_connect(struct kern_nexus *, struct chreq *,
89 struct kern_channel *, struct nxbind *, struct proc *, int, int *);
90 static void ch_disconnect(struct kern_channel *);
91 static int ch_set_lowat_thresh(struct kern_channel *, enum txrx,
92 struct sockopt *);
93 static int ch_get_lowat_thresh(struct kern_channel *, enum txrx,
94 struct sockopt *);
95 static struct kern_channel *ch_alloc(zalloc_flags_t);
96 static void ch_free(struct kern_channel *);
97 static int ch_configure_interface_advisory_event(struct kern_channel *ch,
98 struct sockopt *sopt);
99
100 static int filt_chrwattach(struct knote *, struct kevent_qos_s *kev);
101 static void filt_chrwdetach(struct knote *, boolean_t);
102 static void filt_chrdetach(struct knote *);
103 static void filt_chwdetach(struct knote *);
104 static int filt_chrw(struct knote *, long, int);
105 static int filt_chread(struct knote *, long);
106 static int filt_chwrite(struct knote *, long);
107
108 static int filt_chtouch(struct knote *, struct kevent_qos_s *, int);
109 static int filt_chrtouch(struct knote *, struct kevent_qos_s *);
110 static int filt_chwtouch(struct knote *, struct kevent_qos_s *);
111 static int filt_chprocess(struct knote *, struct kevent_qos_s *, int);
112 static int filt_chrprocess(struct knote *, struct kevent_qos_s *);
113 static int filt_chwprocess(struct knote *, struct kevent_qos_s *);
114 static int filt_che_attach(struct knote *, struct kevent_qos_s *kev);
115 static void filt_che_detach(struct knote *);
116 static int filt_che_event(struct knote *, long);
117 static int filt_che_touch(struct knote *, struct kevent_qos_s *);
118 static int filt_che_process(struct knote *, struct kevent_qos_s *);
119 static int filt_chan_extended_common(struct knote *, long);
120
121 static int ch_event(struct kern_channel *ch, int events,
122 void *wql, struct proc *p, struct ch_event_result *,
123 const boolean_t is_kevent, int *errno, const boolean_t);
124
125 const struct filterops skywalk_channel_rfiltops = {
126 .f_isfd = 1,
127 .f_attach = filt_chrwattach,
128 .f_detach = filt_chrdetach,
129 .f_event = filt_chread,
130 .f_touch = filt_chrtouch,
131 .f_process = filt_chrprocess,
132 };
133
134 const struct filterops skywalk_channel_wfiltops = {
135 .f_isfd = 1,
136 .f_attach = filt_chrwattach,
137 .f_detach = filt_chwdetach,
138 .f_event = filt_chwrite,
139 .f_touch = filt_chwtouch,
140 .f_process = filt_chwprocess,
141 };
142
143 const struct filterops skywalk_channel_efiltops = {
144 .f_isfd = 1,
145 .f_attach = filt_che_attach,
146 .f_detach = filt_che_detach,
147 .f_event = filt_che_event,
148 .f_touch = filt_che_touch,
149 .f_process = filt_che_process,
150 };
151
152 /* mitigation intervals in ns */
153 #define CH_MIT_IVAL_MIN NSEC_PER_USEC
154
155 static uint64_t ch_mit_ival = CH_MIT_IVAL_DEFAULT;
156
157 #if (DEVELOPMENT || DEBUG)
158 SYSCTL_NODE(_kern_skywalk, OID_AUTO, channel,
159 CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Skywalk channel parameters");
160 SYSCTL_QUAD(_kern_skywalk_channel, OID_AUTO, mit_ival,
161 CTLFLAG_RW | CTLFLAG_LOCKED, &ch_mit_ival, "");
162 #endif /* !DEVELOPMENT && !DEBUG */
163
164 static ZONE_DECLARE(ch_zone, SKMEM_ZONE_PREFIX ".ch",
165 sizeof(struct kern_channel), ZC_ZFREE_CLEARMEM);
166
167 static ZONE_DECLARE(ch_info_zone, SKMEM_ZONE_PREFIX ".ch.info",
168 sizeof(struct ch_info), ZC_ZFREE_CLEARMEM);
169
170 static int __ch_inited = 0;
171
172 /*
173 * Global cookies to hold the random numbers used for verifying
174 * user metadata red zone violations.
175 */
176 uint64_t __ch_umd_redzone_cookie = 0;
177
178 #define SKMEM_TAG_CH_KEY "com.apple.skywalk.channel.key"
179 kern_allocation_name_t skmem_tag_ch_key;
180
181 static void
182 ch_redzone_init(void)
183 {
184 _CASSERT(sizeof(__ch_umd_redzone_cookie) ==
185 sizeof(((struct __metadata_preamble *)0)->mdp_redzone));
186 _CASSERT(METADATA_PREAMBLE_SZ == sizeof(struct __metadata_preamble));
187 _CASSERT(sizeof(struct __slot_desc) == 8);
188
189 /* Initialize random user red zone cookie values */
190 do {
191 read_random(&__ch_umd_redzone_cookie,
192 sizeof(__ch_umd_redzone_cookie));
193 } while (__ch_umd_redzone_cookie == 0);
194
195 SK_D("__ch_umd_redzone_cookie: 0x%llx", __ch_umd_redzone_cookie);
196 }
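
/*
 * Illustrative sketch (not part of the original source): the cookie set up
 * above is sized to match the mdp_redzone field of each user metadata
 * preamble, so a consumer could compare them roughly as follows.  The
 * "example_" name is an assumption for illustration only; the block is
 * kept under #if 0 so it has no effect on the build.
 */
#if 0
static boolean_t
example_ch_redzone_ok(const struct __metadata_preamble *mdp)
{
	/* a zero cookie means ch_redzone_init() has not run yet */
	return (__ch_umd_redzone_cookie != 0) &&
	    (mdp->mdp_redzone == __ch_umd_redzone_cookie);
}
#endif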
197
198 int
199 channel_init(void)
200 {
201 int error = 0;
202
203 SK_LOCK_ASSERT_HELD();
204 ASSERT(!__ch_inited);
205
206 _CASSERT(offsetof(struct __user_packet, pkt_qum) == 0);
207 _CASSERT(offsetof(struct __kern_packet, pkt_qum) == 0);
208
209 ch_redzone_init();
210
211 ASSERT(skmem_tag_ch_key == NULL);
212 skmem_tag_ch_key = kern_allocation_name_allocate(SKMEM_TAG_CH_KEY, 0);
213 ASSERT(skmem_tag_ch_key != NULL);
214
215 __ch_inited = 1;
216
217 return error;
218 }
219
220 void
221 channel_fini(void)
222 {
223 SK_LOCK_ASSERT_HELD();
224
225 if (__ch_inited) {
226 if (skmem_tag_ch_key != NULL) {
227 kern_allocation_name_release(skmem_tag_ch_key);
228 skmem_tag_ch_key = NULL;
229 }
230
231 __ch_umd_redzone_cookie = 0;
232 __ch_inited = 0;
233 }
234 }
235
236 void
237 csi_init(struct ch_selinfo *csi, boolean_t mitigation, uint64_t mit_ival)
238 {
239 csi->csi_flags = 0;
240 csi->csi_pending = 0;
241 if (mitigation) {
242 csi->csi_interval = mit_ival;
243 csi->csi_eff_interval = ch_mit_ival; /* global override */
244 atomic_bitset_32(&csi->csi_flags, CSI_MITIGATION);
245 csi->csi_tcall = thread_call_allocate_with_options(csi_tcall,
246 csi, THREAD_CALL_PRIORITY_KERNEL, THREAD_CALL_OPTIONS_ONCE);
247 /* this must not fail */
248 VERIFY(csi->csi_tcall != NULL);
249 } else {
250 csi->csi_interval = 0;
251 csi->csi_eff_interval = 0;
252 csi->csi_tcall = NULL;
253 }
254 lck_mtx_init(&csi->csi_lock, &channel_kn_lock_group, &channel_lock_attr);
255 klist_init(&csi->csi_si.si_note);
256 }
257
258 void
259 csi_destroy(struct ch_selinfo *csi)
260 {
261 /* check if not already destroyed, else do it now */
262 if ((atomic_bitset_32_ov(&csi->csi_flags, CSI_DESTROYED) &
263 CSI_DESTROYED) == 0) {
264 CSI_LOCK(csi);
265 /* must have been set by above atomic op */
266 VERIFY(csi->csi_flags & CSI_DESTROYED);
267 if (csi->csi_flags & CSI_MITIGATION) {
268 thread_call_t tcall = csi->csi_tcall;
269 VERIFY(tcall != NULL);
270 CSI_UNLOCK(csi);
271
272 (void) thread_call_cancel_wait(tcall);
273 if (!thread_call_free(tcall)) {
274 boolean_t freed;
275 (void) thread_call_cancel_wait(tcall);
276 freed = thread_call_free(tcall);
277 VERIFY(freed);
278 }
279
280 CSI_LOCK(csi);
281 csi->csi_tcall = NULL;
282 atomic_bitclear_32(&csi->csi_flags, CSI_MITIGATION);
283 }
284 csi->csi_pending = 0;
285 CSI_UNLOCK(csi);
286
287 selthreadclear(&csi->csi_si);
288 /* now we don't need the mutex anymore */
289 lck_mtx_destroy(&csi->csi_lock, &channel_kn_lock_group);
290 }
291 }
292
293 /*
294 * Called only for select(2).
295 */
296 __attribute__((always_inline))
297 static inline void
298 csi_selrecord(struct ch_selinfo *csi, struct proc *p, void *wql)
299 {
300 struct selinfo *si = &csi->csi_si;
301
302 CSI_LOCK_ASSERT_HELD(csi);
303 selrecord(p, si, wql);
304 }
305
306 void
307 csi_selrecord_one(struct __kern_channel_ring *kring, struct proc *p, void *wql)
308 {
309 struct ch_selinfo *csi = &kring->ckr_si;
310
311 CSI_LOCK(csi);
312 SK_DF(SK_VERB_EVENTS, "[%s] na \"%s\" (0x%llx) kr %s (0x%llx) "
313 "si 0x%llx si_flags 0x%x", (kring->ckr_tx == NR_TX) ? "W" : "R",
314 KRNA(kring)->na_name, SK_KVA(KRNA(kring)), kring->ckr_name,
315 SK_KVA(kring), SK_KVA(&csi->csi_si), csi->csi_si.si_flags);
316
317 csi_selrecord(csi, p, wql);
318 CSI_UNLOCK(csi);
319 }
320
321 void
322 csi_selrecord_all(struct nexus_adapter *na, enum txrx t, struct proc *p,
323 void *wql)
324 {
325 struct ch_selinfo *csi = &na->na_si[t];
326
327 CSI_LOCK(csi);
328 SK_DF(SK_VERB_EVENTS, "[%s] na \"%s\" (0x%llx) si 0x%llx si_flags 0x%x",
329 (t == NR_TX) ? "W" : "R", na->na_name, SK_KVA(na),
330 SK_KVA(&csi->csi_si), csi->csi_si.si_flags);
331
332 csi_selrecord(csi, p, wql);
333 CSI_UNLOCK(csi);
334 }
335
336 /*
337 * Called from na_post_event().
338 */
339 __attribute__((always_inline))
340 static inline void
341 csi_selwakeup(struct ch_selinfo *csi, boolean_t within_kevent,
342 boolean_t selwake, uint32_t hint)
343 {
344 struct selinfo *si = &csi->csi_si;
345
346 CSI_LOCK_ASSERT_HELD(csi);
347 csi->csi_pending = 0;
348 if (selwake) {
349 selwakeup(si);
350 }
351 if ((csi->csi_flags & CSI_KNOTE) && !within_kevent) {
352 KNOTE(&si->si_note, hint);
353 }
354 }
355
356 __attribute__((always_inline))
357 static inline void
358 csi_selwakeup_delayed(struct ch_selinfo *csi)
359 {
360 CSI_LOCK_ASSERT_HELD(csi);
361 ASSERT(csi->csi_flags & CSI_MITIGATION);
362 ASSERT(csi->csi_tcall != NULL);
363
364 if (thread_call_isactive(csi->csi_tcall)) {
365 csi->csi_pending++;
366 } else if (!csi_tcall_start(csi)) {
367 csi_selwakeup(csi, FALSE, FALSE, 0);
368 }
369 }
370
371 __attribute__((always_inline))
372 static inline void
373 csi_selwakeup_common(struct ch_selinfo *csi, boolean_t nodelay,
374 boolean_t within_kevent, boolean_t selwake, uint32_t hint)
375 {
376 CSI_LOCK_ASSERT_HELD(csi);
377
378 if (nodelay || within_kevent || !selwake || hint != 0 ||
379 !(csi->csi_flags & CSI_MITIGATION)) {
380 csi_selwakeup(csi, within_kevent, selwake, hint);
381 } else {
382 csi_selwakeup_delayed(csi);
383 }
384 }
385
386 void
387 csi_selwakeup_one(struct __kern_channel_ring *kring, boolean_t nodelay,
388 boolean_t within_kevent, boolean_t selwake, uint32_t hint)
389 {
390 struct ch_selinfo *csi = &kring->ckr_si;
391
392 CSI_LOCK(csi);
393 SK_DF(SK_VERB_EVENTS, "[%s] na \"%s\" (0x%llx) kr %s (0x%llx) "
394 "si 0x%llx si_flags 0x%x nodelay %u kev %u sel %u hint 0x%b",
395 (kring->ckr_tx == NR_TX) ? "W" : "R", KRNA(kring)->na_name,
396 SK_KVA(KRNA(kring)), kring->ckr_name, SK_KVA(kring),
397 SK_KVA(&csi->csi_si), csi->csi_si.si_flags, nodelay,
398 within_kevent, selwake, hint, CHAN_FILT_HINT_BITS);
399
400 csi_selwakeup_common(csi, nodelay, within_kevent, selwake, hint);
401 CSI_UNLOCK(csi);
402 }
403
404 void
405 csi_selwakeup_all(struct nexus_adapter *na, enum txrx t, boolean_t nodelay,
406 boolean_t within_kevent, boolean_t selwake, uint32_t hint)
407 {
408 struct ch_selinfo *csi = &na->na_si[t];
409
410 CSI_LOCK(csi);
411 SK_DF(SK_VERB_EVENTS, "[%s] na \"%s\" (0x%llx) si 0x%llx "
412 "si_flags 0x%x nodelay %u kev %u sel %u hint 0x%b",
413 (t == NR_TX) ? "W" : "R", na->na_name, SK_KVA(na),
414 SK_KVA(&csi->csi_si), csi->csi_si.si_flags, nodelay,
415 within_kevent, selwake, hint, CHAN_FILT_HINT_BITS);
416
417 switch (t) {
418 case NR_RX:
419 if (!(na->na_flags & NAF_RX_MITIGATION)) {
420 nodelay = TRUE;
421 }
422 break;
423
424 case NR_TX:
425 if (!(na->na_flags & NAF_TX_MITIGATION)) {
426 nodelay = TRUE;
427 }
428 break;
429
430 default:
431 nodelay = TRUE;
432 break;
433 }
434 csi_selwakeup_common(csi, nodelay, within_kevent, selwake, hint);
435 CSI_UNLOCK(csi);
436 }
437
438 static boolean_t
439 csi_tcall_start(struct ch_selinfo *csi)
440 {
441 uint64_t now, ival, deadline;
442
443 CSI_LOCK_ASSERT_HELD(csi);
444 ASSERT(csi->csi_flags & CSI_MITIGATION);
445 ASSERT(csi->csi_tcall != NULL);
446
447 /* pick up latest value */
448 ival = csi_tcall_update_interval(csi);
449
450 /* if no mitigation, pass notification up now */
451 if (__improbable(ival == 0)) {
452 return FALSE;
453 }
454
455 deadline = now = mach_absolute_time();
456 clock_deadline_for_periodic_event(ival, now, &deadline);
457 (void) thread_call_enter_delayed(csi->csi_tcall, deadline);
458
459 return TRUE;
460 }
461
462 static void
463 csi_tcall(thread_call_param_t arg0, thread_call_param_t arg1)
464 {
465 #pragma unused(arg1)
466 struct ch_selinfo *csi = arg0;
467
468 CSI_LOCK(csi);
469 csi_selwakeup(csi, FALSE, FALSE, 0);
470 CSI_UNLOCK(csi);
471
472 CSI_LOCK(csi);
473 if (__improbable((csi->csi_flags & CSI_DESTROYED) == 0 &&
474 csi->csi_pending != 0 && !csi_tcall_start(csi))) {
475 csi_selwakeup(csi, FALSE, FALSE, 0);
476 }
477 CSI_UNLOCK(csi);
478 }
479
480 __attribute__((always_inline))
481 static inline uint64_t
482 csi_tcall_update_interval(struct ch_selinfo *csi)
483 {
484 uint64_t i = ch_mit_ival;
485
486 /* if global override was adjusted, update local copies */
487 if (__improbable(csi->csi_eff_interval != i)) {
488 ASSERT(csi->csi_flags & CSI_MITIGATION);
489 csi->csi_interval = csi->csi_eff_interval =
490 ((i == 0) ? 0 : MAX(i, CH_MIT_IVAL_MIN));
491 }
492
493 return csi->csi_interval;
494 }
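
/*
 * Illustrative sketch (assumption, not original code): the effect of the
 * ch_mit_ival override as applied above, written as pure arithmetic.  A
 * value of 0 disables mitigation entirely; any non-zero value is clamped
 * to at least CH_MIT_IVAL_MIN (1 usec).  Kept under #if 0 so it has no
 * effect on the build.
 */
#if 0
static uint64_t
example_effective_mit_interval(uint64_t requested_ns)
{
	if (requested_ns == 0) {
		return 0;       /* no mitigation; wake up immediately */
	}
	return MAX(requested_ns, CH_MIT_IVAL_MIN);
}
#endif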
495
496 /* return EV_EOF if the channel is defunct */
497 static inline boolean_t
498 ch_filt_check_defunct(struct kern_channel *ch, struct knote *kn)
499 {
500 if (__improbable((ch->ch_flags & CHANF_DEFUNCT) != 0)) {
501 if (kn) {
502 kn->kn_flags |= EV_EOF;
503 }
504 return TRUE;
505 }
506 return FALSE;
507 }
508
509 static void
510 filt_chrwdetach(struct knote *kn, boolean_t write)
511 {
512 struct kern_channel *ch = (struct kern_channel *)kn->kn_hook;
513 struct ch_selinfo *csi;
514 struct selinfo *si;
515
516 lck_mtx_lock(&ch->ch_lock);
517 csi = ch->ch_si[write ? NR_TX : NR_RX];
518 si = &csi->csi_si;
519
520 CSI_LOCK(csi);
521 SK_DF(SK_VERB_EVENTS, "na \"%s\" (0x%llx) ch 0x%llx kn 0x%llx (%s%s) "
522 "si_flags 0x%x", ch->ch_na->na_name, SK_KVA(ch->ch_na),
523 SK_KVA(ch), SK_KVA(kn), (kn->kn_flags & EV_POLL) ? "poll," : "",
524 write ? "write" : "read", si->si_flags);
525
526 if (KNOTE_DETACH(&si->si_note, kn)) {
527 atomic_bitclear_32(&csi->csi_flags, CSI_KNOTE);
528 }
529
530 CSI_UNLOCK(csi);
531 lck_mtx_unlock(&ch->ch_lock);
532 }
533
534 static void
535 filt_chrdetach(struct knote *kn)
536 {
537 ASSERT(kn->kn_filter == EVFILT_READ);
538 filt_chrwdetach(kn, FALSE);
539 }
540
541 static void
542 filt_chwdetach(struct knote *kn)
543 {
544 ASSERT(kn->kn_filter == EVFILT_WRITE);
545 filt_chrwdetach(kn, TRUE);
546 }
547
548 /*
549 * Callback from notifications (generated externally).
550 * This always marks the knote as activated, so always
551 * return 1.
552 */
553 static int
554 filt_chrw(struct knote *kn, long hint, int events)
555 {
556 #if SK_LOG
557 struct kern_channel *ch = kn->kn_hook;
558 #else
559 #pragma unused(kn)
560 #pragma unused(hint)
561 #pragma unused(events)
562 #endif
563 SK_DF(SK_VERB_EVENTS, "na \"%s\" (0x%llx) ch 0x%llx "
564 "kn 0x%llx (%s%s) hint 0x%x", ch->ch_na->na_name,
565 SK_KVA(ch->ch_na), SK_KVA(ch), SK_KVA(kn),
566 (kn->kn_flags & EV_POLL) ? "poll," : "",
567 (events == POLLOUT) ? "write" : "read",
568 (uint32_t)hint);
569
570 /* assume we are ready */
571 return 1;
572 }
573
574 static int
575 filt_chread(struct knote *kn, long hint)
576 {
577 ASSERT(kn->kn_filter == EVFILT_READ);
578 /* There is no hint for read/write event */
579 if (hint != 0) {
580 return 0;
581 }
582 return filt_chrw(kn, hint, POLLIN);
583 }
584
585 static int
586 filt_chwrite(struct knote *kn, long hint)
587 {
588 ASSERT(kn->kn_filter == EVFILT_WRITE);
589 /* There is no hint for read/write event */
590 if (hint != 0) {
591 return 0;
592 }
593 return filt_chrw(kn, hint, POLLOUT);
594 }
595
596 static int
597 filt_chtouch(struct knote *kn, struct kevent_qos_s *kev, int events)
598 {
599 #pragma unused(kev)
600 struct kern_channel *ch = kn->kn_hook;
601 int ev = kn->kn_filter;
602 enum txrx dir = (ev == EVFILT_WRITE) ? NR_TX : NR_RX;
603 int event_error = 0;
604 int revents;
605
606 /* save off the new input fflags and data */
607 kn->kn_sfflags = kev->fflags;
608 kn->kn_sdata = kev->data;
609
610 lck_mtx_lock(&ch->ch_lock);
611 if (__improbable(ch_filt_check_defunct(ch, kn))) {
612 lck_mtx_unlock(&ch->ch_lock);
613 return 1;
614 }
615
616 /* if a note-specific low watermark is given, validate it */
617 if (kn->kn_sfflags & NOTE_LOWAT) {
618 struct ch_ev_thresh note_thresh = {
619 .cet_unit = (dir == NR_TX) ?
620 ch->ch_info->cinfo_tx_lowat.cet_unit :
621 ch->ch_info->cinfo_rx_lowat.cet_unit,
622 .cet_value = (uint32_t)kn->kn_sdata
623 };
624 if (ch_ev_thresh_validate(ch->ch_na->na_nx, dir,
625 &note_thresh) != 0) {
626 SK_ERR("invalid NOTE_LOWAT threshold %u",
627 note_thresh.cet_value);
628 knote_set_error(kn, EINVAL);
629 lck_mtx_unlock(&ch->ch_lock);
630 return 1;
631 }
632 }
633
634 /* capture new state just so we can return it */
635 revents = ch_event(ch, events, NULL, knote_get_kq(kn)->kq_p, NULL, TRUE,
636 &event_error, FALSE);
637 lck_mtx_unlock(&ch->ch_lock);
638
639 if (revents & POLLERR) {
640 ASSERT(event_error != 0);
641 /*
642 * Setting a knote error here will confuse libdispatch, so we
643 * use EV_EOF instead.
644 */
645 kn->kn_flags |= EV_EOF;
646 return 1;
647 } else {
648 return (events & revents) != 0;
649 }
650 }
651
652 static int
653 filt_chrtouch(struct knote *kn, struct kevent_qos_s *kev)
654 {
655 ASSERT(kn->kn_filter == EVFILT_READ);
656
657 if (kev->flags & EV_ENABLE) {
658 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KNOTE_ENABLE),
659 kn->kn_udata, kn->kn_status | (kn->kn_id << 32),
660 kn->kn_filtid, VM_KERNEL_UNSLIDE_OR_PERM(
661 ((struct kern_channel *)kn->kn_hook)->ch_na));
662 }
663
664 return filt_chtouch(kn, kev, POLLIN);
665 }
666
667 static int
668 filt_chwtouch(struct knote *kn, struct kevent_qos_s *kev)
669 {
670 ASSERT(kn->kn_filter == EVFILT_WRITE);
671 return filt_chtouch(kn, kev, POLLOUT);
672 }
673
674
675 /*
676 * Called from kevent. We call ch_event(POLL[IN|OUT]) and
677 * return 0/1 accordingly.
678 */
679 static int
680 filt_chprocess(struct knote *kn, struct kevent_qos_s *kev, int events)
681 {
682 struct kern_channel *ch = kn->kn_hook;
683 struct ch_event_result result;
684 uint32_t lowat;
685 int trigger_event = 1;
686 int revents;
687 int event_error;
688 int64_t data;
689
690 lck_mtx_lock(&ch->ch_lock);
691 if (__improbable(ch_filt_check_defunct(ch, kn))) {
692 knote_fill_kevent(kn, kev, 0);
693 lck_mtx_unlock(&ch->ch_lock);
694 return 1;
695 }
696
697 revents = ch_event(ch, events, NULL, knote_get_kq(kn)->kq_p, &result,
698 TRUE, &event_error, FALSE);
699
700 if (revents & POLLERR) {
701 ASSERT(event_error != 0);
702 lck_mtx_unlock(&ch->ch_lock);
703 /*
704 * Setting a knote error here will confuse libdispatch, so we
705 * use EV_EOF instead.
706 */
707 kn->kn_flags |= EV_EOF;
708 knote_fill_kevent_with_sdata(kn, kev);
709 return 1;
710 }
711
712 trigger_event = (events & revents) != 0;
713
714 if (events == POLLOUT) {
715 lowat = ch->ch_info->cinfo_tx_lowat.cet_value;
716 if ((kn->kn_sfflags & NOTE_LOWAT) &&
717 kn->kn_sdata > lowat) {
718 lowat = (uint32_t)kn->kn_sdata;
719 }
720
721 data = result.tx_data;
722
723 if (result.tx_data < lowat) {
724 trigger_event = 0;
725 }
726 } else {
727 lowat = ch->ch_info->cinfo_rx_lowat.cet_value;
728 if ((kn->kn_sfflags & NOTE_LOWAT) &&
729 kn->kn_sdata > lowat) {
730 lowat = (uint32_t)kn->kn_sdata;
731 }
732
733 data = result.rx_data;
734
735 if (result.rx_data < lowat) {
736 trigger_event = 0;
737 }
738 }
739
740 if (trigger_event) {
741 knote_fill_kevent(kn, kev, data);
742 }
743
744 lck_mtx_unlock(&ch->ch_lock);
745
746 return trigger_event;
747 }
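
/*
 * Illustrative sketch (not original code): the low-watermark gate applied
 * by filt_chprocess() above, isolated as a pure predicate.  The effective
 * watermark is the channel's configured lowat, unless the knote supplied a
 * larger NOTE_LOWAT value.  Parameter names are assumptions; kept under
 * #if 0 so it has no effect on the build.
 */
#if 0
static boolean_t
example_lowat_would_trigger(uint32_t ready_data, uint32_t ch_lowat,
    boolean_t note_lowat_set, int64_t note_lowat)
{
	uint32_t lowat = ch_lowat;

	if (note_lowat_set && note_lowat > lowat) {
		lowat = (uint32_t)note_lowat;
	}
	return ready_data >= lowat;
}
#endif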
748
749 static int
750 filt_chrprocess(struct knote *kn, struct kevent_qos_s *kev)
751 {
752 ASSERT(kn->kn_filter == EVFILT_READ);
753 return filt_chprocess(kn, kev, POLLIN);
754 }
755
756 static int
757 filt_chwprocess(struct knote *kn, struct kevent_qos_s *kev)
758 {
759 ASSERT(kn->kn_filter == EVFILT_WRITE);
760 return filt_chprocess(kn, kev, POLLOUT);
761 }
762
763 static int
764 filt_chrwattach(struct knote *kn, __unused struct kevent_qos_s *kev)
765 {
766 struct kern_channel *ch = (struct kern_channel *)kn->kn_hook;
767 struct nexus_adapter *na;
768 struct ch_selinfo *csi;
769 int ev = kn->kn_filter;
770 enum txrx dir = (ev == EVFILT_WRITE) ? NR_TX : NR_RX;
771 int revents;
772 int events;
773 int event_error = 0;
774
775 ASSERT((kn->kn_filter == EVFILT_READ) ||
776 (kn->kn_filter == EVFILT_WRITE));
777
778 /* ch_kqfilter() should have acquired the lock */
779 LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
780
781 na = ch->ch_na;
782 /* if a note-specific low watermark is given, validate it */
783 if (kn->kn_sfflags & NOTE_LOWAT) {
784 struct ch_ev_thresh note_thresh = {
785 .cet_unit = (dir == NR_TX) ?
786 ch->ch_info->cinfo_tx_lowat.cet_unit :
787 ch->ch_info->cinfo_rx_lowat.cet_unit,
788 .cet_value = (uint32_t)kn->kn_sdata
789 };
790 if (ch_ev_thresh_validate(ch->ch_na->na_nx, dir,
791 &note_thresh) != 0) {
792 SK_ERR("invalid NOTE_LOWAT threshold %u",
793 note_thresh.cet_value);
794 knote_set_error(kn, EINVAL);
795 return 0;
796 }
797 }
798
799 /* the si is indicated in the channel */
800 csi = ch->ch_si[dir];
801 CSI_LOCK(csi);
802
803 if (KNOTE_ATTACH(&csi->csi_si.si_note, kn)) {
804 atomic_bitset_32(&csi->csi_flags, CSI_KNOTE);
805 }
806
807 CSI_UNLOCK(csi);
808
809 SK_DF(SK_VERB_EVENTS, "na \"%s\" (0x%llx) ch 0x%llx kn 0x%llx (%s%s)",
810 na->na_name, SK_KVA(na), SK_KVA(ch), SK_KVA(kn),
811 (kn->kn_flags & EV_POLL) ? "poll," : "",
812 (ev == EVFILT_WRITE) ? "write" : "read");
813
814 /* capture current state */
815 events = (ev == EVFILT_WRITE) ? POLLOUT : POLLIN;
816
817 if (__improbable(ch_filt_check_defunct(ch, kn))) {
818 revents = events;
819 } else {
820 /* filt_chprocess() will fill in the kn_sdata field */
821 revents = ch_event(ch, events, NULL, knote_get_kq(kn)->kq_p,
822 NULL, TRUE, &event_error, FALSE);
823 }
824
825 if (revents & POLLERR) {
826 ASSERT(event_error != 0);
827 kn->kn_flags |= EV_EOF;
828 return 1;
829 } else {
830 return (events & revents) != 0;
831 }
832 }
833
834 static int
835 filt_chan_extended_common(struct knote *kn, long ev_hint)
836 {
837 /*
838 * This function is not always called with the same set of locks held,
839 * hence it is only allowed to manipulate kn_fflags, with atomics.
840 *
841 * the f_event / f_process functions may run concurrently.
842 */
843 uint32_t add_fflags = 0;
844
845 if ((ev_hint & CHAN_FILT_HINT_FLOW_ADV_UPD) != 0) {
846 add_fflags |= NOTE_FLOW_ADV_UPDATE;
847 }
848 if ((ev_hint & CHAN_FILT_HINT_CHANNEL_EVENT) != 0) {
849 add_fflags |= NOTE_CHANNEL_EVENT;
850 }
851 if ((ev_hint & CHAN_FILT_HINT_IF_ADV_UPD) != 0) {
852 add_fflags |= NOTE_IF_ADV_UPD;
853 }
854 if (add_fflags) {
855 /* Reset any events that are not requested on this knote */
856 add_fflags &= (kn->kn_sfflags & EVFILT_NW_CHANNEL_ALL_MASK);
857 os_atomic_or(&kn->kn_fflags, add_fflags, relaxed);
858 return add_fflags != 0;
859 }
860 return os_atomic_load(&kn->kn_fflags, relaxed) != 0;
861 }
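
/*
 * Illustrative sketch (not original code): the hint-to-fflags translation
 * performed by filt_chan_extended_common() above, minus the atomics.  Only
 * event types the knote subscribed to via kn_sfflags survive the mask.
 * The "subscribed" parameter name is an assumption; kept under #if 0.
 */
#if 0
static uint32_t
example_pending_event_flags(long ev_hint, uint32_t subscribed)
{
	uint32_t add = 0;

	if (ev_hint & CHAN_FILT_HINT_FLOW_ADV_UPD) {
		add |= NOTE_FLOW_ADV_UPDATE;
	}
	if (ev_hint & CHAN_FILT_HINT_CHANNEL_EVENT) {
		add |= NOTE_CHANNEL_EVENT;
	}
	if (ev_hint & CHAN_FILT_HINT_IF_ADV_UPD) {
		add |= NOTE_IF_ADV_UPD;
	}
	/* drop anything this knote did not ask for */
	return add & (subscribed & EVFILT_NW_CHANNEL_ALL_MASK);
}
#endif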
862
863 static inline void
864 che_process_channel_event(struct kern_channel *ch, struct knote *kn,
865 uint32_t fflags, long *hint)
866 {
867 int revents, event_error = 0;
868
869 LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
870 *hint &= ~CHAN_FILT_HINT_CHANNEL_EVENT;
871
872 if (((ch->ch_flags & CHANF_EVENT_RING) != 0) &&
873 ((fflags & NOTE_CHANNEL_EVENT) != 0)) {
874 /* capture new state to return */
875 revents = ch_event(ch, POLLIN, NULL, knote_get_kq(kn)->kq_p,
876 NULL, TRUE, &event_error, TRUE);
877 if (revents & POLLERR) {
878 ASSERT(event_error != 0);
879 /*
880 * Setting a knote error here will confuse libdispatch,
881 * so we use EV_EOF instead.
882 */
883 kn->kn_flags |= EV_EOF;
884 } else if ((revents & POLLIN) != 0) {
885 *hint |= CHAN_FILT_HINT_CHANNEL_EVENT;
886 }
887 }
888 /*
889 * if the sync operation on the event ring didn't find any events
890 * then indicate that the channel event is not active.
891 */
892 if ((*hint & CHAN_FILT_HINT_CHANNEL_EVENT) == 0) {
893 /*
894 * Avoid a costly atomic when the bit is already cleared.
895 */
896 uint32_t knfflags = os_atomic_load(&kn->kn_fflags, relaxed);
897 if (knfflags & CHAN_FILT_HINT_CHANNEL_EVENT) {
898 os_atomic_andnot(&kn->kn_fflags,
899 CHAN_FILT_HINT_CHANNEL_EVENT, relaxed);
900 }
901 }
902 }
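
/*
 * Illustrative sketch (not original code): the "plain load before atomic
 * clear" pattern used at the end of che_process_channel_event(), which
 * skips the read-modify-write when the bit is already clear.  Kept under
 * #if 0; the function name is an assumption.
 */
#if 0
static void
example_clear_flag_if_set(uint32_t *flags, uint32_t bit)
{
	if (os_atomic_load(flags, relaxed) & bit) {
		os_atomic_andnot(flags, bit, relaxed);
	}
}
#endif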
903
904 static int
905 filt_che_attach(struct knote *kn, __unused struct kevent_qos_s *kev)
906 {
907 struct kern_channel *ch = (struct kern_channel *)kn->kn_hook;
908 struct ch_selinfo *csi;
909 long hint = 0;
910
911 _CASSERT(CHAN_FILT_HINT_FLOW_ADV_UPD == NOTE_FLOW_ADV_UPDATE);
912 _CASSERT(CHAN_FILT_HINT_CHANNEL_EVENT == NOTE_CHANNEL_EVENT);
913 _CASSERT(CHAN_FILT_HINT_IF_ADV_UPD == NOTE_IF_ADV_UPD);
914
915 ASSERT(kn->kn_filter == EVFILT_NW_CHANNEL);
916
917 /* ch_kqfilter() should have acquired the lock */
918 LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
919
920 csi = ch->ch_si[NR_TX];
921 CSI_LOCK(csi);
922 if (KNOTE_ATTACH(&csi->csi_si.si_note, kn)) {
923 atomic_bitset_32(&csi->csi_flags, CSI_KNOTE);
924 }
925 CSI_UNLOCK(csi);
926
927 if (__improbable(ch_filt_check_defunct(ch, kn))) {
928 return 1;
929 }
930 if ((kn->kn_sfflags & NOTE_CHANNEL_EVENT) != 0) {
931 atomic_bitset_32(&ch->ch_na->na_flags,
932 NAF_CHANNEL_EVENT_ATTACHED);
933 }
934 che_process_channel_event(ch, kn, kn->kn_sfflags, &hint);
935 if ((kn->kn_sfflags & NOTE_FLOW_ADV_UPDATE) != 0) {
936 /* on registration force an event */
937 hint |= CHAN_FILT_HINT_FLOW_ADV_UPD;
938 }
939 SK_DF(SK_VERB_EVENTS, "na \"%s\" (0x%llx) ch 0x%llx kn 0x%llx (%s)",
940 ch->ch_na->na_name, SK_KVA(ch->ch_na), SK_KVA(ch), SK_KVA(kn),
941 "EVFILT_NW_CHANNEL");
942 return filt_chan_extended_common(kn, hint);
943 }
944
945 static void
946 filt_che_detach(struct knote *kn)
947 {
948 struct kern_channel *ch = (struct kern_channel *)kn->kn_hook;
949 struct ch_selinfo *csi;
950
951 ASSERT(kn->kn_filter == EVFILT_NW_CHANNEL);
952
953 lck_mtx_lock(&ch->ch_lock);
954 if ((kn->kn_sfflags & NOTE_CHANNEL_EVENT) != 0) {
955 atomic_bitclear_32(&ch->ch_na->na_flags,
956 NAF_CHANNEL_EVENT_ATTACHED);
957 }
958 csi = ch->ch_si[NR_TX];
959 CSI_LOCK(csi);
960 if (KNOTE_DETACH(&csi->csi_si.si_note, kn)) {
961 atomic_bitclear_32(&csi->csi_flags, CSI_KNOTE);
962 }
963 CSI_UNLOCK(csi);
964 lck_mtx_unlock(&ch->ch_lock);
965
966 SK_DF(SK_VERB_EVENTS, "na \"%s\" (0x%llx) ch 0x%llx kn 0x%llx (%s)",
967 ch->ch_na->na_name, SK_KVA(ch->ch_na), SK_KVA(ch), SK_KVA(kn),
968 "EVFILT_NW_CHANNEL");
969 }
970
971 static int
972 filt_che_event(struct knote *kn, long hint)
973 {
974 struct kern_channel *ch = (struct kern_channel *)kn->kn_hook;
975
976 ASSERT(kn->kn_filter == EVFILT_NW_CHANNEL);
977 if (hint == 0) {
978 return 0;
979 }
980 if (__improbable(ch_filt_check_defunct(ch, NULL))) {
981 return 1;
982 }
983 if ((hint & CHAN_FILT_HINT_CHANNEL_EVENT) != 0) {
984 VERIFY((ch->ch_flags & CHANF_EVENT_RING) != 0);
985 }
986 SK_DF(SK_VERB_EVENTS, "na \"%s\" (0x%llx) ch 0x%llx hint 0x%b)",
987 ch->ch_na->na_name, SK_KVA(ch->ch_na), SK_KVA(ch), hint,
988 CHAN_FILT_HINT_BITS);
989 return filt_chan_extended_common(kn, hint);
990 }
991
992 static int
993 filt_che_touch(struct knote *kn, struct kevent_qos_s *kev)
994 {
995 int ret;
996 long hint = 0;
997 struct kern_channel *ch = (struct kern_channel *)kn->kn_hook;
998
999 ASSERT(kn->kn_filter == EVFILT_NW_CHANNEL);
1000 /* save off the new input fflags and data */
1001 kn->kn_sfflags = kev->fflags;
1002 kn->kn_sdata = kev->data;
1003
1004 lck_mtx_lock(&ch->ch_lock);
1005 if (__improbable(ch_filt_check_defunct(ch, kn))) {
1006 ret = 1;
1007 goto done;
1008 }
1009 if ((kn->kn_sfflags & NOTE_CHANNEL_EVENT) != 0) {
1010 if (kev->flags & EV_ENABLE) {
1011 atomic_bitset_32(&ch->ch_na->na_flags,
1012 NAF_CHANNEL_EVENT_ATTACHED);
1013 } else if (kev->flags & EV_DISABLE) {
1014 atomic_bitclear_32(&ch->ch_na->na_flags,
1015 NAF_CHANNEL_EVENT_ATTACHED);
1016 }
1017 }
1018 che_process_channel_event(ch, kn, kn->kn_sfflags, &hint);
1019 ret = filt_chan_extended_common(kn, hint);
1020 done:
1021 lck_mtx_unlock(&ch->ch_lock);
1022 return ret;
1023 }
1024
1025 static int
1026 filt_che_process(struct knote *kn, struct kevent_qos_s *kev)
1027 {
1028 int ret;
1029 long hint = 0;
1030 struct kern_channel *ch = kn->kn_hook;
1031
1032 ASSERT(kn->kn_filter == EVFILT_NW_CHANNEL);
1033 lck_mtx_lock(&ch->ch_lock);
1034 if (__improbable(ch_filt_check_defunct(ch, kn))) {
1035 ret = 1;
1036 goto done;
1037 }
1038 che_process_channel_event(ch, kn, kn->kn_sfflags, &hint);
1039 ret = filt_chan_extended_common(kn, hint);
1040 done:
1041 lck_mtx_unlock(&ch->ch_lock);
1042 if (ret != 0) {
1043 /*
1044 * This filter historically behaves like EV_CLEAR,
1045 * even when EV_CLEAR wasn't set.
1046 */
1047 knote_fill_kevent(kn, kev, 0);
1048 kn->kn_fflags = 0;
1049 }
1050 return ret;
1051 }
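
/*
 * Illustrative sketch (not original code): the consume-on-read delivery
 * used by filt_che_process() above, where accumulated fflags are handed to
 * the caller and then reset, i.e. EV_CLEAR behavior whether or not the
 * caller asked for it.  Kept under #if 0; the name is an assumption.
 */
#if 0
static uint32_t
example_consume_event_flags(uint32_t *accumulated)
{
	uint32_t delivered = *accumulated;

	*accumulated = 0;       /* behave like EV_CLEAR regardless of flags */
	return delivered;
}
#endif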
1052
1053 int
1054 ch_kqfilter(struct kern_channel *ch, struct knote *kn,
1055 struct kevent_qos_s *kev)
1056 {
1057 int result;
1058
1059 lck_mtx_lock(&ch->ch_lock);
1060 VERIFY(!(ch->ch_flags & CHANF_KERNEL));
1061
1062 if (__improbable(ch->ch_na == NULL || !NA_IS_ACTIVE(ch->ch_na) ||
1063 na_reject_channel(ch, ch->ch_na))) {
1064 SK_ERR("%s(%d): channel is non-permissive, flags 0x%b", ch->ch_name,
1065 ch->ch_pid, ch->ch_flags, CHANF_BITS);
1066 knote_set_error(kn, ENXIO);
1067 lck_mtx_unlock(&ch->ch_lock);
1068 return 0;
1069 }
1070
1071 switch (kn->kn_filter) {
1072 case EVFILT_READ:
1073 kn->kn_filtid = EVFILTID_SKYWALK_CHANNEL_R;
1074 break;
1075
1076 case EVFILT_WRITE:
1077 kn->kn_filtid = EVFILTID_SKYWALK_CHANNEL_W;
1078 break;
1079
1080 case EVFILT_NW_CHANNEL:
1081 kn->kn_filtid = EVFILTID_SKYWALK_CHANNEL_E;
1082 break;
1083
1084 default:
1085 lck_mtx_unlock(&ch->ch_lock);
1086 SK_ERR("%s(%d): bad filter request %d", ch->ch_name,
1087 ch->ch_pid, kn->kn_filter);
1088 knote_set_error(kn, EINVAL);
1089 return 0;
1090 }
1091
1092 kn->kn_hook = ch;
1093 /* call the appropriate sub-filter attach with the channel lock held */
1094 result = knote_fops(kn)->f_attach(kn, kev);
1095 lck_mtx_unlock(&ch->ch_lock);
1096 return result;
1097 }
1098
1099 boolean_t
1100 ch_is_multiplex(struct kern_channel *ch, enum txrx t)
1101 {
1102 return ch->ch_na != NULL && (ch->ch_last[t] - ch->ch_first[t] > 1);
1103 }
1104
1105 int
1106 ch_select(struct kern_channel *ch, int events, void *wql, struct proc *p)
1107 {
1108 int revents;
1109 int event_error = 0;
1110
1111 lck_mtx_lock(&ch->ch_lock);
1112 revents = ch_event(ch, events, wql, p, NULL, FALSE, &event_error,
1113 FALSE);
1114 lck_mtx_unlock(&ch->ch_lock);
1115
1116 ASSERT((revents & POLLERR) == 0 || event_error != 0);
1117
1118 return revents;
1119 }
1120
1121 #if SK_LOG
1122 /* Hoisted out of line to reduce kernel stack footprint */
1123 SK_LOG_ATTRIBUTE
1124 static void
1125 ch_event_log(const char *prefix, const struct kern_channel *ch,
1126 struct proc *p, const struct nexus_adapter *na,
1127 int events, int revents)
1128 {
1129 SK_DF(SK_VERB_EVENTS, "%s: na \"%s\" (0x%llx) ch 0x%llx %s(%d) "
1130 "th 0x%llx ev 0x%x rev 0x%x", prefix, na->na_name, SK_KVA(na),
1131 SK_KVA(ch), sk_proc_name_address(p), sk_proc_pid(p),
1132 SK_KVA(current_thread()), events, revents);
1133 }
1134 #endif /* SK_LOG */
1135
1136 /*
1137 * select(2), poll(2) and kevent(2) handlers for channels.
1138 *
1139 * Can be called for one or more rings. Return the event mask
1140 * corresponding to ready events. If there are no ready events, do
1141 * a selrecord on either the individual selinfo or on the global one.
1142 * Device-dependent parts (locking and sync of tx/rx rings)
1143 * are done through callbacks.
1144 */
1145 static int
1146 ch_event(struct kern_channel *ch, int events, void *wql,
1147 struct proc *p, struct ch_event_result *result,
1148 const boolean_t is_kevent, int *errno, const boolean_t is_ch_event)
1149 {
1150 struct nexus_adapter *na;
1151 struct __kern_channel_ring *kring;
1152 uint32_t i, check_all_tx, check_all_rx, want[NR_TXRX], revents = 0;
1153 uint32_t ready_tx_data = 0, ready_rx_data = 0;
1154 sk_protect_t protect = NULL;
1155
1156 #define want_tx want[NR_TX]
1157 #define want_rx want[NR_RX]
1158 /*
1159 * In order to avoid nested locks, we need to "double check"
1160 * txsync and rxsync if we decide to do a selrecord().
1161 * retry_tx (and retry_rx, later) prevent looping forever.
1162 */
1163 boolean_t retry_tx = TRUE, retry_rx = TRUE;
1164 int found, error = 0;
1165 int s;
1166
1167 net_update_uptime();
1168
1169 LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
1170 ASSERT(!(ch->ch_flags & CHANF_KERNEL));
1171
1172 *errno = 0;
1173
1174 if (__improbable((ch->ch_flags & CHANF_DEFUNCT) ||
1175 ch->ch_schema == NULL)) {
1176 SK_ERR("%s(%d): channel is defunct or no longer bound",
1177 ch->ch_name, ch->ch_pid);
1178 revents = POLLERR;
1179 *errno = ENXIO;
1180 goto done;
1181 }
1182
1183 /* clear CHANF_DEFUNCT_SKIP if it was set during defunct last time */
1184 if (__improbable(ch->ch_flags & CHANF_DEFUNCT_SKIP)) {
1185 atomic_bitclear_32(&ch->ch_flags, CHANF_DEFUNCT_SKIP);
1186 }
1187
1188 na = ch->ch_na;
1189 if (__improbable(na == NULL ||
1190 !NA_IS_ACTIVE(na) || na_reject_channel(ch, na))) {
1191 SK_ERR("%s(%d): channel is non-permissive",
1192 ch->ch_name, ch->ch_pid);
1193 revents = POLLERR;
1194 *errno = ENXIO;
1195 goto done;
1196 }
1197
1198 /* mark thread with sync-in-progress flag */
1199 protect = sk_sync_protect();
1200
1201 /* update our work timestamp */
1202 na->na_work_ts = _net_uptime;
1203
1204 /* and make this channel eligible for draining again */
1205 if (na->na_flags & NAF_DRAINING) {
1206 atomic_bitclear_32(&na->na_flags, NAF_DRAINING);
1207 }
1208
1209 #if SK_LOG
1210 if (__improbable((sk_verbose & SK_VERB_EVENTS) != 0)) {
1211 ch_event_log("enter", ch, p, na, events, revents);
1212 }
1213 #endif
1214 if (is_ch_event) {
1215 goto process_channel_event;
1216 }
1217
1218 want_tx = (events & (POLLOUT | POLLWRNORM));
1219 want_rx = (events & (POLLIN | POLLRDNORM));
1220
1221 /*
1222 * check_all_{tx|rx} are set if the channel has more than one ring
1223 * AND the file descriptor is bound to all of them. If so, we sleep
1224 * on the "global" selinfo, otherwise we sleep on individual selinfo
1225 * The interrupt routine in the driver wake one or the other (or both)
1226 * depending on which clients are active.
1227 *
1228 * rxsync() is only called if we run out of buffers on a POLLIN.
1229 * txsync() is called if we run out of buffers on POLLOUT.
1230 */
1231 check_all_tx = ch_is_multiplex(ch, NR_TX);
1232 check_all_rx = ch_is_multiplex(ch, NR_RX);
1233
1234 /*
1235 * If want_tx is still set, we must issue txsync calls
1236 * (on all rings, to avoid that the tx rings stall).
1237 * XXX should also check head != khead on the tx rings.
1238 */
1239 if (want_tx) {
1240 ring_id_t first_tx = ch->ch_first[NR_TX];
1241 ring_id_t last_tx = ch->ch_last[NR_TX];
1242
1243 channel_threshold_unit_t tx_unit =
1244 ch->ch_info->cinfo_tx_lowat.cet_unit;
1245
1246 /*
1247 * The first round checks if anyone is ready, if not
1248 * do a selrecord and another round to handle races.
1249 * want_tx goes to 0 if any space is found, and is
1250 * used to skip rings with no pending transmissions.
1251 */
1252 flush_tx:
1253 for (i = first_tx, ready_tx_data = 0; i < last_tx; i++) {
1254 kring = &na->na_tx_rings[i];
1255 if (!want_tx &&
1256 kring->ckr_ring->ring_head == kring->ckr_khead) {
1257 continue;
1258 }
1259
1260 /* only one thread does txsync */
1261 s = kr_enter(kring, TRUE);
1262 ASSERT(s == 0);
1263
1264 error = 0;
1265 DTRACE_SKYWALK2(pretxprologue, struct kern_channel *,
1266 ch, struct __kern_channel_ring *, kring);
1267 if (kr_txsync_prologue(ch, kring, p) >=
1268 kring->ckr_num_slots) {
1269 kr_log_bad_ring(kring);
1270 revents |= POLLERR;
1271 error = EFAULT;
1272 if (*errno == 0) {
1273 *errno = EFAULT;
1274 }
1275 } else {
1276 if (kring->ckr_na_sync(kring, p, 0)) {
1277 revents |= POLLERR;
1278 error = EIO;
1279 if (*errno == 0) {
1280 *errno = EIO;
1281 }
1282 } else {
1283 kr_txsync_finalize(ch, kring, p);
1284 }
1285 }
1286 DTRACE_SKYWALK3(posttxfinalize, struct kern_channel *,
1287 ch, struct __kern_channel_ring *, kring, int,
1288 error);
1289
1290 /*
1291 * If we found new slots, notify potential listeners on
1292 * the same ring. Since we just did a txsync, look at
1293 * the copies of cur,tail in the kring.
1294 */
1295 found = kring->ckr_rhead != kring->ckr_rtail;
1296 kr_exit(kring);
1297 if (found) { /* notify other listeners */
1298 revents |= want_tx;
1299 want_tx = 0;
1300 (void) kring->ckr_na_notify(kring, p,
1301 (is_kevent ? NA_NOTEF_IN_KEVENT : 0));
1302 }
1303
1304 /*
1305 * Add this ring's free data to our running
1306 * tally for userspace.
1307 */
1308 if (result != NULL) {
1309 switch (tx_unit) {
1310 case CHANNEL_THRESHOLD_UNIT_BYTES:
1311 ready_tx_data += kring->ckr_ready_bytes;
1312 break;
1313 case CHANNEL_THRESHOLD_UNIT_SLOTS:
1314 ready_tx_data += kring->ckr_ready_slots;
1315 break;
1316 }
1317 }
1318 }
1319 if (want_tx && retry_tx && !is_kevent) {
1320 if (check_all_tx) {
1321 csi_selrecord_all(na, NR_TX, p, wql);
1322 } else {
1323 csi_selrecord_one(&na->na_tx_rings[first_tx],
1324 p, wql);
1325 }
1326 retry_tx = FALSE;
1327 goto flush_tx;
1328 }
1329 }
1330
1331 /*
1332 * If want_rx is still set scan receive rings.
1333 * Do it on all rings because otherwise we starve.
1334 */
1335 if (want_rx) {
1336 ring_id_t first_rx = ch->ch_first[NR_RX];
1337 ring_id_t last_rx = ch->ch_last[NR_RX];
1338 channel_threshold_unit_t rx_unit =
1339 ch->ch_info->cinfo_rx_lowat.cet_unit;
1340
1341 /* two rounds here for race avoidance */
1342 do_retry_rx:
1343 for (i = first_rx, ready_rx_data = 0; i < last_rx; i++) {
1344 kring = &na->na_rx_rings[i];
1345
1346 /* only one thread does rxsync */
1347 s = kr_enter(kring, TRUE);
1348 ASSERT(s == 0);
1349
1350 error = 0;
1351 DTRACE_SKYWALK2(prerxprologue, struct kern_channel *,
1352 ch, struct __kern_channel_ring *, kring);
1353 if (kr_rxsync_prologue(ch, kring, p) >=
1354 kring->ckr_num_slots) {
1355 kr_log_bad_ring(kring);
1356 revents |= POLLERR;
1357 error = EFAULT;
1358 if (*errno == 0) {
1359 *errno = EFAULT;
1360 }
1361 } else {
1362 /* now we can use kring->rhead, rtail */
1363 if (kring->ckr_na_sync(kring, p, 0)) {
1364 revents |= POLLERR;
1365 error = EIO;
1366 if (*errno == 0) {
1367 *errno = EIO;
1368 }
1369 } else {
1370 kr_rxsync_finalize(ch, kring, p);
1371 }
1372 }
1373
1374 DTRACE_SKYWALK3(postrxfinalize, struct kern_channel *,
1375 ch, struct __kern_channel_ring *, kring, int,
1376 error);
1377
1378 found = kring->ckr_rhead != kring->ckr_rtail;
1379 kr_exit(kring);
1380 if (found) {
1381 revents |= want_rx;
1382 retry_rx = FALSE;
1383 (void) kring->ckr_na_notify(kring, p,
1384 (is_kevent ? NA_NOTEF_IN_KEVENT : 0));
1385 }
1386
1387 /*
1388 * Add this ring's readable data to our running
1389 * tally for userspace.
1390 */
1391 if (result != NULL) {
1392 switch (rx_unit) {
1393 case CHANNEL_THRESHOLD_UNIT_BYTES:
1394 ready_rx_data += kring->ckr_ready_bytes;
1395 break;
1396 case CHANNEL_THRESHOLD_UNIT_SLOTS:
1397 ready_rx_data += kring->ckr_ready_slots;
1398 break;
1399 }
1400 }
1401 }
1402
1403 if (retry_rx && !is_kevent) {
1404 if (check_all_rx) {
1405 csi_selrecord_all(na, NR_RX, p, wql);
1406 } else {
1407 csi_selrecord_one(&na->na_rx_rings[first_rx],
1408 p, wql);
1409 }
1410 }
1411 if (retry_rx) {
1412 retry_rx = FALSE;
1413 goto do_retry_rx;
1414 }
1415 }
1416
1417 if (result != NULL) {
1418 result->tx_data = ready_tx_data;
1419 result->rx_data = ready_rx_data;
1420 }
1421 goto skip_channel_event;
1422
1423 process_channel_event:
1424 /*
1425 * perform sync operation on the event ring to make the channel
1426 * events enqueued in the ring visible to user-space.
1427 */
1428
1429 /* select() and poll() not supported for event ring */
1430 ASSERT(is_kevent);
1431 VERIFY((ch->ch_last[NR_EV] - ch->ch_first[NR_EV]) == 1);
1432 kring = &na->na_event_rings[ch->ch_first[NR_EV]];
1433
1434 /* only one thread does the sync */
1435 s = kr_enter(kring, TRUE);
1436 ASSERT(s == 0);
1437 if (kr_event_sync_prologue(kring, p) >= kring->ckr_num_slots) {
1438 kr_log_bad_ring(kring);
1439 revents |= POLLERR;
1440 if (*errno == 0) {
1441 *errno = EFAULT;
1442 }
1443 } else {
1444 if (kring->ckr_na_sync(kring, p, 0)) {
1445 revents |= POLLERR;
1446 if (*errno == 0) {
1447 *errno = EIO;
1448 }
1449 } else {
1450 kr_event_sync_finalize(ch, kring, p);
1451 }
1452 }
1453 found = (kring->ckr_rhead != kring->ckr_rtail);
1454 kr_exit(kring);
1455 if (found) {
1456 revents |= (events & POLLIN);
1457 }
1458
1459 skip_channel_event:
1460 #if SK_LOG
1461 if (__improbable((sk_verbose & SK_VERB_EVENTS) != 0)) {
1462 ch_event_log("exit", ch, p, na, events, revents);
1463 }
1464 #endif /* SK_LOG */
1465
1466 /* unmark thread with sync-in-progress flag */
1467 sk_sync_unprotect(protect);
1468
1469 done:
1470 ASSERT(!sk_is_sync_protected());
1471
1472 return revents;
1473 #undef want_tx
1474 #undef want_rx
1475 }
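
/*
 * Illustrative sketch (not original code): the shape of the wait/retry
 * logic ch_event() uses for want_tx/want_rx.  A first pass syncs the
 * rings; if nothing is ready and this is not a kevent path, it does a
 * selrecord() and scans once more so an event landing between the scan
 * and the selrecord is not lost.  "example_scan_rings" and
 * "example_record" stand in for the kr_*sync/finalize and csi_selrecord_*
 * calls and are assumptions for illustration only; kept under #if 0.
 */
#if 0
static int
example_poll_with_retry(boolean_t is_kevent)
{
	boolean_t retried = FALSE;
	int revents = 0;

	for (;;) {
		revents = example_scan_rings();
		if (revents != 0 || is_kevent || retried) {
			break;
		}
		example_record();       /* selrecord() before re-checking */
		retried = TRUE;         /* second pass closes the race */
	}
	return revents;
}
#endif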
1476
1477 static struct kern_channel *
1478 ch_find(struct kern_nexus *nx, nexus_port_t port, ring_id_t ring_id)
1479 {
1480 struct kern_channel *ch;
1481
1482 SK_LOCK_ASSERT_HELD();
1483
1484 STAILQ_FOREACH(ch, &nx->nx_ch_head, ch_link) {
1485 struct ch_info *cinfo = ch->ch_info;
1486
1487 /* see comments in ch_open() */
1488 if (cinfo->cinfo_nx_port != port) {
1489 continue;
1490 } else if (cinfo->cinfo_ch_mode & CHMODE_MONITOR) {
1491 continue;
1492 } else if (cinfo->cinfo_ch_ring_id != CHANNEL_RING_ID_ANY &&
1493 ring_id != cinfo->cinfo_ch_ring_id &&
1494 ring_id != CHANNEL_RING_ID_ANY) {
1495 continue;
1496 }
1497
1498 /* found a match */
1499 break;
1500 }
1501
1502 if (ch != NULL) {
1503 ch_retain_locked(ch);
1504 }
1505
1506 return ch;
1507 }
1508
1509 #if SK_LOG
1510 /* Hoisted out of line to reduce kernel stack footprint */
1511 SK_LOG_ATTRIBUTE
1512 static void
1513 ch_open_log1(const uuid_t p_uuid, struct proc *p, nexus_port_t port)
1514 {
1515 uuid_string_t uuidstr;
1516
1517 SK_D("%s(%d) uniqueid %llu exec_uuid %s port %u",
1518 sk_proc_name_address(p), sk_proc_pid(p), proc_uniqueid(p),
1519 sk_uuid_unparse(p_uuid, uuidstr), port);
1520 }
1521
1522 SK_LOG_ATTRIBUTE
1523 static void
1524 ch_open_log2(struct proc *p, nexus_port_t port, ring_id_t ring,
1525 uint32_t mode, const char *mode_bits, int err)
1526 {
1527 SK_D("%s(%d) port %u ring %d mode 0x%b err %d",
1528 sk_proc_name_address(p), sk_proc_pid(p), port, (int)ring,
1529 mode, mode_bits, err);
1530 }
1531 #endif /* SK_LOG */
1532
1533 struct kern_channel *
1534 ch_open(struct ch_init *init, struct proc *p, int fd, int *err)
1535 {
1536 uint32_t mode = init->ci_ch_mode;
1537 nexus_port_t port = init->ci_nx_port;
1538 ring_id_t ring = init->ci_ch_ring_id;
1539 struct kern_channel *ch = NULL, *ch0 = NULL;
1540 struct nxbind *nxb = NULL;
1541 struct kern_nexus *nx;
1542 struct chreq chr;
1543 uuid_t p_uuid;
1544 kauth_cred_t cred;
1545
1546 cred = kauth_cred_get();
1547 ASSERT(!uuid_is_null(init->ci_nx_uuid));
1548 proc_getexecutableuuid(p, p_uuid, sizeof(p_uuid));
1549 *err = 0;
1550
1551 /* make sure we don't allow userland to set kernel-only flags */
1552 mode &= CHMODE_MASK;
1553
1554 SK_LOCK();
1555
1556 nx = nx_find(init->ci_nx_uuid, TRUE);
1557 if (nx == NULL) {
1558 *err = ENOENT;
1559 goto done;
1560 }
1561
1562 /* port (zero-based) must be within the domain's range */
1563 if (port >= NXDOM_MAX(NX_DOM(nx), ports)) {
1564 *err = EDOM;
1565 goto done;
1566 }
1567 VERIFY(port != NEXUS_PORT_ANY);
1568
1569 if (mode & CHMODE_LOW_LATENCY) {
1570 if ((*err = skywalk_priv_check_cred(p, cred,
1571 PRIV_SKYWALK_LOW_LATENCY_CHANNEL)) != 0) {
1572 goto done;
1573 }
1574 }
1575
1576 /* "no copy" is valid only when at least one tx/rx mon flag is set */
1577 if (!(mode & CHMODE_MONITOR) && (mode & CHMODE_MONITOR_NO_COPY)) {
1578 mode &= ~CHMODE_MONITOR_NO_COPY;
1579 }
1580
1581 if (mode & CHMODE_MONITOR) {
1582 if ((*err = skywalk_priv_check_cred(p, cred,
1583 PRIV_SKYWALK_OBSERVE_ALL)) != 0) {
1584 goto done;
1585 }
1586 /* Don't allow non-root processes to monitor channels. */
1587 if (kauth_cred_issuser(cred) == 0) {
1588 *err = EPERM;
1589 goto done;
1590 }
1591 }
1592
1593 /*
1594 * Check with the nexus to see if the port is bound; if so, prepare
1595 * our nxbind structure that we'll need to pass down to the nexus
1596 * for it to compare. If the caller provides a key, we take it over
1597 * and will free it ourselves (as part of freeing nxbind.)
1598 *
1599 * If this is a monitor channel, skip this altogether since the check
1600 * for PRIV_SKYWALK_OBSERVE_ALL privilege has been done above.
1601 */
1602 if (!(mode & CHMODE_MONITOR) && !NX_ANONYMOUS_PROV(nx)) {
1603 void *key = (void *)(init->ci_key);
1604
1605 #if SK_LOG
1606 if (__improbable(sk_verbose != 0)) {
1607 ch_open_log1(p_uuid, p, port);
1608 }
1609 #endif /* SK_LOG */
1610
1611 nxb = nxb_alloc(Z_WAITOK);
1612 nxb->nxb_flags |= NXBF_MATCH_UNIQUEID;
1613 nxb->nxb_uniqueid = proc_uniqueid(p);
1614 nxb->nxb_pid = proc_pid(p);
1615 nxb->nxb_flags |= NXBF_MATCH_EXEC_UUID;
1616 uuid_copy(nxb->nxb_exec_uuid, p_uuid);
1617 if (key != NULL) {
1618 nxb->nxb_flags |= NXBF_MATCH_KEY;
1619 nxb->nxb_key_len = init->ci_key_len;
1620 nxb->nxb_key = key;
1621 init->ci_key = USER_ADDR_NULL; /* take over */
1622 }
1623 }
1624
1625 /*
1626 * There can only be one owner of {port,ring_id} tuple. Once
1627 * owned, this can be made available among multiple monitors.
1628 * CHANNEL_RING_ID_ANY (-1) ring_id gives exclusive rights over
1629 * all rings. Further attempts to own any or all of the rings
1630 * will be declined.
1631 *
1632 * Multiple monitors are allowed to exist. If a channel has been
1633 * bound to CHANNEL_RING_ID_ANY, any or all of its rings can be
1634 * monitored. If an owning channel has been bound to an individual
1635 * ring, only that ring can be monitored, either by specifying the
1636 * equivalent ring_id or CHANNEL_RING_ID_ANY at monitor open time.
1637 *
1638 * For example, assuming a 2-rings setup for port 'p':
1639 *
1640 * owner{p,-1}
1641 * will allow:
1642 * monitor{p,-1}, monitor{p,0}, monitor{p,1}
1643 * will not allow:
1644 * owner{p,-1}, owner{p,0}, owner{p,1}
1645 *
1646 * owner{p,0}
1647 * will allow:
1648 * owner{p,1}, monitor{p,-1}, monitor{p,0}
1649 * will not allow:
1650 * owner{p,-1}, owner{p,0}, monitor{p,1}
1651 */
1652 if ((ch0 = ch_find(nx, port, ring)) != NULL) {
1653 SK_D("found ch0 0x%llx", SK_KVA(ch0));
1654 /*
1655 * Unless this is a monitor channel, allow only at
1656 * most one owner of the {port,ring_id} tuple.
1657 */
1658 if (!(mode & CHMODE_MONITOR)) {
1659 #if SK_LOG
1660 uuid_string_t uuidstr;
1661 char *na_name = (ch0->ch_na != NULL) ?
1662 ch0->ch_na->na_name : "";
1663
1664 SK_DSC(p, "ch %s flags (0x%x) exists on port %d on "
1665 "nx %s, owner %s(%d)", na_name, ch0->ch_flags, port,
1666 sk_uuid_unparse(nx->nx_uuid, uuidstr),
1667 ch0->ch_name, ch0->ch_pid);
1668 #endif /* SK_LOG */
1669 *err = EBUSY;
1670 goto done;
1671 }
1672 } else if (mode & CHMODE_MONITOR) {
1673 *err = ENXIO;
1674 goto done;
1675 }
1676
1677 bzero(&chr, sizeof(chr));
1678 chr.cr_tx_lowat = init->ci_tx_lowat;
1679 chr.cr_rx_lowat = init->ci_rx_lowat;
1680 chr.cr_port = port;
1681 chr.cr_mode = mode;
1682 chr.cr_ring_id = ring;
1683
1684 /* upon success, returns a channel with reference held */
1685 ch = ch_connect(nx, &chr, ch0, nxb, p, fd, err);
1686
1687 done:
1688
1689 #if SK_LOG
1690 if (__improbable(sk_verbose != 0)) {
1691 ch_open_log2(p, port, ring, mode, CHMODE_BITS, *err);
1692 }
1693 #endif /* SK_LOG */
1694
1695 if (ch0 != NULL) {
1696 (void) ch_release_locked(ch0);
1697 }
1698
1699 if (nx != NULL) {
1700 (void) nx_release_locked(nx);
1701 }
1702
1703 if (nxb != NULL) {
1704 nxb_free(nxb);
1705 }
1706
1707 SK_UNLOCK();
1708
1709 return ch;
1710 }
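
/*
 * Illustrative sketch (not original code): the admission rule described in
 * the ownership/monitor comment block inside ch_open() above, reduced to a
 * pure decision.  At most one owner may hold a {port,ring_id} tuple;
 * monitors are admitted only where an owner already exists.  Parameter
 * names are assumptions; kept under #if 0 so it has no effect on the build.
 */
#if 0
static int
example_ch_admit(boolean_t owner_exists, boolean_t request_is_monitor)
{
	if (owner_exists) {
		/* tuple is taken; only monitors may share it */
		return request_is_monitor ? 0 : EBUSY;
	}
	/* nothing to monitor yet; only owners may proceed */
	return request_is_monitor ? ENXIO : 0;
}
#endif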
1711
1712 struct kern_channel *
1713 ch_open_special(struct kern_nexus *nx, struct chreq *chr, boolean_t nonxref,
1714 int *err)
1715 {
1716 struct kern_channel *ch = NULL;
1717
1718 SK_LOCK_ASSERT_HELD();
1719 *err = 0;
1720
1721 ASSERT((chr->cr_mode & CHMODE_USER_PACKET_POOL) == 0);
1722 ASSERT((chr->cr_mode & CHMODE_EVENT_RING) == 0);
1723 ASSERT((chr->cr_mode & CHMODE_LOW_LATENCY) == 0);
1724 ASSERT(!uuid_is_null(chr->cr_spec_uuid));
1725 chr->cr_mode |= CHMODE_KERNEL;
1726 if (nonxref) {
1727 chr->cr_mode |= CHMODE_NO_NXREF;
1728 } else {
1729 chr->cr_mode &= ~CHMODE_NO_NXREF;
1730 }
1731
1732 /* upon success, returns a channel with reference held */
1733 ch = ch_connect(nx, chr, NULL, NULL, kernproc, -1, err);
1734 if (ch != NULL) {
1735 /*
1736 * nonxref channels don't hold any reference to the nexus,
1737 * since otherwise we'll never be able to close them when
1738 * the last regular channel of the nexus is closed, as part
1739 * of the nexus's destructor operation. Release the nonxref
1740 * channel reference now, but make sure the nexus has at
1741 * least 3 refs: global list, provider list and the nonxref
1742 * channel itself, before doing that.
1743 */
1744 if (nonxref) {
1745 ASSERT(ch->ch_flags & (CHANF_KERNEL | CHANF_NONXREF));
1746 ASSERT(nx->nx_refcnt > 3);
1747 (void) nx_release_locked(nx);
1748 }
1749 }
1750
1751 #if SK_LOG
1752 uuid_string_t uuidstr;
1753 SK_D("nx 0x%llx (%s:\"%s\":%d:%d) spec_uuid \"%s\" mode 0x%b err %d",
1754 SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name, (ch != NULL ?
1755 ch->ch_na->na_name : ""), (int)chr->cr_port, (int)chr->cr_ring_id,
1756 sk_uuid_unparse(chr->cr_spec_uuid, uuidstr), chr->cr_mode,
1757 CHMODE_BITS, *err);
1758 #endif /* SK_LOG */
1759
1760 return ch;
1761 }
1762
1763 static void
1764 ch_close_common(struct kern_channel *ch, boolean_t locked, boolean_t special)
1765 {
1766 #pragma unused(special)
1767 #if SK_LOG
1768 uuid_string_t uuidstr;
1769 const char *na_name = (ch->ch_na != NULL) ?
1770 ch->ch_na->na_name : "";
1771 const char *nxdom_name = (ch->ch_nexus != NULL) ?
1772 NX_DOM(ch->ch_nexus)->nxdom_name : "";
1773 const char *nxdom_prov_name = (ch->ch_nexus != NULL) ?
1774 NX_DOM_PROV(ch->ch_nexus)->nxdom_prov_name : "";
1775
1776 SK_D("ch 0x%llx (%s:%s:\"%s\":%u:%d)",
1777 SK_KVA(ch), nxdom_name, nxdom_prov_name, na_name,
1778 ch->ch_info->cinfo_nx_port, (int)ch->ch_info->cinfo_ch_ring_id);
1779 SK_D(" UUID: %s", sk_uuid_unparse(ch->ch_info->cinfo_ch_id,
1780 uuidstr));
1781 SK_D(" flags: 0x%b", ch->ch_flags, CHANF_BITS);
1782 #endif /* SK_LOG */
1783 struct kern_nexus *nx = ch->ch_nexus;
1784
1785 if (!locked) {
1786 SK_LOCK();
1787 }
1788
1789 SK_LOCK_ASSERT_HELD();
1790 /*
1791 * If the channel is participating in the interface advisory
1792 * notification, remove it from the nexus.
1793 * CHANF_IF_ADV is set and cleared only when nx_ch_if_adv_lock
1794 * is held in exclusive mode.
1795 */
1796 lck_rw_lock_exclusive(&nx->nx_ch_if_adv_lock);
1797 if ((ch->ch_flags & CHANF_IF_ADV) != 0) {
1798 STAILQ_REMOVE(&nx->nx_ch_if_adv_head, ch,
1799 kern_channel, ch_link_if_adv);
1800 atomic_bitclear_32(&ch->ch_flags, CHANF_IF_ADV);
1801 if (STAILQ_EMPTY(&nx->nx_ch_if_adv_head)) {
1802 nx_netif_config_interface_advisory(nx, false);
1803 }
1804 lck_rw_done(&nx->nx_ch_if_adv_lock);
1805 lck_mtx_lock(&ch->ch_lock);
1806 (void) ch_release_locked(ch);
1807 } else {
1808 lck_rw_done(&nx->nx_ch_if_adv_lock);
1809 lck_mtx_lock(&ch->ch_lock);
1810 }
1811 LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
1812 /*
1813 * Mark the channel as closing to prevent further setopt requests;
1814 * this flag is set once here and never gets cleared.
1815 */
1816 ASSERT(!(ch->ch_flags & CHANF_CLOSING));
1817 atomic_bitset_32(&ch->ch_flags, CHANF_CLOSING);
1818
1819 if (special) {
1820 VERIFY(ch->ch_flags & CHANF_KERNEL);
1821 } else {
1822 VERIFY(!(ch->ch_flags & CHANF_KERNEL));
1823 }
1824
1825 ch->ch_fd = -1;
1826
1827 /* may be called as part of failure cleanup, so check */
1828 if (ch->ch_flags & CHANF_ATTACHED) {
1829 boolean_t nonxref = !!(ch->ch_flags & CHANF_NONXREF);
1830
1831 /* caller must hold an extra ref */
1832 ASSERT(ch->ch_refcnt > 1);
1833
1834 /* disconnect from nexus */
1835 ch_disconnect(ch);
1836
1837 /*
1838 * If this was the last regular channel and the nexus
1839 * has been closed, detach it and finish up the job.
1840 * If this was a nonxref channel, there is nothing
1841 * left to do; see comments in ch_open_special().
1842 */
1843 if (!nonxref) {
1844 STAILQ_REMOVE(&nx->nx_ch_head, ch,
1845 kern_channel, ch_link);
1846 nx->nx_ch_count--;
1847 if (STAILQ_EMPTY(&nx->nx_ch_head) &&
1848 (nx->nx_flags & NXF_CLOSED)) {
1849 ASSERT(STAILQ_EMPTY(&nx->nx_ch_if_adv_head));
1850 nx_detach(nx);
1851 }
1852 (void) nx_release_locked(nx);
1853 } else {
1854 ASSERT(ch->ch_flags & CHANF_KERNEL);
1855 STAILQ_REMOVE(&nx->nx_ch_nonxref_head, ch,
1856 kern_channel, ch_link);
1857 }
1858
1859 atomic_bitclear_32(&ch->ch_flags, CHANF_ATTACHED);
1860 ch->ch_nexus = NULL;
1861
1862 (void) ch_release_locked(ch); /* for the list */
1863 }
1864
1865 lck_mtx_unlock(&ch->ch_lock);
1866 if (!locked) {
1867 SK_UNLOCK();
1868 }
1869 }
1870
1871 void
1872 ch_close(struct kern_channel *ch, boolean_t locked)
1873 {
1874 ch_close_common(ch, locked, FALSE);
1875 }
1876
1877 void
1878 ch_close_special(struct kern_channel *ch)
1879 {
1880 ch_close_common(ch, TRUE, TRUE);
1881 }
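/*
 * Note: ch_close() may be called with or without sk_lock held (as indicated
 * by 'locked') and is used for regular channels, while ch_close_special()
 * requires sk_lock to be held by the caller and is reserved for CHANF_KERNEL
 * channels opened via ch_open_special().
 */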
1882
1883 static int
1884 ch_ev_thresh_validate(struct kern_nexus *nx, enum txrx t,
1885 struct ch_ev_thresh *cet)
1886 {
1887 struct nxprov_params *nxp = NX_PROV(nx)->nxprov_params;
1888 uint32_t bmin, bmax, smin, smax;
1889 int err = 0;
1890
1891 if (cet->cet_unit != CHANNEL_THRESHOLD_UNIT_BYTES &&
1892 cet->cet_unit != CHANNEL_THRESHOLD_UNIT_SLOTS) {
1893 err = EINVAL;
1894 goto done;
1895 }
1896
1897 smin = 1; /* minimum 1 slot */
1898 bmin = 1; /* minimum 1 byte */
1899
1900 if (t == NR_TX) {
1901 ASSERT(nxp->nxp_tx_slots > 0);
1902 smax = (nxp->nxp_tx_slots - 1);
1903 } else {
1904 ASSERT(nxp->nxp_rx_slots > 0);
1905 smax = (nxp->nxp_rx_slots - 1);
1906 }
1907 bmax = (smax * nxp->nxp_buf_size);
1908
1909 switch (cet->cet_unit) {
1910 case CHANNEL_THRESHOLD_UNIT_BYTES:
1911 if (cet->cet_value < bmin) {
1912 cet->cet_value = bmin;
1913 } else if (cet->cet_value > bmax) {
1914 cet->cet_value = bmax;
1915 }
1916 break;
1917
1918 case CHANNEL_THRESHOLD_UNIT_SLOTS:
1919 if (cet->cet_value < smin) {
1920 cet->cet_value = smin;
1921 } else if (cet->cet_value > smax) {
1922 cet->cet_value = smax;
1923 }
1924 break;
1925 }
1926
1927 done:
1928 return err;
1929 }
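/*
 * Worked example (illustrative, assuming a provider configured with
 * nxp_tx_slots = 256 and nxp_buf_size = 2048): a TX threshold expressed in
 * slots is clamped to [1, 255], and one expressed in bytes is clamped to
 * [1, 255 * 2048] = [1, 522240].
 *
 *	struct ch_ev_thresh cet = {
 *		.cet_unit = CHANNEL_THRESHOLD_UNIT_SLOTS,
 *		.cet_value = 1000,	// exceeds smax
 *	};
 *	if (ch_ev_thresh_validate(nx, NR_TX, &cet) == 0) {
 *		// cet.cet_value has been clamped to 255
 *	}
 */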
1930
1931 #if SK_LOG
1932 /* Hoisted out of line to reduce kernel stack footprint */
1933 SK_LOG_ATTRIBUTE
1934 static void
1935 ch_connect_log1(const struct kern_nexus *nx, const struct ch_info *cinfo,
1936 const struct chreq *chr, const struct kern_channel *ch,
1937 const struct kern_nexus_domain_provider *nxdom_prov,
1938 struct proc *p)
1939 {
1940 struct __user_channel_schema *ch_schema = ch->ch_schema;
1941 uuid_string_t uuidstr;
1942 unsigned int n;
1943 ring_id_t i, j;
1944
1945 ASSERT(ch_schema != NULL || (ch->ch_flags & CHANF_KERNEL));
1946 if (ch_schema != NULL) {
1947 SK_D("channel_schema at 0x%llx", SK_KVA(ch_schema));
1948 SK_D(" kern_name: \"%s\"", ch_schema->csm_kern_name);
1949 SK_D(" kern_uuid: %s",
1950 sk_uuid_unparse(ch_schema->csm_kern_uuid, uuidstr));
1951 SK_D(" flags: 0x%b", ch_schema->csm_flags, CSM_BITS);
1952 SK_D(" tx_rings: %u [%u,%u]", ch_schema->csm_tx_rings,
1953 cinfo->cinfo_first_tx_ring, cinfo->cinfo_last_tx_ring);
1954 SK_D(" rx_rings: %u [%u,%u]", ch_schema->csm_rx_rings,
1955 cinfo->cinfo_first_rx_ring, cinfo->cinfo_last_rx_ring);
1956
1957 j = ch->ch_last[NR_TX];
1958 for (n = 0, i = ch->ch_first[NR_TX]; i < j; n++, i++) {
1959 SK_D(" tx_ring_%u_off: 0x%llx", i,
1960 (uint64_t)ch_schema->csm_ring_ofs[n].ring_off);
1961 SK_D(" tx_sd_%u_off: 0x%llx", i,
1962 (uint64_t)ch_schema->csm_ring_ofs[n].sd_off);
1963 }
1964 j = n;
1965 for (n = 0, i = ch->ch_first[NR_RX];
1966 i < ch->ch_last[NR_RX]; n++, i++) {
1967 SK_D(" rx_ring_%u_off: 0x%llx", i,
1968 (uint64_t)ch_schema->csm_ring_ofs[n + j].ring_off);
1969 SK_D(" rx_sd_%u_off: 0x%llx", i,
1970 (uint64_t)ch_schema->csm_ring_ofs[n + j].sd_off);
1971 }
1972 SK_D(" md_type: %u", ch_schema->csm_md_type);
1973 SK_D(" md_subtype: %u", ch_schema->csm_md_subtype);
1974 SK_D(" stats_ofs: 0x%llx", ch_schema->csm_stats_ofs);
1975 SK_D(" stats_type: %u", ch_schema->csm_stats_type);
1976 SK_D(" flowadv_ofs: 0x%llx", ch_schema->csm_flowadv_ofs);
1977 SK_D(" flowadv_max: %u", ch_schema->csm_flowadv_max);
1978 SK_D(" nexusadv_ofs: 0x%llx", ch_schema->csm_nexusadv_ofs);
1979 }
1980
1981 SK_D("ch 0x%llx (%s:%s:\"%s\":%u:%d)",
1982 SK_KVA(ch), nxdom_prov->nxdom_prov_dom->nxdom_name,
1983 nxdom_prov->nxdom_prov_name, ch->ch_na->na_name,
1984 cinfo->cinfo_nx_port, (int)cinfo->cinfo_ch_ring_id);
1985 SK_D(" ch UUID: %s", sk_uuid_unparse(cinfo->cinfo_ch_id, uuidstr));
1986 SK_D(" nx UUID: %s", sk_uuid_unparse(nx->nx_uuid, uuidstr));
1987 SK_D(" flags: 0x%b", ch->ch_flags, CHANF_BITS);
1988 SK_D(" task: 0x%llx %s(%d)", SK_KVA(ch->ch_mmap.ami_maptask),
1989 sk_proc_name_address(p), sk_proc_pid(p));
1990 SK_D(" txlowat: %u (%s)", cinfo->cinfo_tx_lowat.cet_value,
1991 ((cinfo->cinfo_tx_lowat.cet_unit == CHANNEL_THRESHOLD_UNIT_BYTES) ?
1992 "bytes" : "slots"));
1993 SK_D(" rxlowat: %u (%s)", cinfo->cinfo_rx_lowat.cet_value,
1994 ((cinfo->cinfo_rx_lowat.cet_unit == CHANNEL_THRESHOLD_UNIT_BYTES) ?
1995 "bytes" : "slots"));
1996 SK_D(" mmapref: 0x%llx", SK_KVA(ch->ch_mmap.ami_mapref));
1997 SK_D(" mapaddr: 0x%llx", (uint64_t)cinfo->cinfo_mem_base);
1998 SK_D(" mapsize: 0x%llx (%llu KB)",
1999 (uint64_t)cinfo->cinfo_mem_map_size,
2000 (uint64_t)cinfo->cinfo_mem_map_size >> 10);
2001 SK_D(" memsize: 0x%llx (%llu KB)",
2002 (uint64_t)chr->cr_memsize, (uint64_t)chr->cr_memsize >> 10);
2003 SK_D(" offset: 0x%llx", (uint64_t)cinfo->cinfo_schema_offset);
2004 }
2005
2006 SK_LOG_ATTRIBUTE
2007 static void
2008 ch_connect_log2(const struct kern_nexus *nx, int err)
2009 {
2010 uuid_string_t nx_uuidstr;
2011
2012 SK_ERR("Error connecting to nexus UUID %s: %d",
2013 sk_uuid_unparse(nx->nx_uuid, nx_uuidstr), err);
2014 }
2015 #endif /* SK_LOG */
2016
2017 static struct kern_channel *
2018 ch_connect(struct kern_nexus *nx, struct chreq *chr, struct kern_channel *ch0,
2019 struct nxbind *nxb, struct proc *p, int fd, int *err)
2020 {
2021 struct kern_nexus_domain_provider *nxdom_prov;
2022 struct kern_channel *ch = NULL;
2023 struct ch_info *cinfo = NULL;
2024 uint32_t ch_mode = chr->cr_mode;
2025 boolean_t config = FALSE;
2026 struct nxdom *nxdom;
2027 boolean_t reserved_port = FALSE;
2028
2029 ASSERT(!(ch_mode & CHMODE_KERNEL) || p == kernproc);
2030 ASSERT(chr->cr_port != NEXUS_PORT_ANY || (ch_mode & CHMODE_KERNEL));
2031 SK_LOCK_ASSERT_HELD();
2032
2033 /* validate thresholds before we proceed any further */
2034 if ((*err = ch_ev_thresh_validate(nx, NR_TX, &chr->cr_tx_lowat)) != 0 ||
2035 (*err = ch_ev_thresh_validate(nx, NR_RX, &chr->cr_rx_lowat)) != 0) {
2036 goto done;
2037 }
2038
2039 if (!(ch_mode & CHMODE_KERNEL) && !NX_USER_CHANNEL_PROV(nx)) {
2040 *err = ENOTSUP;
2041 goto done;
2042 }
2043
2044 ch = ch_alloc(Z_WAITOK);
2045
2046 lck_mtx_lock(&ch->ch_lock);
2047
2048 uuid_generate_random(ch->ch_info->cinfo_ch_id);
2049 ch->ch_fd = fd;
2050 ch->ch_pid = proc_pid(p);
2051 (void) snprintf(ch->ch_name, sizeof(ch->ch_name), "%s",
2052 proc_name_address(p));
2053
2054 nxdom_prov = NX_DOM_PROV(nx);
2055 nxdom = NX_DOM(nx);
2056
2057 if (ch_mode & (CHMODE_KERNEL | CHMODE_NO_NXREF)) {
2058 /*
2059 * CHANF_KERNEL implies a channel opened by a kernel
2060 * subsystem, and is triggered by the CHMODE_KERNEL
2061 * flag which is (only ever) set by ch_open_special().
2062 *
2063 * CHANF_NONXREF can be optionally set based on the
2064 * CHMODE_NO_NXREF request flag. This must only be
2065 * set by ch_open_special() as well, hence we verify.
2066 */
2067 ASSERT(p == kernproc);
2068 ASSERT(ch_mode & CHMODE_KERNEL);
2069 atomic_bitset_32(&ch->ch_flags, CHANF_KERNEL);
2070 if (ch_mode & CHMODE_NO_NXREF) {
2071 atomic_bitset_32(&ch->ch_flags, CHANF_NONXREF);
2072 }
2073
2074 config = (ch_mode & CHMODE_CONFIG) != 0;
2075 if (chr->cr_port == NEXUS_PORT_ANY) {
2076 if (nxdom->nxdom_find_port == NULL) {
2077 *err = ENOTSUP;
2078 goto done;
2079 }
2080
2081 /*
2082 * If this is an ephemeral port request, find one for the
2083 * client; we ask for the reserved port range if this is a
2084 * configuration request (CHMODE_CONFIG).
2085 */
2086 if ((*err = nxdom->nxdom_find_port(nx,
2087 config, &chr->cr_port)) != 0) {
2088 goto done;
2089 }
2090 }
2091 }
2092
2093 if (skywalk_check_platform_binary(p)) {
2094 atomic_bitset_32(&ch->ch_flags, CHANF_PLATFORM);
2095 }
2096
2097 ASSERT(chr->cr_port != NEXUS_PORT_ANY);
2098
2099 reserved_port = (nxdom->nxdom_port_is_reserved != NULL &&
2100 (*nxdom->nxdom_port_is_reserved)(nx, chr->cr_port));
2101 if (!config && reserved_port) {
2102 *err = EDOM;
2103 goto done;
2104 }
2105
2106 SK_D("%s(%d) %snexus port %u requested", sk_proc_name_address(p),
2107 sk_proc_pid(p), reserved_port ? "[reserved] " : "", chr->cr_port);
2108
2109 if ((*err = nxdom_prov->nxdom_prov_dom->nxdom_connect(nxdom_prov,
2110 nx, ch, chr, ch0, nxb, p)) != 0) {
2111 goto done;
2112 }
2113
2114 cinfo = ch->ch_info;
2115 uuid_copy(cinfo->cinfo_nx_uuid, nx->nx_uuid);
2116 /* for easy access to immutables */
2117 bcopy((void *)nx->nx_prov->nxprov_params,
2118 (void *)&cinfo->cinfo_nxprov_params, sizeof(struct nxprov_params));
2119 cinfo->cinfo_ch_mode = ch_mode;
2120 cinfo->cinfo_ch_ring_id = chr->cr_ring_id;
2121 cinfo->cinfo_nx_port = chr->cr_port;
2122 cinfo->cinfo_mem_base = ch->ch_mmap.ami_mapaddr;
2123 cinfo->cinfo_mem_map_size = ch->ch_mmap.ami_mapsize;
2124 cinfo->cinfo_schema_offset = chr->cr_memoffset;
2125 cinfo->cinfo_num_bufs =
2126 skmem_arena_nexus(ch->ch_na->na_arena)->
2127 arn_rx_pp->pp_buf_region->skr_params.srp_c_obj_cnt;
2128 /*
2129 * ch_last holds the number of rings, but the client expects the
2130 * actual zero-based ring IDs; verify below that the counts are
2131 * non-zero and adjust last_{tx,rx}_ring accordingly.
2132 */
2133 ASSERT((ch->ch_last[NR_TX] > 0) ||
2134 (ch->ch_na->na_type == NA_NETIF_COMPAT_DEV));
2135 ASSERT((ch->ch_last[NR_RX] > 0) ||
2136 (ch->ch_na->na_type == NA_NETIF_COMPAT_HOST));
2137 cinfo->cinfo_first_tx_ring = ch->ch_first[NR_TX];
2138 cinfo->cinfo_last_tx_ring = ch->ch_last[NR_TX] - 1;
2139 cinfo->cinfo_first_rx_ring = ch->ch_first[NR_RX];
2140 cinfo->cinfo_last_rx_ring = ch->ch_last[NR_RX] - 1;
2141 cinfo->cinfo_tx_lowat = chr->cr_tx_lowat;
2142 cinfo->cinfo_rx_lowat = chr->cr_rx_lowat;
2143
2144 if (ch_mode & CHMODE_NO_NXREF) {
2145 ASSERT(ch_mode & CHMODE_KERNEL);
2146 STAILQ_INSERT_TAIL(&nx->nx_ch_nonxref_head, ch, ch_link);
2147 } else {
2148 STAILQ_INSERT_TAIL(&nx->nx_ch_head, ch, ch_link);
2149 nx->nx_ch_count++;
2150 }
2151 atomic_bitset_32(&ch->ch_flags, CHANF_ATTACHED);
2152 ch->ch_nexus = nx;
2153 nx_retain_locked(nx); /* hold a ref on the nexus */
2154
2155 ch_retain_locked(ch); /* one for being in the list */
2156 ch_retain_locked(ch); /* one for the caller */
2157
2158 /*
2159 * Now that we've successfully created the nexus adapter, inform the
2160 * nexus provider about the rings and the slots within each ring.
2161 * This is a no-op for internal nexus providers.
2162 */
2163 if ((*err = nxprov_advise_connect(nx, ch, p)) != 0) {
2164 lck_mtx_unlock(&ch->ch_lock);
2165
2166 /* gracefully close this fully-formed channel */
2167 if (ch->ch_flags & CHANF_KERNEL) {
2168 ch_close_special(ch);
2169 } else {
2170 ch_close(ch, TRUE);
2171 }
2172 (void) ch_release_locked(ch);
2173 ch = NULL;
2174 goto done;
2175 }
2176
2177 ASSERT(ch->ch_schema == NULL ||
2178 (ch->ch_schema->csm_flags & CSM_ACTIVE));
2179
2180 #if SK_LOG
2181 if (__improbable(sk_verbose != 0)) {
2182 ch_connect_log1(nx, cinfo, chr, ch, nxdom_prov, p);
2183 }
2184 #endif /* SK_LOG */
2185
2186 done:
2187 if (ch != NULL) {
2188 lck_mtx_unlock(&ch->ch_lock);
2189 }
2190 if (*err != 0) {
2191 #if SK_LOG
2192 if (__improbable(sk_verbose != 0)) {
2193 ch_connect_log2(nx, *err);
2194 }
2195 #endif /* SK_LOG */
2196 if (ch != NULL) {
2197 ch_free(ch);
2198 ch = NULL;
2199 }
2200 }
2201 return ch;
2202 }
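/*
 * On success, ch_connect() returns a channel holding two references (one
 * for its linkage on the nexus channel list, one for the caller) plus a
 * reference on the nexus itself; the caller is expected to drop its
 * reference via ch_release_locked() after ch_close()/ch_close_special(),
 * as done in ch_dtor() below.
 */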
2203
2204 static void
2205 ch_disconnect(struct kern_channel *ch)
2206 {
2207 struct kern_nexus *nx = ch->ch_nexus;
2208 struct kern_nexus_domain_provider *nxdom_prov = NX_DOM_PROV(nx);
2209
2210 SK_LOCK_ASSERT_HELD();
2211 LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
2212
2213 /*
2214 * Inform the nexus provider that the channel has been quiesced
2215 * and disconnected from the nexus port. This is a no-op for
2216 * internal nexus providers.
2217 */
2218 nxprov_advise_disconnect(nx, ch);
2219
2220 /* Finally, let the domain provider tear down the instance */
2221 nxdom_prov->nxdom_prov_dom->nxdom_disconnect(nxdom_prov, nx, ch);
2222 }
2223
2224 void
2225 ch_deactivate(struct kern_channel *ch)
2226 {
2227 /*
2228 * This is a trapdoor flag; once CSM_ACTIVE is cleared,
2229 * it will never be set again. Doing this will cause
2230 * os_channel_is_defunct() to indicate that the channel
2231 * is defunct and is no longer usable (thus should be
2232 * immediately closed).
2233 */
2234 if (ch->ch_schema != NULL &&
2235 (ch->ch_schema->csm_flags & CSM_ACTIVE)) {
2236 atomic_bitclear_32(__DECONST(uint32_t *,
2237 &ch->ch_schema->csm_flags), CSM_ACTIVE);
2238 /* make this globally visible */
2239 membar_sync();
2240 }
2241 }
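/*
 * Conceptually, the user-space side observes the effect of ch_deactivate()
 * by re-reading the flags in the shared __user_channel_schema; a hypothetical
 * sketch of what os_channel_is_defunct() amounts to (csm being the mapped
 * schema pointer):
 *
 *	if ((csm->csm_flags & CSM_ACTIVE) == 0) {
 *		// channel is defunct; the client should close it
 *	}
 */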
2242
2243 int
2244 ch_set_opt(struct kern_channel *ch, struct sockopt *sopt)
2245 {
2246 #pragma unused(ch)
2247 int err = 0;
2248
2249 if (sopt->sopt_dir != SOPT_SET) {
2250 sopt->sopt_dir = SOPT_SET;
2251 }
2252
2253 switch (sopt->sopt_name) {
2254 case CHOPT_TX_LOWAT_THRESH:
2255 err = ch_set_lowat_thresh(ch, NR_TX, sopt);
2256 break;
2257
2258 case CHOPT_RX_LOWAT_THRESH:
2259 err = ch_set_lowat_thresh(ch, NR_RX, sopt);
2260 break;
2261
2262 case CHOPT_IF_ADV_CONF:
2263 err = ch_configure_interface_advisory_event(ch, sopt);
2264 break;
2265
2266 default:
2267 err = ENOPROTOOPT;
2268 break;
2269 }
2270
2271 return err;
2272 }
2273
2274 int
2275 ch_get_opt(struct kern_channel *ch, struct sockopt *sopt)
2276 {
2277 #pragma unused(ch)
2278 int err = 0;
2279
2280 if (sopt->sopt_dir != SOPT_GET) {
2281 sopt->sopt_dir = SOPT_GET;
2282 }
2283
2284 switch (sopt->sopt_name) {
2285 case CHOPT_TX_LOWAT_THRESH:
2286 err = ch_get_lowat_thresh(ch, NR_TX, sopt);
2287 break;
2288
2289 case CHOPT_RX_LOWAT_THRESH:
2290 err = ch_get_lowat_thresh(ch, NR_RX, sopt);
2291 break;
2292
2293 default:
2294 err = ENOPROTOOPT;
2295 break;
2296 }
2297
2298 return err;
2299 }
2300
2301 static int
2302 ch_configure_interface_advisory_event(struct kern_channel *ch,
2303 struct sockopt *sopt)
2304 {
2305 int err = 0;
2306 boolean_t enable = 0;
2307 struct kern_nexus *nx = ch->ch_nexus;
2308
2309 LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
2310 SK_LOCK_ASSERT_NOTHELD();
2311
2312 if (sopt->sopt_val == USER_ADDR_NULL) {
2313 return EINVAL;
2314 }
2315 if (nx->nx_adv.nxv_adv == NULL) {
2316 return ENOTSUP;
2317 }
2318 err = sooptcopyin(sopt, &enable, sizeof(enable), sizeof(enable));
2319 if (err != 0) {
2320 return err;
2321 }
2322
2323 /*
2324 * Drop ch_lock to acquire sk_lock and nx_ch_if_adv_lock due to lock
2325 * ordering requirement; check if the channel is closing once ch_lock
2326 * is reacquired and bail if so.
2327 */
2328 lck_mtx_unlock(&ch->ch_lock);
2329 SK_LOCK();
2330 lck_rw_lock_exclusive(&nx->nx_ch_if_adv_lock);
2331 lck_mtx_lock(&ch->ch_lock);
2332 if (ch->ch_flags & CHANF_CLOSING) {
2333 err = ENXIO;
2334 goto done;
2335 }
2336
2337 /*
2338 * If interface advisory reporting is being enabled on the channel,
2339 * add it to the nexus's list of channels eligible for interface
2340 * advisory updates; if it is being disabled, remove it from that list.
2341 */
2342 if (enable) {
2343 if ((ch->ch_flags & CHANF_IF_ADV) != 0) {
2344 ASSERT(err == 0);
2345 goto done;
2346 }
2347 bool enable_adv = STAILQ_EMPTY(&nx->nx_ch_if_adv_head);
2348 atomic_bitset_32(&ch->ch_flags, CHANF_IF_ADV);
2349 STAILQ_INSERT_TAIL(&nx->nx_ch_if_adv_head, ch, ch_link_if_adv);
2350 if (enable_adv) {
2351 nx_netif_config_interface_advisory(nx, true);
2352 }
2353 ch_retain_locked(ch); /* for being in the IF ADV list */
2354 } else {
2355 if ((ch->ch_flags & CHANF_IF_ADV) == 0) {
2356 ASSERT(err == 0);
2357 goto done;
2358 }
2359 STAILQ_REMOVE(&nx->nx_ch_if_adv_head, ch, kern_channel,
2360 ch_link_if_adv);
2361 atomic_bitclear_32(&ch->ch_flags, CHANF_IF_ADV);
2362 if (STAILQ_EMPTY(&nx->nx_ch_if_adv_head)) {
2363 nx_netif_config_interface_advisory(nx, false);
2364 }
2365 (void) ch_release_locked(ch);
2366 }
2367
2368 done:
2369 lck_mtx_unlock(&ch->ch_lock);
2370 lck_rw_done(&nx->nx_ch_if_adv_lock);
2371 SK_UNLOCK();
2372 lck_mtx_lock(&ch->ch_lock);
2373
2374 return err;
2375 }
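/*
 * Lock ordering observed above: sk_lock -> nx_ch_if_adv_lock -> ch_lock.
 * ch_close_common() takes the locks in the same order, which is why this
 * routine must drop ch_lock before acquiring sk_lock and re-check
 * CHANF_CLOSING once ch_lock is reacquired.
 */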
2376
2377 static int
2378 ch_set_lowat_thresh(struct kern_channel *ch, enum txrx t,
2379 struct sockopt *sopt)
2380 {
2381 struct ch_ev_thresh cet, *ocet;
2382 int err = 0;
2383
2384 LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
2385
2386 if (sopt->sopt_val == USER_ADDR_NULL) {
2387 return EINVAL;
2388 }
2389
2390 bzero(&cet, sizeof(cet));
2391 err = sooptcopyin(sopt, &cet, sizeof(cet), sizeof(cet));
2392 if (err == 0) {
2393 err = ch_ev_thresh_validate(ch->ch_nexus, t, &cet);
2394 if (err == 0) {
2395 if (t == NR_TX) {
2396 ocet = &ch->ch_info->cinfo_tx_lowat;
2397 } else {
2398 ocet = &ch->ch_info->cinfo_rx_lowat;
2399 }
2400
2401 /* if there is no change, we're done */
2402 if (ocet->cet_unit == cet.cet_unit &&
2403 ocet->cet_value == cet.cet_value) {
2404 return 0;
2405 }
2406
2407 *ocet = cet;
2408
2409 for_rx_tx(t) {
2410 ring_id_t qfirst = ch->ch_first[t];
2411 ring_id_t qlast = ch->ch_last[t];
2412 uint32_t i;
2413
2414 for (i = qfirst; i < qlast; i++) {
2415 struct __kern_channel_ring *kring =
2416 &NAKR(ch->ch_na, t)[i];
2417
2418 (void) kring->ckr_na_notify(kring,
2419 sopt->sopt_p, 0);
2420 }
2421 }
2422
2423 (void) sooptcopyout(sopt, &cet, sizeof(cet));
2424 }
2425 }
2426
2427 return err;
2428 }
2429
2430 static int
2431 ch_get_lowat_thresh(struct kern_channel *ch, enum txrx t,
2432 struct sockopt *sopt)
2433 {
2434 struct ch_ev_thresh cet;
2435
2436 LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
2437
2438 if (sopt->sopt_val == USER_ADDR_NULL) {
2439 return EINVAL;
2440 }
2441
2442 if (t == NR_TX) {
2443 cet = ch->ch_info->cinfo_tx_lowat;
2444 } else {
2445 cet = ch->ch_info->cinfo_rx_lowat;
2446 }
2447
2448 return sooptcopyout(sopt, &cet, sizeof(cet));
2449 }
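/*
 * Example (illustrative): the payload exchanged via CHOPT_TX_LOWAT_THRESH /
 * CHOPT_RX_LOWAT_THRESH in both directions is a struct ch_ev_thresh; e.g.
 * a client expressing a TX low watermark of 32 slots would pass:
 *
 *	struct ch_ev_thresh cet = {
 *		.cet_unit = CHANNEL_THRESHOLD_UNIT_SLOTS,
 *		.cet_value = 32,
 *	};
 *
 * ch_set_lowat_thresh() validates and clamps the request, stores it in
 * ch_info, kicks the rings' ckr_na_notify callbacks, and copies the
 * (possibly adjusted) value back out to the caller.
 */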
2450
2451 static struct kern_channel *
2452 ch_alloc(zalloc_flags_t how)
2453 {
2454 struct kern_channel *ch;
2455
2456 ch = zalloc_flags(ch_zone, how | Z_ZERO);
2457 if (ch) {
2458 lck_mtx_init(&ch->ch_lock, &channel_lock_group, &channel_lock_attr);
2459 ch->ch_info = zalloc_flags(ch_info_zone, how | Z_ZERO);
2460 }
2461 return ch;
2462 }
2463
2464 static void
2465 ch_free(struct kern_channel *ch)
2466 {
2467 ASSERT(ch->ch_refcnt == 0);
2468 ASSERT(ch->ch_pp == NULL);
2469 ASSERT(!(ch->ch_flags & (CHANF_ATTACHED | CHANF_EXT_CONNECTED |
2470 CHANF_EXT_PRECONNECT | CHANF_IF_ADV)));
2471 lck_mtx_destroy(&ch->ch_lock, &channel_lock_group);
2472 SK_DF(SK_VERB_MEM, "ch 0x%llx FREE", SK_KVA(ch));
2473 ASSERT(ch->ch_info != NULL);
2474 zfree(ch_info_zone, ch->ch_info);
2475 ch->ch_info = NULL;
2476 zfree(ch_zone, ch);
2477 }
2478
2479 void
2480 ch_retain_locked(struct kern_channel *ch)
2481 {
2482 SK_LOCK_ASSERT_HELD();
2483
2484 ch->ch_refcnt++;
2485 VERIFY(ch->ch_refcnt != 0);
2486 }
2487
2488 void
2489 ch_retain(struct kern_channel *ch)
2490 {
2491 SK_LOCK();
2492 ch_retain_locked(ch);
2493 SK_UNLOCK();
2494 }
2495
2496 int
2497 ch_release_locked(struct kern_channel *ch)
2498 {
2499 int oldref = ch->ch_refcnt;
2500
2501 SK_LOCK_ASSERT_HELD();
2502
2503 VERIFY(ch->ch_refcnt != 0);
2504 if (--ch->ch_refcnt == 0) {
2505 ch_free(ch);
2506 }
2507
2508 return oldref == 1;
2509 }
2510
2511 int
2512 ch_release(struct kern_channel *ch)
2513 {
2514 int lastref;
2515
2516 SK_LOCK();
2517 lastref = ch_release_locked(ch);
2518 SK_UNLOCK();
2519
2520 return lastref;
2521 }
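/*
 * Example (illustrative): callers that already hold sk_lock pair
 * ch_retain_locked() with ch_release_locked(); others use the unlocked
 * wrappers, which take sk_lock internally:
 *
 *	ch_retain(ch);
 *	// ... use ch ...
 *	if (ch_release(ch)) {
 *		// that was the last reference; ch has been freed
 *	}
 */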
2522
2523 void
2524 ch_dtor(void *arg)
2525 {
2526 struct kern_channel *ch = arg;
2527
2528 SK_LOCK();
2529 ch_close(ch, TRUE);
2530 (void) ch_release_locked(ch);
2531 SK_UNLOCK();
2532 }
2533