1 /*
2 * Copyright (c) 2000-2021 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 *
28 */
29 /*-
30 * Copyright (c) 1999,2000,2001 Jonathan Lemon <[email protected]>
31 * All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 *
42 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
43 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
46 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52 * SUCH DAMAGE.
53 */
54 /*
55 * @(#)kern_event.c 1.0 (3/31/2000)
56 */
57 #include <stdint.h>
58 #include <machine/atomic.h>
59
60 #include <sys/param.h>
61 #include <sys/systm.h>
62 #include <sys/filedesc.h>
63 #include <sys/kernel.h>
64 #include <sys/proc_internal.h>
65 #include <sys/kauth.h>
66 #include <sys/malloc.h>
67 #include <sys/unistd.h>
68 #include <sys/file_internal.h>
69 #include <sys/fcntl.h>
70 #include <sys/select.h>
71 #include <sys/queue.h>
72 #include <sys/event.h>
73 #include <sys/eventvar.h>
74 #include <sys/protosw.h>
75 #include <sys/socket.h>
76 #include <sys/socketvar.h>
77 #include <sys/stat.h>
78 #include <sys/syscall.h> // SYS_* constants
79 #include <sys/sysctl.h>
80 #include <sys/uio.h>
81 #include <sys/sysproto.h>
82 #include <sys/user.h>
83 #include <sys/vnode_internal.h>
84 #include <string.h>
85 #include <sys/proc_info.h>
86 #include <sys/codesign.h>
87 #include <sys/pthread_shims.h>
88 #include <sys/kdebug.h>
89 #include <os/base.h>
90 #include <pexpert/pexpert.h>
91
92 #include <kern/thread_group.h>
93 #include <kern/locks.h>
94 #include <kern/clock.h>
95 #include <kern/cpu_data.h>
96 #include <kern/policy_internal.h>
97 #include <kern/thread_call.h>
98 #include <kern/sched_prim.h>
99 #include <kern/waitq.h>
100 #include <kern/zalloc.h>
101 #include <kern/kalloc.h>
102 #include <kern/assert.h>
103 #include <kern/ast.h>
104 #include <kern/thread.h>
105 #include <kern/kcdata.h>
106
107 #include <pthread/priority_private.h>
108 #include <pthread/workqueue_syscalls.h>
109 #include <pthread/workqueue_internal.h>
110 #include <libkern/libkern.h>
111
112 #include <os/log.h>
113
114 #include "net/net_str_id.h"
115
116 #if SKYWALK && defined(XNU_TARGET_OS_OSX)
117 #include <skywalk/lib/net_filter_event.h>
118
119 extern bool net_check_compatible_alf(void);
120 #endif /* SKYWALK && XNU_TARGET_OS_OSX */
121
122 #include <mach/task.h>
123 #include <libkern/section_keywords.h>
124
125 #if CONFIG_MEMORYSTATUS
126 #include <sys/kern_memorystatus.h>
127 #endif
128
129 #if DEVELOPMENT || DEBUG
130 #define KEVENT_PANIC_ON_WORKLOOP_OWNERSHIP_LEAK (1U << 0)
131 #define KEVENT_PANIC_ON_NON_ENQUEUED_PROCESS (1U << 1)
132 TUNABLE(uint32_t, kevent_debug_flags, "kevent_debug", 0);
133 #endif
134
/* Lock group for every kqueue-related lock (kq_lock, knhash, ...). */
static LCK_GRP_DECLARE(kq_lck_grp, "kqueue");
/* Pointer-packing parameters used to store a kqueue pointer inside a knote. */
SECURITY_READ_ONLY_EARLY(vm_packing_params_t) kn_kq_packing_params =
    VM_PACKING_PARAMS(KNOTE_KQ_PACKED);

extern mach_port_name_t ipc_entry_name_mask(mach_port_name_t name); /* osfmk/ipc/ipc_entry.h */
extern int cansignal(struct proc *, kauth_cred_t, struct proc *, int); /* bsd/kern/kern_sig.c */

/* kdebug tracepoint id for kevent subsystem events */
#define KEV_EVTID(code) BSDDBG_CODE(DBG_BSD_KEVENT, (code))

/* fileops entry points for kqueue file descriptors (defined below) */
static int kqueue_select(struct fileproc *fp, int which, void *wq_link_id,
    vfs_context_t ctx);
static int kqueue_close(struct fileglob *fg, vfs_context_t ctx);
static int kqueue_kqfilter(struct fileproc *fp, struct knote *kn,
    struct kevent_qos_s *kev);
static int kqueue_drain(struct fileproc *fp, vfs_context_t ctx);

/* fileops vector installed on kqueue file descriptors; read/write/ioctl
 * are explicitly unsupported. */
static const struct fileops kqueueops = {
	.fo_type = DTYPE_KQUEUE,
	.fo_read = fo_no_read,
	.fo_write = fo_no_write,
	.fo_ioctl = fo_no_ioctl,
	.fo_select = kqueue_select,
	.fo_close = kqueue_close,
	.fo_drain = kqueue_drain,
	.fo_kqfilter = kqueue_kqfilter,
};
161
/* kevent copyout and registration-wait helpers (defined later in this file) */
static inline int kevent_modern_copyout(struct kevent_qos_s *, user_addr_t *);
static int kevent_register_wait_prepare(struct knote *kn, struct kevent_qos_s *kev, int result);
static void kevent_register_wait_block(struct turnstile *ts, thread_t handoff_thread,
    thread_continue_t cont, struct _kevent_register *cont_args) __dead2;
static void kevent_register_wait_return(struct _kevent_register *cont_args) __dead2;
static void kevent_register_wait_cleanup(struct knote *kn);

static struct kqtailq *kqueue_get_suppressed_queue(kqueue_t kq, struct knote *kn);
static void kqueue_threadreq_initiate(struct kqueue *kq, workq_threadreq_t, kq_index_t qos, int flags);

/* kqworkq (per-process workq kqueue) helpers */
static void kqworkq_unbind(proc_t p, workq_threadreq_t);
static thread_qos_t kqworkq_unbind_locked(struct kqworkq *kqwq, workq_threadreq_t, thread_t thread);
static workq_threadreq_t kqworkq_get_request(struct kqworkq *kqwq, kq_index_t qos_index);
static void kqueue_update_iotier_override(kqueue_t kqu);

static void kqworkloop_unbind(struct kqworkloop *kwql);

/* How the QoS override should be dropped when unbinding a workloop thread. */
enum kqwl_unbind_locked_mode {
	KQWL_OVERRIDE_DROP_IMMEDIATELY,
	KQWL_OVERRIDE_DROP_DELAYED,
};
static void kqworkloop_unbind_locked(struct kqworkloop *kwql, thread_t thread,
    enum kqwl_unbind_locked_mode how);
static void kqworkloop_unbind_delayed_override_drop(thread_t thread);
static kq_index_t kqworkloop_override(struct kqworkloop *kqwl);
static void kqworkloop_set_overcommit(struct kqworkloop *kqwl);
/* Operation codes for kqworkloop_update_threads_qos(). */
enum {
	KQWL_UTQ_NONE,
	/*
	 * The wakeup qos is the qos of QUEUED knotes.
	 *
	 * This QoS is accounted for with the events override in the
	 * kqr_override_index field. It is raised each time a new knote is queued at
	 * a given QoS. The kqwl_wakeup_qos field is a superset of the non empty
	 * knote buckets and is recomputed after each event delivery.
	 */
	KQWL_UTQ_UPDATE_WAKEUP_QOS,
	KQWL_UTQ_RECOMPUTE_WAKEUP_QOS,
	KQWL_UTQ_UNBINDING, /* attempt to rebind */
	KQWL_UTQ_PARKING,
	/*
	 * The wakeup override is for suppressed knotes that have fired again at
	 * a higher QoS than the one for which they are suppressed already.
	 * This override is cleared when the knote suppressed list becomes empty.
	 */
	KQWL_UTQ_UPDATE_WAKEUP_OVERRIDE,
	KQWL_UTQ_RESET_WAKEUP_OVERRIDE,
	/*
	 * The QoS is the maximum QoS of an event enqueued on this workloop in
	 * userland. It is copied from the only EVFILT_WORKLOOP knote with
	 * a NOTE_WL_THREAD_REQUEST bit set allowed on this workloop. If there is no
	 * such knote, this QoS is 0.
	 */
	KQWL_UTQ_SET_QOS_INDEX,
	KQWL_UTQ_REDRIVE_EVENTS,
};
static void kqworkloop_update_threads_qos(struct kqworkloop *kqwl, int op, kq_index_t qos);
static int kqworkloop_end_processing(struct kqworkloop *kqwl, int flags, int kevent_flags);
220
/* knote allocation and kqueue-association helpers (defined below) */
static struct knote *knote_alloc(void);
static void knote_free(struct knote *kn);
static int kq_add_knote(struct kqueue *kq, struct knote *kn,
    struct knote_lock_ctx *knlc, struct proc *p);
static struct knote *kq_find_knote_and_kq_lock(struct kqueue *kq,
    struct kevent_qos_s *kev, bool is_fd, struct proc *p);

static void knote_activate(kqueue_t kqu, struct knote *kn, int result);
static void knote_dequeue(kqueue_t kqu, struct knote *kn);

static void knote_apply_touch(kqueue_t kqu, struct knote *kn,
    struct kevent_qos_s *kev, int result);
static void knote_suppress(kqueue_t kqu, struct knote *kn);
static void knote_unsuppress(kqueue_t kqu, struct knote *kn);
static void knote_drop(kqueue_t kqu, struct knote *kn, struct knote_lock_ctx *knlc);

// both these functions may dequeue the knote and it is up to the caller
// to enqueue the knote back
static void knote_adjust_qos(struct kqueue *kq, struct knote *kn, int result);
static void knote_reset_priority(kqueue_t kqu, struct knote *kn, pthread_priority_t pp);

/* Zones backing the four kqueue-related object types. */
static ZONE_DEFINE(knote_zone, "knote zone",
    sizeof(struct knote), ZC_CACHING | ZC_ZFREE_CLEARMEM);
static ZONE_DEFINE(kqfile_zone, "kqueue file zone",
    sizeof(struct kqfile), ZC_ZFREE_CLEARMEM | ZC_NOTBITAG);
static ZONE_DEFINE(kqworkq_zone, "kqueue workq zone",
    sizeof(struct kqworkq), ZC_ZFREE_CLEARMEM | ZC_NOTBITAG);
static ZONE_DEFINE(kqworkloop_zone, "kqueue workloop zone",
    sizeof(struct kqworkloop), ZC_CACHING | ZC_ZFREE_CLEARMEM | ZC_NOTBITAG);

/* Simple hash for the knote hash tables: fold the top byte in, then mask. */
#define KN_HASH(val, mask) (((val) ^ (val >> 8)) & (mask))

/* Hooks used by bad_filtops for invalid/unsupported filters (defined below). */
static int filt_no_attach(struct knote *kn, struct kevent_qos_s *kev);
static void filt_no_detach(struct knote *kn);
static int filt_bad_event(struct knote *kn, long hint);
static int filt_bad_touch(struct knote *kn, struct kevent_qos_s *kev);
static int filt_bad_process(struct knote *kn, struct kevent_qos_s *kev);
258
/*
 * Filter ops installed in sysfilt_ops slots that are invalid or compiled out;
 * attach fails cleanly and the other hooks panic if ever reached.
 */
SECURITY_READ_ONLY_EARLY(static struct filterops) bad_filtops = {
	.f_attach = filt_no_attach,
	.f_detach = filt_no_detach,
	.f_event = filt_bad_event,
	.f_touch = filt_bad_touch,
	.f_process = filt_bad_process,
};
266
/* Filter implementations provided by other subsystems. */
#if CONFIG_MEMORYSTATUS
extern const struct filterops memorystatus_filtops;
#endif /* CONFIG_MEMORYSTATUS */
extern const struct filterops fs_filtops;
extern const struct filterops sig_filtops;
extern const struct filterops machport_filtops;
extern const struct filterops pipe_nfiltops;
extern const struct filterops pipe_rfiltops;
extern const struct filterops pipe_wfiltops;
extern const struct filterops ptsd_kqops;
extern const struct filterops ptmx_kqops;
extern const struct filterops soread_filtops;
extern const struct filterops sowrite_filtops;
extern const struct filterops sock_filtops;
extern const struct filterops soexcept_filtops;
extern const struct filterops spec_filtops;
extern const struct filterops bpfread_filtops;
extern const struct filterops necp_fd_rfiltops;
#if SKYWALK
extern const struct filterops skywalk_channel_rfiltops;
extern const struct filterops skywalk_channel_wfiltops;
extern const struct filterops skywalk_channel_efiltops;
#endif /* SKYWALK */
extern const struct filterops fsevent_filtops;
extern const struct filterops vnode_filtops;
extern const struct filterops tty_filtops;

/* Filter implementations defined later in this file. */
const static struct filterops file_filtops;
const static struct filterops kqread_filtops;
const static struct filterops proc_filtops;
const static struct filterops timer_filtops;
const static struct filterops user_filtops;
const static struct filterops workloop_filtops;
300
301 /*
302 *
303 * Rules for adding new filters to the system:
304 * Public filters:
305 * - Add a new "EVFILT_" option value to bsd/sys/event.h (typically a negative value)
306 * in the exported section of the header
307 * - Update the EVFILT_SYSCOUNT value to reflect the new addition
308 * - Add a filterops to the sysfilt_ops array. Public filters should be added at the end
309 * of the Public Filters section in the array.
310 * Private filters:
311 * - Add a new "EVFILT_" value to bsd/sys/event.h (typically a positive value)
312 * in the XNU_KERNEL_PRIVATE section of the header
313 * - Update the EVFILTID_MAX value to reflect the new addition
314 * - Add a filterops to the sysfilt_ops. Private filters should be added at the end of
315 * the Private filters section of the array.
316 */
static_assert(EVFILTID_MAX < UINT8_MAX, "kn_filtid expects this to be true");
/*
 * Filter dispatch table, indexed by filter id (kn_filtid).
 * Public EVFILT_* values are negative, so `~EVFILT_x` maps them onto small
 * non-negative indices; private filter ids are used directly.
 */
static const struct filterops * const sysfilt_ops[EVFILTID_MAX] = {
	/* Public Filters */
	[~EVFILT_READ] = &file_filtops,
	[~EVFILT_WRITE] = &file_filtops,
	[~EVFILT_AIO] = &bad_filtops,
	[~EVFILT_VNODE] = &file_filtops,
	[~EVFILT_PROC] = &proc_filtops,
	[~EVFILT_SIGNAL] = &sig_filtops,
	[~EVFILT_TIMER] = &timer_filtops,
	[~EVFILT_MACHPORT] = &machport_filtops,
	[~EVFILT_FS] = &fs_filtops,
	[~EVFILT_USER] = &user_filtops,
	[~EVFILT_UNUSED_11] = &bad_filtops,
	[~EVFILT_VM] = &bad_filtops,
	[~EVFILT_SOCK] = &file_filtops,
#if CONFIG_MEMORYSTATUS
	[~EVFILT_MEMORYSTATUS] = &memorystatus_filtops,
#else
	[~EVFILT_MEMORYSTATUS] = &bad_filtops,
#endif
	[~EVFILT_EXCEPT] = &file_filtops,
#if SKYWALK
	[~EVFILT_NW_CHANNEL] = &file_filtops,
#else /* !SKYWALK */
	[~EVFILT_NW_CHANNEL] = &bad_filtops,
#endif /* !SKYWALK */
	[~EVFILT_WORKLOOP] = &workloop_filtops,

	/* Private filters */
	[EVFILTID_KQREAD] = &kqread_filtops,
	[EVFILTID_PIPE_N] = &pipe_nfiltops,
	[EVFILTID_PIPE_R] = &pipe_rfiltops,
	[EVFILTID_PIPE_W] = &pipe_wfiltops,
	[EVFILTID_PTSD] = &ptsd_kqops,
	[EVFILTID_SOREAD] = &soread_filtops,
	[EVFILTID_SOWRITE] = &sowrite_filtops,
	[EVFILTID_SCK] = &sock_filtops,
	[EVFILTID_SOEXCEPT] = &soexcept_filtops,
	[EVFILTID_SPEC] = &spec_filtops,
	[EVFILTID_BPFREAD] = &bpfread_filtops,
	[EVFILTID_NECP_FD] = &necp_fd_rfiltops,
#if SKYWALK
	[EVFILTID_SKYWALK_CHANNEL_W] = &skywalk_channel_wfiltops,
	[EVFILTID_SKYWALK_CHANNEL_R] = &skywalk_channel_rfiltops,
	[EVFILTID_SKYWALK_CHANNEL_E] = &skywalk_channel_efiltops,
#else /* !SKYWALK */
	[EVFILTID_SKYWALK_CHANNEL_W] = &bad_filtops,
	[EVFILTID_SKYWALK_CHANNEL_R] = &bad_filtops,
	[EVFILTID_SKYWALK_CHANNEL_E] = &bad_filtops,
#endif /* !SKYWALK */
	[EVFILTID_FSEVENT] = &fsevent_filtops,
	[EVFILTID_VN] = &vnode_filtops,
	[EVFILTID_TTY] = &tty_filtops,
	[EVFILTID_PTMX] = &ptmx_kqops,

	/* fake filter for detached knotes, keep last */
	[EVFILTID_DETACHED] = &bad_filtops,
};
376
/* Whether a servicer thread is currently bound to this thread request. */
static inline bool
kqr_thread_bound(workq_threadreq_t kqr)
{
	return kqr->tr_state == WORKQ_TR_STATE_BOUND;
}
382
/*
 * Whether a thread request is outstanding but no thread is bound yet
 * (state strictly between IDLE and BOUND).
 */
static inline bool
kqr_thread_requested_pending(workq_threadreq_t kqr)
{
	/* read tr_state once so both comparisons see the same snapshot */
	workq_tr_state_t tr_state = kqr->tr_state;
	return tr_state > WORKQ_TR_STATE_IDLE && tr_state < WORKQ_TR_STATE_BOUND;
}
389
/* Whether a thread request is outstanding at all (pending or bound). */
static inline bool
kqr_thread_requested(workq_threadreq_t kqr)
{
	return kqr->tr_state != WORKQ_TR_STATE_IDLE;
}
395
/*
 * Return the bound servicer thread.
 * Caller must already know the request is in the BOUND state.
 */
static inline thread_t
kqr_thread_fast(workq_threadreq_t kqr)
{
	assert(kqr_thread_bound(kqr));
	return kqr->tr_thread;
}
402
403 static inline thread_t
kqr_thread(workq_threadreq_t kqr)404 kqr_thread(workq_threadreq_t kqr)
405 {
406 return kqr_thread_bound(kqr) ? kqr->tr_thread : THREAD_NULL;
407 }
408
409 static inline struct kqworkloop *
kqr_kqworkloop(workq_threadreq_t kqr)410 kqr_kqworkloop(workq_threadreq_t kqr)
411 {
412 if (kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
413 return __container_of(kqr, struct kqworkloop, kqwl_request);
414 }
415 return NULL;
416 }
417
418 static inline kqueue_t
kqr_kqueue(proc_t p,workq_threadreq_t kqr)419 kqr_kqueue(proc_t p, workq_threadreq_t kqr)
420 {
421 kqueue_t kqu;
422 if (kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
423 kqu.kqwl = kqr_kqworkloop(kqr);
424 } else {
425 kqu.kqwq = p->p_fd.fd_wqkqueue;
426 assert(kqr >= kqu.kqwq->kqwq_request &&
427 kqr < kqu.kqwq->kqwq_request + KQWQ_NBUCKETS);
428 }
429 return kqu;
430 }
431
#if CONFIG_PREADOPT_TG
/* There are no guarantees about which locks are held when this is called */
inline thread_group_qos_t
kqr_preadopt_thread_group(workq_threadreq_t req)
{
	struct kqworkloop *kqwl = kqr_kqworkloop(req);
	/* relaxed load: callers only want a point-in-time snapshot */
	return kqwl ? os_atomic_load(&kqwl->kqwl_preadopt_tg, relaxed) : NULL;
}

/* There are no guarantees about which locks are held when this is called */
inline _Atomic(thread_group_qos_t) *
kqr_preadopt_thread_group_addr(workq_threadreq_t req)
{
	/* returns the address of the atomic slot itself (workloops only) */
	struct kqworkloop *kqwl = kqr_kqworkloop(req);
	return kqwl ? (&kqwl->kqwl_preadopt_tg) : NULL;
}
#endif
449
450 /*
451 * kqueue/note lock implementations
452 *
453 * The kqueue lock guards the kq state, the state of its queues,
454 * and the kqueue-aware status and locks of individual knotes.
455 *
456 * The kqueue workq lock is used to protect state guarding the
457 * interaction of the kqueue with the workq. This state cannot
458 * be guarded by the kq lock - as it needs to be taken when we
459 * already have the waitq set lock held (during the waitq hook
460 * callback). It might be better to use the waitq lock itself
461 * for this, but the IRQ requirements make that difficult).
462 *
463 * Knote flags, filter flags, and associated data are protected
464 * by the underlying object lock - and are only ever looked at
465 * by calling the filter to get a [consistent] snapshot of that
466 * data.
467 */
468
/* Take the kqueue spin lock. */
static inline void
kqlock(kqueue_t kqu)
{
	lck_spin_lock(&kqu.kq->kq_lock);
}

/* Assert (on assert-enabled kernels) that the kq lock is held. */
static inline void
kqlock_held(__assert_only kqueue_t kqu)
{
	LCK_SPIN_ASSERT(&kqu.kq->kq_lock, LCK_ASSERT_OWNED);
}

/* Release the kqueue spin lock. */
static inline void
kqunlock(kqueue_t kqu)
{
	lck_spin_unlock(&kqu.kq->kq_lock);
}

/* Lock the per-filedesc hash of non-fd knotes. */
static inline void
knhash_lock(struct filedesc *fdp)
{
	lck_mtx_lock(&fdp->fd_knhashlock);
}

/* Unlock the per-filedesc hash of non-fd knotes. */
static inline void
knhash_unlock(struct filedesc *fdp)
{
	lck_mtx_unlock(&fdp->fd_knhashlock);
}
498
/* wait event for knote locks */
static inline event_t
knote_lock_wev(struct knote *kn)
{
	/* &kn->kn_hook is unique per knote, making a convenient wait channel */
	return (event_t)(&kn->kn_hook);
}

/* wait event for kevent_register_wait_* */
static inline event64_t
knote_filt_wev64(struct knote *kn)
{
	/* kdp_workloop_sync_wait_find_owner knows about this */
	return CAST_EVENT64_T(kn);
}

/* wait event for knote_post/knote_drop */
static inline event_t
knote_post_wev(struct knote *kn)
{
	/* distinct from knote_lock_wev(): keyed on the embedded kevent */
	return &kn->kn_kevent;
}
520
521 /*!
522 * @function knote_has_qos
523 *
524 * @brief
525 * Whether the knote has a regular QoS.
526 *
527 * @discussion
528 * kn_qos_override is:
529 * - 0 on kqfiles
530 * - THREAD_QOS_LAST for special buckets (manager)
531 *
532 * Other values mean the knote participates to QoS propagation.
533 */
static inline bool
knote_has_qos(struct knote *kn)
{
	/* 0 (kqfiles) and THREAD_QOS_LAST (manager bucket) are sentinel values */
	return kn->kn_qos_override > 0 && kn->kn_qos_override < THREAD_QOS_LAST;
}
539
540 #pragma mark knote locks
541
542 /*
543 * Enum used by the knote_lock_* functions.
544 *
545 * KNOTE_KQ_LOCK_ALWAYS
546 * The function will always return with the kq lock held.
547 *
548 * KNOTE_KQ_LOCK_ON_SUCCESS
549 * The function will return with the kq lock held if it was successful
550 * (knote_lock() is the only function that can fail).
551 *
552 * KNOTE_KQ_LOCK_ON_FAILURE
553 * The function will return with the kq lock held if it was unsuccessful
554 * (knote_lock() is the only function that can fail).
555 *
556 * KNOTE_KQ_UNLOCK:
557 * The function returns with the kq unlocked.
558 */
enum kqlocking {
	KNOTE_KQ_LOCK_ALWAYS,     /* return with the kq lock held in all cases */
	KNOTE_KQ_LOCK_ON_SUCCESS, /* kq lock held only on success */
	KNOTE_KQ_LOCK_ON_FAILURE, /* kq lock held only on failure */
	KNOTE_KQ_UNLOCK,          /* always return with the kq unlocked */
};
565
566 static struct knote_lock_ctx *
knote_lock_ctx_find(kqueue_t kqu,struct knote * kn)567 knote_lock_ctx_find(kqueue_t kqu, struct knote *kn)
568 {
569 struct knote_lock_ctx *ctx;
570 LIST_FOREACH(ctx, &kqu.kq->kq_knlocks, knlc_link) {
571 if (ctx->knlc_knote == kn) {
572 return ctx;
573 }
574 }
575 panic("knote lock context not found: %p", kn);
576 __builtin_trap();
577 }
578
/*
 * Slowpath of knote_lock(): the knote lock is currently owned by another
 * thread, so register as a waiter and sleep until ownership is handed off
 * (returns true) or the lock is canceled by knote_unlock_cancel()
 * (returns false).  The kq lock state on return follows `kqlocking`.
 */
__attribute__((noinline))
static bool __result_use_check
knote_lock_slow(kqueue_t kqu, struct knote *kn,
    struct knote_lock_ctx *knlc, int kqlocking)
{
	struct knote_lock_ctx *owner_lc;
	struct uthread *uth = current_uthread();
	wait_result_t wr;

	kqlock_held(kqu);

	owner_lc = knote_lock_ctx_find(kqu, kn);
#if DEBUG || DEVELOPMENT
	knlc->knlc_state = KNOTE_LOCK_CTX_WAITING;
#endif
	owner_lc->knlc_waiters++;

	/*
	 * Make our lock context visible to knote_unlock()
	 */
	uth->uu_knlock = knlc;

	/* sleep (dropping the kq lock) while pushing on the owner's thread */
	wr = lck_spin_sleep_with_inheritor(&kqu.kq->kq_lock, LCK_SLEEP_UNLOCK,
	    knote_lock_wev(kn), owner_lc->knlc_thread,
	    THREAD_UNINT | THREAD_WAIT_NOREPORT, TIMEOUT_WAIT_FOREVER);

	if (wr == THREAD_RESTART) {
		/*
		 * We haven't been woken up by knote_unlock() but knote_unlock_cancel.
		 * We need to cleanup the state since no one did.
		 */
		uth->uu_knlock = NULL;
#if DEBUG || DEVELOPMENT
		assert(knlc->knlc_state == KNOTE_LOCK_CTX_WAITING);
		knlc->knlc_state = KNOTE_LOCK_CTX_UNLOCKED;
#endif

		if (kqlocking == KNOTE_KQ_LOCK_ALWAYS ||
		    kqlocking == KNOTE_KQ_LOCK_ON_FAILURE) {
			kqlock(kqu);
		}
		return false;
	} else {
		/* woken by knote_unlock(): it made us the new owner already */
		if (kqlocking == KNOTE_KQ_LOCK_ALWAYS ||
		    kqlocking == KNOTE_KQ_LOCK_ON_SUCCESS) {
			kqlock(kqu);
#if DEBUG || DEVELOPMENT
			/*
			 * This state is set under the lock so we can't
			 * really assert this unless we hold the lock.
			 */
			assert(knlc->knlc_state == KNOTE_LOCK_CTX_LOCKED);
#endif
		}
		return true;
	}
}
637
638 /*
639 * Attempts to take the "knote" lock.
640 *
641 * Called with the kqueue lock held.
642 *
643 * Returns true if the knote lock is acquired, false if it has been dropped
644 */
static bool __result_use_check
knote_lock(kqueue_t kqu, struct knote *kn, struct knote_lock_ctx *knlc,
    enum kqlocking kqlocking)
{
	kqlock_held(kqu);

#if DEBUG || DEVELOPMENT
	assert(knlc->knlc_state == KNOTE_LOCK_CTX_UNLOCKED);
#endif
	/* initialize our lock context before it can become visible to others */
	knlc->knlc_knote = kn;
	knlc->knlc_thread = current_thread();
	knlc->knlc_waiters = 0;

	/* contended: wait our turn on the slow path */
	if (__improbable(kn->kn_status & KN_LOCKED)) {
		return knote_lock_slow(kqu, kn, knlc, kqlocking);
	}

	/*
	 * When the knote will be dropped, the knote lock is taken before
	 * KN_DROPPING is set, and then the knote will be removed from any
	 * hash table that references it before the lock is canceled.
	 */
	assert((kn->kn_status & KN_DROPPING) == 0);
	/* publish ourselves as the owner so waiters can find our context */
	LIST_INSERT_HEAD(&kqu.kq->kq_knlocks, knlc, knlc_link);
	kn->kn_status |= KN_LOCKED;
#if DEBUG || DEVELOPMENT
	knlc->knlc_state = KNOTE_LOCK_CTX_LOCKED;
#endif

	if (kqlocking == KNOTE_KQ_UNLOCK ||
	    kqlocking == KNOTE_KQ_LOCK_ON_FAILURE) {
		kqunlock(kqu);
	}
	return true;
}
680
681 /*
682 * Unlocks a knote successfully locked with knote_lock().
683 *
684 * Called with the kqueue lock held.
685 *
686 * Returns with the kqueue lock held according to KNOTE_KQ_* mode.
687 */
static void
knote_unlock(kqueue_t kqu, struct knote *kn,
    struct knote_lock_ctx *knlc, enum kqlocking kqlocking)
{
	kqlock_held(kqu);

	assert(knlc->knlc_knote == kn);
	assert(kn->kn_status & KN_LOCKED);
#if DEBUG || DEVELOPMENT
	assert(knlc->knlc_state == KNOTE_LOCK_CTX_LOCKED);
#endif

	/* we are no longer the published owner */
	LIST_REMOVE(knlc, knlc_link);

	if (knlc->knlc_waiters) {
		thread_t thread = THREAD_NULL;

		/* wake exactly one waiter; it becomes the next owner */
		wakeup_one_with_inheritor(knote_lock_wev(kn), THREAD_AWAKENED,
		    LCK_WAKE_DEFAULT, &thread);

		/*
		 * knote_lock_slow() publishes the lock context of waiters
		 * in uthread::uu_knlock.
		 *
		 * Reach out and make this context the new owner.
		 */
		struct uthread *ut = get_bsdthread_info(thread);
		struct knote_lock_ctx *next_owner_lc = ut->uu_knlock;

		assert(next_owner_lc->knlc_knote == kn);
		next_owner_lc->knlc_waiters = knlc->knlc_waiters - 1;
		LIST_INSERT_HEAD(&kqu.kq->kq_knlocks, next_owner_lc, knlc_link);
#if DEBUG || DEVELOPMENT
		next_owner_lc->knlc_state = KNOTE_LOCK_CTX_LOCKED;
#endif
		ut->uu_knlock = NULL;
		thread_deallocate_safe(thread);
	} else {
		/* no waiters: the knote becomes fully unlocked */
		kn->kn_status &= ~KN_LOCKED;
	}

	if ((kn->kn_status & KN_MERGE_QOS) && !(kn->kn_status & KN_POSTING)) {
		/*
		 * No f_event() in flight anymore, we can leave QoS "Merge" mode
		 *
		 * See knote_adjust_qos()
		 */
		kn->kn_status &= ~KN_MERGE_QOS;
	}
	if (kqlocking == KNOTE_KQ_UNLOCK) {
		kqunlock(kqu);
	}
#if DEBUG || DEVELOPMENT
	knlc->knlc_state = KNOTE_LOCK_CTX_UNLOCKED;
#endif
}
744
745 /*
746 * Aborts all waiters for a knote lock, and unlock the knote.
747 *
748 * Called with the kqueue lock held.
749 *
750 * Returns with the kqueue unlocked.
751 */
static void
knote_unlock_cancel(struct kqueue *kq, struct knote *kn,
    struct knote_lock_ctx *knlc)
{
	kqlock_held(kq);

	assert(knlc->knlc_knote == kn);
	assert(kn->kn_status & KN_LOCKED);
	/* only used on the drop path */
	assert(kn->kn_status & KN_DROPPING);

	LIST_REMOVE(knlc, knlc_link);
	kn->kn_status &= ~KN_LOCKED;
	kqunlock(kq);

	if (knlc->knlc_waiters) {
		/* THREAD_RESTART tells waiters the lock went away (see knote_lock_slow) */
		wakeup_all_with_inheritor(knote_lock_wev(kn), THREAD_RESTART);
	}
#if DEBUG || DEVELOPMENT
	knlc->knlc_state = KNOTE_LOCK_CTX_UNLOCKED;
#endif
}
773
774 /*
775 * Call the f_event hook of a given filter.
776 *
777 * Takes a use count to protect against concurrent drops.
778 * Called with the object lock held.
779 */
780 static void
knote_post(struct knote * kn,long hint)781 knote_post(struct knote *kn, long hint)
782 {
783 struct kqueue *kq = knote_get_kq(kn);
784 int dropping, result;
785
786 kqlock(kq);
787
788 if (__improbable(kn->kn_status & (KN_DROPPING | KN_VANISHED))) {
789 return kqunlock(kq);
790 }
791
792 if (__improbable(kn->kn_status & KN_POSTING)) {
793 panic("KNOTE() called concurrently on knote %p", kn);
794 }
795
796 kn->kn_status |= KN_POSTING;
797
798 kqunlock(kq);
799 result = filter_call(knote_fops(kn), f_event(kn, hint));
800 kqlock(kq);
801
802 /* Someone dropped the knote/the monitored object vanished while we
803 * were in f_event, swallow the side effects of the post.
804 */
805 dropping = (kn->kn_status & (KN_DROPPING | KN_VANISHED));
806
807 if (!dropping && (result & FILTER_ADJUST_EVENT_IOTIER_BIT)) {
808 kqueue_update_iotier_override(kq);
809 }
810
811 if (!dropping && (result & FILTER_ACTIVE)) {
812 knote_activate(kq, kn, result);
813 }
814
815 if ((kn->kn_status & KN_LOCKED) == 0) {
816 /*
817 * There's no other f_* call in flight, we can leave QoS "Merge" mode.
818 *
819 * See knote_adjust_qos()
820 */
821 kn->kn_status &= ~(KN_POSTING | KN_MERGE_QOS);
822 } else {
823 kn->kn_status &= ~KN_POSTING;
824 }
825
826 if (__improbable(dropping)) {
827 thread_wakeup(knote_post_wev(kn));
828 }
829
830 kqunlock(kq);
831 }
832
833 /*
834 * Called by knote_drop() and knote_fdclose() to wait for the last f_event()
835 * caller to be done.
836 *
837 * - kq locked at entry
838 * - kq unlocked at exit
839 */
840 static void
knote_wait_for_post(struct kqueue * kq,struct knote * kn)841 knote_wait_for_post(struct kqueue *kq, struct knote *kn)
842 {
843 kqlock_held(kq);
844
845 assert(kn->kn_status & (KN_DROPPING | KN_VANISHED));
846
847 if (kn->kn_status & KN_POSTING) {
848 lck_spin_sleep(&kq->kq_lock, LCK_SLEEP_UNLOCK, knote_post_wev(kn),
849 THREAD_UNINT | THREAD_WAIT_NOREPORT);
850 } else {
851 kqunlock(kq);
852 }
853 }
854
855 #pragma mark knote helpers for filters
856
OS_ALWAYS_INLINE
void
knote_set_error(struct knote *kn, int error)
{
	/* EV_ERROR events carry the errno in the data field (kn_sdata) */
	kn->kn_flags |= EV_ERROR;
	kn->kn_sdata = error;
}
864
865 OS_ALWAYS_INLINE
866 int64_t
knote_low_watermark(const struct knote * kn)867 knote_low_watermark(const struct knote *kn)
868 {
869 return (kn->kn_sfflags & NOTE_LOWAT) ? kn->kn_sdata : 1;
870 }
871
872 /*!
873 * @function knote_fill_kevent_with_sdata
874 *
875 * @brief
876 * Fills in a kevent from the current content of a knote.
877 *
878 * @discussion
879 * This is meant to be called from filter's f_event hooks.
880 * The kevent data is filled with kn->kn_sdata.
881 *
882 * kn->kn_fflags is cleared if kn->kn_flags has EV_CLEAR set.
883 *
884 * Using knote_fill_kevent is typically preferred.
885 */
OS_ALWAYS_INLINE
void
knote_fill_kevent_with_sdata(struct knote *kn, struct kevent_qos_s *kev)
{
/* build-time check that a kevent_qos_s field aliases a kevent_internal_s one */
#define knote_assert_aliases(name1, offs1, name2) \
	static_assert(offsetof(struct kevent_qos_s, name1) + offs1 == \
	    offsetof(struct kevent_internal_s, name2), \
	    "kevent_qos_s::" #name1 " and kevent_internal_s::" #name2 "need to alias")
	/*
	 * All the code makes assumptions on these aliasing,
	 * so make sure we fail the build if we ever ever ever break them.
	 */
	knote_assert_aliases(ident, 0, kei_ident);
#ifdef __LITTLE_ENDIAN__
	knote_assert_aliases(filter, 0, kei_filter);  // non trivial overlap
	knote_assert_aliases(filter, 1, kei_filtid);  // non trivial overlap
#else
	knote_assert_aliases(filter, 0, kei_filtid);  // non trivial overlap
	knote_assert_aliases(filter, 1, kei_filter);  // non trivial overlap
#endif
	knote_assert_aliases(flags, 0, kei_flags);
	knote_assert_aliases(qos, 0, kei_qos);
	knote_assert_aliases(udata, 0, kei_udata);
	knote_assert_aliases(fflags, 0, kei_fflags);
	knote_assert_aliases(xflags, 0, kei_sfflags); // non trivial overlap
	knote_assert_aliases(data, 0, kei_sdata);     // non trivial overlap
	knote_assert_aliases(ext, 0, kei_ext);
#undef knote_assert_aliases

	/*
	 * Fix the differences between kevent_qos_s and kevent_internal_s:
	 * - xflags is where kn_sfflags lives, we need to zero it
	 * - fixup the high bits of `filter` where kn_filtid lives
	 */
	*kev = *(struct kevent_qos_s *)&kn->kn_kevent;
	kev->xflags = 0;
	kev->filter |= 0xff00;
	if (kn->kn_flags & EV_CLEAR) {
		kn->kn_fflags = 0;
	}
}
927
928 /*!
929 * @function knote_fill_kevent
930 *
931 * @brief
932 * Fills in a kevent from the current content of a knote.
933 *
934 * @discussion
935 * This is meant to be called from filter's f_event hooks.
936 * The kevent data is filled with the passed in data.
937 *
938 * kn->kn_fflags is cleared if kn->kn_flags has EV_CLEAR set.
939 */
940 OS_ALWAYS_INLINE
941 void
knote_fill_kevent(struct knote * kn,struct kevent_qos_s * kev,int64_t data)942 knote_fill_kevent(struct knote *kn, struct kevent_qos_s *kev, int64_t data)
943 {
944 knote_fill_kevent_with_sdata(kn, kev);
945 kev->filter = kn->kn_filter;
946 kev->data = data;
947 }
948
949
950 #pragma mark file_filtops
951
static int
filt_fileattach(struct knote *kn, struct kevent_qos_s *kev)
{
	/*
	 * Delegate the attach to the file's own kqfilter operation, which
	 * installs the filter ops appropriate for the file type.
	 */
	return fo_kqfilter(kn->kn_fp, kn, kev);
}
957
/* Bootstrap filter ops for fd-based knotes: attach re-dispatches per file type */
SECURITY_READ_ONLY_EARLY(static struct filterops) file_filtops = {
	.f_isfd = 1,    /* ident is a file descriptor */
	.f_attach = filt_fileattach,
};
962
963 #pragma mark kqread_filtops
964
/* Convenience accessors for fileproc fields that live in the shared fileglob */
#define f_flag fp_glob->fg_flag
#define f_ops fp_glob->fg_ops
#define f_lflags fp_glob->fg_lflags
968
/* Detach a knote that was watching a kqueue-backed file descriptor. */
static void
filt_kqdetach(struct knote *kn)
{
	struct kqfile *kqf = (struct kqfile *)fp_get_data(kn->kn_fp);
	struct kqueue *kq = &kqf->kqf_kqueue;

	/* detach under the kqlock so it can't race event delivery */
	kqlock(kq);
	KNOTE_DETACH(&kqf->kqf_sel.si_note, kn);
	kqunlock(kq);
}
979
980 static int
filt_kqueue(struct knote * kn,__unused long hint)981 filt_kqueue(struct knote *kn, __unused long hint)
982 {
983 struct kqueue *kq = (struct kqueue *)fp_get_data(kn->kn_fp);
984
985 return kq->kq_count > 0;
986 }
987
988 static int
filt_kqtouch(struct knote * kn,struct kevent_qos_s * kev)989 filt_kqtouch(struct knote *kn, struct kevent_qos_s *kev)
990 {
991 #pragma unused(kev)
992 struct kqueue *kq = (struct kqueue *)fp_get_data(kn->kn_fp);
993 int res;
994
995 kqlock(kq);
996 res = (kq->kq_count > 0);
997 kqunlock(kq);
998
999 return res;
1000 }
1001
1002 static int
filt_kqprocess(struct knote * kn,struct kevent_qos_s * kev)1003 filt_kqprocess(struct knote *kn, struct kevent_qos_s *kev)
1004 {
1005 struct kqueue *kq = (struct kqueue *)fp_get_data(kn->kn_fp);
1006 int res = 0;
1007
1008 kqlock(kq);
1009 if (kq->kq_count) {
1010 knote_fill_kevent(kn, kev, kq->kq_count);
1011 res = 1;
1012 }
1013 kqunlock(kq);
1014
1015 return res;
1016 }
1017
/* Filter ops for EVFILT_READ on a kqueue file descriptor */
SECURITY_READ_ONLY_EARLY(static struct filterops) kqread_filtops = {
	.f_isfd = 1,    /* ident is a file descriptor */
	.f_detach = filt_kqdetach,
	.f_event = filt_kqueue,
	.f_touch = filt_kqtouch,
	.f_process = filt_kqprocess,
};
1025
1026 #pragma mark proc_filtops
1027
/*
 * filt_procattach - attach an EVFILT_PROC knote to the target process.
 *
 * Takes a transient reference on the target via proc_find(), performs the
 * exit-status disclosure permission check, links the knote on the process
 * klist, and drops the reference.  Errors are reported through
 * knote_set_error() with a 0 return (no activation).
 */
static int
filt_procattach(struct knote *kn, __unused struct kevent_qos_s *kev)
{
	struct proc *p;

	/* NOTE_PDATAMASK must be wide enough to carry any pid in the hint */
	assert(PID_MAX < NOTE_PDATAMASK);

	/* NOTE_TRACK/NOTE_TRACKERR/NOTE_CHILD are not supported on this platform */
	if ((kn->kn_sfflags & (NOTE_TRACK | NOTE_TRACKERR | NOTE_CHILD)) != 0) {
		knote_set_error(kn, ENOTSUP);
		return 0;
	}

	p = proc_find((int)kn->kn_id);
	if (p == NULL) {
		knote_set_error(kn, ESRCH);
		return 0;
	}

	const uint32_t NoteExitStatusBits = NOTE_EXIT | NOTE_EXITSTATUS;

	/*
	 * Exit status is only disclosed to the parent, the parent-in-waiting
	 * of a traced process, or a caller allowed to SIGKILL the target.
	 */
	if ((kn->kn_sfflags & NoteExitStatusBits) == NoteExitStatusBits) {
		do {
			pid_t selfpid = proc_selfpid();

			if (p->p_ppid == selfpid) {
				break;  /* parent => ok */
			}
			if ((p->p_lflag & P_LTRACED) != 0 &&
			    (p->p_oppid == selfpid)) {
				break;  /* parent-in-waiting => ok */
			}
			if (cansignal(current_proc(), kauth_cred_get(), p, SIGKILL)) {
				break;  /* allowed to signal => ok */
			}
			proc_rele(p);
			knote_set_error(kn, EACCES);
			return 0;
		} while (0);
	}

	kn->kn_proc = p;
	kn->kn_flags |= EV_CLEAR;       /* automatically set */
	kn->kn_sdata = 0;       /* incoming data is ignored */

	proc_klist_lock();

	KNOTE_ATTACH(&p->p_klist, kn);

	proc_klist_unlock();

	/* the klist linkage, not this reference, keeps kn_proc valid */
	proc_rele(p);

	/*
	 * only captures edge-triggered events after this point
	 * so it can't already be fired.
	 */
	return 0;
}
1086
1087
1088 /*
1089 * The knote may be attached to a different process, which may exit,
1090 * leaving nothing for the knote to be attached to. In that case,
1091 * the pointer to the process will have already been nulled out.
1092 */
1093 static void
filt_procdetach(struct knote * kn)1094 filt_procdetach(struct knote *kn)
1095 {
1096 struct proc *p;
1097
1098 proc_klist_lock();
1099
1100 p = kn->kn_proc;
1101 if (p != PROC_NULL) {
1102 kn->kn_proc = PROC_NULL;
1103 KNOTE_DETACH(&p->p_klist, kn);
1104 }
1105
1106 proc_klist_unlock();
1107 }
1108
/*
 * filt_procevent - process lifecycle f_event hook (called via KNOTE).
 *
 * `hint` carries the NOTE_* control code in NOTE_PCTRLMASK plus, for
 * NOTE_EXIT, the exit data in NOTE_PDATAMASK.  Returns non-zero to
 * activate the knote.
 */
static int
filt_procevent(struct knote *kn, long hint)
{
	u_int event;

	/* ALWAYS CALLED WITH proc_klist_lock */

	/*
	 * Note: a lot of bits in hint may be obtained from the knote
	 * To free some of those bits, see <rdar://problem/12592988> Freeing up
	 * bits in hint for filt_procevent
	 *
	 * mask off extra data
	 */
	event = (u_int)hint & NOTE_PCTRLMASK;

	/*
	 * termination lifecycle events can happen while a debugger
	 * has reparented a process, in which case notifications
	 * should be quashed except to the tracing parent. When
	 * the debugger reaps the child (either via wait4(2) or
	 * process exit), the child will be reparented to the original
	 * parent and these knotes re-fired.
	 */
	if (event & NOTE_EXIT) {
		if ((kn->kn_proc->p_oppid != 0)
		    && (proc_getpid(knote_get_kq(kn)->kq_p) != kn->kn_proc->p_ppid)) {
			/*
			 * This knote is not for the current ptrace(2) parent, ignore.
			 */
			return 0;
		}
	}

	/*
	 * if the user is interested in this event, record it.
	 */
	if (kn->kn_sfflags & event) {
		kn->kn_fflags |= event;
	}

#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wdeprecated-declarations"
	/* NOTE_REAP (deprecated) and an unmonitored NOTE_EXIT end the knote's life */
	if ((event == NOTE_REAP) || ((event == NOTE_EXIT) && !(kn->kn_sfflags & NOTE_REAP))) {
		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
	}
#pragma clang diagnostic pop


	/*
	 * The kernel has a wrapper in place that returns the same data
	 * as is collected here, in kn_hook32. Any changes to how
	 * NOTE_EXITSTATUS and NOTE_EXIT_DETAIL are collected
	 * should also be reflected in the proc_pidnoteexit() wrapper.
	 */
	if (event == NOTE_EXIT) {
		kn->kn_hook32 = 0;
		if ((kn->kn_sfflags & NOTE_EXITSTATUS) != 0) {
			kn->kn_fflags |= NOTE_EXITSTATUS;
			kn->kn_hook32 |= (hint & NOTE_PDATAMASK);
		}
		if ((kn->kn_sfflags & NOTE_EXIT_DETAIL) != 0) {
			kn->kn_fflags |= NOTE_EXIT_DETAIL;
			if ((kn->kn_proc->p_lflag &
			    P_LTERM_DECRYPTFAIL) != 0) {
				kn->kn_hook32 |= NOTE_EXIT_DECRYPTFAIL;
			}
			if ((kn->kn_proc->p_lflag &
			    P_LTERM_JETSAM) != 0) {
				kn->kn_hook32 |= NOTE_EXIT_MEMORY;
				/* record which jetsam cause terminated the process */
				switch (kn->kn_proc->p_lflag & P_JETSAM_MASK) {
				case P_JETSAM_VMPAGESHORTAGE:
					kn->kn_hook32 |= NOTE_EXIT_MEMORY_VMPAGESHORTAGE;
					break;
				case P_JETSAM_VMTHRASHING:
					kn->kn_hook32 |= NOTE_EXIT_MEMORY_VMTHRASHING;
					break;
				case P_JETSAM_FCTHRASHING:
					kn->kn_hook32 |= NOTE_EXIT_MEMORY_FCTHRASHING;
					break;
				case P_JETSAM_VNODE:
					kn->kn_hook32 |= NOTE_EXIT_MEMORY_VNODE;
					break;
				case P_JETSAM_HIWAT:
					kn->kn_hook32 |= NOTE_EXIT_MEMORY_HIWAT;
					break;
				case P_JETSAM_PID:
					kn->kn_hook32 |= NOTE_EXIT_MEMORY_PID;
					break;
				case P_JETSAM_IDLEEXIT:
					kn->kn_hook32 |= NOTE_EXIT_MEMORY_IDLE;
					break;
				}
			}
			if ((proc_getcsflags(kn->kn_proc) &
			    CS_KILLED) != 0) {
				kn->kn_hook32 |= NOTE_EXIT_CSERROR;
			}
		}
	}

	/* if we have any matching state, activate the knote */
	return kn->kn_fflags != 0;
}
1213
/*
 * filt_proctouch - accept updated interest flags from userspace.
 *
 * Returns non-zero if previously accumulated events remain deliverable.
 */
static int
filt_proctouch(struct knote *kn, struct kevent_qos_s *kev)
{
	int res;

	proc_klist_lock();

	/* accept new filter flags and mask off output events no longer interesting */
	kn->kn_sfflags = kev->fflags;

	/* restrict the current results to the (smaller?) set of new interest */
	/*
	 * For compatibility with previous implementations, we leave kn_fflags
	 * as they were before.
	 */
	//kn->kn_fflags &= kn->kn_sfflags;

	res = (kn->kn_fflags != 0);

	proc_klist_unlock();

	return res;
}
1237
1238 static int
filt_procprocess(struct knote * kn,struct kevent_qos_s * kev)1239 filt_procprocess(struct knote *kn, struct kevent_qos_s *kev)
1240 {
1241 int res = 0;
1242
1243 proc_klist_lock();
1244 if (kn->kn_fflags) {
1245 knote_fill_kevent(kn, kev, kn->kn_hook32);
1246 kn->kn_hook32 = 0;
1247 res = 1;
1248 }
1249 proc_klist_unlock();
1250 return res;
1251 }
1252
/* Filter ops for EVFILT_PROC: not fd-based, ident is a pid */
SECURITY_READ_ONLY_EARLY(static struct filterops) proc_filtops = {
	.f_attach = filt_procattach,
	.f_detach = filt_procdetach,
	.f_event = filt_procevent,
	.f_touch = filt_proctouch,
	.f_process = filt_procprocess,
};
1260
1261 #pragma mark timer_filtops
1262
/* Validated timer parameters produced by filt_timervalidate() */
struct filt_timer_params {
	uint64_t deadline; /* deadline in abs/cont time
	                    *  (or 0 if NOTE_ABSOLUTE and deadline is in past) */
	uint64_t leeway;   /* leeway in abstime, or 0 if none */
	uint64_t interval; /* interval in abstime or 0 if non-repeating timer */
};
1269
1270 /*
1271 * Values stored in the knote at rest (using Mach absolute time units)
1272 *
1273 * kn->kn_thcall where the thread_call object is stored
1274 * kn->kn_ext[0] next deadline or 0 if immediate expiration
1275 * kn->kn_ext[1] leeway value
1276 * kn->kn_sdata interval timer: the interval
1277 * absolute/deadline timer: 0
1278 * kn->kn_hook32 timer state (with gencount)
1279 *
1280 * TIMER_IDLE:
1281 * The timer has either never been scheduled or been cancelled.
1282 * It is safe to schedule a new one in this state.
1283 *
1284 * TIMER_ARMED:
1285 * The timer has been scheduled
1286 *
1287 * TIMER_FIRED
1288 * The timer has fired and an event needs to be delivered.
1289 * When in this state, the callout may still be running.
1290 *
1291 * TIMER_IMMEDIATE
1292 * The timer has fired at registration time, and the callout was never
1293 * dispatched.
1294 */
#define TIMER_IDLE 0x0
#define TIMER_ARMED 0x1
#define TIMER_FIRED 0x2
#define TIMER_IMMEDIATE 0x3        /* == TIMER_STATE_MASK so atomic-or reaches it */
#define TIMER_STATE_MASK 0x3       /* low 2 bits of kn_hook32 encode the state */
#define TIMER_GEN_INC 0x4          /* remaining bits form a generation counter */
1301
1302 static void
filt_timer_set_params(struct knote * kn,struct filt_timer_params * params)1303 filt_timer_set_params(struct knote *kn, struct filt_timer_params *params)
1304 {
1305 kn->kn_ext[0] = params->deadline;
1306 kn->kn_ext[1] = params->leeway;
1307 kn->kn_sdata = params->interval;
1308 }
1309
1310 /*
1311 * filt_timervalidate - process data from user
1312 *
1313 * Sets up the deadline, interval, and leeway from the provided user data
1314 *
1315 * Input:
1316 * kn_sdata timer deadline or interval time
1317 * kn_sfflags style of timer, unit of measurement
1318 *
1319 * Output:
1320 * struct filter_timer_params to apply to the filter with
1321 * filt_timer_set_params when changes are ready to be commited.
1322 *
1323 * Returns:
1324 * EINVAL Invalid user data parameters
1325 * ERANGE Various overflows with the parameters
1326 *
1327 * Called with timer filter lock held.
1328 */
static int
filt_timervalidate(const struct kevent_qos_s *kev,
    struct filt_timer_params *params)
{
	/*
	 * There are 5 knobs that need to be chosen for a timer registration:
	 *
	 * A) Units of time (what is the time duration of the specified number)
	 *      Absolute and interval take:
	 *              NOTE_SECONDS, NOTE_USECONDS, NOTE_NSECONDS, NOTE_MACHTIME
	 *      Defaults to milliseconds if not specified
	 *
	 * B) Clock epoch (what is the zero point of the specified number)
	 *      For interval, there is none
	 *      For absolute, defaults to the gettimeofday/calendar epoch
	 *      With NOTE_MACHTIME, uses mach_absolute_time()
	 *      With NOTE_MACHTIME and NOTE_MACH_CONTINUOUS_TIME, uses mach_continuous_time()
	 *
	 * C) The knote's behavior on delivery
	 *      Interval timer causes the knote to arm for the next interval unless one-shot is set
	 *      Absolute is a forced one-shot timer which deletes on delivery
	 *      TODO: Add a way for absolute to be not forced one-shot
	 *
	 * D) Whether the time duration is relative to now or absolute
	 *      Interval fires at now + duration when it is set up
	 *      Absolute fires at now + difference between now walltime and passed in walltime
	 *      With NOTE_MACHTIME it fires at an absolute MAT or MCT.
	 *
	 * E) Whether the timer continues to tick across sleep
	 *      By default all three do not.
	 *      For interval and absolute, NOTE_MACH_CONTINUOUS_TIME causes them to tick across sleep
	 *      With NOTE_ABSOLUTE | NOTE_MACHTIME | NOTE_MACH_CONTINUOUS_TIME:
	 *              expires when mach_continuous_time() is > the passed in value.
	 */

	uint64_t multiplier;

	boolean_t use_abstime = FALSE;

	switch (kev->fflags & (NOTE_SECONDS | NOTE_USECONDS | NOTE_NSECONDS | NOTE_MACHTIME)) {
	case NOTE_SECONDS:
		multiplier = NSEC_PER_SEC;
		break;
	case NOTE_USECONDS:
		multiplier = NSEC_PER_USEC;
		break;
	case NOTE_NSECONDS:
		multiplier = 1;
		break;
	case NOTE_MACHTIME:
		multiplier = 0; /* never consulted: use_abstime skips all scaling below */
		use_abstime = TRUE;
		break;
	case 0: /* milliseconds (default) */
		multiplier = NSEC_PER_SEC / 1000;
		break;
	default:
		/* more than one unit flag set at once */
		return EINVAL;
	}

	/* transform the leeway in kn_ext[1] to same time scale */
	if (kev->fflags & NOTE_LEEWAY) {
		uint64_t leeway_abs;

		if (use_abstime) {
			leeway_abs = (uint64_t)kev->ext[1];
		} else {
			uint64_t leeway_ns;
			if (os_mul_overflow((uint64_t)kev->ext[1], multiplier, &leeway_ns)) {
				return ERANGE;
			}

			nanoseconds_to_absolutetime(leeway_ns, &leeway_abs);
		}

		params->leeway = leeway_abs;
	} else {
		params->leeway = 0;
	}

	if (kev->fflags & NOTE_ABSOLUTE) {
		uint64_t deadline_abs;

		if (use_abstime) {
			deadline_abs = (uint64_t)kev->data;
		} else {
			uint64_t calendar_deadline_ns;

			if (os_mul_overflow((uint64_t)kev->data, multiplier, &calendar_deadline_ns)) {
				return ERANGE;
			}

			/* calendar_deadline_ns is in nanoseconds since the epoch */

			clock_sec_t seconds;
			clock_nsec_t nanoseconds;

			/*
			 * Note that the conversion through wall-time is only done once.
			 *
			 * If the relationship between MAT and gettimeofday changes,
			 * the underlying timer does not update.
			 *
			 * TODO: build a wall-time denominated timer_call queue
			 * and a flag to request DTRTing with wall-time timers
			 */
			clock_get_calendar_nanotime(&seconds, &nanoseconds);

			uint64_t calendar_now_ns = (uint64_t)seconds * NSEC_PER_SEC + nanoseconds;

			/* if deadline is in the future */
			if (calendar_now_ns < calendar_deadline_ns) {
				uint64_t interval_ns = calendar_deadline_ns - calendar_now_ns;
				uint64_t interval_abs;

				nanoseconds_to_absolutetime(interval_ns, &interval_abs);

				/*
				 * Note that the NOTE_MACH_CONTINUOUS_TIME flag here only
				 * causes the timer to keep ticking across sleep, but
				 * it does not change the calendar timebase.
				 */

				if (kev->fflags & NOTE_MACH_CONTINUOUS_TIME) {
					clock_continuoustime_interval_to_deadline(interval_abs,
					    &deadline_abs);
				} else {
					clock_absolutetime_interval_to_deadline(interval_abs,
					    &deadline_abs);
				}
			} else {
				deadline_abs = 0; /* cause immediate expiration */
			}
		}

		params->deadline = deadline_abs;
		params->interval = 0; /* NOTE_ABSOLUTE is non-repeating */
	} else if (kev->data < 0) {
		/*
		 * Negative interval timers fire immediately, once.
		 *
		 * Ideally a negative interval would be an error, but certain clients
		 * pass negative values on accident, and expect an event back.
		 *
		 * In the old implementation the timer would repeat with no delay
		 * N times until mach_absolute_time() + (N * interval) underflowed,
		 * then it would wait ~forever by accidentally arming a timer for the far future.
		 *
		 * We now skip the power-wasting hot spin phase and go straight to the idle phase.
		 */

		params->deadline = 0; /* expire immediately */
		params->interval = 0; /* non-repeating */
	} else {
		uint64_t interval_abs = 0;

		if (use_abstime) {
			interval_abs = (uint64_t)kev->data;
		} else {
			uint64_t interval_ns;
			if (os_mul_overflow((uint64_t)kev->data, multiplier, &interval_ns)) {
				return ERANGE;
			}

			nanoseconds_to_absolutetime(interval_ns, &interval_abs);
		}

		uint64_t deadline = 0;

		/* first expiration is one full interval from now */
		if (kev->fflags & NOTE_MACH_CONTINUOUS_TIME) {
			clock_continuoustime_interval_to_deadline(interval_abs, &deadline);
		} else {
			clock_absolutetime_interval_to_deadline(interval_abs, &deadline);
		}

		params->deadline = deadline;
		params->interval = interval_abs;
	}

	return 0;
}
1510
1511 /*
1512 * filt_timerexpire - the timer callout routine
1513 */
static void
filt_timerexpire(void *knx, void *state_on_arm)
{
	struct knote *kn = knx;

	/* state_on_arm is the gencount+ARMED value captured by filt_timerarm() */
	uint32_t state = (uint32_t)(uintptr_t)state_on_arm;
	uint32_t fired_state = state ^ TIMER_ARMED ^ TIMER_FIRED;

	/* transition ARMED -> FIRED only if no touch/cancel bumped the gencount */
	if (os_atomic_cmpxchg(&kn->kn_hook32, state, fired_state, relaxed)) {
		// our f_event always would say FILTER_ACTIVE,
		// so be leaner and just do it.
		struct kqueue *kq = knote_get_kq(kn);
		kqlock(kq);
		knote_activate(kq, kn, FILTER_ACTIVE);
		kqunlock(kq);
	} else {
		/*
		 * The timer has been reprogrammed or canceled since it was armed,
		 * and this is a late firing for the timer, just ignore it.
		 */
	}
}
1536
/*
 * Does this deadline need a timer armed for it, or has it already expired?
 */
1540 static bool
filt_timer_is_ready(struct knote * kn)1541 filt_timer_is_ready(struct knote *kn)
1542 {
1543 uint64_t now, deadline = kn->kn_ext[0];
1544
1545 if (deadline == 0) {
1546 return true;
1547 }
1548
1549 if (kn->kn_sfflags & NOTE_MACH_CONTINUOUS_TIME) {
1550 now = mach_continuous_time();
1551 } else {
1552 now = mach_absolute_time();
1553 }
1554 return deadline <= now;
1555 }
1556
1557 /*
1558 * Arm a timer
1559 *
1560 * It is the responsibility of the caller to make sure the timer call
1561 * has completed or been cancelled properly prior to arming it.
1562 */
static void
filt_timerarm(struct knote *kn)
{
	uint64_t deadline = kn->kn_ext[0];
	uint64_t leeway = kn->kn_ext[1];
	uint32_t state;

	/* translate the knote's NOTE_* flags into thread_call scheduling flags */
	int filter_flags = kn->kn_sfflags;
	unsigned int timer_flags = 0;

	if (filter_flags & NOTE_CRITICAL) {
		timer_flags |= THREAD_CALL_DELAY_USER_CRITICAL;
	} else if (filter_flags & NOTE_BACKGROUND) {
		timer_flags |= THREAD_CALL_DELAY_USER_BACKGROUND;
	} else {
		timer_flags |= THREAD_CALL_DELAY_USER_NORMAL;
	}

	if (filter_flags & NOTE_LEEWAY) {
		timer_flags |= THREAD_CALL_DELAY_LEEWAY;
	}

	if (filter_flags & NOTE_MACH_CONTINUOUS_TIME) {
		timer_flags |= THREAD_CALL_CONTINUOUS;
	}

	/*
	 * Move to ARMED.
	 *
	 * We increase the gencount, and setup the thread call with this expected
	 * state. It means that if there was a previous generation of the timer in
	 * flight that needs to be ignored, then 3 things are possible:
	 *
	 * - the timer fires first, filt_timerexpire() and sets the state to FIRED
	 *   but we clobber it with ARMED and a new gencount. The knote will still
	 *   be activated, but filt_timerprocess() which is serialized with this
	 *   call will not see the FIRED bit set and will not deliver an event.
	 *
	 * - this code runs first, but filt_timerexpire() comes second. Because it
	 *   knows an old gencount, it will debounce and not activate the knote.
	 *
	 * - filt_timerexpire() wasn't in flight yet, and thread_call_enter below
	 *   will just cancel it properly.
	 *
	 * This is important as userspace expects to never be woken up for past
	 * timers after filt_timertouch ran.
	 */
	state = os_atomic_load(&kn->kn_hook32, relaxed);
	state &= ~TIMER_STATE_MASK;
	state += TIMER_GEN_INC + TIMER_ARMED;
	os_atomic_store(&kn->kn_hook32, state, relaxed);

	/* the new gencount+ARMED state is handed to the callout for debouncing */
	thread_call_enter_delayed_with_leeway(kn->kn_thcall,
	    (void *)(uintptr_t)state, deadline, leeway, timer_flags);
}
1618
1619 /*
1620 * Mark a timer as "already fired" when it is being reprogrammed
1621 *
1622 * If there is a timer in flight, this will do a best effort at canceling it,
1623 * but will not wait. If the thread call was in flight, having set the
1624 * TIMER_IMMEDIATE bit will debounce a filt_timerexpire() racing with this
1625 * cancelation.
1626 */
static void
filt_timerfire_immediate(struct knote *kn)
{
	uint32_t state;

	/* atomic-or of both state bits can only land on IMMEDIATE */
	static_assert(TIMER_IMMEDIATE == TIMER_STATE_MASK,
	    "validate that this atomic or will transition to IMMEDIATE");
	state = os_atomic_or_orig(&kn->kn_hook32, TIMER_IMMEDIATE, relaxed);

	/* best-effort cancel of an armed callout; a late fire is debounced */
	if ((state & TIMER_STATE_MASK) == TIMER_ARMED) {
		thread_call_cancel(kn->kn_thcall);
	}
}
1640
1641 /*
1642 * Allocate a thread call for the knote's lifetime, and kick off the timer.
1643 */
static int
filt_timerattach(struct knote *kn, struct kevent_qos_s *kev)
{
	thread_call_t callout;
	struct filt_timer_params params;
	int error;

	/* validate user parameters before allocating anything */
	if ((error = filt_timervalidate(kev, &params)) != 0) {
		knote_set_error(kn, error);
		return 0;
	}

	callout = thread_call_allocate_with_options(filt_timerexpire,
	    (thread_call_param_t)kn, THREAD_CALL_PRIORITY_HIGH,
	    THREAD_CALL_OPTIONS_ONCE);

	if (NULL == callout) {
		knote_set_error(kn, ENOMEM);
		return 0;
	}

	filt_timer_set_params(kn, &params);
	kn->kn_thcall = callout;
	kn->kn_flags |= EV_CLEAR;   /* timers are always edge-triggered */
	os_atomic_store(&kn->kn_hook32, TIMER_IDLE, relaxed);

	/* NOTE_ABSOLUTE implies EV_ONESHOT */
	if (kn->kn_sfflags & NOTE_ABSOLUTE) {
		kn->kn_flags |= EV_ONESHOT;
	}

	/* an already-expired deadline skips the callout entirely */
	if (filt_timer_is_ready(kn)) {
		os_atomic_store(&kn->kn_hook32, TIMER_IMMEDIATE, relaxed);
		return FILTER_ACTIVE;
	} else {
		filt_timerarm(kn);
		return 0;
	}
}
1683
1684 /*
1685 * Shut down the timer if it's running, and free the callout.
1686 */
1687 static void
filt_timerdetach(struct knote * kn)1688 filt_timerdetach(struct knote *kn)
1689 {
1690 __assert_only boolean_t freed;
1691
1692 /*
1693 * Unconditionally cancel to make sure there can't be any filt_timerexpire()
1694 * running anymore.
1695 */
1696 thread_call_cancel_wait(kn->kn_thcall);
1697 freed = thread_call_free(kn->kn_thcall);
1698 assert(freed);
1699 }
1700
1701 /*
1702 * filt_timertouch - update timer knote with new user input
1703 *
1704 * Cancel and restart the timer based on new user data. When
1705 * the user picks up a knote, clear the count of how many timer
1706 * pops have gone off (in kn_data).
1707 */
static int
filt_timertouch(struct knote *kn, struct kevent_qos_s *kev)
{
	struct filt_timer_params params;
	uint32_t changed_flags = (kn->kn_sfflags ^ kev->fflags);
	int error;

	/* validate usage of FILTER_UPDATE_REQ_QOS: workloops need a thread QoS */
	if (kev->qos && (knote_get_kq(kn)->kq_state & KQ_WORKLOOP) &&
	    !_pthread_priority_thread_qos(kev->qos)) {
		kev->flags |= EV_ERROR;
		kev->data = ERANGE;
		return 0;
	}

	/* a timer cannot change between absolute and interval after attach */
	if (changed_flags & NOTE_ABSOLUTE) {
		kev->flags |= EV_ERROR;
		kev->data = EINVAL;
		return 0;
	}

	if ((error = filt_timervalidate(kev, &params)) != 0) {
		kev->flags |= EV_ERROR;
		kev->data = error;
		return 0;
	}

	/* capture the new values used to compute deadline */
	filt_timer_set_params(kn, &params);
	kn->kn_sfflags = kev->fflags;

	if (filt_timer_is_ready(kn)) {
		filt_timerfire_immediate(kn);
		return FILTER_ACTIVE | FILTER_UPDATE_REQ_QOS;
	} else {
		filt_timerarm(kn);
		return FILTER_UPDATE_REQ_QOS;
	}
}
1747
1748 /*
1749 * filt_timerprocess - query state of knote and snapshot event data
1750 *
1751 * Determine if the timer has fired in the past, snapshot the state
1752 * of the kevent for returning to user-space, and clear pending event
1753 * counters for the next time.
1754 */
static int
filt_timerprocess(struct knote *kn, struct kevent_qos_s *kev)
{
	uint32_t state = os_atomic_load(&kn->kn_hook32, relaxed);

	/*
	 * filt_timerprocess is serialized with any filter routine except for
	 * filt_timerexpire which atomically does a TIMER_ARMED -> TIMER_FIRED
	 * transition, and on success, activates the knote.
	 *
	 * Hence, we don't need atomic modifications of the state, only to peek at
	 * whether we see any of the "FIRED" state, and if we do, it is safe to
	 * do simple state machine transitions.
	 */
	switch (state & TIMER_STATE_MASK) {
	case TIMER_IDLE:
	case TIMER_ARMED:
		/*
		 * This can happen if a touch resets a timer that had fired
		 * without being processed
		 */
		return 0;
	}
	/* FIRED or IMMEDIATE: consume the event and return to IDLE */

	os_atomic_store(&kn->kn_hook32, state & ~TIMER_STATE_MASK, relaxed);

	/*
	 * Copy out the interesting kevent state,
	 * but don't leak out the raw time calculations.
	 *
	 * TODO: potential enhancements - tell the user about:
	 *      - deadline to which this timer thought it was expiring
	 *      - return kn_sfflags in the fflags field so the client can know
	 *        under what flags the timer fired
	 */
	knote_fill_kevent(kn, kev, 1);
	kev->ext[0] = 0;
	/* kev->ext[1] = 0;  JMM - shouldn't we hide this too? */

	if (kn->kn_sdata != 0) {
		/*
		 * This is a 'repeating' timer, so we have to emit
		 * how many intervals expired between the arm
		 * and the process.
		 *
		 * A very strange style of interface, because
		 * this could easily be done in the client...
		 */

		uint64_t now;

		if (kn->kn_sfflags & NOTE_MACH_CONTINUOUS_TIME) {
			now = mach_continuous_time();
		} else {
			now = mach_absolute_time();
		}

		uint64_t first_deadline = kn->kn_ext[0];
		uint64_t interval_abs = kn->kn_sdata;
		uint64_t orig_arm_time = first_deadline - interval_abs;

		assert(now > orig_arm_time);
		assert(now > first_deadline);

		uint64_t elapsed = now - orig_arm_time;

		uint64_t num_fired = elapsed / interval_abs;

		/*
		 * To reach this code, we must have seen the timer pop
		 * and be in repeating mode, so therefore it must have been
		 * more than 'interval' time since the attach or last
		 * successful touch.
		 */
		assert(num_fired > 0);

		/* report how many intervals have elapsed to the user */
		kev->data = (int64_t)num_fired;

		/* We only need to re-arm the timer if it's not about to be destroyed */
		if ((kn->kn_flags & EV_ONESHOT) == 0) {
			/* fire at the end of the next interval */
			uint64_t new_deadline = first_deadline + num_fired * interval_abs;

			assert(new_deadline > now);

			kn->kn_ext[0] = new_deadline;

			/*
			 * This can't shortcut setting up the thread call, because
			 * knote_process deactivates EV_CLEAR knotes unconditionally.
			 */
			filt_timerarm(kn);
		}
	}

	return FILTER_ACTIVE;
}
1853
/* Filter ops for EVFILT_TIMER */
SECURITY_READ_ONLY_EARLY(static struct filterops) timer_filtops = {
	.f_extended_codes = true,
	.f_attach = filt_timerattach,
	.f_detach = filt_timerdetach,
	.f_event = filt_bad_event,      /* timers activate from the callout, never via KNOTE() */
	.f_touch = filt_timertouch,
	.f_process = filt_timerprocess,
};
1862
1863 #pragma mark user_filtops
1864
1865 static int
filt_userattach(struct knote * kn,__unused struct kevent_qos_s * kev)1866 filt_userattach(struct knote *kn, __unused struct kevent_qos_s *kev)
1867 {
1868 if (kn->kn_sfflags & NOTE_TRIGGER) {
1869 kn->kn_hook32 = FILTER_ACTIVE;
1870 } else {
1871 kn->kn_hook32 = 0;
1872 }
1873 return kn->kn_hook32;
1874 }
1875
/*
 * f_touch for EVFILT_USER: apply the NOTE_FFCTRL operation to the saved
 * fflags, record the new data, and latch FILTER_ACTIVE on NOTE_TRIGGER.
 */
static int
filt_usertouch(struct knote *kn, struct kevent_qos_s *kev)
{
	uint32_t ffctrl;
	int fflags;

	ffctrl = kev->fflags & NOTE_FFCTRLMASK;
	fflags = kev->fflags & NOTE_FFLAGSMASK;
	switch (ffctrl) {
	case NOTE_FFNOP:
		break;
	case NOTE_FFAND:
		kn->kn_sfflags &= fflags;
		break;
	case NOTE_FFOR:
		kn->kn_sfflags |= fflags;
		break;
	case NOTE_FFCOPY:
		kn->kn_sfflags = fflags;
		break;
	}
	kn->kn_sdata = kev->data;

	/* the active state sticks until filt_userprocess() delivers it */
	if (kev->fflags & NOTE_TRIGGER) {
		kn->kn_hook32 = FILTER_ACTIVE;
	}
	return (int)kn->kn_hook32;
}
1904
/*
 * f_process for EVFILT_USER: deliver the latched trigger, if any.
 * EV_CLEAR resets the latch so the next NOTE_TRIGGER re-activates.
 */
static int
filt_userprocess(struct knote *kn, struct kevent_qos_s *kev)
{
	int result = (int)kn->kn_hook32;

	if (result) {
		/* EVFILT_USER returns the data that was passed in */
		knote_fill_kevent_with_sdata(kn, kev);
		kev->fflags = kn->kn_sfflags;
		if (kn->kn_flags & EV_CLEAR) {
			/* knote_fill_kevent cleared kn_fflags */
			kn->kn_hook32 = 0;
		}
	}

	return result;
}
1922
/* Filter ops for EVFILT_USER: triggered only via touch, never via KNOTE() */
SECURITY_READ_ONLY_EARLY(static struct filterops) user_filtops = {
	.f_extended_codes = true,
	.f_attach = filt_userattach,
	.f_detach = filt_no_detach,     /* nothing to tear down */
	.f_event = filt_bad_event,
	.f_touch = filt_usertouch,
	.f_process = filt_userprocess,
};
1931
1932 #pragma mark workloop_filtops
1933
/* sentinel error value used by the workloop filter (not a real errno) */
#define EPREEMPTDISABLED (-1)
1935
/* Take the workloop's state spinlock (protects EVFILT_WORKLOOP state). */
static inline void
filt_wllock(struct kqworkloop *kqwl)
{
	lck_spin_lock(&kqwl->kqwl_statelock);
}
1941
/* Release the workloop's state spinlock. */
static inline void
filt_wlunlock(struct kqworkloop *kqwl)
{
	lck_spin_unlock(&kqwl->kqwl_statelock);
}
1947
1948 /*
1949 * Returns true when the interlock for the turnstile is the workqueue lock
1950 *
1951 * When this is the case, all turnstiles operations are delegated
1952 * to the workqueue subsystem.
1953 *
1954 * This is required because kqueue_threadreq_bind_prepost only holds the
1955 * workqueue lock but needs to move the inheritor from the workloop turnstile
1956 * away from the creator thread, so that this now fulfilled request cannot be
1957 * picked anymore by other threads.
1958 */
static inline bool
filt_wlturnstile_interlock_is_workq(struct kqworkloop *kqwl)
{
	/* a pending thread request means the workq owns the turnstile interlock */
	return kqr_thread_requested_pending(&kqwl->kqwl_request);
}
1964
/*
 * Points the workloop turnstile inheritor at the current owner thread if
 * there is one, else at the bound servicer thread if any, else clears it.
 *
 * Must not be called while the workqueue lock is the turnstile interlock;
 * that case must go through workq_kern_threadreq_update_inheritor().
 */
static void
filt_wlupdate_inheritor(struct kqworkloop *kqwl, struct turnstile *ts,
    turnstile_update_flags_t flags)
{
	turnstile_inheritor_t inheritor = TURNSTILE_INHERITOR_NULL;
	workq_threadreq_t kqr = &kqwl->kqwl_request;

	/*
	 * binding to the workq should always happen through
	 * workq_kern_threadreq_update_inheritor()
	 */
	assert(!filt_wlturnstile_interlock_is_workq(kqwl));

	if ((inheritor = kqwl->kqwl_owner)) {
		flags |= TURNSTILE_INHERITOR_THREAD;
	} else if ((inheritor = kqr_thread(kqr))) {
		flags |= TURNSTILE_INHERITOR_THREAD;
	}

	turnstile_update_inheritor(ts, inheritor, flags);
}
1986
1987 #define EVFILT_WORKLOOP_EFAULT_RETRY_COUNT 100
1988 #define FILT_WLATTACH 0
1989 #define FILT_WLTOUCH 1
1990 #define FILT_WLDROP 2
1991
/*
 * Shared update path for EVFILT_WORKLOOP knote attach / touch / drop
 * (op is FILT_WLATTACH, FILT_WLTOUCH or FILT_WLDROP).
 *
 * Optionally debounces against a userspace 64-bit value (ext[EV_EXTIDX_WL_
 * ADDR/VALUE/MASK]), may discover or end workloop ownership, update the
 * thread request QoS, and wake a NOTE_WL_SYNC_WAIT waiter.
 *
 * Returns:
 *   0                success
 *   ESTALE           the userspace value did not match mask/expected value
 *   EOWNERDEAD       NOTE_WL_DISCOVER_OWNER could not translate the port name
 *   EPREEMPTDISABLED success, but preemption was deliberately left disabled
 *                    (callers translate this into FILTER_THREADREQ_NODEFEER)
 *   other errno      copyin failure
 */
__result_use_check
static int
filt_wlupdate(struct kqworkloop *kqwl, struct knote *kn,
    struct kevent_qos_s *kev, kq_index_t qos_index, int op)
{
	user_addr_t uaddr = CAST_USER_ADDR_T(kev->ext[EV_EXTIDX_WL_ADDR]);
	workq_threadreq_t kqr = &kqwl->kqwl_request;
	thread_t cur_owner, new_owner, extra_thread_ref = THREAD_NULL;
	kq_index_t cur_override = THREAD_QOS_UNSPECIFIED;
	int efault_retry = EVFILT_WORKLOOP_EFAULT_RETRY_COUNT;
	int action = KQWL_UTQ_NONE, error = 0;
	bool wl_inheritor_updated = false, needs_wake = false;
	uint64_t kdata = kev->ext[EV_EXTIDX_WL_VALUE];
	uint64_t mask = kev->ext[EV_EXTIDX_WL_MASK];
	uint64_t udata = 0;
	struct turnstile *ts = TURNSTILE_NULL;

	filt_wllock(kqwl);

again:
	new_owner = cur_owner = kqwl->kqwl_owner;

	/*
	 * Phase 1:
	 *
	 * If asked, load the uint64 value at the user provided address and compare
	 * it against the passed in mask and expected value.
	 *
	 * If NOTE_WL_DISCOVER_OWNER is specified, translate the loaded name as
	 * a thread reference.
	 *
	 * If NOTE_WL_END_OWNERSHIP is specified and the currently known owner is
	 * the current thread, then end ownership.
	 *
	 * Lastly decide whether we need to perform a QoS update.
	 */
	if (uaddr) {
		/*
		 * Until <rdar://problem/24999882> exists,
		 * disabling preemption copyin forces any
		 * vm_fault we encounter to fail.
		 */
		error = copyin_atomic64(uaddr, &udata);

		/*
		 * If we get EFAULT, drop locks, and retry.
		 * If we still get an error report it,
		 * else assume the memory has been faulted
		 * and attempt to copyin under lock again.
		 */
		switch (error) {
		case 0:
			break;
		case EFAULT:
			if (efault_retry-- > 0) {
				filt_wlunlock(kqwl);
				error = copyin_atomic64(uaddr, &udata);
				filt_wllock(kqwl);
				if (error == 0) {
					/* owner may have changed while unlocked: restart */
					goto again;
				}
			}
			OS_FALLTHROUGH;
		default:
			goto out;
		}

		/* Update state as copied in. */
		kev->ext[EV_EXTIDX_WL_VALUE] = udata;

		if ((udata & mask) != (kdata & mask)) {
			error = ESTALE;
		} else if (kev->fflags & NOTE_WL_DISCOVER_OWNER) {
			/*
			 * Decipher the owner port name, and translate accordingly.
			 * The low 2 bits were borrowed for other flags, so mask them off.
			 *
			 * Then attempt translation to a thread reference or fail.
			 */
			mach_port_name_t name = (mach_port_name_t)udata & ~0x3;
			if (name != MACH_PORT_NULL) {
				name = ipc_entry_name_mask(name);
				extra_thread_ref = port_name_to_thread(name,
				    PORT_INTRANS_THREAD_IN_CURRENT_TASK);
				if (extra_thread_ref == THREAD_NULL) {
					error = EOWNERDEAD;
					goto out;
				}
				new_owner = extra_thread_ref;
			}
		}
	}

	if ((kev->fflags & NOTE_WL_END_OWNERSHIP) && new_owner == current_thread()) {
		new_owner = THREAD_NULL;
	}

	if (error == 0) {
		if ((kev->fflags & NOTE_WL_THREAD_REQUEST) && (kev->flags & EV_DELETE)) {
			action = KQWL_UTQ_SET_QOS_INDEX;
		} else if (qos_index && kqr->tr_kq_qos_index != qos_index) {
			action = KQWL_UTQ_SET_QOS_INDEX;
		}

		if (op == FILT_WLTOUCH) {
			/*
			 * Save off any additional fflags/data we just accepted
			 * But only keep the last round of "update" bits we acted on which helps
			 * debugging a lot.
			 */
			kn->kn_sfflags &= ~NOTE_WL_UPDATES_MASK;
			kn->kn_sfflags |= kev->fflags;
			if (kev->fflags & NOTE_WL_SYNC_WAKE) {
				needs_wake = (kn->kn_thread != THREAD_NULL);
			}
		} else if (op == FILT_WLDROP) {
			if ((kn->kn_sfflags & (NOTE_WL_SYNC_WAIT | NOTE_WL_SYNC_WAKE)) ==
			    NOTE_WL_SYNC_WAIT) {
				/*
				 * When deleting a SYNC_WAIT knote that hasn't been woken up
				 * explicitly, issue a wake up.
				 */
				kn->kn_sfflags |= NOTE_WL_SYNC_WAKE;
				needs_wake = (kn->kn_thread != THREAD_NULL);
			}
		}
	}

	/*
	 * Phase 2:
	 *
	 * Commit ownership and QoS changes if any, possibly wake up waiters
	 */

	if (cur_owner == new_owner && action == KQWL_UTQ_NONE && !needs_wake) {
		goto out;
	}

	kqlock(kqwl);

	/* If already tracked as servicer, don't track as owner */
	if (new_owner == kqr_thread(kqr)) {
		new_owner = THREAD_NULL;
	}

	if (cur_owner != new_owner) {
		kqwl->kqwl_owner = new_owner;
		if (new_owner == extra_thread_ref) {
			/* we just transfered this ref to kqwl_owner */
			extra_thread_ref = THREAD_NULL;
		}
		cur_override = kqworkloop_override(kqwl);

		if (new_owner) {
			/* override it before we drop the old */
			if (cur_override != THREAD_QOS_UNSPECIFIED) {
				thread_add_kevent_override(new_owner, cur_override);
			}
			if (kqr_thread_requested_pending(kqr)) {
				if (action == KQWL_UTQ_NONE) {
					action = KQWL_UTQ_REDRIVE_EVENTS;
				}
			}
		} else if (action == KQWL_UTQ_NONE &&
		    !kqr_thread_requested(kqr) &&
		    kqwl->kqwl_wakeup_qos) {
			action = KQWL_UTQ_REDRIVE_EVENTS;
		}
	}

	if (action != KQWL_UTQ_NONE) {
		kqworkloop_update_threads_qos(kqwl, action, qos_index);
	}

	ts = kqwl->kqwl_turnstile;
	if (cur_owner != new_owner && ts) {
		if (action == KQWL_UTQ_REDRIVE_EVENTS) {
			/*
			 * Note that when action is KQWL_UTQ_REDRIVE_EVENTS,
			 * the code went through workq_kern_threadreq_initiate()
			 * and the workqueue has set the inheritor already
			 */
			assert(filt_wlturnstile_interlock_is_workq(kqwl));
		} else if (filt_wlturnstile_interlock_is_workq(kqwl)) {
			workq_kern_threadreq_lock(kqwl->kqwl_p);
			workq_kern_threadreq_update_inheritor(kqwl->kqwl_p, kqr, new_owner,
			    ts, TURNSTILE_IMMEDIATE_UPDATE);
			workq_kern_threadreq_unlock(kqwl->kqwl_p);
			if (!filt_wlturnstile_interlock_is_workq(kqwl)) {
				/*
				 * If the workq is no longer the interlock, then
				 * workq_kern_threadreq_update_inheritor() has finished a bind
				 * and we need to fallback to the regular path.
				 */
				filt_wlupdate_inheritor(kqwl, ts, TURNSTILE_IMMEDIATE_UPDATE);
			}
			wl_inheritor_updated = true;
		} else {
			filt_wlupdate_inheritor(kqwl, ts, TURNSTILE_IMMEDIATE_UPDATE);
			wl_inheritor_updated = true;
		}

		/*
		 * We need a turnstile reference because we are dropping the interlock
		 * and the caller has not called turnstile_prepare.
		 */
		if (wl_inheritor_updated) {
			turnstile_reference(ts);
		}
	}

	if (needs_wake && ts) {
		waitq_wakeup64_thread(&ts->ts_waitq, knote_filt_wev64(kn),
		    kn->kn_thread, THREAD_AWAKENED);
		if (op == FILT_WLATTACH || op == FILT_WLTOUCH) {
			disable_preemption();
			error = EPREEMPTDISABLED;
		}
	}

	kqunlock(kqwl);

out:
	/*
	 * Phase 3:
	 *
	 * Unlock and cleanup various lingering references and things.
	 */
	filt_wlunlock(kqwl);

#if CONFIG_WORKLOOP_DEBUG
	KQWL_HISTORY_WRITE_ENTRY(kqwl, {
		.updater = current_thread(),
		.servicer = kqr_thread(kqr), /* Note: racy */
		.old_owner = cur_owner,
		.new_owner = new_owner,

		.kev_ident  = kev->ident,
		.error      = (int16_t)error,
		.kev_flags  = kev->flags,
		.kev_fflags = kev->fflags,

		.kev_mask   = mask,
		.kev_value  = kdata,
		.in_value   = udata,
	});
#endif // CONFIG_WORKLOOP_DEBUG

	if (wl_inheritor_updated) {
		turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_NOT_HELD);
		turnstile_deallocate_safe(ts);
	}

	if (cur_owner && new_owner != cur_owner) {
		if (cur_override != THREAD_QOS_UNSPECIFIED) {
			thread_drop_kevent_override(cur_owner);
		}
		thread_deallocate_safe(cur_owner);
	}
	if (extra_thread_ref) {
		thread_deallocate_safe(extra_thread_ref);
	}
	return error;
}
2256
/*
 * Remembers the last updated that came in from userspace for debugging reasons.
 * - fflags is mirrored from the userspace kevent
 * - ext[i, i != VALUE] is mirrored from the userspace kevent
 * - ext[VALUE] is set to what the kernel loaded atomically
 * - data is set to the error if any (stored in kn_sdata)
 */
static inline void
filt_wlremember_last_update(struct knote *kn, struct kevent_qos_s *kev,
    int error)
{
	kn->kn_fflags = kev->fflags;
	kn->kn_sdata = error;
	memcpy(kn->kn_ext, kev->ext, sizeof(kev->ext));
}
2272
/*
 * Update path for NOTE_WL_SYNC_IPC knotes (attach / touch / drop).
 *
 * Performs the same userspace-value debounce as filt_wlupdate(), but no
 * ownership or QoS tracking is involved; the lock serializing the state
 * is the knote lock.
 *
 * Returns 0, ESTALE (debounce failed), EPREEMPTDISABLED (attach succeeded
 * with preemption left disabled on purpose), or a copyin error.
 */
static int
filt_wlupdate_sync_ipc(struct kqworkloop *kqwl, struct knote *kn,
    struct kevent_qos_s *kev, int op)
{
	user_addr_t uaddr = (user_addr_t) kev->ext[EV_EXTIDX_WL_ADDR];
	uint64_t kdata = kev->ext[EV_EXTIDX_WL_VALUE];
	uint64_t mask = kev->ext[EV_EXTIDX_WL_MASK];
	uint64_t udata = 0;
	int efault_retry = EVFILT_WORKLOOP_EFAULT_RETRY_COUNT;
	int error = 0;

	if (op == FILT_WLATTACH) {
		(void)kqueue_alloc_turnstile(&kqwl->kqwl_kqueue);
	} else if (uaddr == 0) {
		return 0;
	}

	filt_wllock(kqwl);

again:

	/*
	 * Do the debounce thing, the lock serializing the state is the knote lock.
	 */
	if (uaddr) {
		/*
		 * Until <rdar://problem/24999882> exists,
		 * disabling preemption copyin forces any
		 * vm_fault we encounter to fail.
		 */
		error = copyin_atomic64(uaddr, &udata);

		/*
		 * If we get EFAULT, drop locks, and retry.
		 * If we still get an error report it,
		 * else assume the memory has been faulted
		 * and attempt to copyin under lock again.
		 */
		switch (error) {
		case 0:
			break;
		case EFAULT:
			if (efault_retry-- > 0) {
				filt_wlunlock(kqwl);
				error = copyin_atomic64(uaddr, &udata);
				filt_wllock(kqwl);
				if (error == 0) {
					goto again;
				}
			}
			OS_FALLTHROUGH;
		default:
			goto out;
		}

		kev->ext[EV_EXTIDX_WL_VALUE] = udata;
		kn->kn_ext[EV_EXTIDX_WL_VALUE] = udata;

		if ((udata & mask) != (kdata & mask)) {
			error = ESTALE;
			goto out;
		}
	}

	if (op == FILT_WLATTACH) {
		error = filt_wlattach_sync_ipc(kn);
		if (error == 0) {
			disable_preemption();
			error = EPREEMPTDISABLED;
		}
	}

out:
	filt_wlunlock(kqwl);
	return error;
}
2349
/*
 * EVFILT_WORKLOOP f_attach.
 *
 * Validates the NOTE_WL_* command against the knote identity/flags, then runs
 * the shared update path (filt_wlupdate / filt_wlupdate_sync_ipc).
 *
 * A successfully attached NOTE_WL_THREAD_REQUEST fires immediately
 * (FILTER_ACTIVE) and is forced EV_CLEAR; a NOTE_WL_SYNC_WAIT attach prepares
 * the calling thread to block via kevent_register_wait_prepare().
 */
static int
filt_wlattach(struct knote *kn, struct kevent_qos_s *kev)
{
	struct kqueue *kq = knote_get_kq(kn);
	struct kqworkloop *kqwl = (struct kqworkloop *)kq;
	int error = 0, result = 0;
	kq_index_t qos_index = 0;

	if (__improbable((kq->kq_state & KQ_WORKLOOP) == 0)) {
		error = ENOTSUP;
		goto out;
	}

	uint32_t command = (kn->kn_sfflags & NOTE_WL_COMMANDS_MASK);
	switch (command) {
	case NOTE_WL_THREAD_REQUEST:
		/* thread requests must use the workloop's dynamic id as ident */
		if (kn->kn_id != kqwl->kqwl_dynamicid) {
			error = EINVAL;
			goto out;
		}
		qos_index = _pthread_priority_thread_qos(kn->kn_qos);
		if (qos_index == THREAD_QOS_UNSPECIFIED) {
			error = ERANGE;
			goto out;
		}
		if (kqwl->kqwl_request.tr_kq_qos_index) {
			/*
			 * There already is a thread request, and well, you're only allowed
			 * one per workloop, so fail the attach.
			 */
			error = EALREADY;
			goto out;
		}
		break;
	case NOTE_WL_SYNC_WAIT:
	case NOTE_WL_SYNC_WAKE:
		/* sync knotes must NOT use the dynamic id, and must be EV_DISABLE */
		if (kn->kn_id == kqwl->kqwl_dynamicid) {
			error = EINVAL;
			goto out;
		}
		if ((kn->kn_flags & EV_DISABLE) == 0) {
			error = EINVAL;
			goto out;
		}
		if (kn->kn_sfflags & NOTE_WL_END_OWNERSHIP) {
			error = EINVAL;
			goto out;
		}
		break;

	case NOTE_WL_SYNC_IPC:
		if ((kn->kn_flags & EV_DISABLE) == 0) {
			error = EINVAL;
			goto out;
		}
		if (kn->kn_sfflags & (NOTE_WL_UPDATE_QOS | NOTE_WL_DISCOVER_OWNER)) {
			error = EINVAL;
			goto out;
		}
		break;
	default:
		error = EINVAL;
		goto out;
	}

	if (command == NOTE_WL_SYNC_IPC) {
		error = filt_wlupdate_sync_ipc(kqwl, kn, kev, FILT_WLATTACH);
	} else {
		error = filt_wlupdate(kqwl, kn, kev, qos_index, FILT_WLATTACH);
	}

	if (error == EPREEMPTDISABLED) {
		error = 0;
		result = FILTER_THREADREQ_NODEFEER;
	}
out:
	if (error) {
		/* If userland wants ESTALE to be hidden, fail the attach anyway */
		if (error == ESTALE && (kn->kn_sfflags & NOTE_WL_IGNORE_ESTALE)) {
			error = 0;
		}
		knote_set_error(kn, error);
		return result;
	}
	if (command == NOTE_WL_SYNC_WAIT) {
		return kevent_register_wait_prepare(kn, kev, result);
	}
	/* Just attaching the thread request successfully will fire it */
	if (command == NOTE_WL_THREAD_REQUEST) {
		/*
		 * Thread Request knotes need an explicit touch to be active again,
		 * so delivering an event needs to also consume it.
		 */
		kn->kn_flags |= EV_CLEAR;
		return result | FILTER_ACTIVE;
	}
	return result;
}
2448
/*
 * Continuation run when a NOTE_WL_SYNC_WAIT thread resumes.
 *
 * Tears down the turnstile prepared by filt_wlpost_register_wait() (taking
 * the workqueue lock as interlock when the workq still owns it), translates
 * an interrupted wait into EINTR, and returns to userspace. Never returns.
 */
static void __dead2
filt_wlwait_continue(void *parameter, wait_result_t wr)
{
	struct _kevent_register *cont_args = parameter;
	struct kqworkloop *kqwl = cont_args->kqwl;

	kqlock(kqwl);
	if (filt_wlturnstile_interlock_is_workq(kqwl)) {
		workq_kern_threadreq_lock(kqwl->kqwl_p);
		turnstile_complete((uintptr_t)kqwl, &kqwl->kqwl_turnstile, NULL, TURNSTILE_WORKLOOPS);
		workq_kern_threadreq_unlock(kqwl->kqwl_p);
	} else {
		turnstile_complete((uintptr_t)kqwl, &kqwl->kqwl_turnstile, NULL, TURNSTILE_WORKLOOPS);
	}
	kqunlock(kqwl);

	turnstile_cleanup();

	if (wr == THREAD_INTERRUPTED) {
		cont_args->kev.flags |= EV_ERROR;
		cont_args->kev.data = EINTR;
	} else if (wr != THREAD_AWAKENED) {
		panic("Unexpected wait result: %d", wr);
	}

	kevent_register_wait_return(cont_args);
}
2476
/*
 * Called with the workloop mutex held, most of the time never returns as it
 * calls filt_wlwait_continue through a continuation.
 *
 * Prepares the workloop turnstile, points its inheritor at the current
 * owner (through the workqueue subsystem when it owns the interlock),
 * asserts a wait on the knote's event, and blocks.
 */
static void __dead2
filt_wlpost_register_wait(struct uthread *uth, struct knote *kn,
    struct _kevent_register *cont_args)
{
	struct kqworkloop *kqwl = cont_args->kqwl;
	workq_threadreq_t kqr = &kqwl->kqwl_request;
	struct turnstile *ts;
	bool workq_locked = false;

	kqlock_held(kqwl);

	if (filt_wlturnstile_interlock_is_workq(kqwl)) {
		workq_kern_threadreq_lock(kqwl->kqwl_p);
		workq_locked = true;
	}

	ts = turnstile_prepare((uintptr_t)kqwl, &kqwl->kqwl_turnstile,
	    TURNSTILE_NULL, TURNSTILE_WORKLOOPS);

	if (workq_locked) {
		workq_kern_threadreq_update_inheritor(kqwl->kqwl_p,
		    &kqwl->kqwl_request, kqwl->kqwl_owner, ts,
		    TURNSTILE_DELAYED_UPDATE);
		if (!filt_wlturnstile_interlock_is_workq(kqwl)) {
			/*
			 * if the interlock is no longer the workqueue lock,
			 * then we don't need to hold it anymore.
			 */
			workq_kern_threadreq_unlock(kqwl->kqwl_p);
			workq_locked = false;
		}
	}
	if (!workq_locked) {
		/*
		 * If the interlock is the workloop's, then it's our responsibility to
		 * call update_inheritor, so just do it.
		 */
		filt_wlupdate_inheritor(kqwl, ts, TURNSTILE_DELAYED_UPDATE);
	}

	thread_set_pending_block_hint(get_machthread(uth), kThreadWaitWorkloopSyncWait);
	waitq_assert_wait64(&ts->ts_waitq, knote_filt_wev64(kn),
	    THREAD_ABORTSAFE, TIMEOUT_WAIT_FOREVER);

	if (workq_locked) {
		workq_kern_threadreq_unlock(kqwl->kqwl_p);
	}

	/* reference the owner (or servicer) we are about to block on, if any */
	thread_t thread = kqwl->kqwl_owner ?: kqr_thread(kqr);
	if (thread) {
		thread_reference(thread);
	}

	kevent_register_wait_block(ts, thread, filt_wlwait_continue, cont_args);
}
2536
/* called in stackshot context to report the thread responsible for blocking this thread */
void
kdp_workloop_sync_wait_find_owner(__assert_only thread_t thread,
    event64_t event, thread_waitinfo_t *waitinfo)
{
	struct knote *kn = (struct knote *)event;

	/* debugger context: validate pointers via zone_require, no locks taken */
	zone_require(knote_zone, kn);

	assert(kn->kn_thread == thread);

	struct kqueue *kq = knote_get_kq(kn);

	zone_require(kqworkloop_zone, kq);
	assert(kq->kq_state & KQ_WORKLOOP);

	struct kqworkloop *kqwl = (struct kqworkloop *)kq;
	workq_threadreq_t kqr = &kqwl->kqwl_request;

	thread_t kqwl_owner = kqwl->kqwl_owner;

	/* report the owner, else the (possibly binding) servicer, else pending */
	if (kqwl_owner != THREAD_NULL) {
		thread_require(kqwl_owner);
		waitinfo->owner = thread_tid(kqwl->kqwl_owner);
	} else if ((kqr->tr_state >= WORKQ_TR_STATE_BINDING) && (kqr->tr_thread != NULL)) {
		thread_require(kqr->tr_thread);
		waitinfo->owner = thread_tid(kqr->tr_thread);
	} else if (kqr_thread_requested_pending(kqr)) { /* > idle, < bound */
		waitinfo->owner = STACKSHOT_WAITOWNER_THREQUESTED;
	} else {
		waitinfo->owner = 0;
	}

	waitinfo->context = kqwl->kqwl_dynamicid;
}
2572
2573 static void
filt_wldetach(struct knote * kn)2574 filt_wldetach(struct knote *kn)
2575 {
2576 if (kn->kn_sfflags & NOTE_WL_SYNC_IPC) {
2577 filt_wldetach_sync_ipc(kn);
2578 } else if (kn->kn_thread) {
2579 kevent_register_wait_cleanup(kn);
2580 }
2581 }
2582
2583 static int
filt_wlvalidate_kev_flags(struct knote * kn,struct kevent_qos_s * kev,thread_qos_t * qos_index)2584 filt_wlvalidate_kev_flags(struct knote *kn, struct kevent_qos_s *kev,
2585 thread_qos_t *qos_index)
2586 {
2587 uint32_t new_commands = kev->fflags & NOTE_WL_COMMANDS_MASK;
2588 uint32_t sav_commands = kn->kn_sfflags & NOTE_WL_COMMANDS_MASK;
2589
2590 if ((kev->fflags & NOTE_WL_DISCOVER_OWNER) && (kev->flags & EV_DELETE)) {
2591 return EINVAL;
2592 }
2593 if (kev->fflags & NOTE_WL_UPDATE_QOS) {
2594 if (kev->flags & EV_DELETE) {
2595 return EINVAL;
2596 }
2597 if (sav_commands != NOTE_WL_THREAD_REQUEST) {
2598 return EINVAL;
2599 }
2600 if (!(*qos_index = _pthread_priority_thread_qos(kev->qos))) {
2601 return ERANGE;
2602 }
2603 }
2604
2605 switch (new_commands) {
2606 case NOTE_WL_THREAD_REQUEST:
2607 /* thread requests can only update themselves */
2608 if (sav_commands != NOTE_WL_THREAD_REQUEST) {
2609 return EINVAL;
2610 }
2611 break;
2612
2613 case NOTE_WL_SYNC_WAIT:
2614 if (kev->fflags & NOTE_WL_END_OWNERSHIP) {
2615 return EINVAL;
2616 }
2617 goto sync_checks;
2618
2619 case NOTE_WL_SYNC_WAKE:
2620 sync_checks:
2621 if (!(sav_commands & (NOTE_WL_SYNC_WAIT | NOTE_WL_SYNC_WAKE))) {
2622 return EINVAL;
2623 }
2624 if ((kev->flags & (EV_ENABLE | EV_DELETE)) == EV_ENABLE) {
2625 return EINVAL;
2626 }
2627 break;
2628
2629 case NOTE_WL_SYNC_IPC:
2630 if (sav_commands != NOTE_WL_SYNC_IPC) {
2631 return EINVAL;
2632 }
2633 if ((kev->flags & (EV_ENABLE | EV_DELETE)) == EV_ENABLE) {
2634 return EINVAL;
2635 }
2636 break;
2637
2638 default:
2639 return EINVAL;
2640 }
2641 return 0;
2642 }
2643
/*
 * EVFILT_WORKLOOP f_touch: apply a userspace kevent update to an existing
 * knote after validating its flags against the attach-time command.
 *
 * A NOTE_WL_SYNC_WAIT touch that wasn't already woken blocks the caller;
 * a NOTE_WL_THREAD_REQUEST touch re-fires the request (FILTER_ACTIVE).
 */
static int
filt_wltouch(struct knote *kn, struct kevent_qos_s *kev)
{
	struct kqworkloop *kqwl = (struct kqworkloop *)knote_get_kq(kn);
	thread_qos_t qos_index = THREAD_QOS_UNSPECIFIED;
	int result = 0;

	int error = filt_wlvalidate_kev_flags(kn, kev, &qos_index);
	if (error) {
		goto out;
	}

	uint32_t command = kev->fflags & NOTE_WL_COMMANDS_MASK;
	if (command == NOTE_WL_SYNC_IPC) {
		error = filt_wlupdate_sync_ipc(kqwl, kn, kev, FILT_WLTOUCH);
	} else {
		error = filt_wlupdate(kqwl, kn, kev, qos_index, FILT_WLTOUCH);
		filt_wlremember_last_update(kn, kev, error);
	}
	if (error == EPREEMPTDISABLED) {
		error = 0;
		result = FILTER_THREADREQ_NODEFEER;
	}

out:
	if (error) {
		if (error == ESTALE && (kev->fflags & NOTE_WL_IGNORE_ESTALE)) {
			/* If userland wants ESTALE to be hidden, do not activate */
			return result;
		}
		kev->flags |= EV_ERROR;
		kev->data = error;
		return result;
	}
	if (command == NOTE_WL_SYNC_WAIT && !(kn->kn_sfflags & NOTE_WL_SYNC_WAKE)) {
		return kevent_register_wait_prepare(kn, kev, result);
	}
	/* Just touching the thread request successfully will fire it */
	if (command == NOTE_WL_THREAD_REQUEST) {
		if (kev->fflags & NOTE_WL_UPDATE_QOS) {
			result |= FILTER_UPDATE_REQ_QOS;
		}
		result |= FILTER_ACTIVE;
	}
	return result;
}
2690
/*
 * EVFILT_WORKLOOP f_allow_drop: validate and commit an EV_DELETE.
 *
 * Returns true to allow the drop; returns false to veto it, setting
 * EV_ERROR/data on the kevent unless userspace asked to ignore ESTALE.
 */
static bool
filt_wlallow_drop(struct knote *kn, struct kevent_qos_s *kev)
{
	struct kqworkloop *kqwl = (struct kqworkloop *)knote_get_kq(kn);

	int error = filt_wlvalidate_kev_flags(kn, kev, NULL);
	if (error) {
		goto out;
	}

	uint32_t command = (kev->fflags & NOTE_WL_COMMANDS_MASK);
	if (command == NOTE_WL_SYNC_IPC) {
		error = filt_wlupdate_sync_ipc(kqwl, kn, kev, FILT_WLDROP);
	} else {
		error = filt_wlupdate(kqwl, kn, kev, 0, FILT_WLDROP);
		filt_wlremember_last_update(kn, kev, error);
	}
	/* FILT_WLDROP never leaves preemption disabled */
	assert(error != EPREEMPTDISABLED);

out:
	if (error) {
		if (error == ESTALE && (kev->fflags & NOTE_WL_IGNORE_ESTALE)) {
			return false;
		}
		kev->flags |= EV_ERROR;
		kev->data = error;
		return false;
	}
	return true;
}
2721
/*
 * EVFILT_WORKLOOP f_process (thread request knotes only, per the assert).
 *
 * If the workloop has an owner, re-activate instead of delivering so the
 * event isn't swallowed; otherwise deliver the event and have the servicer
 * pick up the request's max QoS.
 */
static int
filt_wlprocess(struct knote *kn, struct kevent_qos_s *kev)
{
	struct kqworkloop *kqwl = (struct kqworkloop *)knote_get_kq(kn);
	int rc = 0;

	assert(kn->kn_sfflags & NOTE_WL_THREAD_REQUEST);

	kqlock(kqwl);

	if (kqwl->kqwl_owner) {
		/*
		 * <rdar://problem/33584321> userspace sometimes due to events being
		 * delivered but not triggering a drain session can cause a process
		 * of the thread request knote.
		 *
		 * When that happens, the automatic deactivation due to process
		 * would swallow the event, so we have to activate the knote again.
		 */
		knote_activate(kqwl, kn, FILTER_ACTIVE);
	} else {
#if DEBUG || DEVELOPMENT
		if (kevent_debug_flags & KEVENT_PANIC_ON_NON_ENQUEUED_PROCESS) {
			/*
			 * see src/queue_internal.h in libdispatch
			 */
#define DISPATCH_QUEUE_ENQUEUED 0x1ull
			user_addr_t addr = CAST_USER_ADDR_T(kn->kn_ext[EV_EXTIDX_WL_ADDR]);
			task_t t = current_task();
			uint64_t val;
			if (addr && task_is_active(t) && !task_is_halting(t) &&
			    copyin_atomic64(addr, &val) == 0 &&
			    val && (val & DISPATCH_QUEUE_ENQUEUED) == 0 &&
			    (val >> 48) != 0xdead && (val >> 48) != 0 && (val >> 48) != 0xffff) {
				panic("kevent: workloop %#016llx is not enqueued "
				    "(kn:%p dq_state:%#016llx kev.dq_state:%#016llx)",
				    kn->kn_udata, kn, val, kn->kn_ext[EV_EXTIDX_WL_VALUE]);
			}
		}
#endif
		knote_fill_kevent(kn, kev, 0);
		kev->fflags = kn->kn_sfflags;
		rc |= FILTER_ACTIVE;
	}

	kqunlock(kqwl);

	if (rc & FILTER_ACTIVE) {
		workq_thread_set_max_qos(kqwl->kqwl_p, &kqwl->kqwl_request);
	}
	return rc;
}
2774
/*
 * EVFILT_WORKLOOP filter operations. There is no kernel event source
 * (f_event is filt_bad_event); everything is driven by userspace commands.
 */
SECURITY_READ_ONLY_EARLY(static struct filterops) workloop_filtops = {
	.f_extended_codes = true,
	.f_attach = filt_wlattach,
	.f_detach = filt_wldetach,
	.f_event = filt_bad_event,
	.f_touch = filt_wltouch,
	.f_process = filt_wlprocess,
	.f_allow_drop = filt_wlallow_drop,
	.f_post_register_wait = filt_wlpost_register_wait,
};
2785
2786 #pragma mark - kqueues allocation and deallocation
2787
2788 OS_NOINLINE
2789 static void
2790 kqworkloop_dealloc(struct kqworkloop *, bool hash_remove);
2791
/* Tries to take a kqworkloop reference; returns false if none could be taken. */
static inline bool
kqworkloop_try_retain(struct kqworkloop *kqwl)
{
	return os_ref_retain_try_raw(&kqwl->kqwl_retains, NULL);
}
2797
2798 static inline void
kqworkloop_retain(struct kqworkloop * kqwl)2799 kqworkloop_retain(struct kqworkloop *kqwl)
2800 {
2801 return os_ref_retain_raw(&kqwl->kqwl_retains, NULL);
2802 }
2803
/*
 * Retains a kqueue when it is dynamically allocated (KQ_DYNAMIC, i.e. a
 * workloop); other kqueue kinds are not reference counted here.
 */
OS_ALWAYS_INLINE
static inline void
kqueue_retain(kqueue_t kqu)
{
	if (kqu.kq->kq_state & KQ_DYNAMIC) {
		kqworkloop_retain(kqu.kqwl);
	}
}
2812
/* Drops a kqworkloop reference that is known not to be the last one. */
OS_ALWAYS_INLINE
static inline void
kqworkloop_release_live(struct kqworkloop *kqwl)
{
	os_ref_release_live_raw(&kqwl->kqwl_retains, NULL);
}
2819
/* Drops a non-last reference on a kqueue when it is a workloop (KQ_DYNAMIC). */
OS_ALWAYS_INLINE
static inline void
kqueue_release_live(kqueue_t kqu)
{
	if (kqu.kq->kq_state & KQ_DYNAMIC) {
		kqworkloop_release_live(kqu.kqwl);
	}
}
2828
/*
 * Drops a kqworkloop reference; on the last release, deallocates the
 * workloop and removes it from the hash (hash_remove = true).
 */
OS_ALWAYS_INLINE
static inline void
kqworkloop_release(struct kqworkloop *kqwl)
{
	if (os_ref_release_raw(&kqwl->kqwl_retains, NULL) == 0) {
		kqworkloop_dealloc(kqwl, true);
	}
}
2837
/* Drops a kqueue reference when it is a workloop (KQ_DYNAMIC); may free it. */
OS_ALWAYS_INLINE
static inline void
kqueue_release(kqueue_t kqu)
{
	if (kqu.kq->kq_state & KQ_DYNAMIC) {
		kqworkloop_release(kqu.kqwl);
	}
}
2846
/*!
 * @function kqueue_destroy
 *
 * @brief
 * Common part to all kqueue dealloc functions.
 *
 * @discussion
 * Destroys the embedded lock (the only resource kqueue_init set up) and
 * frees the structure back to the caller-specified zone.
 */
OS_NOINLINE
static void
kqueue_destroy(kqueue_t kqu, zone_t zone)
{
	lck_spin_destroy(&kqu.kq->kq_lock, &kq_lck_grp);

	zfree(zone, kqu.kq);
}
2861
/*!
 * @function kqueue_init
 *
 * @brief
 * Common part to all kqueue alloc functions.
 *
 * @discussion
 * Only initializes the embedded lock; kind-specific setup is done by the
 * callers. Returns its argument for call chaining.
 */
static kqueue_t
kqueue_init(kqueue_t kqu)
{
	lck_spin_init(&kqu.kq->kq_lock, &kq_lck_grp, LCK_ATTR_NULL);
	return kqu;
}
2874
2875 #pragma mark kqfile allocation and deallocation
2876
/*!
 * @function kqueue_dealloc
 *
 * @brief
 * Detach all knotes from a kqfile and free it.
 *
 * @discussion
 * We walk each list looking for knotes referencing this
 * this kqueue. If we find one, we try to drop it. But
 * if we fail to get a drop reference, that will wait
 * until it is dropped. So, we can just restart again
 * safe in the assumption that the list will eventually
 * not contain any more references to this kqueue (either
 * we dropped them all, or someone else did).
 *
 * Assumes no new events are being added to the kqueue.
 * Nothing locked on entry or exit.
 */
void
kqueue_dealloc(struct kqueue *kq)
{
	KNOTE_LOCK_CTX(knlc);
	struct proc *p = kq->kq_p;
	struct filedesc *fdp = &p->p_fd;
	struct knote *kn;

	/* only plain kqfiles come through here, never workq/workloop kqueues */
	assert(kq && (kq->kq_state & (KQ_WORKLOOP | KQ_WORKQ)) == 0);

	proc_fdlock(p);
	for (int i = 0; i < fdp->fd_knlistsize; i++) {
		kn = SLIST_FIRST(&fdp->fd_knlist[i]);
		while (kn != NULL) {
			if (kq == knote_get_kq(kn)) {
				/* drop fdlock while holding kqlock to drop the knote */
				kqlock(kq);
				proc_fdunlock(p);
				if (knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ON_SUCCESS)) {
					knote_drop(kq, kn, &knlc);
				}
				proc_fdlock(p);
				/* start over at beginning of list */
				kn = SLIST_FIRST(&fdp->fd_knlist[i]);
				continue;
			}
			kn = SLIST_NEXT(kn, kn_link);
		}
	}

	knhash_lock(fdp);
	proc_fdunlock(p);

	/* repeat the same dance for the knote hash table */
	if (fdp->fd_knhashmask != 0) {
		for (int i = 0; i < (int)fdp->fd_knhashmask + 1; i++) {
			kn = SLIST_FIRST(&fdp->fd_knhash[i]);
			while (kn != NULL) {
				if (kq == knote_get_kq(kn)) {
					kqlock(kq);
					knhash_unlock(fdp);
					if (knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ON_SUCCESS)) {
						knote_drop(kq, kn, &knlc);
					}
					knhash_lock(fdp);
					/* start over at beginning of list */
					kn = SLIST_FIRST(&fdp->fd_knhash[i]);
					continue;
				}
				kn = SLIST_NEXT(kn, kn_link);
			}
		}
	}
	knhash_unlock(fdp);

	kqueue_destroy(kq, kqfile_zone);
}
2950
/*!
 * @function kqueue_alloc
 *
 * @brief
 * Allocate a kqfile.
 *
 * @discussion
 * The allocation is zeroed (Z_ZERO), hence the TAILQ_INIT_AFTER_BZERO
 * variants for the embedded queues.
 */
struct kqueue *
kqueue_alloc(struct proc *p)
{
	struct kqfile *kqf;

	/*
	 * kqfiles are created with kqueue() so we need to wait for
	 * the first kevent syscall to know which bit among
	 * KQ_KEV_{32,64,QOS} will be set in kqf_state
	 */
	kqf = zalloc_flags(kqfile_zone, Z_WAITOK | Z_ZERO);
	kqf->kqf_p = p;
	TAILQ_INIT_AFTER_BZERO(&kqf->kqf_queue);
	TAILQ_INIT_AFTER_BZERO(&kqf->kqf_suppressed);

	return kqueue_init(kqf).kq;
}
2974
/*!
 * @function kqueue_internal
 *
 * @brief
 * Core implementation for kqueue and guarded_kqueue_np()
 *
 * @discussion
 * Allocates a file descriptor and a kqfile, wires them together
 * (close-on-exec/fork, confined, FREAD|FWRITE), and returns the new fd
 * through retval.
 */
int
kqueue_internal(struct proc *p, fp_initfn_t fp_init, void *initarg, int32_t *retval)
{
	struct kqueue *kq;
	struct fileproc *fp;
	int fd, error;

	error = falloc_withinit(p, &fp, &fd, vfs_context_current(),
	    fp_init, initarg);
	if (error) {
		return error;
	}

	kq = kqueue_alloc(p);
	if (kq == NULL) {
		fp_free(p, fd, fp);
		return ENOMEM;
	}

	fp->fp_flags |= FP_CLOEXEC | FP_CLOFORK;
	fp->f_flag = FREAD | FWRITE;
	fp->f_ops = &kqueueops;
	fp_set_data(fp, kq);
	fp->f_lflags |= FG_CONFINED;

	proc_fdlock(p);
	procfdtbl_releasefd(p, fd, NULL);
	fp_drop(p, fd, fp, 1);
	proc_fdunlock(p);

	*retval = fd;
	return error;
}
3014
/*!
 * @function kqueue
 *
 * @brief
 * The kqueue syscall.
 *
 * @discussion
 * Thin wrapper around kqueue_internal() with no fileproc init callback.
 */
int
kqueue(struct proc *p, __unused struct kqueue_args *uap, int32_t *retval)
{
	return kqueue_internal(p, NULL, NULL, retval);
}
3026
3027 #pragma mark kqworkq allocation and deallocation
3028
3029 /*!
3030 * @function kqworkq_dealloc
3031 *
3032 * @brief
3033 * Deallocates a workqueue kqueue.
3034 *
3035 * @discussion
3036 * This only happens at process death, or for races with concurrent
3037 * kevent_get_kqwq calls, hence we don't have to care about knotes referencing
3038 * this kqueue, either there are none, or someone else took care of them.
3039 */
void
kqworkq_dealloc(struct kqworkq *kqwq)
{
	/* Tear down common kqueue state and return the kqwq to its zone. */
	kqueue_destroy(kqwq, kqworkq_zone);
}
3045
3046 /*!
3047 * @function kqworkq_alloc
3048 *
3049 * @brief
3050 * Allocates a workqueue kqueue.
3051 *
3052 * @discussion
3053 * This is the slow path of kevent_get_kqwq.
3054 * This takes care of making sure procs have a single workq kqueue.
3055 */
OS_NOINLINE
static struct kqworkq *
kqworkq_alloc(struct proc *p, unsigned int flags)
{
	struct kqworkq *kqwq, *tmp;

	kqwq = zalloc_flags(kqworkq_zone, Z_WAITOK | Z_ZERO);

	/* Legacy 32-bit kevents are never used with a workq kqueue. */
	assert((flags & KEVENT_FLAG_LEGACY32) == 0);
	if (flags & KEVENT_FLAG_LEGACY64) {
		kqwq->kqwq_state = KQ_WORKQ | KQ_KEV64;
	} else {
		kqwq->kqwq_state = KQ_WORKQ | KQ_KEV_QOS;
	}
	kqwq->kqwq_p = p;

	/* Allocation was zeroed; queue heads only need their links fixed up. */
	for (int i = 0; i < KQWQ_NBUCKETS; i++) {
		TAILQ_INIT_AFTER_BZERO(&kqwq->kqwq_queue[i]);
		TAILQ_INIT_AFTER_BZERO(&kqwq->kqwq_suppressed[i]);
	}
	for (int i = 0; i < KQWQ_NBUCKETS; i++) {
		/*
		 * Because of how the bucketized system works, we mix overcommit
		 * sources with not overcommit: each time we move a knote from
		 * one bucket to the next due to overrides, we'd had to track
		 * overcommitness, and it's really not worth it in the workloop
		 * enabled world that track this faithfully.
		 *
		 * Incidentally, this behaves like the original manager-based
		 * kqwq where event delivery always happened (hence is
		 * "overcommit")
		 */
		kqwq->kqwq_request[i].tr_state = WORKQ_TR_STATE_IDLE;
		kqwq->kqwq_request[i].tr_flags = WORKQ_TR_FLAG_KEVENT;
		if (i != KQWQ_QOS_MANAGER) {
			kqwq->kqwq_request[i].tr_flags |= WORKQ_TR_FLAG_OVERCOMMIT;
		}
		/* Buckets are 0-based; tr_kq_qos_index values start at 1. */
		kqwq->kqwq_request[i].tr_kq_qos_index = (kq_index_t)i + 1;
	}

	kqueue_init(kqwq);

	/*
	 * Publish the kqwq in the process. If another thread raced us and
	 * published first, free ours and return the winner's.
	 */
	if (!os_atomic_cmpxchgv(&p->p_fd.fd_wqkqueue, NULL, kqwq, &tmp, release)) {
		kqworkq_dealloc(kqwq);
		return tmp;
	}

	return kqwq;
}
3105
3106 #pragma mark kqworkloop allocation and deallocation
3107
3108 #define KQ_HASH(val, mask) (((val) ^ (val >> 8)) & (mask))
3109 #define CONFIG_KQ_HASHSIZE CONFIG_KN_HASHSIZE
3110
OS_ALWAYS_INLINE
static inline void
kqhash_lock(struct filedesc *fdp)
{
	/* Protects fd_kqhash/fd_kqhashmask; taken in spin mode. */
	lck_mtx_lock_spin_always(&fdp->fd_kqhashlock);
}
3117
OS_ALWAYS_INLINE
static inline void
kqhash_unlock(struct filedesc *fdp)
{
	/* Release the kq hash lock taken by kqhash_lock(). */
	lck_mtx_unlock(&fdp->fd_kqhashlock);
}
3124
3125 OS_ALWAYS_INLINE
3126 static inline void
kqworkloop_hash_insert_locked(struct filedesc * fdp,kqueue_id_t id,struct kqworkloop * kqwl)3127 kqworkloop_hash_insert_locked(struct filedesc *fdp, kqueue_id_t id,
3128 struct kqworkloop *kqwl)
3129 {
3130 struct kqwllist *list = &fdp->fd_kqhash[KQ_HASH(id, fdp->fd_kqhashmask)];
3131 LIST_INSERT_HEAD(list, kqwl, kqwl_hashlink);
3132 }
3133
3134 OS_ALWAYS_INLINE
3135 static inline struct kqworkloop *
kqworkloop_hash_lookup_locked(struct filedesc * fdp,kqueue_id_t id)3136 kqworkloop_hash_lookup_locked(struct filedesc *fdp, kqueue_id_t id)
3137 {
3138 struct kqwllist *list = &fdp->fd_kqhash[KQ_HASH(id, fdp->fd_kqhashmask)];
3139 struct kqworkloop *kqwl;
3140
3141 LIST_FOREACH(kqwl, list, kqwl_hashlink) {
3142 if (kqwl->kqwl_dynamicid == id) {
3143 return kqwl;
3144 }
3145 }
3146 return NULL;
3147 }
3148
static struct kqworkloop *
kqworkloop_hash_lookup_and_retain(struct filedesc *fdp, kqueue_id_t kq_id)
{
	struct kqworkloop *kqwl = NULL;

	kqhash_lock(fdp);
	/* The hash table is allocated lazily; it may not exist yet. */
	if (__probable(fdp->fd_kqhash)) {
		kqwl = kqworkloop_hash_lookup_locked(fdp, kq_id);
		/*
		 * If the try-retain fails, the workloop's refcount already
		 * hit zero: treat it as not found.
		 */
		if (kqwl && !kqworkloop_try_retain(kqwl)) {
			kqwl = NULL;
		}
	}
	kqhash_unlock(fdp);
	return kqwl;
}
3164
OS_NOINLINE
static void
kqworkloop_hash_init(struct filedesc *fdp)
{
	struct kqwllist *alloc_hash;
	u_long alloc_mask;

	/*
	 * Called and returns with the kq hash lock held. Drop the (spin)
	 * lock around hashinit() — which may allocate/block — then retake
	 * it and check whether another thread initialized the hash first.
	 */
	kqhash_unlock(fdp);
	alloc_hash = hashinit(CONFIG_KQ_HASHSIZE, M_KQUEUE, &alloc_mask);
	kqhash_lock(fdp);

	/* See if we won the race */
	if (__probable(fdp->fd_kqhashmask == 0)) {
		fdp->fd_kqhash = alloc_hash;
		fdp->fd_kqhashmask = alloc_mask;
	} else {
		/* Lost the race: free our table, again without the spin lock. */
		kqhash_unlock(fdp);
		hashdestroy(alloc_hash, M_KQUEUE, alloc_mask);
		kqhash_lock(fdp);
	}
}
3186
3187 /*
3188 * kqueue iotier override is only supported for kqueue that has
3189 * only one port as a mach port source. Updating the iotier
3190 * override on the mach port source will update the override
3191 * on kqueue as well. Since kqueue with iotier override will
3192 * only have one port attached, there is no logic for saturation
3193 * like qos override, the iotier override of mach port source
3194 * would be reflected in kevent iotier override.
3195 */
3196 void
kqueue_set_iotier_override(kqueue_t kqu,uint8_t iotier_override)3197 kqueue_set_iotier_override(kqueue_t kqu, uint8_t iotier_override)
3198 {
3199 if (!(kqu.kq->kq_state & KQ_WORKLOOP)) {
3200 return;
3201 }
3202
3203 struct kqworkloop *kqwl = kqu.kqwl;
3204 os_atomic_store(&kqwl->kqwl_iotier_override, iotier_override, relaxed);
3205 }
3206
3207 uint8_t
kqueue_get_iotier_override(kqueue_t kqu)3208 kqueue_get_iotier_override(kqueue_t kqu)
3209 {
3210 if (!(kqu.kq->kq_state & KQ_WORKLOOP)) {
3211 return THROTTLE_LEVEL_END;
3212 }
3213
3214 struct kqworkloop *kqwl = kqu.kqwl;
3215 return os_atomic_load(&kqwl->kqwl_iotier_override, relaxed);
3216 }
3217
3218 #if CONFIG_PREADOPT_TG
3219 /*
3220 * This function is called with a borrowed reference on the thread group without
3221 * kq lock held with the mqueue lock held. It may or may not have the knote lock
3222 * (called from both fevent as well as fattach/ftouch). Upon success, an
3223 * additional reference on the TG is taken
3224 */
void
kqueue_set_preadopted_thread_group(kqueue_t kqu, struct thread_group *tg, thread_qos_t qos)
{
	if (!(kqu.kq->kq_state & KQ_WORKLOOP)) {
		/* Only workloops preadopt thread groups; trace and bail. */
		KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_THREAD_GROUP, MACH_THREAD_GROUP_PREADOPT_NA),
		    (uintptr_t)thread_tid(current_thread()), 0, 0, 0);
		return;
	}

	struct kqworkloop *kqwl = kqu.kqwl;

	assert(qos < THREAD_QOS_LAST);

	/* Taken up front; transferred to the kqwl on success, dropped on failure. */
	thread_group_retain(tg);

	thread_group_qos_t old_tg; thread_group_qos_t new_tg;
	/* ret != 0 when the loop committed new_tg; 0 when it gave up. */
	int ret = os_atomic_rmw_loop(&kqwl->kqwl_preadopt_tg, old_tg, new_tg, relaxed, {
		if (!KQWL_CAN_ADOPT_PREADOPT_TG(old_tg)) {
			os_atomic_rmw_loop_give_up(break);
		}

		if (old_tg != KQWL_PREADOPTED_TG_NULL) {
			/*
			 * Note that old_tg could be a NULL TG pointer but with a QoS
			 * set. See also workq_thread_reset_pri.
			 *
			 * Compare the QoS of existing preadopted tg with new one and
			 * only overwrite the thread group if we have one with a higher
			 * QoS.
			 */
			thread_qos_t existing_qos = KQWL_GET_PREADOPTED_TG_QOS(old_tg);
			if (existing_qos >= qos) {
				os_atomic_rmw_loop_give_up(break);
			}
		}

		// Transfer the ref taken earlier in the function to the kqwl
		new_tg = KQWL_ENCODE_PREADOPTED_TG_QOS(tg, qos);
	});

	if (ret) {
		KQWL_PREADOPT_TG_HISTORY_WRITE_ENTRY(kqwl, KQWL_PREADOPT_OP_INCOMING_IPC, old_tg, tg);

		/* Drop the reference held by the displaced thread group, if any. */
		if (KQWL_HAS_VALID_PREADOPTED_TG(old_tg)) {
			thread_group_deallocate_safe(KQWL_GET_PREADOPTED_TG(old_tg));
		}

		os_atomic_store(&kqwl->kqwl_preadopt_tg_needs_redrive, KQWL_PREADOPT_TG_NEEDS_REDRIVE, release);
	} else {
		// We failed to write to the kqwl_preadopt_tg, drop the ref we took
		// earlier in the function
		thread_group_deallocate_safe(tg);
	}
}
3279
3280 /*
3281 * Called from fprocess of EVFILT_MACHPORT without the kqueue lock held.
3282 */
bool
kqueue_process_preadopt_thread_group(thread_t thread, struct kqueue *kq, struct thread_group *tg)
{
	bool success = false;
	if (kq->kq_state & KQ_WORKLOOP) {
		struct kqworkloop *kqwl = (struct kqworkloop *) kq;
		thread_group_qos_t old_tg;
		/* Only the SENTINEL -> PROCESSED transition hands the TG to the thread. */
		success = os_atomic_cmpxchgv(&kqwl->kqwl_preadopt_tg,
		    KQWL_PREADOPTED_TG_SENTINEL, KQWL_PREADOPTED_TG_PROCESSED,
		    &old_tg, relaxed);
		if (success) {
			thread_set_preadopt_thread_group(thread, tg);
		}

		/* Either way, the state must have settled to PROCESSED or NEVER by now. */
		__assert_only thread_group_qos_t preadopt_tg;
		preadopt_tg = os_atomic_load(&kqwl->kqwl_preadopt_tg, relaxed);
		assert(preadopt_tg == KQWL_PREADOPTED_TG_PROCESSED ||
		    preadopt_tg == KQWL_PREADOPTED_TG_NEVER);
	}

	return success;
}
3305 #endif
3306
3307 /*!
3308 * @function kqworkloop_dealloc
3309 *
3310 * @brief
3311 * Deallocates a workloop kqueue.
3312 *
3313 * @discussion
3314 * Knotes hold references on the workloop, so we can't really reach this
3315 * function unless all of these are already gone.
3316 *
3317 * Nothing locked on entry or exit.
3318 *
3319 * @param hash_remove
3320 * Whether to remove the workloop from its hash table.
3321 */
static void
kqworkloop_dealloc(struct kqworkloop *kqwl, bool hash_remove)
{
	thread_t cur_owner;

	/* Undo any outstanding owner override and drop the owner thread ref. */
	cur_owner = kqwl->kqwl_owner;
	if (cur_owner) {
		if (kqworkloop_override(kqwl) != THREAD_QOS_UNSPECIFIED) {
			thread_drop_kevent_override(cur_owner);
		}
		thread_deallocate(cur_owner);
		kqwl->kqwl_owner = THREAD_NULL;
	}

	/* Retire the workloop's turnstile, if one was ever attached. */
	if (kqwl->kqwl_state & KQ_HAS_TURNSTILE) {
		struct turnstile *ts;
		turnstile_complete((uintptr_t)kqwl, &kqwl->kqwl_turnstile,
		    &ts, TURNSTILE_WORKLOOPS);
		turnstile_cleanup();
		turnstile_deallocate(ts);
	}

	if (hash_remove) {
		struct filedesc *fdp = &kqwl->kqwl_p->p_fd;

		kqhash_lock(fdp);
		LIST_REMOVE(kqwl, kqwl_hashlink);
		kqhash_unlock(fdp);
	}

#if CONFIG_PREADOPT_TG
	/* Release the preadopted thread group reference, if a valid one is set. */
	thread_group_qos_t tg = os_atomic_load(&kqwl->kqwl_preadopt_tg, relaxed);
	if (KQWL_HAS_VALID_PREADOPTED_TG(tg)) {
		thread_group_release(KQWL_GET_PREADOPTED_TG(tg));
	}
#endif

	assert(TAILQ_EMPTY(&kqwl->kqwl_suppressed));
	assert(kqwl->kqwl_owner == THREAD_NULL);
	assert(kqwl->kqwl_turnstile == TURNSTILE_NULL);

	lck_spin_destroy(&kqwl->kqwl_statelock, &kq_lck_grp);
	kqueue_destroy(kqwl, kqworkloop_zone);
}
3366
3367 /*!
3368 * @function kqworkloop_alloc
3369 *
3370 * @brief
3371 * Allocates a workloop kqueue.
3372 */
/*
 * Initializes a zero-filled kqworkloop allocation: state bits, refcount,
 * dynamic id, thread request, queues, and state lock.
 */
static void
kqworkloop_init(struct kqworkloop *kqwl, proc_t p,
    kqueue_id_t id, workq_threadreq_param_t *trp)
{
	kqwl->kqwl_state = KQ_WORKLOOP | KQ_DYNAMIC | KQ_KEV_QOS;
	os_ref_init_raw(&kqwl->kqwl_retains, NULL);
	kqwl->kqwl_dynamicid = id;
	kqwl->kqwl_p = p;
	if (trp) {
		kqwl->kqwl_params = trp->trp_value;
	}

	/* Translate the optional scheduling parameters into thread request flags. */
	workq_tr_flags_t tr_flags = WORKQ_TR_FLAG_WORKLOOP;
	if (trp) {
		if (trp->trp_flags & TRP_PRIORITY) {
			tr_flags |= WORKQ_TR_FLAG_WL_OUTSIDE_QOS;
		}
		if (trp->trp_flags) {
			tr_flags |= WORKQ_TR_FLAG_WL_PARAMS;
		}
	}
	kqwl->kqwl_request.tr_state = WORKQ_TR_STATE_IDLE;
	kqwl->kqwl_request.tr_flags = tr_flags;
	/* THROTTLE_LEVEL_END == no iotier override (see kqueue_get_iotier_override). */
	os_atomic_store(&kqwl->kqwl_iotier_override, (uint8_t)THROTTLE_LEVEL_END, relaxed);
#if CONFIG_PREADOPT_TG
	if (task_is_app(current_task())) {
		/* Apps will never adopt a thread group that is not their own. This is a
		 * gross hack to simulate the post-process that is done in the voucher
		 * subsystem today for thread groups */
		os_atomic_store(&kqwl->kqwl_preadopt_tg, KQWL_PREADOPTED_TG_NEVER, relaxed);
	}
#endif

	/* Allocation was zeroed; queue heads only need their links fixed up. */
	for (int i = 0; i < KQWL_NBUCKETS; i++) {
		TAILQ_INIT_AFTER_BZERO(&kqwl->kqwl_queue[i]);
	}
	TAILQ_INIT_AFTER_BZERO(&kqwl->kqwl_suppressed);

	lck_spin_init(&kqwl->kqwl_statelock, &kq_lck_grp, LCK_ATTR_NULL);

	kqueue_init(kqwl);
}
3415
3416 /*!
3417 * @function kqworkloop_get_or_create
3418 *
3419 * @brief
3420 * Wrapper around kqworkloop_alloc that handles the uniquing of workloops.
3421 *
3422 * @returns
3423 * 0: success
3424 * EINVAL: invalid parameters
3425 * EEXIST: KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST is set and a collision exists.
3426 * ENOENT: KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST is set and the entry wasn't found.
3427 * ENOMEM: allocation failed
3428 */
static int
kqworkloop_get_or_create(struct proc *p, kqueue_id_t id,
    workq_threadreq_param_t *trp, unsigned int flags, struct kqworkloop **kqwlp)
{
	struct filedesc *fdp = &p->p_fd;
	struct kqworkloop *alloc_kqwl = NULL;
	struct kqworkloop *kqwl = NULL;
	int error = 0;

	/* Scheduling params are only accepted when creating a new workloop. */
	assert(!trp || (flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST));

	/* 0 and ~0 are reserved and never valid workloop ids. */
	if (id == 0 || id == (kqueue_id_t)-1) {
		return EINVAL;
	}

	for (;;) {
		kqhash_lock(fdp);
		if (__improbable(fdp->fd_kqhash == NULL)) {
			/* Drops and retakes the hash lock around the allocation. */
			kqworkloop_hash_init(fdp);
		}

		kqwl = kqworkloop_hash_lookup_locked(fdp, id);
		if (kqwl) {
			if (__improbable(flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST)) {
				/*
				 * If MUST_NOT_EXIST was passed, even if we would have failed
				 * the try_retain, it could have gone the other way, and
				 * userspace can't tell. Let'em fix their race.
				 */
				error = EEXIST;
				break;
			}

			if (__probable(kqworkloop_try_retain(kqwl))) {
				/*
				 * This is a valid live workloop !
				 */
				*kqwlp = kqwl;
				error = 0;
				break;
			}
		}

		if (__improbable(flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST)) {
			error = ENOENT;
			break;
		}

		/*
		 * We didn't find what we were looking for.
		 *
		 * If this is the second time we reach this point (alloc_kqwl != NULL),
		 * then we're done.
		 *
		 * If this is the first time we reach this point (alloc_kqwl == NULL),
		 * then try to allocate one without blocking.
		 */
		if (__probable(alloc_kqwl == NULL)) {
			alloc_kqwl = zalloc_flags(kqworkloop_zone, Z_NOWAIT | Z_ZERO);
		}
		if (__probable(alloc_kqwl)) {
			/* Initialize and publish under the hash lock; done. */
			kqworkloop_init(alloc_kqwl, p, id, trp);
			kqworkloop_hash_insert_locked(fdp, id, alloc_kqwl);
			kqhash_unlock(fdp);
			*kqwlp = alloc_kqwl;
			return 0;
		}

		/*
		 * We have to block to allocate a workloop, drop the lock,
		 * allocate one, but then we need to retry lookups as someone
		 * else could race with us.
		 */
		kqhash_unlock(fdp);

		alloc_kqwl = zalloc_flags(kqworkloop_zone, Z_WAITOK | Z_ZERO);
	}

	kqhash_unlock(fdp);

	/* A speculative allocation may be left over if the lookup won the race. */
	if (__improbable(alloc_kqwl)) {
		zfree(kqworkloop_zone, alloc_kqwl);
	}

	return error;
}
3515
3516 #pragma mark - knotes
3517
static int
filt_no_attach(struct knote *kn, __unused struct kevent_qos_s *kev)
{
	/* Attach stub for unsupported filters: fail the knote with ENOTSUP. */
	knote_set_error(kn, ENOTSUP);
	return 0;
}
3524
static void
filt_no_detach(__unused struct knote *kn)
{
	/* Detach stub: nothing to tear down. */
}
3529
static int __dead2
filt_bad_event(struct knote *kn, long hint)
{
	/* f_event stub that must never be reached: panic with the filter id. */
	panic("%s[%d](%p, %ld)", __func__, kn->kn_filter, kn, hint);
}
3535
static int __dead2
filt_bad_touch(struct knote *kn, struct kevent_qos_s *kev)
{
	/* f_touch stub that must never be reached: panic with the filter id. */
	panic("%s[%d](%p, %p)", __func__, kn->kn_filter, kn, kev);
}
3541
static int __dead2
filt_bad_process(struct knote *kn, struct kevent_qos_s *kev)
{
	/* f_process stub that must never be reached: panic with the filter id. */
	panic("%s[%d](%p, %p)", __func__, kn->kn_filter, kn, kev);
}
3547
3548 /*
3549 * knotes_dealloc - detach all knotes for the process and drop them
3550 *
3551 * Process is in such a state that it will not try to allocate
3552 * any more knotes during this process (stopped for exit or exec).
3553 */
void
knotes_dealloc(proc_t p)
{
	struct filedesc *fdp = &p->p_fd;
	struct kqueue *kq;
	struct knote *kn;
	struct klist *kn_hash = NULL;
	u_long kn_hashmask;
	int i;

	proc_fdlock(p);

	/* Close all the fd-indexed knotes up front */
	if (fdp->fd_knlistsize > 0) {
		for (i = 0; i < fdp->fd_knlistsize; i++) {
			while ((kn = SLIST_FIRST(&fdp->fd_knlist[i])) != NULL) {
				/*
				 * Take the kq lock, then drop the fd lock
				 * across knote_drop(); the fd lock is retaken
				 * before re-reading the list head.
				 */
				kq = knote_get_kq(kn);
				kqlock(kq);
				proc_fdunlock(p);
				knote_drop(kq, kn, NULL);
				proc_fdlock(p);
			}
		}
		/* free the table */
		kfree_type(struct klist, fdp->fd_knlistsize, fdp->fd_knlist);
	}
	fdp->fd_knlistsize = 0;

	proc_fdunlock(p);

	knhash_lock(fdp);

	/* Clean out all the hashed knotes as well */
	if (fdp->fd_knhashmask != 0) {
		for (i = 0; i <= (int)fdp->fd_knhashmask; i++) {
			while ((kn = SLIST_FIRST(&fdp->fd_knhash[i])) != NULL) {
				/* Same lock dance as above, with the knhash lock. */
				kq = knote_get_kq(kn);
				kqlock(kq);
				knhash_unlock(fdp);
				knote_drop(kq, kn, NULL);
				knhash_lock(fdp);
			}
		}
		/* Detach the hash table so it can be freed outside the lock. */
		kn_hash = fdp->fd_knhash;
		kn_hashmask = fdp->fd_knhashmask;
		fdp->fd_knhashmask = 0;
		fdp->fd_knhash = NULL;
	}

	knhash_unlock(fdp);

	if (kn_hash) {
		hashdestroy(kn_hash, M_KQUEUE, kn_hashmask);
	}
}
3609
3610 /*
3611 * kqworkloops_dealloc - rebalance retains on kqworkloops created with
3612 * scheduling parameters
3613 *
3614 * Process is in such a state that it will not try to allocate
3615 * any more knotes during this process (stopped for exit or exec).
3616 */
void
kqworkloops_dealloc(proc_t p)
{
	struct filedesc *fdp = &p->p_fd;
	struct kqworkloop *kqwl, *kqwln;
	struct kqwllist tofree;

	/* Nothing to do if the process never created a workloop. */
	if (!fdt_flag_test(fdp, FD_WORKLOOP)) {
		return;
	}

	kqhash_lock(fdp);

	if (fdp->fd_kqhashmask == 0) {
		kqhash_unlock(fdp);
		return;
	}

	LIST_INIT(&tofree);

	/* Move every hashed workloop onto a private list under the lock. */
	for (size_t i = 0; i <= fdp->fd_kqhashmask; i++) {
		LIST_FOREACH_SAFE(kqwl, &fdp->fd_kqhash[i], kqwl_hashlink, kqwln) {
			/*
			 * kqworkloops that have scheduling parameters have an
			 * implicit retain from kqueue_workloop_ctl that needs
			 * to be balanced on process exit.
			 */
			assert(kqwl->kqwl_params);
			LIST_REMOVE(kqwl, kqwl_hashlink);
			LIST_INSERT_HEAD(&tofree, kqwl, kqwl_hashlink);
		}
	}

	kqhash_unlock(fdp);

	/* Now free them with no locks held; each must hold exactly one ref. */
	LIST_FOREACH_SAFE(kqwl, &tofree, kqwl_hashlink, kqwln) {
		uint32_t ref = os_ref_get_count_raw(&kqwl->kqwl_retains);
		if (ref != 1) {
			panic("kq(%p) invalid refcount %d", kqwl, ref);
		}
		/* false: already unhashed above */
		kqworkloop_dealloc(kqwl, false);
	}
}
3660
3661 static int
kevent_register_validate_priority(struct kqueue * kq,struct knote * kn,struct kevent_qos_s * kev)3662 kevent_register_validate_priority(struct kqueue *kq, struct knote *kn,
3663 struct kevent_qos_s *kev)
3664 {
3665 /* We don't care about the priority of a disabled or deleted knote */
3666 if (kev->flags & (EV_DISABLE | EV_DELETE)) {
3667 return 0;
3668 }
3669
3670 if (kq->kq_state & KQ_WORKLOOP) {
3671 /*
3672 * Workloops need valid priorities with a QOS (excluding manager) for
3673 * any enabled knote.
3674 *
3675 * When it is pre-existing, just make sure it has a valid QoS as
3676 * kevent_register() will not use the incoming priority (filters who do
3677 * have the responsibility to validate it again, see filt_wltouch).
3678 *
3679 * If the knote is being made, validate the incoming priority.
3680 */
3681 if (!_pthread_priority_thread_qos(kn ? kn->kn_qos : kev->qos)) {
3682 return ERANGE;
3683 }
3684 }
3685
3686 return 0;
3687 }
3688
3689 /*
3690 * Prepare a filter for waiting after register.
3691 *
3692 * The f_post_register_wait hook will be called later by kevent_register()
3693 * and should call kevent_register_wait_block()
3694 */
static int
kevent_register_wait_prepare(struct knote *kn, struct kevent_qos_s *kev, int rc)
{
	thread_t thread = current_thread();

	/* Only filters using extended codes may request post-register waits. */
	assert(knote_fops(kn)->f_extended_codes);

	if (kn->kn_thread == NULL) {
		/* Stash a ref on the current thread; see kevent_register_wait_cleanup(). */
		thread_reference(thread);
		kn->kn_thread = thread;
	} else if (kn->kn_thread != thread) {
		/*
		 * kn_thread may be set from a previous aborted wait
		 * However, it has to be from the same thread.
		 */
		kev->flags |= EV_ERROR;
		kev->data = EXDEV;
		return 0;
	}

	return FILTER_REGISTER_WAIT | rc;
}
3717
3718 /*
3719 * Cleanup a kevent_register_wait_prepare() effect for threads that have been
3720 * aborted instead of properly woken up with thread_wakeup_thread().
3721 */
3722 static void
kevent_register_wait_cleanup(struct knote * kn)3723 kevent_register_wait_cleanup(struct knote *kn)
3724 {
3725 thread_t thread = kn->kn_thread;
3726 kn->kn_thread = NULL;
3727 thread_deallocate(thread);
3728 }
3729
3730 /*
3731 * Must be called at the end of a f_post_register_wait call from a filter.
3732 */
static void
kevent_register_wait_block(struct turnstile *ts, thread_t thread,
    thread_continue_t cont, struct _kevent_register *cont_args)
{
	/*
	 * Finish the turnstile inheritor update started by the caller,
	 * drop the kq lock, record the handoff target, and hand the
	 * processor off to `thread` with `cont` as our continuation.
	 */
	turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
	kqunlock(cont_args->kqwl);
	cont_args->handoff_thread = thread;
	thread_handoff_parameter(thread, cont, cont_args, THREAD_HANDOFF_NONE);
}
3742
3743 /*
3744 * Called by Filters using a f_post_register_wait to return from their wait.
3745 */
3746 static void
kevent_register_wait_return(struct _kevent_register * cont_args)3747 kevent_register_wait_return(struct _kevent_register *cont_args)
3748 {
3749 struct kqworkloop *kqwl = cont_args->kqwl;
3750 struct kevent_qos_s *kev = &cont_args->kev;
3751 int error = 0;
3752
3753 if (cont_args->handoff_thread) {
3754 thread_deallocate(cont_args->handoff_thread);
3755 }
3756
3757 if (kev->flags & (EV_ERROR | EV_RECEIPT)) {
3758 if ((kev->flags & EV_ERROR) == 0) {
3759 kev->flags |= EV_ERROR;
3760 kev->data = 0;
3761 }
3762 error = kevent_modern_copyout(kev, &cont_args->ueventlist);
3763 if (error == 0) {
3764 cont_args->eventout++;
3765 }
3766 }
3767
3768 kqworkloop_release(kqwl);
3769 if (error == 0) {
3770 *(int32_t *)¤t_uthread()->uu_rval = cont_args->eventout;
3771 }
3772 unix_syscall_return(error);
3773 }
3774
3775 /*
3776 * kevent_register - add a new event to a kqueue
3777 *
3778 * Creates a mapping between the event source and
3779 * the kqueue via a knote data structure.
3780 *
3781 * Because many/most the event sources are file
3782 * descriptor related, the knote is linked off
3783 * the filedescriptor table for quick access.
3784 *
3785 * called with nothing locked
3786 * caller holds a reference on the kqueue
3787 */
3788
int
kevent_register(struct kqueue *kq, struct kevent_qos_s *kev,
    struct knote **kn_out)
{
	struct proc *p = kq->kq_p;
	const struct filterops *fops;
	struct knote *kn = NULL;
	int result = 0, error = 0;
	unsigned short kev_flags = kev->flags;
	KNOTE_LOCK_CTX(knlc);

	/* System filter ids are negative; sysfilt_ops is indexed by ~filter. */
	if (__probable(kev->filter < 0 && kev->filter + EVFILT_SYSCOUNT >= 0)) {
		fops = sysfilt_ops[~kev->filter];        /* to 0-base index */
	} else {
		error = EINVAL;
		goto out;
	}

	/* restrict EV_VANISHED to adding udata-specific dispatch kevents */
	if (__improbable((kev->flags & EV_VANISHED) &&
	    (kev->flags & (EV_ADD | EV_DISPATCH2)) != (EV_ADD | EV_DISPATCH2))) {
		error = EINVAL;
		goto out;
	}

	/* Simplify the flags - delete and disable overrule */
	if (kev->flags & EV_DELETE) {
		kev->flags &= ~EV_ADD;
	}
	if (kev->flags & EV_DISABLE) {
		kev->flags &= ~EV_ENABLE;
	}

	if (kq->kq_state & KQ_WORKLOOP) {
		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_REGISTER),
		    ((struct kqworkloop *)kq)->kqwl_dynamicid,
		    kev->udata, kev->flags, kev->filter);
	} else if (kq->kq_state & KQ_WORKQ) {
		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWQ_REGISTER),
		    0, kev->udata, kev->flags, kev->filter);
	} else {
		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQ_REGISTER),
		    VM_KERNEL_UNSLIDE_OR_PERM(kq),
		    kev->udata, kev->flags, kev->filter);
	}

restart:
	/* find the matching knote from the fd tables/hashes */
	kn = kq_find_knote_and_kq_lock(kq, kev, fops->f_isfd, p);
	error = kevent_register_validate_priority(kq, kn, kev);
	result = 0;
	if (error) {
		if (kn) {
			kqunlock(kq);
		}
		goto out;
	}

	if (kn == NULL && (kev->flags & EV_ADD) == 0) {
		/*
		 * No knote found, EV_ADD wasn't specified
		 */

		if ((kev_flags & EV_ADD) && (kev_flags & EV_DELETE) &&
		    (kq->kq_state & KQ_WORKLOOP)) {
			/*
			 * For workloops, understand EV_ADD|EV_DELETE as a "soft" delete
			 * that doesn't care about ENOENT, so just pretend the deletion
			 * happened.
			 */
		} else {
			error = ENOENT;
		}
		goto out;
	} else if (kn == NULL) {
		/*
		 * No knote found, need to attach a new one (attach)
		 */

		struct fileproc *knote_fp = NULL;

		/* grab a file reference for the new knote */
		if (fops->f_isfd) {
			if ((error = fp_lookup(p, (int)kev->ident, &knote_fp, 0)) != 0) {
				goto out;
			}
		}

		kn = knote_alloc();
		kn->kn_fp = knote_fp;
		kn->kn_is_fd = fops->f_isfd;
		kn->kn_kq_packed = VM_PACK_POINTER((vm_offset_t)kq, KNOTE_KQ_PACKED);
		kn->kn_status = 0;

		/* was vanish support requested */
		if (kev->flags & EV_VANISHED) {
			kev->flags &= ~EV_VANISHED;
			kn->kn_status |= KN_REQVANISH;
		}

		/* snapshot matching/dispatching protocol flags into knote */
		if (kev->flags & EV_DISABLE) {
			kn->kn_status |= KN_DISABLED;
		}

		/*
		 * copy the kevent state into knote
		 * protocol is that fflags and data
		 * are saved off, and cleared before
		 * calling the attach routine.
		 *
		 * - kn->kn_sfflags aliases with kev->xflags
		 * - kn->kn_sdata aliases with kev->data
		 * - kn->kn_filter is the top 8 bits of kev->filter
		 */
		kn->kn_kevent = *(struct kevent_internal_s *)kev;
		kn->kn_sfflags = kev->fflags;
		kn->kn_filtid = (uint8_t)~kev->filter;
		kn->kn_fflags = 0;
		knote_reset_priority(kq, kn, kev->qos);

		/* Add the knote for lookup thru the fd table */
		error = kq_add_knote(kq, kn, &knlc, p);
		if (error) {
			knote_free(kn);
			if (knote_fp != NULL) {
				fp_drop(p, (int)kev->ident, knote_fp, 0);
			}

			/* ERESTART means a concurrent registration raced us: retry. */
			if (error == ERESTART) {
				goto restart;
			}
			goto out;
		}

		/* fp reference count now applies to knote */

		/*
		 * we can't use filter_call() because f_attach can change the filter ops
		 * for a filter that supports f_extended_codes, so we need to reload
		 * knote_fops() and not use `fops`.
		 */
		result = fops->f_attach(kn, kev);
		if (result && !knote_fops(kn)->f_extended_codes) {
			result = FILTER_ACTIVE;
		}

		kqlock(kq);

		if (result & FILTER_THREADREQ_NODEFEER) {
			enable_preemption();
		}

		if (kn->kn_flags & EV_ERROR) {
			/*
			 * Failed to attach correctly, so drop.
			 */
			kn->kn_filtid = EVFILTID_DETACHED;
			error = (int)kn->kn_sdata;
			knote_drop(kq, kn, &knlc);
			result = 0;
			goto out;
		}

		/*
		 * end "attaching" phase - now just attached
		 *
		 * Mark the thread request overcommit, if appropos
		 *
		 * If the attach routine indicated that an
		 * event is already fired, activate the knote.
		 */
		if ((kn->kn_qos & _PTHREAD_PRIORITY_OVERCOMMIT_FLAG) &&
		    (kq->kq_state & KQ_WORKLOOP)) {
			kqworkloop_set_overcommit((struct kqworkloop *)kq);
		}
	} else if (!knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ON_SUCCESS)) {
		/*
		 * The knote was dropped while we were waiting for the lock,
		 * we need to re-evaluate entirely
		 */

		goto restart;
	} else if (kev->flags & EV_DELETE) {
		/*
		 * Deletion of a knote (drop)
		 *
		 * If the filter wants to filter drop events, let it do so.
		 *
		 * defer-delete: when trying to delete a disabled EV_DISPATCH2 knote,
		 * we must wait for the knote to be re-enabled (unless it is being
		 * re-enabled atomically here).
		 */

		if (knote_fops(kn)->f_allow_drop) {
			bool drop;

			/* f_allow_drop is called without the kq lock held. */
			kqunlock(kq);
			drop = knote_fops(kn)->f_allow_drop(kn, kev);
			kqlock(kq);

			if (!drop) {
				goto out_unlock;
			}
		}

		if ((kev->flags & EV_ENABLE) == 0 &&
		    (kn->kn_flags & EV_DISPATCH2) == EV_DISPATCH2 &&
		    (kn->kn_status & KN_DISABLED) != 0) {
			kn->kn_status |= KN_DEFERDELETE;
			error = EINPROGRESS;
			goto out_unlock;
		}

		knote_drop(kq, kn, &knlc);
		goto out;
	} else {
		/*
		 * Regular update of a knote (touch)
		 *
		 * Call touch routine to notify filter of changes in filter values
		 * (and to re-determine if any events are fired).
		 *
		 * If the knote is in defer-delete, avoid calling the filter touch
		 * routine (it has delivered its last event already).
		 *
		 * If the touch routine had no failure,
		 * apply the requested side effects to the knote.
		 */

		if (kn->kn_status & (KN_DEFERDELETE | KN_VANISHED)) {
			if (kev->flags & EV_ENABLE) {
				result = FILTER_ACTIVE;
			}
		} else {
			/* f_touch is called without the kq lock held. */
			kqunlock(kq);
			result = filter_call(knote_fops(kn), f_touch(kn, kev));
			kqlock(kq);
			if (result & FILTER_THREADREQ_NODEFEER) {
				enable_preemption();
			}
		}

		if (kev->flags & EV_ERROR) {
			result = 0;
			goto out_unlock;
		}

		if ((kn->kn_flags & EV_UDATA_SPECIFIC) == 0 &&
		    kn->kn_udata != kev->udata) {
			// this allows klist_copy_udata() not to take locks
			os_atomic_store_wide(&kn->kn_udata, kev->udata, relaxed);
		}
		if ((kev->flags & EV_DISABLE) && !(kn->kn_status & KN_DISABLED)) {
			kn->kn_status |= KN_DISABLED;
			knote_dequeue(kq, kn);
		}
	}

	/* accept new kevent state */
	knote_apply_touch(kq, kn, kev, result);

out_unlock:
	/*
	 * When the filter asked for a post-register wait,
	 * we leave the kqueue locked for kevent_register()
	 * to call the filter's f_post_register_wait hook.
	 */
	if (result & FILTER_REGISTER_WAIT) {
		knote_unlock(kq, kn, &knlc, KNOTE_KQ_LOCK_ALWAYS);
		*kn_out = kn;
	} else {
		knote_unlock(kq, kn, &knlc, KNOTE_KQ_UNLOCK);
	}

out:
	/* output local errors through the kevent */
	if (error) {
		kev->flags |= EV_ERROR;
		kev->data = error;
	}
	return result;
}
4072
4073 /*
4074 * knote_process - process a triggered event
4075 *
4076 * Validate that it is really still a triggered event
4077 * by calling the filter routines (if necessary). Hold
4078 * a use reference on the knote to avoid it being detached.
4079 *
4080 * If it is still considered triggered, we will have taken
4081 * a copy of the state under the filter lock. We use that
4082 * snapshot to dispatch the knote for future processing (or
4083 * not, if this was a lost event).
4084 *
4085 * Our caller assures us that nobody else can be processing
4086 * events from this knote during the whole operation. But
4087 * others can be touching or posting events to the knote
4088 * interspersed with our processing it.
4089 *
4090 * caller holds a reference on the kqueue.
4091 * kqueue locked on entry and exit - but may be dropped
4092 */
static int
knote_process(struct knote *kn, kevent_ctx_t kectx,
    kevent_callback_t callback)
{
	struct kevent_qos_s kev;
	struct kqueue *kq = knote_get_kq(kn);
	KNOTE_LOCK_CTX(knlc);
	int result = FILTER_ACTIVE;
	int error = 0;
	bool drop = false;

	/*
	 * Must be active
	 * Must be queued and not disabled/suppressed or dropping
	 */
	assert(kn->kn_status & KN_QUEUED);
	assert(kn->kn_status & KN_ACTIVE);
	assert(!(kn->kn_status & (KN_DISABLED | KN_SUPPRESSED | KN_DROPPING)));

	/* emit the PROCESS tracepoint flavor matching this kqueue's kind */
	if (kq->kq_state & KQ_WORKLOOP) {
		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS),
		    ((struct kqworkloop *)kq)->kqwl_dynamicid,
		    kn->kn_udata, kn->kn_status | (kn->kn_id << 32),
		    kn->kn_filtid);
	} else if (kq->kq_state & KQ_WORKQ) {
		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWQ_PROCESS),
		    0, kn->kn_udata, kn->kn_status | (kn->kn_id << 32),
		    kn->kn_filtid);
	} else {
		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQ_PROCESS),
		    VM_KERNEL_UNSLIDE_OR_PERM(kq), kn->kn_udata,
		    kn->kn_status | (kn->kn_id << 32), kn->kn_filtid);
	}

	if (!knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ALWAYS)) {
		/*
		 * When the knote is dropping or has dropped,
		 * then there's nothing we want to process.
		 */
		return EJUSTRETURN;
	}

	/*
	 * While waiting for the knote lock, we may have dropped the kq lock.
	 * and a touch may have disabled and dequeued the knote.
	 */
	if (!(kn->kn_status & KN_QUEUED)) {
		knote_unlock(kq, kn, &knlc, KNOTE_KQ_LOCK_ALWAYS);
		return EJUSTRETURN;
	}

	/*
	 * For deferred-drop or vanished events, we just create a fake
	 * event to acknowledge end-of-life. Otherwise, we call the
	 * filter's process routine to snapshot the kevent state under
	 * the filter's locking protocol.
	 *
	 * suppress knotes to avoid returning the same event multiple times in
	 * a single call.
	 */
	knote_suppress(kq, kn);

	if (kn->kn_status & (KN_DEFERDELETE | KN_VANISHED)) {
		uint16_t kev_flags = EV_DISPATCH2 | EV_ONESHOT;
		if (kn->kn_status & KN_DEFERDELETE) {
			kev_flags |= EV_DELETE;
		} else {
			kev_flags |= EV_VANISHED;
		}

		/* create fake event */
		kev = (struct kevent_qos_s){
			.filter = kn->kn_filter,
			.ident = kn->kn_id,
			.flags = kev_flags,
			.udata = kn->kn_udata,
		};
	} else {
		/* f_process runs unlocked; the knote lock keeps the knote stable */
		kqunlock(kq);
		kev = (struct kevent_qos_s) { };
		result = filter_call(knote_fops(kn), f_process(kn, &kev));
		kqlock(kq);
	}

	/*
	 * Determine how to dispatch the knote for future event handling.
	 * not-fired: just return (do not callout, leave deactivated).
	 * One-shot: If dispatch2, enter deferred-delete mode (unless this
	 *           is the deferred delete event delivery itself). Otherwise,
	 *           drop it.
	 * Dispatch: don't clear state, just mark it disabled.
	 * Cleared: just leave it deactivated.
	 * Others: re-activate as there may be more events to handle.
	 *         This will not wake up more handlers right now, but
	 *         at the completion of handling events it may trigger
	 *         more handler threads (TODO: optimize based on more than
	 *         just this one event being detected by the filter).
	 */
	if ((result & FILTER_ACTIVE) == 0) {
		if ((kn->kn_status & KN_ACTIVE) == 0) {
			/*
			 * Some knotes (like EVFILT_WORKLOOP) can be reactivated from
			 * within f_process() but that doesn't necessarily make them
			 * ready to process, so we should leave them be.
			 *
			 * For other knotes, since we will not return an event,
			 * there's no point keeping the knote suppressed.
			 */
			knote_unsuppress(kq, kn);
		}
		knote_unlock(kq, kn, &knlc, KNOTE_KQ_LOCK_ALWAYS);
		return EJUSTRETURN;
	}

	if (result & FILTER_ADJUST_EVENT_QOS_BIT) {
		knote_adjust_qos(kq, kn, result);
	}

	if (result & FILTER_ADJUST_EVENT_IOTIER_BIT) {
		kqueue_update_iotier_override(kq);
	}

	/* deliver at the knote's base QoS combined with any pushed override */
	kev.qos = _pthread_priority_combine(kn->kn_qos, kn->kn_qos_override);

	if (kev.flags & EV_ONESHOT) {
		if ((kn->kn_flags & EV_DISPATCH2) == EV_DISPATCH2 &&
		    (kn->kn_status & KN_DEFERDELETE) == 0) {
			/* defer dropping non-delete oneshot dispatch2 events */
			kn->kn_status |= KN_DEFERDELETE | KN_DISABLED;
		} else {
			drop = true;
		}
	} else if (kn->kn_flags & EV_DISPATCH) {
		/* disable all dispatch knotes */
		kn->kn_status |= KN_DISABLED;
	} else if ((kn->kn_flags & EV_CLEAR) == 0) {
		/* re-activate in case there are more events */
		knote_activate(kq, kn, FILTER_ACTIVE);
	}

	/*
	 * callback to handle each event as we find it.
	 * If we have to detach and drop the knote, do
	 * it while we have the kq unlocked.
	 */
	if (drop) {
		knote_drop(kq, kn, &knlc);
	} else {
		knote_unlock(kq, kn, &knlc, KNOTE_KQ_UNLOCK);
	}

	/*
	 * NOTE(review): 'kn' is read here after the drop branch above; this
	 * appears safe only because EV_VANISHED events take the defer-delete
	 * path (drop == false) — confirm EV_VANISHED can never coincide with
	 * drop == true.
	 */
	if (kev.flags & EV_VANISHED) {
		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KNOTE_VANISHED),
		    kev.ident, kn->kn_udata, kn->kn_status | (kn->kn_id << 32),
		    kn->kn_filtid);
	}

	error = (callback)(&kev, kectx);
	kqlock(kq);
	return error;
}
4254
4255 /*
4256 * Returns -1 if the kqueue was unbound and processing should not happen
4257 */
4258 #define KQWQAE_BEGIN_PROCESSING 1
4259 #define KQWQAE_END_PROCESSING 2
4260 #define KQWQAE_UNBIND 3
static int
kqworkq_acknowledge_events(struct kqworkq *kqwq, workq_threadreq_t kqr,
    int kevent_flags, int kqwqae_op)
{
	struct knote *kn;
	int rc = 0;
	bool unbind;
	/* per-QoS bucket serviced by this thread request (tr_kq_qos_index is 1-based) */
	struct kqtailq *suppressq = &kqwq->kqwq_suppressed[kqr->tr_kq_qos_index - 1];
	struct kqtailq *queue = &kqwq->kqwq_queue[kqr->tr_kq_qos_index - 1];

	kqlock_held(&kqwq->kqwq_kqueue);

	/*
	 * Return suppressed knotes to their original state.
	 * For workq kqueues, suppressed ones that are still
	 * truly active (not just forced into the queue) will
	 * set flags we check below to see if anything got
	 * woken up.
	 */
	while ((kn = TAILQ_FIRST(suppressq)) != NULL) {
		knote_unsuppress(kqwq, kn);
	}

	/*
	 * Decide whether to unbind the servicer thread:
	 * - KQWQAE_UNBIND: always,
	 * - not parking: never,
	 * - parking: only when unsuppressing left the queue empty.
	 */
	if (kqwqae_op == KQWQAE_UNBIND) {
		unbind = true;
	} else if ((kevent_flags & KEVENT_FLAG_PARKING) == 0) {
		unbind = false;
	} else {
		unbind = TAILQ_EMPTY(queue);
	}
	if (unbind) {
		thread_t thread = kqr_thread_fast(kqr);
		thread_qos_t old_override;

#if DEBUG || DEVELOPMENT
		thread_t self = current_thread();
		struct uthread *ut = get_bsdthread_info(self);

		/* only the bound servicer thread may acknowledge/unbind */
		assert(thread == self);
		assert(ut->uu_kqr_bound == kqr);
#endif // DEBUG || DEVELOPMENT

		old_override = kqworkq_unbind_locked(kqwq, kqr, thread);
		if (!TAILQ_EMPTY(queue)) {
			/*
			 * Request a new thread if we didn't process the whole
			 * queue.
			 */
			kqueue_threadreq_initiate(&kqwq->kqwq_kqueue, kqr,
			    kqr->tr_kq_qos_index, 0);
		}
		if (old_override) {
			/* drop the kevent QoS override the unbind returned */
			thread_drop_kevent_override(thread);
		}
		rc = -1;
	}

	return rc;
}
4320
4321 /*
4322 * Return 0 to indicate that processing should proceed,
4323 * -1 if there is nothing to process.
4324 *
4325 * Called with kqueue locked and returns the same way,
4326 * but may drop lock temporarily.
4327 */
4328 static int
kqworkq_begin_processing(struct kqworkq * kqwq,workq_threadreq_t kqr,int kevent_flags)4329 kqworkq_begin_processing(struct kqworkq *kqwq, workq_threadreq_t kqr,
4330 int kevent_flags)
4331 {
4332 int rc = 0;
4333
4334 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWQ_PROCESS_BEGIN) | DBG_FUNC_START,
4335 0, kqr->tr_kq_qos_index);
4336
4337 rc = kqworkq_acknowledge_events(kqwq, kqr, kevent_flags,
4338 KQWQAE_BEGIN_PROCESSING);
4339
4340 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWQ_PROCESS_BEGIN) | DBG_FUNC_END,
4341 thread_tid(kqr_thread(kqr)),
4342 !TAILQ_EMPTY(&kqwq->kqwq_queue[kqr->tr_kq_qos_index - 1]));
4343
4344 return rc;
4345 }
4346
/*
 * Unsuppress the workloop's suppressed knotes, except those that must stay
 * suppressed so their QoS override keeps pushing on the servicer.
 *
 * Returns the maximum QoS override still asserted by the knotes left
 * suppressed (THREAD_QOS_UNSPECIFIED when none remain).
 *
 * Caller must hold the kq lock.
 */
static thread_qos_t
kqworkloop_acknowledge_events(struct kqworkloop *kqwl)
{
	kq_index_t qos = THREAD_QOS_UNSPECIFIED;
	struct knote *kn, *tmp;

	kqlock_held(kqwl);

	TAILQ_FOREACH_SAFE(kn, &kqwl->kqwl_suppressed, kn_tqe, tmp) {
		/*
		 * If a knote that can adjust QoS is disabled because of the automatic
		 * behavior of EV_DISPATCH, the knotes should stay suppressed so that
		 * further overrides keep pushing.
		 */
		if (knote_fops(kn)->f_adjusts_qos &&
		    (kn->kn_status & KN_DISABLED) != 0 &&
		    (kn->kn_status & KN_DROPPING) == 0 &&
		    (kn->kn_flags & (EV_DISPATCH | EV_DISABLE)) == EV_DISPATCH) {
			qos = MAX(qos, kn->kn_qos_override);
			continue;
		}
		knote_unsuppress(kqwl, kn);
	}

	return qos;
}
4373
/*
 * Begin processing a workloop: take the KQ_PROCESSING role and, when
 * parking, possibly unbind the servicer instead of processing.
 *
 * Returns 0 to indicate that processing should proceed,
 * -1 when the servicer was unbound and must not process.
 *
 * Caller must hold the kq lock.
 */
static int
kqworkloop_begin_processing(struct kqworkloop *kqwl, unsigned int kevent_flags)
{
	workq_threadreq_t kqr = &kqwl->kqwl_request;
	struct kqueue *kq = &kqwl->kqwl_kqueue;
	int rc = 0, op = KQWL_UTQ_NONE;

	kqlock_held(kq);

	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_BEGIN) | DBG_FUNC_START,
	    kqwl->kqwl_dynamicid, 0, 0);

	/* nobody else should still be processing */
	assert((kq->kq_state & KQ_PROCESSING) == 0);

	kq->kq_state |= KQ_PROCESSING;

	if (kevent_flags & KEVENT_FLAG_PARKING) {
		/*
		 * When "parking" we want to process events and if no events are found
		 * unbind.
		 *
		 * However, non overcommit threads sometimes park even when they have
		 * more work so that the pool can narrow. For these, we need to unbind
		 * early, so that calling kqworkloop_update_threads_qos() can ask the
		 * workqueue subsystem whether the thread should park despite having
		 * pending events.
		 */
		if (kqr->tr_flags & WORKQ_TR_FLAG_OVERCOMMIT) {
			op = KQWL_UTQ_PARKING;
		} else {
			op = KQWL_UTQ_UNBINDING;
		}
	} else if (!TAILQ_EMPTY(&kqwl->kqwl_suppressed)) {
		/* not parking: only reset the wakeup override of suppressed knotes */
		op = KQWL_UTQ_RESET_WAKEUP_OVERRIDE;
	}

	if (op != KQWL_UTQ_NONE) {
		thread_qos_t qos_override;
		thread_t thread = kqr_thread_fast(kqr);

		/* release suppressed knotes; keep the QoS they still push */
		qos_override = kqworkloop_acknowledge_events(kqwl);

		if (op == KQWL_UTQ_UNBINDING) {
			kqworkloop_unbind_locked(kqwl, thread,
			    KQWL_OVERRIDE_DROP_IMMEDIATELY);
			kqworkloop_release_live(kqwl);
		}
		kqworkloop_update_threads_qos(kqwl, op, qos_override);
		if (op == KQWL_UTQ_PARKING &&
		    (!kqwl->kqwl_count || kqwl->kqwl_owner)) {
			/* nothing queued (or an owner exists): really park */
			kqworkloop_unbind_locked(kqwl, thread,
			    KQWL_OVERRIDE_DROP_DELAYED);
			kqworkloop_release_live(kqwl);
			rc = -1;
		} else if (op == KQWL_UTQ_UNBINDING &&
		    kqr_thread(kqr) != thread) {
			/* the QoS update decided this thread should stop servicing */
			rc = -1;
		}

		if (rc == -1) {
			kq->kq_state &= ~KQ_PROCESSING;
			kqworkloop_unbind_delayed_override_drop(thread);
		}
	}

	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_BEGIN) | DBG_FUNC_END,
	    kqwl->kqwl_dynamicid, 0, 0);

	return rc;
}
4445
4446 /*
4447 * Return 0 to indicate that processing should proceed,
4448 * -1 if there is nothing to process.
4449 * EBADF if the kqueue is draining
4450 *
4451 * Called with kqueue locked and returns the same way,
4452 * but may drop lock temporarily.
4453 * May block.
4454 */
static int
kqfile_begin_processing(struct kqfile *kq)
{
	kqlock_held(kq);

	assert((kq->kqf_state & (KQ_WORKQ | KQ_WORKLOOP)) == 0);
	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) | DBG_FUNC_START,
	    VM_KERNEL_UNSLIDE_OR_PERM(kq), 0);

	/* wait to become the exclusive processing thread */
	while ((kq->kqf_state & (KQ_PROCESSING | KQ_DRAIN)) == KQ_PROCESSING) {
		/* sleep channel is kqf_suppressed; end_processing/wakeup post it */
		kq->kqf_state |= KQ_PROCWAIT;
		lck_spin_sleep(&kq->kqf_lock, LCK_SLEEP_DEFAULT,
		    &kq->kqf_suppressed, THREAD_UNINT | THREAD_WAIT_NOREPORT);
	}

	if (kq->kqf_state & KQ_DRAIN) {
		/* kqueue is being torn down: refuse to process */
		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) | DBG_FUNC_END,
		    VM_KERNEL_UNSLIDE_OR_PERM(kq), 2);
		return EBADF;
	}

	/* Nobody else processing */

	/* anything left to process? */
	if (kq->kqf_count == 0) {
		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) | DBG_FUNC_END,
		    VM_KERNEL_UNSLIDE_OR_PERM(kq), 1);
		return -1;
	}

	/* convert to processing mode */
	kq->kqf_state |= KQ_PROCESSING;

	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) | DBG_FUNC_END,
	    VM_KERNEL_UNSLIDE_OR_PERM(kq), 0);
	return 0;
}
4493
/*
 * Try to end the processing, only called when a workq thread is attempting to
 * park (KEVENT_FLAG_PARKING is set).
 *
 * When returning -1, the kqworkq is set up again so that it is ready to be
 * processed.
 */
4501 static int
kqworkq_end_processing(struct kqworkq * kqwq,workq_threadreq_t kqr,int kevent_flags)4502 kqworkq_end_processing(struct kqworkq *kqwq, workq_threadreq_t kqr,
4503 int kevent_flags)
4504 {
4505 if (kevent_flags & KEVENT_FLAG_PARKING) {
4506 /*
4507 * if acknowledge events "succeeds" it means there are events,
4508 * which is a failure condition for end_processing.
4509 */
4510 int rc = kqworkq_acknowledge_events(kqwq, kqr, kevent_flags,
4511 KQWQAE_END_PROCESSING);
4512 if (rc == 0) {
4513 return -1;
4514 }
4515 }
4516
4517 return 0;
4518 }
4519
/*
 * Try to end the processing, only called when a workq thread is attempting to
 * park (KEVENT_FLAG_PARKING is set).
 *
 * When returning -1, the kqworkloop is set up again so that it is ready to be
 * processed (as if kqworkloop_begin_processing had just been called).
 *
 * If successful and KEVENT_FLAG_PARKING was set in the kevent_flags,
 * the kqworkloop is unbound from its servicer as a side effect.
 */
static int
kqworkloop_end_processing(struct kqworkloop *kqwl, int flags, int kevent_flags)
{
	struct kqueue *kq = &kqwl->kqwl_kqueue;
	workq_threadreq_t kqr = &kqwl->kqwl_request;
	int rc = 0;

	kqlock_held(kq);

	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_END) | DBG_FUNC_START,
	    kqwl->kqwl_dynamicid, 0, 0);

	if (kevent_flags & KEVENT_FLAG_PARKING) {
		thread_t thread = kqr_thread_fast(kqr);
		thread_qos_t qos_override;

		/*
		 * When KEVENT_FLAG_PARKING is set, we need to attempt
		 * an unbind while still under the lock.
		 *
		 * So we do everything kqworkloop_unbind() would do, but because
		 * we're inside kqueue_process(), if the workloop actually
		 * received events while our locks were dropped, we have
		 * the opportunity to fail the end processing and loop again.
		 *
		 * This avoids going through the process-wide workqueue lock
		 * hence scales better.
		 */
		assert(flags & KQ_PROCESSING);
		qos_override = kqworkloop_acknowledge_events(kqwl);
		kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_PARKING, qos_override);

		if (kqwl->kqwl_wakeup_qos && !kqwl->kqwl_owner) {
			/* new events arrived meanwhile: fail, caller loops */
			rc = -1;
		} else {
			/* quiet (or owned): unbind the servicer for real */
			kqworkloop_unbind_locked(kqwl, thread, KQWL_OVERRIDE_DROP_DELAYED);
			kqworkloop_release_live(kqwl);
			kq->kq_state &= ~flags;
			kqworkloop_unbind_delayed_override_drop(thread);
		}
	} else {
		/* not parking: re-arm the wakeup-to-kernel event and recompute QoS */
		kq->kq_state &= ~flags;
		kq->kq_state |= KQ_R2K_ARMED;
		kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_RECOMPUTE_WAKEUP_QOS, 0);
	}

	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_END) | DBG_FUNC_END,
	    kqwl->kqwl_dynamicid, 0, 0);

	return rc;
}
4581
4582 /*
4583 * Called with kqueue lock held.
4584 *
4585 * 0: no more events
4586 * -1: has more events
4587 * EBADF: kqueue is in draining mode
4588 */
static int
kqfile_end_processing(struct kqfile *kq)
{
	struct knote *kn;
	int procwait;

	kqlock_held(kq);

	assert((kq->kqf_state & (KQ_WORKQ | KQ_WORKLOOP)) == 0);

	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_END),
	    VM_KERNEL_UNSLIDE_OR_PERM(kq), 0);

	/*
	 * Return suppressed knotes to their original state.
	 */
	while ((kn = TAILQ_FIRST(&kq->kqf_suppressed)) != NULL) {
		knote_unsuppress(kq, kn);
	}

	/* sample KQ_PROCWAIT before clearing it together with KQ_PROCESSING */
	procwait = (kq->kqf_state & KQ_PROCWAIT);
	kq->kqf_state &= ~(KQ_PROCESSING | KQ_PROCWAIT);

	if (procwait) {
		/* first wake up any thread already waiting to process */
		thread_wakeup(&kq->kqf_suppressed);
	}

	if (kq->kqf_state & KQ_DRAIN) {
		return EBADF;
	}
	return kq->kqf_count != 0 ? -1 : 0;
}
4622
/*
 * Backend for the kqueue_workloop_ctl() syscall.
 *
 * KQ_WORKLOOP_CREATE: validate the scheduling parameters, then create a
 * dynamic workloop kqueue with those parameters (it must not already exist).
 * KQ_WORKLOOP_DESTROY: look up an existing workloop and release the
 * reference its creation parameters pinned.
 *
 * Returns 0 on success or an errno; *retval is always set to 0.
 */
static int
kqueue_workloop_ctl_internal(proc_t p, uintptr_t cmd, uint64_t __unused options,
    struct kqueue_workloop_params *params, int *retval)
{
	int error = 0;
	struct kqworkloop *kqwl;
	struct filedesc *fdp = &p->p_fd;
	workq_threadreq_param_t trp = { };

	switch (cmd) {
	case KQ_WORKLOOP_CREATE:
		if (!params->kqwlp_flags) {
			error = EINVAL;
			break;
		}

		/* priority must be within the user scheduling range */
		if ((params->kqwlp_flags & KQ_WORKLOOP_CREATE_SCHED_PRI) &&
		    (params->kqwlp_sched_pri < 1 ||
		    params->kqwlp_sched_pri > 63 /* MAXPRI_USER */)) {
			error = EINVAL;
			break;
		}

		if ((params->kqwlp_flags & KQ_WORKLOOP_CREATE_SCHED_POL) &&
		    invalid_policy(params->kqwlp_sched_pol)) {
			error = EINVAL;
			break;
		}

		/* CPU-percent throttling needs a sane percentage and refill period */
		if ((params->kqwlp_flags & KQ_WORKLOOP_CREATE_CPU_PERCENT) &&
		    (params->kqwlp_cpu_percent <= 0 ||
		    params->kqwlp_cpu_percent > 100 ||
		    params->kqwlp_cpu_refillms <= 0 ||
		    params->kqwlp_cpu_refillms > 0x00ffffff)) {
			error = EINVAL;
			break;
		}

		/* translate the validated params into a thread request param block */
		if (params->kqwlp_flags & KQ_WORKLOOP_CREATE_SCHED_PRI) {
			trp.trp_flags |= TRP_PRIORITY;
			trp.trp_pri = (uint8_t)params->kqwlp_sched_pri;
		}
		if (params->kqwlp_flags & KQ_WORKLOOP_CREATE_SCHED_POL) {
			trp.trp_flags |= TRP_POLICY;
			trp.trp_pol = (uint8_t)params->kqwlp_sched_pol;
		}
		if (params->kqwlp_flags & KQ_WORKLOOP_CREATE_CPU_PERCENT) {
			trp.trp_flags |= TRP_CPUPERCENT;
			trp.trp_cpupercent = (uint8_t)params->kqwlp_cpu_percent;
			trp.trp_refillms = params->kqwlp_cpu_refillms;
		}

		error = kqworkloop_get_or_create(p, params->kqwlp_id, &trp,
		    KEVENT_FLAG_DYNAMIC_KQUEUE | KEVENT_FLAG_WORKLOOP |
		    KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST, &kqwl);
		if (error) {
			break;
		}

		if (!fdt_flag_test(fdp, FD_WORKLOOP)) {
			/* FD_WORKLOOP indicates we've ever created a workloop
			 * via this syscall but its only ever added to a process, never
			 * removed.
			 */
			proc_fdlock(p);
			fdt_flag_set(fdp, FD_WORKLOOP);
			proc_fdunlock(p);
		}
		break;
	case KQ_WORKLOOP_DESTROY:
		error = kqworkloop_get_or_create(p, params->kqwlp_id, NULL,
		    KEVENT_FLAG_DYNAMIC_KQUEUE | KEVENT_FLAG_WORKLOOP |
		    KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST, &kqwl);
		if (error) {
			break;
		}
		kqlock(kqwl);
		/* drop the creation-time reference exactly once (TRP_RELEASED) */
		trp.trp_value = kqwl->kqwl_params;
		if (trp.trp_flags && !(trp.trp_flags & TRP_RELEASED)) {
			trp.trp_flags |= TRP_RELEASED;
			kqwl->kqwl_params = trp.trp_value;
			kqworkloop_release_live(kqwl);
		} else {
			error = EINVAL;
		}
		kqunlock(kqwl);
		kqworkloop_release(kqwl);
		break;
	}
	*retval = 0;
	return error;
}
4715
4716 int
kqueue_workloop_ctl(proc_t p,struct kqueue_workloop_ctl_args * uap,int * retval)4717 kqueue_workloop_ctl(proc_t p, struct kqueue_workloop_ctl_args *uap, int *retval)
4718 {
4719 struct kqueue_workloop_params params = {
4720 .kqwlp_id = 0,
4721 };
4722 if (uap->sz < sizeof(params.kqwlp_version)) {
4723 return EINVAL;
4724 }
4725
4726 size_t copyin_sz = MIN(sizeof(params), uap->sz);
4727 int rv = copyin(uap->addr, ¶ms, copyin_sz);
4728 if (rv) {
4729 return rv;
4730 }
4731
4732 if (params.kqwlp_version != (int)uap->sz) {
4733 return EINVAL;
4734 }
4735
4736 return kqueue_workloop_ctl_internal(p, uap->cmd, uap->options, ¶ms,
4737 retval);
4738 }
4739
4740 static int
kqueue_select(struct fileproc * fp,int which,void * wql,__unused vfs_context_t ctx)4741 kqueue_select(struct fileproc *fp, int which, void *wql, __unused vfs_context_t ctx)
4742 {
4743 struct kqfile *kq = (struct kqfile *)fp_get_data(fp);
4744 int retnum = 0;
4745
4746 assert((kq->kqf_state & (KQ_WORKLOOP | KQ_WORKQ)) == 0);
4747
4748 if (which == FREAD) {
4749 kqlock(kq);
4750 if (kqfile_begin_processing(kq) == 0) {
4751 retnum = kq->kqf_count;
4752 kqfile_end_processing(kq);
4753 } else if ((kq->kqf_state & KQ_DRAIN) == 0) {
4754 selrecord(kq->kqf_p, &kq->kqf_sel, wql);
4755 }
4756 kqunlock(kq);
4757 }
4758 return retnum;
4759 }
4760
4761 /*
4762 * kqueue_close -
4763 */
4764 static int
kqueue_close(struct fileglob * fg,__unused vfs_context_t ctx)4765 kqueue_close(struct fileglob *fg, __unused vfs_context_t ctx)
4766 {
4767 struct kqfile *kqf = fg_get_data(fg);
4768
4769 assert((kqf->kqf_state & (KQ_WORKLOOP | KQ_WORKQ)) == 0);
4770 kqlock(kqf);
4771 selthreadclear(&kqf->kqf_sel);
4772 kqunlock(kqf);
4773 kqueue_dealloc(&kqf->kqf_kqueue);
4774 fg_set_data(fg, NULL);
4775 return 0;
4776 }
4777
4778 /*
4779 * Max depth of the nested kq path that can be created.
4780 * Note that this has to be less than the size of kq_level
4781 * to avoid wrapping around and mislabeling the level. We also
4782 * want to be aggressive about this so that we don't overflow the
4783 * kernel stack while posting kevents
4784 */
4785 #define MAX_NESTED_KQ 10
4786
/*
 * The caller has taken a use-count reference on this kqueue and will donate it
 * to the kqueue we are being added to. This keeps the kqueue from closing until
 * that relationship is torn down.
 */
static int
kqueue_kqfilter(struct fileproc *fp, struct knote *kn,
    __unused struct kevent_qos_s *kev)
{
	struct kqfile *kqf = (struct kqfile *)fp_get_data(fp);
	struct kqueue *kq = &kqf->kqf_kqueue;
	struct kqueue *parentkq = knote_get_kq(kn);

	assert((kqf->kqf_state & (KQ_WORKLOOP | KQ_WORKQ)) == 0);

	/* a kqueue cannot watch itself, and only EVFILT_READ is supported */
	if (parentkq == kq || kn->kn_filter != EVFILT_READ) {
		knote_set_error(kn, EINVAL);
		return 0;
	}

	/*
	 * We have to avoid creating a cycle when nesting kqueues
	 * inside another. Rather than trying to walk the whole
	 * potential DAG of nested kqueues, we just use a simple
	 * ceiling protocol. When a kqueue is inserted into another,
	 * we check that the (future) parent is not already nested
	 * into another kqueue at a lower level than the potential
	 * child (because it could indicate a cycle). If that test
	 * passes, we just mark the nesting levels accordingly.
	 *
	 * Only up to MAX_NESTED_KQ can be nested.
	 *
	 * Note: kqworkq and kqworkloop cannot be nested and have reused their
	 * kq_level field, so ignore these as parent.
	 */

	kqlock(parentkq);

	if ((parentkq->kq_state & (KQ_WORKQ | KQ_WORKLOOP)) == 0) {
		/* ceiling check: parent must not sit below the child */
		if (parentkq->kq_level > 0 &&
		    parentkq->kq_level < kq->kq_level) {
			kqunlock(parentkq);
			knote_set_error(kn, EINVAL);
			return 0;
		}

		/* set parent level appropriately */
		uint16_t plevel = (parentkq->kq_level == 0)? 2: parentkq->kq_level;
		if (plevel < kq->kq_level + 1) {
			if (kq->kq_level + 1 > MAX_NESTED_KQ) {
				/* nesting depth limit reached */
				kqunlock(parentkq);
				knote_set_error(kn, EINVAL);
				return 0;
			}
			plevel = kq->kq_level + 1;
		}

		parentkq->kq_level = plevel;
	}

	kqunlock(parentkq);

	kn->kn_filtid = EVFILTID_KQREAD;
	kqlock(kq);
	KNOTE_ATTACH(&kqf->kqf_sel.si_note, kn);
	/* indicate nesting in child, if needed */
	if (kq->kq_level == 0) {
		kq->kq_level = 1;
	}

	/* report whether the child kqueue is already readable */
	int count = kq->kq_count;
	kqunlock(kq);
	return count > 0;
}
4861
/*
 * Wake up everything that may be waiting on a file-backed kqueue:
 * select() waiters, kqueue_scan() sleepers, and — on NOTE_REVOKE —
 * threads waiting for their turn to process. For other hints, the
 * parent kqueues/select sets this kqueue is nested in are notified.
 *
 * 'wr' is the wait result delivered to scan sleepers (e.g. THREAD_RESTART
 * when draining).
 */
__attribute__((noinline))
static void
kqfile_wakeup(struct kqfile *kqf, long hint, wait_result_t wr)
{
	/* wakeup a thread waiting on this queue */
	selwakeup(&kqf->kqf_sel);

	/* wake up threads in kqueue_scan() */
	if (kqf->kqf_state & KQ_SLEEP) {
		kqf->kqf_state &= ~KQ_SLEEP;
		thread_wakeup_with_result(&kqf->kqf_count, wr);
	}

	if (hint == NOTE_REVOKE) {
		/* wakeup threads waiting their turn to process */
		if (kqf->kqf_state & KQ_PROCWAIT) {
			assert(kqf->kqf_state & KQ_PROCESSING);
			kqf->kqf_state &= ~KQ_PROCWAIT;
			thread_wakeup(&kqf->kqf_suppressed);
		}

		/* no need to KNOTE: knote_fdclose() takes care of it */
	} else {
		/* wakeup other kqueues/select sets we're inside */
		KNOTE(&kqf->kqf_sel.si_note, hint);
	}
}
4889
4890 /*
4891 * kqueue_drain - called when kq is closed
4892 */
4893 static int
kqueue_drain(struct fileproc * fp,__unused vfs_context_t ctx)4894 kqueue_drain(struct fileproc *fp, __unused vfs_context_t ctx)
4895 {
4896 struct kqfile *kqf = (struct kqfile *)fp_get_data(fp);
4897
4898 assert((kqf->kqf_state & (KQ_WORKLOOP | KQ_WORKQ)) == 0);
4899
4900 kqlock(kqf);
4901 kqf->kqf_state |= KQ_DRAIN;
4902 kqfile_wakeup(kqf, NOTE_REVOKE, THREAD_RESTART);
4903 kqunlock(kqf);
4904 return 0;
4905 }
4906
4907 int
kqueue_stat(struct kqueue * kq,void * ub,int isstat64,proc_t p)4908 kqueue_stat(struct kqueue *kq, void *ub, int isstat64, proc_t p)
4909 {
4910 assert((kq->kq_state & (KQ_WORKLOOP | KQ_WORKQ)) == 0);
4911
4912 kqlock(kq);
4913 if (isstat64 != 0) {
4914 struct stat64 *sb64 = (struct stat64 *)ub;
4915
4916 bzero((void *)sb64, sizeof(*sb64));
4917 sb64->st_size = kq->kq_count;
4918 if (kq->kq_state & KQ_KEV_QOS) {
4919 sb64->st_blksize = sizeof(struct kevent_qos_s);
4920 } else if (kq->kq_state & KQ_KEV64) {
4921 sb64->st_blksize = sizeof(struct kevent64_s);
4922 } else if (IS_64BIT_PROCESS(p)) {
4923 sb64->st_blksize = sizeof(struct user64_kevent);
4924 } else {
4925 sb64->st_blksize = sizeof(struct user32_kevent);
4926 }
4927 sb64->st_mode = S_IFIFO;
4928 } else {
4929 struct stat *sb = (struct stat *)ub;
4930
4931 bzero((void *)sb, sizeof(*sb));
4932 sb->st_size = kq->kq_count;
4933 if (kq->kq_state & KQ_KEV_QOS) {
4934 sb->st_blksize = sizeof(struct kevent_qos_s);
4935 } else if (kq->kq_state & KQ_KEV64) {
4936 sb->st_blksize = sizeof(struct kevent64_s);
4937 } else if (IS_64BIT_PROCESS(p)) {
4938 sb->st_blksize = sizeof(struct user64_kevent);
4939 } else {
4940 sb->st_blksize = sizeof(struct user32_kevent);
4941 }
4942 sb->st_mode = S_IFIFO;
4943 }
4944 kqunlock(kq);
4945 return 0;
4946 }
4947
/*
 * Whether it is safe to honor a failed thread request by setting an AST on
 * the current thread, rather than failing outright: only when the current
 * thread belongs to the kqueue's owning process AND is in one of the
 * whitelisted syscalls below.
 */
static inline bool
kqueue_threadreq_can_use_ast(struct kqueue *kq)
{
	if (current_proc() == kq->kq_p) {
		/*
		 * Setting an AST from a non BSD syscall is unsafe: mach_msg_trap() can
		 * do combined send/receive and in the case of self-IPC, the AST may be
		 * set on a thread that will not return to userspace and needs the
		 * thread the AST would create to unblock itself.
		 *
		 * At this time, we really want to target:
		 *
		 * - kevent variants that can cause thread creations, and dispatch
		 *   really only uses kevent_qos and kevent_id,
		 *
		 * - workq_kernreturn (directly about thread creations)
		 *
		 * - bsdthread_ctl which is used for qos changes and has direct impact
		 *   on the creator thread scheduling decisions.
		 */
		switch (current_uthread()->syscall_code) {
		case SYS_kevent_qos:
		case SYS_kevent_id:
		case SYS_workq_kernreturn:
		case SYS_bsdthread_ctl:
			return true;
		}
	}
	return false;
}
4978
4979 /*
4980 * Interact with the pthread kext to request a servicing there at a specific QoS
4981 * level.
4982 *
4983 * - Caller holds the kqlock
4984 *
4985 * - May be called with the kqueue's wait queue set locked,
4986 * so cannot do anything that could recurse on that.
4987 */
static void
kqueue_threadreq_initiate(kqueue_t kqu, workq_threadreq_t kqr,
    kq_index_t qos, int flags)
{
	/* there must be no servicer bound or already requested */
	assert(kqr_thread(kqr) == THREAD_NULL);
	assert(!kqr_thread_requested(kqr));
	struct turnstile *ts = TURNSTILE_NULL;

	if (workq_is_exiting(kqu.kq->kq_p)) {
		/* process is tearing down its workqueue: don't ask for threads */
		return;
	}

	kqlock_held(kqu);

	if (kqu.kq->kq_state & KQ_WORKLOOP) {
		struct kqworkloop *kqwl = kqu.kqwl;

		assert(kqwl->kqwl_owner == THREAD_NULL);
		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_THREQUEST),
		    kqwl->kqwl_dynamicid, 0, qos, kqwl->kqwl_wakeup_qos);
		ts = kqwl->kqwl_turnstile;
		/* Add a thread request reference on the kqueue. */
		kqworkloop_retain(kqwl);

#if CONFIG_PREADOPT_TG
		/* This thread is the one which is ack-ing the thread group on the kqwl
		 * under the kqlock and will take action accordingly, pairs with the
		 * release barrier in kqueue_set_preadopted_thread_group */
		uint16_t tg_acknowledged;
		if (os_atomic_cmpxchgv(&kqwl->kqwl_preadopt_tg_needs_redrive,
		    KQWL_PREADOPT_TG_NEEDS_REDRIVE, KQWL_PREADOPT_TG_CLEAR_REDRIVE,
		    &tg_acknowledged, acquire)) {
			flags |= WORKQ_THREADREQ_REEVALUATE_PREADOPT_TG;
		}
#endif
	} else {
		assert(kqu.kq->kq_state & KQ_WORKQ);
		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWQ_THREQUEST), -1, 0, qos,
		    !TAILQ_EMPTY(&kqu.kqwq->kqwq_queue[kqr->tr_kq_qos_index - 1]));
	}

	/*
	 * New-style thread request supported.
	 * Provide the pthread kext a pointer to a workq_threadreq_s structure for
	 * its use until a corresponding kqueue_threadreq_bind callback.
	 */
	if (kqueue_threadreq_can_use_ast(kqu.kq)) {
		flags |= WORKQ_THREADREQ_SET_AST_ON_FAILURE;
	}
	if (qos == KQWQ_QOS_MANAGER) {
		qos = WORKQ_THREAD_QOS_MANAGER;
	}

	if (!workq_kern_threadreq_initiate(kqu.kq->kq_p, kqr, ts, qos, flags)) {
		/*
		 * Process is shutting down or exec'ing.
		 * All the kqueues are going to be cleaned up
		 * soon. Forget we even asked for a thread -
		 * and make sure we don't ask for more.
		 */
		kqu.kq->kq_state &= ~KQ_R2K_ARMED;
		kqueue_release_live(kqu);
	}
}
5052
5053 /*
5054 * kqueue_threadreq_bind_prepost - prepost the bind to kevent
5055 *
5056 * This is used when kqueue_threadreq_bind may cause a lock inversion.
5057 */
__attribute__((always_inline))
void
kqueue_threadreq_bind_prepost(struct proc *p __unused, workq_threadreq_t kqr,
    struct uthread *ut)
{
	/*
	 * Record the mutual link between the servicer uthread and the thread
	 * request, and mark the request mid-bind so that
	 * kqueue_threadreq_bind_commit() knows to finish the binding.
	 */
	ut->uu_kqr_bound = kqr;
	kqr->tr_thread = get_machthread(ut);
	kqr->tr_state = WORKQ_TR_STATE_BINDING;
}
5067
5068 /*
5069 * kqueue_threadreq_bind_commit - commit a bind prepost
5070 *
5071 * The workq code has to commit any binding prepost before the thread has
5072 * a chance to come back to userspace (and do kevent syscalls) or be aborted.
5073 */
5074 void
kqueue_threadreq_bind_commit(struct proc * p,thread_t thread)5075 kqueue_threadreq_bind_commit(struct proc *p, thread_t thread)
5076 {
5077 struct uthread *ut = get_bsdthread_info(thread);
5078 workq_threadreq_t kqr = ut->uu_kqr_bound;
5079 kqueue_t kqu = kqr_kqueue(p, kqr);
5080
5081 kqlock(kqu);
5082 if (kqr->tr_state == WORKQ_TR_STATE_BINDING) {
5083 kqueue_threadreq_bind(p, kqr, thread, 0);
5084 }
5085 kqunlock(kqu);
5086 }
5087
5088 static void
kqueue_threadreq_modify(kqueue_t kqu,workq_threadreq_t kqr,kq_index_t qos,workq_kern_threadreq_flags_t flags)5089 kqueue_threadreq_modify(kqueue_t kqu, workq_threadreq_t kqr, kq_index_t qos,
5090 workq_kern_threadreq_flags_t flags)
5091 {
5092 assert(kqr_thread_requested_pending(kqr));
5093
5094 kqlock_held(kqu);
5095
5096 if (kqueue_threadreq_can_use_ast(kqu.kq)) {
5097 flags |= WORKQ_THREADREQ_SET_AST_ON_FAILURE;
5098 }
5099
5100 #if CONFIG_PREADOPT_TG
5101 if (kqu.kq->kq_state & KQ_WORKLOOP) {
5102 uint16_t tg_ack_status;
5103 struct kqworkloop *kqwl = kqu.kqwl;
5104
5105 /* This thread is the one which is ack-ing the thread group on the kqwl
5106 * under the kqlock and will take action accordingly, needs acquire
5107 * barrier */
5108 if (os_atomic_cmpxchgv(&kqwl->kqwl_preadopt_tg_needs_redrive, KQWL_PREADOPT_TG_NEEDS_REDRIVE,
5109 KQWL_PREADOPT_TG_CLEAR_REDRIVE, &tg_ack_status, acquire)) {
5110 flags |= WORKQ_THREADREQ_REEVALUATE_PREADOPT_TG;
5111 }
5112 }
5113 #endif
5114
5115 workq_kern_threadreq_modify(kqu.kq->kq_p, kqr, qos, flags);
5116 }
5117
5118 /*
5119 * kqueue_threadreq_bind - bind thread to processing kqrequest
5120 *
5121 * The provided thread will be responsible for delivering events
5122 * associated with the given kqrequest. Bind it and get ready for
5123 * the thread to eventually arrive.
5124 */
5125 void
kqueue_threadreq_bind(struct proc * p,workq_threadreq_t kqr,thread_t thread,unsigned int flags)5126 kqueue_threadreq_bind(struct proc *p, workq_threadreq_t kqr, thread_t thread,
5127 unsigned int flags)
5128 {
5129 kqueue_t kqu = kqr_kqueue(p, kqr);
5130 struct uthread *ut = get_bsdthread_info(thread);
5131
5132 kqlock_held(kqu);
5133
5134 assert(ut->uu_kqueue_override == 0);
5135
5136 if (kqr->tr_state == WORKQ_TR_STATE_BINDING) {
5137 assert(ut->uu_kqr_bound == kqr);
5138 assert(kqr->tr_thread == thread);
5139 } else {
5140 assert(kqr_thread_requested_pending(kqr));
5141 assert(kqr->tr_thread == THREAD_NULL);
5142 assert(ut->uu_kqr_bound == NULL);
5143 ut->uu_kqr_bound = kqr;
5144 kqr->tr_thread = thread;
5145 }
5146
5147 kqr->tr_state = WORKQ_TR_STATE_BOUND;
5148
5149 if (kqu.kq->kq_state & KQ_WORKLOOP) {
5150 struct turnstile *ts = kqu.kqwl->kqwl_turnstile;
5151
5152 if (__improbable(thread == kqu.kqwl->kqwl_owner)) {
5153 /*
5154 * <rdar://problem/38626999> shows that asserting here is not ok.
5155 *
5156 * This is not supposed to happen for correct use of the interface,
5157 * but it is sadly possible for userspace (with the help of memory
5158 * corruption, such as over-release of a dispatch queue) to make
5159 * the creator thread the "owner" of a workloop.
5160 *
5161 * Once that happens, and that creator thread picks up the same
5162 * workloop as a servicer, we trip this codepath. We need to fixup
5163 * the state to forget about this thread being the owner, as the
5164 * entire workloop state machine expects servicers to never be
5165 * owners and everything would basically go downhill from here.
5166 */
5167 kqu.kqwl->kqwl_owner = THREAD_NULL;
5168 if (kqworkloop_override(kqu.kqwl)) {
5169 thread_drop_kevent_override(thread);
5170 }
5171 }
5172
5173 if (ts && (flags & KQUEUE_THREADERQ_BIND_NO_INHERITOR_UPDATE) == 0) {
5174 /*
5175 * Past this point, the interlock is the kq req lock again,
5176 * so we can fix the inheritor for good.
5177 */
5178 filt_wlupdate_inheritor(kqu.kqwl, ts, TURNSTILE_IMMEDIATE_UPDATE);
5179 turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
5180 }
5181
5182 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_BIND), kqu.kqwl->kqwl_dynamicid,
5183 thread_tid(thread), kqr->tr_kq_qos_index,
5184 (kqr->tr_kq_override_index << 16) | kqwl->kqwl_wakeup_qos);
5185
5186 ut->uu_kqueue_override = kqr->tr_kq_override_index;
5187 if (kqr->tr_kq_override_index) {
5188 thread_add_servicer_override(thread, kqr->tr_kq_override_index);
5189 }
5190
5191 #if CONFIG_PREADOPT_TG
5192 /* Remove reference from kqwl and mark it as bound with the SENTINEL */
5193 thread_group_qos_t old_tg;
5194 thread_group_qos_t new_tg;
5195 int ret = os_atomic_rmw_loop(kqr_preadopt_thread_group_addr(kqr), old_tg, new_tg, relaxed, {
5196 if (old_tg == KQWL_PREADOPTED_TG_NEVER) {
5197 os_atomic_rmw_loop_give_up(break); // It's an app, nothing to do
5198 }
5199 assert(old_tg != KQWL_PREADOPTED_TG_PROCESSED);
5200 new_tg = KQWL_PREADOPTED_TG_SENTINEL;
5201 });
5202
5203 if (ret) {
5204 KQWL_PREADOPT_TG_HISTORY_WRITE_ENTRY(kqu.kqwl, KQWL_PREADOPT_OP_SERVICER_BIND, old_tg, new_tg);
5205
5206 if (KQWL_HAS_VALID_PREADOPTED_TG(old_tg)) {
5207 struct thread_group *tg = KQWL_GET_PREADOPTED_TG(old_tg);
5208 assert(tg != NULL);
5209
5210 thread_set_preadopt_thread_group(thread, tg);
5211 thread_group_release_live(tg); // The thread has a reference
5212 } else {
5213 /*
5214 * The thread may already have a preadopt thread group on it -
5215 * we need to make sure to clear that.
5216 */
5217 thread_set_preadopt_thread_group(thread, NULL);
5218 }
5219
5220 /* We have taken action on the preadopted thread group set on the
5221 * set on the kqwl, clear any redrive requests */
5222 os_atomic_store(&kqu.kqwl->kqwl_preadopt_tg_needs_redrive, KQWL_PREADOPT_TG_CLEAR_REDRIVE, relaxed);
5223 }
5224 #endif
5225 kqueue_update_iotier_override(kqu);
5226 } else {
5227 assert(kqr->tr_kq_override_index == 0);
5228
5229 #if CONFIG_PREADOPT_TG
5230 /*
5231 * The thread may have a preadopt thread group on it already because it
5232 * got tagged with it as a creator thread. So we need to make sure to
5233 * clear that since we don't have preadopt thread groups for non-kqwl
5234 * cases
5235 */
5236 thread_set_preadopt_thread_group(thread, NULL);
5237 #endif
5238 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWQ_BIND), -1,
5239 thread_tid(thread), kqr->tr_kq_qos_index,
5240 (kqr->tr_kq_override_index << 16) |
5241 !TAILQ_EMPTY(&kqu.kqwq->kqwq_queue[kqr->tr_kq_qos_index - 1]));
5242 }
5243 }
5244
5245 /*
5246 * kqueue_threadreq_cancel - abort a pending thread request
5247 *
5248 * Called when exiting/exec'ing. Forget our pending request.
5249 */
5250 void
kqueue_threadreq_cancel(struct proc * p,workq_threadreq_t kqr)5251 kqueue_threadreq_cancel(struct proc *p, workq_threadreq_t kqr)
5252 {
5253 kqueue_release(kqr_kqueue(p, kqr));
5254 }
5255
5256 workq_threadreq_param_t
kqueue_threadreq_workloop_param(workq_threadreq_t kqr)5257 kqueue_threadreq_workloop_param(workq_threadreq_t kqr)
5258 {
5259 struct kqworkloop *kqwl;
5260 workq_threadreq_param_t trp;
5261
5262 assert(kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP);
5263 kqwl = __container_of(kqr, struct kqworkloop, kqwl_request);
5264 trp.trp_value = kqwl->kqwl_params;
5265 return trp;
5266 }
5267
5268 /*
5269 * kqueue_threadreq_unbind - unbind thread from processing kqueue
5270 *
5271 * End processing the per-QoS bucket of events and allow other threads
5272 * to be requested for future servicing.
5273 *
5274 * caller holds a reference on the kqueue.
5275 */
5276 void
kqueue_threadreq_unbind(struct proc * p,workq_threadreq_t kqr)5277 kqueue_threadreq_unbind(struct proc *p, workq_threadreq_t kqr)
5278 {
5279 if (kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
5280 kqworkloop_unbind(kqr_kqworkloop(kqr));
5281 } else {
5282 kqworkq_unbind(p, kqr);
5283 }
5284 }
5285
5286 /*
5287 * If we aren't already busy processing events [for this QoS],
5288 * request workq thread support as appropriate.
5289 *
5290 * TBD - for now, we don't segregate out processing by QoS.
5291 *
5292 * - May be called with the kqueue's wait queue set locked,
5293 * so cannot do anything that could recurse on that.
5294 */
5295 static void
kqworkq_wakeup(struct kqworkq * kqwq,kq_index_t qos_index)5296 kqworkq_wakeup(struct kqworkq *kqwq, kq_index_t qos_index)
5297 {
5298 workq_threadreq_t kqr = kqworkq_get_request(kqwq, qos_index);
5299
5300 /* convert to thread qos value */
5301 assert(qos_index > 0 && qos_index <= KQWQ_NBUCKETS);
5302
5303 if (!kqr_thread_requested(kqr)) {
5304 kqueue_threadreq_initiate(&kqwq->kqwq_kqueue, kqr, qos_index, 0);
5305 }
5306 }
5307
5308 /*
5309 * This represent the asynchronous QoS a given workloop contributes,
5310 * hence is the max of the current active knotes (override index)
5311 * and the workloop max qos (userspace async qos).
5312 */
5313 static kq_index_t
kqworkloop_override(struct kqworkloop * kqwl)5314 kqworkloop_override(struct kqworkloop *kqwl)
5315 {
5316 workq_threadreq_t kqr = &kqwl->kqwl_request;
5317 return MAX(kqr->tr_kq_qos_index, kqr->tr_kq_override_index);
5318 }
5319
5320 static inline void
kqworkloop_request_fire_r2k_notification(struct kqworkloop * kqwl)5321 kqworkloop_request_fire_r2k_notification(struct kqworkloop *kqwl)
5322 {
5323 workq_threadreq_t kqr = &kqwl->kqwl_request;
5324
5325 kqlock_held(kqwl);
5326
5327 if (kqwl->kqwl_state & KQ_R2K_ARMED) {
5328 kqwl->kqwl_state &= ~KQ_R2K_ARMED;
5329 act_set_astkevent(kqr_thread_fast(kqr), AST_KEVENT_RETURN_TO_KERNEL);
5330 }
5331 }
5332
/*
 * Apply a QoS-affecting operation (op) to a workloop and propagate the
 * resulting override diff to the owner thread, the servicer thread, or the
 * pending thread request, as appropriate.
 */
static void
kqworkloop_update_threads_qos(struct kqworkloop *kqwl, int op, kq_index_t qos)
{
	workq_threadreq_t kqr = &kqwl->kqwl_request;
	struct kqueue *kq = &kqwl->kqwl_kqueue;
	/* snapshot the override before the operation so we can diff below */
	kq_index_t old_override = kqworkloop_override(kqwl);

	kqlock_held(kqwl);

	switch (op) {
	case KQWL_UTQ_UPDATE_WAKEUP_QOS:
		/* a wakeup raised the pending-event QoS; may notify the servicer */
		kqwl->kqwl_wakeup_qos = qos;
		kqworkloop_request_fire_r2k_notification(kqwl);
		goto recompute;

	case KQWL_UTQ_RESET_WAKEUP_OVERRIDE:
		kqr->tr_kq_override_index = qos;
		goto recompute;

	case KQWL_UTQ_PARKING:
	case KQWL_UTQ_UNBINDING:
		kqr->tr_kq_override_index = qos;
		OS_FALLTHROUGH;

	case KQWL_UTQ_RECOMPUTE_WAKEUP_QOS:
		if (op == KQWL_UTQ_RECOMPUTE_WAKEUP_QOS) {
			assert(qos == THREAD_QOS_UNSPECIFIED);
		}
		/* with no suppressed knotes, nothing keeps the override alive */
		if (TAILQ_EMPTY(&kqwl->kqwl_suppressed)) {
			kqr->tr_kq_override_index = THREAD_QOS_UNSPECIFIED;
		}
		/* rescan buckets high-to-low for the highest non-empty queue */
		kqwl->kqwl_wakeup_qos = 0;
		for (kq_index_t i = KQWL_NBUCKETS; i > 0; i--) {
			if (!TAILQ_EMPTY(&kqwl->kqwl_queue[i - 1])) {
				kqwl->kqwl_wakeup_qos = i;
				kqworkloop_request_fire_r2k_notification(kqwl);
				break;
			}
		}
		OS_FALLTHROUGH;

	case KQWL_UTQ_UPDATE_WAKEUP_OVERRIDE:
recompute:
		/*
		 * When modifying the wakeup QoS or the override QoS, we always need to
		 * maintain our invariant that kqr_override_index is at least as large
		 * as the highest QoS for which an event is fired.
		 *
		 * However this override index can be larger when there is an overriden
		 * suppressed knote pushing on the kqueue.
		 */
		if (qos < kqwl->kqwl_wakeup_qos) {
			qos = kqwl->kqwl_wakeup_qos;
		}
		if (kqr->tr_kq_override_index < qos) {
			kqr->tr_kq_override_index = qos;
		}
		break;

	case KQWL_UTQ_REDRIVE_EVENTS:
		/* no state change; fall through to the diff/redrive logic below */
		break;

	case KQWL_UTQ_SET_QOS_INDEX:
		kqr->tr_kq_qos_index = qos;
		break;

	default:
		panic("unknown kqwl thread qos update operation: %d", op);
	}

	thread_t kqwl_owner = kqwl->kqwl_owner;
	thread_t servicer = kqr_thread(kqr);
	boolean_t qos_changed = FALSE;
	kq_index_t new_override = kqworkloop_override(kqwl);

	/*
	 * Apply the diffs to the owner if applicable
	 */
	if (kqwl_owner) {
#if 0
		/* JMM - need new trace hooks for owner overrides */
		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_THADJUST),
		    kqwl->kqwl_dynamicid, thread_tid(kqwl_owner), kqr->tr_kq_qos_index,
		    (kqr->tr_kq_override_index << 16) | kqwl->kqwl_wakeup_qos);
#endif
		if (new_override == old_override) {
			// nothing to do
		} else if (old_override == THREAD_QOS_UNSPECIFIED) {
			thread_add_kevent_override(kqwl_owner, new_override);
		} else if (new_override == THREAD_QOS_UNSPECIFIED) {
			thread_drop_kevent_override(kqwl_owner);
		} else { /* old_override != new_override */
			thread_update_kevent_override(kqwl_owner, new_override);
		}
	}

	/*
	 * apply the diffs to the servicer
	 */

	if (!kqr_thread_requested(kqr)) {
		/*
		 * No servicer, nor thread-request
		 *
		 * Make a new thread request, unless there is an owner (or the workloop
		 * is suspended in userland) or if there is no asynchronous work in the
		 * first place.
		 */

		if (kqwl_owner == NULL && kqwl->kqwl_wakeup_qos) {
			int initiate_flags = 0;
			if (op == KQWL_UTQ_UNBINDING) {
				initiate_flags = WORKQ_THREADREQ_ATTEMPT_REBIND;
			}

			/* kqueue_threadreq_initiate handles the acknowledgement of the TG
			 * if needed */
			kqueue_threadreq_initiate(kq, kqr, new_override, initiate_flags);
		}
	} else if (servicer) {
		/*
		 * Servicer in flight
		 *
		 * Just apply the diff to the servicer
		 */

#if CONFIG_PREADOPT_TG
		/* When there's a servicer for the kqwl already, then the servicer will
		 * adopt the thread group in the kqr, we don't need to poke the
		 * workqueue subsystem to make different decisions due to the thread
		 * group. Consider the current request ack-ed.
		 */
		os_atomic_store(&kqwl->kqwl_preadopt_tg_needs_redrive, KQWL_PREADOPT_TG_CLEAR_REDRIVE, relaxed);
#endif

		struct uthread *ut = get_bsdthread_info(servicer);
		if (ut->uu_kqueue_override != new_override) {
			if (ut->uu_kqueue_override == THREAD_QOS_UNSPECIFIED) {
				thread_add_servicer_override(servicer, new_override);
			} else if (new_override == THREAD_QOS_UNSPECIFIED) {
				thread_drop_servicer_override(servicer);
			} else { /* ut->uu_kqueue_override != new_override */
				thread_update_servicer_override(servicer, new_override);
			}
			ut->uu_kqueue_override = new_override;
			qos_changed = TRUE;
		}
	} else if (new_override == THREAD_QOS_UNSPECIFIED) {
		/*
		 * No events to deliver anymore.
		 *
		 * However canceling with turnstiles is challenging, so the fact that
		 * the request isn't useful will be discovered by the servicer himself
		 * later on.
		 */
	} else if (old_override != new_override) {
		/*
		 * Request is in flight
		 *
		 * Apply the diff to the thread request.
		 */
		kqueue_threadreq_modify(kq, kqr, new_override, WORKQ_THREADREQ_NONE);
		qos_changed = TRUE;
	}

	if (qos_changed) {
		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_THADJUST), kqwl->kqwl_dynamicid,
		    thread_tid(servicer), kqr->tr_kq_qos_index,
		    (kqr->tr_kq_override_index << 16) | kqwl->kqwl_wakeup_qos);
	}
}
5504
5505 static void
kqworkloop_update_iotier_override(struct kqworkloop * kqwl)5506 kqworkloop_update_iotier_override(struct kqworkloop *kqwl)
5507 {
5508 workq_threadreq_t kqr = &kqwl->kqwl_request;
5509 thread_t servicer = kqr_thread(kqr);
5510 uint8_t iotier = os_atomic_load(&kqwl->kqwl_iotier_override, relaxed);
5511
5512 kqlock_held(kqwl);
5513
5514 if (servicer) {
5515 thread_update_servicer_iotier_override(servicer, iotier);
5516 }
5517 }
5518
5519 static void
kqworkloop_wakeup(struct kqworkloop * kqwl,kq_index_t qos)5520 kqworkloop_wakeup(struct kqworkloop *kqwl, kq_index_t qos)
5521 {
5522 if (qos <= kqwl->kqwl_wakeup_qos) {
5523 /*
5524 * Shortcut wakeups that really do nothing useful
5525 */
5526 return;
5527 }
5528
5529 if ((kqwl->kqwl_state & KQ_PROCESSING) &&
5530 kqr_thread(&kqwl->kqwl_request) == current_thread()) {
5531 /*
5532 * kqworkloop_end_processing() will perform the required QoS
5533 * computations when it unsets the processing mode.
5534 */
5535 return;
5536 }
5537
5538 kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_UPDATE_WAKEUP_QOS, qos);
5539 }
5540
5541 static struct kqtailq *
kqueue_get_suppressed_queue(kqueue_t kq,struct knote * kn)5542 kqueue_get_suppressed_queue(kqueue_t kq, struct knote *kn)
5543 {
5544 if (kq.kq->kq_state & KQ_WORKLOOP) {
5545 return &kq.kqwl->kqwl_suppressed;
5546 } else if (kq.kq->kq_state & KQ_WORKQ) {
5547 return &kq.kqwq->kqwq_suppressed[kn->kn_qos_index - 1];
5548 } else {
5549 return &kq.kqf->kqf_suppressed;
5550 }
5551 }
5552
/*
 * Return the workloop's turnstile, allocating and publishing it on first use.
 * Non-workloop kqueues never have a turnstile (TURNSTILE_NULL).
 */
struct turnstile *
kqueue_alloc_turnstile(kqueue_t kqu)
{
	struct kqworkloop *kqwl = kqu.kqwl;
	kq_state_t kq_state;

	/* fast path: unlocked check whether a turnstile was already published */
	kq_state = os_atomic_load(&kqu.kq->kq_state, dependency);
	if (kq_state & KQ_HAS_TURNSTILE) {
		/* force a dependency to pair with the atomic or with release below */
		return os_atomic_load_with_dependency_on(&kqwl->kqwl_turnstile,
		           (uintptr_t)kq_state);
	}

	if (!(kq_state & KQ_WORKLOOP)) {
		return TURNSTILE_NULL;
	}

	/* allocate optimistically outside the lock; may be freed if we lose the race */
	struct turnstile *ts = turnstile_alloc(), *free_ts = TURNSTILE_NULL;
	bool workq_locked = false;

	kqlock(kqu);

	/* the workq lock may be the current turnstile interlock; take it if so */
	if (filt_wlturnstile_interlock_is_workq(kqwl)) {
		workq_locked = true;
		workq_kern_threadreq_lock(kqwl->kqwl_p);
	}

	if (kqwl->kqwl_state & KQ_HAS_TURNSTILE) {
		/* lost the race: another thread published a turnstile first */
		free_ts = ts;
		ts = kqwl->kqwl_turnstile;
	} else {
		ts = turnstile_prepare((uintptr_t)kqwl, &kqwl->kqwl_turnstile,
		    ts, TURNSTILE_WORKLOOPS);

		/* release-barrier to pair with the unlocked load of kqwl_turnstile above */
		os_atomic_or(&kqwl->kqwl_state, KQ_HAS_TURNSTILE, release);

		if (filt_wlturnstile_interlock_is_workq(kqwl)) {
			workq_kern_threadreq_update_inheritor(kqwl->kqwl_p,
			    &kqwl->kqwl_request, kqwl->kqwl_owner,
			    ts, TURNSTILE_IMMEDIATE_UPDATE);
			/*
			 * The workq may no longer be the interlock after this.
			 * In which case the inheritor wasn't updated.
			 */
		}
		if (!filt_wlturnstile_interlock_is_workq(kqwl)) {
			filt_wlupdate_inheritor(kqwl, ts, TURNSTILE_IMMEDIATE_UPDATE);
		}
	}

	if (workq_locked) {
		workq_kern_threadreq_unlock(kqwl->kqwl_p);
	}

	kqunlock(kqu);

	if (free_ts) {
		/* we lost the publication race; drop our speculative allocation */
		turnstile_deallocate(free_ts);
	} else {
		turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_NOT_HELD);
	}
	return ts;
}
5617
5618 __attribute__((always_inline))
5619 struct turnstile *
kqueue_turnstile(kqueue_t kqu)5620 kqueue_turnstile(kqueue_t kqu)
5621 {
5622 kq_state_t kq_state = os_atomic_load(&kqu.kq->kq_state, relaxed);
5623 if (kq_state & KQ_WORKLOOP) {
5624 return os_atomic_load(&kqu.kqwl->kqwl_turnstile, relaxed);
5625 }
5626 return TURNSTILE_NULL;
5627 }
5628
5629 __attribute__((always_inline))
5630 struct turnstile *
kqueue_threadreq_get_turnstile(workq_threadreq_t kqr)5631 kqueue_threadreq_get_turnstile(workq_threadreq_t kqr)
5632 {
5633 struct kqworkloop *kqwl = kqr_kqworkloop(kqr);
5634 if (kqwl) {
5635 return os_atomic_load(&kqwl->kqwl_turnstile, relaxed);
5636 }
5637 return TURNSTILE_NULL;
5638 }
5639
5640 static void
kqworkloop_set_overcommit(struct kqworkloop * kqwl)5641 kqworkloop_set_overcommit(struct kqworkloop *kqwl)
5642 {
5643 workq_threadreq_t kqr = &kqwl->kqwl_request;
5644
5645 /*
5646 * This test is racy, but since we never remove this bit,
5647 * it allows us to avoid taking a lock.
5648 */
5649 if (kqr->tr_flags & WORKQ_TR_FLAG_OVERCOMMIT) {
5650 return;
5651 }
5652
5653 kqlock_held(kqwl);
5654
5655 if (kqr_thread_requested_pending(kqr)) {
5656 kqueue_threadreq_modify(kqwl, kqr, kqr->tr_qos,
5657 WORKQ_THREADREQ_MAKE_OVERCOMMIT);
5658 } else {
5659 kqr->tr_flags |= WORKQ_TR_FLAG_OVERCOMMIT;
5660 }
5661 }
5662
5663 static void
kqworkq_update_override(struct kqworkq * kqwq,struct knote * kn,kq_index_t override_index)5664 kqworkq_update_override(struct kqworkq *kqwq, struct knote *kn,
5665 kq_index_t override_index)
5666 {
5667 workq_threadreq_t kqr;
5668 kq_index_t old_override_index;
5669 kq_index_t queue_index = kn->kn_qos_index;
5670
5671 if (override_index <= queue_index) {
5672 return;
5673 }
5674
5675 kqr = kqworkq_get_request(kqwq, queue_index);
5676
5677 kqlock_held(kqwq);
5678
5679 old_override_index = kqr->tr_kq_override_index;
5680 if (override_index > MAX(kqr->tr_kq_qos_index, old_override_index)) {
5681 thread_t servicer = kqr_thread(kqr);
5682 kqr->tr_kq_override_index = override_index;
5683
5684 /* apply the override to [incoming?] servicing thread */
5685 if (servicer) {
5686 if (old_override_index) {
5687 thread_update_kevent_override(servicer, override_index);
5688 } else {
5689 thread_add_kevent_override(servicer, override_index);
5690 }
5691 }
5692 }
5693 }
5694
5695 static void
kqueue_update_iotier_override(kqueue_t kqu)5696 kqueue_update_iotier_override(kqueue_t kqu)
5697 {
5698 if (kqu.kq->kq_state & KQ_WORKLOOP) {
5699 kqworkloop_update_iotier_override(kqu.kqwl);
5700 }
5701 }
5702
5703 static void
kqueue_update_override(kqueue_t kqu,struct knote * kn,thread_qos_t qos)5704 kqueue_update_override(kqueue_t kqu, struct knote *kn, thread_qos_t qos)
5705 {
5706 if (kqu.kq->kq_state & KQ_WORKLOOP) {
5707 kqworkloop_update_threads_qos(kqu.kqwl, KQWL_UTQ_UPDATE_WAKEUP_OVERRIDE,
5708 qos);
5709 } else {
5710 kqworkq_update_override(kqu.kqwq, kn, qos);
5711 }
5712 }
5713
/*
 * Detach the servicer thread from a workloop under the kqlock: clears the
 * uthread<->request pairing, drops or defers the servicer override, resets
 * the turnstile inheritor, and releases any preadopted thread group.
 */
static void
kqworkloop_unbind_locked(struct kqworkloop *kqwl, thread_t thread,
    enum kqwl_unbind_locked_mode how)
{
	struct uthread *ut = get_bsdthread_info(thread);
	workq_threadreq_t kqr = &kqwl->kqwl_request;

	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_UNBIND), kqwl->kqwl_dynamicid,
	    thread_tid(thread), 0, 0);

	kqlock_held(kqwl);

	assert(ut->uu_kqr_bound == kqr);
	ut->uu_kqr_bound = NULL;
	/*
	 * In DROP_IMMEDIATELY mode the servicer override comes off right now;
	 * otherwise the caller drops it later via
	 * kqworkloop_unbind_delayed_override_drop().
	 */
	if (how == KQWL_OVERRIDE_DROP_IMMEDIATELY &&
	    ut->uu_kqueue_override != THREAD_QOS_UNSPECIFIED) {
		thread_drop_servicer_override(thread);
		ut->uu_kqueue_override = THREAD_QOS_UNSPECIFIED;
	}

	/* with no owner left either, the turnstile has no inheritor anymore */
	if (kqwl->kqwl_owner == NULL && kqwl->kqwl_turnstile) {
		turnstile_update_inheritor(kqwl->kqwl_turnstile,
		    TURNSTILE_INHERITOR_NULL, TURNSTILE_IMMEDIATE_UPDATE);
		turnstile_update_inheritor_complete(kqwl->kqwl_turnstile,
		    TURNSTILE_INTERLOCK_HELD);
	}

#if CONFIG_PREADOPT_TG
	/* The kqueue is able to adopt a thread group again */

	thread_group_qos_t old_tg, new_tg = NULL;
	int ret = os_atomic_rmw_loop(kqr_preadopt_thread_group_addr(kqr), old_tg, new_tg, relaxed, {
		new_tg = old_tg;
		if (old_tg == KQWL_PREADOPTED_TG_SENTINEL || old_tg == KQWL_PREADOPTED_TG_PROCESSED) {
			new_tg = KQWL_PREADOPTED_TG_NULL;
		}
	});
	KQWL_PREADOPT_TG_HISTORY_WRITE_ENTRY(kqwl, KQWL_PREADOPT_OP_SERVICER_UNBIND, old_tg, KQWL_PREADOPTED_TG_NULL);

	if (ret) {
		// Servicer can drop any preadopt thread group it has since it has
		// unbound.
		thread_set_preadopt_thread_group(thread, NULL);
	}
#endif
	/* THROTTLE_LEVEL_END removes any servicer IO tier override */
	thread_update_servicer_iotier_override(thread, THROTTLE_LEVEL_END);

	/* request goes back to idle, ready for a future bind */
	kqr->tr_thread = THREAD_NULL;
	kqr->tr_state = WORKQ_TR_STATE_IDLE;
	kqwl->kqwl_state &= ~KQ_R2K_ARMED;
}
5765
5766 static void
kqworkloop_unbind_delayed_override_drop(thread_t thread)5767 kqworkloop_unbind_delayed_override_drop(thread_t thread)
5768 {
5769 struct uthread *ut = get_bsdthread_info(thread);
5770 assert(ut->uu_kqr_bound == NULL);
5771 if (ut->uu_kqueue_override != THREAD_QOS_UNSPECIFIED) {
5772 thread_drop_servicer_override(thread);
5773 ut->uu_kqueue_override = THREAD_QOS_UNSPECIFIED;
5774 }
5775 }
5776
5777 /*
5778 * kqworkloop_unbind - Unbind the servicer thread of a workloop kqueue
5779 *
5780 * It will acknowledge events, and possibly request a new thread if:
5781 * - there were active events left
5782 * - we pended waitq hook callouts during processing
5783 * - we pended wakeups while processing (or unsuppressing)
5784 *
5785 * Called with kqueue lock held.
5786 */
static void
kqworkloop_unbind(struct kqworkloop *kqwl)
{
	struct kqueue *kq = &kqwl->kqwl_kqueue;
	workq_threadreq_t kqr = &kqwl->kqwl_request;
	thread_t thread = kqr_thread_fast(kqr);
	int op = KQWL_UTQ_PARKING;
	kq_index_t qos_override = THREAD_QOS_UNSPECIFIED;

	/* only the bound servicer may unbind itself */
	assert(thread == current_thread());

	kqlock(kqwl);

	/*
	 * Forcing the KQ_PROCESSING flag allows for QoS updates because of
	 * unsuppressing knotes not to be applied until the eventual call to
	 * kqworkloop_update_threads_qos() below.
	 */
	assert((kq->kq_state & KQ_PROCESSING) == 0);
	if (!TAILQ_EMPTY(&kqwl->kqwl_suppressed)) {
		kq->kq_state |= KQ_PROCESSING;
		qos_override = kqworkloop_acknowledge_events(kqwl);
		kq->kq_state &= ~KQ_PROCESSING;
	}

	/* DROP_DELAYED: the servicer override is released after the QoS update */
	kqworkloop_unbind_locked(kqwl, thread, KQWL_OVERRIDE_DROP_DELAYED);
	kqworkloop_update_threads_qos(kqwl, op, qos_override);

	kqunlock(kqwl);

	/*
	 * Drop the override on the current thread last, after the call to
	 * kqworkloop_update_threads_qos above.
	 */
	kqworkloop_unbind_delayed_override_drop(thread);

	/* If last reference, dealloc the workloop kq */
	kqworkloop_release(kqwl);
}
5826
5827 static thread_qos_t
kqworkq_unbind_locked(struct kqworkq * kqwq,workq_threadreq_t kqr,thread_t thread)5828 kqworkq_unbind_locked(struct kqworkq *kqwq,
5829 workq_threadreq_t kqr, thread_t thread)
5830 {
5831 struct uthread *ut = get_bsdthread_info(thread);
5832 kq_index_t old_override = kqr->tr_kq_override_index;
5833
5834 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWQ_UNBIND), -1,
5835 thread_tid(kqr_thread(kqr)), kqr->tr_kq_qos_index, 0);
5836
5837 kqlock_held(kqwq);
5838
5839 assert(ut->uu_kqr_bound == kqr);
5840 ut->uu_kqr_bound = NULL;
5841 kqr->tr_thread = THREAD_NULL;
5842 kqr->tr_state = WORKQ_TR_STATE_IDLE;
5843 kqr->tr_kq_override_index = THREAD_QOS_UNSPECIFIED;
5844 kqwq->kqwq_state &= ~KQ_R2K_ARMED;
5845
5846 return old_override;
5847 }
5848
5849 /*
5850 * kqworkq_unbind - unbind of a workq kqueue from a thread
5851 *
5852 * We may have to request new threads.
5853 * This can happen there are no waiting processing threads and:
5854 * - there were active events we never got to (count > 0)
5855 * - we pended waitq hook callouts during processing
5856 * - we pended wakeups while processing (or unsuppressing)
5857 */
5858 static void
kqworkq_unbind(proc_t p,workq_threadreq_t kqr)5859 kqworkq_unbind(proc_t p, workq_threadreq_t kqr)
5860 {
5861 struct kqworkq *kqwq = (struct kqworkq *)p->p_fd.fd_wqkqueue;
5862 __assert_only int rc;
5863
5864 kqlock(kqwq);
5865 rc = kqworkq_acknowledge_events(kqwq, kqr, 0, KQWQAE_UNBIND);
5866 assert(rc == -1);
5867 kqunlock(kqwq);
5868 }
5869
5870 workq_threadreq_t
kqworkq_get_request(struct kqworkq * kqwq,kq_index_t qos_index)5871 kqworkq_get_request(struct kqworkq *kqwq, kq_index_t qos_index)
5872 {
5873 assert(qos_index > 0 && qos_index <= KQWQ_NBUCKETS);
5874 return &kqwq->kqwq_request[qos_index - 1];
5875 }
5876
5877 static void
knote_reset_priority(kqueue_t kqu,struct knote * kn,pthread_priority_t pp)5878 knote_reset_priority(kqueue_t kqu, struct knote *kn, pthread_priority_t pp)
5879 {
5880 kq_index_t qos = _pthread_priority_thread_qos(pp);
5881
5882 if (kqu.kq->kq_state & KQ_WORKLOOP) {
5883 assert((pp & _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG) == 0);
5884 pp = _pthread_priority_normalize(pp);
5885 } else if (kqu.kq->kq_state & KQ_WORKQ) {
5886 if (qos == THREAD_QOS_UNSPECIFIED) {
5887 /* On workqueues, outside of QoS means MANAGER */
5888 qos = KQWQ_QOS_MANAGER;
5889 pp = _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG;
5890 } else {
5891 pp = _pthread_priority_normalize(pp);
5892 }
5893 } else {
5894 pp = _pthread_unspecified_priority();
5895 qos = THREAD_QOS_UNSPECIFIED;
5896 }
5897
5898 kn->kn_qos = (int32_t)pp;
5899
5900 if ((kn->kn_status & KN_MERGE_QOS) == 0 || qos > kn->kn_qos_override) {
5901 /* Never lower QoS when in "Merge" mode */
5902 kn->kn_qos_override = qos;
5903 }
5904
5905 /* only adjust in-use qos index when not suppressed */
5906 if (kn->kn_status & KN_SUPPRESSED) {
5907 kqueue_update_override(kqu, kn, qos);
5908 } else if (kn->kn_qos_index != qos) {
5909 knote_dequeue(kqu, kn);
5910 kn->kn_qos_index = qos;
5911 }
5912 }
5913
5914 static void
knote_adjust_qos(struct kqueue * kq,struct knote * kn,int result)5915 knote_adjust_qos(struct kqueue *kq, struct knote *kn, int result)
5916 {
5917 thread_qos_t qos_index = (result >> FILTER_ADJUST_EVENT_QOS_SHIFT) & 7;
5918
5919 kqlock_held(kq);
5920
5921 assert(result & FILTER_ADJUST_EVENT_QOS_BIT);
5922 assert(qos_index < THREAD_QOS_LAST);
5923
5924 /*
5925 * Early exit for knotes that should not change QoS
5926 */
5927 if (__improbable(!knote_fops(kn)->f_adjusts_qos)) {
5928 panic("filter %d cannot change QoS", kn->kn_filtid);
5929 } else if (__improbable(!knote_has_qos(kn))) {
5930 return;
5931 }
5932
5933 /*
5934 * knotes with the FALLBACK flag will only use their registration QoS if the
5935 * incoming event has no QoS, else, the registration QoS acts as a floor.
5936 */
5937 thread_qos_t req_qos = _pthread_priority_thread_qos_fast(kn->kn_qos);
5938 if (kn->kn_qos & _PTHREAD_PRIORITY_FALLBACK_FLAG) {
5939 if (qos_index == THREAD_QOS_UNSPECIFIED) {
5940 qos_index = req_qos;
5941 }
5942 } else {
5943 if (qos_index < req_qos) {
5944 qos_index = req_qos;
5945 }
5946 }
5947 if ((kn->kn_status & KN_MERGE_QOS) && (qos_index < kn->kn_qos_override)) {
5948 /* Never lower QoS when in "Merge" mode */
5949 return;
5950 }
5951
5952 if ((kn->kn_status & KN_LOCKED) && (kn->kn_status & KN_POSTING)) {
5953 /*
5954 * When we're trying to update the QoS override and that both an
5955 * f_event() and other f_* calls are running concurrently, any of these
5956 * in flight calls may want to perform overrides that aren't properly
5957 * serialized with each other.
5958 *
5959 * The first update that observes this racy situation enters a "Merge"
5960 * mode which causes subsequent override requests to saturate the
5961 * override instead of replacing its value.
5962 *
5963 * This mode is left when knote_unlock() or knote_post()
5964 * observe that no other f_* routine is in flight.
5965 */
5966 kn->kn_status |= KN_MERGE_QOS;
5967 }
5968
5969 /*
5970 * Now apply the override if it changed.
5971 */
5972
5973 if (kn->kn_qos_override == qos_index) {
5974 return;
5975 }
5976
5977 kn->kn_qos_override = qos_index;
5978
5979 if (kn->kn_status & KN_SUPPRESSED) {
5980 /*
5981 * For suppressed events, the kn_qos_index field cannot be touched as it
5982 * allows us to know on which supress queue the knote is for a kqworkq.
5983 *
5984 * Also, there's no natural push applied on the kqueues when this field
5985 * changes anyway. We hence need to apply manual overrides in this case,
5986 * which will be cleared when the events are later acknowledged.
5987 */
5988 kqueue_update_override(kq, kn, qos_index);
5989 } else if (kn->kn_qos_index != qos_index) {
5990 knote_dequeue(kq, kn);
5991 kn->kn_qos_index = qos_index;
5992 }
5993 }
5994
/*
 * Initialize a knote list to empty.
 */
void
klist_init(struct klist *list)
{
	SLIST_INIT(list);
}
6000
6001
6002 /*
6003 * Query/Post each knote in the object's list
6004 *
6005 * The object lock protects the list. It is assumed that the filter/event
6006 * routine for the object can determine that the object is already locked (via
6007 * the hint) and not deadlock itself.
6008 *
6009 * Autodetach is a specific contract which will detach all knotes from the
6010 * object prior to posting the final event for that knote. This is done while
6011 * under the object lock. A breadcrumb is left in the knote's next pointer to
6012 * indicate to future calls to f_detach routines that they need not reattempt
6013 * to knote_detach from the object's klist again. This is currently used by
6014 * EVFILTID_SPEC, EVFILTID_TTY, EVFILTID_PTMX
6015 *
6016 */
6017 void
knote(struct klist * list,long hint,bool autodetach)6018 knote(struct klist *list, long hint, bool autodetach)
6019 {
6020 struct knote *kn;
6021 struct knote *tmp_kn;
6022 SLIST_FOREACH_SAFE(kn, list, kn_selnext, tmp_kn) {
6023 /*
6024 * We can modify the knote's next pointer since since we are holding the
6025 * object lock and the list can't be concurrently modified. Anyone
6026 * determining auto-detached-ness of a knote should take the primitive lock
6027 * to synchronize.
6028 *
6029 * Note that we do this here instead of the filter's f_event since we may
6030 * not even post the event if the knote is being dropped.
6031 */
6032 if (autodetach) {
6033 kn->kn_selnext.sle_next = KNOTE_AUTODETACHED;
6034 }
6035 knote_post(kn, hint);
6036 }
6037
6038 /* Blast away the entire klist */
6039 if (autodetach) {
6040 klist_init(list);
6041 }
6042 }
6043
6044 /*
6045 * attach a knote to the specified list. Return true if this is the first entry.
6046 * The list is protected by whatever lock the object it is associated with uses.
6047 */
6048 int
knote_attach(struct klist * list,struct knote * kn)6049 knote_attach(struct klist *list, struct knote *kn)
6050 {
6051 int ret = SLIST_EMPTY(list);
6052 SLIST_INSERT_HEAD(list, kn, kn_selnext);
6053 return ret;
6054 }
6055
6056 /*
6057 * detach a knote from the specified list. Return true if that was the last
6058 * entry. The list is protected by whatever lock the object it is associated
6059 * with uses.
6060 */
6061 int
knote_detach(struct klist * list,struct knote * kn)6062 knote_detach(struct klist *list, struct knote *kn)
6063 {
6064 assert(!KNOTE_IS_AUTODETACHED(kn));
6065
6066 SLIST_REMOVE(list, kn, knote, kn_selnext);
6067 return SLIST_EMPTY(list);
6068 }
6069
6070 /*
6071 * knote_vanish - Indicate that the source has vanished
6072 *
6073 * Used only for vanishing ports - vanishing fds go
6074 * through knote_fdclose()
6075 *
6076 * If the knote has requested EV_VANISHED delivery,
6077 * arrange for that. Otherwise, deliver a NOTE_REVOKE
6078 * event for backward compatibility.
6079 *
6080 * The knote is marked as having vanished. The source's
6081 * reference to the knote is dropped by caller, but the knote's
6082 * source reference is only cleaned up later when the knote is dropped.
6083 *
6084 * Our caller already has the object lock held. Calling
6085 * the detach routine would try to take that lock
6086 * recursively - which likely is not supported.
6087 */
6088 void
knote_vanish(struct klist * list,bool make_active)6089 knote_vanish(struct klist *list, bool make_active)
6090 {
6091 struct knote *kn;
6092 struct knote *kn_next;
6093
6094 SLIST_FOREACH_SAFE(kn, list, kn_selnext, kn_next) {
6095 struct kqueue *kq = knote_get_kq(kn);
6096
6097 kqlock(kq);
6098 if (__probable(kn->kn_status & KN_REQVANISH)) {
6099 /*
6100 * If EV_VANISH supported - prepare to deliver one
6101 */
6102 kn->kn_status |= KN_VANISHED;
6103 } else {
6104 /*
6105 * Handle the legacy way to indicate that the port/portset was
6106 * deallocated or left the current Mach portspace (modern technique
6107 * is with an EV_VANISHED protocol).
6108 *
6109 * Deliver an EV_EOF event for these changes (hopefully it will get
6110 * delivered before the port name recycles to the same generation
6111 * count and someone tries to re-register a kevent for it or the
6112 * events are udata-specific - avoiding a conflict).
6113 */
6114 kn->kn_flags |= EV_EOF | EV_ONESHOT;
6115 }
6116 if (make_active) {
6117 knote_activate(kq, kn, FILTER_ACTIVE);
6118 }
6119 kqunlock(kq);
6120 }
6121 }
6122
6123 /*
6124 * remove all knotes referencing a specified fd
6125 *
6126 * Entered with the proc_fd lock already held.
6127 * It returns the same way, but may drop it temporarily.
6128 */
6129 void
knote_fdclose(struct proc * p,int fd)6130 knote_fdclose(struct proc *p, int fd)
6131 {
6132 struct filedesc *fdt = &p->p_fd;
6133 struct klist *list;
6134 struct knote *kn;
6135 KNOTE_LOCK_CTX(knlc);
6136
6137 restart:
6138 list = &fdt->fd_knlist[fd];
6139 SLIST_FOREACH(kn, list, kn_link) {
6140 struct kqueue *kq = knote_get_kq(kn);
6141
6142 kqlock(kq);
6143
6144 if (kq->kq_p != p) {
6145 panic("%s: proc mismatch (kq->kq_p=%p != p=%p)",
6146 __func__, kq->kq_p, p);
6147 }
6148
6149 /*
6150 * If the knote supports EV_VANISHED delivery,
6151 * transition it to vanished mode (or skip over
6152 * it if already vanished).
6153 */
6154 if (kn->kn_status & KN_VANISHED) {
6155 kqunlock(kq);
6156 continue;
6157 }
6158
6159 proc_fdunlock(p);
6160 if (!knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ON_SUCCESS)) {
6161 /* the knote was dropped by someone, nothing to do */
6162 } else if (kn->kn_status & KN_REQVANISH) {
6163 /*
6164 * Since we have REQVANISH for this knote, we need to notify clients about
6165 * the EV_VANISHED.
6166 *
6167 * But unlike mach ports, we want to do the detach here as well and not
6168 * defer it so that we can release the iocount that is on the knote and
6169 * close the fp.
6170 */
6171 kn->kn_status |= KN_VANISHED;
6172
6173 /*
6174 * There may be a concurrent post happening, make sure to wait for it
6175 * before we detach. knote_wait_for_post() unlocks on kq on exit
6176 */
6177 knote_wait_for_post(kq, kn);
6178
6179 knote_fops(kn)->f_detach(kn);
6180 if (kn->kn_is_fd) {
6181 fp_drop(p, (int)kn->kn_id, kn->kn_fp, 0);
6182 }
6183 kn->kn_filtid = EVFILTID_DETACHED;
6184 kqlock(kq);
6185
6186 knote_activate(kq, kn, FILTER_ACTIVE);
6187 knote_unlock(kq, kn, &knlc, KNOTE_KQ_UNLOCK);
6188 } else {
6189 knote_drop(kq, kn, &knlc);
6190 }
6191
6192 proc_fdlock(p);
6193 goto restart;
6194 }
6195 }
6196
6197 /*
6198 * knote_fdfind - lookup a knote in the fd table for process
6199 *
6200 * If the filter is file-based, lookup based on fd index.
6201 * Otherwise use a hash based on the ident.
6202 *
6203 * Matching is based on kq, filter, and ident. Optionally,
6204 * it may also be based on the udata field in the kevent -
6205 * allowing multiple event registration for the file object
6206 * per kqueue.
6207 *
6208 * fd_knhashlock or fdlock held on entry (and exit)
6209 */
6210 static struct knote *
knote_fdfind(struct kqueue * kq,const struct kevent_internal_s * kev,bool is_fd,struct proc * p)6211 knote_fdfind(struct kqueue *kq,
6212 const struct kevent_internal_s *kev,
6213 bool is_fd,
6214 struct proc *p)
6215 {
6216 struct filedesc *fdp = &p->p_fd;
6217 struct klist *list = NULL;
6218 struct knote *kn = NULL;
6219
6220 /*
6221 * determine where to look for the knote
6222 */
6223 if (is_fd) {
6224 /* fd-based knotes are linked off the fd table */
6225 if (kev->kei_ident < (u_int)fdp->fd_knlistsize) {
6226 list = &fdp->fd_knlist[kev->kei_ident];
6227 }
6228 } else if (fdp->fd_knhashmask != 0) {
6229 /* hash non-fd knotes here too */
6230 list = &fdp->fd_knhash[KN_HASH((u_long)kev->kei_ident, fdp->fd_knhashmask)];
6231 }
6232
6233 /*
6234 * scan the selected list looking for a match
6235 */
6236 if (list != NULL) {
6237 SLIST_FOREACH(kn, list, kn_link) {
6238 if (kq == knote_get_kq(kn) &&
6239 kev->kei_ident == kn->kn_id &&
6240 kev->kei_filter == kn->kn_filter) {
6241 if (kev->kei_flags & EV_UDATA_SPECIFIC) {
6242 if ((kn->kn_flags & EV_UDATA_SPECIFIC) &&
6243 kev->kei_udata == kn->kn_udata) {
6244 break; /* matching udata-specific knote */
6245 }
6246 } else if ((kn->kn_flags & EV_UDATA_SPECIFIC) == 0) {
6247 break; /* matching non-udata-specific knote */
6248 }
6249 }
6250 }
6251 }
6252 return kn;
6253 }
6254
6255 /*
6256 * kq_add_knote- Add knote to the fd table for process
6257 * while checking for duplicates.
6258 *
6259 * All file-based filters associate a list of knotes by file
6260 * descriptor index. All other filters hash the knote by ident.
6261 *
6262 * May have to grow the table of knote lists to cover the
6263 * file descriptor index presented.
6264 *
6265 * fd_knhashlock and fdlock unheld on entry (and exit).
6266 *
6267 * Takes a rwlock boost if inserting the knote is successful.
6268 */
6269 static int
kq_add_knote(struct kqueue * kq,struct knote * kn,struct knote_lock_ctx * knlc,struct proc * p)6270 kq_add_knote(struct kqueue *kq, struct knote *kn, struct knote_lock_ctx *knlc,
6271 struct proc *p)
6272 {
6273 struct filedesc *fdp = &p->p_fd;
6274 struct klist *list = NULL;
6275 int ret = 0;
6276 bool is_fd = kn->kn_is_fd;
6277
6278 if (is_fd) {
6279 proc_fdlock(p);
6280 } else {
6281 knhash_lock(fdp);
6282 }
6283
6284 if (knote_fdfind(kq, &kn->kn_kevent, is_fd, p) != NULL) {
6285 /* found an existing knote: we can't add this one */
6286 ret = ERESTART;
6287 goto out_locked;
6288 }
6289
6290 /* knote was not found: add it now */
6291 if (!is_fd) {
6292 if (fdp->fd_knhashmask == 0) {
6293 u_long size = 0;
6294
6295 list = hashinit(CONFIG_KN_HASHSIZE, M_KQUEUE, &size);
6296 if (list == NULL) {
6297 ret = ENOMEM;
6298 goto out_locked;
6299 }
6300
6301 fdp->fd_knhash = list;
6302 fdp->fd_knhashmask = size;
6303 }
6304
6305 list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];
6306 SLIST_INSERT_HEAD(list, kn, kn_link);
6307 ret = 0;
6308 goto out_locked;
6309 } else {
6310 /* knote is fd based */
6311
6312 if ((u_int)fdp->fd_knlistsize <= kn->kn_id) {
6313 u_int size = 0;
6314
6315 /* Make sure that fd stays below current process's soft limit AND system allowed per-process limits */
6316 if (kn->kn_id >= (uint64_t)proc_limitgetcur_nofile(p)) {
6317 ret = EINVAL;
6318 goto out_locked;
6319 }
6320 /* have to grow the fd_knlist */
6321 size = fdp->fd_knlistsize;
6322 while (size <= kn->kn_id) {
6323 size += KQEXTENT;
6324 }
6325
6326 if (size >= (UINT_MAX / sizeof(struct klist))) {
6327 ret = EINVAL;
6328 goto out_locked;
6329 }
6330
6331 list = kalloc_type(struct klist, size, Z_WAITOK | Z_ZERO);
6332 if (list == NULL) {
6333 ret = ENOMEM;
6334 goto out_locked;
6335 }
6336
6337 bcopy(fdp->fd_knlist, list,
6338 fdp->fd_knlistsize * sizeof(struct klist));
6339 kfree_type(struct klist, fdp->fd_knlistsize, fdp->fd_knlist);
6340 fdp->fd_knlist = list;
6341 fdp->fd_knlistsize = size;
6342 }
6343
6344 list = &fdp->fd_knlist[kn->kn_id];
6345 SLIST_INSERT_HEAD(list, kn, kn_link);
6346 ret = 0;
6347 goto out_locked;
6348 }
6349
6350 out_locked:
6351 if (ret == 0) {
6352 kqlock(kq);
6353 assert((kn->kn_status & KN_LOCKED) == 0);
6354 (void)knote_lock(kq, kn, knlc, KNOTE_KQ_UNLOCK);
6355 kqueue_retain(kq); /* retain a kq ref */
6356 }
6357 if (is_fd) {
6358 proc_fdunlock(p);
6359 } else {
6360 knhash_unlock(fdp);
6361 }
6362
6363 return ret;
6364 }
6365
6366 /*
6367 * kq_remove_knote - remove a knote from the fd table for process
6368 *
6369 * If the filter is file-based, remove based on fd index.
6370 * Otherwise remove from the hash based on the ident.
6371 *
6372 * fd_knhashlock and fdlock unheld on entry (and exit).
6373 */
6374 static void
kq_remove_knote(struct kqueue * kq,struct knote * kn,struct proc * p,struct knote_lock_ctx * knlc)6375 kq_remove_knote(struct kqueue *kq, struct knote *kn, struct proc *p,
6376 struct knote_lock_ctx *knlc)
6377 {
6378 struct filedesc *fdp = &p->p_fd;
6379 struct klist *list = NULL;
6380 uint16_t kq_state;
6381 bool is_fd = kn->kn_is_fd;
6382
6383 if (is_fd) {
6384 proc_fdlock(p);
6385 } else {
6386 knhash_lock(fdp);
6387 }
6388
6389 if (is_fd) {
6390 assert((u_int)fdp->fd_knlistsize > kn->kn_id);
6391 list = &fdp->fd_knlist[kn->kn_id];
6392 } else {
6393 list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];
6394 }
6395 SLIST_REMOVE(list, kn, knote, kn_link);
6396
6397 kqlock(kq);
6398
6399 /* Update the servicer iotier override */
6400 kqueue_update_iotier_override(kq);
6401
6402 kq_state = kq->kq_state;
6403 if (knlc) {
6404 knote_unlock_cancel(kq, kn, knlc);
6405 } else {
6406 kqunlock(kq);
6407 }
6408 if (is_fd) {
6409 proc_fdunlock(p);
6410 } else {
6411 knhash_unlock(fdp);
6412 }
6413
6414 if (kq_state & KQ_DYNAMIC) {
6415 kqworkloop_release((struct kqworkloop *)kq);
6416 }
6417 }
6418
6419 /*
6420 * kq_find_knote_and_kq_lock - lookup a knote in the fd table for process
6421 * and, if the knote is found, acquires the kqlock while holding the fd table lock/spinlock.
6422 *
6423 * fd_knhashlock or fdlock unheld on entry (and exit)
6424 */
6425
6426 static struct knote *
kq_find_knote_and_kq_lock(struct kqueue * kq,struct kevent_qos_s * kev,bool is_fd,struct proc * p)6427 kq_find_knote_and_kq_lock(struct kqueue *kq, struct kevent_qos_s *kev,
6428 bool is_fd, struct proc *p)
6429 {
6430 struct filedesc *fdp = &p->p_fd;
6431 struct knote *kn;
6432
6433 if (is_fd) {
6434 proc_fdlock(p);
6435 } else {
6436 knhash_lock(fdp);
6437 }
6438
6439 /*
6440 * Temporary horrible hack:
6441 * this cast is gross and will go away in a future change.
6442 * It is OK to do because we don't look at xflags/s_fflags,
6443 * and that when we cast down the kev this way,
6444 * the truncated filter field works.
6445 */
6446 kn = knote_fdfind(kq, (struct kevent_internal_s *)kev, is_fd, p);
6447
6448 if (kn) {
6449 kqlock(kq);
6450 assert(knote_get_kq(kn) == kq);
6451 }
6452
6453 if (is_fd) {
6454 proc_fdunlock(p);
6455 } else {
6456 knhash_unlock(fdp);
6457 }
6458
6459 return kn;
6460 }
6461
6462 static struct kqtailq *
knote_get_tailq(kqueue_t kqu,struct knote * kn)6463 knote_get_tailq(kqueue_t kqu, struct knote *kn)
6464 {
6465 kq_index_t qos_index = kn->kn_qos_index;
6466
6467 if (kqu.kq->kq_state & KQ_WORKLOOP) {
6468 assert(qos_index > 0 && qos_index <= KQWL_NBUCKETS);
6469 return &kqu.kqwl->kqwl_queue[qos_index - 1];
6470 } else if (kqu.kq->kq_state & KQ_WORKQ) {
6471 assert(qos_index > 0 && qos_index <= KQWQ_NBUCKETS);
6472 return &kqu.kqwq->kqwq_queue[qos_index - 1];
6473 } else {
6474 assert(qos_index == QOS_INDEX_KQFILE);
6475 return &kqu.kqf->kqf_queue;
6476 }
6477 }
6478
/*
 * knote_enqueue - put an active, eligible knote on its event queue.
 *
 * No-op unless the knote is active and not disabled, suppressed,
 * dropping, or already queued. Wakes up the kqueue's servicer when
 * the queue transitions from empty to non-empty.
 *
 * Called with the kqueue lock held.
 */
static void
knote_enqueue(kqueue_t kqu, struct knote *kn)
{
	kqlock_held(kqu);

	if ((kn->kn_status & KN_ACTIVE) == 0) {
		return;
	}

	if (kn->kn_status & (KN_DISABLED | KN_SUPPRESSED | KN_DROPPING | KN_QUEUED)) {
		return;
	}

	struct kqtailq *queue = knote_get_tailq(kqu, kn);
	/* only an empty->non-empty transition needs a wakeup */
	bool wakeup = TAILQ_EMPTY(queue);

	TAILQ_INSERT_TAIL(queue, kn, kn_tqe);
	kn->kn_status |= KN_QUEUED;
	kqu.kq->kq_count++;

	if (wakeup) {
		if (kqu.kq->kq_state & KQ_WORKLOOP) {
			kqworkloop_wakeup(kqu.kqwl, kn->kn_qos_index);
		} else if (kqu.kq->kq_state & KQ_WORKQ) {
			kqworkq_wakeup(kqu.kqwq, kn->kn_qos_index);
		} else {
			kqfile_wakeup(kqu.kqf, 0, THREAD_AWAKENED);
		}
	}
}
6509
/*
 * knote_dequeue - remove a knote from its event queue, if queued.
 *
 * Maintains kq_count; for plain kqfiles asserts the count/emptiness
 * invariant (workq/workloop counts span several per-QoS queues).
 */
__attribute__((always_inline))
static inline void
knote_dequeue(kqueue_t kqu, struct knote *kn)
{
	if (kn->kn_status & KN_QUEUED) {
		struct kqtailq *queue = knote_get_tailq(kqu, kn);

		// attaching the knote calls knote_reset_priority() without
		// the kqlock which is fine, so we can't call kqlock_held()
		// if we're not queued.
		kqlock_held(kqu);

		TAILQ_REMOVE(queue, kn, kn_tqe);
		kn->kn_status &= ~KN_QUEUED;
		kqu.kq->kq_count--;
		if ((kqu.kq->kq_state & (KQ_WORKQ | KQ_WORKLOOP)) == 0) {
			assert((kqu.kq->kq_count == 0) ==
			    (bool)TAILQ_EMPTY(queue));
		}
	}
}
6531
/*
 * knote_suppress - move a queued knote to the suppression queue.
 *
 * Suppression parks a knote that is being delivered so re-activations
 * are noticed as new wakeups. Called with the kqueue lock held.
 */
static void
knote_suppress(kqueue_t kqu, struct knote *kn)
{
	struct kqtailq *suppressq;

	kqlock_held(kqu);

	assert((kn->kn_status & KN_SUPPRESSED) == 0);
	assert(kn->kn_status & KN_QUEUED);

	knote_dequeue(kqu, kn);
	/* deactivate - so new activations indicate a wakeup */
	kn->kn_status &= ~KN_ACTIVE;
	kn->kn_status |= KN_SUPPRESSED;
	suppressq = kqueue_get_suppressed_queue(kqu, kn);
	TAILQ_INSERT_TAIL(suppressq, kn, kn_tqe);
}
6550
/*
 * knote_unsuppress_noqueue - take a knote off the suppression queue
 * without re-queueing it; the caller is responsible for restoring the
 * "active implies queued" invariant afterwards.
 *
 * Called with the kqueue lock held.
 */
__attribute__((always_inline))
static inline void
knote_unsuppress_noqueue(kqueue_t kqu, struct knote *kn)
{
	struct kqtailq *suppressq;

	kqlock_held(kqu);

	assert(kn->kn_status & KN_SUPPRESSED);

	kn->kn_status &= ~KN_SUPPRESSED;
	suppressq = kqueue_get_suppressed_queue(kqu, kn);
	TAILQ_REMOVE(suppressq, kn, kn_tqe);

	/*
	 * If the knote is no longer active, reset its push,
	 * and resynchronize kn_qos_index with kn_qos_override
	 * for knotes with a real qos.
	 */
	if ((kn->kn_status & KN_ACTIVE) == 0 && knote_has_qos(kn)) {
		kn->kn_qos_override = _pthread_priority_thread_qos_fast(kn->kn_qos);
	}
	kn->kn_qos_index = kn->kn_qos_override;
}
6575
6576 /* called with kqueue lock held */
6577 static void
knote_unsuppress(kqueue_t kqu,struct knote * kn)6578 knote_unsuppress(kqueue_t kqu, struct knote *kn)
6579 {
6580 knote_unsuppress_noqueue(kqu, kn);
6581 knote_enqueue(kqu, kn);
6582 }
6583
6584 __attribute__((always_inline))
6585 static inline void
knote_mark_active(struct knote * kn)6586 knote_mark_active(struct knote *kn)
6587 {
6588 if ((kn->kn_status & KN_ACTIVE) == 0) {
6589 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KNOTE_ACTIVATE),
6590 kn->kn_udata, kn->kn_status | (kn->kn_id << 32),
6591 kn->kn_filtid);
6592 }
6593
6594 kn->kn_status |= KN_ACTIVE;
6595 }
6596
/*
 * knote_activate - mark a knote active and queue it for delivery.
 *
 * result carries FILTER_* bits from the filter routine; a QoS
 * adjustment is applied first because it may dequeue the knote.
 * Called with the kqueue lock held.
 */
static void
knote_activate(kqueue_t kqu, struct knote *kn, int result)
{
	assert(result & FILTER_ACTIVE);
	if (result & FILTER_ADJUST_EVENT_QOS_BIT) {
		// may dequeue the knote
		knote_adjust_qos(kqu.kq, kn, result);
	}
	knote_mark_active(kn);
	knote_enqueue(kqu, kn);
}
6609
6610 /*
6611 * This function applies changes requested by f_attach or f_touch for
6612 * a given filter. It proceeds in a carefully chosen order to help
6613 * every single transition do the minimal amount of work possible.
6614 */
6615 static void
knote_apply_touch(kqueue_t kqu,struct knote * kn,struct kevent_qos_s * kev,int result)6616 knote_apply_touch(kqueue_t kqu, struct knote *kn, struct kevent_qos_s *kev,
6617 int result)
6618 {
6619 if ((kev->flags & EV_ENABLE) && (kn->kn_status & KN_DISABLED)) {
6620 kn->kn_status &= ~KN_DISABLED;
6621
6622 /*
6623 * it is possible for userland to have knotes registered for a given
6624 * workloop `wl_orig` but really handled on another workloop `wl_new`.
6625 *
6626 * In that case, rearming will happen from the servicer thread of
6627 * `wl_new` which if `wl_orig` is no longer being serviced, would cause
6628 * this knote to stay suppressed forever if we only relied on
6629 * kqworkloop_acknowledge_events to be called by `wl_orig`.
6630 *
6631 * However if we see the KQ_PROCESSING bit on `wl_orig` set, we can't
6632 * unsuppress because that would mess with the processing phase of
6633 * `wl_orig`, however it also means kqworkloop_acknowledge_events()
6634 * will be called.
6635 */
6636 if (__improbable(kn->kn_status & KN_SUPPRESSED)) {
6637 if ((kqu.kq->kq_state & KQ_PROCESSING) == 0) {
6638 knote_unsuppress_noqueue(kqu, kn);
6639 }
6640 }
6641 }
6642
6643 if (result & FILTER_ADJUST_EVENT_IOTIER_BIT) {
6644 kqueue_update_iotier_override(kqu);
6645 }
6646
6647 if ((result & FILTER_UPDATE_REQ_QOS) && kev->qos && kev->qos != kn->kn_qos) {
6648 // may dequeue the knote
6649 knote_reset_priority(kqu, kn, kev->qos);
6650 }
6651
6652 /*
6653 * When we unsuppress above, or because of knote_reset_priority(),
6654 * the knote may have been dequeued, we need to restore the invariant
6655 * that if the knote is active it needs to be queued now that
6656 * we're done applying changes.
6657 */
6658 if (result & FILTER_ACTIVE) {
6659 knote_activate(kqu, kn, result);
6660 } else {
6661 knote_enqueue(kqu, kn);
6662 }
6663
6664 if ((result & FILTER_THREADREQ_NODEFEER) &&
6665 act_clear_astkevent(current_thread(), AST_KEVENT_REDRIVE_THREADREQ)) {
6666 workq_kern_threadreq_redrive(kqu.kq->kq_p, WORKQ_THREADREQ_NONE);
6667 }
6668 }
6669
6670 /*
6671 * knote_drop - disconnect and drop the knote
6672 *
6673 * Called with the kqueue locked, returns with the kqueue unlocked.
6674 *
6675 * If a knote locking context is passed, it is canceled.
6676 *
6677 * The knote may have already been detached from
6678 * (or not yet attached to) its source object.
6679 */
6680 static void
knote_drop(struct kqueue * kq,struct knote * kn,struct knote_lock_ctx * knlc)6681 knote_drop(struct kqueue *kq, struct knote *kn, struct knote_lock_ctx *knlc)
6682 {
6683 struct proc *p = kq->kq_p;
6684
6685 kqlock_held(kq);
6686
6687 assert((kn->kn_status & KN_DROPPING) == 0);
6688 if (knlc == NULL) {
6689 assert((kn->kn_status & KN_LOCKED) == 0);
6690 }
6691 kn->kn_status |= KN_DROPPING;
6692
6693 if (kn->kn_status & KN_SUPPRESSED) {
6694 knote_unsuppress_noqueue(kq, kn);
6695 } else {
6696 knote_dequeue(kq, kn);
6697 }
6698 knote_wait_for_post(kq, kn);
6699
6700 /* Even if we are autodetached, the filter may need to do cleanups of any
6701 * stuff stashed on the knote so always make the call and let each filter
6702 * handle the possibility of autodetached-ness */
6703 knote_fops(kn)->f_detach(kn);
6704
6705 /* kq may be freed when kq_remove_knote() returns */
6706 kq_remove_knote(kq, kn, p, knlc);
6707 if (kn->kn_is_fd && ((kn->kn_status & KN_VANISHED) == 0)) {
6708 fp_drop(p, (int)kn->kn_id, kn->kn_fp, 0);
6709 }
6710
6711 knote_free(kn);
6712 }
6713
/*
 * knote_init - one-time subsystem initialization, run via SYSINIT.
 */
void
knote_init(void)
{
#if CONFIG_MEMORYSTATUS
	/* Initialize the memorystatus list lock */
	memorystatus_kevent_init(&kq_lck_grp, LCK_ATTR_NULL);
#endif
}
SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL);
6723
6724 const struct filterops *
knote_fops(struct knote * kn)6725 knote_fops(struct knote *kn)
6726 {
6727 return sysfilt_ops[kn->kn_filtid];
6728 }
6729
6730 static struct knote *
knote_alloc(void)6731 knote_alloc(void)
6732 {
6733 return zalloc_flags(knote_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);
6734 }
6735
/*
 * knote_free - return a knote to its zone.
 * The knote must not be locked or in the middle of a post.
 */
static void
knote_free(struct knote *kn)
{
	assert((kn->kn_status & (KN_LOCKED | KN_POSTING)) == 0);
	zfree(knote_zone, kn);
}
6742
6743 #pragma mark - syscalls: kevent, kevent64, kevent_qos, kevent_id
6744
6745 kevent_ctx_t
kevent_get_context(thread_t thread)6746 kevent_get_context(thread_t thread)
6747 {
6748 uthread_t ut = get_bsdthread_info(thread);
6749 return &ut->uu_save.uus_kevent;
6750 }
6751
6752 static inline bool
kevent_args_requesting_events(unsigned int flags,int nevents)6753 kevent_args_requesting_events(unsigned int flags, int nevents)
6754 {
6755 return !(flags & KEVENT_FLAG_ERROR_EVENTS) && nevents > 0;
6756 }
6757
6758 static inline int
kevent_adjust_flags_for_proc(proc_t p,int flags)6759 kevent_adjust_flags_for_proc(proc_t p, int flags)
6760 {
6761 __builtin_assume(p);
6762 return flags | (IS_64BIT_PROCESS(p) ? KEVENT_FLAG_PROC64 : 0);
6763 }
6764
6765 /*!
6766 * @function kevent_get_kqfile
6767 *
6768 * @brief
6769 * Lookup a kqfile by fd.
6770 *
6771 * @discussion
6772 * Callers: kevent, kevent64, kevent_qos
6773 *
6774 * This is not assumed to be a fastpath (kqfile interfaces are legacy)
6775 */
6776 OS_NOINLINE
6777 static int
kevent_get_kqfile(struct proc * p,int fd,int flags,struct fileproc ** fpp,struct kqueue ** kqp)6778 kevent_get_kqfile(struct proc *p, int fd, int flags,
6779 struct fileproc **fpp, struct kqueue **kqp)
6780 {
6781 int error = 0;
6782 struct kqueue *kq;
6783
6784 error = fp_get_ftype(p, fd, DTYPE_KQUEUE, EBADF, fpp);
6785 if (__improbable(error)) {
6786 return error;
6787 }
6788 kq = (struct kqueue *)fp_get_data((*fpp));
6789
6790 uint16_t kq_state = os_atomic_load(&kq->kq_state, relaxed);
6791 if (__improbable((kq_state & (KQ_KEV32 | KQ_KEV64 | KQ_KEV_QOS)) == 0)) {
6792 kqlock(kq);
6793 kq_state = kq->kq_state;
6794 if (!(kq_state & (KQ_KEV32 | KQ_KEV64 | KQ_KEV_QOS))) {
6795 if (flags & KEVENT_FLAG_LEGACY32) {
6796 kq_state |= KQ_KEV32;
6797 } else if (flags & KEVENT_FLAG_LEGACY64) {
6798 kq_state |= KQ_KEV64;
6799 } else {
6800 kq_state |= KQ_KEV_QOS;
6801 }
6802 kq->kq_state = kq_state;
6803 }
6804 kqunlock(kq);
6805 }
6806
6807 /*
6808 * kqfiles can't be used through the legacy kevent()
6809 * and other interfaces at the same time.
6810 */
6811 if (__improbable((bool)(flags & KEVENT_FLAG_LEGACY32) !=
6812 (bool)(kq_state & KQ_KEV32))) {
6813 fp_drop(p, fd, *fpp, 0);
6814 return EINVAL;
6815 }
6816
6817 *kqp = kq;
6818 return 0;
6819 }
6820
6821 /*!
6822 * @function kevent_get_kqwq
6823 *
6824 * @brief
6825 * Lookup or create the process kqwq (faspath).
6826 *
6827 * @discussion
6828 * Callers: kevent64, kevent_qos
6829 */
6830 OS_ALWAYS_INLINE
6831 static int
kevent_get_kqwq(proc_t p,int flags,int nevents,struct kqueue ** kqp)6832 kevent_get_kqwq(proc_t p, int flags, int nevents, struct kqueue **kqp)
6833 {
6834 struct kqworkq *kqwq = p->p_fd.fd_wqkqueue;
6835
6836 if (__improbable(kevent_args_requesting_events(flags, nevents))) {
6837 return EINVAL;
6838 }
6839 if (__improbable(kqwq == NULL)) {
6840 kqwq = kqworkq_alloc(p, flags);
6841 if (__improbable(kqwq == NULL)) {
6842 return ENOMEM;
6843 }
6844 }
6845
6846 *kqp = &kqwq->kqwq_kqueue;
6847 return 0;
6848 }
6849
6850 #pragma mark kevent copyio
6851
6852 /*!
6853 * @function kevent_get_data_size
6854 *
6855 * @brief
6856 * Copies in the extra data size from user-space.
6857 */
6858 static int
kevent_get_data_size(int flags,user_addr_t data_avail,user_addr_t data_out,kevent_ctx_t kectx)6859 kevent_get_data_size(int flags, user_addr_t data_avail, user_addr_t data_out,
6860 kevent_ctx_t kectx)
6861 {
6862 if (!data_avail || !data_out) {
6863 kectx->kec_data_size = 0;
6864 kectx->kec_data_resid = 0;
6865 } else if (flags & KEVENT_FLAG_PROC64) {
6866 user64_size_t usize = 0;
6867 int error = copyin((user_addr_t)data_avail, &usize, sizeof(usize));
6868 if (__improbable(error)) {
6869 return error;
6870 }
6871 kectx->kec_data_resid = kectx->kec_data_size = (user_size_t)usize;
6872 } else {
6873 user32_size_t usize = 0;
6874 int error = copyin((user_addr_t)data_avail, &usize, sizeof(usize));
6875 if (__improbable(error)) {
6876 return error;
6877 }
6878 kectx->kec_data_avail = data_avail;
6879 kectx->kec_data_resid = kectx->kec_data_size = (user_size_t)usize;
6880 }
6881 kectx->kec_data_out = data_out;
6882 kectx->kec_data_avail = data_avail;
6883 return 0;
6884 }
6885
6886 /*!
6887 * @function kevent_put_data_size
6888 *
6889 * @brief
6890 * Copies out the residual data size to user-space if any has been used.
6891 */
6892 static int
kevent_put_data_size(unsigned int flags,kevent_ctx_t kectx)6893 kevent_put_data_size(unsigned int flags, kevent_ctx_t kectx)
6894 {
6895 if (kectx->kec_data_resid == kectx->kec_data_size) {
6896 return 0;
6897 }
6898 if (flags & KEVENT_FLAG_KERNEL) {
6899 *(user_size_t *)(uintptr_t)kectx->kec_data_avail = kectx->kec_data_resid;
6900 return 0;
6901 }
6902 if (flags & KEVENT_FLAG_PROC64) {
6903 user64_size_t usize = (user64_size_t)kectx->kec_data_resid;
6904 return copyout(&usize, (user_addr_t)kectx->kec_data_avail, sizeof(usize));
6905 } else {
6906 user32_size_t usize = (user32_size_t)kectx->kec_data_resid;
6907 return copyout(&usize, (user_addr_t)kectx->kec_data_avail, sizeof(usize));
6908 }
6909 }
6910
6911 /*!
6912 * @function kevent_legacy_copyin
6913 *
6914 * @brief
6915 * Handles the copyin of a kevent/kevent64 event.
6916 */
6917 static int
kevent_legacy_copyin(user_addr_t * addrp,struct kevent_qos_s * kevp,unsigned int flags)6918 kevent_legacy_copyin(user_addr_t *addrp, struct kevent_qos_s *kevp, unsigned int flags)
6919 {
6920 int error;
6921
6922 assert((flags & (KEVENT_FLAG_LEGACY32 | KEVENT_FLAG_LEGACY64)) != 0);
6923
6924 if (flags & KEVENT_FLAG_LEGACY64) {
6925 struct kevent64_s kev64;
6926
6927 error = copyin(*addrp, (caddr_t)&kev64, sizeof(kev64));
6928 if (__improbable(error)) {
6929 return error;
6930 }
6931 *addrp += sizeof(kev64);
6932 *kevp = (struct kevent_qos_s){
6933 .ident = kev64.ident,
6934 .filter = kev64.filter,
6935 /* Make sure user doesn't pass in any system flags */
6936 .flags = kev64.flags & ~EV_SYSFLAGS,
6937 .udata = kev64.udata,
6938 .fflags = kev64.fflags,
6939 .data = kev64.data,
6940 .ext[0] = kev64.ext[0],
6941 .ext[1] = kev64.ext[1],
6942 };
6943 } else if (flags & KEVENT_FLAG_PROC64) {
6944 struct user64_kevent kev64;
6945
6946 error = copyin(*addrp, (caddr_t)&kev64, sizeof(kev64));
6947 if (__improbable(error)) {
6948 return error;
6949 }
6950 *addrp += sizeof(kev64);
6951 *kevp = (struct kevent_qos_s){
6952 .ident = kev64.ident,
6953 .filter = kev64.filter,
6954 /* Make sure user doesn't pass in any system flags */
6955 .flags = kev64.flags & ~EV_SYSFLAGS,
6956 .udata = kev64.udata,
6957 .fflags = kev64.fflags,
6958 .data = kev64.data,
6959 };
6960 } else {
6961 struct user32_kevent kev32;
6962
6963 error = copyin(*addrp, (caddr_t)&kev32, sizeof(kev32));
6964 if (__improbable(error)) {
6965 return error;
6966 }
6967 *addrp += sizeof(kev32);
6968 *kevp = (struct kevent_qos_s){
6969 .ident = (uintptr_t)kev32.ident,
6970 .filter = kev32.filter,
6971 /* Make sure user doesn't pass in any system flags */
6972 .flags = kev32.flags & ~EV_SYSFLAGS,
6973 .udata = CAST_USER_ADDR_T(kev32.udata),
6974 .fflags = kev32.fflags,
6975 .data = (intptr_t)kev32.data,
6976 };
6977 }
6978
6979 return 0;
6980 }
6981
6982 /*!
6983 * @function kevent_modern_copyin
6984 *
6985 * @brief
6986 * Handles the copyin of a kevent_qos/kevent_id event.
6987 */
6988 static int
kevent_modern_copyin(user_addr_t * addrp,struct kevent_qos_s * kevp)6989 kevent_modern_copyin(user_addr_t *addrp, struct kevent_qos_s *kevp)
6990 {
6991 int error = copyin(*addrp, (caddr_t)kevp, sizeof(struct kevent_qos_s));
6992 if (__probable(!error)) {
6993 /* Make sure user doesn't pass in any system flags */
6994 *addrp += sizeof(struct kevent_qos_s);
6995 kevp->flags &= ~EV_SYSFLAGS;
6996 }
6997 return error;
6998 }
6999
7000 /*!
7001 * @function kevent_legacy_copyout
7002 *
7003 * @brief
7004 * Handles the copyout of a kevent/kevent64 event.
7005 */
7006 static int
kevent_legacy_copyout(struct kevent_qos_s * kevp,user_addr_t * addrp,unsigned int flags)7007 kevent_legacy_copyout(struct kevent_qos_s *kevp, user_addr_t *addrp, unsigned int flags)
7008 {
7009 int advance;
7010 int error;
7011
7012 assert((flags & (KEVENT_FLAG_LEGACY32 | KEVENT_FLAG_LEGACY64)) != 0);
7013
7014 /*
7015 * fully initialize the differnt output event structure
7016 * types from the internal kevent (and some universal
7017 * defaults for fields not represented in the internal
7018 * form).
7019 *
7020 * Note: these structures have no padding hence the C99
7021 * initializers below do not leak kernel info.
7022 */
7023 if (flags & KEVENT_FLAG_LEGACY64) {
7024 struct kevent64_s kev64 = {
7025 .ident = kevp->ident,
7026 .filter = kevp->filter,
7027 .flags = kevp->flags,
7028 .fflags = kevp->fflags,
7029 .data = (int64_t)kevp->data,
7030 .udata = kevp->udata,
7031 .ext[0] = kevp->ext[0],
7032 .ext[1] = kevp->ext[1],
7033 };
7034 advance = sizeof(struct kevent64_s);
7035 error = copyout((caddr_t)&kev64, *addrp, advance);
7036 } else if (flags & KEVENT_FLAG_PROC64) {
7037 /*
7038 * deal with the special case of a user-supplied
7039 * value of (uintptr_t)-1.
7040 */
7041 uint64_t ident = (kevp->ident == (uintptr_t)-1) ?
7042 (uint64_t)-1LL : (uint64_t)kevp->ident;
7043 struct user64_kevent kev64 = {
7044 .ident = ident,
7045 .filter = kevp->filter,
7046 .flags = kevp->flags,
7047 .fflags = kevp->fflags,
7048 .data = (int64_t) kevp->data,
7049 .udata = (user_addr_t) kevp->udata,
7050 };
7051 advance = sizeof(kev64);
7052 error = copyout((caddr_t)&kev64, *addrp, advance);
7053 } else {
7054 struct user32_kevent kev32 = {
7055 .ident = (uint32_t)kevp->ident,
7056 .filter = kevp->filter,
7057 .flags = kevp->flags,
7058 .fflags = kevp->fflags,
7059 .data = (int32_t)kevp->data,
7060 .udata = (uint32_t)kevp->udata,
7061 };
7062 advance = sizeof(kev32);
7063 error = copyout((caddr_t)&kev32, *addrp, advance);
7064 }
7065 if (__probable(!error)) {
7066 *addrp += advance;
7067 }
7068 return error;
7069 }
7070
7071 /*!
7072 * @function kevent_modern_copyout
7073 *
7074 * @brief
7075 * Handles the copyout of a kevent_qos/kevent_id event.
7076 */
7077 OS_ALWAYS_INLINE
7078 static inline int
kevent_modern_copyout(struct kevent_qos_s * kevp,user_addr_t * addrp)7079 kevent_modern_copyout(struct kevent_qos_s *kevp, user_addr_t *addrp)
7080 {
7081 int error = copyout((caddr_t)kevp, *addrp, sizeof(struct kevent_qos_s));
7082 if (__probable(!error)) {
7083 *addrp += sizeof(struct kevent_qos_s);
7084 }
7085 return error;
7086 }
7087
7088 #pragma mark kevent core implementation
7089
7090 /*!
7091 * @function kevent_callback_inline
7092 *
7093 * @brief
7094 * Callback for each individual event
7095 *
7096 * @discussion
7097 * This is meant to be inlined in kevent_modern_callback and
7098 * kevent_legacy_callback.
7099 */
7100 OS_ALWAYS_INLINE
7101 static inline int
kevent_callback_inline(struct kevent_qos_s * kevp,kevent_ctx_t kectx,bool legacy)7102 kevent_callback_inline(struct kevent_qos_s *kevp, kevent_ctx_t kectx, bool legacy)
7103 {
7104 int error;
7105
7106 assert(kectx->kec_process_noutputs < kectx->kec_process_nevents);
7107
7108 /*
7109 * Copy out the appropriate amount of event data for this user.
7110 */
7111 if (legacy) {
7112 error = kevent_legacy_copyout(kevp, &kectx->kec_process_eventlist,
7113 kectx->kec_process_flags);
7114 } else {
7115 error = kevent_modern_copyout(kevp, &kectx->kec_process_eventlist);
7116 }
7117
7118 /*
7119 * If there isn't space for additional events, return
7120 * a harmless error to stop the processing here
7121 */
7122 if (error == 0 && ++kectx->kec_process_noutputs == kectx->kec_process_nevents) {
7123 error = EWOULDBLOCK;
7124 }
7125 return error;
7126 }
7127
7128 /*!
7129 * @function kevent_modern_callback
7130 *
7131 * @brief
7132 * Callback for each individual modern event.
7133 *
7134 * @discussion
7135 * This callback handles kevent_qos/kevent_id events.
7136 */
7137 static int
kevent_modern_callback(struct kevent_qos_s * kevp,kevent_ctx_t kectx)7138 kevent_modern_callback(struct kevent_qos_s *kevp, kevent_ctx_t kectx)
7139 {
7140 return kevent_callback_inline(kevp, kectx, /*legacy*/ false);
7141 }
7142
7143 /*!
7144 * @function kevent_legacy_callback
7145 *
7146 * @brief
7147 * Callback for each individual legacy event.
7148 *
7149 * @discussion
7150 * This callback handles kevent/kevent64 events.
7151 */
7152 static int
kevent_legacy_callback(struct kevent_qos_s * kevp,kevent_ctx_t kectx)7153 kevent_legacy_callback(struct kevent_qos_s *kevp, kevent_ctx_t kectx)
7154 {
7155 return kevent_callback_inline(kevp, kectx, /*legacy*/ true);
7156 }
7157
/*!
 * @function kevent_cleanup
 *
 * @brief
 * Handles the cleanup returning from a kevent call.
 *
 * @discussion
 * kevent entry points will take a reference on workloops,
 * and a usecount on the fileglob of kqfiles.
 *
 * This function undoes this on the exit paths of kevents.
 *
 * @returns
 * The error to return to userspace.
 */
static int
kevent_cleanup(kqueue_t kqu, int flags, int error, kevent_ctx_t kectx)
{
	// poll should not call any codepath leading to this
	assert((flags & KEVENT_FLAG_POLL) == 0);

	if (flags & KEVENT_FLAG_WORKLOOP) {
		/* drop the workloop reference the entry point took */
		kqworkloop_release(kqu.kqwl);
	} else if (flags & KEVENT_FLAG_WORKQ) {
		/* nothing held */
	} else {
		/* kqfile case: drop the fileproc usecount */
		fp_drop(kqu.kqf->kqf_p, kectx->kec_fd, kectx->kec_fp, 0);
	}

	/* don't restart after signals... */
	if (error == ERESTART) {
		error = EINTR;
	} else if (error == 0) {
		/* don't abandon other output just because of residual copyout failures */
		(void)kevent_put_data_size(flags, kectx);
	}

	if (flags & KEVENT_FLAG_PARKING) {
		/*
		 * Undo the base-priority freeze for threads still bound to a
		 * kqrequest when returning from a parking call.
		 */
		thread_t th = current_thread();
		struct uthread *uth = get_bsdthread_info(th);
		if (uth->uu_kqr_bound) {
			thread_unfreeze_base_pri(th);
		}
	}
	return error;
}
7204
/*!
 * @function kqueue_process
 *
 * @brief
 * Process the triggered events in a kqueue.
 *
 * @discussion
 * Walk the queued knotes and validate that they are really still triggered
 * events by calling the filter routines (if necessary).
 *
 * For each event that is still considered triggered, invoke the callback
 * routine provided.
 *
 * caller holds a reference on the kqueue.
 * kqueue locked on entry and exit - but may be dropped
 * kqueue list locked (held for duration of call)
 *
 * This is only called by kqueue_scan() so that the compiler can inline it.
 *
 * @returns
 * - 0: no event was returned, no other error occured
 * - EBADF: the kqueue is being destroyed (KQ_DRAIN is set)
 * - EWOULDBLOCK: (not an error) events have been found and we should return
 * - EFAULT: copyout failed
 * - filter specific errors
 */
static int
kqueue_process(kqueue_t kqu, int flags, kevent_ctx_t kectx,
    kevent_callback_t callback)
{
	workq_threadreq_t kqr = current_uthread()->uu_kqr_bound;
	struct knote *kn;
	int error = 0, rc = 0;
	struct kqtailq *base_queue, *queue;
	/* cache the kqueue flavor: workq, workloop, or plain kqfile (neither bit) */
	uint16_t kq_type = (kqu.kq->kq_state & (KQ_WORKQ | KQ_WORKLOOP));

	if (kq_type & KQ_WORKQ) {
		rc = kqworkq_begin_processing(kqu.kqwq, kqr, flags);
	} else if (kq_type & KQ_WORKLOOP) {
		rc = kqworkloop_begin_processing(kqu.kqwl, flags);
	} else {
kqfile_retry:
		rc = kqfile_begin_processing(kqu.kqf);
		if (rc == EBADF) {
			/* the kqueue is being destroyed */
			return EBADF;
		}
	}

	if (rc == -1) {
		/* Nothing to process */
		return 0;
	}

	/*
	 * loop through the enqueued knotes associated with this request,
	 * processing each one. Each request may have several queues
	 * of knotes to process (depending on the type of kqueue) so we
	 * have to loop through all the queues as long as we have additional
	 * space.
	 */

process_again:
	if (kq_type & KQ_WORKQ) {
		/* workq requests only service the bucket for their QoS */
		base_queue = queue = &kqu.kqwq->kqwq_queue[kqr->tr_kq_qos_index - 1];
	} else if (kq_type & KQ_WORKLOOP) {
		/* workloops drain every bucket, from the last one down to bucket 0 */
		base_queue = &kqu.kqwl->kqwl_queue[0];
		queue = &kqu.kqwl->kqwl_queue[KQWL_NBUCKETS - 1];
	} else {
		base_queue = queue = &kqu.kqf->kqf_queue;
	}

	do {
		while ((kn = TAILQ_FIRST(queue)) != NULL) {
			error = knote_process(kn, kectx, callback);
			if (error == EJUSTRETURN) {
				/* the knote was not delivered, but is not an error either */
				error = 0;
			} else if (__improbable(error)) {
				/* error is EWOULDBLOCK when the out event array is full */
				goto stop_processing;
			}
		}
	} while (queue-- > base_queue);

	if (kectx->kec_process_noutputs) {
		/* callers will transform this into no error */
		error = EWOULDBLOCK;
	}

stop_processing:
	/*
	 * If KEVENT_FLAG_PARKING is set, and no kevents have been returned,
	 * we want to unbind the kqrequest from the thread.
	 *
	 * However, because the kq locks are dropped several times during process,
	 * new knotes may have fired again, in which case, we want to fail the end
	 * processing and process again, until it converges.
	 *
	 * If we have an error or returned events, end processing never fails.
	 */
	if (error) {
		flags &= ~KEVENT_FLAG_PARKING;
	}
	if (kq_type & KQ_WORKQ) {
		rc = kqworkq_end_processing(kqu.kqwq, kqr, flags);
	} else if (kq_type & KQ_WORKLOOP) {
		rc = kqworkloop_end_processing(kqu.kqwl, KQ_PROCESSING, flags);
	} else {
		rc = kqfile_end_processing(kqu.kqf);
	}

	if (__probable(error)) {
		return error;
	}

	if (__probable(rc >= 0)) {
		assert(rc == 0 || rc == EBADF);
		return rc;
	}

	/* rc < 0: end processing failed, new knotes fired — converge by retrying */
	if (kq_type & (KQ_WORKQ | KQ_WORKLOOP)) {
		assert(flags & KEVENT_FLAG_PARKING);
		goto process_again;
	} else {
		goto kqfile_retry;
	}
}
7331
/*!
 * @function kqueue_scan_continue
 *
 * @brief
 * The continuation used by kqueue_scan for kevent entry points.
 *
 * @discussion
 * Assumes we inherit a use/ref count on the kq or its fileglob.
 *
 * This is called by kqueue_scan if neither KEVENT_FLAG_POLL nor
 * KEVENT_FLAG_KERNEL was set, and the caller had to wait.
 *
 * Never returns: it finishes the syscall via unix_syscall_return().
 */
OS_NORETURN OS_NOINLINE
static void
kqueue_scan_continue(void *data, wait_result_t wait_result)
{
	uthread_t ut = current_uthread();
	/* the scan context was stashed in the uthread before blocking */
	kevent_ctx_t kectx = &ut->uu_save.uus_kevent;
	int error = 0, flags = kectx->kec_process_flags;
	struct kqueue *kq = data;

	/*
	 * only kevent variants call in here, so we know the callback is
	 * kevent_legacy_callback or kevent_modern_callback.
	 */
	assert((flags & (KEVENT_FLAG_POLL | KEVENT_FLAG_KERNEL)) == 0);

	switch (wait_result) {
	case THREAD_AWAKENED:
		/* events may be available: rescan with the matching callback */
		if (__improbable(flags & (KEVENT_FLAG_LEGACY32 | KEVENT_FLAG_LEGACY64))) {
			error = kqueue_scan(kq, flags, kectx, kevent_legacy_callback);
		} else {
			error = kqueue_scan(kq, flags, kectx, kevent_modern_callback);
		}
		break;
	case THREAD_TIMED_OUT:
		/* deadline expired: not an error, return zero events */
		error = 0;
		break;
	case THREAD_INTERRUPTED:
		error = EINTR;
		break;
	case THREAD_RESTART:
		/* the kqueue was closed out from under us */
		error = EBADF;
		break;
	default:
		panic("%s: - invalid wait_result (%d)", __func__, wait_result);
	}


	error = kevent_cleanup(kq, flags, error, kectx);
	/* publish the number of events delivered as the syscall return value */
	*(int32_t *)&ut->uu_rval = kectx->kec_process_noutputs;
	unix_syscall_return(error);
}
7385
/*!
 * @function kqueue_scan
 *
 * @brief
 * Scan and wait for events in a kqueue (used by poll & kevent).
 *
 * @discussion
 * Process the triggered events in a kqueue.
 *
 * If there are no events triggered arrange to wait for them:
 * - unless KEVENT_FLAG_IMMEDIATE is set in kectx->kec_process_flags
 * - possibly until kectx->kec_deadline expires
 *
 * When it waits, and that neither KEVENT_FLAG_POLL nor KEVENT_FLAG_KERNEL
 * are set, then it will wait in the kqueue_scan_continue continuation.
 *
 * poll() will block in place, and KEVENT_FLAG_KERNEL calls
 * all pass KEVENT_FLAG_IMMEDIATE and will not wait.
 *
 * @param kqu
 * The kqueue being scanned.
 *
 * @param flags
 * The KEVENT_FLAG_* flags for this call.
 *
 * @param kectx
 * The context used for this scan.
 * The uthread_t::uu_save.uus_kevent storage is used for this purpose.
 *
 * @param callback
 * The callback to be called on events successfully processed.
 * (Either kevent_legacy_callback, kevent_modern_callback or poll_callback)
 */
int
kqueue_scan(kqueue_t kqu, int flags, kevent_ctx_t kectx,
    kevent_callback_t callback)
{
	int error;

	for (;;) {
		kqlock(kqu);
		error = kqueue_process(kqu, flags, kectx, callback);

		/*
		 * If we got an error, events returned (EWOULDBLOCK)
		 * or blocking was disallowed (KEVENT_FLAG_IMMEDIATE),
		 * just return.
		 */
		if (__probable(error || (flags & KEVENT_FLAG_IMMEDIATE))) {
			kqunlock(kqu);
			/* EWOULDBLOCK means events were delivered: not an error */
			return error == EWOULDBLOCK ? 0 : error;
		}

		/* only plain kqfiles ever block here */
		assert((kqu.kq->kq_state & (KQ_WORKQ | KQ_WORKLOOP)) == 0);

		/* arm the wakeup before dropping the lock to avoid missed events */
		kqu.kqf->kqf_state |= KQ_SLEEP;
		assert_wait_deadline(&kqu.kqf->kqf_count, THREAD_ABORTSAFE,
		    kectx->kec_deadline);
		kqunlock(kqu);

		if (__probable((flags & (KEVENT_FLAG_POLL | KEVENT_FLAG_KERNEL)) == 0)) {
			/* kevent variants give up the stack and resume in the continuation */
			thread_block_parameter(kqueue_scan_continue, kqu.kqf);
			__builtin_unreachable();
		}

		/* poll/kernel callers block in place and loop around */
		wait_result_t wr = thread_block(THREAD_CONTINUE_NULL);
		switch (wr) {
		case THREAD_AWAKENED:
			break;
		case THREAD_TIMED_OUT:
			return 0;
		case THREAD_INTERRUPTED:
			return EINTR;
		case THREAD_RESTART:
			return EBADF;
		default:
			panic("%s: - bad wait_result (%d)", __func__, wr);
		}
	}
}
7466
/*!
 * @function kevent_internal
 *
 * @brief
 * Common kevent code.
 *
 * @discussion
 * Needs to be inlined to specialize for legacy or modern and
 * eliminate dead code.
 *
 * This is the core logic of kevent entry points, that will:
 * - register kevents
 * - optionally scan the kqueue for events
 *
 * The caller is giving kevent_internal a reference on the kqueue
 * or its fileproc that needs to be cleaned up by kevent_cleanup().
 */
OS_ALWAYS_INLINE
static inline int
kevent_internal(kqueue_t kqu,
    user_addr_t changelist, int nchanges,
    user_addr_t ueventlist, int nevents,
    int flags, kevent_ctx_t kectx, int32_t *retval,
    bool legacy)
{
	int error = 0, noutputs = 0, register_rc;

	/* only bound threads can receive events on workloops */
	if (!legacy && (flags & KEVENT_FLAG_WORKLOOP)) {
#if CONFIG_WORKLOOP_DEBUG
		UU_KEVENT_HISTORY_WRITE_ENTRY(current_uthread(), {
			.uu_kqid = kqu.kqwl->kqwl_dynamicid,
			.uu_kq = error ? NULL : kqu.kq,
			.uu_error = error,
			.uu_nchanges = nchanges,
			.uu_nevents = nevents,
			.uu_flags = flags,
		});
#endif // CONFIG_WORKLOOP_DEBUG

		if (flags & KEVENT_FLAG_KERNEL) {
			/* see kevent_workq_internal */
			error = copyout(&kqu.kqwl->kqwl_dynamicid,
			    ueventlist - sizeof(kqueue_id_t), sizeof(kqueue_id_t));
			kectx->kec_data_resid -= sizeof(kqueue_id_t);
			if (__improbable(error)) {
				goto out;
			}
		}

		if (kevent_args_requesting_events(flags, nevents)) {
			/*
			 * Disable the R2K notification while doing a register, if the
			 * caller wants events too, we don't want the AST to be set if we
			 * will process these events soon.
			 */
			kqlock(kqu);
			kqu.kq->kq_state &= ~KQ_R2K_ARMED;
			kqunlock(kqu);
			flags |= KEVENT_FLAG_NEEDS_END_PROCESSING;
		}
	}

	/* register all the change requests the user provided... */
	while (nchanges > 0 && error == 0) {
		struct kevent_qos_s kev;
		struct knote *kn = NULL;

		if (legacy) {
			error = kevent_legacy_copyin(&changelist, &kev, flags);
		} else {
			error = kevent_modern_copyin(&changelist, &kev);
		}
		if (error) {
			break;
		}

		register_rc = kevent_register(kqu.kq, &kev, &kn);
		if (__improbable(!legacy && (register_rc & FILTER_REGISTER_WAIT))) {
			/* the filter asked to park the thread until the event fires */
			thread_t thread = current_thread();

			kqlock_held(kqu);

			if (act_clear_astkevent(thread, AST_KEVENT_REDRIVE_THREADREQ)) {
				workq_kern_threadreq_redrive(kqu.kq->kq_p, WORKQ_THREADREQ_NONE);
			}

			// f_post_register_wait is meant to call a continuation and not to
			// return, which is why we don't support FILTER_REGISTER_WAIT if
			// KEVENT_FLAG_ERROR_EVENTS is not passed, or if the event that
			// waits isn't the last.
			//
			// It is implementable, but not used by any userspace code at the
			// moment, so for now return ENOTSUP if someone tries to do it.
			if (nchanges == 1 && noutputs < nevents &&
			    (flags & KEVENT_FLAG_KERNEL) == 0 &&
			    (flags & KEVENT_FLAG_PARKING) == 0 &&
			    (flags & KEVENT_FLAG_ERROR_EVENTS) &&
			    (flags & KEVENT_FLAG_WORKLOOP)) {
				uthread_t ut = get_bsdthread_info(thread);

				/*
				 * store the continuation/completion data in the uthread
				 *
				 * Note: the kectx aliases with this,
				 * and is destroyed in the process.
				 */
				ut->uu_save.uus_kevent_register = (struct _kevent_register){
					.kev = kev,
					.kqwl = kqu.kqwl,
					.eventout = noutputs,
					.ueventlist = ueventlist,
				};
				knote_fops(kn)->f_post_register_wait(ut, kn,
				    &ut->uu_save.uus_kevent_register);
				__builtin_unreachable();
			}
			kqunlock(kqu);

			kev.flags |= EV_ERROR;
			kev.data = ENOTSUP;
		} else {
			assert((register_rc & FILTER_REGISTER_WAIT) == 0);
		}

		// keep in sync with kevent_register_wait_return()
		if (noutputs < nevents && (kev.flags & (EV_ERROR | EV_RECEIPT))) {
			/* EV_RECEIPT without an error reports success as (EV_ERROR, 0) */
			if ((kev.flags & EV_ERROR) == 0) {
				kev.flags |= EV_ERROR;
				kev.data = 0;
			}
			if (legacy) {
				error = kevent_legacy_copyout(&kev, &ueventlist, flags);
			} else {
				error = kevent_modern_copyout(&kev, &ueventlist);
			}
			if (error == 0) {
				noutputs++;
			}
		} else if (kev.flags & EV_ERROR) {
			/* no room (or no request) to report it per-event: fail the call */
			error = (int)kev.data;
		}
		nchanges--;
	}

	/* short-circuit the scan if we only want error events */
	if ((flags & KEVENT_FLAG_ERROR_EVENTS) == 0 &&
	    nevents > 0 && noutputs == 0 && error == 0) {
		kectx->kec_process_flags = flags;
		kectx->kec_process_nevents = nevents;
		kectx->kec_process_noutputs = 0;
		kectx->kec_process_eventlist = ueventlist;

		if (legacy) {
			error = kqueue_scan(kqu.kq, flags, kectx, kevent_legacy_callback);
		} else {
			error = kqueue_scan(kqu.kq, flags, kectx, kevent_modern_callback);
		}

		noutputs = kectx->kec_process_noutputs;
	} else if (!legacy && (flags & KEVENT_FLAG_NEEDS_END_PROCESSING)) {
		/*
		 * If we didn't go through kqworkloop_end_processing(),
		 * we need to do it here.
		 *
		 * kqueue_scan will call kqworkloop_end_processing(),
		 * so we only need to do it if we didn't scan.
		 */
		kqlock(kqu);
		kqworkloop_end_processing(kqu.kqwl, 0, 0);
		kqunlock(kqu);
	}

	*retval = noutputs;
out:
	return kevent_cleanup(kqu.kq, flags, error, kectx);
}
7643
7644 #pragma mark modern syscalls: kevent_qos, kevent_id, kevent_workq_internal
7645
7646 /*!
7647 * @function kevent_modern_internal
7648 *
7649 * @brief
7650 * The backend of the kevent_id and kevent_workq_internal entry points.
7651 *
7652 * @discussion
7653 * Needs to be inline due to the number of arguments.
7654 */
7655 OS_NOINLINE
7656 static int
kevent_modern_internal(kqueue_t kqu,user_addr_t changelist,int nchanges,user_addr_t ueventlist,int nevents,int flags,kevent_ctx_t kectx,int32_t * retval)7657 kevent_modern_internal(kqueue_t kqu,
7658 user_addr_t changelist, int nchanges,
7659 user_addr_t ueventlist, int nevents,
7660 int flags, kevent_ctx_t kectx, int32_t *retval)
7661 {
7662 return kevent_internal(kqu.kq, changelist, nchanges,
7663 ueventlist, nevents, flags, kectx, retval, /*legacy*/ false);
7664 }
7665
/*!
 * @function kevent_id
 *
 * @brief
 * The kevent_id() syscall.
 */
int
kevent_id(struct proc *p, struct kevent_id_args *uap, int32_t *retval)
{
	int error, flags = uap->flags & KEVENT_FLAG_USER;
	uthread_t uth = current_uthread();
	workq_threadreq_t kqr = uth->uu_kqr_bound;
	kevent_ctx_t kectx = &uth->uu_save.uus_kevent;
	kqueue_t kqu;

	flags = kevent_adjust_flags_for_proc(p, flags);
	flags |= KEVENT_FLAG_DYNAMIC_KQUEUE;

	/* kevent_id() only operates on dynamic workloops, never on workqs */
	if (__improbable((flags & (KEVENT_FLAG_WORKQ | KEVENT_FLAG_WORKLOOP)) !=
	    KEVENT_FLAG_WORKLOOP)) {
		return EINVAL;
	}

	error = kevent_get_data_size(flags, uap->data_available, uap->data_out, kectx);
	if (__improbable(error)) {
		return error;
	}

	kectx->kec_deadline = 0;
	kectx->kec_fp = NULL;
	kectx->kec_fd = -1;
	/* the kec_process_* fields are filled if kqueue_scan is called only */

	/*
	 * Get the kq we are going to be working on
	 * As a fastpath, look at the currently bound workloop.
	 */
	kqu.kqwl = kqr ? kqr_kqworkloop(kqr) : NULL;
	if (kqu.kqwl && kqu.kqwl->kqwl_dynamicid == uap->id) {
		/* fastpath hit: the bound workloop matches the requested id */
		if (__improbable(flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST)) {
			return EEXIST;
		}
		kqworkloop_retain(kqu.kqwl);
	} else if (__improbable(kevent_args_requesting_events(flags, uap->nevents))) {
		/* can only wait for events on the workloop the thread is bound to */
		return EXDEV;
	} else {
		/* slowpath: look the workloop up (or create it) by dynamic id */
		error = kqworkloop_get_or_create(p, uap->id, NULL, flags, &kqu.kqwl);
		if (__improbable(error)) {
			return error;
		}
	}

	/* kevent_modern_internal() consumes the workloop reference via kevent_cleanup() */
	return kevent_modern_internal(kqu, uap->changelist, uap->nchanges,
	    uap->eventlist, uap->nevents, flags, kectx, retval);
}
7721
/**!
 * @function kevent_workq_internal
 *
 * @discussion
 * This function is exported for the sake of the workqueue subsystem.
 *
 * It is called in two ways:
 * - when a thread is about to go to userspace to ask for pending event
 * - when a thread is returning from userspace with events back
 *
 * the workqueue subsystem will only use the following flags:
 * - KEVENT_FLAG_STACK_DATA (always)
 * - KEVENT_FLAG_IMMEDIATE (always)
 * - KEVENT_FLAG_PARKING (depending on whether it is going to or returning from
 *   userspace).
 *
 * It implicitly acts on the bound kqueue, and for the case of workloops
 * will copyout the kqueue ID before anything else.
 *
 *
 * Pthread will have setup the various arguments to fit this stack layout:
 *
 * +-------....----+--------------+-----------+--------------------+
 * |  user  stack  |  data avail  |  nevents  |   pthread_self()   |
 * +-------....----+--------------+-----------+--------------------+
 *         ^                                 ^
 *     data_out                          eventlist
 *
 * When a workloop is used, the workloop ID is copied out right before
 * the eventlist and is taken from the data buffer.
 *
 * @warning
 * This function is carefully tailored to not make any call except the final tail
 * call into kevent_modern_internal. (LTO inlines current_uthread()).
 *
 * This function is performance sensitive due to the workq subsystem.
 */
int
kevent_workq_internal(struct proc *p,
    user_addr_t changelist, int nchanges,
    user_addr_t eventlist, int nevents,
    user_addr_t data_out, user_size_t *data_available,
    unsigned int flags, int32_t *retval)
{
	uthread_t uth = current_uthread();
	workq_threadreq_t kqr = uth->uu_kqr_bound;
	kevent_ctx_t kectx = &uth->uu_save.uus_kevent;
	kqueue_t kqu;

	assert(flags == (KEVENT_FLAG_STACK_DATA | KEVENT_FLAG_IMMEDIATE) ||
	    flags == (KEVENT_FLAG_STACK_DATA | KEVENT_FLAG_IMMEDIATE | KEVENT_FLAG_PARKING));

	/* data_available is a kernel pointer here; stored as an integer in kectx */
	kectx->kec_data_out = data_out;
	kectx->kec_data_avail = (uint64_t)data_available;
	kectx->kec_data_size = *data_available;
	kectx->kec_data_resid = *data_available;
	kectx->kec_deadline = 0;
	kectx->kec_fp = NULL;
	kectx->kec_fd = -1;
	/* the kec_process_* fields are filled if kqueue_scan is called only */

	flags = kevent_adjust_flags_for_proc(p, flags);

	if (kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
		/* bound to a workloop: act on it and hold a reference across the call */
		kqu.kqwl = __container_of(kqr, struct kqworkloop, kqwl_request);
		kqworkloop_retain(kqu.kqwl);

		flags |= KEVENT_FLAG_WORKLOOP | KEVENT_FLAG_DYNAMIC_KQUEUE |
		    KEVENT_FLAG_KERNEL;
	} else {
		/* otherwise act on the process-wide workq kqueue (no reference held) */
		kqu.kqwq = p->p_fd.fd_wqkqueue;

		flags |= KEVENT_FLAG_WORKQ | KEVENT_FLAG_KERNEL;
	}

	return kevent_modern_internal(kqu, changelist, nchanges,
	    eventlist, nevents, flags, kectx, retval);
}
7800
/*!
 * @function kevent_qos
 *
 * @brief
 * The kevent_qos() syscall.
 */
int
kevent_qos(struct proc *p, struct kevent_qos_args *uap, int32_t *retval)
{
	uthread_t uth = current_uthread();
	kevent_ctx_t kectx = &uth->uu_save.uus_kevent;
	int error, flags = uap->flags & KEVENT_FLAG_USER;
	struct kqueue *kq;

	/* reject flags reserved for kevent_id() */
	if (__improbable(flags & KEVENT_ID_FLAG_USER)) {
		return EINVAL;
	}

	flags = kevent_adjust_flags_for_proc(p, flags);

	error = kevent_get_data_size(flags, uap->data_available, uap->data_out, kectx);
	if (__improbable(error)) {
		return error;
	}

	kectx->kec_deadline = 0;
	kectx->kec_fp = NULL;
	kectx->kec_fd = uap->fd;
	/* the kec_process_* fields are filled if kqueue_scan is called only */

	/* get the kq we are going to be working on */
	if (__probable(flags & KEVENT_FLAG_WORKQ)) {
		error = kevent_get_kqwq(p, flags, uap->nevents, &kq);
	} else {
		/* kqfile case: takes a usecount, dropped by kevent_cleanup() */
		error = kevent_get_kqfile(p, uap->fd, flags, &kectx->kec_fp, &kq);
	}
	if (__improbable(error)) {
		return error;
	}

	return kevent_modern_internal(kq, uap->changelist, uap->nchanges,
	    uap->eventlist, uap->nevents, flags, kectx, retval);
}
7844
7845 #pragma mark legacy syscalls: kevent, kevent64
7846
7847 /*!
7848 * @function kevent_legacy_get_deadline
7849 *
7850 * @brief
7851 * Compute the deadline for the legacy kevent syscalls.
7852 *
7853 * @discussion
7854 * This is not necessary if KEVENT_FLAG_IMMEDIATE is specified,
7855 * as this takes precedence over the deadline.
7856 *
7857 * This function will fail if utimeout is USER_ADDR_NULL
7858 * (the caller should check).
7859 */
7860 static int
kevent_legacy_get_deadline(int flags,user_addr_t utimeout,uint64_t * deadline)7861 kevent_legacy_get_deadline(int flags, user_addr_t utimeout, uint64_t *deadline)
7862 {
7863 struct timespec ts;
7864
7865 if (flags & KEVENT_FLAG_PROC64) {
7866 struct user64_timespec ts64;
7867 int error = copyin(utimeout, &ts64, sizeof(ts64));
7868 if (__improbable(error)) {
7869 return error;
7870 }
7871 ts.tv_sec = (unsigned long)ts64.tv_sec;
7872 ts.tv_nsec = (long)ts64.tv_nsec;
7873 } else {
7874 struct user32_timespec ts32;
7875 int error = copyin(utimeout, &ts32, sizeof(ts32));
7876 if (__improbable(error)) {
7877 return error;
7878 }
7879 ts.tv_sec = ts32.tv_sec;
7880 ts.tv_nsec = ts32.tv_nsec;
7881 }
7882 if (!timespec_is_valid(&ts)) {
7883 return EINVAL;
7884 }
7885
7886 clock_absolutetime_interval_to_deadline(tstoabstime(&ts), deadline);
7887 return 0;
7888 }
7889
/*!
 * @function kevent_legacy_internal
 *
 * @brief
 * The core implementation for kevent and kevent64
 */
OS_NOINLINE
static int
kevent_legacy_internal(struct proc *p, struct kevent64_args *uap,
    int32_t *retval, int flags)
{
	uthread_t uth = current_uthread();
	kevent_ctx_t kectx = &uth->uu_save.uus_kevent;
	struct kqueue *kq;
	int error;

	/* reject flags reserved for kevent_id() */
	if (__improbable(uap->flags & KEVENT_ID_FLAG_USER)) {
		return EINVAL;
	}

	flags = kevent_adjust_flags_for_proc(p, flags);

	/* legacy variants have no out-of-band data buffer */
	kectx->kec_data_out = 0;
	kectx->kec_data_avail = 0;
	kectx->kec_data_size = 0;
	kectx->kec_data_resid = 0;
	kectx->kec_deadline = 0;
	kectx->kec_fp = NULL;
	kectx->kec_fd = uap->fd;
	/* the kec_process_* fields are filled if kqueue_scan is called only */

	/* convert timeout to absolute - if we have one (and not immediate) */
	if (__improbable(uap->timeout && !(flags & KEVENT_FLAG_IMMEDIATE))) {
		error = kevent_legacy_get_deadline(flags, uap->timeout,
		    &kectx->kec_deadline);
		if (__improbable(error)) {
			return error;
		}
	}

	/* get the kq we are going to be working on */
	if (flags & KEVENT_FLAG_WORKQ) {
		error = kevent_get_kqwq(p, flags, uap->nevents, &kq);
	} else {
		/* kqfile case: takes a usecount, dropped by kevent_cleanup() */
		error = kevent_get_kqfile(p, uap->fd, flags, &kectx->kec_fp, &kq);
	}
	if (__improbable(error)) {
		return error;
	}

	return kevent_internal(kq, uap->changelist, uap->nchanges,
	    uap->eventlist, uap->nevents, flags, kectx, retval,
	    /*legacy*/ true);
}
7944
7945 /*!
7946 * @function kevent
7947 *
7948 * @brief
7949 * The legacy kevent() syscall.
7950 */
7951 int
kevent(struct proc * p,struct kevent_args * uap,int32_t * retval)7952 kevent(struct proc *p, struct kevent_args *uap, int32_t *retval)
7953 {
7954 struct kevent64_args args = {
7955 .fd = uap->fd,
7956 .changelist = uap->changelist,
7957 .nchanges = uap->nchanges,
7958 .eventlist = uap->eventlist,
7959 .nevents = uap->nevents,
7960 .timeout = uap->timeout,
7961 };
7962
7963 return kevent_legacy_internal(p, &args, retval, KEVENT_FLAG_LEGACY32);
7964 }
7965
7966 /*!
7967 * @function kevent64
7968 *
7969 * @brief
7970 * The legacy kevent64() syscall.
7971 */
7972 int
kevent64(struct proc * p,struct kevent64_args * uap,int32_t * retval)7973 kevent64(struct proc *p, struct kevent64_args *uap, int32_t *retval)
7974 {
7975 int flags = (uap->flags & KEVENT_FLAG_USER) | KEVENT_FLAG_LEGACY64;
7976 return kevent_legacy_internal(p, uap, retval, flags);
7977 }
7978
7979 #pragma mark - socket interface
7980
7981 #if SOCKETS
7982 #include <sys/param.h>
7983 #include <sys/socket.h>
7984 #include <sys/protosw.h>
7985 #include <sys/domain.h>
7986 #include <sys/mbuf.h>
7987 #include <sys/kern_event.h>
7988 #include <sys/malloc.h>
7989 #include <sys/sys_domain.h>
7990 #include <sys/syslog.h>
7991
7992 #ifndef ROUNDUP64
7993 #define ROUNDUP64(x) P2ROUNDUP((x), sizeof (u_int64_t))
7994 #endif
7995
7996 #ifndef ADVANCE64
7997 #define ADVANCE64(p, n) (void*)((char *)(p) + ROUNDUP64(n))
7998 #endif
7999
8000 static LCK_GRP_DECLARE(kev_lck_grp, "Kernel Event Protocol");
8001 static LCK_RW_DECLARE(kev_rwlock, &kev_lck_grp);
8002
8003 static int kev_attach(struct socket *so, int proto, struct proc *p);
8004 static int kev_detach(struct socket *so);
8005 static int kev_control(struct socket *so, u_long cmd, caddr_t data,
8006 struct ifnet *ifp, struct proc *p);
8007 static lck_mtx_t * event_getlock(struct socket *, int);
8008 static int event_lock(struct socket *, int, void *);
8009 static int event_unlock(struct socket *, int, void *);
8010
8011 static int event_sofreelastref(struct socket *);
8012 static void kev_delete(struct kern_event_pcb *);
8013
8014 static struct pr_usrreqs event_usrreqs = {
8015 .pru_attach = kev_attach,
8016 .pru_control = kev_control,
8017 .pru_detach = kev_detach,
8018 .pru_soreceive = soreceive,
8019 };
8020
8021 static struct protosw eventsw[] = {
8022 {
8023 .pr_type = SOCK_RAW,
8024 .pr_protocol = SYSPROTO_EVENT,
8025 .pr_flags = PR_ATOMIC,
8026 .pr_usrreqs = &event_usrreqs,
8027 .pr_lock = event_lock,
8028 .pr_unlock = event_unlock,
8029 .pr_getlock = event_getlock,
8030 }
8031 };
8032
8033 __private_extern__ int kevt_getstat SYSCTL_HANDLER_ARGS;
8034 __private_extern__ int kevt_pcblist SYSCTL_HANDLER_ARGS;
8035
8036 SYSCTL_NODE(_net_systm, OID_AUTO, kevt,
8037 CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Kernel event family");
8038
8039 struct kevtstat kevtstat;
8040 SYSCTL_PROC(_net_systm_kevt, OID_AUTO, stats,
8041 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
8042 kevt_getstat, "S,kevtstat", "");
8043
8044 SYSCTL_PROC(_net_systm_kevt, OID_AUTO, pcblist,
8045 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
8046 kevt_pcblist, "S,xkevtpcb", "");
8047
/*
 * Return the per-pcb mutex protecting this kernel event socket.
 * Panics (rather than returning) if the socket has no pcb or a
 * negative usecount.
 */
static lck_mtx_t *
event_getlock(struct socket *so, int flags)
{
#pragma unused(flags)
	struct kern_event_pcb *ev_pcb = (struct kern_event_pcb *)so->so_pcb;

	if (so->so_pcb != NULL) {
		if (so->so_usecount < 0) {
			panic("%s: so=%p usecount=%d lrh= %s", __func__,
			    so, so->so_usecount, solockhistory_nr(so));
			/* NOTREACHED */
		}
	} else {
		panic("%s: so=%p NULL NO so_pcb %s", __func__,
		    so, solockhistory_nr(so));
		/* NOTREACHED */
	}
	return &ev_pcb->evp_mtx;
}
8067
/*
 * pr_lock handler: take the per-PCB mutex, optionally bump the socket's
 * use count, and record the caller in the lock-history ring for debugging.
 * `lr` is the caller's return address; when NULL we capture our own.
 */
static int
event_lock(struct socket *so, int refcount, void *lr)
{
	void *lr_saved;

	if (lr == NULL) {
		lr_saved = __builtin_return_address(0);
	} else {
		lr_saved = lr;
	}

	if (so->so_pcb != NULL) {
		lck_mtx_lock(&((struct kern_event_pcb *)so->so_pcb)->evp_mtx);
	} else {
		panic("%s: so=%p NO PCB! lr=%p lrh= %s", __func__,
		    so, lr_saved, solockhistory_nr(so));
		/* NOTREACHED */
	}

	/* Sanity-check the use count only after the mutex is held. */
	if (so->so_usecount < 0) {
		panic("%s: so=%p so_pcb=%p lr=%p ref=%d lrh= %s", __func__,
		    so, so->so_pcb, lr_saved, so->so_usecount,
		    solockhistory_nr(so));
		/* NOTREACHED */
	}

	if (refcount) {
		so->so_usecount++;
	}

	/* Record caller in the circular lock-history buffer. */
	so->lock_lr[so->next_lock_lr] = lr_saved;
	so->next_lock_lr = (so->next_lock_lr + 1) % SO_LCKDBG_MAX;
	return 0;
}
8102
/*
 * pr_unlock handler: optionally drop a use-count reference, record the
 * caller in the unlock history, and release the per-PCB mutex.  When the
 * last reference goes away, event_sofreelastref() tears the socket down
 * (and consumes the mutex instead of a plain unlock).
 */
static int
event_unlock(struct socket *so, int refcount, void *lr)
{
	void *lr_saved;
	lck_mtx_t *mutex_held;

	if (lr == NULL) {
		lr_saved = __builtin_return_address(0);
	} else {
		lr_saved = lr;
	}

	if (refcount) {
		so->so_usecount--;
	}
	if (so->so_usecount < 0) {
		panic("%s: so=%p usecount=%d lrh= %s", __func__,
		    so, so->so_usecount, solockhistory_nr(so));
		/* NOTREACHED */
	}
	if (so->so_pcb == NULL) {
		panic("%s: so=%p NO PCB usecount=%d lr=%p lrh= %s", __func__,
		    so, so->so_usecount, (void *)lr_saved,
		    solockhistory_nr(so));
		/* NOTREACHED */
	}
	mutex_held = (&((struct kern_event_pcb *)so->so_pcb)->evp_mtx);

	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
	so->unlock_lr[so->next_unlock_lr] = lr_saved;
	so->next_unlock_lr = (so->next_unlock_lr + 1) % SO_LCKDBG_MAX;

	if (so->so_usecount == 0) {
		/* Detach must have flagged the PCB for clearing by now. */
		VERIFY(so->so_flags & SOF_PCBCLEARING);
		/* Releases the mutex as part of the final teardown. */
		event_sofreelastref(so);
	} else {
		lck_mtx_unlock(mutex_held);
	}

	return 0;
}
8144
/*
 * Final teardown of an event socket once its use count reaches zero.
 * Called with the PCB mutex held; drops it, unlinks the PCB from the
 * global list under kev_rwlock, frees the PCB, and releases the socket.
 */
static int
event_sofreelastref(struct socket *so)
{
	struct kern_event_pcb *ev_pcb = (struct kern_event_pcb *)so->so_pcb;

	LCK_MTX_ASSERT(&(ev_pcb->evp_mtx), LCK_MTX_ASSERT_OWNED);

	so->so_pcb = NULL;

	/*
	 * Disable upcall in the event another thread is in kev_post_msg()
	 * appending record to the receive socket buffer, since sbwakeup()
	 * may release the socket lock otherwise.
	 */
	so->so_rcv.sb_flags &= ~SB_UPCALL;
	so->so_snd.sb_flags &= ~SB_UPCALL;
	so->so_event = sonullevent;
	lck_mtx_unlock(&(ev_pcb->evp_mtx));

	/* Mutex must be dropped before taking the global rwlock. */
	LCK_MTX_ASSERT(&(ev_pcb->evp_mtx), LCK_MTX_ASSERT_NOTOWNED);
	lck_rw_lock_exclusive(&kev_rwlock);
	LIST_REMOVE(ev_pcb, evp_link);
	kevtstat.kes_pcbcount--;
	kevtstat.kes_gencnt++;
	lck_rw_done(&kev_rwlock);
	kev_delete(ev_pcb);

	sofreelastref(so, 1);
	return 0;
}
8175
/* Number of entries in eventsw[] registered with the system domain. */
static int event_proto_count = (sizeof(eventsw) / sizeof(struct protosw));

/* Global list of all kern_event PCBs, protected by kev_rwlock. */
static
struct kern_event_head  kern_event_head;

/* Monotonically increasing event ID; incremented under kev_rwlock (shared). */
static u_int32_t static_event_id = 0;

/* Typed allocation zone for kern_event PCBs. */
static KALLOC_TYPE_DEFINE(ev_pcb_zone, struct kern_event_pcb, NET_KT_DEFAULT);
8184
8185 /*
8186 * Install the protosw's for the NKE manager. Invoked at extension load time
8187 */
8188 void
kern_event_init(struct domain * dp)8189 kern_event_init(struct domain *dp)
8190 {
8191 struct protosw *pr;
8192 int i;
8193
8194 VERIFY(!(dp->dom_flags & DOM_INITIALIZED));
8195 VERIFY(dp == systemdomain);
8196
8197 for (i = 0, pr = &eventsw[0]; i < event_proto_count; i++, pr++) {
8198 net_add_proto(pr, dp, 1);
8199 }
8200 }
8201
8202 static int
kev_attach(struct socket * so,__unused int proto,__unused struct proc * p)8203 kev_attach(struct socket *so, __unused int proto, __unused struct proc *p)
8204 {
8205 int error = 0;
8206 struct kern_event_pcb *ev_pcb;
8207
8208 error = soreserve(so, KEV_SNDSPACE, KEV_RECVSPACE);
8209 if (error != 0) {
8210 return error;
8211 }
8212
8213 ev_pcb = zalloc_flags(ev_pcb_zone, Z_WAITOK | Z_ZERO);
8214 lck_mtx_init(&ev_pcb->evp_mtx, &kev_lck_grp, LCK_ATTR_NULL);
8215
8216 ev_pcb->evp_socket = so;
8217 ev_pcb->evp_vendor_code_filter = 0xffffffff;
8218
8219 so->so_pcb = (caddr_t) ev_pcb;
8220 lck_rw_lock_exclusive(&kev_rwlock);
8221 LIST_INSERT_HEAD(&kern_event_head, ev_pcb, evp_link);
8222 kevtstat.kes_pcbcount++;
8223 kevtstat.kes_gencnt++;
8224 lck_rw_done(&kev_rwlock);
8225
8226 return error;
8227 }
8228
8229 static void
kev_delete(struct kern_event_pcb * ev_pcb)8230 kev_delete(struct kern_event_pcb *ev_pcb)
8231 {
8232 VERIFY(ev_pcb != NULL);
8233 lck_mtx_destroy(&ev_pcb->evp_mtx, &kev_lck_grp);
8234 zfree(ev_pcb_zone, ev_pcb);
8235 }
8236
8237 static int
kev_detach(struct socket * so)8238 kev_detach(struct socket *so)
8239 {
8240 struct kern_event_pcb *ev_pcb = (struct kern_event_pcb *) so->so_pcb;
8241
8242 if (ev_pcb != NULL) {
8243 soisdisconnected(so);
8244 so->so_flags |= SOF_PCBCLEARING;
8245 }
8246
8247 return 0;
8248 }
8249
8250 /*
8251 * For now, kev_vendor_code and mbuf_tags use the same
8252 * mechanism.
8253 */
8254 errno_t
kev_vendor_code_find(const char * string,u_int32_t * out_vendor_code)8255 kev_vendor_code_find(
8256 const char *string,
8257 u_int32_t *out_vendor_code)
8258 {
8259 if (strlen(string) >= KEV_VENDOR_CODE_MAX_STR_LEN) {
8260 return EINVAL;
8261 }
8262 return net_str_id_find_internal(string, out_vendor_code,
8263 NSI_VENDOR_CODE, 1);
8264 }
8265
8266 errno_t
kev_msg_post(struct kev_msg * event_msg)8267 kev_msg_post(struct kev_msg *event_msg)
8268 {
8269 mbuf_tag_id_t min_vendor, max_vendor;
8270
8271 net_str_id_first_last(&min_vendor, &max_vendor, NSI_VENDOR_CODE);
8272
8273 if (event_msg == NULL) {
8274 return EINVAL;
8275 }
8276
8277 /*
8278 * Limit third parties to posting events for registered vendor codes
8279 * only
8280 */
8281 if (event_msg->vendor_code < min_vendor ||
8282 event_msg->vendor_code > max_vendor) {
8283 os_atomic_inc(&kevtstat.kes_badvendor, relaxed);
8284 return EINVAL;
8285 }
8286 return kev_post_msg(event_msg);
8287 }
8288
/*
 * Build a single-mbuf record from `event_msg` and deliver a copy to every
 * event socket whose vendor/class/subclass filters match.  `wait` is the
 * mbuf allocation mode (M_WAIT or M_NOWAIT).
 *
 * Returns 0 on success, EMSGSIZE if the message exceeds one mbuf, or
 * ENOMEM on allocation failure.
 */
static int
kev_post_msg_internal(struct kev_msg *event_msg, int wait)
{
	struct mbuf *m, *m2;
	struct kern_event_pcb *ev_pcb;
	struct kern_event_msg *ev;
	char *tmp;
	u_int32_t total_size;
	int i;

#if SKYWALK && defined(XNU_TARGET_OS_OSX)
	/*
	 * Special hook for ALF state updates
	 */
	if (event_msg->vendor_code == KEV_VENDOR_APPLE &&
	    event_msg->kev_class == KEV_NKE_CLASS &&
	    event_msg->kev_subclass == KEV_NKE_ALF_SUBCLASS &&
	    event_msg->event_code == KEV_NKE_ALF_STATE_CHANGED) {
#if (DEBUG || DEVELOPMENT)
		os_log_info(OS_LOG_DEFAULT, "KEV_NKE_ALF_STATE_CHANGED posted");
#endif /* DEBUG || DEVELOPMENT */
		net_filter_event_mark(NET_FILTER_EVENT_ALF,
		    net_check_compatible_alf());
	}
#endif /* SKYWALK && XNU_TARGET_OS_OSX */

	/* Verify the message is small enough to fit in one mbuf w/o cluster */
	total_size = KEV_MSG_HEADER_SIZE;

	/* Up to 5 data vectors; a zero length terminates the list. */
	for (i = 0; i < 5; i++) {
		if (event_msg->dv[i].data_length == 0) {
			break;
		}
		total_size += event_msg->dv[i].data_length;
	}

	if (total_size > MLEN) {
		os_atomic_inc(&kevtstat.kes_toobig, relaxed);
		return EMSGSIZE;
	}

	m = m_get(wait, MT_DATA);
	if (m == 0) {
		os_atomic_inc(&kevtstat.kes_nomem, relaxed);
		return ENOMEM;
	}
	ev = mtod(m, struct kern_event_msg *);
	/* Recompute the size while copying the payload into the mbuf. */
	total_size = KEV_MSG_HEADER_SIZE;

	tmp = (char *) &ev->event_data[0];
	for (i = 0; i < 5; i++) {
		if (event_msg->dv[i].data_length == 0) {
			break;
		}

		total_size += event_msg->dv[i].data_length;
		bcopy(event_msg->dv[i].data_ptr, tmp,
		    event_msg->dv[i].data_length);
		tmp += event_msg->dv[i].data_length;
	}

	ev->id = ++static_event_id;
	ev->total_size = total_size;
	ev->vendor_code = event_msg->vendor_code;
	ev->kev_class = event_msg->kev_class;
	ev->kev_subclass = event_msg->kev_subclass;
	ev->event_code = event_msg->event_code;

	m->m_len = total_size;
	/* Walk all event sockets; kev_rwlock keeps the list stable. */
	lck_rw_lock_shared(&kev_rwlock);
	for (ev_pcb = LIST_FIRST(&kern_event_head);
	    ev_pcb;
	    ev_pcb = LIST_NEXT(ev_pcb, evp_link)) {
		lck_mtx_lock(&ev_pcb->evp_mtx);
		/* Skip sockets that are being torn down. */
		if (ev_pcb->evp_socket->so_pcb == NULL) {
			lck_mtx_unlock(&ev_pcb->evp_mtx);
			continue;
		}
		/* Filters nest: vendor, then class, then subclass. */
		if (ev_pcb->evp_vendor_code_filter != KEV_ANY_VENDOR) {
			if (ev_pcb->evp_vendor_code_filter != ev->vendor_code) {
				lck_mtx_unlock(&ev_pcb->evp_mtx);
				continue;
			}

			if (ev_pcb->evp_class_filter != KEV_ANY_CLASS) {
				if (ev_pcb->evp_class_filter != ev->kev_class) {
					lck_mtx_unlock(&ev_pcb->evp_mtx);
					continue;
				}

				if ((ev_pcb->evp_subclass_filter !=
				    KEV_ANY_SUBCLASS) &&
				    (ev_pcb->evp_subclass_filter !=
				    ev->kev_subclass)) {
					lck_mtx_unlock(&ev_pcb->evp_mtx);
					continue;
				}
			}
		}

		/* Each matching socket gets its own copy of the record. */
		m2 = m_copym(m, 0, m->m_len, wait);
		if (m2 == 0) {
			os_atomic_inc(&kevtstat.kes_nomem, relaxed);
			m_free(m);
			lck_mtx_unlock(&ev_pcb->evp_mtx);
			lck_rw_done(&kev_rwlock);
			return ENOMEM;
		}
		if (sbappendrecord(&ev_pcb->evp_socket->so_rcv, m2)) {
			/*
			 * We use "m" for the socket stats as it would be
			 * unsafe to use "m2"
			 */
			so_inc_recv_data_stat(ev_pcb->evp_socket,
			    1, m->m_len, MBUF_TC_BE);

			sorwakeup(ev_pcb->evp_socket);
			os_atomic_inc(&kevtstat.kes_posted, relaxed);
		} else {
			os_atomic_inc(&kevtstat.kes_fullsock, relaxed);
		}
		lck_mtx_unlock(&ev_pcb->evp_mtx);
	}
	m_free(m);
	lck_rw_done(&kev_rwlock);

	return 0;
}
8417
/* Post a kernel event, blocking for mbuf allocation if necessary. */
int
kev_post_msg(struct kev_msg *event_msg)
{
	return kev_post_msg_internal(event_msg, M_WAIT);
}
8423
/* Post a kernel event without blocking; may fail with ENOMEM. */
int
kev_post_msg_nowait(struct kev_msg *event_msg)
{
	return kev_post_msg_internal(event_msg, M_NOWAIT);
}
8429
/*
 * pru_control handler: ioctl dispatch for event sockets.
 *   SIOCGKEVID     - read the most recently assigned event id
 *   SIOCSKEVFILT   - set the vendor/class/subclass delivery filter
 *   SIOCGKEVFILT   - read back the current filter
 *   SIOCGKEVVENDOR - look up an existing vendor code by name
 * Returns ENOTSUP for any other command.
 */
static int
kev_control(struct socket *so,
    u_long cmd,
    caddr_t data,
    __unused struct ifnet *ifp,
    __unused struct proc *p)
{
	struct kev_request *kev_req = (struct kev_request *) data;
	struct kern_event_pcb *ev_pcb;
	struct kev_vendor_code *kev_vendor;
	u_int32_t *id_value = (u_int32_t *) data;

	switch (cmd) {
	case SIOCGKEVID:
		*id_value = static_event_id;
		break;
	case SIOCSKEVFILT:
		ev_pcb = (struct kern_event_pcb *) so->so_pcb;
		ev_pcb->evp_vendor_code_filter = kev_req->vendor_code;
		ev_pcb->evp_class_filter = kev_req->kev_class;
		ev_pcb->evp_subclass_filter = kev_req->kev_subclass;
		break;
	case SIOCGKEVFILT:
		ev_pcb = (struct kern_event_pcb *) so->so_pcb;
		kev_req->vendor_code = ev_pcb->evp_vendor_code_filter;
		kev_req->kev_class = ev_pcb->evp_class_filter;
		kev_req->kev_subclass = ev_pcb->evp_subclass_filter;
		break;
	case SIOCGKEVVENDOR:
		kev_vendor = (struct kev_vendor_code *)data;
		/* Make sure string is NULL terminated */
		kev_vendor->vendor_string[KEV_VENDOR_CODE_MAX_STR_LEN - 1] = 0;
		/* Final argument 0 => do not create a new vendor code. */
		return net_str_id_find_internal(kev_vendor->vendor_string,
		           &kev_vendor->vendor_code, NSI_VENDOR_CODE, 0);
	default:
		return ENOTSUP;
	}

	return 0;
}
8470
8471 int
8472 kevt_getstat SYSCTL_HANDLER_ARGS
8473 {
8474 #pragma unused(oidp, arg1, arg2)
8475 int error = 0;
8476
8477 lck_rw_lock_shared(&kev_rwlock);
8478
8479 if (req->newptr != USER_ADDR_NULL) {
8480 error = EPERM;
8481 goto done;
8482 }
8483 if (req->oldptr == USER_ADDR_NULL) {
8484 req->oldidx = sizeof(struct kevtstat);
8485 goto done;
8486 }
8487
8488 error = SYSCTL_OUT(req, &kevtstat,
8489 MIN(sizeof(struct kevtstat), req->oldlen));
8490 done:
8491 lck_rw_done(&kev_rwlock);
8492
8493 return error;
8494 }
8495
8496 __private_extern__ int
8497 kevt_pcblist SYSCTL_HANDLER_ARGS
8498 {
8499 #pragma unused(oidp, arg1, arg2)
8500 int error = 0;
8501 uint64_t n, i;
8502 struct xsystmgen xsg;
8503 void *buf = NULL;
8504 size_t item_size = ROUNDUP64(sizeof(struct xkevtpcb)) +
8505 ROUNDUP64(sizeof(struct xsocket_n)) +
8506 2 * ROUNDUP64(sizeof(struct xsockbuf_n)) +
8507 ROUNDUP64(sizeof(struct xsockstat_n));
8508 struct kern_event_pcb *ev_pcb;
8509
8510 buf = kalloc_data(item_size, Z_WAITOK | Z_ZERO);
8511 if (buf == NULL) {
8512 return ENOMEM;
8513 }
8514
8515 lck_rw_lock_shared(&kev_rwlock);
8516
8517 n = kevtstat.kes_pcbcount;
8518
8519 if (req->oldptr == USER_ADDR_NULL) {
8520 req->oldidx = (size_t) ((n + n / 8) * item_size);
8521 goto done;
8522 }
8523 if (req->newptr != USER_ADDR_NULL) {
8524 error = EPERM;
8525 goto done;
8526 }
8527 bzero(&xsg, sizeof(xsg));
8528 xsg.xg_len = sizeof(xsg);
8529 xsg.xg_count = n;
8530 xsg.xg_gen = kevtstat.kes_gencnt;
8531 xsg.xg_sogen = so_gencnt;
8532 error = SYSCTL_OUT(req, &xsg, sizeof(xsg));
8533 if (error) {
8534 goto done;
8535 }
8536 /*
8537 * We are done if there is no pcb
8538 */
8539 if (n == 0) {
8540 goto done;
8541 }
8542
8543 i = 0;
8544 for (i = 0, ev_pcb = LIST_FIRST(&kern_event_head);
8545 i < n && ev_pcb != NULL;
8546 i++, ev_pcb = LIST_NEXT(ev_pcb, evp_link)) {
8547 struct xkevtpcb *xk = (struct xkevtpcb *)buf;
8548 struct xsocket_n *xso = (struct xsocket_n *)
8549 ADVANCE64(xk, sizeof(*xk));
8550 struct xsockbuf_n *xsbrcv = (struct xsockbuf_n *)
8551 ADVANCE64(xso, sizeof(*xso));
8552 struct xsockbuf_n *xsbsnd = (struct xsockbuf_n *)
8553 ADVANCE64(xsbrcv, sizeof(*xsbrcv));
8554 struct xsockstat_n *xsostats = (struct xsockstat_n *)
8555 ADVANCE64(xsbsnd, sizeof(*xsbsnd));
8556
8557 bzero(buf, item_size);
8558
8559 lck_mtx_lock(&ev_pcb->evp_mtx);
8560
8561 xk->kep_len = sizeof(struct xkevtpcb);
8562 xk->kep_kind = XSO_EVT;
8563 xk->kep_evtpcb = (uint64_t)VM_KERNEL_ADDRPERM(ev_pcb);
8564 xk->kep_vendor_code_filter = ev_pcb->evp_vendor_code_filter;
8565 xk->kep_class_filter = ev_pcb->evp_class_filter;
8566 xk->kep_subclass_filter = ev_pcb->evp_subclass_filter;
8567
8568 sotoxsocket_n(ev_pcb->evp_socket, xso);
8569 sbtoxsockbuf_n(ev_pcb->evp_socket ?
8570 &ev_pcb->evp_socket->so_rcv : NULL, xsbrcv);
8571 sbtoxsockbuf_n(ev_pcb->evp_socket ?
8572 &ev_pcb->evp_socket->so_snd : NULL, xsbsnd);
8573 sbtoxsockstat_n(ev_pcb->evp_socket, xsostats);
8574
8575 lck_mtx_unlock(&ev_pcb->evp_mtx);
8576
8577 error = SYSCTL_OUT(req, buf, item_size);
8578 }
8579
8580 if (error == 0) {
8581 /*
8582 * Give the user an updated idea of our state.
8583 * If the generation differs from what we told
8584 * her before, she knows that something happened
8585 * while we were processing this request, and it
8586 * might be necessary to retry.
8587 */
8588 bzero(&xsg, sizeof(xsg));
8589 xsg.xg_len = sizeof(xsg);
8590 xsg.xg_count = n;
8591 xsg.xg_gen = kevtstat.kes_gencnt;
8592 xsg.xg_sogen = so_gencnt;
8593 error = SYSCTL_OUT(req, &xsg, sizeof(xsg));
8594 if (error) {
8595 goto done;
8596 }
8597 }
8598
8599 done:
8600 lck_rw_done(&kev_rwlock);
8601
8602 kfree_data(buf, item_size);
8603 return error;
8604 }
8605
8606 #endif /* SOCKETS */
8607
8608
/*
 * Fill a kqueue_info record for libproc from a kqueue (fileops, workq, or
 * workloop variant — kqu is a union of all three views).  Returns 0.
 */
int
fill_kqueueinfo(kqueue_t kqu, struct kqueue_info * kinfo)
{
	struct vinfo_stat * st;

	st = &kinfo->kq_stat;

	st->vst_size = kqu.kq->kq_count;
	/* Block size reflects which kevent ABI the kqueue was created with. */
	if (kqu.kq->kq_state & KQ_KEV_QOS) {
		st->vst_blksize = sizeof(struct kevent_qos_s);
	} else if (kqu.kq->kq_state & KQ_KEV64) {
		st->vst_blksize = sizeof(struct kevent64_s);
	} else {
		st->vst_blksize = sizeof(struct kevent);
	}
	st->vst_mode = S_IFIFO;
	/* Only dynamic (workloop) kqueues have an identity to report. */
	st->vst_ino = (kqu.kq->kq_state & KQ_DYNAMIC) ?
	    kqu.kqwl->kqwl_dynamicid : 0;

	/* flags exported to libproc as PROC_KQUEUE_* (sys/proc_info.h) */
#define PROC_KQUEUE_MASK (KQ_SLEEP|KQ_KEV32|KQ_KEV64|KQ_KEV_QOS|KQ_WORKQ|KQ_WORKLOOP)
	/* The export relies on kernel and libproc flag values matching bit-for-bit. */
	static_assert(PROC_KQUEUE_SLEEP == KQ_SLEEP);
	static_assert(PROC_KQUEUE_32 == KQ_KEV32);
	static_assert(PROC_KQUEUE_64 == KQ_KEV64);
	static_assert(PROC_KQUEUE_QOS == KQ_KEV_QOS);
	static_assert(PROC_KQUEUE_WORKQ == KQ_WORKQ);
	static_assert(PROC_KQUEUE_WORKLOOP == KQ_WORKLOOP);
	kinfo->kq_state = kqu.kq->kq_state & PROC_KQUEUE_MASK;
	if ((kqu.kq->kq_state & (KQ_WORKLOOP | KQ_WORKQ)) == 0) {
		/* Plain file-backed kqueue: surface select() registration too. */
		if (kqu.kqf->kqf_sel.si_flags & SI_RECORDED) {
			kinfo->kq_state |= PROC_KQUEUE_SELECT;
		}
	}

	return 0;
}
8645
8646 static int
fill_kqueue_dyninfo(struct kqworkloop * kqwl,struct kqueue_dyninfo * kqdi)8647 fill_kqueue_dyninfo(struct kqworkloop *kqwl, struct kqueue_dyninfo *kqdi)
8648 {
8649 workq_threadreq_t kqr = &kqwl->kqwl_request;
8650 workq_threadreq_param_t trp = {};
8651 int err;
8652
8653 if ((kqwl->kqwl_state & KQ_WORKLOOP) == 0) {
8654 return EINVAL;
8655 }
8656
8657 if ((err = fill_kqueueinfo(&kqwl->kqwl_kqueue, &kqdi->kqdi_info))) {
8658 return err;
8659 }
8660
8661 kqlock(kqwl);
8662
8663 kqdi->kqdi_servicer = thread_tid(kqr_thread(kqr));
8664 kqdi->kqdi_owner = thread_tid(kqwl->kqwl_owner);
8665 kqdi->kqdi_request_state = kqr->tr_state;
8666 kqdi->kqdi_async_qos = kqr->tr_kq_qos_index;
8667 kqdi->kqdi_events_qos = kqr->tr_kq_override_index;
8668 kqdi->kqdi_sync_waiters = 0;
8669 kqdi->kqdi_sync_waiter_qos = 0;
8670
8671 trp.trp_value = kqwl->kqwl_params;
8672 if (trp.trp_flags & TRP_PRIORITY) {
8673 kqdi->kqdi_pri = trp.trp_pri;
8674 } else {
8675 kqdi->kqdi_pri = 0;
8676 }
8677
8678 if (trp.trp_flags & TRP_POLICY) {
8679 kqdi->kqdi_pol = trp.trp_pol;
8680 } else {
8681 kqdi->kqdi_pol = 0;
8682 }
8683
8684 if (trp.trp_flags & TRP_CPUPERCENT) {
8685 kqdi->kqdi_cpupercent = trp.trp_cpupercent;
8686 } else {
8687 kqdi->kqdi_cpupercent = 0;
8688 }
8689
8690 kqunlock(kqwl);
8691
8692 return 0;
8693 }
8694
8695
/*
 * Walk a knote list starting at `kn` and, for each knote attached to `kq`,
 * copy a sanitized kevent_extinfo record into `buf` (up to `buflen`
 * entries, starting at index `nknotes`).  Returns the running total of
 * matching knotes, which may exceed `buflen` — callers use the excess to
 * report how much buffer was actually needed.
 */
static unsigned long
kevent_extinfo_emit(struct kqueue *kq, struct knote *kn, struct kevent_extinfo *buf,
    unsigned long buflen, unsigned long nknotes)
{
	for (; kn; kn = SLIST_NEXT(kn, kn_link)) {
		if (kq == knote_get_kq(kn)) {
			if (nknotes < buflen) {
				struct kevent_extinfo *info = &buf[nknotes];

				kqlock(kq);

				/* Let the filter scrub kernel-private data if it wants to. */
				if (knote_fops(kn)->f_sanitized_copyout) {
					knote_fops(kn)->f_sanitized_copyout(kn, &info->kqext_kev);
				} else {
					info->kqext_kev = *(struct kevent_qos_s *)&kn->kn_kevent;
				}

				if (knote_has_qos(kn)) {
					info->kqext_kev.qos =
					    _pthread_priority_thread_qos_fast(kn->kn_qos);
				} else {
					info->kqext_kev.qos = kn->kn_qos_override;
				}
				info->kqext_kev.filter |= 0xff00; /* sign extend filter */
				info->kqext_kev.xflags = 0; /* this is where sfflags lives */
				info->kqext_kev.data = 0; /* this is where sdata lives */
				info->kqext_sdata = kn->kn_sdata;
				info->kqext_status = kn->kn_status;
				info->kqext_sfflags = kn->kn_sfflags;

				kqunlock(kq);
			}

			/* we return total number of knotes, which may be more than requested */
			nknotes++;
		}
	}

	return nknotes;
}
8736
/*
 * Copy out the dynamic kqueue (workloop) ids of process `proc` into the
 * user buffer `ubuf` (capacity `ubufsize` bytes).  `*nkqueues_out` gets
 * the total number found — capped at PROC_PIDDYNKQUEUES_MAX — even when
 * more exist than fit in the buffer.  Returns 0 or an errno.
 */
int
kevent_copyout_proc_dynkqids(void *proc, user_addr_t ubuf, uint32_t ubufsize,
    int32_t *nkqueues_out)
{
	proc_t p = (proc_t)proc;
	struct filedesc *fdp = &p->p_fd;
	unsigned int nkqueues = 0;
	unsigned long ubuflen = ubufsize / sizeof(kqueue_id_t);
	size_t buflen, bufsize;
	kqueue_id_t *kq_ids = NULL;
	int err = 0;

	assert(p != NULL);

	/* A non-zero size with no buffer is a malformed request. */
	if (ubuf == USER_ADDR_NULL && ubufsize != 0) {
		err = EINVAL;
		goto out;
	}

	buflen = MIN(ubuflen, PROC_PIDDYNKQUEUES_MAX);

	if (ubuflen != 0) {
		if (os_mul_overflow(sizeof(kqueue_id_t), buflen, &bufsize)) {
			err = ERANGE;
			goto out;
		}
		/* Kernel-side staging buffer; copied out after the hash walk. */
		kq_ids = (kqueue_id_t *)kalloc_data(bufsize, Z_WAITOK | Z_ZERO);
		if (!kq_ids) {
			err = ENOMEM;
			goto out;
		}
	}

	kqhash_lock(fdp);

	/* Walk every bucket of the per-process workloop hash. */
	if (fdp->fd_kqhashmask > 0) {
		for (uint32_t i = 0; i < fdp->fd_kqhashmask + 1; i++) {
			struct kqworkloop *kqwl;

			LIST_FOREACH(kqwl, &fdp->fd_kqhash[i], kqwl_hashlink) {
				/* report the number of kqueues, even if they don't all fit */
				if (nkqueues < buflen) {
					kq_ids[nkqueues] = kqwl->kqwl_dynamicid;
				}
				nkqueues++;
			}
		}
	}

	kqhash_unlock(fdp);

	if (kq_ids) {
		size_t copysize;
		if (os_mul_overflow(sizeof(kqueue_id_t), MIN(buflen, nkqueues), &copysize)) {
			err = ERANGE;
			goto out;
		}

		assert(ubufsize >= copysize);
		err = copyout(kq_ids, ubuf, copysize);
	}

out:
	if (kq_ids) {
		kfree_data(kq_ids, bufsize);
	}

	if (!err) {
		*nkqueues_out = (int)min(nkqueues, PROC_PIDDYNKQUEUES_MAX);
	}
	return err;
}
8809
/*
 * Copy out info about one dynamic kqueue identified by `kq_id`.  Depending
 * on the caller's buffer size, emits either the full kqueue_dyninfo or, for
 * backward compatibility, just the embedded kqueue_info prefix.
 * `*size_out` reports how many bytes were written.  Returns ENOBUFS,
 * ESRCH, or any copyout/fill error.
 */
int
kevent_copyout_dynkqinfo(void *proc, kqueue_id_t kq_id, user_addr_t ubuf,
    uint32_t ubufsize, int32_t *size_out)
{
	proc_t p = (proc_t)proc;
	struct kqworkloop *kqwl;
	int err = 0;
	struct kqueue_dyninfo kqdi = { };

	assert(p != NULL);

	if (ubufsize < sizeof(struct kqueue_info)) {
		return ENOBUFS;
	}

	/* Takes a reference on the workloop; released below. */
	kqwl = kqworkloop_hash_lookup_and_retain(&p->p_fd, kq_id);
	if (!kqwl) {
		return ESRCH;
	}

	/*
	 * backward compatibility: allow the argument to this call to only be
	 * a struct kqueue_info
	 */
	if (ubufsize >= sizeof(struct kqueue_dyninfo)) {
		ubufsize = sizeof(struct kqueue_dyninfo);
		err = fill_kqueue_dyninfo(kqwl, &kqdi);
	} else {
		ubufsize = sizeof(struct kqueue_info);
		err = fill_kqueueinfo(&kqwl->kqwl_kqueue, &kqdi.kqdi_info);
	}
	if (err == 0 && (err = copyout(&kqdi, ubuf, ubufsize)) == 0) {
		*size_out = ubufsize;
	}
	kqworkloop_release(kqwl);
	return err;
}
8847
8848 int
kevent_copyout_dynkqextinfo(void * proc,kqueue_id_t kq_id,user_addr_t ubuf,uint32_t ubufsize,int32_t * nknotes_out)8849 kevent_copyout_dynkqextinfo(void *proc, kqueue_id_t kq_id, user_addr_t ubuf,
8850 uint32_t ubufsize, int32_t *nknotes_out)
8851 {
8852 proc_t p = (proc_t)proc;
8853 struct kqworkloop *kqwl;
8854 int err;
8855
8856 kqwl = kqworkloop_hash_lookup_and_retain(&p->p_fd, kq_id);
8857 if (!kqwl) {
8858 return ESRCH;
8859 }
8860
8861 err = pid_kqueue_extinfo(p, &kqwl->kqwl_kqueue, ubuf, ubufsize, nknotes_out);
8862 kqworkloop_release(kqwl);
8863 return err;
8864 }
8865
/*
 * Copy out kevent_extinfo records for every knote of process `p` that is
 * attached to kqueue `kq`, scanning both the fd-indexed knote list and the
 * knote hash.  `*retval` reports the total number of matching knotes
 * (capped at PROC_PIDFDKQUEUE_KNOTES_MAX) even if they did not all fit in
 * `ubuf`.  Returns 0 or an errno.
 */
int
pid_kqueue_extinfo(proc_t p, struct kqueue *kq, user_addr_t ubuf,
    uint32_t bufsize, int32_t *retval)
{
	struct knote *kn;
	int i;
	int err = 0;
	struct filedesc *fdp = &p->p_fd;
	unsigned long nknotes = 0;
	unsigned long buflen = bufsize / sizeof(struct kevent_extinfo);
	struct kevent_extinfo *kqext = NULL;

	/* arbitrary upper limit to cap kernel memory usage, copyout size, etc. */
	buflen = MIN(buflen, PROC_PIDFDKQUEUE_KNOTES_MAX);

	kqext = (struct kevent_extinfo *)kalloc_data(buflen * sizeof(struct kevent_extinfo), Z_WAITOK | Z_ZERO);
	if (kqext == NULL) {
		err = ENOMEM;
		goto out;
	}

	/* fd-indexed knotes are protected by the proc fd lock. */
	proc_fdlock(p);
	for (i = 0; i < fdp->fd_knlistsize; i++) {
		kn = SLIST_FIRST(&fdp->fd_knlist[i]);
		nknotes = kevent_extinfo_emit(kq, kn, kqext, buflen, nknotes);
	}
	proc_fdunlock(p);

	/* Hashed knotes are protected per-walk by the knote hash lock. */
	if (fdp->fd_knhashmask != 0) {
		for (i = 0; i < (int)fdp->fd_knhashmask + 1; i++) {
			knhash_lock(fdp);
			kn = SLIST_FIRST(&fdp->fd_knhash[i]);
			nknotes = kevent_extinfo_emit(kq, kn, kqext, buflen, nknotes);
			knhash_unlock(fdp);
		}
	}

	assert(bufsize >= sizeof(struct kevent_extinfo) * MIN(buflen, nknotes));
	err = copyout(kqext, ubuf, sizeof(struct kevent_extinfo) * MIN(buflen, nknotes));

out:
	kfree_data(kqext, buflen * sizeof(struct kevent_extinfo));

	if (!err) {
		*retval = (int32_t)MIN(nknotes, PROC_PIDFDKQUEUE_KNOTES_MAX);
	}
	return err;
}
8914
8915 static unsigned int
klist_copy_udata(struct klist * list,uint64_t * buf,unsigned int buflen,unsigned int nknotes)8916 klist_copy_udata(struct klist *list, uint64_t *buf,
8917 unsigned int buflen, unsigned int nknotes)
8918 {
8919 struct knote *kn;
8920 SLIST_FOREACH(kn, list, kn_link) {
8921 if (nknotes < buflen) {
8922 /*
8923 * kevent_register will always set kn_udata atomically
8924 * so that we don't have to take any kqlock here.
8925 */
8926 buf[nknotes] = os_atomic_load_wide(&kn->kn_udata, relaxed);
8927 }
8928 /* we return total number of knotes, which may be more than requested */
8929 nknotes++;
8930 }
8931
8932 return nknotes;
8933 }
8934
/*
 * Gather all user pointers associated with kevents of process `proc`:
 * knote udata values (fd list and knote hash) followed by dynamic kqueue
 * ids.  Fills `buf` up to `bufsize` bytes and returns the total count
 * found, which may exceed what fit in the buffer.
 */
int
kevent_proc_copy_uptrs(void *proc, uint64_t *buf, uint32_t bufsize)
{
	proc_t p = (proc_t)proc;
	struct filedesc *fdp = &p->p_fd;
	unsigned int nuptrs = 0;
	unsigned int buflen = bufsize / sizeof(uint64_t);
	struct kqworkloop *kqwl;

	if (buflen > 0) {
		assert(buf != NULL);
	}

	/* fd-indexed knotes, under the proc fd lock. */
	proc_fdlock(p);
	for (int i = 0; i < fdp->fd_knlistsize; i++) {
		nuptrs = klist_copy_udata(&fdp->fd_knlist[i], buf, buflen, nuptrs);
	}
	proc_fdunlock(p);

	/* Hashed knotes, under the knote hash lock. */
	knhash_lock(fdp);
	if (fdp->fd_knhashmask != 0) {
		for (size_t i = 0; i < fdp->fd_knhashmask + 1; i++) {
			nuptrs = klist_copy_udata(&fdp->fd_knhash[i], buf, buflen, nuptrs);
		}
	}
	knhash_unlock(fdp);

	/* Dynamic kqueue ids, under the kqueue hash lock. */
	kqhash_lock(fdp);
	if (fdp->fd_kqhashmask != 0) {
		for (size_t i = 0; i < fdp->fd_kqhashmask + 1; i++) {
			LIST_FOREACH(kqwl, &fdp->fd_kqhash[i], kqwl_hashlink) {
				if (nuptrs < buflen) {
					buf[nuptrs] = kqwl->kqwl_dynamicid;
				}
				nuptrs++;
			}
		}
	}
	kqhash_unlock(fdp);

	return (int)nuptrs;
}
8977
8978 static void
kevent_set_return_to_kernel_user_tsd(proc_t p,thread_t thread)8979 kevent_set_return_to_kernel_user_tsd(proc_t p, thread_t thread)
8980 {
8981 uint64_t ast_addr;
8982 bool proc_is_64bit = !!(p->p_flag & P_LP64);
8983 size_t user_addr_size = proc_is_64bit ? 8 : 4;
8984 uint32_t ast_flags32 = 0;
8985 uint64_t ast_flags64 = 0;
8986 struct uthread *ut = get_bsdthread_info(thread);
8987
8988 if (ut->uu_kqr_bound != NULL) {
8989 ast_flags64 |= R2K_WORKLOOP_PENDING_EVENTS;
8990 }
8991
8992 if (ast_flags64 == 0) {
8993 return;
8994 }
8995
8996 if (!(p->p_flag & P_LP64)) {
8997 ast_flags32 = (uint32_t)ast_flags64;
8998 assert(ast_flags64 < 0x100000000ull);
8999 }
9000
9001 ast_addr = thread_rettokern_addr(thread);
9002 if (ast_addr == 0) {
9003 return;
9004 }
9005
9006 if (copyout((proc_is_64bit ? (void *)&ast_flags64 : (void *)&ast_flags32),
9007 (user_addr_t)ast_addr,
9008 user_addr_size) != 0) {
9009 printf("pid %d (tid:%llu): copyout of return_to_kernel ast flags failed with "
9010 "ast_addr = %llu\n", proc_getpid(p), thread_tid(current_thread()), ast_addr);
9011 }
9012 }
9013
9014 /*
9015 * Semantics of writing to TSD value:
9016 *
9017 * 1. It is written to by the kernel and cleared by userspace.
9018 * 2. When the userspace code clears the TSD field, it takes responsibility for
9019 * taking action on the quantum expiry action conveyed by kernel.
9020 * 3. The TSD value is always cleared upon entry into userspace and upon exit of
9021 * userspace back to kernel to make sure that it is never leaked across thread
9022 * requests.
9023 */
9024 void
kevent_set_workq_quantum_expiry_user_tsd(proc_t p,thread_t thread,uint64_t flags)9025 kevent_set_workq_quantum_expiry_user_tsd(proc_t p, thread_t thread,
9026 uint64_t flags)
9027 {
9028 uint64_t ast_addr;
9029 bool proc_is_64bit = !!(p->p_flag & P_LP64);
9030 uint32_t ast_flags32 = 0;
9031 uint64_t ast_flags64 = flags;
9032
9033 if (ast_flags64 == 0) {
9034 return;
9035 }
9036
9037 if (!(p->p_flag & P_LP64)) {
9038 ast_flags32 = (uint32_t)ast_flags64;
9039 assert(ast_flags64 < 0x100000000ull);
9040 }
9041
9042 ast_addr = thread_wqquantum_addr(thread);
9043 assert(ast_addr != 0);
9044
9045 if (proc_is_64bit) {
9046 if (copyout_atomic64(ast_flags64, (user_addr_t) ast_addr)) {
9047 #if DEBUG || DEVELOPMENT
9048 printf("pid %d (tid:%llu): copyout of workq quantum ast flags failed with "
9049 "ast_addr = %llu\n", proc_getpid(p), thread_tid(thread), ast_addr);
9050 #endif
9051 }
9052 } else {
9053 if (copyout_atomic32(ast_flags32, (user_addr_t) ast_addr)) {
9054 #if DEBUG || DEVELOPMENT
9055 printf("pid %d (tid:%llu): copyout of workq quantum ast flags failed with "
9056 "ast_addr = %llu\n", proc_getpid(p), thread_tid(thread), ast_addr);
9057 #endif
9058 }
9059 }
9060 }
9061
/*
 * Handle kevent-related AST bits for the current thread on its way back
 * to userspace: redrive pending workqueue thread requests, publish
 * return-to-kernel TSD flags, and re-evaluate workq quantum expiry.
 */
void
kevent_ast(thread_t thread, uint16_t bits)
{
	proc_t p = current_proc();


	if (bits & AST_KEVENT_REDRIVE_THREADREQ) {
		workq_kern_threadreq_redrive(p, WORKQ_THREADREQ_CAN_CREATE_THREADS);
	}
	if (bits & AST_KEVENT_RETURN_TO_KERNEL) {
		kevent_set_return_to_kernel_user_tsd(p, thread);
	}

	if (bits & AST_KEVENT_WORKQ_QUANTUM_EXPIRED) {
		workq_kern_quantum_expiry_reevaluate(p, thread);
	}
}
9079
9080 #if DEVELOPMENT || DEBUG
9081
9082 #define KEVENT_SYSCTL_BOUND_ID 1
9083
9084 static int
9085 kevent_sysctl SYSCTL_HANDLER_ARGS
9086 {
9087 #pragma unused(oidp, arg2)
9088 uintptr_t type = (uintptr_t)arg1;
9089 uint64_t bound_id = 0;
9090
9091 if (type != KEVENT_SYSCTL_BOUND_ID) {
9092 return EINVAL;
9093 }
9094
9095 if (req->newptr) {
9096 return EINVAL;
9097 }
9098
9099 struct uthread *ut = current_uthread();
9100 if (!ut) {
9101 return EFAULT;
9102 }
9103
9104 workq_threadreq_t kqr = ut->uu_kqr_bound;
9105 if (kqr) {
9106 if (kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
9107 bound_id = kqr_kqworkloop(kqr)->kqwl_dynamicid;
9108 } else {
9109 bound_id = -1;
9110 }
9111 }
9112
9113 return sysctl_io_number(req, bound_id, sizeof(bound_id), NULL, NULL);
9114 }
9115
/* kern.kevent: debug-only node exposing per-thread kevent binding info. */
SYSCTL_NODE(_kern, OID_AUTO, kevent, CTLFLAG_RW | CTLFLAG_LOCKED, 0,
    "kevent information");

SYSCTL_PROC(_kern_kevent, OID_AUTO, bound_id,
    CTLTYPE_QUAD | CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_MASKED,
    (void *)KEVENT_SYSCTL_BOUND_ID,
    sizeof(kqueue_id_t), kevent_sysctl, "Q",
    "get the ID of the bound kqueue");
9124
9125 #endif /* DEVELOPMENT || DEBUG */
9126