1 /*
2 * Copyright (c) 2000-2021 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 *
28 */
29 /*-
30 * Copyright (c) 1999,2000,2001 Jonathan Lemon <[email protected]>
31 * All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 *
42 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
43 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
46 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52 * SUCH DAMAGE.
53 */
54 /*
55 * @(#)kern_event.c 1.0 (3/31/2000)
56 */
57 #include <stdint.h>
58 #include <machine/atomic.h>
59
60 #include <sys/param.h>
61 #include <sys/systm.h>
62 #include <sys/filedesc.h>
63 #include <sys/kernel.h>
64 #include <sys/proc_internal.h>
65 #include <sys/kauth.h>
66 #include <sys/malloc.h>
67 #include <sys/unistd.h>
68 #include <sys/file_internal.h>
69 #include <sys/fcntl.h>
70 #include <sys/select.h>
71 #include <sys/queue.h>
72 #include <sys/event.h>
73 #include <sys/eventvar.h>
74 #include <sys/protosw.h>
75 #include <sys/socket.h>
76 #include <sys/socketvar.h>
77 #include <sys/stat.h>
78 #include <sys/syscall.h> // SYS_* constants
79 #include <sys/sysctl.h>
80 #include <sys/uio.h>
81 #include <sys/sysproto.h>
82 #include <sys/user.h>
83 #include <sys/vnode_internal.h>
84 #include <string.h>
85 #include <sys/proc_info.h>
86 #include <sys/codesign.h>
87 #include <sys/pthread_shims.h>
88 #include <sys/kdebug.h>
89 #include <os/base.h>
90 #include <pexpert/pexpert.h>
91
92 #include <kern/thread_group.h>
93 #include <kern/locks.h>
94 #include <kern/clock.h>
95 #include <kern/cpu_data.h>
96 #include <kern/policy_internal.h>
97 #include <kern/thread_call.h>
98 #include <kern/sched_prim.h>
99 #include <kern/waitq.h>
100 #include <kern/zalloc.h>
101 #include <kern/kalloc.h>
102 #include <kern/assert.h>
103 #include <kern/ast.h>
104 #include <kern/thread.h>
105 #include <kern/kcdata.h>
106 #include <kern/work_interval.h>
107
108 #include <pthread/priority_private.h>
109 #include <pthread/workqueue_syscalls.h>
110 #include <pthread/workqueue_internal.h>
111 #include <libkern/libkern.h>
112
113 #include <os/log.h>
114
115 #include "mach/kern_return.h"
116 #include "net/net_str_id.h"
117
118 #if SKYWALK && defined(XNU_TARGET_OS_OSX)
119 #include <skywalk/lib/net_filter_event.h>
120
121 extern bool net_check_compatible_alf(void);
122 #endif /* SKYWALK && XNU_TARGET_OS_OSX */
123
124 #include <mach/task.h>
125 #include <libkern/section_keywords.h>
126
127 #if CONFIG_MEMORYSTATUS
128 #include <sys/kern_memorystatus.h>
129 #endif
130
131 #if DEVELOPMENT || DEBUG
132 #define KEVENT_PANIC_ON_WORKLOOP_OWNERSHIP_LEAK (1U << 0)
133 #define KEVENT_PANIC_ON_NON_ENQUEUED_PROCESS (1U << 1)
134 TUNABLE(uint32_t, kevent_debug_flags, "kevent_debug", 0);
135 #endif
136
137 /* Enable bound thread support for kqworkloop. */
138 static TUNABLE(int, bootarg_thread_bound_kqwl_support_enabled,
139 "enable_thread_bound_kqwl_support", 0);
140 SYSCTL_NODE(_kern, OID_AUTO, kern_event, CTLFLAG_RD | CTLFLAG_LOCKED, 0, NULL);
141 SYSCTL_INT(_kern_kern_event, OID_AUTO, thread_bound_kqwl_support_enabled,
142 CTLFLAG_RD | CTLFLAG_LOCKED,
143 &bootarg_thread_bound_kqwl_support_enabled, 0,
144 "Whether thread bound kqwl support is enabled");
145
146 static LCK_GRP_DECLARE(kq_lck_grp, "kqueue");
147 SECURITY_READ_ONLY_EARLY(vm_packing_params_t) kn_kq_packing_params =
148 VM_PACKING_PARAMS(KNOTE_KQ_PACKED);
149
150 extern mach_port_name_t ipc_entry_name_mask(mach_port_name_t name); /* osfmk/ipc/ipc_entry.h */
151 extern bool cansignal(struct proc *, kauth_cred_t, struct proc *, int); /* bsd/kern/kern_sig.c */
152
153 #define KEV_EVTID(code) BSDDBG_CODE(DBG_BSD_KEVENT, (code))
154
155 static int kqueue_select(struct fileproc *fp, int which, void *wq_link_id,
156 vfs_context_t ctx);
157 static int kqueue_close(struct fileglob *fg, vfs_context_t ctx);
158 static int kqueue_kqfilter(struct fileproc *fp, struct knote *kn,
159 struct kevent_qos_s *kev);
160 static int kqueue_drain(struct fileproc *fp, vfs_context_t ctx);
161
/*
 * File operations vector for kqueue-backed file descriptors.
 *
 * kqueues cannot be read, written or ioctl'd directly: those entry
 * points are wired to the fo_no_* stubs.  Only select, close, drain
 * and kqfilter are meaningful for a kqueue fd.
 */
static const struct fileops kqueueops = {
	.fo_type = DTYPE_KQUEUE,
	.fo_read = fo_no_read,   /* read(2) on a kqueue fd is unsupported */
	.fo_write = fo_no_write, /* write(2) on a kqueue fd is unsupported */
	.fo_ioctl = fo_no_ioctl, /* no ioctls on kqueues */
	.fo_select = kqueue_select,
	.fo_close = kqueue_close,
	.fo_drain = kqueue_drain,
	.fo_kqfilter = kqueue_kqfilter,
};
172
173 static inline int kevent_modern_copyout(struct kevent_qos_s *, user_addr_t *);
174 static int kevent_register_wait_prepare(struct knote *kn, struct kevent_qos_s *kev, int result);
175 static void kevent_register_wait_block(struct turnstile *ts, thread_t handoff_thread,
176 thread_continue_t cont, struct _kevent_register *cont_args) __dead2;
177 static void kevent_register_wait_return(struct _kevent_register *cont_args) __dead2;
178 static void kevent_register_wait_cleanup(struct knote *kn);
179
180 static struct kqtailq *kqueue_get_suppressed_queue(kqueue_t kq, struct knote *kn);
181 static void kqueue_threadreq_initiate(struct kqueue *kq, workq_threadreq_t, kq_index_t qos, int flags);
182
183 static void kqworkq_unbind(proc_t p, workq_threadreq_t);
184 static thread_qos_t kqworkq_unbind_locked(struct kqworkq *kqwq, workq_threadreq_t, thread_t thread);
185 static workq_threadreq_t kqworkq_get_request(struct kqworkq *kqwq, kq_index_t qos_index);
186 static void kqueue_update_iotier_override(kqueue_t kqu);
187
188 static void kqworkloop_unbind(struct kqworkloop *kqwl);
189
190 enum kqwl_unbind_locked_mode {
191 KQWL_OVERRIDE_DROP_IMMEDIATELY,
192 KQWL_OVERRIDE_DROP_DELAYED,
193 };
194 // The soft unbinding of kqworkloop only applies to kqwls configured
195 // with a permanently bound thread.
196 #define KQUEUE_THREADREQ_UNBIND_SOFT 0x1
197 static void kqworkloop_unbind_locked(struct kqworkloop *kqwl, thread_t thread,
198 enum kqwl_unbind_locked_mode how, unsigned int flags);
199 static void kqworkloop_unbind_delayed_override_drop(thread_t thread);
200 static kq_index_t kqworkloop_override(struct kqworkloop *kqwl);
201 static void kqworkloop_set_overcommit(struct kqworkloop *kqwl);
202 static void kqworkloop_bound_thread_park(struct kqworkloop *kqwl, thread_t thread);
203 static void kqworkloop_bound_thread_wakeup(struct kqworkloop *kqwl);
204
/*
 * Operation codes for kqworkloop_update_threads_qos().
 */
enum {
	KQWL_UTQ_NONE,
	/*
	 * The wakeup qos is the qos of QUEUED knotes.
	 *
	 * This QoS is accounted for with the events override in the
	 * kqr_override_index field. It is raised each time a new knote is queued at
	 * a given QoS. The kqwl_wakeup_qos field is a superset of the non empty
	 * knote buckets and is recomputed after each event delivery.
	 */
	KQWL_UTQ_UPDATE_WAKEUP_QOS,
	KQWL_UTQ_RECOMPUTE_WAKEUP_QOS, /* recompute wakeup QoS from scratch */
	KQWL_UTQ_UNBINDING, /* attempt to rebind */
	KQWL_UTQ_PARKING,
	/*
	 * The wakeup override is for suppressed knotes that have fired again at
	 * a higher QoS than the one for which they are suppressed already.
	 * This override is cleared when the knote suppressed list becomes empty.
	 */
	KQWL_UTQ_UPDATE_WAKEUP_OVERRIDE,
	KQWL_UTQ_RESET_WAKEUP_OVERRIDE,
	/*
	 * The QoS is the maximum QoS of an event enqueued on this workloop in
	 * userland. It is copied from the only EVFILT_WORKLOOP knote with
	 * a NOTE_WL_THREAD_REQUEST bit set allowed on this workloop. If there is no
	 * such knote, this QoS is 0.
	 */
	KQWL_UTQ_SET_QOS_INDEX,
	KQWL_UTQ_REDRIVE_EVENTS, /* re-issue a thread request if needed */
};
235 static void kqworkloop_update_threads_qos(struct kqworkloop *kqwl, int op, kq_index_t qos);
236 static int kqworkloop_end_processing(struct kqworkloop *kqwl, int flags, int kevent_flags);
237
238 static struct knote *knote_alloc(void);
239 static void knote_free(struct knote *kn);
240 static int kq_add_knote(struct kqueue *kq, struct knote *kn,
241 struct knote_lock_ctx *knlc, struct proc *p);
242 static struct knote *kq_find_knote_and_kq_lock(struct kqueue *kq,
243 struct kevent_qos_s *kev, bool is_fd, struct proc *p);
244
245 static void knote_activate(kqueue_t kqu, struct knote *kn, int result);
246 static void knote_dequeue(kqueue_t kqu, struct knote *kn);
247
248 static void knote_apply_touch(kqueue_t kqu, struct knote *kn,
249 struct kevent_qos_s *kev, int result);
250 static void knote_suppress(kqueue_t kqu, struct knote *kn);
251 static void knote_unsuppress(kqueue_t kqu, struct knote *kn);
252 static void knote_drop(kqueue_t kqu, struct knote *kn, struct knote_lock_ctx *knlc);
253
254 // both these functions may dequeue the knote and it is up to the caller
255 // to enqueue the knote back
256 static void knote_adjust_qos(struct kqueue *kq, struct knote *kn, int result);
257 static void knote_reset_priority(kqueue_t kqu, struct knote *kn, pthread_priority_t pp);
258
259 static ZONE_DEFINE(knote_zone, "knote zone",
260 sizeof(struct knote), ZC_CACHING | ZC_ZFREE_CLEARMEM);
261 static ZONE_DEFINE(kqfile_zone, "kqueue file zone",
262 sizeof(struct kqfile), ZC_ZFREE_CLEARMEM);
263 static ZONE_DEFINE(kqworkq_zone, "kqueue workq zone",
264 sizeof(struct kqworkq), ZC_ZFREE_CLEARMEM);
265 static ZONE_DEFINE(kqworkloop_zone, "kqueue workloop zone",
266 sizeof(struct kqworkloop), ZC_CACHING | ZC_ZFREE_CLEARMEM);
267
268 #define KN_HASH(val, mask) (((val) ^ (val >> 8)) & (mask))
269
270 static int filt_no_attach(struct knote *kn, struct kevent_qos_s *kev);
271 static void filt_no_detach(struct knote *kn);
272 static int filt_bad_event(struct knote *kn, long hint);
273 static int filt_bad_touch(struct knote *kn, struct kevent_qos_s *kev);
274 static int filt_bad_process(struct knote *kn, struct kevent_qos_s *kev);
275
/*
 * Catch-all filterops used for filter slots that must never be reached
 * (unsupported, compiled-out, or detached filters).  Attach fails, and
 * the event/touch/process hooks panic if ever invoked.
 */
SECURITY_READ_ONLY_EARLY(static struct filterops) bad_filtops = {
	.f_attach = filt_no_attach,
	.f_detach = filt_no_detach,
	.f_event = filt_bad_event,
	.f_touch = filt_bad_touch,
	.f_process = filt_bad_process,
};
283
284 #if CONFIG_MEMORYSTATUS
285 extern const struct filterops memorystatus_filtops;
286 #endif /* CONFIG_MEMORYSTATUS */
287 extern const struct filterops fs_filtops;
288 extern const struct filterops sig_filtops;
289 extern const struct filterops machport_attach_filtops;
290 extern const struct filterops mach_port_filtops;
291 extern const struct filterops mach_port_set_filtops;
292 extern const struct filterops pipe_nfiltops;
293 extern const struct filterops pipe_rfiltops;
294 extern const struct filterops pipe_wfiltops;
295 extern const struct filterops ptsd_kqops;
296 extern const struct filterops ptmx_kqops;
297 extern const struct filterops soread_filtops;
298 extern const struct filterops sowrite_filtops;
299 extern const struct filterops sock_filtops;
300 extern const struct filterops soexcept_filtops;
301 extern const struct filterops spec_filtops;
302 extern const struct filterops bpfread_filtops;
303 extern const struct filterops necp_fd_rfiltops;
304 #if SKYWALK
305 extern const struct filterops skywalk_channel_rfiltops;
306 extern const struct filterops skywalk_channel_wfiltops;
307 extern const struct filterops skywalk_channel_efiltops;
308 #endif /* SKYWALK */
309 extern const struct filterops fsevent_filtops;
310 extern const struct filterops vnode_filtops;
311 extern const struct filterops tty_filtops;
312
313 __security_const_early static struct filterops file_filtops;
314 __security_const_early static struct filterops kqread_filtops;
315 __security_const_early static struct filterops proc_filtops;
316 __security_const_early static struct filterops timer_filtops;
317 __security_const_early static struct filterops user_filtops;
318 __security_const_early static struct filterops workloop_filtops;
319 #if CONFIG_EXCLAVES
320 extern const struct filterops exclaves_notification_filtops;
321 #endif /* CONFIG_EXCLAVES */
322 extern const struct filterops aio_filtops;
323
324 /*
325 *
326 * Rules for adding new filters to the system:
327 * Public filters:
328 * - Add a new "EVFILT_" option value to bsd/sys/event.h (typically a negative value)
329 * in the exported section of the header
330 * - Update the EVFILT_SYSCOUNT value to reflect the new addition
331 * - Add a filterops to the sysfilt_ops array. Public filters should be added at the end
332 * of the Public Filters section in the array.
333 * Private filters:
334 * - Add a new "EVFILT_" value to bsd/sys/event_private.h (typically a positive value)
335 * - Update the EVFILTID_MAX value to reflect the new addition
336 * - Add a filterops to the sysfilt_ops. Private filters should be added at the end of
337 * the Private filters section of the array.
338 */
static_assert(EVFILTID_MAX < UINT8_MAX, "kn_filtid expects this to be true");
/*
 * Table mapping filter identifiers to their filterops.
 *
 * Public EVFILT_* values are negative, so they are indexed by their
 * one's complement (~EVFILT_READ == 0, ~EVFILT_WRITE == 1, ...).
 * Private EVFILTID_* values are positive and index directly.
 */
static const struct filterops * const sysfilt_ops[EVFILTID_MAX] = {
	/* Public Filters */
	[~EVFILT_READ] = &file_filtops,
	[~EVFILT_WRITE] = &file_filtops,
	[~EVFILT_AIO] = &aio_filtops,
	[~EVFILT_VNODE] = &file_filtops,
	[~EVFILT_PROC] = &proc_filtops,
	[~EVFILT_SIGNAL] = &sig_filtops,
	[~EVFILT_TIMER] = &timer_filtops,
	[~EVFILT_MACHPORT] = &machport_attach_filtops,
	[~EVFILT_FS] = &fs_filtops,
	[~EVFILT_USER] = &user_filtops,
	[~EVFILT_UNUSED_11] = &bad_filtops,
	[~EVFILT_VM] = &bad_filtops,
	[~EVFILT_SOCK] = &file_filtops,
#if CONFIG_MEMORYSTATUS
	[~EVFILT_MEMORYSTATUS] = &memorystatus_filtops,
#else
	[~EVFILT_MEMORYSTATUS] = &bad_filtops,
#endif
	[~EVFILT_EXCEPT] = &file_filtops,
#if SKYWALK
	[~EVFILT_NW_CHANNEL] = &file_filtops,
#else /* !SKYWALK */
	[~EVFILT_NW_CHANNEL] = &bad_filtops,
#endif /* !SKYWALK */
	[~EVFILT_WORKLOOP] = &workloop_filtops,
#if CONFIG_EXCLAVES
	[~EVFILT_EXCLAVES_NOTIFICATION] = &exclaves_notification_filtops,
#else /* !CONFIG_EXCLAVES */
	[~EVFILT_EXCLAVES_NOTIFICATION] = &bad_filtops,
#endif /* CONFIG_EXCLAVES*/

	/* Private filters */
	[EVFILTID_KQREAD] = &kqread_filtops,
	[EVFILTID_PIPE_N] = &pipe_nfiltops,
	[EVFILTID_PIPE_R] = &pipe_rfiltops,
	[EVFILTID_PIPE_W] = &pipe_wfiltops,
	[EVFILTID_PTSD] = &ptsd_kqops,
	[EVFILTID_SOREAD] = &soread_filtops,
	[EVFILTID_SOWRITE] = &sowrite_filtops,
	[EVFILTID_SCK] = &sock_filtops,
	[EVFILTID_SOEXCEPT] = &soexcept_filtops,
	[EVFILTID_SPEC] = &spec_filtops,
	[EVFILTID_BPFREAD] = &bpfread_filtops,
	[EVFILTID_NECP_FD] = &necp_fd_rfiltops,
#if SKYWALK
	[EVFILTID_SKYWALK_CHANNEL_W] = &skywalk_channel_wfiltops,
	[EVFILTID_SKYWALK_CHANNEL_R] = &skywalk_channel_rfiltops,
	[EVFILTID_SKYWALK_CHANNEL_E] = &skywalk_channel_efiltops,
#else /* !SKYWALK */
	[EVFILTID_SKYWALK_CHANNEL_W] = &bad_filtops,
	[EVFILTID_SKYWALK_CHANNEL_R] = &bad_filtops,
	[EVFILTID_SKYWALK_CHANNEL_E] = &bad_filtops,
#endif /* !SKYWALK */
	[EVFILTID_FSEVENT] = &fsevent_filtops,
	[EVFILTID_VN] = &vnode_filtops,
	[EVFILTID_TTY] = &tty_filtops,
	[EVFILTID_PTMX] = &ptmx_kqops,
	[EVFILTID_MACH_PORT] = &mach_port_filtops,
	[EVFILTID_MACH_PORT_SET] = &mach_port_set_filtops,

	/* fake filter for detached knotes, keep last */
	[EVFILTID_DETACHED] = &bad_filtops,
};
405
/*
 * Returns whether the thread request currently has a thread bound to it
 * (tr_state == WORKQ_TR_STATE_BOUND).
 */
static inline bool
kqr_thread_bound(workq_threadreq_t kqr)
{
	return kqr->tr_state == WORKQ_TR_STATE_BOUND;
}
411
/*
 * Returns whether the thread request is bound AND the binding is permanent
 * (WORKQ_TR_FLAG_PERMANENT_BIND set), i.e. a thread-bound kqworkloop.
 */
static inline bool
kqr_thread_permanently_bound(workq_threadreq_t kqr)
{
	return kqr_thread_bound(kqr) && (kqr->tr_flags & WORKQ_TR_FLAG_PERMANENT_BIND);
}
417
418 static inline bool
kqr_thread_requested_pending(workq_threadreq_t kqr)419 kqr_thread_requested_pending(workq_threadreq_t kqr)
420 {
421 workq_tr_state_t tr_state = kqr->tr_state;
422 return tr_state > WORKQ_TR_STATE_IDLE && tr_state < WORKQ_TR_STATE_BOUND;
423 }
424
/*
 * Returns whether a thread request is outstanding in any form
 * (anything other than WORKQ_TR_STATE_IDLE, including already bound).
 */
static inline bool
kqr_thread_requested(workq_threadreq_t kqr)
{
	return kqr->tr_state != WORKQ_TR_STATE_IDLE;
}
430
/*
 * Returns the thread bound to this thread request.
 *
 * The caller must know the request is bound (asserted); use kqr_thread()
 * when the binding state is uncertain.
 */
static inline thread_t
kqr_thread_fast(workq_threadreq_t kqr)
{
	assert(kqr_thread_bound(kqr));
	return kqr->tr_thread;
}
437
438 static inline thread_t
kqr_thread(workq_threadreq_t kqr)439 kqr_thread(workq_threadreq_t kqr)
440 {
441 return kqr_thread_bound(kqr) ? kqr->tr_thread : THREAD_NULL;
442 }
443
444 static inline struct kqworkloop *
kqr_kqworkloop(workq_threadreq_t kqr)445 kqr_kqworkloop(workq_threadreq_t kqr)
446 {
447 if (kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
448 return __container_of(kqr, struct kqworkloop, kqwl_request);
449 }
450 return NULL;
451 }
452
/*
 * Resolves the kqueue that owns a thread request.
 *
 * Workloop requests are embedded in their kqworkloop (kqwl_request).
 * Otherwise the request must be one of the KQWQ_NBUCKETS per-QoS
 * buckets of the process's workq kqueue (p_fd.fd_wqkqueue), which the
 * assert verifies by address range.
 */
static inline kqueue_t
kqr_kqueue(proc_t p, workq_threadreq_t kqr)
{
	kqueue_t kqu;
	if (kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
		kqu.kqwl = kqr_kqworkloop(kqr);
	} else {
		kqu.kqwq = p->p_fd.fd_wqkqueue;
		assert(kqr >= kqu.kqwq->kqwq_request &&
		    kqr < kqu.kqwq->kqwq_request + KQWQ_NBUCKETS);
	}
	return kqu;
}
466
#if CONFIG_PREADOPT_TG
/*
 * Returns the pre-adopted thread group of the workloop owning this request
 * (relaxed atomic load), or NULL for non-workloop requests.
 *
 * There are no guarantees about which locks are held when this is called.
 */
inline thread_group_qos_t
kqr_preadopt_thread_group(workq_threadreq_t req)
{
	struct kqworkloop *kqwl = kqr_kqworkloop(req);
	return kqwl ? os_atomic_load(&kqwl->kqwl_preadopt_tg, relaxed) : NULL;
}

/*
 * Returns the address of the pre-adopted thread group atomic slot of the
 * owning workloop, or NULL for non-workloop requests.
 *
 * There are no guarantees about which locks are held when this is called.
 */
inline _Atomic(thread_group_qos_t) *
kqr_preadopt_thread_group_addr(workq_threadreq_t req)
{
	struct kqworkloop *kqwl = kqr_kqworkloop(req);
	return kqwl ? (&kqwl->kqwl_preadopt_tg) : NULL;
}
#endif
484
485 /*
486 * kqueue/note lock implementations
487 *
488 * The kqueue lock guards the kq state, the state of its queues,
489 * and the kqueue-aware status and locks of individual knotes.
490 *
491 * The kqueue workq lock is used to protect state guarding the
492 * interaction of the kqueue with the workq. This state cannot
493 * be guarded by the kq lock - as it needs to be taken when we
494 * already have the waitq set lock held (during the waitq hook
495 * callback). It might be better to use the waitq lock itself
496 * for this, but the IRQ requirements make that difficult).
497 *
498 * Knote flags, filter flags, and associated data are protected
499 * by the underlying object lock - and are only ever looked at
500 * by calling the filter to get a [consistent] snapshot of that
501 * data.
502 */
503
/* Acquires the kqueue spinlock (guards kq state, queues, knote status). */
static inline void
kqlock(kqueue_t kqu)
{
	lck_spin_lock(&kqu.kq->kq_lock);
}
509
/* Asserts (on debug builds) that the kqueue spinlock is held by the caller. */
static inline void
kqlock_held(__assert_only kqueue_t kqu)
{
	LCK_SPIN_ASSERT(&kqu.kq->kq_lock, LCK_ASSERT_OWNED);
}
515
/* Releases the kqueue spinlock. */
static inline void
kqunlock(kqueue_t kqu)
{
	lck_spin_unlock(&kqu.kq->kq_lock);
}
521
/* Acquires the per-filedesc knote hash table mutex. */
static inline void
knhash_lock(struct filedesc *fdp)
{
	lck_mtx_lock(&fdp->fd_knhashlock);
}
527
/* Releases the per-filedesc knote hash table mutex. */
static inline void
knhash_unlock(struct filedesc *fdp)
{
	lck_mtx_unlock(&fdp->fd_knhashlock);
}
533
/*
 * Wait event for knote locks: the address of kn->kn_hook uniquely
 * identifies the knote-lock wait channel for this knote.
 */
static inline event_t
knote_lock_wev(struct knote *kn)
{
	return (event_t)(&kn->kn_hook);
}
540
/*
 * Wait event for kevent_register_wait_*: the knote address itself,
 * widened to a 64-bit event.
 */
static inline event64_t
knote_filt_wev64(struct knote *kn)
{
	/* kdp_workloop_sync_wait_find_owner knows about this */
	return CAST_EVENT64_T(kn);
}
548
/*
 * Wait event for knote_post()/knote_drop() synchronization: the address
 * of the knote's embedded kevent.
 */
static inline event_t
knote_post_wev(struct knote *kn)
{
	return &kn->kn_kevent;
}
555
556 /*!
557 * @function knote_has_qos
558 *
559 * @brief
560 * Whether the knote has a regular QoS.
561 *
562 * @discussion
563 * kn_qos_override is:
564 * - 0 on kqfiles
565 * - THREAD_QOS_LAST for special buckets (manager)
566 *
567 * Other values mean the knote participates to QoS propagation.
568 */
569 static inline bool
knote_has_qos(struct knote * kn)570 knote_has_qos(struct knote *kn)
571 {
572 return kn->kn_qos_override > 0 && kn->kn_qos_override < THREAD_QOS_LAST;
573 }
574
575 #pragma mark knote locks
576
577 /*
578 * Enum used by the knote_lock_* functions.
579 *
580 * KNOTE_KQ_LOCK_ALWAYS
581 * The function will always return with the kq lock held.
582 *
583 * KNOTE_KQ_LOCK_ON_SUCCESS
584 * The function will return with the kq lock held if it was successful
585 * (knote_lock() is the only function that can fail).
586 *
587 * KNOTE_KQ_LOCK_ON_FAILURE
588 * The function will return with the kq lock held if it was unsuccessful
589 * (knote_lock() is the only function that can fail).
590 *
591 * KNOTE_KQ_UNLOCK:
592 * The function returns with the kq unlocked.
593 */
enum kqlocking {
	KNOTE_KQ_LOCK_ALWAYS,      /* return with kq lock held, success or not */
	KNOTE_KQ_LOCK_ON_SUCCESS,  /* kq lock held only if the call succeeded */
	KNOTE_KQ_LOCK_ON_FAILURE,  /* kq lock held only if the call failed */
	KNOTE_KQ_UNLOCK,           /* always return with kq unlocked */
};
600
601 static struct knote_lock_ctx *
knote_lock_ctx_find(kqueue_t kqu,struct knote * kn)602 knote_lock_ctx_find(kqueue_t kqu, struct knote *kn)
603 {
604 struct knote_lock_ctx *ctx;
605 LIST_FOREACH(ctx, &kqu.kq->kq_knlocks, knlc_link) {
606 if (ctx->knlc_knote == kn) {
607 return ctx;
608 }
609 }
610 panic("knote lock context not found: %p", kn);
611 __builtin_trap();
612 }
613
/*
 * Slowpath of knote_lock(): the knote is already KN_LOCKED by another
 * thread, so register as a waiter and sleep until the owner hands the
 * lock over (knote_unlock) or aborts all waiters (knote_unlock_cancel).
 *
 * Sleeps with push-priority inheritance on the current owner's thread.
 *
 * Returns false if woken with THREAD_RESTART (the knote is being
 * dropped), true if the lock was handed to us.  kq lock state on return
 * follows the kqlocking mode.
 */
__attribute__((noinline))
static bool __result_use_check
knote_lock_slow(kqueue_t kqu, struct knote *kn,
    struct knote_lock_ctx *knlc, int kqlocking)
{
	struct knote_lock_ctx *owner_lc;
	struct uthread *uth = current_uthread();
	wait_result_t wr;

	kqlock_held(kqu);

	owner_lc = knote_lock_ctx_find(kqu, kn);
#if MACH_ASSERT
	knlc->knlc_state = KNOTE_LOCK_CTX_WAITING;
#endif
	owner_lc->knlc_waiters++;

	/*
	 * Make our lock context visible to knote_unlock()
	 */
	uth->uu_knlock = knlc;

	/* sleep dropping the kq lock, pushing on the current owner's thread */
	wr = lck_spin_sleep_with_inheritor(&kqu.kq->kq_lock, LCK_SLEEP_UNLOCK,
	    knote_lock_wev(kn), owner_lc->knlc_thread,
	    THREAD_UNINT | THREAD_WAIT_NOREPORT, TIMEOUT_WAIT_FOREVER);

	if (wr == THREAD_RESTART) {
		/*
		 * We haven't been woken up by knote_unlock() but knote_unlock_cancel.
		 * We need to cleanup the state since no one did.
		 */
		uth->uu_knlock = NULL;
#if MACH_ASSERT
		assert(knlc->knlc_state == KNOTE_LOCK_CTX_WAITING);
		knlc->knlc_state = KNOTE_LOCK_CTX_UNLOCKED;
#endif

		if (kqlocking == KNOTE_KQ_LOCK_ALWAYS ||
		    kqlocking == KNOTE_KQ_LOCK_ON_FAILURE) {
			kqlock(kqu);
		}
		return false;
	} else {
		if (kqlocking == KNOTE_KQ_LOCK_ALWAYS ||
		    kqlocking == KNOTE_KQ_LOCK_ON_SUCCESS) {
			kqlock(kqu);
			/*
			 * This state is set under the lock so we can't
			 * really assert this unless we hold the lock.
			 */
			assert(knlc->knlc_state == KNOTE_LOCK_CTX_LOCKED);
		}
		return true;
	}
}
670
/*
 * Attempts to take the "knote" lock.
 *
 * Called with the kqueue lock held.
 *
 * Fast path: if the knote is unlocked, publish our context on the
 * kqueue's kq_knlocks list and set KN_LOCKED.  If it is already locked,
 * defer to knote_lock_slow() to wait for handoff.
 *
 * Returns true if the knote lock is acquired, false if it has been dropped.
 * kq lock state on return follows the kqlocking mode.
 */
static bool __result_use_check
knote_lock(kqueue_t kqu, struct knote *kn, struct knote_lock_ctx *knlc,
    enum kqlocking kqlocking)
{
	kqlock_held(kqu);

#if MACH_ASSERT
	assert(knlc->knlc_state == KNOTE_LOCK_CTX_UNLOCKED);
#endif
	knlc->knlc_knote = kn;
	knlc->knlc_thread = current_thread();
	knlc->knlc_waiters = 0;

	/* contended: someone else holds the knote lock */
	if (__improbable(kn->kn_status & KN_LOCKED)) {
		return knote_lock_slow(kqu, kn, knlc, kqlocking);
	}

	/*
	 * When the knote will be dropped, the knote lock is taken before
	 * KN_DROPPING is set, and then the knote will be removed from any
	 * hash table that references it before the lock is canceled.
	 */
	assert((kn->kn_status & KN_DROPPING) == 0);
	LIST_INSERT_HEAD(&kqu.kq->kq_knlocks, knlc, knlc_link);
	kn->kn_status |= KN_LOCKED;
#if MACH_ASSERT
	knlc->knlc_state = KNOTE_LOCK_CTX_LOCKED;
#endif

	if (kqlocking == KNOTE_KQ_UNLOCK ||
	    kqlocking == KNOTE_KQ_LOCK_ON_FAILURE) {
		kqunlock(kqu);
	}
	return true;
}
713
/*
 * Unlocks a knote successfully locked with knote_lock().
 *
 * Called with the kqueue lock held.
 *
 * If waiters exist, the lock is handed directly to one of them: the
 * woken thread's lock context (published in uthread::uu_knlock by
 * knote_lock_slow()) becomes the new owner on kq_knlocks and KN_LOCKED
 * stays set.  Otherwise KN_LOCKED is cleared.
 *
 * Returns with the kqueue lock held according to KNOTE_KQ_* mode.
 */
static void
knote_unlock(kqueue_t kqu, struct knote *kn,
    struct knote_lock_ctx *knlc, enum kqlocking kqlocking)
{
	kqlock_held(kqu);

	assert(knlc->knlc_knote == kn);
	assert(kn->kn_status & KN_LOCKED);
	assert(knlc->knlc_state == KNOTE_LOCK_CTX_LOCKED);

	LIST_REMOVE(knlc, knlc_link);

	if (knlc->knlc_waiters) {
		thread_t thread = THREAD_NULL;

		/* wake one waiter; `thread` receives a +1 ref on the woken thread */
		wakeup_one_with_inheritor(knote_lock_wev(kn), THREAD_AWAKENED,
		    LCK_WAKE_DEFAULT, &thread);

		/*
		 * knote_lock_slow() publishes the lock context of waiters
		 * in uthread::uu_knlock.
		 *
		 * Reach out and make this context the new owner.
		 */
		struct uthread *ut = get_bsdthread_info(thread);
		struct knote_lock_ctx *next_owner_lc = ut->uu_knlock;

		assert(next_owner_lc->knlc_knote == kn);
		/* the new owner inherits the remaining waiters (minus itself) */
		next_owner_lc->knlc_waiters = knlc->knlc_waiters - 1;
		LIST_INSERT_HEAD(&kqu.kq->kq_knlocks, next_owner_lc, knlc_link);
#if MACH_ASSERT
		next_owner_lc->knlc_state = KNOTE_LOCK_CTX_LOCKED;
#endif
		ut->uu_knlock = NULL;
		thread_deallocate_safe(thread);
	} else {
		kn->kn_status &= ~KN_LOCKED;
	}

	if ((kn->kn_status & KN_MERGE_QOS) && !(kn->kn_status & KN_POSTING)) {
		/*
		 * No f_event() in flight anymore, we can leave QoS "Merge" mode
		 *
		 * See knote_adjust_qos()
		 */
		kn->kn_status &= ~KN_MERGE_QOS;
	}
	if (kqlocking == KNOTE_KQ_UNLOCK) {
		kqunlock(kqu);
	}
#if MACH_ASSERT
	knlc->knlc_state = KNOTE_LOCK_CTX_UNLOCKED;
#endif
}
775
/*
 * Aborts all waiters for a knote lock, and unlock the knote.
 *
 * Used when the knote is being dropped (KN_DROPPING must be set):
 * waiters are woken with THREAD_RESTART so knote_lock_slow() reports
 * failure instead of handing the lock over.
 *
 * Called with the kqueue lock held.
 *
 * Returns with the kqueue unlocked.
 */
static void
knote_unlock_cancel(struct kqueue *kq, struct knote *kn,
    struct knote_lock_ctx *knlc)
{
	kqlock_held(kq);

	assert(knlc->knlc_knote == kn);
	assert(kn->kn_status & KN_LOCKED);
	assert(kn->kn_status & KN_DROPPING);

	LIST_REMOVE(knlc, knlc_link);
	kn->kn_status &= ~KN_LOCKED;
	kqunlock(kq);

	if (knlc->knlc_waiters) {
		/* THREAD_RESTART tells waiters to clean up their own state */
		wakeup_all_with_inheritor(knote_lock_wev(kn), THREAD_RESTART);
	}
#if MACH_ASSERT
	knlc->knlc_state = KNOTE_LOCK_CTX_UNLOCKED;
#endif
}
804
/*
 * Call the f_event hook of a given filter.
 *
 * Takes a use count (KN_POSTING) to protect against concurrent drops.
 * Called with the object lock held.
 *
 * The kq lock is dropped around the f_event() call itself; KN_POSTING
 * marks the knote as having an f_event in flight so knote_drop()/
 * knote_fdclose() can wait for it (see knote_wait_for_post()).
 */
static void
knote_post(struct knote *kn, long hint)
{
	struct kqueue *kq = knote_get_kq(kn);
	int dropping, result;

	kqlock(kq);

	/* knote already being dropped or object vanished: nothing to post */
	if (__improbable(kn->kn_status & (KN_DROPPING | KN_VANISHED))) {
		return kqunlock(kq);
	}

	/* f_event() calls are not allowed to race each other on one knote */
	if (__improbable(kn->kn_status & KN_POSTING)) {
		panic("KNOTE() called concurrently on knote %p", kn);
	}

	kn->kn_status |= KN_POSTING;

	kqunlock(kq);
	result = filter_call(knote_fops(kn), f_event(kn, hint));
	kqlock(kq);

	/* Someone dropped the knote/the monitored object vanished while we
	 * were in f_event, swallow the side effects of the post.
	 */
	dropping = (kn->kn_status & (KN_DROPPING | KN_VANISHED));

	if (!dropping && (result & FILTER_ADJUST_EVENT_IOTIER_BIT)) {
		kqueue_update_iotier_override(kq);
	}

	if (!dropping && (result & FILTER_ACTIVE)) {
		knote_activate(kq, kn, result);
	}

	if ((kn->kn_status & KN_LOCKED) == 0) {
		/*
		 * There's no other f_* call in flight, we can leave QoS "Merge" mode.
		 *
		 * See knote_adjust_qos()
		 */
		kn->kn_status &= ~(KN_POSTING | KN_MERGE_QOS);
	} else {
		kn->kn_status &= ~KN_POSTING;
	}

	/* wake any thread in knote_wait_for_post() */
	if (__improbable(dropping)) {
		thread_wakeup(knote_post_wev(kn));
	}

	kqunlock(kq);
}
863
/*
 * Called by knote_drop() and knote_fdclose() to wait for the last f_event()
 * caller to be done.
 *
 * Sleeps on knote_post_wev() while KN_POSTING is set; knote_post() wakes
 * this channel when it observes KN_DROPPING/KN_VANISHED.
 *
 * - kq locked at entry
 * - kq unlocked at exit
 */
static void
knote_wait_for_post(struct kqueue *kq, struct knote *kn)
{
	kqlock_held(kq);

	assert(kn->kn_status & (KN_DROPPING | KN_VANISHED));

	if (kn->kn_status & KN_POSTING) {
		lck_spin_sleep(&kq->kq_lock, LCK_SLEEP_UNLOCK, knote_post_wev(kn),
		    THREAD_UNINT | THREAD_WAIT_NOREPORT);
	} else {
		kqunlock(kq);
	}
}
885
886 #pragma mark knote helpers for filters
887
/*
 * Reads the filter-private hook pointer stored in kn->kn_hook.
 *
 * On ptrauth targets the stored value is signed with a discriminator
 * blending the storage address with the knote's filter identity
 * (kn_filter | kn_filtid << 8), so a hook can neither be forged nor
 * replayed across knotes; authenticate it before returning.
 */
OS_ALWAYS_INLINE
void *
knote_kn_hook_get_raw(struct knote *kn)
{
	uintptr_t *addr = &kn->kn_hook;

	void *hook = (void *) *addr;
#if __has_feature(ptrauth_calls)
	if (hook) {
		uint16_t blend = kn->kn_filter;
		blend |= (kn->kn_filtid << 8);
		blend ^= OS_PTRAUTH_DISCRIMINATOR("kn.kn_hook");

		hook = ptrauth_auth_data(hook, ptrauth_key_process_independent_data,
		    ptrauth_blend_discriminator(addr, blend));
	}
#endif

	return hook;
}
908
/*
 * Store an opaque filter hook pointer into kn->kn_hook.
 *
 * With pointer authentication enabled, the pointer is signed with a
 * discriminator blended from the slot address and the knote's filter and
 * filtid; knote_kn_hook_get_raw() authenticates with the same blend.
 * NULL is stored unsigned so that a zeroed knote stays valid.
 */
OS_ALWAYS_INLINE void
knote_kn_hook_set_raw(struct knote *kn, void *kn_hook)
{
	uintptr_t *addr = &kn->kn_hook;
#if __has_feature(ptrauth_calls)
	if (kn_hook) {
		/* same blend recomputed by knote_kn_hook_get_raw() */
		uint16_t blend = kn->kn_filter;
		blend |= (kn->kn_filtid << 8);
		blend ^= OS_PTRAUTH_DISCRIMINATOR("kn.kn_hook");

		kn_hook = ptrauth_sign_unauthenticated(kn_hook,
		    ptrauth_key_process_independent_data,
		    ptrauth_blend_discriminator(addr, blend));
	}
#endif
	*addr = (uintptr_t) kn_hook;
}
926
927 OS_ALWAYS_INLINE
928 void
knote_set_error(struct knote * kn,int error)929 knote_set_error(struct knote *kn, int error)
930 {
931 kn->kn_flags |= EV_ERROR;
932 kn->kn_sdata = error;
933 }
934
935 OS_ALWAYS_INLINE
936 int64_t
knote_low_watermark(const struct knote * kn)937 knote_low_watermark(const struct knote *kn)
938 {
939 return (kn->kn_sfflags & NOTE_LOWAT) ? kn->kn_sdata : 1;
940 }
941
/*!
 * @function knote_fill_kevent_with_sdata
 *
 * @brief
 * Fills in a kevent from the current content of a knote.
 *
 * @discussion
 * This is meant to be called from filter's f_process hooks.
 * The kevent data is filled with kn->kn_sdata.
 *
 * kn->kn_fflags is cleared if kn->kn_flags has EV_CLEAR set.
 *
 * Using knote_fill_kevent is typically preferred.
 */
OS_ALWAYS_INLINE
void
knote_fill_kevent_with_sdata(struct knote *kn, struct kevent_qos_s *kev)
{
#define knote_assert_aliases(name1, offs1, name2) \
	static_assert(offsetof(struct kevent_qos_s, name1) + offs1 == \
	    offsetof(struct kevent_internal_s, name2), \
	    "kevent_qos_s::" #name1 " and kevent_internal_s::" #name2 "need to alias")
	/*
	 * All the code makes assumptions on these aliasing,
	 * so make sure we fail the build if we ever ever ever break them.
	 */
	knote_assert_aliases(ident, 0, kei_ident);
#ifdef __LITTLE_ENDIAN__
	knote_assert_aliases(filter, 0, kei_filter);  // non trivial overlap
	knote_assert_aliases(filter, 1, kei_filtid);  // non trivial overlap
#else
	knote_assert_aliases(filter, 0, kei_filtid);  // non trivial overlap
	knote_assert_aliases(filter, 1, kei_filter);  // non trivial overlap
#endif
	knote_assert_aliases(flags, 0, kei_flags);
	knote_assert_aliases(qos, 0, kei_qos);
	knote_assert_aliases(udata, 0, kei_udata);
	knote_assert_aliases(fflags, 0, kei_fflags);
	knote_assert_aliases(xflags, 0, kei_sfflags);  // non trivial overlap
	knote_assert_aliases(data, 0, kei_sdata);      // non trivial overlap
	knote_assert_aliases(ext, 0, kei_ext);
#undef knote_assert_aliases

	/*
	 * Fix the differences between kevent_qos_s and kevent_internal_s:
	 * - xflags is where kn_sfflags lives, we need to zero it
	 * - fixup the high bits of `filter` where kn_filtid lives
	 */
	*kev = *(struct kevent_qos_s *)&kn->kn_kevent;
	kev->xflags = 0;
	kev->filter |= 0xff00;
	if (kn->kn_flags & EV_CLEAR) {
		/* EV_CLEAR consumes the accumulated fflags on delivery */
		kn->kn_fflags = 0;
	}
}
997
998 /*!
999 * @function knote_fill_kevent
1000 *
1001 * @brief
1002 * Fills in a kevent from the current content of a knote.
1003 *
1004 * @discussion
1005 * This is meant to be called from filter's f_process hooks.
1006 * The kevent data is filled with the passed in data.
1007 *
1008 * kn->kn_fflags is cleared if kn->kn_flags has EV_CLEAR set.
1009 */
1010 OS_ALWAYS_INLINE
1011 void
knote_fill_kevent(struct knote * kn,struct kevent_qos_s * kev,int64_t data)1012 knote_fill_kevent(struct knote *kn, struct kevent_qos_s *kev, int64_t data)
1013 {
1014 knote_fill_kevent_with_sdata(kn, kev);
1015 kev->filter = kn->kn_filter;
1016 kev->data = data;
1017 }
1018
1019
1020 #pragma mark file_filtops
1021
/*
 * Generic attach for fd-backed knotes: delegate to the file object's own
 * kqfilter handler, which installs the filter ops appropriate for the
 * file's type.
 */
static int
filt_fileattach(struct knote *kn, struct kevent_qos_s *kev)
{
	return fo_kqfilter(kn->kn_fp, kn, kev);
}
1027
/* Bootstrap ops for fd-based knotes; f_attach re-routes to the real filter. */
SECURITY_READ_ONLY_EARLY(static struct filterops) file_filtops = {
	.f_isfd = 1,
	.f_attach = filt_fileattach,
};
1032
1033 #pragma mark kqread_filtops
1034
1035 #define f_flag fp_glob->fg_flag
1036 #define f_ops fp_glob->fg_ops
1037 #define f_lflags fp_glob->fg_lflags
1038
1039 static void
filt_kqdetach(struct knote * kn)1040 filt_kqdetach(struct knote *kn)
1041 {
1042 struct kqfile *kqf = (struct kqfile *)fp_get_data(kn->kn_fp);
1043 struct kqueue *kq = &kqf->kqf_kqueue;
1044
1045 kqlock(kq);
1046 KNOTE_DETACH(&kqf->kqf_sel.si_note, kn);
1047 kqunlock(kq);
1048 }
1049
1050 static int
filt_kqueue(struct knote * kn,__unused long hint)1051 filt_kqueue(struct knote *kn, __unused long hint)
1052 {
1053 struct kqueue *kq = (struct kqueue *)fp_get_data(kn->kn_fp);
1054
1055 return kq->kq_count > 0;
1056 }
1057
1058 static int
filt_kqtouch(struct knote * kn,struct kevent_qos_s * kev)1059 filt_kqtouch(struct knote *kn, struct kevent_qos_s *kev)
1060 {
1061 #pragma unused(kev)
1062 struct kqueue *kq = (struct kqueue *)fp_get_data(kn->kn_fp);
1063 int res;
1064
1065 kqlock(kq);
1066 res = (kq->kq_count > 0);
1067 kqunlock(kq);
1068
1069 return res;
1070 }
1071
1072 static int
filt_kqprocess(struct knote * kn,struct kevent_qos_s * kev)1073 filt_kqprocess(struct knote *kn, struct kevent_qos_s *kev)
1074 {
1075 struct kqueue *kq = (struct kqueue *)fp_get_data(kn->kn_fp);
1076 int res = 0;
1077
1078 kqlock(kq);
1079 if (kq->kq_count) {
1080 knote_fill_kevent(kn, kev, kq->kq_count);
1081 res = 1;
1082 }
1083 kqunlock(kq);
1084
1085 return res;
1086 }
1087
/* EVFILT_READ on a kqueue fd: fires while the watched kqueue has pending events */
SECURITY_READ_ONLY_EARLY(static struct filterops) kqread_filtops = {
	.f_isfd = 1,
	.f_detach = filt_kqdetach,
	.f_event = filt_kqueue,
	.f_touch = filt_kqtouch,
	.f_process = filt_kqprocess,
};
1095
1096 #pragma mark proc_filtops
1097
/*
 * Attach an EVFILT_PROC knote to the process named by kn_id.
 *
 * Fails with ENOTSUP for the unsupported BSD fork-tracking flags, ESRCH if
 * the pid does not exist, and EACCES if exit status was requested by a
 * process that is not entitled to it.
 */
static int
filt_procattach(struct knote *kn, __unused struct kevent_qos_s *kev)
{
	struct proc *p;

	/* the hint must be able to carry any pid in its NOTE_PDATAMASK bits */
	assert(PID_MAX < NOTE_PDATAMASK);

	if ((kn->kn_sfflags & (NOTE_TRACK | NOTE_TRACKERR | NOTE_CHILD)) != 0) {
		knote_set_error(kn, ENOTSUP);
		return 0;
	}

	p = proc_find((int)kn->kn_id);
	if (p == NULL) {
		knote_set_error(kn, ESRCH);
		return 0;
	}

	const uint32_t NoteExitStatusBits = NOTE_EXIT | NOTE_EXITSTATUS;

	/*
	 * Requesting the exit status is as privileged as wait4(): allow it
	 * only for the parent, the ptrace parent-in-waiting, or a process
	 * that could SIGKILL the target.
	 */
	if ((kn->kn_sfflags & NoteExitStatusBits) == NoteExitStatusBits) {
		do {
			pid_t selfpid = proc_selfpid();

			if (p->p_ppid == selfpid) {
				break;  /* parent => ok */
			}
			if ((p->p_lflag & P_LTRACED) != 0 &&
			    (p->p_oppid == selfpid)) {
				break;  /* parent-in-waiting => ok */
			}
			if (cansignal(current_proc(), kauth_cred_get(), p, SIGKILL)) {
				break;  /* allowed to signal => ok */
			}
			proc_rele(p);
			knote_set_error(kn, EACCES);
			return 0;
		} while (0);
	}

	kn->kn_proc = p;
	kn->kn_flags |= EV_CLEAR;       /* automatically set */
	kn->kn_sdata = 0;               /* incoming data is ignored */

	proc_klist_lock();

	KNOTE_ATTACH(&p->p_klist, kn);

	proc_klist_unlock();

	proc_rele(p);

	/*
	 * only captures edge-triggered events after this point
	 * so it can't already be fired.
	 */
	return 0;
}
1156
1157
1158 /*
1159 * The knote may be attached to a different process, which may exit,
1160 * leaving nothing for the knote to be attached to. In that case,
1161 * the pointer to the process will have already been nulled out.
1162 */
1163 static void
filt_procdetach(struct knote * kn)1164 filt_procdetach(struct knote *kn)
1165 {
1166 struct proc *p;
1167
1168 proc_klist_lock();
1169
1170 p = kn->kn_proc;
1171 if (p != PROC_NULL) {
1172 kn->kn_proc = PROC_NULL;
1173 KNOTE_DETACH(&p->p_klist, kn);
1174 }
1175
1176 proc_klist_unlock();
1177 }
1178
/*
 * f_event for EVFILT_PROC: fold a process lifecycle hint into the knote
 * and decide whether it should activate.
 */
static int
filt_procevent(struct knote *kn, long hint)
{
	u_int event;

	/* ALWAYS CALLED WITH proc_klist_lock */

	/*
	 * Note: a lot of bits in hint may be obtained from the knote
	 * To free some of those bits, see <rdar://problem/12592988> Freeing up
	 * bits in hint for filt_procevent
	 *
	 * mask off extra data
	 */
	event = (u_int)hint & NOTE_PCTRLMASK;

	/*
	 * termination lifecycle events can happen while a debugger
	 * has reparented a process, in which case notifications
	 * should be quashed except to the tracing parent. When
	 * the debugger reaps the child (either via wait4(2) or
	 * process exit), the child will be reparented to the original
	 * parent and these knotes re-fired.
	 */
	if (event & NOTE_EXIT) {
		if ((kn->kn_proc->p_oppid != 0)
		    && (proc_getpid(knote_get_kq(kn)->kq_p) != kn->kn_proc->p_ppid)) {
			/*
			 * This knote is not for the current ptrace(2) parent, ignore.
			 */
			return 0;
		}
	}

	/*
	 * if the user is interested in this event, record it.
	 */
	if (kn->kn_sfflags & event) {
		kn->kn_fflags |= event;
	}

#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wdeprecated-declarations"
	/* NOTE_REAP (deprecated), or NOTE_EXIT without NOTE_REAP, ends the knote */
	if ((event == NOTE_REAP) || ((event == NOTE_EXIT) && !(kn->kn_sfflags & NOTE_REAP))) {
		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
	}
#pragma clang diagnostic pop


	/*
	 * The kernel has a wrapper in place that returns the same data
	 * as is collected here, in kn_hook32. Any changes to how
	 * NOTE_EXITSTATUS and NOTE_EXIT_DETAIL are collected
	 * should also be reflected in the proc_pidnoteexit() wrapper.
	 */
	if (event == NOTE_EXIT) {
		kn->kn_hook32 = 0;
		if ((kn->kn_sfflags & NOTE_EXITSTATUS) != 0) {
			kn->kn_fflags |= NOTE_EXITSTATUS;
			/* low bits of the hint carry the wait(2) status */
			kn->kn_hook32 |= (hint & NOTE_PDATAMASK);
		}
		if ((kn->kn_sfflags & NOTE_EXIT_DETAIL) != 0) {
			kn->kn_fflags |= NOTE_EXIT_DETAIL;
			if ((kn->kn_proc->p_lflag &
			    P_LTERM_DECRYPTFAIL) != 0) {
				kn->kn_hook32 |= NOTE_EXIT_DECRYPTFAIL;
			}
			if ((kn->kn_proc->p_lflag &
			    P_LTERM_JETSAM) != 0) {
				/* translate the jetsam kill reason into detail bits */
				kn->kn_hook32 |= NOTE_EXIT_MEMORY;
				switch (kn->kn_proc->p_lflag & P_JETSAM_MASK) {
				case P_JETSAM_VMPAGESHORTAGE:
					kn->kn_hook32 |= NOTE_EXIT_MEMORY_VMPAGESHORTAGE;
					break;
				case P_JETSAM_VMTHRASHING:
					kn->kn_hook32 |= NOTE_EXIT_MEMORY_VMTHRASHING;
					break;
				case P_JETSAM_FCTHRASHING:
					kn->kn_hook32 |= NOTE_EXIT_MEMORY_FCTHRASHING;
					break;
				case P_JETSAM_VNODE:
					kn->kn_hook32 |= NOTE_EXIT_MEMORY_VNODE;
					break;
				case P_JETSAM_HIWAT:
					kn->kn_hook32 |= NOTE_EXIT_MEMORY_HIWAT;
					break;
				case P_JETSAM_PID:
					kn->kn_hook32 |= NOTE_EXIT_MEMORY_PID;
					break;
				case P_JETSAM_IDLEEXIT:
					kn->kn_hook32 |= NOTE_EXIT_MEMORY_IDLE;
					break;
				}
			}
			if ((proc_getcsflags(kn->kn_proc) &
			    CS_KILLED) != 0) {
				kn->kn_hook32 |= NOTE_EXIT_CSERROR;
			}
		}
	}

	/* if we have any matching state, activate the knote */
	return kn->kn_fflags != 0;
}
1283
/*
 * f_touch for EVFILT_PROC: adopt the new interest set from userspace and
 * report whether any previously recorded events remain pending.
 */
static int
filt_proctouch(struct knote *kn, struct kevent_qos_s *kev)
{
	int res;

	proc_klist_lock();

	/* accept new filter flags and mask off output events no long interesting */
	kn->kn_sfflags = kev->fflags;

	/* restrict the current results to the (smaller?) set of new interest */
	/*
	 * For compatibility with previous implementations, we leave kn_fflags
	 * as they were before.
	 */
	//kn->kn_fflags &= kn->kn_sfflags;

	res = (kn->kn_fflags != 0);

	proc_klist_unlock();

	return res;
}
1307
1308 static int
filt_procprocess(struct knote * kn,struct kevent_qos_s * kev)1309 filt_procprocess(struct knote *kn, struct kevent_qos_s *kev)
1310 {
1311 int res = 0;
1312
1313 proc_klist_lock();
1314 if (kn->kn_fflags) {
1315 knote_fill_kevent(kn, kev, kn->kn_hook32);
1316 kn->kn_hook32 = 0;
1317 res = 1;
1318 }
1319 proc_klist_unlock();
1320 return res;
1321 }
1322
/* EVFILT_PROC: watch another process for lifecycle events (exit, fork, ...) */
SECURITY_READ_ONLY_EARLY(static struct filterops) proc_filtops = {
	.f_attach = filt_procattach,
	.f_detach = filt_procdetach,
	.f_event = filt_procevent,
	.f_touch = filt_proctouch,
	.f_process = filt_procprocess,
};
1330
1331 #pragma mark timer_filtops
1332
/* Validated timer parameters, produced by filt_timervalidate() and
 * committed onto a knote by filt_timer_set_params(). */
struct filt_timer_params {
	uint64_t deadline; /* deadline in abs/cont time
	                    *      (or 0 if NOTE_ABSOLUTE and deadline is in past) */
	uint64_t leeway;   /* leeway in abstime, or 0 if none */
	uint64_t interval; /* interval in abstime or 0 if non-repeating timer */
};
1339
1340 /*
1341 * Values stored in the knote at rest (using Mach absolute time units)
1342 *
1343 * kn->kn_thcall where the thread_call object is stored
1344 * kn->kn_ext[0] next deadline or 0 if immediate expiration
1345 * kn->kn_ext[1] leeway value
1346 * kn->kn_sdata interval timer: the interval
1347 * absolute/deadline timer: 0
1348 * kn->kn_hook32 timer state (with gencount)
1349 *
1350 * TIMER_IDLE:
1351 * The timer has either never been scheduled or been cancelled.
1352 * It is safe to schedule a new one in this state.
1353 *
1354 * TIMER_ARMED:
1355 * The timer has been scheduled
1356 *
1357 * TIMER_FIRED
1358 * The timer has fired and an event needs to be delivered.
1359 * When in this state, the callout may still be running.
1360 *
1361 * TIMER_IMMEDIATE
1362 * The timer has fired at registration time, and the callout was never
1363 * dispatched.
1364 */
1365 #define TIMER_IDLE 0x0
1366 #define TIMER_ARMED 0x1
1367 #define TIMER_FIRED 0x2
1368 #define TIMER_IMMEDIATE 0x3
1369 #define TIMER_STATE_MASK 0x3
1370 #define TIMER_GEN_INC 0x4
1371
1372 static void
filt_timer_set_params(struct knote * kn,struct filt_timer_params * params)1373 filt_timer_set_params(struct knote *kn, struct filt_timer_params *params)
1374 {
1375 kn->kn_ext[0] = params->deadline;
1376 kn->kn_ext[1] = params->leeway;
1377 kn->kn_sdata = params->interval;
1378 }
1379
/*
 * filt_timervalidate - process data from user
 *
 * Sets up the deadline, interval, and leeway from the provided user data
 *
 * Input:
 *      kn_sdata        timer deadline or interval time
 *      kn_sfflags      style of timer, unit of measurement
 *
 * Output:
 *      struct filter_timer_params to apply to the filter with
 *      filt_timer_set_params when changes are ready to be commited.
 *
 * Returns:
 *      EINVAL          Invalid user data parameters
 *      ERANGE          Various overflows with the parameters
 *
 * Called with timer filter lock held.
 */
static int
filt_timervalidate(const struct kevent_qos_s *kev,
    struct filt_timer_params *params)
{
	/*
	 * There are 5 knobs that need to be chosen for a timer registration:
	 *
	 * A) Units of time (what is the time duration of the specified number)
	 *      Absolute and interval take:
	 *              NOTE_SECONDS, NOTE_USECONDS, NOTE_NSECONDS, NOTE_MACHTIME
	 *      Defaults to milliseconds if not specified
	 *
	 * B) Clock epoch (what is the zero point of the specified number)
	 *      For interval, there is none
	 *      For absolute, defaults to the gettimeofday/calendar epoch
	 *      With NOTE_MACHTIME, uses mach_absolute_time()
	 *      With NOTE_MACHTIME and NOTE_MACH_CONTINUOUS_TIME, uses mach_continuous_time()
	 *
	 * C) The knote's behavior on delivery
	 *      Interval timer causes the knote to arm for the next interval unless one-shot is set
	 *      Absolute is a forced one-shot timer which deletes on delivery
	 *      TODO: Add a way for absolute to be not forced one-shot
	 *
	 * D) Whether the time duration is relative to now or absolute
	 *      Interval fires at now + duration when it is set up
	 *      Absolute fires at now + difference between now walltime and passed in walltime
	 *      With NOTE_MACHTIME it fires at an absolute MAT or MCT.
	 *
	 * E) Whether the timer continues to tick across sleep
	 *      By default all three do not.
	 *      For interval and absolute, NOTE_MACH_CONTINUOUS_TIME causes them to tick across sleep
	 *      With NOTE_ABSOLUTE | NOTE_MACHTIME | NOTE_MACH_CONTINUOUS_TIME:
	 *              expires when mach_continuous_time() is > the passed in value.
	 */

	/* nanoseconds per unit of the user-supplied value (0 => already abstime) */
	uint64_t multiplier;

	boolean_t use_abstime = FALSE;

	switch (kev->fflags & (NOTE_SECONDS | NOTE_USECONDS | NOTE_NSECONDS | NOTE_MACHTIME)) {
	case NOTE_SECONDS:
		multiplier = NSEC_PER_SEC;
		break;
	case NOTE_USECONDS:
		multiplier = NSEC_PER_USEC;
		break;
	case NOTE_NSECONDS:
		multiplier = 1;
		break;
	case NOTE_MACHTIME:
		multiplier = 0;
		use_abstime = TRUE;
		break;
	case 0: /* milliseconds (default) */
		multiplier = NSEC_PER_SEC / 1000;
		break;
	default:
		/* more than one unit flag set is invalid */
		return EINVAL;
	}

	/* transform the leeway in kn_ext[1] to same time scale */
	if (kev->fflags & NOTE_LEEWAY) {
		uint64_t leeway_abs;

		if (use_abstime) {
			leeway_abs = (uint64_t)kev->ext[1];
		} else {
			uint64_t leeway_ns;
			if (os_mul_overflow((uint64_t)kev->ext[1], multiplier, &leeway_ns)) {
				return ERANGE;
			}

			nanoseconds_to_absolutetime(leeway_ns, &leeway_abs);
		}

		params->leeway = leeway_abs;
	} else {
		params->leeway = 0;
	}

	if (kev->fflags & NOTE_ABSOLUTE) {
		uint64_t deadline_abs;

		if (use_abstime) {
			deadline_abs = (uint64_t)kev->data;
		} else {
			uint64_t calendar_deadline_ns;

			if (os_mul_overflow((uint64_t)kev->data, multiplier, &calendar_deadline_ns)) {
				return ERANGE;
			}

			/* calendar_deadline_ns is in nanoseconds since the epoch */

			clock_sec_t seconds;
			clock_nsec_t nanoseconds;

			/*
			 * Note that the conversion through wall-time is only done once.
			 *
			 * If the relationship between MAT and gettimeofday changes,
			 * the underlying timer does not update.
			 *
			 * TODO: build a wall-time denominated timer_call queue
			 * and a flag to request DTRTing with wall-time timers
			 */
			clock_get_calendar_nanotime(&seconds, &nanoseconds);

			uint64_t calendar_now_ns = (uint64_t)seconds * NSEC_PER_SEC + nanoseconds;

			/* if deadline is in the future */
			if (calendar_now_ns < calendar_deadline_ns) {
				uint64_t interval_ns = calendar_deadline_ns - calendar_now_ns;
				uint64_t interval_abs;

				nanoseconds_to_absolutetime(interval_ns, &interval_abs);

				/*
				 * Note that the NOTE_MACH_CONTINUOUS_TIME flag here only
				 * causes the timer to keep ticking across sleep, but
				 * it does not change the calendar timebase.
				 */

				if (kev->fflags & NOTE_MACH_CONTINUOUS_TIME) {
					clock_continuoustime_interval_to_deadline(interval_abs,
					    &deadline_abs);
				} else {
					clock_absolutetime_interval_to_deadline(interval_abs,
					    &deadline_abs);
				}
			} else {
				deadline_abs = 0; /* cause immediate expiration */
			}
		}

		params->deadline = deadline_abs;
		params->interval = 0; /* NOTE_ABSOLUTE is non-repeating */
	} else if (kev->data < 0) {
		/*
		 * Negative interval timers fire immediately, once.
		 *
		 * Ideally a negative interval would be an error, but certain clients
		 * pass negative values on accident, and expect an event back.
		 *
		 * In the old implementation the timer would repeat with no delay
		 * N times until mach_absolute_time() + (N * interval) underflowed,
		 * then it would wait ~forever by accidentally arming a timer for the far future.
		 *
		 * We now skip the power-wasting hot spin phase and go straight to the idle phase.
		 */

		params->deadline = 0; /* expire immediately */
		params->interval = 0; /* non-repeating */
	} else {
		uint64_t interval_abs = 0;

		if (use_abstime) {
			interval_abs = (uint64_t)kev->data;
		} else {
			uint64_t interval_ns;
			if (os_mul_overflow((uint64_t)kev->data, multiplier, &interval_ns)) {
				return ERANGE;
			}

			nanoseconds_to_absolutetime(interval_ns, &interval_abs);
		}

		/* first deadline is one interval from now */
		uint64_t deadline = 0;

		if (kev->fflags & NOTE_MACH_CONTINUOUS_TIME) {
			clock_continuoustime_interval_to_deadline(interval_abs, &deadline);
		} else {
			clock_absolutetime_interval_to_deadline(interval_abs, &deadline);
		}

		params->deadline = deadline;
		params->interval = interval_abs;
	}

	return 0;
}
1580
/*
 * filt_timerexpire - the timer callout routine
 *
 * state_on_arm is the state word (gencount + TIMER_ARMED) that
 * filt_timerarm() captured when scheduling this callout; the cmpxchg
 * below only succeeds while the knote is still in exactly that
 * generation, which debounces late firings of reprogrammed timers.
 */
static void
filt_timerexpire(void *knx, void *state_on_arm)
{
	struct knote *kn = knx;

	uint32_t state = (uint32_t)(uintptr_t)state_on_arm;
	/* ARMED -> FIRED within the same generation */
	uint32_t fired_state = state ^ TIMER_ARMED ^ TIMER_FIRED;

	if (os_atomic_cmpxchg(&kn->kn_hook32, state, fired_state, relaxed)) {
		// our f_event always would say FILTER_ACTIVE,
		// so be leaner and just do it.
		struct kqueue *kq = knote_get_kq(kn);
		kqlock(kq);
		knote_activate(kq, kn, FILTER_ACTIVE);
		kqunlock(kq);
	} else {
		/*
		 * The timer has been reprogrammed or canceled since it was armed,
		 * and this is a late firing for the timer, just ignore it.
		 */
	}
}
1606
1607 /*
1608 * Does this deadline needs a timer armed for it, or has it expired?
1609 */
1610 static bool
filt_timer_is_ready(struct knote * kn)1611 filt_timer_is_ready(struct knote *kn)
1612 {
1613 uint64_t now, deadline = kn->kn_ext[0];
1614
1615 if (deadline == 0) {
1616 return true;
1617 }
1618
1619 if (kn->kn_sfflags & NOTE_MACH_CONTINUOUS_TIME) {
1620 now = mach_continuous_time();
1621 } else {
1622 now = mach_absolute_time();
1623 }
1624 return deadline <= now;
1625 }
1626
/*
 * Arm a timer
 *
 * It is the responsibility of the caller to make sure the timer call
 * has completed or been cancelled properly prior to arming it.
 */
static void
filt_timerarm(struct knote *kn)
{
	uint64_t deadline = kn->kn_ext[0];
	uint64_t leeway = kn->kn_ext[1];
	uint32_t state;

	int filter_flags = kn->kn_sfflags;
	unsigned int timer_flags = 0;

	/* map the knote's urgency flags onto thread-call delay classes */
	if (filter_flags & NOTE_CRITICAL) {
		timer_flags |= THREAD_CALL_DELAY_USER_CRITICAL;
	} else if (filter_flags & NOTE_BACKGROUND) {
		timer_flags |= THREAD_CALL_DELAY_USER_BACKGROUND;
	} else {
		timer_flags |= THREAD_CALL_DELAY_USER_NORMAL;
	}

	if (filter_flags & NOTE_LEEWAY) {
		timer_flags |= THREAD_CALL_DELAY_LEEWAY;
	}

	if (filter_flags & NOTE_MACH_CONTINUOUS_TIME) {
		timer_flags |= THREAD_CALL_CONTINUOUS;
	}

	/*
	 * Move to ARMED.
	 *
	 * We increase the gencount, and setup the thread call with this expected
	 * state. It means that if there was a previous generation of the timer in
	 * flight that needs to be ignored, then 3 things are possible:
	 *
	 * - the timer fires first, filt_timerexpire() and sets the state to FIRED
	 *   but we clobber it with ARMED and a new gencount. The knote will still
	 *   be activated, but filt_timerprocess() which is serialized with this
	 *   call will not see the FIRED bit set and will not deliver an event.
	 *
	 * - this code runs first, but filt_timerexpire() comes second. Because it
	 *   knows an old gencount, it will debounce and not activate the knote.
	 *
	 * - filt_timerexpire() wasn't in flight yet, and thread_call_enter below
	 *   will just cancel it properly.
	 *
	 * This is important as userspace expects to never be woken up for past
	 * timers after filt_timertouch ran.
	 */
	state = os_atomic_load(&kn->kn_hook32, relaxed);
	state &= ~TIMER_STATE_MASK;
	state += TIMER_GEN_INC + TIMER_ARMED;
	os_atomic_store(&kn->kn_hook32, state, relaxed);

	/* the callout receives the expected state word to debounce against */
	thread_call_enter_delayed_with_leeway(kn->kn_thcall,
	    (void *)(uintptr_t)state, deadline, leeway, timer_flags);
}
1688
/*
 * Mark a timer as "already fired" when it is being reprogrammed
 *
 * If there is a timer in flight, this will do a best effort at canceling it,
 * but will not wait. If the thread call was in flight, having set the
 * TIMER_IMMEDIATE bit will debounce a filt_timerexpire() racing with this
 * cancelation.
 */
static void
filt_timerfire_immediate(struct knote *kn)
{
	uint32_t state;

	/* or-ing TIMER_IMMEDIATE (== all state bits) forces any state to IMMEDIATE */
	static_assert(TIMER_IMMEDIATE == TIMER_STATE_MASK,
	    "validate that this atomic or will transition to IMMEDIATE");
	state = os_atomic_or_orig(&kn->kn_hook32, TIMER_IMMEDIATE, relaxed);

	if ((state & TIMER_STATE_MASK) == TIMER_ARMED) {
		/* best-effort cancel; a racing expire is debounced by the state change */
		thread_call_cancel(kn->kn_thcall);
	}
}
1710
/*
 * Allocate a thread call for the knote's lifetime, and kick off the timer.
 *
 * On parameter or allocation failure the error is recorded on the knote
 * via knote_set_error() and 0 is returned.
 */
static int
filt_timerattach(struct knote *kn, struct kevent_qos_s *kev)
{
	thread_call_t callout;
	struct filt_timer_params params;
	int error;

	if ((error = filt_timervalidate(kev, &params)) != 0) {
		knote_set_error(kn, error);
		return 0;
	}

	callout = thread_call_allocate_with_options(filt_timerexpire,
	    (thread_call_param_t)kn, THREAD_CALL_PRIORITY_HIGH,
	    THREAD_CALL_OPTIONS_ONCE);

	if (NULL == callout) {
		knote_set_error(kn, ENOMEM);
		return 0;
	}

	filt_timer_set_params(kn, &params);
	kn->kn_thcall = callout;
	kn->kn_flags |= EV_CLEAR;
	os_atomic_store(&kn->kn_hook32, TIMER_IDLE, relaxed);

	/* NOTE_ABSOLUTE implies EV_ONESHOT */
	if (kn->kn_sfflags & NOTE_ABSOLUTE) {
		kn->kn_flags |= EV_ONESHOT;
	}

	if (filt_timer_is_ready(kn)) {
		/* already expired: skip arming and deliver right away */
		os_atomic_store(&kn->kn_hook32, TIMER_IMMEDIATE, relaxed);
		return FILTER_ACTIVE;
	} else {
		filt_timerarm(kn);
		return 0;
	}
}
1753
/*
 * Shut down the timer if it's running, and free the callout.
 */
static void
filt_timerdetach(struct knote *kn)
{
	__assert_only boolean_t freed;

	/*
	 * Unconditionally cancel to make sure there can't be any filt_timerexpire()
	 * running anymore.
	 */
	thread_call_cancel_wait(kn->kn_thcall);
	freed = thread_call_free(kn->kn_thcall);
	/* must succeed: cancel_wait above guarantees the call is not in flight */
	assert(freed);
}
1770
/*
 * filt_timertouch - update timer knote with new user input
 *
 * Cancel and restart the timer based on new user data. When
 * the user picks up a knote, clear the count of how many timer
 * pops have gone off (in kn_data).
 */
static int
filt_timertouch(struct knote *kn, struct kevent_qos_s *kev)
{
	struct filt_timer_params params;
	uint32_t changed_flags = (kn->kn_sfflags ^ kev->fflags);
	int error;

	if (kev->qos && (knote_get_kq(kn)->kq_state & KQ_WORKLOOP) &&
	    !_pthread_priority_thread_qos(kev->qos)) {
		/* validate usage of FILTER_UPDATE_REQ_QOS */
		kev->flags |= EV_ERROR;
		kev->data = ERANGE;
		return 0;
	}

	/* NOTE_ABSOLUTE cannot be toggled after registration */
	if (changed_flags & NOTE_ABSOLUTE) {
		kev->flags |= EV_ERROR;
		kev->data = EINVAL;
		return 0;
	}

	if ((error = filt_timervalidate(kev, &params)) != 0) {
		kev->flags |= EV_ERROR;
		kev->data = error;
		return 0;
	}

	/* capture the new values used to compute deadline */
	filt_timer_set_params(kn, &params);
	kn->kn_sfflags = kev->fflags;

	if (filt_timer_is_ready(kn)) {
		filt_timerfire_immediate(kn);
		return FILTER_ACTIVE | FILTER_UPDATE_REQ_QOS;
	} else {
		filt_timerarm(kn);
		return FILTER_UPDATE_REQ_QOS;
	}
}
1817
/*
 * filt_timerprocess - query state of knote and snapshot event data
 *
 * Determine if the timer has fired in the past, snapshot the state
 * of the kevent for returning to user-space, and clear pending event
 * counters for the next time.
 */
static int
filt_timerprocess(struct knote *kn, struct kevent_qos_s *kev)
{
	uint32_t state = os_atomic_load(&kn->kn_hook32, relaxed);

	/*
	 * filt_timerprocess is serialized with any filter routine except for
	 * filt_timerexpire which atomically does a TIMER_ARMED -> TIMER_FIRED
	 * transition, and on success, activates the knote.
	 *
	 * Hence, we don't need atomic modifications of the state, only to peek at
	 * whether we see any of the "FIRED" state, and if we do, it is safe to
	 * do simple state machine transitions.
	 */
	switch (state & TIMER_STATE_MASK) {
	case TIMER_IDLE:
	case TIMER_ARMED:
		/*
		 * This can happen if a touch resets a timer that had fired
		 * without being processed
		 */
		return 0;
	}

	/* consume FIRED/IMMEDIATE: back to IDLE, keeping the gencount */
	os_atomic_store(&kn->kn_hook32, state & ~TIMER_STATE_MASK, relaxed);

	/*
	 * Copy out the interesting kevent state,
	 * but don't leak out the raw time calculations.
	 *
	 * TODO: potential enhancements - tell the user about:
	 *      - deadline to which this timer thought it was expiring
	 *      - return kn_sfflags in the fflags field so the client can know
	 *        under what flags the timer fired
	 */
	knote_fill_kevent(kn, kev, 1);
	kev->ext[0] = 0;
	/* kev->ext[1] = 0;  JMM - shouldn't we hide this too? */

	if (kn->kn_sdata != 0) {
		/*
		 * This is a 'repeating' timer, so we have to emit
		 * how many intervals expired between the arm
		 * and the process.
		 *
		 * A very strange style of interface, because
		 * this could easily be done in the client...
		 */

		uint64_t now;

		if (kn->kn_sfflags & NOTE_MACH_CONTINUOUS_TIME) {
			now = mach_continuous_time();
		} else {
			now = mach_absolute_time();
		}

		uint64_t first_deadline = kn->kn_ext[0];
		uint64_t interval_abs = kn->kn_sdata;
		/* the deadline was computed as arm time + one interval */
		uint64_t orig_arm_time = first_deadline - interval_abs;

		assert(now > orig_arm_time);
		assert(now > first_deadline);

		uint64_t elapsed = now - orig_arm_time;

		uint64_t num_fired = elapsed / interval_abs;

		/*
		 * To reach this code, we must have seen the timer pop
		 * and be in repeating mode, so therefore it must have been
		 * more than 'interval' time since the attach or last
		 * successful touch.
		 */
		assert(num_fired > 0);

		/* report how many intervals have elapsed to the user */
		kev->data = (int64_t)num_fired;

		/* We only need to re-arm the timer if it's not about to be destroyed */
		if ((kn->kn_flags & EV_ONESHOT) == 0) {
			/* fire at the end of the next interval */
			uint64_t new_deadline = first_deadline + num_fired * interval_abs;

			assert(new_deadline > now);

			kn->kn_ext[0] = new_deadline;

			/*
			 * This can't shortcut setting up the thread call, because
			 * knote_process deactivates EV_CLEAR knotes unconditionnally.
			 */
			filt_timerarm(kn);
		}
	}

	return FILTER_ACTIVE;
}
1923
/* EVFILT_TIMER: f_event is never legal (timers fire via thread calls) */
SECURITY_READ_ONLY_EARLY(static struct filterops) timer_filtops = {
	.f_extended_codes = true,
	.f_attach = filt_timerattach,
	.f_detach = filt_timerdetach,
	.f_event = filt_bad_event,
	.f_touch = filt_timertouch,
	.f_process = filt_timerprocess,
};
1932
1933 #pragma mark user_filtops
1934
1935 static int
filt_userattach(struct knote * kn,__unused struct kevent_qos_s * kev)1936 filt_userattach(struct knote *kn, __unused struct kevent_qos_s *kev)
1937 {
1938 if (kn->kn_sfflags & NOTE_TRIGGER) {
1939 kn->kn_hook32 = FILTER_ACTIVE;
1940 } else {
1941 kn->kn_hook32 = 0;
1942 }
1943 return kn->kn_hook32;
1944 }
1945
1946 static int
filt_usertouch(struct knote * kn,struct kevent_qos_s * kev)1947 filt_usertouch(struct knote *kn, struct kevent_qos_s *kev)
1948 {
1949 uint32_t ffctrl;
1950 int fflags;
1951
1952 ffctrl = kev->fflags & NOTE_FFCTRLMASK;
1953 fflags = kev->fflags & NOTE_FFLAGSMASK;
1954 switch (ffctrl) {
1955 case NOTE_FFNOP:
1956 break;
1957 case NOTE_FFAND:
1958 kn->kn_sfflags &= fflags;
1959 break;
1960 case NOTE_FFOR:
1961 kn->kn_sfflags |= fflags;
1962 break;
1963 case NOTE_FFCOPY:
1964 kn->kn_sfflags = fflags;
1965 break;
1966 }
1967 kn->kn_sdata = kev->data;
1968
1969 if (kev->fflags & NOTE_TRIGGER) {
1970 kn->kn_hook32 = FILTER_ACTIVE;
1971 }
1972 return (int)kn->kn_hook32;
1973 }
1974
1975 static int
filt_userprocess(struct knote * kn,struct kevent_qos_s * kev)1976 filt_userprocess(struct knote *kn, struct kevent_qos_s *kev)
1977 {
1978 int result = (int)kn->kn_hook32;
1979
1980 if (result) {
1981 /* EVFILT_USER returns the data that was passed in */
1982 knote_fill_kevent_with_sdata(kn, kev);
1983 kev->fflags = kn->kn_sfflags;
1984 if (kn->kn_flags & EV_CLEAR) {
1985 /* knote_fill_kevent cleared kn_fflags */
1986 kn->kn_hook32 = 0;
1987 }
1988 }
1989
1990 return result;
1991 }
1992
/*
 * Filter operations for EVFILT_USER knotes.
 * No detach work is needed (filt_no_detach); f_event is never called.
 */
SECURITY_READ_ONLY_EARLY(static struct filterops) user_filtops = {
	.f_extended_codes = true,
	.f_attach = filt_userattach,
	.f_detach = filt_no_detach,
	.f_event = filt_bad_event,
	.f_touch = filt_usertouch,
	.f_process = filt_userprocess,
};
2001
2002 #pragma mark workloop_filtops
2003
/*
 * Pseudo-errno returned by filt_wlupdate()/filt_wlupdate_sync_ipc() to tell
 * the caller that preemption was deliberately left disabled and must be
 * translated into FILTER_THREADREQ_NODEFEER (see filt_wlattach/filt_wltouch).
 */
#define EPREEMPTDISABLED (-1)
2005
/*
 * Take the workloop state spinlock, which serializes the debounce/update
 * state machine in filt_wlupdate() and filt_wlupdate_sync_ipc().
 */
static inline void
filt_wllock(struct kqworkloop *kqwl)
{
	lck_spin_lock(&kqwl->kqwl_statelock);
}
2011
/* Release the workloop state spinlock taken by filt_wllock(). */
static inline void
filt_wlunlock(struct kqworkloop *kqwl)
{
	lck_spin_unlock(&kqwl->kqwl_statelock);
}
2017
/*
 * Returns true when the interlock for the turnstile is the workqueue lock
 *
 * When this is the case, all turnstiles operations are delegated
 * to the workqueue subsystem.
 *
 * This is required because kqueue_threadreq_bind_prepost only holds the
 * workqueue lock but needs to move the inheritor from the workloop turnstile
 * away from the creator thread, so that this now fulfilled request cannot be
 * picked anymore by other threads.
 */
static inline bool
filt_wlturnstile_interlock_is_workq(struct kqworkloop *kqwl)
{
	/* i.e. a thread request has been initiated but no thread is bound yet */
	return kqr_thread_requested_pending(&kqwl->kqwl_request);
}
2034
/*
 * Re-point the workloop turnstile's inheritor at the current owner thread,
 * or at the servicer thread when there is no owner, or at nothing.
 *
 * Must only be called when the turnstile interlock is NOT the workqueue
 * lock (asserted below); the workq-interlocked case goes through
 * workq_kern_threadreq_update_inheritor() instead.
 */
static void
filt_wlupdate_inheritor(struct kqworkloop *kqwl, struct turnstile *ts,
    turnstile_update_flags_t flags)
{
	turnstile_inheritor_t inheritor = TURNSTILE_INHERITOR_NULL;
	workq_threadreq_t kqr = &kqwl->kqwl_request;

	/*
	 * binding to the workq should always happen through
	 * workq_kern_threadreq_update_inheritor()
	 */
	assert(!filt_wlturnstile_interlock_is_workq(kqwl));

	if ((inheritor = kqwl->kqwl_owner)) {
		flags |= TURNSTILE_INHERITOR_THREAD;
	} else if ((inheritor = kqr_thread(kqr))) {
		flags |= TURNSTILE_INHERITOR_THREAD;
	}

	turnstile_update_inheritor(ts, inheritor, flags);
}
2056
/* Max attempts to retry copyin when the user address faults under the lock. */
#define EVFILT_WORKLOOP_EFAULT_RETRY_COUNT 100
/* 'op' argument values for filt_wlupdate() / filt_wlupdate_sync_ipc() */
#define FILT_WLATTACH 0
#define FILT_WLTOUCH 1
#define FILT_WLDROP 2
2061
/*
 * Common state machine for EVFILT_WORKLOOP attach/touch/drop (see 'op').
 *
 * Phase 1 debounces the userspace (address, mask, value) tuple supplied in
 * kev->ext[], optionally discovering a new owner thread; phase 2 commits
 * ownership and QoS changes under the kq lock and wakes SYNC_WAIT waiters;
 * phase 3 drops lingering references outside the locks.
 *
 * Returns 0 or an errno; EPREEMPTDISABLED is a pseudo-error meaning
 * preemption was left disabled on purpose (caller must translate it into
 * FILTER_THREADREQ_NODEFEER).
 */
__result_use_check
static int
filt_wlupdate(struct kqworkloop *kqwl, struct knote *kn,
    struct kevent_qos_s *kev, kq_index_t qos_index, int op)
{
	user_addr_t uaddr = CAST_USER_ADDR_T(kev->ext[EV_EXTIDX_WL_ADDR]);
	workq_threadreq_t kqr = &kqwl->kqwl_request;
	thread_t cur_owner, new_owner, extra_thread_ref = THREAD_NULL;
	kq_index_t cur_override = THREAD_QOS_UNSPECIFIED;
	int efault_retry = EVFILT_WORKLOOP_EFAULT_RETRY_COUNT;
	int action = KQWL_UTQ_NONE, error = 0;
	bool wl_inheritor_updated = false, needs_wake = false;
	uint64_t kdata = kev->ext[EV_EXTIDX_WL_VALUE];
	uint64_t mask = kev->ext[EV_EXTIDX_WL_MASK];
	uint64_t udata = 0;
	struct turnstile *ts = TURNSTILE_NULL;

	filt_wllock(kqwl);

again:
	new_owner = cur_owner = kqwl->kqwl_owner;

	/*
	 * Phase 1:
	 *
	 * If asked, load the uint64 value at the user provided address and compare
	 * it against the passed in mask and expected value.
	 *
	 * If NOTE_WL_DISCOVER_OWNER is specified, translate the loaded name as
	 * a thread reference.
	 *
	 * If NOTE_WL_END_OWNERSHIP is specified and the currently known owner is
	 * the current thread, then end ownership.
	 *
	 * Lastly decide whether we need to perform a QoS update.
	 */
	if (uaddr) {
		/*
		 * Until <rdar://problem/24999882> exists,
		 * disabling preemption copyin forces any
		 * vm_fault we encounter to fail.
		 */
		error = copyin_atomic64(uaddr, &udata);

		/*
		 * If we get EFAULT, drop locks, and retry.
		 * If we still get an error report it,
		 * else assume the memory has been faulted
		 * and attempt to copyin under lock again.
		 */
		switch (error) {
		case 0:
			break;
		case EFAULT:
			if (efault_retry-- > 0) {
				filt_wlunlock(kqwl);
				/* unlocked copyin may fault the page in */
				error = copyin_atomic64(uaddr, &udata);
				filt_wllock(kqwl);
				if (error == 0) {
					/* state may have changed while unlocked: redo phase 1 */
					goto again;
				}
			}
			OS_FALLTHROUGH;
		default:
			goto out;
		}

		/* Update state as copied in. */
		kev->ext[EV_EXTIDX_WL_VALUE] = udata;

		if ((udata & mask) != (kdata & mask)) {
			/* debounce failed: userspace value no longer matches */
			error = ESTALE;
		} else if (kev->fflags & NOTE_WL_DISCOVER_OWNER) {
			/*
			 * Decipher the owner port name, and translate accordingly.
			 * The low 2 bits were borrowed for other flags, so mask them off.
			 *
			 * Then attempt translation to a thread reference or fail.
			 */
			mach_port_name_t name = (mach_port_name_t)udata & ~0x3;
			if (name != MACH_PORT_NULL) {
				name = ipc_entry_name_mask(name);
				extra_thread_ref = port_name_to_thread(name,
				    PORT_INTRANS_THREAD_IN_CURRENT_TASK);
				if (extra_thread_ref == THREAD_NULL) {
					error = EOWNERDEAD;
					goto out;
				}
				new_owner = extra_thread_ref;
			}
		}
	}

	if ((kev->fflags & NOTE_WL_END_OWNERSHIP) && new_owner == current_thread()) {
		new_owner = THREAD_NULL;
	}

	if (error == 0) {
		if ((kev->fflags & NOTE_WL_THREAD_REQUEST) && (kev->flags & EV_DELETE)) {
			action = KQWL_UTQ_SET_QOS_INDEX;
		} else if (qos_index && kqr->tr_kq_qos_index != qos_index) {
			action = KQWL_UTQ_SET_QOS_INDEX;
		}

		if (op == FILT_WLTOUCH) {
			/*
			 * Save off any additional fflags/data we just accepted
			 * But only keep the last round of "update" bits we acted on which helps
			 * debugging a lot.
			 */
			kn->kn_sfflags &= ~NOTE_WL_UPDATES_MASK;
			kn->kn_sfflags |= kev->fflags;
			if (kev->fflags & NOTE_WL_SYNC_WAKE) {
				needs_wake = (kn->kn_thread != THREAD_NULL);
			}
		} else if (op == FILT_WLDROP) {
			if ((kn->kn_sfflags & (NOTE_WL_SYNC_WAIT | NOTE_WL_SYNC_WAKE)) ==
			    NOTE_WL_SYNC_WAIT) {
				/*
				 * When deleting a SYNC_WAIT knote that hasn't been woken up
				 * explicitly, issue a wake up.
				 */
				kn->kn_sfflags |= NOTE_WL_SYNC_WAKE;
				needs_wake = (kn->kn_thread != THREAD_NULL);
			}
		}
	}

	/*
	 * Phase 2:
	 *
	 * Commit ownership and QoS changes if any, possibly wake up waiters
	 */

	if (cur_owner == new_owner && action == KQWL_UTQ_NONE && !needs_wake) {
		/* nothing to commit */
		goto out;
	}

	kqlock(kqwl);

	/* If already tracked as servicer, don't track as owner */
	if (new_owner == kqr_thread(kqr)) {
		new_owner = THREAD_NULL;
	}

	if (cur_owner != new_owner) {
		kqwl->kqwl_owner = new_owner;
		if (new_owner == extra_thread_ref) {
			/* we just transfered this ref to kqwl_owner */
			extra_thread_ref = THREAD_NULL;
		}
		cur_override = kqworkloop_override(kqwl);

		if (new_owner) {
			/* override it before we drop the old */
			if (cur_override != THREAD_QOS_UNSPECIFIED) {
				thread_add_kevent_override(new_owner, cur_override);
			}
			if (kqr_thread_requested_pending(kqr)) {
				if (action == KQWL_UTQ_NONE) {
					action = KQWL_UTQ_REDRIVE_EVENTS;
				}
			}
		} else if (action == KQWL_UTQ_NONE &&
		    !kqr_thread_requested(kqr) &&
		    kqwl->kqwl_wakeup_qos) {
			action = KQWL_UTQ_REDRIVE_EVENTS;
		}
	}

	if (action != KQWL_UTQ_NONE) {
		kqworkloop_update_threads_qos(kqwl, action, qos_index);
	}

	ts = kqwl->kqwl_turnstile;
	if (cur_owner != new_owner && ts) {
		if (action == KQWL_UTQ_REDRIVE_EVENTS) {
			/*
			 * Note that when action is KQWL_UTQ_REDRIVE_EVENTS,
			 * the code went through workq_kern_threadreq_initiate()
			 * and the workqueue has set the inheritor already
			 */
			assert(filt_wlturnstile_interlock_is_workq(kqwl));
		} else if (filt_wlturnstile_interlock_is_workq(kqwl)) {
			workq_kern_threadreq_lock(kqwl->kqwl_p);
			workq_kern_threadreq_update_inheritor(kqwl->kqwl_p, kqr, new_owner,
			    ts, TURNSTILE_IMMEDIATE_UPDATE);
			workq_kern_threadreq_unlock(kqwl->kqwl_p);
			if (!filt_wlturnstile_interlock_is_workq(kqwl)) {
				/*
				 * If the workq is no longer the interlock, then
				 * workq_kern_threadreq_update_inheritor() has finished a bind
				 * and we need to fallback to the regular path.
				 */
				filt_wlupdate_inheritor(kqwl, ts, TURNSTILE_IMMEDIATE_UPDATE);
			}
			wl_inheritor_updated = true;
		} else {
			filt_wlupdate_inheritor(kqwl, ts, TURNSTILE_IMMEDIATE_UPDATE);
			wl_inheritor_updated = true;
		}

		/*
		 * We need a turnstile reference because we are dropping the interlock
		 * and the caller has not called turnstile_prepare.
		 */
		if (wl_inheritor_updated) {
			turnstile_reference(ts);
		}
	}

	if (needs_wake && ts) {
		waitq_wakeup64_thread(&ts->ts_waitq, knote_filt_wev64(kn),
		    kn->kn_thread, THREAD_AWAKENED);
		if (op == FILT_WLATTACH || op == FILT_WLTOUCH) {
			/* caller must return FILTER_THREADREQ_NODEFEER; see EPREEMPTDISABLED */
			disable_preemption();
			error = EPREEMPTDISABLED;
		}
	}

	kqunlock(kqwl);

out:
	/*
	 * Phase 3:
	 *
	 * Unlock and cleanup various lingering references and things.
	 */
	filt_wlunlock(kqwl);

#if CONFIG_WORKLOOP_DEBUG
	KQWL_HISTORY_WRITE_ENTRY(kqwl, {
		.updater = current_thread(),
		.servicer = kqr_thread(kqr), /* Note: racy */
		.old_owner = cur_owner,
		.new_owner = new_owner,

		.kev_ident  = kev->ident,
		.error      = (int16_t)error,
		.kev_flags  = kev->flags,
		.kev_fflags = kev->fflags,

		.kev_mask   = mask,
		.kev_value  = kdata,
		.in_value   = udata,
	});
#endif // CONFIG_WORKLOOP_DEBUG

	if (wl_inheritor_updated) {
		turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_NOT_HELD);
		turnstile_deallocate(ts);
	}

	if (cur_owner && new_owner != cur_owner) {
		if (cur_override != THREAD_QOS_UNSPECIFIED) {
			thread_drop_kevent_override(cur_owner);
		}
		thread_deallocate_safe(cur_owner);
	}
	if (extra_thread_ref) {
		thread_deallocate_safe(extra_thread_ref);
	}
	return error;
}
2326
/*
 * Remembers the last updated that came in from userspace for debugging reasons.
 * - fflags is mirrored from the userspace kevent
 * - ext[i, i != VALUE] is mirrored from the userspace kevent
 * - ext[VALUE] is set to what the kernel loaded atomically
 * - data is set to the error if any
 *
 * 'error' is the errno returned by the preceding filt_wlupdate() call;
 * it is stashed in kn_sdata so it shows up in debugging tools.
 */
static inline void
filt_wlremember_last_update(struct knote *kn, struct kevent_qos_s *kev,
    int error)
{
	kn->kn_fflags = kev->fflags;
	kn->kn_sdata = error;
	memcpy(kn->kn_ext, kev->ext, sizeof(kev->ext));
}
2342
/*
 * attach/touch/drop handling for NOTE_WL_SYNC_IPC knotes.
 *
 * Performs the same (address, mask, value) debounce as filt_wlupdate(),
 * then on attach links the knote to the sync IPC machinery.  Returns 0,
 * an errno, or the EPREEMPTDISABLED pseudo-error (attach only) telling
 * the caller to return FILTER_THREADREQ_NODEFEER.
 */
static int
filt_wlupdate_sync_ipc(struct kqworkloop *kqwl, struct knote *kn,
    struct kevent_qos_s *kev, int op)
{
	user_addr_t uaddr = (user_addr_t) kev->ext[EV_EXTIDX_WL_ADDR];
	uint64_t kdata = kev->ext[EV_EXTIDX_WL_VALUE];
	uint64_t mask  = kev->ext[EV_EXTIDX_WL_MASK];
	uint64_t udata = 0;
	int efault_retry = EVFILT_WORKLOOP_EFAULT_RETRY_COUNT;
	int error = 0;

	if (op == FILT_WLATTACH) {
		/* make sure the workloop has a turnstile before we need it */
		(void)kqueue_alloc_turnstile(&kqwl->kqwl_kqueue);
	} else if (uaddr == 0) {
		/* nothing to debounce, nothing to do for touch/drop */
		return 0;
	}

	filt_wllock(kqwl);

again:

	/*
	 * Do the debounce thing, the lock serializing the state is the knote lock.
	 */
	if (uaddr) {
		/*
		 * Until <rdar://problem/24999882> exists,
		 * disabling preemption copyin forces any
		 * vm_fault we encounter to fail.
		 */
		error = copyin_atomic64(uaddr, &udata);

		/*
		 * If we get EFAULT, drop locks, and retry.
		 * If we still get an error report it,
		 * else assume the memory has been faulted
		 * and attempt to copyin under lock again.
		 */
		switch (error) {
		case 0:
			break;
		case EFAULT:
			if (efault_retry-- > 0) {
				filt_wlunlock(kqwl);
				/* unlocked copyin may fault the page in */
				error = copyin_atomic64(uaddr, &udata);
				filt_wllock(kqwl);
				if (error == 0) {
					goto again;
				}
			}
			OS_FALLTHROUGH;
		default:
			goto out;
		}

		kev->ext[EV_EXTIDX_WL_VALUE] = udata;
		kn->kn_ext[EV_EXTIDX_WL_VALUE] = udata;

		if ((udata & mask) != (kdata & mask)) {
			/* debounce failed: userspace value no longer matches */
			error = ESTALE;
			goto out;
		}
	}

	if (op == FILT_WLATTACH) {
		error = filt_wlattach_sync_ipc(kn);
		if (error == 0) {
			/* see EPREEMPTDISABLED: caller must not re-enable preemption */
			disable_preemption();
			error = EPREEMPTDISABLED;
		}
	}

out:
	filt_wlunlock(kqwl);
	return error;
}
2419
/*
 * Attach an EVFILT_WORKLOOP knote.
 *
 * Validates the NOTE_WL_* command against the knote's identity and flags,
 * then dispatches to filt_wlupdate() or filt_wlupdate_sync_ipc().  On
 * failure the error is recorded on the knote via knote_set_error().
 */
static int
filt_wlattach(struct knote *kn, struct kevent_qos_s *kev)
{
	struct kqueue *kq = knote_get_kq(kn);
	struct kqworkloop *kqwl = (struct kqworkloop *)kq;
	int error = 0, result = 0;
	kq_index_t qos_index = 0;

	if (__improbable((kq->kq_state & KQ_WORKLOOP) == 0)) {
		/* this filter is only valid on workloop kqueues */
		error = ENOTSUP;
		goto out;
	}

	uint32_t command = (kn->kn_sfflags & NOTE_WL_COMMANDS_MASK);
	switch (command) {
	case NOTE_WL_THREAD_REQUEST:
		/* thread requests must use the workloop's own identity */
		if (kn->kn_id != kqwl->kqwl_dynamicid) {
			error = EINVAL;
			goto out;
		}
		qos_index = _pthread_priority_thread_qos(kn->kn_qos);
		if (qos_index == THREAD_QOS_UNSPECIFIED) {
			error = ERANGE;
			goto out;
		}
		if (kqwl->kqwl_request.tr_kq_qos_index) {
			/*
			 * There already is a thread request, and well, you're only allowed
			 * one per workloop, so fail the attach.
			 */
			error = EALREADY;
			goto out;
		}
		break;
	case NOTE_WL_SYNC_WAIT:
	case NOTE_WL_SYNC_WAKE:
		/* sync knotes must NOT use the workloop's identity */
		if (kn->kn_id == kqwl->kqwl_dynamicid) {
			error = EINVAL;
			goto out;
		}
		if ((kn->kn_flags & EV_DISABLE) == 0) {
			error = EINVAL;
			goto out;
		}
		if (kn->kn_sfflags & NOTE_WL_END_OWNERSHIP) {
			error = EINVAL;
			goto out;
		}
		break;

	case NOTE_WL_SYNC_IPC:
		if ((kn->kn_flags & EV_DISABLE) == 0) {
			error = EINVAL;
			goto out;
		}
		if (kn->kn_sfflags & (NOTE_WL_UPDATE_QOS | NOTE_WL_DISCOVER_OWNER)) {
			error = EINVAL;
			goto out;
		}
		break;
	default:
		error = EINVAL;
		goto out;
	}

	if (command == NOTE_WL_SYNC_IPC) {
		error = filt_wlupdate_sync_ipc(kqwl, kn, kev, FILT_WLATTACH);
	} else {
		error = filt_wlupdate(kqwl, kn, kev, qos_index, FILT_WLATTACH);
	}

	if (error == EPREEMPTDISABLED) {
		/* preemption was left disabled on purpose; tell kevent not to defer */
		error = 0;
		result = FILTER_THREADREQ_NODEFEER;
	}
out:
	if (error) {
		/* If userland wants ESTALE to be hidden, fail the attach anyway */
		if (error == ESTALE && (kn->kn_sfflags & NOTE_WL_IGNORE_ESTALE)) {
			error = 0;
		}
		knote_set_error(kn, error);
		return result;
	}
	if (command == NOTE_WL_SYNC_WAIT) {
		return kevent_register_wait_prepare(kn, kev, result);
	}
	/* Just attaching the thread request successfully will fire it */
	if (command == NOTE_WL_THREAD_REQUEST) {
		/*
		 * Thread Request knotes need an explicit touch to be active again,
		 * so delivering an event needs to also consume it.
		 */
		kn->kn_flags |= EV_CLEAR;
		return result | FILTER_ACTIVE;
	}
	return result;
}
2518
/*
 * Continuation run when a NOTE_WL_SYNC_WAIT thread wakes up (or is
 * interrupted).  Tears down the turnstile set up by
 * filt_wlpost_register_wait() and returns to userspace via
 * kevent_register_wait_return(); never returns to its caller.
 */
static void __dead2
filt_wlwait_continue(void *parameter, wait_result_t wr)
{
	struct _kevent_register *cont_args = parameter;
	struct kqworkloop *kqwl = cont_args->kqwl;

	kqlock(kqwl);
	if (filt_wlturnstile_interlock_is_workq(kqwl)) {
		/* the workqueue lock is the turnstile interlock: hold it across complete */
		workq_kern_threadreq_lock(kqwl->kqwl_p);
		turnstile_complete((uintptr_t)kqwl, &kqwl->kqwl_turnstile, NULL, TURNSTILE_WORKLOOPS);
		workq_kern_threadreq_unlock(kqwl->kqwl_p);
	} else {
		turnstile_complete((uintptr_t)kqwl, &kqwl->kqwl_turnstile, NULL, TURNSTILE_WORKLOOPS);
	}
	kqunlock(kqwl);

	turnstile_cleanup();

	if (wr == THREAD_INTERRUPTED) {
		/* report the interrupted wait as EINTR on the kevent itself */
		cont_args->kev.flags |= EV_ERROR;
		cont_args->kev.data = EINTR;
	} else if (wr != THREAD_AWAKENED) {
		panic("Unexpected wait result: %d", wr);
	}

	kevent_register_wait_return(cont_args);
}
2546
/*
 * Called with the workloop mutex held, most of the time never returns as it
 * calls filt_wlwait_continue through a continuation.
 *
 * Prepares the workloop turnstile, points its inheritor at the owner (or
 * servicer) so the waiter pushes on it, asserts the wait, and blocks.
 */
static void __dead2
filt_wlpost_register_wait(struct uthread *uth, struct knote *kn,
    struct _kevent_register *cont_args)
{
	struct kqworkloop *kqwl = cont_args->kqwl;
	workq_threadreq_t kqr = &kqwl->kqwl_request;
	struct turnstile *ts;
	bool workq_locked = false;

	kqlock_held(kqwl);

	if (filt_wlturnstile_interlock_is_workq(kqwl)) {
		workq_kern_threadreq_lock(kqwl->kqwl_p);
		workq_locked = true;
	}

	ts = turnstile_prepare((uintptr_t)kqwl, &kqwl->kqwl_turnstile,
	    TURNSTILE_NULL, TURNSTILE_WORKLOOPS);

	if (workq_locked) {
		workq_kern_threadreq_update_inheritor(kqwl->kqwl_p,
		    &kqwl->kqwl_request, kqwl->kqwl_owner, ts,
		    TURNSTILE_DELAYED_UPDATE);
		if (!filt_wlturnstile_interlock_is_workq(kqwl)) {
			/*
			 * if the interlock is no longer the workqueue lock,
			 * then we don't need to hold it anymore.
			 */
			workq_kern_threadreq_unlock(kqwl->kqwl_p);
			workq_locked = false;
		}
	}
	if (!workq_locked) {
		/*
		 * If the interlock is the workloop's, then it's our responsibility to
		 * call update_inheritor, so just do it.
		 */
		filt_wlupdate_inheritor(kqwl, ts, TURNSTILE_DELAYED_UPDATE);
	}

	thread_set_pending_block_hint(get_machthread(uth), kThreadWaitWorkloopSyncWait);
	waitq_assert_wait64(&ts->ts_waitq, knote_filt_wev64(kn),
	    THREAD_ABORTSAFE, TIMEOUT_WAIT_FOREVER);

	if (workq_locked) {
		workq_kern_threadreq_unlock(kqwl->kqwl_p);
	}

	/* take a ref on whoever we are pushing on, for the block handoff */
	thread_t thread = kqwl->kqwl_owner ?: kqr_thread(kqr);
	if (thread) {
		thread_reference(thread);
	}

	kevent_register_wait_block(ts, thread, filt_wlwait_continue, cont_args);
}
2606
/* called in stackshot context to report the thread responsible for blocking this thread */
void
kdp_workloop_sync_wait_find_owner(__assert_only thread_t thread,
    event64_t event, thread_waitinfo_t *waitinfo)
{
	/* the wait event for a SYNC_WAIT is the knote pointer (knote_filt_wev64) */
	struct knote *kn = (struct knote *)event;

	/* zone_require/thread_require validate pointers in debugger context */
	zone_require(knote_zone, kn);

	assert(kn->kn_thread == thread);

	struct kqueue *kq = knote_get_kq(kn);

	zone_require(kqworkloop_zone, kq);
	assert(kq->kq_state & KQ_WORKLOOP);

	struct kqworkloop *kqwl = (struct kqworkloop *)kq;
	workq_threadreq_t kqr = &kqwl->kqwl_request;

	thread_t kqwl_owner = kqwl->kqwl_owner;

	if (kqwl_owner != THREAD_NULL) {
		thread_require(kqwl_owner);
		waitinfo->owner = thread_tid(kqwl->kqwl_owner);
	} else if ((kqr->tr_state >= WORKQ_TR_STATE_BINDING) && (kqr->tr_thread != NULL)) {
		thread_require(kqr->tr_thread);
		waitinfo->owner = thread_tid(kqr->tr_thread);
	} else if (kqr_thread_requested_pending(kqr)) { /* > idle, < bound */
		waitinfo->owner = STACKSHOT_WAITOWNER_THREQUESTED;
	} else {
		waitinfo->owner = 0;
	}

	waitinfo->context = kqwl->kqwl_dynamicid;
}
2642
/*
 * Detach an EVFILT_WORKLOOP knote.
 *
 * SYNC_IPC knotes undo the linkage made by filt_wlattach_sync_ipc();
 * otherwise a non-NULL kn_thread (presumably a still-associated SYNC_WAIT
 * waiter — see filt_wlupdate's wake path) has its wait state cleaned up.
 */
static void
filt_wldetach(struct knote *kn)
{
	if (kn->kn_sfflags & NOTE_WL_SYNC_IPC) {
		filt_wldetach_sync_ipc(kn);
	} else if (kn->kn_thread) {
		kevent_register_wait_cleanup(kn);
	}
}
2652
2653 static int
filt_wlvalidate_kev_flags(struct knote * kn,struct kevent_qos_s * kev,thread_qos_t * qos_index)2654 filt_wlvalidate_kev_flags(struct knote *kn, struct kevent_qos_s *kev,
2655 thread_qos_t *qos_index)
2656 {
2657 uint32_t new_commands = kev->fflags & NOTE_WL_COMMANDS_MASK;
2658 uint32_t sav_commands = kn->kn_sfflags & NOTE_WL_COMMANDS_MASK;
2659
2660 if ((kev->fflags & NOTE_WL_DISCOVER_OWNER) && (kev->flags & EV_DELETE)) {
2661 return EINVAL;
2662 }
2663 if (kev->fflags & NOTE_WL_UPDATE_QOS) {
2664 if (kev->flags & EV_DELETE) {
2665 return EINVAL;
2666 }
2667 if (sav_commands != NOTE_WL_THREAD_REQUEST) {
2668 return EINVAL;
2669 }
2670 if (!(*qos_index = _pthread_priority_thread_qos(kev->qos))) {
2671 return ERANGE;
2672 }
2673 }
2674
2675 switch (new_commands) {
2676 case NOTE_WL_THREAD_REQUEST:
2677 /* thread requests can only update themselves */
2678 if (sav_commands != NOTE_WL_THREAD_REQUEST) {
2679 return EINVAL;
2680 }
2681 break;
2682
2683 case NOTE_WL_SYNC_WAIT:
2684 if (kev->fflags & NOTE_WL_END_OWNERSHIP) {
2685 return EINVAL;
2686 }
2687 goto sync_checks;
2688
2689 case NOTE_WL_SYNC_WAKE:
2690 sync_checks:
2691 if (!(sav_commands & (NOTE_WL_SYNC_WAIT | NOTE_WL_SYNC_WAKE))) {
2692 return EINVAL;
2693 }
2694 if ((kev->flags & (EV_ENABLE | EV_DELETE)) == EV_ENABLE) {
2695 return EINVAL;
2696 }
2697 break;
2698
2699 case NOTE_WL_SYNC_IPC:
2700 if (sav_commands != NOTE_WL_SYNC_IPC) {
2701 return EINVAL;
2702 }
2703 if ((kev->flags & (EV_ENABLE | EV_DELETE)) == EV_ENABLE) {
2704 return EINVAL;
2705 }
2706 break;
2707
2708 default:
2709 return EINVAL;
2710 }
2711 return 0;
2712 }
2713
/*
 * Touch an EVFILT_WORKLOOP knote with a new kevent from userspace.
 *
 * Validates flags, runs the shared update machinery, and translates
 * EPREEMPTDISABLED / ESTALE the same way filt_wlattach() does.  A
 * successful thread request touch fires the knote.
 */
static int
filt_wltouch(struct knote *kn, struct kevent_qos_s *kev)
{
	struct kqworkloop *kqwl = (struct kqworkloop *)knote_get_kq(kn);
	thread_qos_t qos_index = THREAD_QOS_UNSPECIFIED;
	int result = 0;

	int error = filt_wlvalidate_kev_flags(kn, kev, &qos_index);
	if (error) {
		goto out;
	}

	uint32_t command = kev->fflags & NOTE_WL_COMMANDS_MASK;
	if (command == NOTE_WL_SYNC_IPC) {
		error = filt_wlupdate_sync_ipc(kqwl, kn, kev, FILT_WLTOUCH);
	} else {
		error = filt_wlupdate(kqwl, kn, kev, qos_index, FILT_WLTOUCH);
		filt_wlremember_last_update(kn, kev, error);
	}
	if (error == EPREEMPTDISABLED) {
		/* preemption was left disabled on purpose; tell kevent not to defer */
		error = 0;
		result = FILTER_THREADREQ_NODEFEER;
	}

out:
	if (error) {
		if (error == ESTALE && (kev->fflags & NOTE_WL_IGNORE_ESTALE)) {
			/* If userland wants ESTALE to be hidden, do not activate */
			return result;
		}
		kev->flags |= EV_ERROR;
		kev->data = error;
		return result;
	}
	if (command == NOTE_WL_SYNC_WAIT && !(kn->kn_sfflags & NOTE_WL_SYNC_WAKE)) {
		/* still waiting: park the thread until a SYNC_WAKE arrives */
		return kevent_register_wait_prepare(kn, kev, result);
	}
	/* Just touching the thread request successfully will fire it */
	if (command == NOTE_WL_THREAD_REQUEST) {
		if (kev->fflags & NOTE_WL_UPDATE_QOS) {
			result |= FILTER_UPDATE_REQ_QOS;
		}
		result |= FILTER_ACTIVE;
	}
	return result;
}
2760
/*
 * Decide whether an EVFILT_WORKLOOP knote may be dropped (EV_DELETE).
 *
 * Runs the same validation/update machinery as touch with op FILT_WLDROP;
 * returns false (veto) when the update fails, unless userland asked for
 * ESTALE to be hidden.
 */
static bool
filt_wlallow_drop(struct knote *kn, struct kevent_qos_s *kev)
{
	struct kqworkloop *kqwl = (struct kqworkloop *)knote_get_kq(kn);

	/* qos_index is NULL: NOTE_WL_UPDATE_QOS is invalid on delete anyway */
	int error = filt_wlvalidate_kev_flags(kn, kev, NULL);
	if (error) {
		goto out;
	}

	uint32_t command = (kev->fflags & NOTE_WL_COMMANDS_MASK);
	if (command == NOTE_WL_SYNC_IPC) {
		error = filt_wlupdate_sync_ipc(kqwl, kn, kev, FILT_WLDROP);
	} else {
		error = filt_wlupdate(kqwl, kn, kev, 0, FILT_WLDROP);
		filt_wlremember_last_update(kn, kev, error);
	}
	/* drops never take the preemption-disabled fast path */
	assert(error != EPREEMPTDISABLED);

out:
	if (error) {
		if (error == ESTALE && (kev->fflags & NOTE_WL_IGNORE_ESTALE)) {
			return false;
		}
		kev->flags |= EV_ERROR;
		kev->data = error;
		return false;
	}
	return true;
}
2791
/*
 * Process (deliver) an EVFILT_WORKLOOP knote.
 *
 * Only thread-request knotes are processed this way (asserted).  When the
 * workloop has an owner, the event is swallowed and the knote re-activated;
 * otherwise the kevent is filled in and delivered.
 */
static int
filt_wlprocess(struct knote *kn, struct kevent_qos_s *kev)
{
	struct kqworkloop *kqwl = (struct kqworkloop *)knote_get_kq(kn);
	int rc = 0;

	assert(kn->kn_sfflags & NOTE_WL_THREAD_REQUEST);

	kqlock(kqwl);

	if (kqwl->kqwl_owner) {
		/*
		 * <rdar://problem/33584321> userspace sometimes due to events being
		 * delivered but not triggering a drain session can cause a process
		 * of the thread request knote.
		 *
		 * When that happens, the automatic deactivation due to process
		 * would swallow the event, so we have to activate the knote again.
		 */
		knote_activate(kqwl, kn, FILTER_ACTIVE);
	} else {
#if DEBUG || DEVELOPMENT
		if (kevent_debug_flags & KEVENT_PANIC_ON_NON_ENQUEUED_PROCESS) {
			/*
			 * see src/queue_internal.h in libdispatch
			 */
#define DISPATCH_QUEUE_ENQUEUED 0x1ull
			user_addr_t addr = CAST_USER_ADDR_T(kn->kn_ext[EV_EXTIDX_WL_ADDR]);
			task_t t = current_task();
			uint64_t val;
			if (addr && task_is_active(t) && !task_is_halting(t) &&
			    copyin_atomic64(addr, &val) == 0 &&
			    val && (val & DISPATCH_QUEUE_ENQUEUED) == 0 &&
			    (val >> 48) != 0xdead && (val >> 48) != 0 && (val >> 48) != 0xffff) {
				panic("kevent: workloop %#016llx is not enqueued "
				    "(kn:%p dq_state:%#016llx kev.dq_state:%#016llx)",
				    kn->kn_udata, kn, val, kn->kn_ext[EV_EXTIDX_WL_VALUE]);
			}
		}
#endif
		knote_fill_kevent(kn, kev, 0);
		kev->fflags = kn->kn_sfflags;
		rc |= FILTER_ACTIVE;
	}

	kqunlock(kqwl);

	if (rc & FILTER_ACTIVE) {
		workq_thread_set_max_qos(kqwl->kqwl_p, &kqwl->kqwl_request);
	}
	return rc;
}
2844
/*
 * Filter operations for EVFILT_WORKLOOP knotes.
 * f_allow_drop can veto EV_DELETE; f_post_register_wait parks SYNC_WAITers.
 */
SECURITY_READ_ONLY_EARLY(static struct filterops) workloop_filtops = {
	.f_extended_codes = true,
	.f_attach = filt_wlattach,
	.f_detach = filt_wldetach,
	.f_event = filt_bad_event,
	.f_touch = filt_wltouch,
	.f_process = filt_wlprocess,
	.f_allow_drop = filt_wlallow_drop,
	.f_post_register_wait = filt_wlpost_register_wait,
};
2855
2856 #pragma mark - kqueues allocation and deallocation
2857
/*
 * Forward declaration; hash_remove presumably controls whether the workloop
 * is removed from the dynamic-kqueue hash — confirm at the definition.
 */
OS_NOINLINE
static void
kqworkloop_dealloc(struct kqworkloop *, bool hash_remove);
2861
/*
 * Try to take a reference on a workloop that may be going away;
 * returns false once the refcount can no longer be retained.
 */
static inline bool
kqworkloop_try_retain(struct kqworkloop *kqwl)
{
	return os_ref_retain_try_raw(&kqwl->kqwl_retains, NULL);
}
2867
2868 static inline void
kqworkloop_retain(struct kqworkloop * kqwl)2869 kqworkloop_retain(struct kqworkloop *kqwl)
2870 {
2871 return os_ref_retain_raw(&kqwl->kqwl_retains, NULL);
2872 }
2873
/*
 * Take a reference on a kqueue.  Only dynamic kqueues (workloops) are
 * refcounted; kqfiles and workqs are not.
 */
OS_ALWAYS_INLINE
static inline void
kqueue_retain(kqueue_t kqu)
{
	if (kqu.kq->kq_state & KQ_DYNAMIC) {
		kqworkloop_retain(kqu.kqwl);
	}
}
2882
/*
 * Drop a workloop reference that is known not to be the last one
 * (os_ref_release_live_* asserts the count stays positive).
 */
OS_ALWAYS_INLINE
static inline void
kqworkloop_release_live(struct kqworkloop *kqwl)
{
	os_ref_release_live_raw(&kqwl->kqwl_retains, NULL);
}
2889
/* Drop a known-not-last reference on a kqueue (workloops only; see kqueue_retain). */
OS_ALWAYS_INLINE
static inline void
kqueue_release_live(kqueue_t kqu)
{
	if (kqu.kq->kq_state & KQ_DYNAMIC) {
		kqworkloop_release_live(kqu.kqwl);
	}
}
2898
/*
 * Drop a workloop reference; deallocates the workloop (and removes it from
 * the hash) when this was the last one.
 */
OS_ALWAYS_INLINE
static inline void
kqworkloop_release(struct kqworkloop *kqwl)
{
	if (os_ref_release_raw(&kqwl->kqwl_retains, NULL) == 0) {
		kqworkloop_dealloc(kqwl, true);
	}
}
2907
/* Drop a reference on a kqueue, possibly freeing it (workloops only). */
OS_ALWAYS_INLINE
static inline void
kqueue_release(kqueue_t kqu)
{
	if (kqu.kq->kq_state & KQ_DYNAMIC) {
		kqworkloop_release(kqu.kqwl);
	}
}
2916
/*!
 * @function kqueue_destroy
 *
 * @brief
 * Common part to all kqueue dealloc functions.
 *
 * @discussion
 * Tears down the kq spinlock and returns the memory to the given zone.
 */
OS_NOINLINE
static void
kqueue_destroy(kqueue_t kqu, zone_t zone)
{
	lck_spin_destroy(&kqu.kq->kq_lock, &kq_lck_grp);

	zfree(zone, kqu.kq);
}
2931
/*!
 * @function kqueue_init
 *
 * @brief
 * Common part to all kqueue alloc functions.
 *
 * @discussion
 * Initializes the kq spinlock and returns its argument for chaining.
 */
static kqueue_t
kqueue_init(kqueue_t kqu)
{
	lck_spin_init(&kqu.kq->kq_lock, &kq_lck_grp, LCK_ATTR_NULL);
	return kqu;
}
2944
2945 #pragma mark kqfile allocation and deallocation
2946
/*!
 * @function kqueue_dealloc
 *
 * @brief
 * Detach all knotes from a kqfile and free it.
 *
 * @discussion
 * We walk each list looking for knotes referencing this
 * this kqueue. If we find one, we try to drop it. But
 * if we fail to get a drop reference, that will wait
 * until it is dropped. So, we can just restart again
 * safe in the assumption that the list will eventually
 * not contain any more references to this kqueue (either
 * we dropped them all, or someone else did).
 *
 * Assumes no new events are being added to the kqueue.
 * Nothing locked on entry or exit.
 */
void
kqueue_dealloc(struct kqueue *kq)
{
	KNOTE_LOCK_CTX(knlc);
	struct proc *p = kq->kq_p;
	struct filedesc *fdp = &p->p_fd;
	struct knote *kn;

	/* only plain kqfiles come through here, never workqs/workloops */
	assert(kq && (kq->kq_state & (KQ_WORKLOOP | KQ_WORKQ)) == 0);

	/* pass 1: fd-indexed knotes, under the proc fd lock */
	proc_fdlock(p);
	for (int i = 0; i < fdp->fd_knlistsize; i++) {
		kn = SLIST_FIRST(&fdp->fd_knlist[i]);
		while (kn != NULL) {
			if (kq == knote_get_kq(kn)) {
				/* swap fd lock for kq lock before dropping the knote */
				kqlock(kq);
				proc_fdunlock(p);
				if (knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ON_SUCCESS)) {
					knote_drop(kq, kn, &knlc);
				}
				proc_fdlock(p);
				/* start over at beginning of list */
				kn = SLIST_FIRST(&fdp->fd_knlist[i]);
				continue;
			}
			kn = SLIST_NEXT(kn, kn_link);
		}
	}

	knhash_lock(fdp);
	proc_fdunlock(p);

	/* pass 2: hashed (non-fd) knotes, under the knote hash lock */
	if (fdp->fd_knhashmask != 0) {
		for (int i = 0; i < (int)fdp->fd_knhashmask + 1; i++) {
			kn = SLIST_FIRST(&fdp->fd_knhash[i]);
			while (kn != NULL) {
				if (kq == knote_get_kq(kn)) {
					kqlock(kq);
					knhash_unlock(fdp);
					if (knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ON_SUCCESS)) {
						knote_drop(kq, kn, &knlc);
					}
					knhash_lock(fdp);
					/* start over at beginning of list */
					kn = SLIST_FIRST(&fdp->fd_knhash[i]);
					continue;
				}
				kn = SLIST_NEXT(kn, kn_link);
			}
		}
	}
	knhash_unlock(fdp);

	kqueue_destroy(kq, kqfile_zone);
}
3020
3021 /*!
3022 * @function kqueue_alloc
3023 *
3024 * @brief
3025 * Allocate a kqfile.
3026 */
struct kqueue *
kqueue_alloc(struct proc *p)
{
	struct kqfile *kqf;

	/*
	 * kqfiles are created with kqueue() so we need to wait for
	 * the first kevent syscall to know which bit among
	 * KQ_KEV_{32,64,QOS} will be set in kqf_state
	 */
	kqf = zalloc_flags(kqfile_zone, Z_WAITOK | Z_ZERO);
	kqf->kqf_p = p;
	/* zalloc zeroed the struct, so only the queue heads need fixing up */
	TAILQ_INIT_AFTER_BZERO(&kqf->kqf_queue);
	TAILQ_INIT_AFTER_BZERO(&kqf->kqf_suppressed);

	/* kqueue_init() initializes kq_lock and returns the same kqueue */
	return kqueue_init(kqf).kq;
}
3044
3045 /*!
3046 * @function kqueue_internal
3047 *
3048 * @brief
3049 * Core implementation for kqueue and guarded_kqueue_np()
3050 */
int
kqueue_internal(struct proc *p, fp_initfn_t fp_init, void *initarg, int32_t *retval)
{
	struct kqueue *kq;
	struct fileproc *fp;
	int fd, error;

	/* Allocate the fd and fileproc first; fp_init lets callers (e.g.
	 * guarded_kqueue_np) customize the fileproc before it is visible. */
	error = falloc_withinit(p, current_cached_proc_cred(p),
	    vfs_context_current(), &fp, &fd, fp_init, initarg);
	if (error) {
		return error;
	}

	kq = kqueue_alloc(p);
	if (kq == NULL) {
		/* Undo the fd reservation on allocation failure */
		fp_free(p, fd, fp);
		return ENOMEM;
	}

	/* kqueue fds never survive exec/fork, and are confined to this process */
	fp->fp_flags |= FP_CLOEXEC | FP_CLOFORK;
	fp->f_flag = FREAD | FWRITE;
	fp->f_ops = &kqueueops;
	fp_set_data(fp, kq);
	fp->f_lflags |= FG_CONFINED;

	/* Publish the fd in the table and release our transient hold on it */
	proc_fdlock(p);
	procfdtbl_releasefd(p, fd, NULL);
	fp_drop(p, fd, fp, 1);
	proc_fdunlock(p);

	*retval = fd;
	return error;
}
3084
3085 /*!
3086 * @function kqueue
3087 *
3088 * @brief
3089 * The kqueue syscall.
3090 */
int
kqueue(struct proc *p, __unused struct kqueue_args *uap, int32_t *retval)
{
	/* kqueue() takes no arguments; no fp_init customization needed */
	return kqueue_internal(p, NULL, NULL, retval);
}
3096
3097 #pragma mark kqworkq allocation and deallocation
3098
3099 /*!
3100 * @function kqworkq_dealloc
3101 *
3102 * @brief
3103 * Deallocates a workqueue kqueue.
3104 *
3105 * @discussion
3106 * This only happens at process death, or for races with concurrent
3107 * kevent_get_kqwq calls, hence we don't have to care about knotes referencing
3108 * this kqueue, either there are none, or someone else took care of them.
3109 */
void
kqworkq_dealloc(struct kqworkq *kqwq)
{
	/* See the header comment: no knotes can reference the kqwq here */
	kqueue_destroy(kqwq, kqworkq_zone);
}
3115
3116 /*!
3117 * @function kqworkq_alloc
3118 *
3119 * @brief
3120 * Allocates a workqueue kqueue.
3121 *
3122 * @discussion
3123 * This is the slow path of kevent_get_kqwq.
3124 * This takes care of making sure procs have a single workq kqueue.
3125 */
OS_NOINLINE
static struct kqworkq *
kqworkq_alloc(struct proc *p, unsigned int flags)
{
	struct kqworkq *kqwq, *tmp;

	kqwq = zalloc_flags(kqworkq_zone, Z_WAITOK | Z_ZERO);

	/* the workq kqueue only supports the 64-bit and QoS kevent ABIs */
	assert((flags & KEVENT_FLAG_LEGACY32) == 0);
	if (flags & KEVENT_FLAG_LEGACY64) {
		kqwq->kqwq_state = KQ_WORKQ | KQ_KEV64;
	} else {
		kqwq->kqwq_state = KQ_WORKQ | KQ_KEV_QOS;
	}
	kqwq->kqwq_p = p;

	/* struct was zeroed by zalloc; only the queue heads need fixing up */
	for (int i = 0; i < KQWQ_NBUCKETS; i++) {
		TAILQ_INIT_AFTER_BZERO(&kqwq->kqwq_queue[i]);
		TAILQ_INIT_AFTER_BZERO(&kqwq->kqwq_suppressed[i]);
	}
	for (int i = 0; i < KQWQ_NBUCKETS; i++) {
		/*
		 * Because of how the bucketized system works, we mix overcommit
		 * sources with non-overcommit ones: each time we move a knote
		 * from one bucket to the next due to overrides, we would have
		 * to track overcommitness, and it's really not worth it in a
		 * world where workloops track this faithfully.
		 *
		 * Incidentally, this behaves like the original manager-based
		 * kqwq where event delivery always happened (hence is
		 * "overcommit")
		 */
		kqwq->kqwq_request[i].tr_state = WORKQ_TR_STATE_IDLE;
		kqwq->kqwq_request[i].tr_flags = WORKQ_TR_FLAG_KEVENT;
		if (i != KQWQ_QOS_MANAGER) {
			kqwq->kqwq_request[i].tr_flags |= WORKQ_TR_FLAG_OVERCOMMIT;
		}
		kqwq->kqwq_request[i].tr_kq_qos_index = (kq_index_t)i + 1;
	}

	kqueue_init(kqwq);

	/*
	 * Publish the kqwq in the proc; if we lose the race against a
	 * concurrent kevent_get_kqwq, free ours and return the winner's.
	 */
	if (!os_atomic_cmpxchgv(&p->p_fd.fd_wqkqueue, NULL, kqwq, &tmp, release)) {
		kqworkq_dealloc(kqwq);
		return tmp;
	}

	return kqwq;
}
3175
3176 #pragma mark kqworkloop allocation and deallocation
3177
3178 #define KQ_HASH(val, mask) (((val) ^ (val >> 8)) & (mask))
3179 #define CONFIG_KQ_HASHSIZE CONFIG_KN_HASHSIZE
3180
OS_ALWAYS_INLINE
static inline void
kqhash_lock(struct filedesc *fdp)
{
	/* Protects fd_kqhash/fd_kqhashmask and the per-bucket kqworkloop lists */
	lck_mtx_lock_spin_always(&fdp->fd_kqhashlock);
}
3187
OS_ALWAYS_INLINE
static inline void
kqhash_unlock(struct filedesc *fdp)
{
	/* Counterpart of kqhash_lock() */
	lck_mtx_unlock(&fdp->fd_kqhashlock);
}
3194
OS_ALWAYS_INLINE
static inline void
kqworkloop_hash_insert_locked(struct filedesc *fdp, kqueue_id_t id,
    struct kqworkloop *kqwl)
{
	/* Caller holds the kqhash lock and guarantees the hash is allocated */
	struct kqwllist *list = &fdp->fd_kqhash[KQ_HASH(id, fdp->fd_kqhashmask)];
	LIST_INSERT_HEAD(list, kqwl, kqwl_hashlink);
}
3203
3204 OS_ALWAYS_INLINE
3205 static inline struct kqworkloop *
kqworkloop_hash_lookup_locked(struct filedesc * fdp,kqueue_id_t id)3206 kqworkloop_hash_lookup_locked(struct filedesc *fdp, kqueue_id_t id)
3207 {
3208 struct kqwllist *list = &fdp->fd_kqhash[KQ_HASH(id, fdp->fd_kqhashmask)];
3209 struct kqworkloop *kqwl;
3210
3211 LIST_FOREACH(kqwl, list, kqwl_hashlink) {
3212 if (kqwl->kqwl_dynamicid == id) {
3213 return kqwl;
3214 }
3215 }
3216 return NULL;
3217 }
3218
static struct kqworkloop *
kqworkloop_hash_lookup_and_retain(struct filedesc *fdp, kqueue_id_t kq_id)
{
	struct kqworkloop *kqwl = NULL;

	kqhash_lock(fdp);
	/* A NULL hash means this proc never created a dynamic workloop */
	if (__probable(fdp->fd_kqhash)) {
		kqwl = kqworkloop_hash_lookup_locked(fdp, kq_id);
		/*
		 * try_retain can fail when the workloop is concurrently being
		 * deallocated; treat that the same as "not found".
		 */
		if (kqwl && !kqworkloop_try_retain(kqwl)) {
			kqwl = NULL;
		}
	}
	kqhash_unlock(fdp);
	return kqwl;
}
3234
OS_NOINLINE
static void
kqworkloop_hash_init(struct filedesc *fdp)
{
	struct kqwllist *alloc_hash;
	u_long alloc_mask;

	/*
	 * Called with the kqhash lock held.  hashinit() may block, so drop
	 * the lock around the allocation and revalidate afterwards.
	 */
	kqhash_unlock(fdp);
	alloc_hash = hashinit(CONFIG_KQ_HASHSIZE, M_KQUEUE, &alloc_mask);
	kqhash_lock(fdp);

	/* See if we won the race */
	if (__probable(fdp->fd_kqhashmask == 0)) {
		fdp->fd_kqhash = alloc_hash;
		fdp->fd_kqhashmask = alloc_mask;
	} else {
		/* Lost the race: free our copy (again without holding the lock) */
		kqhash_unlock(fdp);
		hashdestroy(alloc_hash, M_KQUEUE, alloc_mask);
		kqhash_lock(fdp);
	}
}
3256
3257 /*
3258 * kqueue iotier override is only supported for kqueue that has
3259 * only one port as a mach port source. Updating the iotier
3260 * override on the mach port source will update the override
3261 * on kqueue as well. Since kqueue with iotier override will
3262 * only have one port attached, there is no logic for saturation
3263 * like qos override, the iotier override of mach port source
3264 * would be reflected in kevent iotier override.
3265 */
3266 void
kqueue_set_iotier_override(kqueue_t kqu,uint8_t iotier_override)3267 kqueue_set_iotier_override(kqueue_t kqu, uint8_t iotier_override)
3268 {
3269 if (!(kqu.kq->kq_state & KQ_WORKLOOP)) {
3270 return;
3271 }
3272
3273 struct kqworkloop *kqwl = kqu.kqwl;
3274 os_atomic_store(&kqwl->kqwl_iotier_override, iotier_override, relaxed);
3275 }
3276
3277 uint8_t
kqueue_get_iotier_override(kqueue_t kqu)3278 kqueue_get_iotier_override(kqueue_t kqu)
3279 {
3280 if (!(kqu.kq->kq_state & KQ_WORKLOOP)) {
3281 return THROTTLE_LEVEL_END;
3282 }
3283
3284 struct kqworkloop *kqwl = kqu.kqwl;
3285 return os_atomic_load(&kqwl->kqwl_iotier_override, relaxed);
3286 }
3287
3288 #if CONFIG_PREADOPT_TG
3289 /*
3290 * This function is called with a borrowed reference on the thread group without
3291 * kq lock held with the mqueue lock held. It may or may not have the knote lock
3292 * (called from both fevent as well as fattach/ftouch). Upon success, an
3293 * additional reference on the TG is taken
3294 */
void
kqueue_set_preadopted_thread_group(kqueue_t kqu, struct thread_group *tg, thread_qos_t qos)
{
	/* Preadoption only applies to workloops; trace and bail otherwise */
	if (!(kqu.kq->kq_state & KQ_WORKLOOP)) {
		KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_THREAD_GROUP, MACH_THREAD_GROUP_PREADOPT_NA),
		    (uintptr_t)thread_tid(current_thread()), 0, 0, 0);
		return;
	}

	struct kqworkloop *kqwl = kqu.kqwl;

	assert(qos < THREAD_QOS_LAST);

	/*
	 * Speculatively take a ref on the TG: on success it is transferred
	 * into kqwl_preadopt_tg, on failure it is dropped below.
	 */
	thread_group_retain(tg);

	thread_group_qos_t old_tg; thread_group_qos_t new_tg;
	int ret = os_atomic_rmw_loop(&kqwl->kqwl_preadopt_tg, old_tg, new_tg, relaxed, {
		if (!KQWL_CAN_ADOPT_PREADOPT_TG(old_tg)) {
		        os_atomic_rmw_loop_give_up(break);
		}

		if (old_tg != KQWL_PREADOPTED_TG_NULL) {
		        /*
		         * Note that old_tg could be a NULL TG pointer but with a QoS
		         * set. See also workq_thread_reset_pri.
		         *
		         * Compare the QoS of existing preadopted tg with new one and
		         * only overwrite the thread group if we have one with a higher
		         * QoS.
		         */
		        thread_qos_t existing_qos = KQWL_GET_PREADOPTED_TG_QOS(old_tg);
		        if (existing_qos >= qos) {
		                os_atomic_rmw_loop_give_up(break);
		        }
		}

		// Transfer the ref taken earlier in the function to the kqwl
		new_tg = KQWL_ENCODE_PREADOPTED_TG_QOS(tg, qos);
	});

	if (ret) {
		KQWL_PREADOPT_TG_HISTORY_WRITE_ENTRY(kqwl, KQWL_PREADOPT_OP_INCOMING_IPC, old_tg, tg);

		/* We replaced a valid TG: release the ref the kqwl held on it */
		if (KQWL_HAS_VALID_PREADOPTED_TG(old_tg)) {
			thread_group_deallocate_safe(KQWL_GET_PREADOPTED_TG(old_tg));
		}

		/* Tell the workq subsystem the thread request should be re-evaluated */
		os_atomic_store(&kqwl->kqwl_preadopt_tg_needs_redrive, KQWL_PREADOPT_TG_NEEDS_REDRIVE, release);
	} else {
		// We failed to write to the kqwl_preadopt_tg, drop the ref we took
		// earlier in the function
		thread_group_deallocate_safe(tg);
	}
}
3349
3350 /*
3351 * Called from fprocess of EVFILT_MACHPORT without the kqueue lock held.
3352 */
bool
kqueue_process_preadopt_thread_group(thread_t thread, struct kqueue *kq, struct thread_group *tg)
{
	bool success = false;
	if (kq->kq_state & KQ_WORKLOOP) {
		struct kqworkloop *kqwl = (struct kqworkloop *) kq;
		thread_group_qos_t old_tg;
		/*
		 * Move the slot from SENTINEL to PROCESSED; only the thread that
		 * wins this exchange may preadopt the incoming tg.
		 */
		success = os_atomic_cmpxchgv(&kqwl->kqwl_preadopt_tg,
		    KQWL_PREADOPTED_TG_SENTINEL, KQWL_PREADOPTED_TG_PROCESSED,
		    &old_tg, relaxed);
		if (success) {
			thread_set_preadopt_thread_group(thread, tg);
		} else if (KQWL_HAS_PERMANENT_PREADOPTED_TG(old_tg)) {
			/*
			 * Technically the following set_preadopt should be a no-op since this
			 * servicer thread preadopts kqwl's permanent tg at bind time.
			 * See kqueue_threadreq_bind.
			 */
			thread_set_preadopt_thread_group(thread, KQWL_GET_PREADOPTED_TG(old_tg));
		} else {
			/* Someone already processed it, or the kqwl never preadopts */
			assert(old_tg == KQWL_PREADOPTED_TG_PROCESSED ||
			    old_tg == KQWL_PREADOPTED_TG_NEVER);
		}
	}
	return success;
}
3379 #endif
3380
3381 /*!
3382 * @function kqworkloop_dealloc
3383 *
3384 * @brief
3385 * Deallocates a workloop kqueue.
3386 *
3387 * @discussion
3388 * Knotes hold references on the workloop, so we can't really reach this
3389 * function unless all of these are already gone.
3390 *
3391 * Nothing locked on entry or exit.
3392 *
3393 * @param hash_remove
3394 * Whether to remove the workloop from its hash table.
3395 */
static void
kqworkloop_dealloc(struct kqworkloop *kqwl, bool hash_remove)
{
	thread_t cur_owner;

	/* Drop the owner thread ref, undoing any kevent QoS override first */
	cur_owner = kqwl->kqwl_owner;
	if (cur_owner) {
		if (kqworkloop_override(kqwl) != THREAD_QOS_UNSPECIFIED) {
			thread_drop_kevent_override(cur_owner);
		}
		thread_deallocate(cur_owner);
		kqwl->kqwl_owner = THREAD_NULL;
	}

	/* Tear down the workloop's turnstile if one was ever allocated */
	if (kqwl->kqwl_state & KQ_HAS_TURNSTILE) {
		struct turnstile *ts;
		turnstile_complete((uintptr_t)kqwl, &kqwl->kqwl_turnstile,
		    &ts, TURNSTILE_WORKLOOPS);
		turnstile_cleanup();
		turnstile_deallocate(ts);
	}

	/*
	 * hash_remove is false when the caller already unhashed the workloop
	 * (see kqworkloops_dealloc).
	 */
	if (hash_remove) {
		struct filedesc *fdp = &kqwl->kqwl_p->p_fd;

		kqhash_lock(fdp);
		LIST_REMOVE(kqwl, kqwl_hashlink);
#if CONFIG_PROC_RESOURCE_LIMITS
		fdp->num_kqwls--;
#endif
		kqhash_unlock(fdp);
	}

#if CONFIG_PREADOPT_TG
	/* Release the ref kqwl_preadopt_tg holds on its thread group, if any */
	thread_group_qos_t tg = os_atomic_load(&kqwl->kqwl_preadopt_tg, relaxed);
	if (KQWL_HAS_VALID_PREADOPTED_TG(tg)) {
		thread_group_release(KQWL_GET_PREADOPTED_TG(tg));
	}
#endif

	/* Bound-thread workloops may own a work interval ref (see kqworkloop_init) */
	workq_threadreq_t kqr = &kqwl->kqwl_request;
	if ((kqr->tr_flags & WORKQ_TR_FLAG_PERMANENT_BIND) && kqr->tr_work_interval) {
		kern_work_interval_release(kqr->tr_work_interval);
	}

	assert(TAILQ_EMPTY(&kqwl->kqwl_suppressed));
	assert(kqwl->kqwl_owner == THREAD_NULL);
	assert(kqwl->kqwl_turnstile == TURNSTILE_NULL);

	lck_spin_destroy(&kqwl->kqwl_statelock, &kq_lck_grp);
	kqueue_destroy(kqwl, kqworkloop_zone);
}
3448
3449 /*!
3450 * @function kqworkloop_init
3451 *
3452 * @brief
3453 * Initializes an allocated kqworkloop.
3454 */
static void
kqworkloop_init(struct kqworkloop *kqwl, proc_t p,
    kqueue_id_t id, workq_threadreq_param_t *trp,
    struct workq_threadreq_extended_param_s *trp_extended)
{
	/* Workloops always use the QoS kevent ABI */
	kqwl->kqwl_state = KQ_WORKLOOP | KQ_DYNAMIC | KQ_KEV_QOS;
	/* The returned workloop starts with a retain count of 1 */
	os_ref_init_raw(&kqwl->kqwl_retains, NULL);
	kqwl->kqwl_dynamicid = id;
	kqwl->kqwl_p = p;
	if (trp) {
		kqwl->kqwl_params = trp->trp_value;
	}

	/* Translate the scheduling parameters into thread request flags */
	workq_tr_flags_t tr_flags = WORKQ_TR_FLAG_WORKLOOP;
	if (trp) {
		if (trp->trp_flags & TRP_PRIORITY) {
			tr_flags |= WORKQ_TR_FLAG_WL_OUTSIDE_QOS;
		}
		if (trp->trp_flags & TRP_BOUND_THREAD) {
			tr_flags |= WORKQ_TR_FLAG_PERMANENT_BIND;
		}
		if (trp->trp_flags) {
			tr_flags |= WORKQ_TR_FLAG_WL_PARAMS;
		}
	}
	kqwl->kqwl_request.tr_state = WORKQ_TR_STATE_IDLE;
	kqwl->kqwl_request.tr_flags = tr_flags;
	/* THROTTLE_LEVEL_END means "no IO tier override" */
	os_atomic_store(&kqwl->kqwl_iotier_override, (uint8_t)THROTTLE_LEVEL_END, relaxed);
#if CONFIG_PREADOPT_TG
	if (trp_extended && trp_extended->trp_permanent_preadopt_tg) {
		/*
		 * This kqwl is permanently configured with a thread group.
		 * By using THREAD_QOS_LAST, we make sure kqueue_set_preadopted_thread_group
		 * has no effect on kqwl_preadopt_tg. At this point, +1 ref on
		 * trp_extended->trp_permanent_preadopt_tg is transferred to the kqwl.
		 */
		thread_group_qos_t kqwl_preadopt_tg;
		kqwl_preadopt_tg = KQWL_ENCODE_PERMANENT_PREADOPTED_TG(trp_extended->trp_permanent_preadopt_tg);
		os_atomic_store(&kqwl->kqwl_preadopt_tg, kqwl_preadopt_tg, relaxed);
	} else if (task_is_app(current_task())) {
		/*
		 * Not a specially preconfigured kqwl so it is open to participate in sync IPC
		 * thread group preadoption; but, apps will never adopt a thread group that
		 * is not their own. This is a gross hack to simulate the post-process that
		 * is done in the voucher subsystem today for thread groups.
		 */
		os_atomic_store(&kqwl->kqwl_preadopt_tg, KQWL_PREADOPTED_TG_NEVER, relaxed);
	}
#endif
	if (trp_extended) {
		if (trp_extended->trp_work_interval) {
			/*
			 * The +1 ref on the work interval is transferred to the kqwl.
			 */
			assert(tr_flags & WORKQ_TR_FLAG_PERMANENT_BIND);
			kqwl->kqwl_request.tr_work_interval = trp_extended->trp_work_interval;
		}
	}
	/* struct was zero-allocated; only the queue heads need fixing up */
	for (int i = 0; i < KQWL_NBUCKETS; i++) {
		TAILQ_INIT_AFTER_BZERO(&kqwl->kqwl_queue[i]);
	}
	TAILQ_INIT_AFTER_BZERO(&kqwl->kqwl_suppressed);

	lck_spin_init(&kqwl->kqwl_statelock, &kq_lck_grp, LCK_ATTR_NULL);

	kqueue_init(kqwl);
}
3522
3523 #if CONFIG_PROC_RESOURCE_LIMITS
void
kqworkloop_check_limit_exceeded(struct filedesc *fdp)
{
	/*
	 * Post at most one (soft, then hard) limit notification per process;
	 * a limit of 0 means "unlimited".  The AST delivers the resource
	 * notification on the way back to userspace.
	 */
	int num_kqwls = fdp->num_kqwls;
	if (!kqwl_above_soft_limit_notified(fdp) && fdp->kqwl_dyn_soft_limit > 0 &&
	    num_kqwls > fdp->kqwl_dyn_soft_limit) {
		kqwl_above_soft_limit_send_notification(fdp);
		act_set_astproc_resource(current_thread());
	} else if (!kqwl_above_hard_limit_notified(fdp) && fdp->kqwl_dyn_hard_limit > 0
	    && num_kqwls > fdp->kqwl_dyn_hard_limit) {
		kqwl_above_hard_limit_send_notification(fdp);
		act_set_astproc_resource(current_thread());
	}
}
3538 #endif
3539
3540 /*!
3541 * @function kqworkloop_get_or_create
3542 *
3543 * @brief
3544 * Wrapper around kqworkloop_init that handles the uniquing of workloops.
3545 *
3546 * @returns
3547 * 0: success
3548 * EINVAL: invalid parameters
3549 * EEXIST: KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST is set and a collision exists.
3550 * ENOENT: KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST is set and the entry wasn't found.
3551 * ENOMEM: allocation failed
3552 */
static int
kqworkloop_get_or_create(struct proc *p, kqueue_id_t id,
    workq_threadreq_param_t *trp,
    struct workq_threadreq_extended_param_s *trp_extended,
    unsigned int flags, struct kqworkloop **kqwlp)
{
	struct filedesc *fdp = &p->p_fd;
	struct kqworkloop *alloc_kqwl = NULL;
	struct kqworkloop *kqwl = NULL;
	int error = 0;

	/* Scheduling parameters are only legal when creating a new workloop */
	assert(!trp || (flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST));

	/* 0 and -1 are reserved, never valid dynamic ids */
	if (id == 0 || id == (kqueue_id_t)-1) {
		return EINVAL;
	}

	for (;;) {
		kqhash_lock(fdp);
		/* Lazily allocate the per-proc workloop hash on first use */
		if (__improbable(fdp->fd_kqhash == NULL)) {
			kqworkloop_hash_init(fdp);
		}

		kqwl = kqworkloop_hash_lookup_locked(fdp, id);
		if (kqwl) {
			if (__improbable(flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST)) {
				/*
				 * If MUST_NOT_EXIST was passed, even if we would have failed
				 * the try_retain, it could have gone the other way, and
				 * userspace can't tell. Let'em fix their race.
				 */
				error = EEXIST;
				break;
			}

			if (__probable(kqworkloop_try_retain(kqwl))) {
				/*
				 * This is a valid live workloop !
				 */
				*kqwlp = kqwl;
				error = 0;
				break;
			}
			/* try_retain failed: the kqwl is dying, fall through and recreate */
		}

		if (__improbable(flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST)) {
			error = ENOENT;
			break;
		}

		/*
		 * We didn't find what we were looking for.
		 *
		 * If this is the second time we reach this point (alloc_kqwl != NULL),
		 * then we're done.
		 *
		 * If this is the first time we reach this point (alloc_kqwl == NULL),
		 * then try to allocate one without blocking.
		 */
		if (__probable(alloc_kqwl == NULL)) {
			alloc_kqwl = zalloc_flags(kqworkloop_zone, Z_NOWAIT | Z_ZERO);
		}
		if (__probable(alloc_kqwl)) {
#if CONFIG_PROC_RESOURCE_LIMITS
			fdp->num_kqwls++;
			kqworkloop_check_limit_exceeded(fdp);
#endif
			kqworkloop_init(alloc_kqwl, p, id, trp, trp_extended);
			/*
			 * The newly allocated and initialized kqwl has a retain count of 1.
			 */
			kqworkloop_hash_insert_locked(fdp, id, alloc_kqwl);
			if (trp && (trp->trp_flags & TRP_BOUND_THREAD)) {
				/*
				 * If this kqworkloop is configured to be permanently bound to
				 * a thread, we take +1 ref on that thread's behalf before we
				 * unlock the kqhash below. The reason being this new kqwl is
				 * findable in the hash table as soon as we unlock the kqhash
				 * and we want to make sure this kqwl does not get deleted from
				 * under us by the time we create a new thread and bind to it.
				 *
				 * This ref is released when the bound thread unbinds itself
				 * from the kqwl on its way to termination.
				 * See uthread_cleanup -> kqueue_threadreq_unbind.
				 *
				 * The kqwl now has a retain count of 2.
				 */
				kqworkloop_retain(alloc_kqwl);
			}
			kqhash_unlock(fdp);
			/*
			 * We do not want to keep holding kqhash lock when workq is
			 * busy creating and initializing a new thread to bind to this
			 * kqworkloop.
			 */
			if (trp && (trp->trp_flags & TRP_BOUND_THREAD)) {
				error = workq_kern_threadreq_permanent_bind(p, &alloc_kqwl->kqwl_request);
				if (error != KERN_SUCCESS) {
					/*
					 * The kqwl we just created and initialized has a retain
					 * count of 2 at this point i.e. 1 from kqworkloop_init and
					 * 1 on behalf of the bound thread. We need to release
					 * both the references here to successfully deallocate this
					 * kqwl before we return an error.
					 *
					 * The latter release should take care of deallocating
					 * the kqwl itself and removing it from the kqhash.
					 */
					kqworkloop_release(alloc_kqwl);
					kqworkloop_release(alloc_kqwl);
					alloc_kqwl = NULL;
					if (trp_extended) {
						/*
						 * Since we transferred these refs to kqwl during
						 * kqworkloop_init, the kqwl takes care of releasing them.
						 * We don't have any refs to return to our caller
						 * in this case.
						 */
#if CONFIG_PREADOPT_TG
						if (trp_extended->trp_permanent_preadopt_tg) {
							trp_extended->trp_permanent_preadopt_tg = NULL;
						}
#endif
						if (trp_extended->trp_work_interval) {
							trp_extended->trp_work_interval = NULL;
						}
					}
					return error;
				} else {
					/*
					 * For kqwl configured with a bound thread, KQ_SLEEP is used
					 * to track whether the bound thread needs to be woken up
					 * when such a kqwl is woken up.
					 *
					 * See kqworkloop_bound_thread_wakeup and
					 * kqworkloop_bound_thread_park_prepost.
					 *
					 * Once the kqwl is initialized, this state
					 * should always be manipulated under kqlock.
					 */
					kqlock(alloc_kqwl);
					alloc_kqwl->kqwl_state |= KQ_SLEEP;
					kqunlock(alloc_kqwl);
				}
			}
			*kqwlp = alloc_kqwl;
			return 0;
		}

		/*
		 * We have to block to allocate a workloop, drop the lock,
		 * allocate one, but then we need to retry lookups as someone
		 * else could race with us.
		 */
		kqhash_unlock(fdp);

		alloc_kqwl = zalloc_flags(kqworkloop_zone, Z_WAITOK | Z_ZERO);
	}

	kqhash_unlock(fdp);

	/* Lookup succeeded (or failed) before our preallocation was needed */
	if (__improbable(alloc_kqwl)) {
		zfree(kqworkloop_zone, alloc_kqwl);
	}

	return error;
}
3720
3721 #pragma mark - knotes
3722
static int
filt_no_attach(struct knote *kn, __unused struct kevent_qos_s *kev)
{
	/* Attach stub for filters that cannot be attached to: always fails */
	knote_set_error(kn, ENOTSUP);
	return 0;
}
3729
static void
filt_no_detach(__unused struct knote *kn)
{
	/* Detach stub for filters with nothing to tear down */
}
3734
static int __dead2
filt_bad_event(struct knote *kn, long hint)
{
	/* Backstop for filters that must never see f_event; reaching it is a bug */
	panic("%s[%d](%p, %ld)", __func__, kn->kn_filter, kn, hint);
}
3740
static int __dead2
filt_bad_touch(struct knote *kn, struct kevent_qos_s *kev)
{
	/* Backstop for filters that must never see f_touch; reaching it is a bug */
	panic("%s[%d](%p, %p)", __func__, kn->kn_filter, kn, kev);
}
3746
static int __dead2
filt_bad_process(struct knote *kn, struct kevent_qos_s *kev)
{
	/* Backstop for filters that must never see f_process; reaching it is a bug */
	panic("%s[%d](%p, %p)", __func__, kn->kn_filter, kn, kev);
}
3752
3753 /*
3754 * knotes_dealloc - detach all knotes for the process and drop them
3755 *
3756 * Process is in such a state that it will not try to allocate
3757 * any more knotes during this process (stopped for exit or exec).
3758 */
void
knotes_dealloc(proc_t p)
{
	struct filedesc *fdp = &p->p_fd;
	struct kqueue *kq;
	struct knote *kn;
	struct klist *kn_hash = NULL;
	u_long kn_hashmask;
	int i;

	proc_fdlock(p);

	/* Close all the fd-indexed knotes up front */
	if (fdp->fd_knlistsize > 0) {
		for (i = 0; i < fdp->fd_knlistsize; i++) {
			while ((kn = SLIST_FIRST(&fdp->fd_knlist[i])) != NULL) {
				/*
				 * Take the kqlock before dropping the fdlock so the
				 * knote stays valid across knote_drop(); the process
				 * is exiting/execing, so no new knotes can appear.
				 */
				kq = knote_get_kq(kn);
				kqlock(kq);
				proc_fdunlock(p);
				knote_drop(kq, kn, NULL);
				proc_fdlock(p);
			}
		}
		/* free the table */
		kfree_type(struct klist, fdp->fd_knlistsize, fdp->fd_knlist);
	}
	fdp->fd_knlistsize = 0;

	proc_fdunlock(p);

	knhash_lock(fdp);

	/* Clean out all the hashed knotes as well */
	if (fdp->fd_knhashmask != 0) {
		for (i = 0; i <= (int)fdp->fd_knhashmask; i++) {
			while ((kn = SLIST_FIRST(&fdp->fd_knhash[i])) != NULL) {
				kq = knote_get_kq(kn);
				kqlock(kq);
				knhash_unlock(fdp);
				knote_drop(kq, kn, NULL);
				knhash_lock(fdp);
			}
		}
		/* Detach the hash so it can be freed after the lock is dropped */
		kn_hash = fdp->fd_knhash;
		kn_hashmask = fdp->fd_knhashmask;
		fdp->fd_knhashmask = 0;
		fdp->fd_knhash = NULL;
	}

	knhash_unlock(fdp);

	if (kn_hash) {
		hashdestroy(kn_hash, M_KQUEUE, kn_hashmask);
	}
}
3814
3815 /*
3816 * kqworkloops_dealloc - rebalance retains on kqworkloops created with
3817 * scheduling parameters
3818 *
3819 * Process is in such a state that it will not try to allocate
3820 * any more kqs or knotes during this process (stopped for exit or exec).
3821 */
void
kqworkloops_dealloc(proc_t p)
{
	struct filedesc *fdp = &p->p_fd;
	struct kqworkloop *kqwl, *kqwln;
	struct kqwllist tofree;

	/* Fast exit when the process never used workloops */
	if (!fdt_flag_test(fdp, FD_WORKLOOP)) {
		return;
	}

	kqhash_lock(fdp);

	if (fdp->fd_kqhashmask == 0) {
		kqhash_unlock(fdp);
		return;
	}

	LIST_INIT(&tofree);

	/*
	 * Move every remaining workloop onto a private list so the actual
	 * deallocation can happen after the kqhash lock is dropped.
	 */
	for (size_t i = 0; i <= fdp->fd_kqhashmask; i++) {
		LIST_FOREACH_SAFE(kqwl, &fdp->fd_kqhash[i], kqwl_hashlink, kqwln) {
#if CONFIG_PREADOPT_TG
			/*
			 * kqworkloops that have scheduling parameters have an
			 * implicit retain from kqueue_workloop_ctl that needs
			 * to be balanced on process exit.
			 */
			__assert_only thread_group_qos_t preadopt_tg;
			preadopt_tg = os_atomic_load(&kqwl->kqwl_preadopt_tg, relaxed);
#endif
			assert(kqwl->kqwl_params
#if CONFIG_PREADOPT_TG
			    || KQWL_HAS_PERMANENT_PREADOPTED_TG(preadopt_tg)
#endif
			    );

			LIST_REMOVE(kqwl, kqwl_hashlink);
			LIST_INSERT_HEAD(&tofree, kqwl, kqwl_hashlink);
		}
	}
#if CONFIG_PROC_RESOURCE_LIMITS
	fdp->num_kqwls = 0;
#endif
	kqhash_unlock(fdp);

	/* Each survivor must hold exactly the ctl-created ref; drop it */
	LIST_FOREACH_SAFE(kqwl, &tofree, kqwl_hashlink, kqwln) {
		uint32_t ref = os_ref_get_count_raw(&kqwl->kqwl_retains);
		if (ref != 1) {
			panic("kq(%p) invalid refcount %d", kqwl, ref);
		}
		/* already unhashed above, hence hash_remove = false */
		kqworkloop_dealloc(kqwl, false);
	}
}
3876
3877 static int
kevent_register_validate_priority(struct kqueue * kq,struct knote * kn,struct kevent_qos_s * kev)3878 kevent_register_validate_priority(struct kqueue *kq, struct knote *kn,
3879 struct kevent_qos_s *kev)
3880 {
3881 /* We don't care about the priority of a disabled or deleted knote */
3882 if (kev->flags & (EV_DISABLE | EV_DELETE)) {
3883 return 0;
3884 }
3885
3886 if (kq->kq_state & KQ_WORKLOOP) {
3887 /*
3888 * Workloops need valid priorities with a QOS (excluding manager) for
3889 * any enabled knote.
3890 *
3891 * When it is pre-existing, just make sure it has a valid QoS as
3892 * kevent_register() will not use the incoming priority (filters who do
3893 * have the responsibility to validate it again, see filt_wltouch).
3894 *
3895 * If the knote is being made, validate the incoming priority.
3896 */
3897 if (!_pthread_priority_thread_qos(kn ? kn->kn_qos : kev->qos)) {
3898 return ERANGE;
3899 }
3900 }
3901
3902 return 0;
3903 }
3904
3905 /*
3906 * Prepare a filter for waiting after register.
3907 *
3908 * The f_post_register_wait hook will be called later by kevent_register()
3909 * and should call kevent_register_wait_block()
3910 */
static int
kevent_register_wait_prepare(struct knote *kn, struct kevent_qos_s *kev, int rc)
{
	thread_t thread = current_thread();

	/* Only filters with extended return codes may request a post-register wait */
	assert(knote_fops(kn)->f_extended_codes);

	if (kn->kn_thread == NULL) {
		/* Park a +1 thread ref on the knote for the upcoming wait */
		thread_reference(thread);
		kn->kn_thread = thread;
	} else if (kn->kn_thread != thread) {
		/*
		 * kn_thread may be set from a previous aborted wait
		 * However, it has to be from the same thread.
		 */
		kev->flags |= EV_ERROR;
		kev->data = EXDEV;
		return 0;
	}

	return FILTER_REGISTER_WAIT | rc;
}
3933
3934 /*
3935 * Cleanup a kevent_register_wait_prepare() effect for threads that have been
3936 * aborted instead of properly woken up with thread_wakeup_thread().
3937 */
static void
kevent_register_wait_cleanup(struct knote *kn)
{
	/* Return the thread ref parked by kevent_register_wait_prepare() */
	thread_t thread = kn->kn_thread;
	kn->kn_thread = NULL;
	thread_deallocate(thread);
}
3945
3946 /*
3947 * Must be called at the end of a f_post_register_wait call from a filter.
3948 */
static void
kevent_register_wait_block(struct turnstile *ts, thread_t thread,
    thread_continue_t cont, struct _kevent_register *cont_args)
{
	/* Called with the kq lock (turnstile interlock) held; it is released here */
	turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
	kqunlock(cont_args->kqwl);
	/* Record the handoff target so the continuation can drop its ref later */
	cont_args->handoff_thread = thread;
	thread_handoff_parameter(thread, cont, cont_args, THREAD_HANDOFF_NONE);
}
3958
3959 /*
3960 * Called by Filters using a f_post_register_wait to return from their wait.
3961 */
3962 static void
kevent_register_wait_return(struct _kevent_register * cont_args)3963 kevent_register_wait_return(struct _kevent_register *cont_args)
3964 {
3965 struct kqworkloop *kqwl = cont_args->kqwl;
3966 struct kevent_qos_s *kev = &cont_args->kev;
3967 int error = 0;
3968
3969 if (cont_args->handoff_thread) {
3970 thread_deallocate(cont_args->handoff_thread);
3971 }
3972
3973 if (kev->flags & (EV_ERROR | EV_RECEIPT)) {
3974 if ((kev->flags & EV_ERROR) == 0) {
3975 kev->flags |= EV_ERROR;
3976 kev->data = 0;
3977 }
3978 error = kevent_modern_copyout(kev, &cont_args->ueventlist);
3979 if (error == 0) {
3980 cont_args->eventout++;
3981 }
3982 }
3983
3984 kqworkloop_release(kqwl);
3985 if (error == 0) {
3986 *(int32_t *)¤t_uthread()->uu_rval = cont_args->eventout;
3987 }
3988 unix_syscall_return(error);
3989 }
3990
/*
 * kevent_register - add a new event to a kqueue
 *
 * Creates a mapping between the event source and
 * the kqueue via a knote data structure.
 *
 * Because many/most the event sources are file
 * descriptor related, the knote is linked off
 * the filedescriptor table for quick access.
 *
 * called with nothing locked
 * caller holds a reference on the kqueue
 *
 * Returns a mask of FILTER_* result bits from the filter (0 on plain
 * failure paths); local errors are reported through kev itself by setting
 * EV_ERROR and storing the errno in kev->data.  When the filter requested
 * a post-register wait (FILTER_REGISTER_WAIT), the kqueue is left LOCKED
 * and the knote is returned through *kn_out.
 */

int
kevent_register(struct kqueue *kq, struct kevent_qos_s *kev,
    struct knote **kn_out)
{
	struct proc *p = kq->kq_p;
	const struct filterops *fops;
	struct knote *kn = NULL;
	int result = 0, error = 0;
	/* snapshot caller flags: EV_DELETE/EV_DISABLE normalization edits kev->flags */
	unsigned short kev_flags = kev->flags;
	KNOTE_LOCK_CTX(knlc);

	/* system filters are negative; ~filter maps them to a 0-based index */
	if (__probable(kev->filter < 0 && kev->filter + EVFILT_SYSCOUNT >= 0)) {
		fops = sysfilt_ops[~kev->filter]; /* to 0-base index */
	} else {
		error = EINVAL;
		goto out;
	}

	/* restrict EV_VANISHED to adding udata-specific dispatch kevents */
	if (__improbable((kev->flags & EV_VANISHED) &&
	    (kev->flags & (EV_ADD | EV_DISPATCH2)) != (EV_ADD | EV_DISPATCH2))) {
		error = EINVAL;
		goto out;
	}

	/* Simplify the flags - delete and disable overrule */
	if (kev->flags & EV_DELETE) {
		kev->flags &= ~EV_ADD;
	}
	if (kev->flags & EV_DISABLE) {
		kev->flags &= ~EV_ENABLE;
	}

	if (kq->kq_state & KQ_WORKLOOP) {
		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_REGISTER),
		    ((struct kqworkloop *)kq)->kqwl_dynamicid,
		    kev->udata, kev->flags, kev->filter);
	} else if (kq->kq_state & KQ_WORKQ) {
		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWQ_REGISTER),
		    0, kev->udata, kev->flags, kev->filter);
	} else {
		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQ_REGISTER),
		    VM_KERNEL_UNSLIDE_OR_PERM(kq),
		    kev->udata, kev->flags, kev->filter);
	}

restart:
	/* find the matching knote from the fd tables/hashes */
	kn = kq_find_knote_and_kq_lock(kq, kev, fops->f_isfd, p);
	error = kevent_register_validate_priority(kq, kn, kev);
	result = 0;
	if (error) {
		/* the lookup returns with the kq locked only when a knote was found */
		if (kn) {
			kqunlock(kq);
		}
		goto out;
	}

	if (kn == NULL && (kev->flags & EV_ADD) == 0) {
		/*
		 * No knote found, EV_ADD wasn't specified
		 */

		if ((kev_flags & EV_ADD) && (kev_flags & EV_DELETE) &&
		    (kq->kq_state & KQ_WORKLOOP)) {
			/*
			 * For workloops, understand EV_ADD|EV_DELETE as a "soft" delete
			 * that doesn't care about ENOENT, so just pretend the deletion
			 * happened.
			 */
		} else {
			error = ENOENT;
		}
		goto out;
	} else if (kn == NULL) {
		/*
		 * No knote found, need to attach a new one (attach)
		 */

		struct fileproc *knote_fp = NULL;

		/* grab a file reference for the new knote */
		if (fops->f_isfd) {
			if ((error = fp_lookup(p, (int)kev->ident, &knote_fp, 0)) != 0) {
				goto out;
			}
		}

		kn = knote_alloc();
		kn->kn_fp = knote_fp;
		kn->kn_is_fd = fops->f_isfd;
		/* the back-pointer to the kqueue is stored packed */
		kn->kn_kq_packed = VM_PACK_POINTER((vm_offset_t)kq, KNOTE_KQ_PACKED);
		kn->kn_status = 0;

		/* was vanish support requested */
		if (kev->flags & EV_VANISHED) {
			kev->flags &= ~EV_VANISHED;
			kn->kn_status |= KN_REQVANISH;
		}

		/* snapshot matching/dispatching protocol flags into knote */
		if (kev->flags & EV_DISABLE) {
			kn->kn_status |= KN_DISABLED;
		}

		/*
		 * copy the kevent state into knote
		 * protocol is that fflags and data
		 * are saved off, and cleared before
		 * calling the attach routine.
		 *
		 * - kn->kn_sfflags aliases with kev->xflags
		 * - kn->kn_sdata aliases with kev->data
		 * - kn->kn_filter is the top 8 bits of kev->filter
		 */
		kn->kn_kevent = *(struct kevent_internal_s *)kev;
		kn->kn_sfflags = kev->fflags;
		kn->kn_filtid = (uint8_t)~kev->filter;
		kn->kn_fflags = 0;
		knote_reset_priority(kq, kn, kev->qos);

		/* Add the knote for lookup thru the fd table */
		error = kq_add_knote(kq, kn, &knlc, p);
		if (error) {
			knote_free(kn);
			if (knote_fp != NULL) {
				fp_drop(p, (int)kev->ident, knote_fp, 0);
			}

			/* ERESTART: a concurrent registration raced us, redo the lookup */
			if (error == ERESTART) {
				goto restart;
			}
			goto out;
		}

		/* fp reference count now applies to knote */

		/*
		 * we can't use filter_call() because f_attach can change the filter ops
		 * for a filter that supports f_extended_codes, so we need to reload
		 * knote_fops() and not use `fops`.
		 */
		result = fops->f_attach(kn, kev);
		if (result && !knote_fops(kn)->f_extended_codes) {
			/* non-extended filters report fired-at-attach as nonzero */
			result = FILTER_ACTIVE;
		}

		kqlock(kq);

		/* the filter disabled preemption for a thread request; re-enable now */
		if (result & FILTER_THREADREQ_NODEFEER) {
			enable_preemption();
		}

		if (kn->kn_flags & EV_ERROR) {
			/*
			 * Failed to attach correctly, so drop.
			 * (the filter stored the errno in kn_sdata)
			 */
			kn->kn_filtid = EVFILTID_DETACHED;
			error = (int)kn->kn_sdata;
			knote_drop(kq, kn, &knlc);
			result = 0;
			goto out;
		}

		/*
		 * end "attaching" phase - now just attached
		 *
		 * Mark the thread request overcommit, if appropos
		 *
		 * If the attach routine indicated that an
		 * event is already fired, activate the knote.
		 */
		if ((kn->kn_qos & _PTHREAD_PRIORITY_OVERCOMMIT_FLAG) &&
		    (kq->kq_state & KQ_WORKLOOP)) {
			kqworkloop_set_overcommit((struct kqworkloop *)kq);
		}
	} else if (!knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ON_SUCCESS)) {
		/*
		 * The knote was dropped while we were waiting for the lock,
		 * we need to re-evaluate entirely
		 */

		goto restart;
	} else if (kev->flags & EV_DELETE) {
		/*
		 * Deletion of a knote (drop)
		 *
		 * If the filter wants to filter drop events, let it do so.
		 *
		 * defer-delete: when trying to delete a disabled EV_DISPATCH2 knote,
		 * we must wait for the knote to be re-enabled (unless it is being
		 * re-enabled atomically here).
		 */

		if (knote_fops(kn)->f_allow_drop) {
			bool drop;

			/* the f_allow_drop callout runs without the kq lock */
			kqunlock(kq);
			drop = knote_fops(kn)->f_allow_drop(kn, kev);
			kqlock(kq);

			if (!drop) {
				goto out_unlock;
			}
		}

		if ((kev->flags & EV_ENABLE) == 0 &&
		    (kn->kn_flags & EV_DISPATCH2) == EV_DISPATCH2 &&
		    (kn->kn_status & KN_DISABLED) != 0) {
			kn->kn_status |= KN_DEFERDELETE;
			error = EINPROGRESS;
			goto out_unlock;
		}

		knote_drop(kq, kn, &knlc);
		goto out;
	} else {
		/*
		 * Regular update of a knote (touch)
		 *
		 * Call touch routine to notify filter of changes in filter values
		 * (and to re-determine if any events are fired).
		 *
		 * If the knote is in defer-delete, avoid calling the filter touch
		 * routine (it has delivered its last event already).
		 *
		 * If the touch routine had no failure,
		 * apply the requested side effects to the knote.
		 */

		if (kn->kn_status & (KN_DEFERDELETE | KN_VANISHED)) {
			if (kev->flags & EV_ENABLE) {
				result = FILTER_ACTIVE;
			}
		} else {
			/* the f_touch callout runs without the kq lock */
			kqunlock(kq);
			result = filter_call(knote_fops(kn), f_touch(kn, kev));
			kqlock(kq);
			if (result & FILTER_THREADREQ_NODEFEER) {
				enable_preemption();
			}
		}

		if (kev->flags & EV_ERROR) {
			/* the filter reported the error through kev directly */
			result = 0;
			goto out_unlock;
		}

		if ((kn->kn_flags & EV_UDATA_SPECIFIC) == 0 &&
		    kn->kn_udata != kev->udata) {
			// this allows klist_copy_udata() not to take locks
			os_atomic_store_wide(&kn->kn_udata, kev->udata, relaxed);
		}
		if ((kev->flags & EV_DISABLE) && !(kn->kn_status & KN_DISABLED)) {
			kn->kn_status |= KN_DISABLED;
			knote_dequeue(kq, kn);
		}
	}

	/* accept new kevent state */
	knote_apply_touch(kq, kn, kev, result);

out_unlock:
	/*
	 * When the filter asked for a post-register wait,
	 * we leave the kqueue locked for kevent_register()
	 * to call the filter's f_post_register_wait hook.
	 */
	if (result & FILTER_REGISTER_WAIT) {
		knote_unlock(kq, kn, &knlc, KNOTE_KQ_LOCK_ALWAYS);
		*kn_out = kn;
	} else {
		knote_unlock(kq, kn, &knlc, KNOTE_KQ_UNLOCK);
	}

out:
	/* output local errors through the kevent */
	if (error) {
		kev->flags |= EV_ERROR;
		kev->data = error;
	}
	return result;
}
4288
/*
 * knote_process - process a triggered event
 *
 * Validate that it is really still a triggered event
 * by calling the filter routines (if necessary). Hold
 * a use reference on the knote to avoid it being detached.
 *
 * If it is still considered triggered, we will have taken
 * a copy of the state under the filter lock. We use that
 * snapshot to dispatch the knote for future processing (or
 * not, if this was a lost event).
 *
 * Our caller assures us that nobody else can be processing
 * events from this knote during the whole operation. But
 * others can be touching or posting events to the knote
 * interspersed with our processing it.
 *
 * caller holds a reference on the kqueue.
 * kqueue locked on entry and exit - but may be dropped
 *
 * Returns 0 on successful delivery, EJUSTRETURN when there was nothing to
 * deliver, or the error from the delivery callback.
 */
static int
knote_process(struct knote *kn, kevent_ctx_t kectx,
    kevent_callback_t callback)
{
	struct kevent_qos_s kev;
	struct kqueue *kq = knote_get_kq(kn);
	KNOTE_LOCK_CTX(knlc);
	int result = FILTER_ACTIVE;
	int error = 0;
	bool drop = false;

	/*
	 * Must be active
	 * Must be queued and not disabled/suppressed or dropping
	 */
	assert(kn->kn_status & KN_QUEUED);
	assert(kn->kn_status & KN_ACTIVE);
	assert(!(kn->kn_status & (KN_DISABLED | KN_SUPPRESSED | KN_DROPPING)));

	if (kq->kq_state & KQ_WORKLOOP) {
		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS),
		    ((struct kqworkloop *)kq)->kqwl_dynamicid,
		    kn->kn_udata, kn->kn_status | (kn->kn_id << 32),
		    kn->kn_filtid);
	} else if (kq->kq_state & KQ_WORKQ) {
		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWQ_PROCESS),
		    0, kn->kn_udata, kn->kn_status | (kn->kn_id << 32),
		    kn->kn_filtid);
	} else {
		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQ_PROCESS),
		    VM_KERNEL_UNSLIDE_OR_PERM(kq), kn->kn_udata,
		    kn->kn_status | (kn->kn_id << 32), kn->kn_filtid);
	}

	if (!knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ALWAYS)) {
		/*
		 * When the knote is dropping or has dropped,
		 * then there's nothing we want to process.
		 */
		return EJUSTRETURN;
	}

	/*
	 * While waiting for the knote lock, we may have dropped the kq lock.
	 * and a touch may have disabled and dequeued the knote.
	 */
	if (!(kn->kn_status & KN_QUEUED)) {
		knote_unlock(kq, kn, &knlc, KNOTE_KQ_LOCK_ALWAYS);
		return EJUSTRETURN;
	}

	/*
	 * For deferred-drop or vanished events, we just create a fake
	 * event to acknowledge end-of-life. Otherwise, we call the
	 * filter's process routine to snapshot the kevent state under
	 * the filter's locking protocol.
	 *
	 * suppress knotes to avoid returning the same event multiple times in
	 * a single call.
	 */
	knote_suppress(kq, kn);

	if (kn->kn_status & (KN_DEFERDELETE | KN_VANISHED)) {
		/* end-of-life acknowledgment: synthesize the final event */
		uint16_t kev_flags = EV_DISPATCH2 | EV_ONESHOT;
		if (kn->kn_status & KN_DEFERDELETE) {
			kev_flags |= EV_DELETE;
		} else {
			kev_flags |= EV_VANISHED;
		}

		/* create fake event */
		kev = (struct kevent_qos_s){
			.filter = kn->kn_filter,
			.ident = kn->kn_id,
			.flags = kev_flags,
			.udata = kn->kn_udata,
		};
	} else {
		/* the f_process callout runs without the kq lock */
		kqunlock(kq);
		kev = (struct kevent_qos_s) { };
		result = filter_call(knote_fops(kn), f_process(kn, &kev));
		kqlock(kq);
	}

	/*
	 * Determine how to dispatch the knote for future event handling.
	 * not-fired: just return (do not callout, leave deactivated).
	 * One-shot: If dispatch2, enter deferred-delete mode (unless this is
	 *           is the deferred delete event delivery itself). Otherwise,
	 *           drop it.
	 * Dispatch: don't clear state, just mark it disabled.
	 * Cleared: just leave it deactivated.
	 * Others: re-activate as there may be more events to handle.
	 *          This will not wake up more handlers right now, but
	 *          at the completion of handling events it may trigger
	 *          more handler threads (TODO: optimize based on more than
	 *          just this one event being detected by the filter).
	 */
	if ((result & FILTER_ACTIVE) == 0) {
		if ((kn->kn_status & KN_ACTIVE) == 0) {
			/*
			 * Some knotes (like EVFILT_WORKLOOP) can be reactivated from
			 * within f_process() but that doesn't necessarily make them
			 * ready to process, so we should leave them be.
			 *
			 * For other knotes, since we will not return an event,
			 * there's no point keeping the knote suppressed.
			 */
			knote_unsuppress(kq, kn);
		}
		knote_unlock(kq, kn, &knlc, KNOTE_KQ_LOCK_ALWAYS);
		return EJUSTRETURN;
	}

	if (result & FILTER_ADJUST_EVENT_QOS_BIT) {
		knote_adjust_qos(kq, kn, result);
	}

	if (result & FILTER_ADJUST_EVENT_IOTIER_BIT) {
		kqueue_update_iotier_override(kq);
	}

	/* deliver the event at the QoS the knote was overridden to */
	kev.qos = _pthread_priority_combine(kn->kn_qos, kn->kn_qos_override);

	if (kev.flags & EV_ONESHOT) {
		if ((kn->kn_flags & EV_DISPATCH2) == EV_DISPATCH2 &&
		    (kn->kn_status & KN_DEFERDELETE) == 0) {
			/* defer dropping non-delete oneshot dispatch2 events */
			kn->kn_status |= KN_DEFERDELETE | KN_DISABLED;
		} else {
			drop = true;
		}
	} else if (kn->kn_flags & EV_DISPATCH) {
		/* disable all dispatch knotes */
		kn->kn_status |= KN_DISABLED;
	} else if ((kn->kn_flags & EV_CLEAR) == 0) {
		/* re-activate in case there are more events */
		knote_activate(kq, kn, FILTER_ACTIVE);
	}

	/*
	 * callback to handle each event as we find it.
	 * If we have to detach and drop the knote, do
	 * it while we have the kq unlocked.
	 */
	if (drop) {
		knote_drop(kq, kn, &knlc);
	} else {
		knote_unlock(kq, kn, &knlc, KNOTE_KQ_UNLOCK);
	}

	if (kev.flags & EV_VANISHED) {
		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KNOTE_VANISHED),
		    kev.ident, kn->kn_udata, kn->kn_status | (kn->kn_id << 32),
		    kn->kn_filtid);
	}

	/* deliver the snapshot to the caller; kq lock is dropped here */
	error = (callback)(&kev, kectx);
	kqlock(kq);
	return error;
}
4470
/*
 * Returns -1 if the kqueue was unbound and processing should not happen
 */
#define KQWQAE_BEGIN_PROCESSING 1
#define KQWQAE_END_PROCESSING   2
#define KQWQAE_UNBIND           3
/*
 * Common helper for the begin/end-processing and unbind paths of a workq
 * kqueue: unsuppresses the knotes for the request's QoS bucket and, when
 * unbinding is required (explicit KQWQAE_UNBIND, or parking with an empty
 * queue), unbinds the servicer thread and re-requests one if events remain.
 *
 * Called with the kqworkq lock held.
 */
static int
kqworkq_acknowledge_events(struct kqworkq *kqwq, workq_threadreq_t kqr,
    int kevent_flags, int kqwqae_op)
{
	struct knote *kn;
	int rc = 0;
	bool unbind;
	/* per-QoS buckets are indexed by tr_kq_qos_index - 1 */
	struct kqtailq *suppressq = &kqwq->kqwq_suppressed[kqr->tr_kq_qos_index - 1];
	struct kqtailq *queue = &kqwq->kqwq_queue[kqr->tr_kq_qos_index - 1];

	kqlock_held(&kqwq->kqwq_kqueue);

	/*
	 * Return suppressed knotes to their original state.
	 * For workq kqueues, suppressed ones that are still
	 * truly active (not just forced into the queue) will
	 * set flags we check below to see if anything got
	 * woken up.
	 */
	while ((kn = TAILQ_FIRST(suppressq)) != NULL) {
		knote_unsuppress(kqwq, kn);
	}

	if (kqwqae_op == KQWQAE_UNBIND) {
		unbind = true;
	} else if ((kevent_flags & KEVENT_FLAG_PARKING) == 0) {
		unbind = false;
	} else {
		/* parking: only unbind when this QoS bucket drained completely */
		unbind = TAILQ_EMPTY(queue);
	}
	if (unbind) {
		thread_t thread = kqr_thread_fast(kqr);
		thread_qos_t old_override;

#if MACH_ASSERT
		thread_t self = current_thread();
		struct uthread *ut = get_bsdthread_info(self);

		/* only the bound servicer thread may acknowledge/unbind */
		assert(thread == self);
		assert(ut->uu_kqr_bound == kqr);
#endif // MACH_ASSERT

		old_override = kqworkq_unbind_locked(kqwq, kqr, thread);
		if (!TAILQ_EMPTY(queue)) {
			/*
			 * Request a new thread if we didn't process the whole
			 * queue.
			 */
			kqueue_threadreq_initiate(&kqwq->kqwq_kqueue, kqr,
			    kqr->tr_kq_qos_index, 0);
		}
		if (old_override) {
			/* drop the kevent override the unbind returned to us */
			thread_drop_kevent_override(thread);
		}
		rc = -1;
	}

	return rc;
}
4536
/*
 * Return 0 to indicate that processing should proceed,
 * -1 if there is nothing to process.
 *
 * Called with kqueue locked and returns the same way,
 * but may drop lock temporarily.
 */
static int
kqworkq_begin_processing(struct kqworkq *kqwq, workq_threadreq_t kqr,
    int kevent_flags)
{
	int rc = 0;

	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWQ_PROCESS_BEGIN) | DBG_FUNC_START,
	    0, kqr->tr_kq_qos_index);

	/* -1 from the acknowledge means this thread got unbound */
	rc = kqworkq_acknowledge_events(kqwq, kqr, kevent_flags,
	    KQWQAE_BEGIN_PROCESSING);

	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWQ_PROCESS_BEGIN) | DBG_FUNC_END,
	    thread_tid(kqr_thread(kqr)),
	    !TAILQ_EMPTY(&kqwq->kqwq_queue[kqr->tr_kq_qos_index - 1]));

	return rc;
}
4562
/*
 * Unsuppress the workloop's suppressed knotes, except for QoS-adjusting
 * EV_DISPATCH knotes that were auto-disabled (those stay suppressed so
 * their overrides keep pushing).
 *
 * Returns the maximum QoS override among the knotes that remain
 * suppressed (THREAD_QOS_UNSPECIFIED when none remain).
 *
 * Called with the kqworkloop lock held.
 */
static thread_qos_t
kqworkloop_acknowledge_events(struct kqworkloop *kqwl)
{
	kq_index_t qos = THREAD_QOS_UNSPECIFIED;
	struct knote *kn, *tmp;

	kqlock_held(kqwl);

	/* _SAFE: knote_unsuppress() removes kn from the suppressed list */
	TAILQ_FOREACH_SAFE(kn, &kqwl->kqwl_suppressed, kn_tqe, tmp) {
		/*
		 * If a knote that can adjust QoS is disabled because of the automatic
		 * behavior of EV_DISPATCH, the knotes should stay suppressed so that
		 * further overrides keep pushing.
		 */
		if (knote_fops(kn)->f_adjusts_qos &&
		    (kn->kn_status & KN_DISABLED) != 0 &&
		    (kn->kn_status & KN_DROPPING) == 0 &&
		    (kn->kn_flags & (EV_DISPATCH | EV_DISABLE)) == EV_DISPATCH) {
			qos = MAX(qos, kn->kn_qos_override);
			continue;
		}
		knote_unsuppress(kqwl, kn);
	}

	return qos;
}
4589
/*
 * Enter processing mode for a workloop kqueue.
 *
 * Return 0 to indicate that processing should proceed,
 * -1 if there is nothing to process (or the servicer got unbound).
 *
 * Called with the kqworkloop lock held; may be dropped by callees.
 */
static int
kqworkloop_begin_processing(struct kqworkloop *kqwl, unsigned int kevent_flags)
{
	workq_threadreq_t kqr = &kqwl->kqwl_request;
	struct kqueue *kq = &kqwl->kqwl_kqueue;
	int rc = 0, op = KQWL_UTQ_NONE;

	kqlock_held(kq);

	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_BEGIN) | DBG_FUNC_START,
	    kqwl->kqwl_dynamicid, 0, 0);

	/* nobody else should still be processing */
	assert((kq->kq_state & KQ_PROCESSING) == 0);

	kq->kq_state |= KQ_PROCESSING;

	if (kevent_flags & KEVENT_FLAG_PARKING) {
		/*
		 * When "parking" we want to process events and if no events are found
		 * unbind. (Except for WORKQ_TR_FLAG_PERMANENT_BIND where the soft unbind
		 * and bound thread park happen in the caller.)
		 *
		 * However, non overcommit threads sometimes park even when they have
		 * more work so that the pool can narrow. For these, we need to unbind
		 * early, so that calling kqworkloop_update_threads_qos() can ask the
		 * workqueue subsystem whether the thread should park despite having
		 * pending events.
		 *
		 */
		if (kqr->tr_flags & (WORKQ_TR_FLAG_OVERCOMMIT | WORKQ_TR_FLAG_PERMANENT_BIND)) {
			op = KQWL_UTQ_PARKING;
		} else {
			op = KQWL_UTQ_UNBINDING;
		}
	} else if (!TAILQ_EMPTY(&kqwl->kqwl_suppressed)) {
		/* not parking, but stale wakeup overrides may need resetting */
		op = KQWL_UTQ_RESET_WAKEUP_OVERRIDE;
	}

	if (op != KQWL_UTQ_NONE) {
		thread_qos_t qos_override;
		thread_t thread = kqr_thread_fast(kqr);

		qos_override = kqworkloop_acknowledge_events(kqwl);

		if (op == KQWL_UTQ_UNBINDING) {
			kqworkloop_unbind_locked(kqwl, thread,
			    KQWL_OVERRIDE_DROP_IMMEDIATELY, 0);
			kqworkloop_release_live(kqwl);
		}
		kqworkloop_update_threads_qos(kqwl, op, qos_override);
		if (op == KQWL_UTQ_PARKING &&
		    (!kqwl->kqwl_count || kqwl->kqwl_owner)) {
			/* queue drained (or owned): the overcommit servicer can unbind */
			if ((kqr->tr_flags & WORKQ_TR_FLAG_OVERCOMMIT) &&
			    (!(kqr->tr_flags & WORKQ_TR_FLAG_PERMANENT_BIND))) {
				kqworkloop_unbind_locked(kqwl, thread,
				    KQWL_OVERRIDE_DROP_DELAYED, 0);
				kqworkloop_release_live(kqwl);
			}
			rc = -1; /* To indicate stop begin processing. */
		} else if (op == KQWL_UTQ_UNBINDING &&
		    kqr_thread(kqr) != thread) {
			/* the request was re-bound to some other thread */
			rc = -1; /* To indicate stop begin processing. */
		}

		if (rc == -1) {
			kq->kq_state &= ~KQ_PROCESSING;
			if (kqr->tr_flags & WORKQ_TR_FLAG_PERMANENT_BIND) {
				/* permanently-bound threads keep their override; park in caller */
				goto done;
			}
			kqworkloop_unbind_delayed_override_drop(thread);
		}
	}
done:
	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_BEGIN) | DBG_FUNC_END,
	    kqwl->kqwl_dynamicid, 0, 0);

	return rc;
}
4669
/*
 * Return 0 to indicate that processing should proceed,
 * -1 if there is nothing to process.
 * EBADF if the kqueue is draining
 *
 * Called with kqueue locked and returns the same way,
 * but may drop lock temporarily.
 * May block.
 */
static int
kqfile_begin_processing(struct kqfile *kq)
{
	kqlock_held(kq);

	assert((kq->kqf_state & (KQ_WORKQ | KQ_WORKLOOP)) == 0);
	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) | DBG_FUNC_START,
	    VM_KERNEL_UNSLIDE_OR_PERM(kq), 0);

	/* wait to become the exclusive processing thread */
	while ((kq->kqf_state & (KQ_PROCESSING | KQ_DRAIN)) == KQ_PROCESSING) {
		kq->kqf_state |= KQ_PROCWAIT;
		/* sleep channel is &kqf_suppressed; kqfile_end_processing() wakes it */
		lck_spin_sleep(&kq->kqf_lock, LCK_SLEEP_DEFAULT,
		    &kq->kqf_suppressed, THREAD_UNINT | THREAD_WAIT_NOREPORT);
	}

	if (kq->kqf_state & KQ_DRAIN) {
		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) | DBG_FUNC_END,
		    VM_KERNEL_UNSLIDE_OR_PERM(kq), 2);
		return EBADF;
	}

	/* Nobody else processing */

	/* anything left to process? */
	if (kq->kqf_count == 0) {
		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) | DBG_FUNC_END,
		    VM_KERNEL_UNSLIDE_OR_PERM(kq), 1);
		return -1;
	}

	/* convert to processing mode */
	kq->kqf_state |= KQ_PROCESSING;

	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) | DBG_FUNC_END,
	    VM_KERNEL_UNSLIDE_OR_PERM(kq), 0);
	return 0;
}
4717
4718 /*
4719 * Try to end the processing, only called when a workq thread is attempting to
4720 * park (KEVENT_FLAG_PARKING is set).
4721 *
4722 * When returning -1, the kqworkq is setup again so that it is ready to be
4723 * processed.
4724 */
4725 static int
kqworkq_end_processing(struct kqworkq * kqwq,workq_threadreq_t kqr,int kevent_flags)4726 kqworkq_end_processing(struct kqworkq *kqwq, workq_threadreq_t kqr,
4727 int kevent_flags)
4728 {
4729 if (kevent_flags & KEVENT_FLAG_PARKING) {
4730 /*
4731 * if acknowledge events "succeeds" it means there are events,
4732 * which is a failure condition for end_processing.
4733 */
4734 int rc = kqworkq_acknowledge_events(kqwq, kqr, kevent_flags,
4735 KQWQAE_END_PROCESSING);
4736 if (rc == 0) {
4737 return -1;
4738 }
4739 }
4740
4741 return 0;
4742 }
4743
/*
 * Try to end the processing, only called when a workq thread is attempting to
 * park (KEVENT_FLAG_PARKING is set).
 *
 * When returning -1, the kqworkq is setup again so that it is ready to be
 * processed (as if kqworkloop_begin_processing had just been called).
 *
 * If successful and KEVENT_FLAG_PARKING was set in the kevent_flags,
 * the kqworkloop is unbound from its servicer as a side effect.
 *
 * Called with the kqworkloop lock held; `flags` are the KQ_* state bits to
 * clear (must include KQ_PROCESSING when parking).
 */
static int
kqworkloop_end_processing(struct kqworkloop *kqwl, int flags, int kevent_flags)
{
	struct kqueue *kq = &kqwl->kqwl_kqueue;
	workq_threadreq_t kqr = &kqwl->kqwl_request;
	int rc = 0;

	kqlock_held(kq);

	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_END) | DBG_FUNC_START,
	    kqwl->kqwl_dynamicid, 0, 0);

	if (kevent_flags & KEVENT_FLAG_PARKING) {
		thread_t thread = kqr_thread_fast(kqr);
		thread_qos_t qos_override;

		/*
		 * When KEVENT_FLAG_PARKING is set, we need to attempt
		 * an unbind while still under the lock.
		 *
		 * So we do everything kqworkloop_unbind() would do, but because
		 * we're inside kqueue_process(), if the workloop actually
		 * received events while our locks were dropped, we have
		 * the opportunity to fail the end processing and loop again.
		 *
		 * This avoids going through the process-wide workqueue lock
		 * hence scales better.
		 */
		assert(flags & KQ_PROCESSING);
		qos_override = kqworkloop_acknowledge_events(kqwl);
		kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_PARKING, qos_override);

		if (kqwl->kqwl_wakeup_qos && !kqwl->kqwl_owner) {
			/* new events arrived while unlocked: keep KQ_PROCESSING set */
			rc = -1; /* To indicate we should continue processing. */
		} else {
			if (kqr_thread_permanently_bound(kqr)) {
				/*
				 * For these, the actual soft unbind and bound thread park
				 * happen in the caller.
				 */
				kq->kq_state &= ~flags;
			} else {
				kqworkloop_unbind_locked(kqwl, thread, KQWL_OVERRIDE_DROP_DELAYED, 0);
				kqworkloop_release_live(kqwl);
				kq->kq_state &= ~flags;
				kqworkloop_unbind_delayed_override_drop(thread);
			}
		}
	} else {
		/* not parking: re-arm the ready-to-kernel notification */
		kq->kq_state &= ~flags;
		kq->kq_state |= KQ_R2K_ARMED;
		kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_RECOMPUTE_WAKEUP_QOS, 0);
	}

	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_END) | DBG_FUNC_END,
	    kqwl->kqwl_dynamicid, 0, 0);

	return rc;
}
4813
/*
 * Called with kqueue lock held.
 *
 * 0: no more events
 * -1: has more events
 * EBADF: kqueue is in draining mode
 */
static int
kqfile_end_processing(struct kqfile *kq)
{
	struct knote *kn;
	int procwait;

	kqlock_held(kq);

	assert((kq->kqf_state & (KQ_WORKQ | KQ_WORKLOOP)) == 0);

	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_END),
	    VM_KERNEL_UNSLIDE_OR_PERM(kq), 0);

	/*
	 * Return suppressed knotes to their original state.
	 */
	while ((kn = TAILQ_FIRST(&kq->kqf_suppressed)) != NULL) {
		knote_unsuppress(kq, kn);
	}

	procwait = (kq->kqf_state & KQ_PROCWAIT);
	kq->kqf_state &= ~(KQ_PROCESSING | KQ_PROCWAIT);

	if (procwait) {
		/*
		 * first wake up any thread already waiting to process
		 * (sleeping on &kqf_suppressed in kqfile_begin_processing())
		 */
		thread_wakeup(&kq->kqf_suppressed);
	}

	if (kq->kqf_state & KQ_DRAIN) {
		return EBADF;
	}
	return kq->kqf_count != 0 ? -1 : 0;
}
4854
4855 static int
kqueue_workloop_ctl_internal(proc_t p,uintptr_t cmd,uint64_t __unused options,struct kqueue_workloop_params * params,int * retval)4856 kqueue_workloop_ctl_internal(proc_t p, uintptr_t cmd, uint64_t __unused options,
4857 struct kqueue_workloop_params *params, int *retval)
4858 {
4859 int error = 0;
4860 struct kqworkloop *kqwl;
4861 struct filedesc *fdp = &p->p_fd;
4862 workq_threadreq_param_t trp = { };
4863 struct workq_threadreq_extended_param_s trp_extended = {0};
4864 integer_t trp_preadopt_priority = 0;
4865 integer_t trp_preadopt_policy = 0;
4866
4867 switch (cmd) {
4868 case KQ_WORKLOOP_CREATE:
4869 if (!params->kqwlp_flags) {
4870 error = EINVAL;
4871 break;
4872 }
4873
4874 if ((params->kqwlp_flags & KQ_WORKLOOP_CREATE_SCHED_PRI) &&
4875 (params->kqwlp_sched_pri < 1 ||
4876 params->kqwlp_sched_pri > 63 /* MAXPRI_USER */)) {
4877 error = EINVAL;
4878 break;
4879 }
4880
4881 if ((params->kqwlp_flags & KQ_WORKLOOP_CREATE_SCHED_POL) &&
4882 invalid_policy(params->kqwlp_sched_pol)) {
4883 error = EINVAL;
4884 break;
4885 }
4886
4887 if ((params->kqwlp_flags & KQ_WORKLOOP_CREATE_CPU_PERCENT) &&
4888 (params->kqwlp_cpu_percent <= 0 ||
4889 params->kqwlp_cpu_percent > 100 ||
4890 params->kqwlp_cpu_refillms <= 0 ||
4891 params->kqwlp_cpu_refillms > 0x00ffffff)) {
4892 error = EINVAL;
4893 break;
4894 }
4895
4896 if (params->kqwlp_flags & KQ_WORKLOOP_CREATE_WITH_BOUND_THREAD) {
4897 if (!bootarg_thread_bound_kqwl_support_enabled) {
4898 error = ENOTSUP;
4899 break;
4900 }
4901 trp.trp_flags |= TRP_BOUND_THREAD;
4902 }
4903
4904 if (params->kqwlp_flags & KQ_WORKLOOP_CREATE_WORK_INTERVAL) {
4905 /*
4906 * This flag serves the purpose of preadopting tg from work interval
4907 * on servicer/creator/bound thread at wakeup/creation time in kernel.
4908 *
4909 * Additionally, it helps the bound thread join the work interval
4910 * before it comes out to userspace for the first time.
4911 */
4912 struct work_interval *work_interval = NULL;
4913 kern_return_t kr;
4914
4915 kr = kern_port_name_to_work_interval(params->kqwl_wi_port,
4916 &work_interval);
4917 if (kr != KERN_SUCCESS) {
4918 error = EINVAL;
4919 break;
4920 }
4921 /* work_interval has a +1 ref */
4922
4923 kr = kern_work_interval_get_policy(work_interval,
4924 &trp_preadopt_policy,
4925 &trp_preadopt_priority);
4926 if (kr != KERN_SUCCESS) {
4927 kern_work_interval_release(work_interval);
4928 error = EINVAL;
4929 break;
4930 }
4931 /* The work interval comes with scheduling policy. */
4932 if (trp_preadopt_policy) {
4933 trp.trp_flags |= TRP_POLICY;
4934 trp.trp_pol = (uint8_t)trp_preadopt_policy;
4935
4936 trp.trp_flags |= TRP_PRIORITY;
4937 trp.trp_pri = (uint8_t)trp_preadopt_priority;
4938 }
4939 #if CONFIG_PREADOPT_TG
4940 kr = kern_work_interval_get_thread_group(work_interval,
4941 &trp_extended.trp_permanent_preadopt_tg);
4942 if (kr != KERN_SUCCESS) {
4943 kern_work_interval_release(work_interval);
4944 error = EINVAL;
4945 break;
4946 }
4947 /*
4948 * In case of KERN_SUCCESS, we take
4949 * : +1 ref on a thread group backing this work interval
4950 * via kern_work_interval_get_thread_group and pass it on to kqwl.
4951 * If, for whatever reasons, kqworkloop_get_or_create fails and we
4952 * get back this ref, we release them before returning.
4953 */
4954 #endif
4955 if (trp.trp_flags & TRP_BOUND_THREAD) {
4956 /*
4957 * For TRP_BOUND_THREAD, we pass +1 ref on the work_interval on to
4958 * kqwl so the bound thread can join it before coming out to
4959 * userspace.
4960 * If, for whatever reasons, kqworkloop_get_or_create fails and we
4961 * get back this ref, we release them before returning.
4962 */
4963 trp_extended.trp_work_interval = work_interval;
4964 } else {
4965 kern_work_interval_release(work_interval);
4966 }
4967 }
4968
4969 if (!(trp.trp_flags & (TRP_POLICY | TRP_PRIORITY))) {
4970 /*
4971 * We always prefer scheduling policy + priority that comes with
4972 * a work interval. It it does not exist, we fallback to what the user
4973 * has asked.
4974 */
4975 if (params->kqwlp_flags & KQ_WORKLOOP_CREATE_SCHED_PRI) {
4976 trp.trp_flags |= TRP_PRIORITY;
4977 trp.trp_pri = (uint8_t)params->kqwlp_sched_pri;
4978 }
4979 if (params->kqwlp_flags & KQ_WORKLOOP_CREATE_SCHED_POL) {
4980 trp.trp_flags |= TRP_POLICY;
4981 trp.trp_pol = (uint8_t)params->kqwlp_sched_pol;
4982 }
4983 if (params->kqwlp_flags & KQ_WORKLOOP_CREATE_CPU_PERCENT) {
4984 trp.trp_flags |= TRP_CPUPERCENT;
4985 trp.trp_cpupercent = (uint8_t)params->kqwlp_cpu_percent;
4986 trp.trp_refillms = params->kqwlp_cpu_refillms;
4987 }
4988 }
4989
4990 #if CONFIG_PREADOPT_TG
4991 if ((trp.trp_flags == 0) &&
4992 (trp_extended.trp_permanent_preadopt_tg == NULL)) {
4993 #else
4994 if (trp.trp_flags == 0) {
4995 #endif
4996 error = EINVAL;
4997 break;
4998 }
4999
5000 error = kqworkloop_get_or_create(p, params->kqwlp_id, &trp,
5001 &trp_extended,
5002 KEVENT_FLAG_DYNAMIC_KQUEUE | KEVENT_FLAG_WORKLOOP |
5003 KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST, &kqwl);
5004 if (error) {
5005 /* kqworkloop_get_or_create did not consume these refs. */
5006 #if CONFIG_PREADOPT_TG
5007 if (trp_extended.trp_permanent_preadopt_tg) {
5008 thread_group_release(trp_extended.trp_permanent_preadopt_tg);
5009 }
5010 #endif
5011 if (trp_extended.trp_work_interval) {
5012 kern_work_interval_release(trp_extended.trp_work_interval);
5013 }
5014 break;
5015 }
5016
5017 if (!fdt_flag_test(fdp, FD_WORKLOOP)) {
5018 /* FD_WORKLOOP indicates we've ever created a workloop
5019 * via this syscall but its only ever added to a process, never
5020 * removed.
5021 */
5022 proc_fdlock(p);
5023 fdt_flag_set(fdp, FD_WORKLOOP);
5024 proc_fdunlock(p);
5025 }
5026 break;
5027 case KQ_WORKLOOP_DESTROY:
5028 error = kqworkloop_get_or_create(p, params->kqwlp_id, NULL, NULL,
5029 KEVENT_FLAG_DYNAMIC_KQUEUE | KEVENT_FLAG_WORKLOOP |
5030 KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST, &kqwl);
5031 if (error) {
5032 break;
5033 }
5034 kqlock(kqwl);
5035 trp.trp_value = kqwl->kqwl_params;
5036 if (trp.trp_flags && !(trp.trp_flags & TRP_RELEASED)) {
5037 trp.trp_flags |= TRP_RELEASED;
5038 kqwl->kqwl_params = trp.trp_value;
5039 if (trp.trp_flags & TRP_BOUND_THREAD) {
5040 kqworkloop_bound_thread_wakeup(kqwl);
5041 }
5042 kqworkloop_release_live(kqwl);
5043 } else {
5044 error = EINVAL;
5045 }
5046 kqunlock(kqwl);
5047 kqworkloop_release(kqwl);
5048 break;
5049 }
5050 *retval = 0;
5051 return error;
5052 }
5053
5054 int
5055 kqueue_workloop_ctl(proc_t p, struct kqueue_workloop_ctl_args *uap, int *retval)
5056 {
5057 struct kqueue_workloop_params params = {
5058 .kqwlp_id = 0,
5059 };
5060 if (uap->sz < sizeof(params.kqwlp_version)) {
5061 return EINVAL;
5062 }
5063
5064 size_t copyin_sz = MIN(sizeof(params), uap->sz);
5065 int rv = copyin(uap->addr, ¶ms, copyin_sz);
5066 if (rv) {
5067 return rv;
5068 }
5069
5070 if (params.kqwlp_version != (int)uap->sz) {
5071 return EINVAL;
5072 }
5073
5074 return kqueue_workloop_ctl_internal(p, uap->cmd, uap->options, ¶ms,
5075 retval);
5076 }
5077
/*
 * kqueue_select - fileops select/poll handler for file-backed kqueues.
 *
 * Only FREAD is meaningful for a kqueue: it is "readable" when events are
 * pending.  Returns the pending event count when the kqueue can be
 * inspected right away; otherwise (unless draining) records the selector
 * so it is woken by a later selwakeup().
 */
static int
kqueue_select(struct fileproc *fp, int which, void *wql, __unused vfs_context_t ctx)
{
	struct kqfile *kq = (struct kqfile *)fp_get_data(fp);
	int retnum = 0;

	/* only plain kqueue files reach here, never workq/workloop kqueues */
	assert((kq->kqf_state & (KQ_WORKLOOP | KQ_WORKQ)) == 0);

	if (which == FREAD) {
		kqlock(kq);
		if (kqfile_begin_processing(kq) == 0) {
			/* report how many events are currently queued */
			retnum = kq->kqf_count;
			kqfile_end_processing(kq);
		} else if ((kq->kqf_state & KQ_DRAIN) == 0) {
			/* busy: park this selector until a wakeup arrives */
			selrecord(kq->kqf_p, &kq->kqf_sel, wql);
		}
		kqunlock(kq);
	}
	return retnum;
}
5098
5099 /*
5100 * kqueue_close -
5101 */
5102 static int
5103 kqueue_close(struct fileglob *fg, __unused vfs_context_t ctx)
5104 {
5105 struct kqfile *kqf = fg_get_data(fg);
5106
5107 assert((kqf->kqf_state & (KQ_WORKLOOP | KQ_WORKQ)) == 0);
5108 kqlock(kqf);
5109 selthreadclear(&kqf->kqf_sel);
5110 kqunlock(kqf);
5111 kqueue_dealloc(&kqf->kqf_kqueue);
5112 fg_set_data(fg, NULL);
5113 return 0;
5114 }
5115
5116 /*
5117 * Max depth of the nested kq path that can be created.
5118 * Note that this has to be less than the size of kq_level
5119 * to avoid wrapping around and mislabeling the level. We also
5120 * want to be aggressive about this so that we don't overflow the
5121 * kernel stack while posting kevents
5122 */
5123 #define MAX_NESTED_KQ 10
5124
5125 /*
5126 * The callers has taken a use-count reference on this kqueue and will donate it
5127 * to the kqueue we are being added to. This keeps the kqueue from closing until
5128 * that relationship is torn down.
5129 */
5130 static int
5131 kqueue_kqfilter(struct fileproc *fp, struct knote *kn,
5132 __unused struct kevent_qos_s *kev)
5133 {
5134 struct kqfile *kqf = (struct kqfile *)fp_get_data(fp);
5135 struct kqueue *kq = &kqf->kqf_kqueue;
5136 struct kqueue *parentkq = knote_get_kq(kn);
5137
5138 assert((kqf->kqf_state & (KQ_WORKLOOP | KQ_WORKQ)) == 0);
5139
5140 if (parentkq == kq || kn->kn_filter != EVFILT_READ) {
5141 knote_set_error(kn, EINVAL);
5142 return 0;
5143 }
5144
5145 /*
5146 * We have to avoid creating a cycle when nesting kqueues
5147 * inside another. Rather than trying to walk the whole
5148 * potential DAG of nested kqueues, we just use a simple
5149 * ceiling protocol. When a kqueue is inserted into another,
5150 * we check that the (future) parent is not already nested
5151 * into another kqueue at a lower level than the potenial
5152 * child (because it could indicate a cycle). If that test
5153 * passes, we just mark the nesting levels accordingly.
5154 *
5155 * Only up to MAX_NESTED_KQ can be nested.
5156 *
5157 * Note: kqworkq and kqworkloop cannot be nested and have reused their
5158 * kq_level field, so ignore these as parent.
5159 */
5160
5161 kqlock(parentkq);
5162
5163 if ((parentkq->kq_state & (KQ_WORKQ | KQ_WORKLOOP)) == 0) {
5164 if (parentkq->kq_level > 0 &&
5165 parentkq->kq_level < kq->kq_level) {
5166 kqunlock(parentkq);
5167 knote_set_error(kn, EINVAL);
5168 return 0;
5169 }
5170
5171 /* set parent level appropriately */
5172 uint16_t plevel = (parentkq->kq_level == 0)? 2: parentkq->kq_level;
5173 if (plevel < kq->kq_level + 1) {
5174 if (kq->kq_level + 1 > MAX_NESTED_KQ) {
5175 kqunlock(parentkq);
5176 knote_set_error(kn, EINVAL);
5177 return 0;
5178 }
5179 plevel = kq->kq_level + 1;
5180 }
5181
5182 parentkq->kq_level = plevel;
5183 }
5184
5185 kqunlock(parentkq);
5186
5187 kn->kn_filtid = EVFILTID_KQREAD;
5188 kqlock(kq);
5189 KNOTE_ATTACH(&kqf->kqf_sel.si_note, kn);
5190 /* indicate nesting in child, if needed */
5191 if (kq->kq_level == 0) {
5192 kq->kq_level = 1;
5193 }
5194
5195 int count = kq->kq_count;
5196 kqunlock(kq);
5197 return count > 0;
5198 }
5199
/*
 * kqfile_wakeup - wake everything waiting on a file-backed kqueue.
 *
 * Wakes select()ors and threads sleeping in kqueue_scan().  For
 * NOTE_REVOKE, additionally releases threads waiting for their turn to
 * process (knote_fdclose() handles notifying parent knotes); for any
 * other hint, posts to parent kqueues/select sets this kqueue is nested
 * inside.
 */
__attribute__((noinline))
static void
kqfile_wakeup(struct kqfile *kqf, long hint, wait_result_t wr)
{
	/* wakeup a thread waiting on this queue */
	selwakeup(&kqf->kqf_sel);

	/* wake up threads in kqueue_scan() */
	if (kqf->kqf_state & KQ_SLEEP) {
		kqf->kqf_state &= ~KQ_SLEEP;
		thread_wakeup_with_result(&kqf->kqf_count, wr);
	}

	if (hint == NOTE_REVOKE) {
		/* wakeup threads waiting their turn to process */
		if (kqf->kqf_state & KQ_PROCWAIT) {
			assert(kqf->kqf_state & KQ_PROCESSING);
			kqf->kqf_state &= ~KQ_PROCWAIT;
			thread_wakeup(&kqf->kqf_suppressed);
		}

		/* no need to KNOTE: knote_fdclose() takes care of it */
	} else {
		/* wakeup other kqueues/select sets we're inside */
		KNOTE(&kqf->kqf_sel.si_note, hint);
	}
}
5227
5228 /*
5229 * kqueue_drain - called when kq is closed
5230 */
5231 static int
5232 kqueue_drain(struct fileproc *fp, __unused vfs_context_t ctx)
5233 {
5234 struct kqfile *kqf = (struct kqfile *)fp_get_data(fp);
5235
5236 assert((kqf->kqf_state & (KQ_WORKLOOP | KQ_WORKQ)) == 0);
5237
5238 kqlock(kqf);
5239 kqf->kqf_state |= KQ_DRAIN;
5240 kqfile_wakeup(kqf, NOTE_REVOKE, THREAD_RESTART);
5241 kqunlock(kqf);
5242 return 0;
5243 }
5244
5245 int
5246 kqueue_stat(struct kqueue *kq, void *ub, int isstat64, proc_t p)
5247 {
5248 assert((kq->kq_state & (KQ_WORKLOOP | KQ_WORKQ)) == 0);
5249
5250 kqlock(kq);
5251 if (isstat64 != 0) {
5252 struct stat64 *sb64 = (struct stat64 *)ub;
5253
5254 bzero((void *)sb64, sizeof(*sb64));
5255 sb64->st_size = kq->kq_count;
5256 if (kq->kq_state & KQ_KEV_QOS) {
5257 sb64->st_blksize = sizeof(struct kevent_qos_s);
5258 } else if (kq->kq_state & KQ_KEV64) {
5259 sb64->st_blksize = sizeof(struct kevent64_s);
5260 } else if (IS_64BIT_PROCESS(p)) {
5261 sb64->st_blksize = sizeof(struct user64_kevent);
5262 } else {
5263 sb64->st_blksize = sizeof(struct user32_kevent);
5264 }
5265 sb64->st_mode = S_IFIFO;
5266 } else {
5267 struct stat *sb = (struct stat *)ub;
5268
5269 bzero((void *)sb, sizeof(*sb));
5270 sb->st_size = kq->kq_count;
5271 if (kq->kq_state & KQ_KEV_QOS) {
5272 sb->st_blksize = sizeof(struct kevent_qos_s);
5273 } else if (kq->kq_state & KQ_KEV64) {
5274 sb->st_blksize = sizeof(struct kevent64_s);
5275 } else if (IS_64BIT_PROCESS(p)) {
5276 sb->st_blksize = sizeof(struct user64_kevent);
5277 } else {
5278 sb->st_blksize = sizeof(struct user32_kevent);
5279 }
5280 sb->st_mode = S_IFIFO;
5281 }
5282 kqunlock(kq);
5283 return 0;
5284 }
5285
/*
 * kqueue_threadreq_can_use_ast - is WORKQ_THREADREQ_SET_AST_ON_FAILURE safe?
 *
 * Returns true only when the current thread belongs to the kqueue's
 * process and is in one of the whitelisted BSD syscalls below.
 */
static inline bool
kqueue_threadreq_can_use_ast(struct kqueue *kq)
{
	if (current_proc() == kq->kq_p) {
		/*
		 * Setting an AST from a non BSD syscall is unsafe: mach_msg_trap() can
		 * do combined send/receive and in the case of self-IPC, the AST may get
		 * set on a thread that will not return to userspace and needs the
		 * thread the AST would create to unblock itself.
		 *
		 * At this time, we really want to target:
		 *
		 * - kevent variants that can cause thread creations, and dispatch
		 *   really only uses kevent_qos and kevent_id,
		 *
		 * - workq_kernreturn (directly about thread creations)
		 *
		 * - bsdthread_ctl which is used for qos changes and has direct impact
		 *   on the creator thread scheduling decisions.
		 */
		switch (current_uthread()->syscall_code) {
		case SYS_kevent_qos:
		case SYS_kevent_id:
		case SYS_workq_kernreturn:
		case SYS_bsdthread_ctl:
			return true;
		}
	}
	return false;
}
5316
5317 /*
5318 * Interact with the pthread kext to request a servicing there at a specific QoS
5319 * level.
5320 *
5321 * - Caller holds the kqlock
5322 *
5323 * - May be called with the kqueue's wait queue set locked,
5324 * so cannot do anything that could recurse on that.
5325 */
5326 static void
5327 kqueue_threadreq_initiate(kqueue_t kqu, workq_threadreq_t kqr,
5328 kq_index_t qos, int flags)
5329 {
5330 assert(kqr_thread(kqr) == THREAD_NULL);
5331 assert(!kqr_thread_requested(kqr));
5332 struct turnstile *ts = TURNSTILE_NULL;
5333
5334 if (workq_is_exiting(kqu.kq->kq_p)) {
5335 return;
5336 }
5337
5338 kqlock_held(kqu);
5339
5340 if (kqu.kq->kq_state & KQ_WORKLOOP) {
5341 struct kqworkloop *kqwl = kqu.kqwl;
5342
5343 assert(kqwl->kqwl_owner == THREAD_NULL);
5344 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_THREQUEST),
5345 kqwl->kqwl_dynamicid, 0, qos, kqwl->kqwl_wakeup_qos);
5346 ts = kqwl->kqwl_turnstile;
5347 /* Add a thread request reference on the kqueue. */
5348 kqworkloop_retain(kqwl);
5349
5350 #if CONFIG_PREADOPT_TG
5351 thread_group_qos_t kqwl_preadopt_tg = os_atomic_load(
5352 &kqwl->kqwl_preadopt_tg, relaxed);
5353 if (KQWL_HAS_PERMANENT_PREADOPTED_TG(kqwl_preadopt_tg)) {
5354 /*
5355 * This kqwl has been permanently configured with a thread group.
5356 * See kqworkloops with scheduling parameters.
5357 */
5358 flags |= WORKQ_THREADREQ_REEVALUATE_PREADOPT_TG;
5359 } else {
5360 /*
5361 * This thread is the one which is ack-ing the thread group on the kqwl
5362 * under the kqlock and will take action accordingly, pairs with the
5363 * release barrier in kqueue_set_preadopted_thread_group
5364 */
5365 uint16_t tg_acknowledged;
5366 if (os_atomic_cmpxchgv(&kqwl->kqwl_preadopt_tg_needs_redrive,
5367 KQWL_PREADOPT_TG_NEEDS_REDRIVE, KQWL_PREADOPT_TG_CLEAR_REDRIVE,
5368 &tg_acknowledged, acquire)) {
5369 flags |= WORKQ_THREADREQ_REEVALUATE_PREADOPT_TG;
5370 }
5371 }
5372 #endif
5373 } else {
5374 assert(kqu.kq->kq_state & KQ_WORKQ);
5375 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWQ_THREQUEST), -1, 0, qos,
5376 !TAILQ_EMPTY(&kqu.kqwq->kqwq_queue[kqr->tr_kq_qos_index - 1]));
5377 }
5378
5379 /*
5380 * New-style thread request supported.
5381 * Provide the pthread kext a pointer to a workq_threadreq_s structure for
5382 * its use until a corresponding kqueue_threadreq_bind callback.
5383 */
5384 if (kqueue_threadreq_can_use_ast(kqu.kq)) {
5385 flags |= WORKQ_THREADREQ_SET_AST_ON_FAILURE;
5386 }
5387 if (qos == KQWQ_QOS_MANAGER) {
5388 qos = WORKQ_THREAD_QOS_MANAGER;
5389 }
5390
5391 if (!workq_kern_threadreq_initiate(kqu.kq->kq_p, kqr, ts, qos, flags)) {
5392 /*
5393 * Process is shutting down or exec'ing.
5394 * All the kqueues are going to be cleaned up
5395 * soon. Forget we even asked for a thread -
5396 * and make sure we don't ask for more.
5397 */
5398 kqu.kq->kq_state &= ~KQ_R2K_ARMED;
5399 kqueue_release_live(kqu);
5400 }
5401 }
5402
5403 /*
5404 * kqueue_threadreq_bind_prepost - prepost the bind to kevent
5405 *
5406 * This is used when kqueue_threadreq_bind may cause a lock inversion.
5407 */
5408 __attribute__((always_inline))
5409 void
5410 kqueue_threadreq_bind_prepost(struct proc *p __unused, workq_threadreq_t kqr,
5411 struct uthread *ut)
5412 {
5413 ut->uu_kqr_bound = kqr;
5414 kqr->tr_thread = get_machthread(ut);
5415 kqr->tr_state = WORKQ_TR_STATE_BINDING;
5416 }
5417
5418 /*
5419 * kqueue_threadreq_bind_commit - commit a bind prepost
5420 *
5421 * The workq code has to commit any binding prepost before the thread has
5422 * a chance to come back to userspace (and do kevent syscalls) or be aborted.
5423 */
5424 void
5425 kqueue_threadreq_bind_commit(struct proc *p, thread_t thread)
5426 {
5427 struct uthread *ut = get_bsdthread_info(thread);
5428 workq_threadreq_t kqr = ut->uu_kqr_bound;
5429 kqueue_t kqu = kqr_kqueue(p, kqr);
5430
5431 kqlock(kqu);
5432 if (kqr->tr_state == WORKQ_TR_STATE_BINDING) {
5433 kqueue_threadreq_bind(p, kqr, thread, 0);
5434 }
5435 kqunlock(kqu);
5436 }
5437
5438 void
5439 kqworkloop_bound_thread_terminate(workq_threadreq_t kqr,
5440 uint16_t *uu_workq_flags_orig)
5441 {
5442 struct uthread *uth = get_bsdthread_info(kqr->tr_thread);
5443 struct kqworkloop *kqwl = __container_of(kqr, struct kqworkloop, kqwl_request);
5444
5445 assert(uth == current_uthread());
5446
5447 kqlock(kqwl);
5448
5449 *uu_workq_flags_orig = uth->uu_workq_flags;
5450
5451 uth->uu_workq_flags &= ~UT_WORKQ_NEW;
5452 uth->uu_workq_flags &= ~UT_WORKQ_WORK_INTERVAL_JOINED;
5453 uth->uu_workq_flags &= ~UT_WORKQ_WORK_INTERVAL_FAILED;
5454
5455 workq_kern_bound_thread_reset_pri(NULL, uth);
5456
5457 kqunlock(kqwl);
5458 }
5459
5460 /*
5461 * This is called from kqueue_process with kqlock held.
5462 */
5463 __attribute__((noreturn, noinline))
5464 static void
5465 kqworkloop_bound_thread_park(struct kqworkloop *kqwl, thread_t thread)
5466 {
5467 assert(thread == current_thread());
5468
5469 kqlock_held(kqwl);
5470
5471 assert(!kqwl->kqwl_count);
5472
5473 /*
5474 * kevent entry points will take a reference on workloops so we need to
5475 * undo it before we park for good.
5476 */
5477 kqworkloop_release_live(kqwl);
5478
5479 workq_threadreq_t kqr = &kqwl->kqwl_request;
5480 workq_threadreq_param_t trp = kqueue_threadreq_workloop_param(kqr);
5481
5482 if (trp.trp_flags & TRP_RELEASED) {
5483 /*
5484 * We need this check since the kqlock is dropped and retaken
5485 * multiple times during kqueue_process and because KQ_SLEEP is not
5486 * set, kqworkloop_bound_thread_wakeup is going to be a no-op.
5487 */
5488 kqunlock(kqwl);
5489 workq_kern_bound_thread_terminate(kqr);
5490 } else {
5491 kqworkloop_unbind_locked(kqwl,
5492 thread, KQWL_OVERRIDE_DROP_DELAYED, KQUEUE_THREADREQ_UNBIND_SOFT);
5493 workq_kern_bound_thread_park(kqr);
5494 }
5495 __builtin_unreachable();
5496 }
5497
5498 /*
5499 * A helper function for pthread workqueue subsystem.
5500 *
5501 * This is used to keep things that the workq code needs to do after
5502 * the bound thread's assert_wait minimum.
5503 */
5504 void
5505 kqworkloop_bound_thread_park_prepost(workq_threadreq_t kqr)
5506 {
5507 assert(current_thread() == kqr->tr_thread);
5508
5509 struct kqworkloop *kqwl = __container_of(kqr, struct kqworkloop, kqwl_request);
5510
5511 kqlock_held(kqwl);
5512
5513 kqwl->kqwl_state |= KQ_SLEEP;
5514
5515 /* uu_kqueue_override is protected under kqlock. */
5516 kqworkloop_unbind_delayed_override_drop(kqr->tr_thread);
5517
5518 kqunlock(kqwl);
5519 }
5520
5521 /*
5522 * A helper function for pthread workqueue subsystem.
5523 *
5524 * This is used to keep things that the workq code needs to do after
5525 * the bound thread's assert_wait minimum.
5526 */
5527 void
5528 kqworkloop_bound_thread_park_commit(workq_threadreq_t kqr,
5529 event_t event,
5530 thread_continue_t continuation)
5531 {
5532 assert(current_thread() == kqr->tr_thread);
5533
5534 struct kqworkloop *kqwl = __container_of(kqr, struct kqworkloop, kqwl_request);
5535 struct uthread *uth = get_bsdthread_info(kqr->tr_thread);
5536
5537 kqlock(kqwl);
5538 if (!(kqwl->kqwl_state & KQ_SLEEP)) {
5539 /*
5540 * When we dropped the kqlock to unset the voucher, someone came
5541 * around and made us runnable. But because we weren't waiting on the
5542 * event their thread_wakeup() was ineffectual. To correct for that,
5543 * we just run the continuation ourselves.
5544 */
5545 assert((uth->uu_workq_flags & (UT_WORKQ_RUNNING | UT_WORKQ_DYING)));
5546 if (uth->uu_workq_flags & UT_WORKQ_DYING) {
5547 __assert_only workq_threadreq_param_t trp = kqueue_threadreq_workloop_param(kqr);
5548 assert(trp.trp_flags & TRP_RELEASED);
5549 }
5550 kqunlock(kqwl);
5551 continuation(NULL, THREAD_AWAKENED);
5552 } else {
5553 assert((uth->uu_workq_flags & (UT_WORKQ_RUNNING | UT_WORKQ_DYING)) == 0);
5554 thread_set_pending_block_hint(get_machthread(uth),
5555 kThreadWaitParkedBoundWorkQueue);
5556 assert_wait(event, THREAD_INTERRUPTIBLE);
5557 kqunlock(kqwl);
5558 thread_block(continuation);
5559 }
5560 }
5561
/*
 * kqueue_threadreq_modify - adjust the QoS of a pending thread request.
 *
 * Caller holds the kqlock.  Forwards to workq_kern_threadreq_modify(),
 * augmenting `flags` with AST-on-failure and preadopt-thread-group
 * re-evaluation hints where applicable.
 */
static void
kqueue_threadreq_modify(kqueue_t kqu, workq_threadreq_t kqr, kq_index_t qos,
    workq_kern_threadreq_flags_t flags)
{
	assert(kqr_thread_requested_pending(kqr));

	kqlock_held(kqu);

	if (kqueue_threadreq_can_use_ast(kqu.kq)) {
		flags |= WORKQ_THREADREQ_SET_AST_ON_FAILURE;
	}

#if CONFIG_PREADOPT_TG
	if (kqu.kq->kq_state & KQ_WORKLOOP) {
		struct kqworkloop *kqwl = kqu.kqwl;
		thread_group_qos_t kqwl_preadopt_tg = os_atomic_load(
			&kqwl->kqwl_preadopt_tg, relaxed);
		if (KQWL_HAS_PERMANENT_PREADOPTED_TG(kqwl_preadopt_tg)) {
			/*
			 * This kqwl has been permanently configured with a thread group.
			 * See kqworkloops with scheduling parameters.
			 */
			flags |= WORKQ_THREADREQ_REEVALUATE_PREADOPT_TG;
		} else {
			uint16_t tg_ack_status;
			/*
			 * This thread is the one which is ack-ing the thread group on the kqwl
			 * under the kqlock and will take action accordingly, needs acquire
			 * barrier.
			 */
			if (os_atomic_cmpxchgv(&kqwl->kqwl_preadopt_tg_needs_redrive, KQWL_PREADOPT_TG_NEEDS_REDRIVE,
			    KQWL_PREADOPT_TG_CLEAR_REDRIVE, &tg_ack_status, acquire)) {
				flags |= WORKQ_THREADREQ_REEVALUATE_PREADOPT_TG;
			}
		}
	}
#endif

	workq_kern_threadreq_modify(kqu.kq->kq_p, kqr, qos, flags);
}
5602
5603 /*
5604 * kqueue_threadreq_bind - bind thread to processing kqrequest
5605 *
5606 * The provided thread will be responsible for delivering events
5607 * associated with the given kqrequest. Bind it and get ready for
5608 * the thread to eventually arrive.
5609 */
5610 void
5611 kqueue_threadreq_bind(struct proc *p, workq_threadreq_t kqr, thread_t thread,
5612 unsigned int flags)
5613 {
5614 kqueue_t kqu = kqr_kqueue(p, kqr);
5615 struct uthread *ut = get_bsdthread_info(thread);
5616
5617 kqlock_held(kqu);
5618
5619 assert(ut->uu_kqueue_override == 0);
5620
5621 if (kqr->tr_state == WORKQ_TR_STATE_BINDING) {
5622 assert(ut->uu_kqr_bound == kqr);
5623 assert(kqr->tr_thread == thread);
5624 } else if (kqr->tr_state == WORKQ_TR_STATE_BOUND) {
5625 assert(flags & KQUEUE_THREADREQ_BIND_SOFT);
5626 assert(kqr_thread_permanently_bound(kqr));
5627 } else {
5628 assert(kqr_thread_requested_pending(kqr));
5629 assert(kqr->tr_thread == THREAD_NULL);
5630 assert(ut->uu_kqr_bound == NULL);
5631 ut->uu_kqr_bound = kqr;
5632 kqr->tr_thread = thread;
5633 }
5634
5635 kqr->tr_state = WORKQ_TR_STATE_BOUND;
5636
5637 if (kqu.kq->kq_state & KQ_WORKLOOP) {
5638 struct turnstile *ts = kqu.kqwl->kqwl_turnstile;
5639
5640 if (__improbable(thread == kqu.kqwl->kqwl_owner)) {
5641 /*
5642 * <rdar://problem/38626999> shows that asserting here is not ok.
5643 *
5644 * This is not supposed to happen for correct use of the interface,
5645 * but it is sadly possible for userspace (with the help of memory
5646 * corruption, such as over-release of a dispatch queue) to make
5647 * the creator thread the "owner" of a workloop.
5648 *
5649 * Once that happens, and that creator thread picks up the same
5650 * workloop as a servicer, we trip this codepath. We need to fixup
5651 * the state to forget about this thread being the owner, as the
5652 * entire workloop state machine expects servicers to never be
5653 * owners and everything would basically go downhill from here.
5654 */
5655 kqu.kqwl->kqwl_owner = THREAD_NULL;
5656 if (kqworkloop_override(kqu.kqwl)) {
5657 thread_drop_kevent_override(thread);
5658 }
5659 }
5660
5661 if (ts && (flags & KQUEUE_THREADREQ_BIND_NO_INHERITOR_UPDATE) == 0) {
5662 /*
5663 * Past this point, the interlock is the kq req lock again,
5664 * so we can fix the inheritor for good.
5665 */
5666 filt_wlupdate_inheritor(kqu.kqwl, ts, TURNSTILE_IMMEDIATE_UPDATE);
5667 turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
5668 }
5669
5670 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_BIND), kqu.kqwl->kqwl_dynamicid,
5671 thread_tid(thread), kqr->tr_kq_qos_index,
5672 (kqr->tr_kq_override_index << 16) | kqwl->kqwl_wakeup_qos);
5673
5674 ut->uu_kqueue_override = kqr->tr_kq_override_index;
5675 if (kqr->tr_kq_override_index) {
5676 thread_add_servicer_override(thread, kqr->tr_kq_override_index);
5677 }
5678
5679 #if CONFIG_PREADOPT_TG
5680 /* Remove reference from kqwl and mark it as bound with the SENTINEL */
5681 thread_group_qos_t old_tg;
5682 thread_group_qos_t new_tg;
5683 int ret = os_atomic_rmw_loop(kqr_preadopt_thread_group_addr(kqr), old_tg, new_tg, relaxed, {
5684 if ((old_tg == KQWL_PREADOPTED_TG_NEVER) || KQWL_HAS_PERMANENT_PREADOPTED_TG(old_tg)) {
5685 /*
5686 * Either an app or a kqwl permanently configured with a thread group.
5687 * Nothing to do.
5688 */
5689 os_atomic_rmw_loop_give_up(break);
5690 }
5691 assert(old_tg != KQWL_PREADOPTED_TG_PROCESSED);
5692 new_tg = KQWL_PREADOPTED_TG_SENTINEL;
5693 });
5694
5695 if (ret) {
5696 KQWL_PREADOPT_TG_HISTORY_WRITE_ENTRY(kqu.kqwl, KQWL_PREADOPT_OP_SERVICER_BIND, old_tg, new_tg);
5697
5698 if (KQWL_HAS_VALID_PREADOPTED_TG(old_tg)) {
5699 struct thread_group *tg = KQWL_GET_PREADOPTED_TG(old_tg);
5700 assert(tg != NULL);
5701
5702 thread_set_preadopt_thread_group(thread, tg);
5703 thread_group_release_live(tg); // The thread has a reference
5704 } else {
5705 /*
5706 * The thread may already have a preadopt thread group on it -
5707 * we need to make sure to clear that.
5708 */
5709 thread_set_preadopt_thread_group(thread, NULL);
5710 }
5711
5712 /* We have taken action on the preadopted thread group set on the
5713 * set on the kqwl, clear any redrive requests */
5714 os_atomic_store(&kqu.kqwl->kqwl_preadopt_tg_needs_redrive, KQWL_PREADOPT_TG_CLEAR_REDRIVE, relaxed);
5715 } else {
5716 if (KQWL_HAS_PERMANENT_PREADOPTED_TG(old_tg)) {
5717 struct thread_group *tg = KQWL_GET_PREADOPTED_TG(old_tg);
5718 assert(tg != NULL);
5719 /*
5720 * For KQUEUE_THREADREQ_BIND_SOFT, technically the following
5721 * set_preadopt should be a no-op since this bound servicer thread
5722 * preadopts kqwl's permanent tg at first-initial bind time and
5723 * never leaves it until its termination.
5724 */
5725 thread_set_preadopt_thread_group(thread, tg);
5726 /*
5727 * From this point on, kqwl and thread both have +1 ref on this tg.
5728 */
5729 }
5730 }
5731 #endif
5732 kqueue_update_iotier_override(kqu);
5733 } else {
5734 assert(kqr->tr_kq_override_index == 0);
5735
5736 #if CONFIG_PREADOPT_TG
5737 /*
5738 * The thread may have a preadopt thread group on it already because it
5739 * got tagged with it as a creator thread. So we need to make sure to
5740 * clear that since we don't have preadopt thread groups for non-kqwl
5741 * cases
5742 */
5743 thread_set_preadopt_thread_group(thread, NULL);
5744 #endif
5745 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWQ_BIND), -1,
5746 thread_tid(thread), kqr->tr_kq_qos_index,
5747 (kqr->tr_kq_override_index << 16) |
5748 !TAILQ_EMPTY(&kqu.kqwq->kqwq_queue[kqr->tr_kq_qos_index - 1]));
5749 }
5750 }
5751
5752 /*
5753 * kqueue_threadreq_cancel - abort a pending thread request
5754 *
5755 * Called when exiting/exec'ing. Forget our pending request.
5756 */
5757 void
5758 kqueue_threadreq_cancel(struct proc *p, workq_threadreq_t kqr)
5759 {
5760 kqueue_release(kqr_kqueue(p, kqr));
5761 }
5762
5763 workq_threadreq_param_t
5764 kqueue_threadreq_workloop_param(workq_threadreq_t kqr)
5765 {
5766 struct kqworkloop *kqwl;
5767 workq_threadreq_param_t trp;
5768
5769 assert(kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP);
5770 kqwl = __container_of(kqr, struct kqworkloop, kqwl_request);
5771 trp.trp_value = kqwl->kqwl_params;
5772 return trp;
5773 }
5774
5775 /*
5776 * kqueue_threadreq_unbind - unbind thread from processing kqueue
5777 *
5778 * End processing the per-QoS bucket of events and allow other threads
5779 * to be requested for future servicing.
5780 *
5781 * caller holds a reference on the kqueue.
5782 */
5783 void
5784 kqueue_threadreq_unbind(struct proc *p, workq_threadreq_t kqr)
5785 {
5786 if (kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
5787 kqworkloop_unbind(kqr_kqworkloop(kqr));
5788 } else {
5789 kqworkq_unbind(p, kqr);
5790 }
5791 }
5792
5793 /*
5794 * If we aren't already busy processing events [for this QoS],
5795 * request workq thread support as appropriate.
5796 *
5797 * TBD - for now, we don't segregate out processing by QoS.
5798 *
5799 * - May be called with the kqueue's wait queue set locked,
5800 * so cannot do anything that could recurse on that.
5801 */
5802 static void
5803 kqworkq_wakeup(struct kqworkq *kqwq, kq_index_t qos_index)
5804 {
5805 workq_threadreq_t kqr = kqworkq_get_request(kqwq, qos_index);
5806
5807 /* convert to thread qos value */
5808 assert(qos_index > 0 && qos_index <= KQWQ_NBUCKETS);
5809
5810 if (!kqr_thread_requested(kqr)) {
5811 kqueue_threadreq_initiate(&kqwq->kqwq_kqueue, kqr, qos_index, 0);
5812 }
5813 }
5814
5815 /*
5816 * This represent the asynchronous QoS a given workloop contributes,
5817 * hence is the max of the current active knotes (override index)
5818 * and the workloop max qos (userspace async qos).
5819 */
5820 static kq_index_t
5821 kqworkloop_override(struct kqworkloop *kqwl)
5822 {
5823 workq_threadreq_t kqr = &kqwl->kqwl_request;
5824 return MAX(kqr->tr_kq_qos_index, kqr->tr_kq_override_index);
5825 }
5826
/*
 * kqworkloop_request_fire_r2k_notification - one-shot return-to-kernel ping.
 *
 * Caller holds the kqlock.  If the workloop's R2K notification is armed,
 * disarm it and set the AST_KEVENT_RETURN_TO_KERNEL AST on the request's
 * thread so it re-enters the kernel to pick up pending events.
 */
static inline void
kqworkloop_request_fire_r2k_notification(struct kqworkloop *kqwl)
{
	workq_threadreq_t kqr = &kqwl->kqwl_request;

	kqlock_held(kqwl);

	if (kqwl->kqwl_state & KQ_R2K_ARMED) {
		kqwl->kqwl_state &= ~KQ_R2K_ARMED;
		act_set_astkevent(kqr_thread_fast(kqr), AST_KEVENT_RETURN_TO_KERNEL);
	}
}
5839
/*
 * kqworkloop_update_threads_qos - apply a QoS state change to a workloop.
 *
 * Caller holds the kqlock.  `op` (a KQWL_UTQ_* operation) updates the
 * workloop's wakeup QoS and/or the request's override/qos indexes, then
 * the resulting override delta is pushed onto the owner thread, the
 * in-flight servicer, or the pending thread request as appropriate.
 */
static void
kqworkloop_update_threads_qos(struct kqworkloop *kqwl, int op, kq_index_t qos)
{
	workq_threadreq_t kqr = &kqwl->kqwl_request;
	struct kqueue *kq = &kqwl->kqwl_kqueue;
	/* snapshot the pre-update override so we can apply only the diff below */
	kq_index_t old_override = kqworkloop_override(kqwl);

	kqlock_held(kqwl);

	switch (op) {
	case KQWL_UTQ_UPDATE_WAKEUP_QOS:
		kqwl->kqwl_wakeup_qos = qos;
		kqworkloop_request_fire_r2k_notification(kqwl);
		goto recompute;

	case KQWL_UTQ_RESET_WAKEUP_OVERRIDE:
		kqr->tr_kq_override_index = qos;
		goto recompute;

	case KQWL_UTQ_PARKING:
	case KQWL_UTQ_UNBINDING:
		kqr->tr_kq_override_index = qos;
		OS_FALLTHROUGH;

	case KQWL_UTQ_RECOMPUTE_WAKEUP_QOS:
		if (op == KQWL_UTQ_RECOMPUTE_WAKEUP_QOS) {
			assert(qos == THREAD_QOS_UNSPECIFIED);
		}
		if (TAILQ_EMPTY(&kqwl->kqwl_suppressed)) {
			kqr->tr_kq_override_index = THREAD_QOS_UNSPECIFIED;
		}
		/* rescan the buckets, highest QoS first, to find the new wakeup QoS */
		kqwl->kqwl_wakeup_qos = 0;
		for (kq_index_t i = KQWL_NBUCKETS; i > 0; i--) {
			if (!TAILQ_EMPTY(&kqwl->kqwl_queue[i - 1])) {
				kqwl->kqwl_wakeup_qos = i;
				kqworkloop_request_fire_r2k_notification(kqwl);
				break;
			}
		}
		OS_FALLTHROUGH;

	case KQWL_UTQ_UPDATE_WAKEUP_OVERRIDE:
recompute:
		/*
		 * When modifying the wakeup QoS or the override QoS, we always need to
		 * maintain our invariant that kqr_override_index is at least as large
		 * as the highest QoS for which an event is fired.
		 *
		 * However this override index can be larger when there is an overriden
		 * suppressed knote pushing on the kqueue.
		 */
		if (qos < kqwl->kqwl_wakeup_qos) {
			qos = kqwl->kqwl_wakeup_qos;
		}
		if (kqr->tr_kq_override_index < qos) {
			kqr->tr_kq_override_index = qos;
		}
		break;

	case KQWL_UTQ_REDRIVE_EVENTS:
		break;

	case KQWL_UTQ_SET_QOS_INDEX:
		kqr->tr_kq_qos_index = qos;
		break;

	default:
		panic("unknown kqwl thread qos update operation: %d", op);
	}

	thread_t kqwl_owner = kqwl->kqwl_owner;
	thread_t servicer = kqr_thread(kqr);
	boolean_t qos_changed = FALSE;
	kq_index_t new_override = kqworkloop_override(kqwl);

	/*
	 * Apply the diffs to the owner if applicable
	 */
	if (kqwl_owner) {
#if 0
		/* JMM - need new trace hooks for owner overrides */
		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_THADJUST),
		    kqwl->kqwl_dynamicid, thread_tid(kqwl_owner), kqr->tr_kq_qos_index,
		    (kqr->tr_kq_override_index << 16) | kqwl->kqwl_wakeup_qos);
#endif
		if (new_override == old_override) {
			// nothing to do
		} else if (old_override == THREAD_QOS_UNSPECIFIED) {
			thread_add_kevent_override(kqwl_owner, new_override);
		} else if (new_override == THREAD_QOS_UNSPECIFIED) {
			thread_drop_kevent_override(kqwl_owner);
		} else { /* old_override != new_override */
			thread_update_kevent_override(kqwl_owner, new_override);
		}
	}

	/*
	 * apply the diffs to the servicer
	 */

	if (!kqr_thread_requested(kqr)) {
		/*
		 * No servicer, nor thread-request
		 *
		 * Make a new thread request, unless there is an owner (or the workloop
		 * is suspended in userland) or if there is no asynchronous work in the
		 * first place.
		 */

		if (kqwl_owner == NULL && kqwl->kqwl_wakeup_qos) {
			int initiate_flags = 0;
			if (op == KQWL_UTQ_UNBINDING) {
				initiate_flags = WORKQ_THREADREQ_ATTEMPT_REBIND;
			}

			/* kqueue_threadreq_initiate handles the acknowledgement of the TG
			 * if needed */
			kqueue_threadreq_initiate(kq, kqr, new_override, initiate_flags);
		}
	} else if (servicer) {
		/*
		 * Servicer in flight
		 *
		 * Just apply the diff to the servicer
		 */

#if CONFIG_PREADOPT_TG
		/* When there's a servicer for the kqwl already, then the servicer will
		 * adopt the thread group in the kqr, we don't need to poke the
		 * workqueue subsystem to make different decisions due to the thread
		 * group. Consider the current request ack-ed.
		 */
		os_atomic_store(&kqwl->kqwl_preadopt_tg_needs_redrive, KQWL_PREADOPT_TG_CLEAR_REDRIVE, relaxed);
#endif

		if (kqr_thread_permanently_bound(kqr) && (kqwl->kqwl_state & KQ_SLEEP)) {
			/* bound servicer is parked: reset its priority directly */
			kqr->tr_qos = new_override;
			workq_kern_bound_thread_reset_pri(kqr, get_bsdthread_info(servicer));
		} else {
			struct uthread *ut = get_bsdthread_info(servicer);
			if (ut->uu_kqueue_override != new_override) {
				if (ut->uu_kqueue_override == THREAD_QOS_UNSPECIFIED) {
					thread_add_servicer_override(servicer, new_override);
				} else if (new_override == THREAD_QOS_UNSPECIFIED) {
					thread_drop_servicer_override(servicer);
				} else { /* ut->uu_kqueue_override != new_override */
					thread_update_servicer_override(servicer, new_override);
				}
				ut->uu_kqueue_override = new_override;
				qos_changed = TRUE;
			}
		}
	} else if (new_override == THREAD_QOS_UNSPECIFIED) {
		/*
		 * No events to deliver anymore.
		 *
		 * However canceling with turnstiles is challenging, so the fact that
		 * the request isn't useful will be discovered by the servicer himself
		 * later on.
		 */
	} else if (old_override != new_override) {
		/*
		 * Request is in flight
		 *
		 * Apply the diff to the thread request.
		 */
		kqueue_threadreq_modify(kq, kqr, new_override, WORKQ_THREADREQ_NONE);
		qos_changed = TRUE;
	}

	if (qos_changed) {
		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_THADJUST), kqwl->kqwl_dynamicid,
		    thread_tid(servicer), kqr->tr_kq_qos_index,
		    (kqr->tr_kq_override_index << 16) | kqwl->kqwl_wakeup_qos);
	}
}
6016
6017 static void
6018 kqworkloop_update_iotier_override(struct kqworkloop *kqwl)
6019 {
6020 workq_threadreq_t kqr = &kqwl->kqwl_request;
6021 thread_t servicer = kqr_thread(kqr);
6022 uint8_t iotier = os_atomic_load(&kqwl->kqwl_iotier_override, relaxed);
6023
6024 kqlock_held(kqwl);
6025
6026 if (servicer) {
6027 thread_update_servicer_iotier_override(servicer, iotier);
6028 }
6029 }
6030
6031 static void
6032 kqworkloop_bound_thread_wakeup(struct kqworkloop *kqwl)
6033 {
6034 workq_threadreq_t kqr = &kqwl->kqwl_request;
6035
6036 kqlock_held(kqwl);
6037
6038 assert(kqr->tr_flags & WORKQ_TR_FLAG_PERMANENT_BIND);
6039
6040 __assert_only struct uthread *uth = get_bsdthread_info(kqr->tr_thread);
6041 assert(workq_thread_is_permanently_bound(uth));
6042
6043 /*
6044 * The bound thread takes up the responsibility of setting the KQ_SLEEP
6045 * on its way to parking. See kqworkloop_bound_thread_park_prepost.
6046 * This state is always manipulated under kqlock.
6047 */
6048 if (kqwl->kqwl_state & KQ_SLEEP) {
6049 kqwl->kqwl_state &= ~KQ_SLEEP;
6050 kqueue_threadreq_bind(current_proc(),
6051 kqr, kqr->tr_thread, KQUEUE_THREADREQ_BIND_SOFT);
6052 workq_kern_bound_thread_wakeup(kqr);
6053 }
6054 }
6055
6056 static void
6057 kqworkloop_wakeup(struct kqworkloop *kqwl, kq_index_t qos)
6058 {
6059 if (qos <= kqwl->kqwl_wakeup_qos) {
6060 /*
6061 * Shortcut wakeups that really do nothing useful
6062 */
6063 return;
6064 }
6065
6066 if ((kqwl->kqwl_state & KQ_PROCESSING) &&
6067 kqr_thread(&kqwl->kqwl_request) == current_thread()) {
6068 /*
6069 * kqworkloop_end_processing() will perform the required QoS
6070 * computations when it unsets the processing mode.
6071 */
6072 return;
6073 }
6074
6075 kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_UPDATE_WAKEUP_QOS, qos);
6076
6077 /*
6078 * In case of thread bound kqwl, we let the kqworkloop_update_threads_qos
6079 * take care of overriding the servicer first before it waking up. This
6080 * simplifies the soft bind of the parked bound thread later.
6081 */
6082 if (kqr_thread_permanently_bound(&kqwl->kqwl_request)) {
6083 kqworkloop_bound_thread_wakeup(kqwl);
6084 }
6085 }
6086
6087 static struct kqtailq *
6088 kqueue_get_suppressed_queue(kqueue_t kq, struct knote *kn)
6089 {
6090 if (kq.kq->kq_state & KQ_WORKLOOP) {
6091 return &kq.kqwl->kqwl_suppressed;
6092 } else if (kq.kq->kq_state & KQ_WORKQ) {
6093 return &kq.kqwq->kqwq_suppressed[kn->kn_qos_index - 1];
6094 } else {
6095 return &kq.kqf->kqf_suppressed;
6096 }
6097 }
6098
/*
 * kqueue_alloc_turnstile - lazily allocate the turnstile of a workloop kqueue.
 *
 * Fast path: when KQ_HAS_TURNSTILE is already published, return the
 * turnstile without taking any lock. Non-workloop kqueues never have one.
 *
 * Slow path: allocate a turnstile, publish it under the kqlock (and the
 * workq threadreq lock when the workq is the current turnstile interlock),
 * and seed its inheritor. A racing allocator that loses frees its spare.
 */
struct turnstile *
kqueue_alloc_turnstile(kqueue_t kqu)
{
	struct kqworkloop *kqwl = kqu.kqwl;
	kq_state_t kq_state;

	/* dependency-ordered load: orders the kqwl_turnstile read below */
	kq_state = os_atomic_load(&kqu.kq->kq_state, dependency);
	if (kq_state & KQ_HAS_TURNSTILE) {
		/* force a dependency to pair with the atomic or with release below */
		return os_atomic_load_with_dependency_on(&kqwl->kqwl_turnstile,
		           (uintptr_t)kq_state);
	}

	if (!(kq_state & KQ_WORKLOOP)) {
		return TURNSTILE_NULL;
	}

	/* allocate before taking the kqlock; free_ts holds a loser's spare */
	struct turnstile *ts = turnstile_alloc(), *free_ts = TURNSTILE_NULL;
	bool workq_locked = false;

	kqlock(kqu);

	if (filt_wlturnstile_interlock_is_workq(kqwl)) {
		workq_locked = true;
		workq_kern_threadreq_lock(kqwl->kqwl_p);
	}

	if (kqwl->kqwl_state & KQ_HAS_TURNSTILE) {
		/* lost the race with another allocator: adopt theirs, free ours */
		free_ts = ts;
		ts = kqwl->kqwl_turnstile;
	} else {
		ts = turnstile_prepare((uintptr_t)kqwl, &kqwl->kqwl_turnstile,
		    ts, TURNSTILE_WORKLOOPS);

		/* release-barrier to pair with the unlocked load of kqwl_turnstile above */
		os_atomic_or(&kqwl->kqwl_state, KQ_HAS_TURNSTILE, release);

		if (filt_wlturnstile_interlock_is_workq(kqwl)) {
			workq_kern_threadreq_update_inheritor(kqwl->kqwl_p,
			    &kqwl->kqwl_request, kqwl->kqwl_owner,
			    ts, TURNSTILE_IMMEDIATE_UPDATE);
			/*
			 * The workq may no longer be the interlock after this.
			 * In which case the inheritor wasn't updated.
			 */
		}
		if (!filt_wlturnstile_interlock_is_workq(kqwl)) {
			filt_wlupdate_inheritor(kqwl, ts, TURNSTILE_IMMEDIATE_UPDATE);
		}
	}

	if (workq_locked) {
		workq_kern_threadreq_unlock(kqwl->kqwl_p);
	}

	kqunlock(kqu);

	if (free_ts) {
		turnstile_deallocate(free_ts);
	} else {
		/* complete the TURNSTILE_IMMEDIATE_UPDATE made while locked */
		turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_NOT_HELD);
	}
	return ts;
}
6163
6164 __attribute__((always_inline))
6165 struct turnstile *
6166 kqueue_turnstile(kqueue_t kqu)
6167 {
6168 kq_state_t kq_state = os_atomic_load(&kqu.kq->kq_state, relaxed);
6169 if (kq_state & KQ_WORKLOOP) {
6170 return os_atomic_load(&kqu.kqwl->kqwl_turnstile, relaxed);
6171 }
6172 return TURNSTILE_NULL;
6173 }
6174
6175 __attribute__((always_inline))
6176 struct turnstile *
6177 kqueue_threadreq_get_turnstile(workq_threadreq_t kqr)
6178 {
6179 struct kqworkloop *kqwl = kqr_kqworkloop(kqr);
6180 if (kqwl) {
6181 return os_atomic_load(&kqwl->kqwl_turnstile, relaxed);
6182 }
6183 return TURNSTILE_NULL;
6184 }
6185
6186 static void
6187 kqworkloop_set_overcommit(struct kqworkloop *kqwl)
6188 {
6189 workq_threadreq_t kqr = &kqwl->kqwl_request;
6190
6191 /*
6192 * This test is racy, but since we never remove this bit,
6193 * it allows us to avoid taking a lock.
6194 */
6195 if (kqr->tr_flags & WORKQ_TR_FLAG_OVERCOMMIT) {
6196 return;
6197 }
6198
6199 kqlock_held(kqwl);
6200
6201 if (kqr_thread_requested_pending(kqr)) {
6202 kqueue_threadreq_modify(kqwl, kqr, kqr->tr_qos,
6203 WORKQ_THREADREQ_MAKE_OVERCOMMIT);
6204 } else {
6205 kqr->tr_flags |= WORKQ_TR_FLAG_OVERCOMMIT;
6206 }
6207 }
6208
6209 static void
6210 kqworkq_update_override(struct kqworkq *kqwq, struct knote *kn,
6211 kq_index_t override_index)
6212 {
6213 workq_threadreq_t kqr;
6214 kq_index_t old_override_index;
6215 kq_index_t queue_index = kn->kn_qos_index;
6216
6217 if (override_index <= queue_index) {
6218 return;
6219 }
6220
6221 kqr = kqworkq_get_request(kqwq, queue_index);
6222
6223 kqlock_held(kqwq);
6224
6225 old_override_index = kqr->tr_kq_override_index;
6226 if (override_index > MAX(kqr->tr_kq_qos_index, old_override_index)) {
6227 thread_t servicer = kqr_thread(kqr);
6228 kqr->tr_kq_override_index = override_index;
6229
6230 /* apply the override to [incoming?] servicing thread */
6231 if (servicer) {
6232 if (old_override_index) {
6233 thread_update_kevent_override(servicer, override_index);
6234 } else {
6235 thread_add_kevent_override(servicer, override_index);
6236 }
6237 }
6238 }
6239 }
6240
6241 static void
6242 kqueue_update_iotier_override(kqueue_t kqu)
6243 {
6244 if (kqu.kq->kq_state & KQ_WORKLOOP) {
6245 kqworkloop_update_iotier_override(kqu.kqwl);
6246 }
6247 }
6248
6249 static void
6250 kqueue_update_override(kqueue_t kqu, struct knote *kn, thread_qos_t qos)
6251 {
6252 if (kqu.kq->kq_state & KQ_WORKLOOP) {
6253 kqworkloop_update_threads_qos(kqu.kqwl, KQWL_UTQ_UPDATE_WAKEUP_OVERRIDE,
6254 qos);
6255 } else {
6256 kqworkq_update_override(kqu.kqwq, kn, qos);
6257 }
6258 }
6259
/*
 * kqworkloop_unbind_locked - sever the association between a workloop
 * kqueue and its servicer thread, with the kqlock held.
 *
 * `how` selects whether the servicer's kevent QoS override is dropped here
 * (KQWL_OVERRIDE_DROP_IMMEDIATELY) or left for a later call to
 * kqworkloop_unbind_delayed_override_drop().
 *
 * KQUEUE_THREADREQ_UNBIND_SOFT leaves the uthread/threadreq linkage
 * intact (used for permanently-bound threads that merely park).
 */
static void
kqworkloop_unbind_locked(struct kqworkloop *kqwl, thread_t thread,
    enum kqwl_unbind_locked_mode how, unsigned int flags)
{
	struct uthread *ut = get_bsdthread_info(thread);
	workq_threadreq_t kqr = &kqwl->kqwl_request;

	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_UNBIND), kqwl->kqwl_dynamicid,
	    thread_tid(thread), 0, 0);

	kqlock_held(kqwl);

	assert(ut->uu_kqr_bound == kqr);

	if ((flags & KQUEUE_THREADREQ_UNBIND_SOFT) == 0) {
		ut->uu_kqr_bound = NULL;
	}

	if (how == KQWL_OVERRIDE_DROP_IMMEDIATELY &&
	    ut->uu_kqueue_override != THREAD_QOS_UNSPECIFIED) {
		thread_drop_servicer_override(thread);
		ut->uu_kqueue_override = THREAD_QOS_UNSPECIFIED;
	}

	/* without an owner, the departing servicer was the turnstile inheritor */
	if (kqwl->kqwl_owner == NULL && kqwl->kqwl_turnstile) {
		turnstile_update_inheritor(kqwl->kqwl_turnstile,
		    TURNSTILE_INHERITOR_NULL, TURNSTILE_IMMEDIATE_UPDATE);
		turnstile_update_inheritor_complete(kqwl->kqwl_turnstile,
		    TURNSTILE_INTERLOCK_HELD);
	}

#if CONFIG_PREADOPT_TG
	/* The kqueue is able to adopt a thread group again */

	/* clear SENTINEL/PROCESSED markers; ret != 0 when a swap happened */
	thread_group_qos_t old_tg, new_tg = NULL;
	int ret = os_atomic_rmw_loop(kqr_preadopt_thread_group_addr(kqr), old_tg, new_tg, relaxed, {
		new_tg = old_tg;
		if (old_tg == KQWL_PREADOPTED_TG_SENTINEL || old_tg == KQWL_PREADOPTED_TG_PROCESSED) {
			new_tg = KQWL_PREADOPTED_TG_NULL;
		}
	});

	if (ret) {
		if ((flags & KQUEUE_THREADREQ_UNBIND_SOFT) &&
		    KQWL_HAS_PERMANENT_PREADOPTED_TG(old_tg)) {
			// The permanently configured bound thread remains a part of the
			// thread group until its termination.
		} else {
			// Servicer can drop any preadopt thread group it has since it has
			// unbound.
			KQWL_PREADOPT_TG_HISTORY_WRITE_ENTRY(kqwl, KQWL_PREADOPT_OP_SERVICER_UNBIND, old_tg, KQWL_PREADOPTED_TG_NULL);
			thread_set_preadopt_thread_group(thread, NULL);
		}
	}
#endif
	/* the thread no longer carries this kqwl's IO tier override */
	thread_update_servicer_iotier_override(thread, THROTTLE_LEVEL_END);

	if ((flags & KQUEUE_THREADREQ_UNBIND_SOFT) == 0) {
		kqr->tr_thread = THREAD_NULL;
		kqr->tr_state = WORKQ_TR_STATE_IDLE;
	}
	kqwl->kqwl_state &= ~KQ_R2K_ARMED;
}
6323
6324 static void
6325 kqworkloop_unbind_delayed_override_drop(thread_t thread)
6326 {
6327 struct uthread *ut = get_bsdthread_info(thread);
6328 if (!workq_thread_is_permanently_bound(ut)) {
6329 assert(ut->uu_kqr_bound == NULL);
6330 }
6331 if (ut->uu_kqueue_override != THREAD_QOS_UNSPECIFIED) {
6332 thread_drop_servicer_override(thread);
6333 ut->uu_kqueue_override = THREAD_QOS_UNSPECIFIED;
6334 }
6335 }
6336
6337 /*
6338 * kqworkloop_unbind - Unbind the servicer thread of a workloop kqueue
6339 *
6340 * It will acknowledge events, and possibly request a new thread if:
6341 * - there were active events left
6342 * - we pended waitq hook callouts during processing
6343 * - we pended wakeups while processing (or unsuppressing)
6344 *
6345 * Called with kqueue lock held.
6346 */
6347 static void
6348 kqworkloop_unbind(struct kqworkloop *kqwl)
6349 {
6350 struct kqueue *kq = &kqwl->kqwl_kqueue;
6351 workq_threadreq_t kqr = &kqwl->kqwl_request;
6352 thread_t thread = kqr_thread_fast(kqr);
6353 int op = KQWL_UTQ_PARKING;
6354 kq_index_t qos_override = THREAD_QOS_UNSPECIFIED;
6355
6356 /*
6357 * For kqwl permanently bound to a thread, this path is only
6358 * exercised when the thread is on its way to terminate.
6359 * We don't care about asking for a new thread in that case.
6360 */
6361 bool kqwl_had_bound_thread = kqr_thread_permanently_bound(kqr);
6362
6363 assert(thread == current_thread());
6364
6365 kqlock(kqwl);
6366
6367 if (!kqwl_had_bound_thread) {
6368 /*
6369 * Forcing the KQ_PROCESSING flag allows for QoS updates because of
6370 * unsuppressing knotes not to be applied until the eventual call to
6371 * kqworkloop_update_threads_qos() below.
6372 */
6373 assert((kq->kq_state & KQ_PROCESSING) == 0);
6374 if (!TAILQ_EMPTY(&kqwl->kqwl_suppressed)) {
6375 kq->kq_state |= KQ_PROCESSING;
6376 qos_override = kqworkloop_acknowledge_events(kqwl);
6377 kq->kq_state &= ~KQ_PROCESSING;
6378 }
6379 }
6380
6381 kqworkloop_unbind_locked(kqwl, thread, KQWL_OVERRIDE_DROP_DELAYED, 0);
6382
6383 if (!kqwl_had_bound_thread) {
6384 kqworkloop_update_threads_qos(kqwl, op, qos_override);
6385 }
6386
6387 kqunlock(kqwl);
6388
6389 /*
6390 * Drop the override on the current thread last, after the call to
6391 * kqworkloop_update_threads_qos above.
6392 */
6393 kqworkloop_unbind_delayed_override_drop(thread);
6394
6395 /* If last reference, dealloc the workloop kq */
6396 kqworkloop_release(kqwl);
6397 }
6398
/*
 * kqworkq_unbind_locked - detach `thread` from a workq kqueue bucket.
 *
 * Resets the thread request to idle and clears its kevent override; the
 * previous override index is returned so the caller can drop the matching
 * thread override once locks are released. Called with the kqlock held.
 */
static thread_qos_t
kqworkq_unbind_locked(struct kqworkq *kqwq,
    workq_threadreq_t kqr, thread_t thread)
{
	struct uthread *ut = get_bsdthread_info(thread);
	kq_index_t old_override = kqr->tr_kq_override_index;

	/* trace before tr_thread is cleared below */
	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWQ_UNBIND), -1,
	    thread_tid(kqr_thread(kqr)), kqr->tr_kq_qos_index, 0);

	kqlock_held(kqwq);

	assert(ut->uu_kqr_bound == kqr);
	ut->uu_kqr_bound = NULL;
	kqr->tr_thread = THREAD_NULL;
	kqr->tr_state = WORKQ_TR_STATE_IDLE;
	kqr->tr_kq_override_index = THREAD_QOS_UNSPECIFIED;
	kqwq->kqwq_state &= ~KQ_R2K_ARMED;

	return old_override;
}
6420
6421 /*
6422 * kqworkq_unbind - unbind of a workq kqueue from a thread
6423 *
6424 * We may have to request new threads.
6425 * This can happen there are no waiting processing threads and:
6426 * - there were active events we never got to (count > 0)
6427 * - we pended waitq hook callouts during processing
6428 * - we pended wakeups while processing (or unsuppressing)
6429 */
6430 static void
6431 kqworkq_unbind(proc_t p, workq_threadreq_t kqr)
6432 {
6433 struct kqworkq *kqwq = (struct kqworkq *)p->p_fd.fd_wqkqueue;
6434 __assert_only int rc;
6435
6436 kqlock(kqwq);
6437 rc = kqworkq_acknowledge_events(kqwq, kqr, 0, KQWQAE_UNBIND);
6438 assert(rc == -1);
6439 kqunlock(kqwq);
6440 }
6441
/*
 * kqworkq_get_request - return the thread request for a QoS bucket.
 *
 * qos_index is 1-based; the backing array is 0-based.
 */
workq_threadreq_t
kqworkq_get_request(struct kqworkq *kqwq, kq_index_t qos_index)
{
	assert(qos_index > 0 && qos_index <= KQWQ_NBUCKETS);
	return &kqwq->kqwq_request[qos_index - 1];
}
6448
6449 static void
6450 knote_reset_priority(kqueue_t kqu, struct knote *kn, pthread_priority_t pp)
6451 {
6452 kq_index_t qos = _pthread_priority_thread_qos(pp);
6453
6454 if (kqu.kq->kq_state & KQ_WORKLOOP) {
6455 assert((pp & _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG) == 0);
6456 pp = _pthread_priority_normalize(pp);
6457 } else if (kqu.kq->kq_state & KQ_WORKQ) {
6458 if (qos == THREAD_QOS_UNSPECIFIED) {
6459 /* On workqueues, outside of QoS means MANAGER */
6460 qos = KQWQ_QOS_MANAGER;
6461 pp = _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG;
6462 } else {
6463 pp = _pthread_priority_normalize(pp);
6464 }
6465 } else {
6466 pp = _pthread_unspecified_priority();
6467 qos = THREAD_QOS_UNSPECIFIED;
6468 }
6469
6470 kn->kn_qos = (int32_t)pp;
6471
6472 if ((kn->kn_status & KN_MERGE_QOS) == 0 || qos > kn->kn_qos_override) {
6473 /* Never lower QoS when in "Merge" mode */
6474 kn->kn_qos_override = qos;
6475 }
6476
6477 /* only adjust in-use qos index when not suppressed */
6478 if (kn->kn_status & KN_SUPPRESSED) {
6479 kqueue_update_override(kqu, kn, qos);
6480 } else if (kn->kn_qos_index != qos) {
6481 knote_dequeue(kqu, kn);
6482 kn->kn_qos_index = qos;
6483 }
6484 }
6485
/*
 * knote_adjust_qos - apply a filter-requested QoS change to a knote.
 *
 * `result` is a filter return value carrying FILTER_ADJUST_EVENT_QOS_BIT
 * and the requested QoS in the FILTER_ADJUST_EVENT_QOS_SHIFT field.
 * Called with the kqlock held.
 */
static void
knote_adjust_qos(struct kqueue *kq, struct knote *kn, int result)
{
	thread_qos_t qos_index = (result >> FILTER_ADJUST_EVENT_QOS_SHIFT) & 7;

	kqlock_held(kq);

	assert(result & FILTER_ADJUST_EVENT_QOS_BIT);
	assert(qos_index < THREAD_QOS_LAST);

	/*
	 * Early exit for knotes that should not change QoS
	 */
	if (__improbable(!knote_fops(kn)->f_adjusts_qos)) {
		panic("filter %d cannot change QoS", kn->kn_filtid);
	} else if (__improbable(!knote_has_qos(kn))) {
		return;
	}

	/*
	 * knotes with the FALLBACK flag will only use their registration QoS if the
	 * incoming event has no QoS, else, the registration QoS acts as a floor.
	 */
	thread_qos_t req_qos = _pthread_priority_thread_qos_fast(kn->kn_qos);
	if (kn->kn_qos & _PTHREAD_PRIORITY_FALLBACK_FLAG) {
		if (qos_index == THREAD_QOS_UNSPECIFIED) {
			qos_index = req_qos;
		}
	} else {
		if (qos_index < req_qos) {
			qos_index = req_qos;
		}
	}
	if ((kn->kn_status & KN_MERGE_QOS) && (qos_index < kn->kn_qos_override)) {
		/* Never lower QoS when in "Merge" mode */
		return;
	}

	if ((kn->kn_status & KN_LOCKED) && (kn->kn_status & KN_POSTING)) {
		/*
		 * When we're trying to update the QoS override and that both an
		 * f_event() and other f_* calls are running concurrently, any of these
		 * in flight calls may want to perform overrides that aren't properly
		 * serialized with each other.
		 *
		 * The first update that observes this racy situation enters a "Merge"
		 * mode which causes subsequent override requests to saturate the
		 * override instead of replacing its value.
		 *
		 * This mode is left when knote_unlock() or knote_post()
		 * observe that no other f_* routine is in flight.
		 */
		kn->kn_status |= KN_MERGE_QOS;
	}

	/*
	 * Now apply the override if it changed.
	 */

	if (kn->kn_qos_override == qos_index) {
		return;
	}

	kn->kn_qos_override = qos_index;

	if (kn->kn_status & KN_SUPPRESSED) {
		/*
		 * For suppressed events, the kn_qos_index field cannot be touched as it
		 * allows us to know on which suppress queue the knote is for a kqworkq.
		 *
		 * Also, there's no natural push applied on the kqueues when this field
		 * changes anyway. We hence need to apply manual overrides in this case,
		 * which will be cleared when the events are later acknowledged.
		 */
		kqueue_update_override(kq, kn, qos_index);
	} else if (kn->kn_qos_index != qos_index) {
		knote_dequeue(kq, kn);
		kn->kn_qos_index = qos_index;
	}
}
6566
/*
 * klist_init - initialize a klist head to the empty list.
 */
void
klist_init(struct klist *list)
{
	SLIST_INIT(list);
}
6572
6573
6574 /*
6575 * Query/Post each knote in the object's list
6576 *
6577 * The object lock protects the list. It is assumed that the filter/event
6578 * routine for the object can determine that the object is already locked (via
6579 * the hint) and not deadlock itself.
6580 *
6581 * Autodetach is a specific contract which will detach all knotes from the
6582 * object prior to posting the final event for that knote. This is done while
6583 * under the object lock. A breadcrumb is left in the knote's next pointer to
6584 * indicate to future calls to f_detach routines that they need not reattempt
6585 * to knote_detach from the object's klist again. This is currently used by
6586 * EVFILTID_SPEC, EVFILTID_TTY, EVFILTID_PTMX
6587 *
6588 */
6589 void
6590 knote(struct klist *list, long hint, bool autodetach)
6591 {
6592 struct knote *kn;
6593 struct knote *tmp_kn;
6594 SLIST_FOREACH_SAFE(kn, list, kn_selnext, tmp_kn) {
6595 /*
6596 * We can modify the knote's next pointer since since we are holding the
6597 * object lock and the list can't be concurrently modified. Anyone
6598 * determining auto-detached-ness of a knote should take the primitive lock
6599 * to synchronize.
6600 *
6601 * Note that we do this here instead of the filter's f_event since we may
6602 * not even post the event if the knote is being dropped.
6603 */
6604 if (autodetach) {
6605 kn->kn_selnext.sle_next = KNOTE_AUTODETACHED;
6606 }
6607 knote_post(kn, hint);
6608 }
6609
6610 /* Blast away the entire klist */
6611 if (autodetach) {
6612 klist_init(list);
6613 }
6614 }
6615
6616 /*
6617 * attach a knote to the specified list. Return true if this is the first entry.
6618 * The list is protected by whatever lock the object it is associated with uses.
6619 */
6620 int
6621 knote_attach(struct klist *list, struct knote *kn)
6622 {
6623 int ret = SLIST_EMPTY(list);
6624 SLIST_INSERT_HEAD(list, kn, kn_selnext);
6625 return ret;
6626 }
6627
6628 /*
6629 * detach a knote from the specified list. Return true if that was the last
6630 * entry. The list is protected by whatever lock the object it is associated
6631 * with uses.
6632 */
6633 int
6634 knote_detach(struct klist *list, struct knote *kn)
6635 {
6636 assert(!KNOTE_IS_AUTODETACHED(kn));
6637
6638 SLIST_REMOVE(list, kn, knote, kn_selnext);
6639 return SLIST_EMPTY(list);
6640 }
6641
6642 /*
6643 * knote_vanish - Indicate that the source has vanished
6644 *
6645 * Used only for vanishing ports - vanishing fds go
6646 * through knote_fdclose()
6647 *
6648 * If the knote has requested EV_VANISHED delivery,
6649 * arrange for that. Otherwise, deliver a NOTE_REVOKE
6650 * event for backward compatibility.
6651 *
6652 * The knote is marked as having vanished. The source's
6653 * reference to the knote is dropped by caller, but the knote's
6654 * source reference is only cleaned up later when the knote is dropped.
6655 *
6656 * Our caller already has the object lock held. Calling
6657 * the detach routine would try to take that lock
6658 * recursively - which likely is not supported.
6659 */
6660 void
6661 knote_vanish(struct klist *list, bool make_active)
6662 {
6663 struct knote *kn;
6664 struct knote *kn_next;
6665
6666 SLIST_FOREACH_SAFE(kn, list, kn_selnext, kn_next) {
6667 struct kqueue *kq = knote_get_kq(kn);
6668
6669 kqlock(kq);
6670 if (__probable(kn->kn_status & KN_REQVANISH)) {
6671 /*
6672 * If EV_VANISH supported - prepare to deliver one
6673 */
6674 kn->kn_status |= KN_VANISHED;
6675 } else {
6676 /*
6677 * Handle the legacy way to indicate that the port/portset was
6678 * deallocated or left the current Mach portspace (modern technique
6679 * is with an EV_VANISHED protocol).
6680 *
6681 * Deliver an EV_EOF event for these changes (hopefully it will get
6682 * delivered before the port name recycles to the same generation
6683 * count and someone tries to re-register a kevent for it or the
6684 * events are udata-specific - avoiding a conflict).
6685 */
6686 kn->kn_flags |= EV_EOF | EV_ONESHOT;
6687 }
6688 if (make_active) {
6689 knote_activate(kq, kn, FILTER_ACTIVE);
6690 }
6691 kqunlock(kq);
6692 }
6693 }
6694
6695 /*
6696 * remove all knotes referencing a specified fd
6697 *
6698 * Entered with the proc_fd lock already held.
6699 * It returns the same way, but may drop it temporarily.
6700 */
6701 void
6702 knote_fdclose(struct proc *p, int fd)
6703 {
6704 struct filedesc *fdt = &p->p_fd;
6705 struct klist *list;
6706 struct knote *kn;
6707 KNOTE_LOCK_CTX(knlc);
6708
6709 restart:
6710 list = &fdt->fd_knlist[fd];
6711 SLIST_FOREACH(kn, list, kn_link) {
6712 struct kqueue *kq = knote_get_kq(kn);
6713
6714 kqlock(kq);
6715
6716 if (kq->kq_p != p) {
6717 panic("%s: proc mismatch (kq->kq_p=%p != p=%p)",
6718 __func__, kq->kq_p, p);
6719 }
6720
6721 /*
6722 * If the knote supports EV_VANISHED delivery,
6723 * transition it to vanished mode (or skip over
6724 * it if already vanished).
6725 */
6726 if (kn->kn_status & KN_VANISHED) {
6727 kqunlock(kq);
6728 continue;
6729 }
6730
6731 proc_fdunlock(p);
6732 if (!knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ON_SUCCESS)) {
6733 /* the knote was dropped by someone, nothing to do */
6734 } else if (kn->kn_status & KN_REQVANISH) {
6735 /*
6736 * Since we have REQVANISH for this knote, we need to notify clients about
6737 * the EV_VANISHED.
6738 *
6739 * But unlike mach ports, we want to do the detach here as well and not
6740 * defer it so that we can release the iocount that is on the knote and
6741 * close the fp.
6742 */
6743 kn->kn_status |= KN_VANISHED;
6744
6745 /*
6746 * There may be a concurrent post happening, make sure to wait for it
6747 * before we detach. knote_wait_for_post() unlocks on kq on exit
6748 */
6749 knote_wait_for_post(kq, kn);
6750
6751 knote_fops(kn)->f_detach(kn);
6752 if (kn->kn_is_fd) {
6753 fp_drop(p, (int)kn->kn_id, kn->kn_fp, 0);
6754 }
6755 kn->kn_filtid = EVFILTID_DETACHED;
6756 kqlock(kq);
6757
6758 knote_activate(kq, kn, FILTER_ACTIVE);
6759 knote_unlock(kq, kn, &knlc, KNOTE_KQ_UNLOCK);
6760 } else {
6761 knote_drop(kq, kn, &knlc);
6762 }
6763
6764 proc_fdlock(p);
6765 goto restart;
6766 }
6767 }
6768
6769 /*
6770 * knote_fdfind - lookup a knote in the fd table for process
6771 *
6772 * If the filter is file-based, lookup based on fd index.
6773 * Otherwise use a hash based on the ident.
6774 *
6775 * Matching is based on kq, filter, and ident. Optionally,
6776 * it may also be based on the udata field in the kevent -
6777 * allowing multiple event registration for the file object
6778 * per kqueue.
6779 *
6780 * fd_knhashlock or fdlock held on entry (and exit)
6781 */
6782 static struct knote *
6783 knote_fdfind(struct kqueue *kq,
6784 const struct kevent_internal_s *kev,
6785 bool is_fd,
6786 struct proc *p)
6787 {
6788 struct filedesc *fdp = &p->p_fd;
6789 struct klist *list = NULL;
6790 struct knote *kn = NULL;
6791
6792 /*
6793 * determine where to look for the knote
6794 */
6795 if (is_fd) {
6796 /* fd-based knotes are linked off the fd table */
6797 if (kev->kei_ident < (u_int)fdp->fd_knlistsize) {
6798 list = &fdp->fd_knlist[kev->kei_ident];
6799 }
6800 } else if (fdp->fd_knhashmask != 0) {
6801 /* hash non-fd knotes here too */
6802 list = &fdp->fd_knhash[KN_HASH((u_long)kev->kei_ident, fdp->fd_knhashmask)];
6803 }
6804
6805 /*
6806 * scan the selected list looking for a match
6807 */
6808 if (list != NULL) {
6809 SLIST_FOREACH(kn, list, kn_link) {
6810 if (kq == knote_get_kq(kn) &&
6811 kev->kei_ident == kn->kn_id &&
6812 kev->kei_filter == kn->kn_filter) {
6813 if (kev->kei_flags & EV_UDATA_SPECIFIC) {
6814 if ((kn->kn_flags & EV_UDATA_SPECIFIC) &&
6815 kev->kei_udata == kn->kn_udata) {
6816 break; /* matching udata-specific knote */
6817 }
6818 } else if ((kn->kn_flags & EV_UDATA_SPECIFIC) == 0) {
6819 break; /* matching non-udata-specific knote */
6820 }
6821 }
6822 }
6823 }
6824 return kn;
6825 }
6826
6827 /*
6828 * kq_add_knote- Add knote to the fd table for process
6829 * while checking for duplicates.
6830 *
6831 * All file-based filters associate a list of knotes by file
6832 * descriptor index. All other filters hash the knote by ident.
6833 *
6834 * May have to grow the table of knote lists to cover the
6835 * file descriptor index presented.
6836 *
6837 * fd_knhashlock and fdlock unheld on entry (and exit).
6838 *
6839 * Takes a rwlock boost if inserting the knote is successful.
6840 */
6841 static int
6842 kq_add_knote(struct kqueue *kq, struct knote *kn, struct knote_lock_ctx *knlc,
6843 struct proc *p)
6844 {
6845 struct filedesc *fdp = &p->p_fd;
6846 struct klist *list = NULL;
6847 int ret = 0;
6848 bool is_fd = kn->kn_is_fd;
6849
6850 if (is_fd) {
6851 proc_fdlock(p);
6852 } else {
6853 knhash_lock(fdp);
6854 }
6855
6856 if (knote_fdfind(kq, &kn->kn_kevent, is_fd, p) != NULL) {
6857 /* found an existing knote: we can't add this one */
6858 ret = ERESTART;
6859 goto out_locked;
6860 }
6861
6862 /* knote was not found: add it now */
6863 if (!is_fd) {
6864 if (fdp->fd_knhashmask == 0) {
6865 u_long size = 0;
6866
6867 list = hashinit(CONFIG_KN_HASHSIZE, M_KQUEUE, &size);
6868 if (list == NULL) {
6869 ret = ENOMEM;
6870 goto out_locked;
6871 }
6872
6873 fdp->fd_knhash = list;
6874 fdp->fd_knhashmask = size;
6875 }
6876
6877 list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];
6878 SLIST_INSERT_HEAD(list, kn, kn_link);
6879 ret = 0;
6880 goto out_locked;
6881 } else {
6882 /* knote is fd based */
6883
6884 if ((u_int)fdp->fd_knlistsize <= kn->kn_id) {
6885 u_int size = 0;
6886
6887 /* Make sure that fd stays below current process's soft limit AND system allowed per-process limits */
6888 if (kn->kn_id >= (uint64_t)proc_limitgetcur_nofile(p)) {
6889 ret = EINVAL;
6890 goto out_locked;
6891 }
6892 /* have to grow the fd_knlist */
6893 size = fdp->fd_knlistsize;
6894 while (size <= kn->kn_id) {
6895 size += KQEXTENT;
6896 }
6897
6898 if (size >= (UINT_MAX / sizeof(struct klist))) {
6899 ret = EINVAL;
6900 goto out_locked;
6901 }
6902
6903 list = kalloc_type(struct klist, size, Z_WAITOK | Z_ZERO);
6904 if (list == NULL) {
6905 ret = ENOMEM;
6906 goto out_locked;
6907 }
6908
6909 bcopy(fdp->fd_knlist, list,
6910 fdp->fd_knlistsize * sizeof(struct klist));
6911 kfree_type(struct klist, fdp->fd_knlistsize, fdp->fd_knlist);
6912 fdp->fd_knlist = list;
6913 fdp->fd_knlistsize = size;
6914 }
6915
6916 list = &fdp->fd_knlist[kn->kn_id];
6917 SLIST_INSERT_HEAD(list, kn, kn_link);
6918 ret = 0;
6919 goto out_locked;
6920 }
6921
6922 out_locked:
6923 if (ret == 0) {
6924 kqlock(kq);
6925 assert((kn->kn_status & KN_LOCKED) == 0);
6926 (void)knote_lock(kq, kn, knlc, KNOTE_KQ_UNLOCK);
6927 kqueue_retain(kq); /* retain a kq ref */
6928 }
6929 if (is_fd) {
6930 proc_fdunlock(p);
6931 } else {
6932 knhash_unlock(fdp);
6933 }
6934
6935 return ret;
6936 }
6937
6938 /*
6939 * kq_remove_knote - remove a knote from the fd table for process
6940 *
6941 * If the filter is file-based, remove based on fd index.
6942 * Otherwise remove from the hash based on the ident.
6943 *
6944 * fd_knhashlock and fdlock unheld on entry (and exit).
6945 */
6946 static void
6947 kq_remove_knote(struct kqueue *kq, struct knote *kn, struct proc *p,
6948 struct knote_lock_ctx *knlc)
6949 {
6950 struct filedesc *fdp = &p->p_fd;
6951 struct klist *list = NULL;
6952 uint16_t kq_state;
6953 bool is_fd = kn->kn_is_fd;
6954
6955 if (is_fd) {
6956 proc_fdlock(p);
6957 } else {
6958 knhash_lock(fdp);
6959 }
6960
6961 if (is_fd) {
6962 assert((u_int)fdp->fd_knlistsize > kn->kn_id);
6963 list = &fdp->fd_knlist[kn->kn_id];
6964 } else {
6965 list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];
6966 }
6967 SLIST_REMOVE(list, kn, knote, kn_link);
6968
6969 kqlock(kq);
6970
6971 /* Update the servicer iotier override */
6972 kqueue_update_iotier_override(kq);
6973
6974 kq_state = kq->kq_state;
6975 if (knlc) {
6976 knote_unlock_cancel(kq, kn, knlc);
6977 } else {
6978 kqunlock(kq);
6979 }
6980 if (is_fd) {
6981 proc_fdunlock(p);
6982 } else {
6983 knhash_unlock(fdp);
6984 }
6985
6986 if (kq_state & KQ_DYNAMIC) {
6987 kqworkloop_release((struct kqworkloop *)kq);
6988 }
6989 }
6990
6991 /*
6992 * kq_find_knote_and_kq_lock - lookup a knote in the fd table for process
6993 * and, if the knote is found, acquires the kqlock while holding the fd table lock/spinlock.
6994 *
6995 * fd_knhashlock or fdlock unheld on entry (and exit)
6996 */
6997
6998 static struct knote *
6999 kq_find_knote_and_kq_lock(struct kqueue *kq, struct kevent_qos_s *kev,
7000 bool is_fd, struct proc *p)
7001 {
7002 struct filedesc *fdp = &p->p_fd;
7003 struct knote *kn;
7004
7005 if (is_fd) {
7006 proc_fdlock(p);
7007 } else {
7008 knhash_lock(fdp);
7009 }
7010
7011 /*
7012 * Temporary horrible hack:
7013 * this cast is gross and will go away in a future change.
7014 * It is OK to do because we don't look at xflags/s_fflags,
7015 * and that when we cast down the kev this way,
7016 * the truncated filter field works.
7017 */
7018 kn = knote_fdfind(kq, (struct kevent_internal_s *)kev, is_fd, p);
7019
7020 if (kn) {
7021 kqlock(kq);
7022 assert(knote_get_kq(kn) == kq);
7023 }
7024
7025 if (is_fd) {
7026 proc_fdunlock(p);
7027 } else {
7028 knhash_unlock(fdp);
7029 }
7030
7031 return kn;
7032 }
7033
7034 static struct kqtailq *
7035 knote_get_tailq(kqueue_t kqu, struct knote *kn)
7036 {
7037 kq_index_t qos_index = kn->kn_qos_index;
7038
7039 if (kqu.kq->kq_state & KQ_WORKLOOP) {
7040 assert(qos_index > 0 && qos_index <= KQWL_NBUCKETS);
7041 return &kqu.kqwl->kqwl_queue[qos_index - 1];
7042 } else if (kqu.kq->kq_state & KQ_WORKQ) {
7043 assert(qos_index > 0 && qos_index <= KQWQ_NBUCKETS);
7044 return &kqu.kqwq->kqwq_queue[qos_index - 1];
7045 } else {
7046 assert(qos_index == QOS_INDEX_KQFILE);
7047 return &kqu.kqf->kqf_queue;
7048 }
7049 }
7050
7051 static void
7052 knote_enqueue(kqueue_t kqu, struct knote *kn)
7053 {
7054 kqlock_held(kqu);
7055
7056 if ((kn->kn_status & KN_ACTIVE) == 0) {
7057 return;
7058 }
7059
7060 if (kn->kn_status & (KN_DISABLED | KN_SUPPRESSED | KN_DROPPING | KN_QUEUED)) {
7061 return;
7062 }
7063
7064 struct kqtailq *queue = knote_get_tailq(kqu, kn);
7065 bool wakeup = TAILQ_EMPTY(queue);
7066
7067 TAILQ_INSERT_TAIL(queue, kn, kn_tqe);
7068 kn->kn_status |= KN_QUEUED;
7069 kqu.kq->kq_count++;
7070
7071 if (wakeup) {
7072 if (kqu.kq->kq_state & KQ_WORKLOOP) {
7073 kqworkloop_wakeup(kqu.kqwl, kn->kn_qos_index);
7074 } else if (kqu.kq->kq_state & KQ_WORKQ) {
7075 kqworkq_wakeup(kqu.kqwq, kn->kn_qos_index);
7076 } else {
7077 kqfile_wakeup(kqu.kqf, 0, THREAD_AWAKENED);
7078 }
7079 }
7080 }
7081
/*
 * Remove a knote from its kqueue's event queue, if it is queued.
 *
 * May be called on an unqueued knote without the kqlock held (see the
 * note below about knote_reset_priority()); when the knote is queued,
 * the kqlock must be held.
 */
__attribute__((always_inline))
static inline void
knote_dequeue(kqueue_t kqu, struct knote *kn)
{
	if (kn->kn_status & KN_QUEUED) {
		struct kqtailq *queue = knote_get_tailq(kqu, kn);

		// attaching the knote calls knote_reset_priority() without
		// the kqlock which is fine, so we can't call kqlock_held()
		// if we're not queued.
		kqlock_held(kqu);

		TAILQ_REMOVE(queue, kn, kn_tqe);
		kn->kn_status &= ~KN_QUEUED;
		kqu.kq->kq_count--;
		/*
		 * kqfiles have a single queue, so kq_count reaching zero
		 * must coincide with that queue being empty.
		 */
		if ((kqu.kq->kq_state & (KQ_WORKQ | KQ_WORKLOOP)) == 0) {
			assert((kqu.kq->kq_count == 0) ==
			    (bool)TAILQ_EMPTY(queue));
		}
	}
}
7103
/*
 * Move an active, queued knote onto the kqueue's suppression queue
 * while it is being processed/delivered.
 *
 * The knote is dequeued and deactivated so that any new activation
 * arriving while it is suppressed is noticed as a fresh wakeup.
 *
 * Called with the kqueue lock held.
 */
static void
knote_suppress(kqueue_t kqu, struct knote *kn)
{
	struct kqtailq *suppressq;

	kqlock_held(kqu);

	assert((kn->kn_status & KN_SUPPRESSED) == 0);
	assert(kn->kn_status & KN_QUEUED);

	knote_dequeue(kqu, kn);
	/* deactivate - so new activations indicate a wakeup */
	kn->kn_status &= ~KN_ACTIVE;
	kn->kn_status |= KN_SUPPRESSED;
	suppressq = kqueue_get_suppressed_queue(kqu, kn);
	TAILQ_INSERT_TAIL(suppressq, kn, kn_tqe);
}
7122
/*
 * Remove a knote from its kqueue's suppression queue without
 * re-enqueueing it on an event queue (knote_unsuppress() is the
 * variant that also re-enqueues).
 *
 * Called with the kqueue lock held.
 */
__attribute__((always_inline))
static inline void
knote_unsuppress_noqueue(kqueue_t kqu, struct knote *kn)
{
	struct kqtailq *suppressq;

	kqlock_held(kqu);

	assert(kn->kn_status & KN_SUPPRESSED);

	kn->kn_status &= ~KN_SUPPRESSED;
	suppressq = kqueue_get_suppressed_queue(kqu, kn);
	TAILQ_REMOVE(suppressq, kn, kn_tqe);

	/*
	 * If the knote is no longer active, reset its push,
	 * and resynchronize kn_qos_index with kn_qos_override
	 * for knotes with a real qos.
	 */
	if ((kn->kn_status & KN_ACTIVE) == 0 && knote_has_qos(kn)) {
		kn->kn_qos_override = _pthread_priority_thread_qos_fast(kn->kn_qos);
	}
	kn->kn_qos_index = kn->kn_qos_override;
}
7147
/*
 * Unsuppress a knote and re-enqueue it (knote_enqueue() only queues
 * it if it is still active and deliverable).
 *
 * Called with the kqueue lock held.
 */
static void
knote_unsuppress(kqueue_t kqu, struct knote *kn)
{
	knote_unsuppress_noqueue(kqu, kn);
	knote_enqueue(kqu, kn);
}
7155
7156 __attribute__((always_inline))
7157 static inline void
7158 knote_mark_active(struct knote *kn)
7159 {
7160 if ((kn->kn_status & KN_ACTIVE) == 0) {
7161 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KNOTE_ACTIVATE),
7162 kn->kn_udata, kn->kn_status | (kn->kn_id << 32),
7163 kn->kn_filtid);
7164 }
7165
7166 kn->kn_status |= KN_ACTIVE;
7167 }
7168
/*
 * Mark a knote active and enqueue it for delivery, first applying any
 * QoS adjustment the filter requested through its result bits.
 *
 * Called with the kqueue lock held.
 */
static void
knote_activate(kqueue_t kqu, struct knote *kn, int result)
{
	assert(result & FILTER_ACTIVE);
	if (result & FILTER_ADJUST_EVENT_QOS_BIT) {
		// may dequeue the knote
		knote_adjust_qos(kqu.kq, kn, result);
	}
	knote_mark_active(kn);
	knote_enqueue(kqu, kn);
}
7181
/*
 * This function applies changes requested by f_attach or f_touch for
 * a given filter. It proceeds in a carefully chosen order to help
 * every single transition do the minimal amount of work possible.
 */
static void
knote_apply_touch(kqueue_t kqu, struct knote *kn, struct kevent_qos_s *kev,
    int result)
{
	/* EV_ENABLE on a disabled knote: clear disabled state first */
	if ((kev->flags & EV_ENABLE) && (kn->kn_status & KN_DISABLED)) {
		kn->kn_status &= ~KN_DISABLED;

		/*
		 * it is possible for userland to have knotes registered for a given
		 * workloop `wl_orig` but really handled on another workloop `wl_new`.
		 *
		 * In that case, rearming will happen from the servicer thread of
		 * `wl_new` which if `wl_orig` is no longer being serviced, would cause
		 * this knote to stay suppressed forever if we only relied on
		 * kqworkloop_acknowledge_events to be called by `wl_orig`.
		 *
		 * However if we see the KQ_PROCESSING bit on `wl_orig` set, we can't
		 * unsuppress because that would mess with the processing phase of
		 * `wl_orig`, however it also means kqworkloop_acknowledge_events()
		 * will be called.
		 */
		if (__improbable(kn->kn_status & KN_SUPPRESSED)) {
			if ((kqu.kq->kq_state & KQ_PROCESSING) == 0) {
				knote_unsuppress_noqueue(kqu, kn);
			}
		}
	}

	if (result & FILTER_ADJUST_EVENT_IOTIER_BIT) {
		kqueue_update_iotier_override(kqu);
	}

	if ((result & FILTER_UPDATE_REQ_QOS) && kev->qos && kev->qos != kn->kn_qos) {
		// may dequeue the knote
		knote_reset_priority(kqu, kn, kev->qos);
	}

	/*
	 * When we unsuppress above, or because of knote_reset_priority(),
	 * the knote may have been dequeued, we need to restore the invariant
	 * that if the knote is active it needs to be queued now that
	 * we're done applying changes.
	 */
	if (result & FILTER_ACTIVE) {
		knote_activate(kqu, kn, result);
	} else {
		knote_enqueue(kqu, kn);
	}

	/* filter deferred a thread-request redrive to us; do it now */
	if ((result & FILTER_THREADREQ_NODEFEER) &&
	    act_clear_astkevent(current_thread(), AST_KEVENT_REDRIVE_THREADREQ)) {
		workq_kern_threadreq_redrive(kqu.kq->kq_p, WORKQ_THREADREQ_NONE);
	}
}
7241
/*
 * knote_drop - disconnect and drop the knote
 *
 * Called with the kqueue locked, returns with the kqueue unlocked.
 *
 * If a knote locking context is passed, it is canceled.
 *
 * The knote may have already been detached from
 * (or not yet attached to) its source object.
 */
static void
knote_drop(struct kqueue *kq, struct knote *kn, struct knote_lock_ctx *knlc)
{
	struct proc *p = kq->kq_p;

	kqlock_held(kq);

	assert((kn->kn_status & KN_DROPPING) == 0);
	if (knlc == NULL) {
		assert((kn->kn_status & KN_LOCKED) == 0);
	}
	/* prevent concurrent enqueues/drops while we tear down */
	kn->kn_status |= KN_DROPPING;

	if (kn->kn_status & KN_SUPPRESSED) {
		knote_unsuppress_noqueue(kq, kn);
	} else {
		knote_dequeue(kq, kn);
	}
	/* wait for any in-flight f_post of this knote to finish */
	knote_wait_for_post(kq, kn);

	/* Even if we are autodetached, the filter may need to do cleanups of any
	 * stuff stashed on the knote so always make the call and let each filter
	 * handle the possibility of autodetached-ness */
	knote_fops(kn)->f_detach(kn);

	/* kq may be freed when kq_remove_knote() returns */
	kq_remove_knote(kq, kn, p, knlc);
	/* fd-based knotes hold a fileproc reference unless they vanished */
	if (kn->kn_is_fd && ((kn->kn_status & KN_VANISHED) == 0)) {
		fp_drop(p, (int)kn->kn_id, kn->kn_fp, 0);
	}

	knote_free(kn);
}
7285
/*
 * One-time knote subsystem initialization, run at boot via SYSINIT.
 */
void
knote_init(void)
{
#if CONFIG_MEMORYSTATUS
	/* Initialize the memorystatus list lock */
	memorystatus_kevent_init(&kq_lck_grp, LCK_ATTR_NULL);
#endif
}
SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL);
7295
/*
 * Return the filter operations vector for a knote, looked up by its
 * system filter id.
 */
const struct filterops *
knote_fops(struct knote *kn)
{
	return sysfilt_ops[kn->kn_filtid];
}
7301
/*
 * Allocate a zero-filled knote from the knote zone.
 * Z_WAITOK | Z_NOFAIL: may block, never returns NULL.
 */
static struct knote *
knote_alloc(void)
{
	return zalloc_flags(knote_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);
}
7307
/*
 * Return a knote to its zone.  The knote must no longer be locked nor
 * in the middle of posting.
 */
static void
knote_free(struct knote *kn)
{
	assert((kn->kn_status & (KN_LOCKED | KN_POSTING)) == 0);
	zfree(knote_zone, kn);
}
7314
7315 #pragma mark - syscalls: kevent, kevent64, kevent_qos, kevent_id
7316
/*
 * Return the per-thread kevent context, stored in the uthread's
 * uu_save area.
 */
kevent_ctx_t
kevent_get_context(thread_t thread)
{
	uthread_t ut = get_bsdthread_info(thread);
	return &ut->uu_save.uus_kevent;
}
7323
7324 static inline bool
7325 kevent_args_requesting_events(unsigned int flags, int nevents)
7326 {
7327 return !(flags & KEVENT_FLAG_ERROR_EVENTS) && nevents > 0;
7328 }
7329
/*
 * Augment the user-supplied kevent flags with KEVENT_FLAG_PROC64 when
 * the calling process is 64-bit, so copyin/copyout use the right ABI.
 */
static inline int
kevent_adjust_flags_for_proc(proc_t p, int flags)
{
	__builtin_assume(p);
	return flags | (IS_64BIT_PROCESS(p) ? KEVENT_FLAG_PROC64 : 0);
}
7336
/*!
 * @function kevent_get_kqfile
 *
 * @brief
 * Lookup a kqfile by fd.
 *
 * @discussion
 * Callers: kevent, kevent64, kevent_qos
 *
 * This is not assumed to be a fastpath (kqfile interfaces are legacy)
 */
OS_NOINLINE
static int
kevent_get_kqfile(struct proc *p, int fd, int flags,
    struct fileproc **fpp, struct kqueue **kqp)
{
	int error = 0;
	struct kqueue *kq;

	error = fp_get_ftype(p, fd, DTYPE_KQUEUE, EBADF, fpp);
	if (__improbable(error)) {
		return error;
	}
	kq = (struct kqueue *)fp_get_data((*fpp));

	uint16_t kq_state = os_atomic_load(&kq->kq_state, relaxed);
	if (__improbable((kq_state & (KQ_KEV32 | KQ_KEV64 | KQ_KEV_QOS)) == 0)) {
		/*
		 * First use of this kqfile: latch the event structure flavor
		 * (32-bit, 64-bit or qos) under the kqlock, so racing first
		 * users agree on a single flavor.
		 */
		kqlock(kq);
		kq_state = kq->kq_state;
		if (!(kq_state & (KQ_KEV32 | KQ_KEV64 | KQ_KEV_QOS))) {
			if (flags & KEVENT_FLAG_LEGACY32) {
				kq_state |= KQ_KEV32;
			} else if (flags & KEVENT_FLAG_LEGACY64) {
				kq_state |= KQ_KEV64;
			} else {
				kq_state |= KQ_KEV_QOS;
			}
			kq->kq_state = kq_state;
		}
		kqunlock(kq);
	}

	/*
	 * kqfiles can't be used through the legacy kevent()
	 * and other interfaces at the same time.
	 */
	if (__improbable((bool)(flags & KEVENT_FLAG_LEGACY32) !=
	    (bool)(kq_state & KQ_KEV32))) {
		fp_drop(p, fd, *fpp, 0);
		return EINVAL;
	}

	*kqp = kq;
	return 0;
}
7392
/*!
 * @function kevent_get_kqwq
 *
 * @brief
 * Lookup or create the process kqwq (fastpath).
 *
 * @discussion
 * Callers: kevent64, kevent_qos
 *
 * Requesting events on a workq kqueue is rejected with EINVAL;
 * events are delivered to workq threads by other means.
 */
OS_ALWAYS_INLINE
static int
kevent_get_kqwq(proc_t p, int flags, int nevents, struct kqueue **kqp)
{
	struct kqworkq *kqwq = p->p_fd.fd_wqkqueue;

	if (__improbable(kevent_args_requesting_events(flags, nevents))) {
		return EINVAL;
	}
	if (__improbable(kqwq == NULL)) {
		/* first use for this process: allocate the workq kqueue */
		kqwq = kqworkq_alloc(p, flags);
		if (__improbable(kqwq == NULL)) {
			return ENOMEM;
		}
	}

	*kqp = &kqwq->kqwq_kqueue;
	return 0;
}
7421
7422 #pragma mark kevent copyio
7423
7424 /*!
7425 * @function kevent_get_data_size
7426 *
7427 * @brief
7428 * Copies in the extra data size from user-space.
7429 */
7430 static int
7431 kevent_get_data_size(int flags, user_addr_t data_avail, user_addr_t data_out,
7432 kevent_ctx_t kectx)
7433 {
7434 if (!data_avail || !data_out) {
7435 kectx->kec_data_size = 0;
7436 kectx->kec_data_resid = 0;
7437 } else if (flags & KEVENT_FLAG_PROC64) {
7438 user64_size_t usize = 0;
7439 int error = copyin((user_addr_t)data_avail, &usize, sizeof(usize));
7440 if (__improbable(error)) {
7441 return error;
7442 }
7443 kectx->kec_data_resid = kectx->kec_data_size = (user_size_t)usize;
7444 } else {
7445 user32_size_t usize = 0;
7446 int error = copyin((user_addr_t)data_avail, &usize, sizeof(usize));
7447 if (__improbable(error)) {
7448 return error;
7449 }
7450 kectx->kec_data_avail = data_avail;
7451 kectx->kec_data_resid = kectx->kec_data_size = (user_size_t)usize;
7452 }
7453 kectx->kec_data_out = data_out;
7454 kectx->kec_data_avail = data_avail;
7455 return 0;
7456 }
7457
7458 /*!
7459 * @function kevent_put_data_size
7460 *
7461 * @brief
7462 * Copies out the residual data size to user-space if any has been used.
7463 */
7464 static int
7465 kevent_put_data_size(unsigned int flags, kevent_ctx_t kectx)
7466 {
7467 if (kectx->kec_data_resid == kectx->kec_data_size) {
7468 return 0;
7469 }
7470 if (flags & KEVENT_FLAG_KERNEL) {
7471 *(user_size_t *)(uintptr_t)kectx->kec_data_avail = kectx->kec_data_resid;
7472 return 0;
7473 }
7474 if (flags & KEVENT_FLAG_PROC64) {
7475 user64_size_t usize = (user64_size_t)kectx->kec_data_resid;
7476 return copyout(&usize, (user_addr_t)kectx->kec_data_avail, sizeof(usize));
7477 } else {
7478 user32_size_t usize = (user32_size_t)kectx->kec_data_resid;
7479 return copyout(&usize, (user_addr_t)kectx->kec_data_avail, sizeof(usize));
7480 }
7481 }
7482
/*!
 * @function kevent_legacy_copyin
 *
 * @brief
 * Handles the copyin of a kevent/kevent64 event.
 *
 * @discussion
 * Converts the user's ABI-specific event layout (kevent64_s,
 * user64_kevent or user32_kevent) into the internal kevent_qos_s
 * form, stripping kernel-only EV_SYSFLAGS, and advances *addrp past
 * the consumed structure.  Returns 0 or the copyin() error.
 */
static int
kevent_legacy_copyin(user_addr_t *addrp, struct kevent_qos_s *kevp, unsigned int flags)
{
	int error;

	assert((flags & (KEVENT_FLAG_LEGACY32 | KEVENT_FLAG_LEGACY64)) != 0);

	if (flags & KEVENT_FLAG_LEGACY64) {
		/* kevent64(): same layout for 32- and 64-bit processes */
		struct kevent64_s kev64;

		error = copyin(*addrp, (caddr_t)&kev64, sizeof(kev64));
		if (__improbable(error)) {
			return error;
		}
		*addrp += sizeof(kev64);
		*kevp = (struct kevent_qos_s){
			.ident = kev64.ident,
			.filter = kev64.filter,
			/* Make sure user doesn't pass in any system flags */
			.flags = kev64.flags & ~EV_SYSFLAGS,
			.udata = kev64.udata,
			.fflags = kev64.fflags,
			.data = kev64.data,
			.ext[0] = kev64.ext[0],
			.ext[1] = kev64.ext[1],
		};
	} else if (flags & KEVENT_FLAG_PROC64) {
		/* legacy kevent() from a 64-bit process */
		struct user64_kevent kev64;

		error = copyin(*addrp, (caddr_t)&kev64, sizeof(kev64));
		if (__improbable(error)) {
			return error;
		}
		*addrp += sizeof(kev64);
		*kevp = (struct kevent_qos_s){
			.ident = kev64.ident,
			.filter = kev64.filter,
			/* Make sure user doesn't pass in any system flags */
			.flags = kev64.flags & ~EV_SYSFLAGS,
			.udata = kev64.udata,
			.fflags = kev64.fflags,
			.data = kev64.data,
		};
	} else {
		/* legacy kevent() from a 32-bit process */
		struct user32_kevent kev32;

		error = copyin(*addrp, (caddr_t)&kev32, sizeof(kev32));
		if (__improbable(error)) {
			return error;
		}
		*addrp += sizeof(kev32);
		*kevp = (struct kevent_qos_s){
			.ident = (uintptr_t)kev32.ident,
			.filter = kev32.filter,
			/* Make sure user doesn't pass in any system flags */
			.flags = kev32.flags & ~EV_SYSFLAGS,
			.udata = CAST_USER_ADDR_T(kev32.udata),
			.fflags = kev32.fflags,
			.data = (intptr_t)kev32.data,
		};
	}

	return 0;
}
7553
7554 /*!
7555 * @function kevent_modern_copyin
7556 *
7557 * @brief
7558 * Handles the copyin of a kevent_qos/kevent_id event.
7559 */
7560 static int
7561 kevent_modern_copyin(user_addr_t *addrp, struct kevent_qos_s *kevp)
7562 {
7563 int error = copyin(*addrp, (caddr_t)kevp, sizeof(struct kevent_qos_s));
7564 if (__probable(!error)) {
7565 /* Make sure user doesn't pass in any system flags */
7566 *addrp += sizeof(struct kevent_qos_s);
7567 kevp->flags &= ~EV_SYSFLAGS;
7568 }
7569 return error;
7570 }
7571
/*!
 * @function kevent_legacy_copyout
 *
 * @brief
 * Handles the copyout of a kevent/kevent64 event.
 *
 * @discussion
 * Converts the internal kevent_qos_s form back into the user's
 * ABI-specific layout and advances *addrp past the written structure
 * on success.  Returns 0 or the copyout() error.
 */
static int
kevent_legacy_copyout(struct kevent_qos_s *kevp, user_addr_t *addrp, unsigned int flags)
{
	int advance;
	int error;

	assert((flags & (KEVENT_FLAG_LEGACY32 | KEVENT_FLAG_LEGACY64)) != 0);

	/*
	 * fully initialize the different output event structure
	 * types from the internal kevent (and some universal
	 * defaults for fields not represented in the internal
	 * form).
	 *
	 * Note: these structures have no padding hence the C99
	 * initializers below do not leak kernel info.
	 */
	if (flags & KEVENT_FLAG_LEGACY64) {
		struct kevent64_s kev64 = {
			.ident = kevp->ident,
			.filter = kevp->filter,
			.flags = kevp->flags,
			.fflags = kevp->fflags,
			.data = (int64_t)kevp->data,
			.udata = kevp->udata,
			.ext[0] = kevp->ext[0],
			.ext[1] = kevp->ext[1],
		};
		advance = sizeof(struct kevent64_s);
		error = copyout((caddr_t)&kev64, *addrp, advance);
	} else if (flags & KEVENT_FLAG_PROC64) {
		/*
		 * deal with the special case of a user-supplied
		 * value of (uintptr_t)-1.
		 */
		uint64_t ident = (kevp->ident == (uintptr_t)-1) ?
		    (uint64_t)-1LL : (uint64_t)kevp->ident;
		struct user64_kevent kev64 = {
			.ident = ident,
			.filter = kevp->filter,
			.flags = kevp->flags,
			.fflags = kevp->fflags,
			.data = (int64_t) kevp->data,
			.udata = (user_addr_t) kevp->udata,
		};
		advance = sizeof(kev64);
		error = copyout((caddr_t)&kev64, *addrp, advance);
	} else {
		/* 32-bit process using legacy kevent(): truncating copyout */
		struct user32_kevent kev32 = {
			.ident = (uint32_t)kevp->ident,
			.filter = kevp->filter,
			.flags = kevp->flags,
			.fflags = kevp->fflags,
			.data = (int32_t)kevp->data,
			.udata = (uint32_t)kevp->udata,
		};
		advance = sizeof(kev32);
		error = copyout((caddr_t)&kev32, *addrp, advance);
	}
	if (__probable(!error)) {
		*addrp += advance;
	}
	return error;
}
7642
7643 /*!
7644 * @function kevent_modern_copyout
7645 *
7646 * @brief
7647 * Handles the copyout of a kevent_qos/kevent_id event.
7648 */
7649 OS_ALWAYS_INLINE
7650 static inline int
7651 kevent_modern_copyout(struct kevent_qos_s *kevp, user_addr_t *addrp)
7652 {
7653 int error = copyout((caddr_t)kevp, *addrp, sizeof(struct kevent_qos_s));
7654 if (__probable(!error)) {
7655 *addrp += sizeof(struct kevent_qos_s);
7656 }
7657 return error;
7658 }
7659
7660 #pragma mark kevent core implementation
7661
/*!
 * @function kevent_callback_inline
 *
 * @brief
 * Callback for each individual event
 *
 * @discussion
 * This is meant to be inlined in kevent_modern_callback and
 * kevent_legacy_callback.
 *
 * Copies the event out to the user's event list and accounts for it.
 * Returns 0, a copyout error, or EWOULDBLOCK once the output array
 * is full (a harmless sentinel that stops further processing).
 */
OS_ALWAYS_INLINE
static inline int
kevent_callback_inline(struct kevent_qos_s *kevp, kevent_ctx_t kectx, bool legacy)
{
	int error;

	assert(kectx->kec_process_noutputs < kectx->kec_process_nevents);

	/*
	 * Copy out the appropriate amount of event data for this user.
	 */
	if (legacy) {
		error = kevent_legacy_copyout(kevp, &kectx->kec_process_eventlist,
		    kectx->kec_process_flags);
	} else {
		error = kevent_modern_copyout(kevp, &kectx->kec_process_eventlist);
	}

	/*
	 * If there isn't space for additional events, return
	 * a harmless error to stop the processing here
	 */
	if (error == 0 && ++kectx->kec_process_noutputs == kectx->kec_process_nevents) {
		error = EWOULDBLOCK;
	}
	return error;
}
7699
/*!
 * @function kevent_modern_callback
 *
 * @brief
 * Callback for each individual modern event.
 *
 * @discussion
 * This callback handles kevent_qos/kevent_id events.
 * Thin wrapper that lets kevent_callback_inline specialize away the
 * legacy path.
 */
static int
kevent_modern_callback(struct kevent_qos_s *kevp, kevent_ctx_t kectx)
{
	return kevent_callback_inline(kevp, kectx, /*legacy*/ false);
}
7714
/*!
 * @function kevent_legacy_callback
 *
 * @brief
 * Callback for each individual legacy event.
 *
 * @discussion
 * This callback handles kevent/kevent64 events.
 * Thin wrapper that lets kevent_callback_inline specialize away the
 * modern path.
 */
static int
kevent_legacy_callback(struct kevent_qos_s *kevp, kevent_ctx_t kectx)
{
	return kevent_callback_inline(kevp, kectx, /*legacy*/ true);
}
7729
/*!
 * @function kevent_cleanup
 *
 * @brief
 * Handles the cleanup returning from a kevent call.
 *
 * @discussion
 * kevent entry points will take a reference on workloops,
 * and a usecount on the fileglob of kqfiles.
 *
 * This function undoes this on the exit paths of kevents.
 *
 * @returns
 * The error to return to userspace.
 */
static int
kevent_cleanup(kqueue_t kqu, int flags, int error, kevent_ctx_t kectx)
{
	// poll should not call any codepath leading to this
	assert((flags & KEVENT_FLAG_POLL) == 0);

	/* drop whatever reference the entry point took for this kqueue kind */
	if (flags & KEVENT_FLAG_WORKLOOP) {
		kqworkloop_release(kqu.kqwl);
	} else if (flags & KEVENT_FLAG_WORKQ) {
		/* nothing held */
	} else {
		fp_drop(kqu.kqf->kqf_p, kectx->kec_fd, kectx->kec_fp, 0);
	}

	/* don't restart after signals... */
	if (error == ERESTART) {
		error = EINTR;
	} else if (error == 0) {
		/* don't abandon other output just because of residual copyout failures */
		(void)kevent_put_data_size(flags, kectx);
	}

	if (flags & KEVENT_FLAG_PARKING) {
		thread_t th = current_thread();
		struct uthread *uth = get_bsdthread_info(th);
		workq_threadreq_t kqr = uth->uu_kqr_bound;
		/* undo the base-pri freeze taken for parking (non-bound threads) */
		if (kqr && !(kqr->tr_flags & WORKQ_TR_FLAG_PERMANENT_BIND)) {
			thread_unfreeze_base_pri(th);
		}
	}
	return error;
}
7777
/*!
 * @function kqueue_process
 *
 * @brief
 * Process the triggered events in a kqueue.
 *
 * @discussion
 * Walk the queued knotes and validate that they are really still triggered
 * events by calling the filter routines (if necessary).
 *
 * For each event that is still considered triggered, invoke the callback
 * routine provided.
 *
 * caller holds a reference on the kqueue.
 * kqueue locked on entry and exit - but may be dropped
 * kqueue list locked (held for duration of call)
 *
 * This is only called by kqueue_scan() so that the compiler can inline it.
 *
 * For kqworkloops that are permanently configured with a bound thread, this
 * function parks the bound thread (instead of returning) if there are no events
 * or errors to be returned and KEVENT_FLAG_PARKING was specified.
 *
 * @returns
 * - 0: no event was returned, no other error occured
 * - EBADF: the kqueue is being destroyed (KQ_DRAIN is set)
 * - EWOULDBLOCK: (not an error) events have been found and we should return
 * - EFAULT: copyout failed
 * - filter specific errors
 */
static int
kqueue_process(kqueue_t kqu, int flags, kevent_ctx_t kectx,
    kevent_callback_t callback)
{
	workq_threadreq_t kqr = current_uthread()->uu_kqr_bound;
	struct knote *kn;
	int error = 0, rc = 0;
	struct kqtailq *base_queue, *queue;
	uint16_t kq_type = (kqu.kq->kq_state & (KQ_WORKQ | KQ_WORKLOOP));
	bool kqwl_permanently_bound = false;

	/* enter the processing phase for this kind of kqueue */
	if (kq_type & KQ_WORKQ) {
		rc = kqworkq_begin_processing(kqu.kqwq, kqr, flags);
	} else if (kq_type & KQ_WORKLOOP) {
		kqwl_permanently_bound = kqr_thread_permanently_bound(kqr);
		rc = kqworkloop_begin_processing(kqu.kqwl, flags);
	} else {
kqfile_retry:
		rc = kqfile_begin_processing(kqu.kqf);
		if (rc == EBADF) {
			return EBADF;
		}
	}

	if (rc == -1) {
		/* Nothing to process */
		if ((kq_type & KQ_WORKLOOP) && (flags & KEVENT_FLAG_PARKING) &&
		    kqwl_permanently_bound) {
			/* bound servicer with nothing to do: park, never returns */
			goto kqwl_bound_thread_park;
		}
		return 0;
	}

	/*
	 * loop through the enqueued knotes associated with this request,
	 * processing each one. Each request may have several queues
	 * of knotes to process (depending on the type of kqueue) so we
	 * have to loop through all the queues as long as we have additional
	 * space.
	 */

process_again:
	if (kq_type & KQ_WORKQ) {
		/* workq requests service exactly one QoS bucket */
		base_queue = queue = &kqu.kqwq->kqwq_queue[kqr->tr_kq_qos_index - 1];
	} else if (kq_type & KQ_WORKLOOP) {
		/* workloops drain all buckets, highest QoS first */
		base_queue = &kqu.kqwl->kqwl_queue[0];
		queue = &kqu.kqwl->kqwl_queue[KQWL_NBUCKETS - 1];
	} else {
		base_queue = queue = &kqu.kqf->kqf_queue;
	}

	do {
		while ((kn = TAILQ_FIRST(queue)) != NULL) {
			error = knote_process(kn, kectx, callback);
			if (error == EJUSTRETURN) {
				/* knote was consumed without producing an event */
				error = 0;
			} else if (__improbable(error)) {
				/* error is EWOULDBLOCK when the out event array is full */
				goto stop_processing;
			}
		}
	} while (queue-- > base_queue);

	if (kectx->kec_process_noutputs) {
		/* callers will transform this into no error */
		error = EWOULDBLOCK;
	}

stop_processing:
	/*
	 * If KEVENT_FLAG_PARKING is set, and no kevents have been returned,
	 * we want to unbind the kqrequest from the thread.
	 *
	 * However, because the kq locks are dropped several times during process,
	 * new knotes may have fired again, in which case, we want to fail the end
	 * processing and process again, until it converges.
	 *
	 * If we have an error or returned events, end processing never fails.
	 */
	if (error) {
		flags &= ~KEVENT_FLAG_PARKING;
	}
	if (kq_type & KQ_WORKQ) {
		rc = kqworkq_end_processing(kqu.kqwq, kqr, flags);
	} else if (kq_type & KQ_WORKLOOP) {
		rc = kqworkloop_end_processing(kqu.kqwl, KQ_PROCESSING, flags);
	} else {
		rc = kqfile_end_processing(kqu.kqf);
	}

	if (__probable(error)) {
		return error;
	}

	if (__probable(rc >= 0)) {
		assert(rc == 0 || rc == EBADF);
		if (rc == 0) {
			if ((kq_type & KQ_WORKLOOP) && (flags & KEVENT_FLAG_PARKING) &&
			    kqwl_permanently_bound) {
				goto kqwl_bound_thread_park;
			}
		}
		return rc;
	}

	/* rc < 0: end processing failed (new events fired), converge by retrying */
	if (kq_type & (KQ_WORKQ | KQ_WORKLOOP)) {
		assert(flags & KEVENT_FLAG_PARKING);
		goto process_again;
	} else {
		goto kqfile_retry;
	}

kqwl_bound_thread_park:
#if DEVELOPMENT | DEBUG
	assert(current_thread() == kqr_thread_fast(kqr));
	assert(workq_thread_is_permanently_bound(current_uthread()));
#endif
	kqworkloop_bound_thread_park(kqu.kqwl, kqr_thread_fast(kqr));
	__builtin_unreachable();
}
7928
/*!
 * @function kqueue_scan_continue
 *
 * @brief
 * The continuation used by kqueue_scan for kevent entry points.
 *
 * @discussion
 * Assumes we inherit a use/ref count on the kq or its fileglob.
 *
 * This is called by kqueue_scan if neither KEVENT_FLAG_POLL nor
 * KEVENT_FLAG_KERNEL was set, and the caller had to wait.
 *
 * Translates the wakeup reason into either another scan or an error,
 * performs the usual cleanup, and returns to user-space directly via
 * unix_syscall_return() (this function does not return).
 */
OS_NORETURN OS_NOINLINE
static void
kqueue_scan_continue(void *data, wait_result_t wait_result)
{
	uthread_t ut = current_uthread();
	kevent_ctx_t kectx = &ut->uu_save.uus_kevent;
	int error = 0, flags = kectx->kec_process_flags;
	struct kqueue *kq = data;

	/*
	 * only kevent variants call in here, so we know the callback is
	 * kevent_legacy_callback or kevent_modern_callback.
	 */
	assert((flags & (KEVENT_FLAG_POLL | KEVENT_FLAG_KERNEL)) == 0);

	switch (wait_result) {
	case THREAD_AWAKENED:
		/* events may be available now: rescan with the right callback */
		if (__improbable(flags & (KEVENT_FLAG_LEGACY32 | KEVENT_FLAG_LEGACY64))) {
			error = kqueue_scan(kq, flags, kectx, kevent_legacy_callback);
		} else {
			error = kqueue_scan(kq, flags, kectx, kevent_modern_callback);
		}
		break;
	case THREAD_TIMED_OUT:
		/* user timeout expired: not an error, zero events */
		error = 0;
		break;
	case THREAD_INTERRUPTED:
		error = EINTR;
		break;
	case THREAD_RESTART:
		/* kqueue is draining/closing */
		error = EBADF;
		break;
	default:
		panic("%s: - invalid wait_result (%d)", __func__, wait_result);
	}


	error = kevent_cleanup(kq, flags, error, kectx);
	*(int32_t *)&ut->uu_rval = kectx->kec_process_noutputs;
	unix_syscall_return(error);
}
7982
/*!
 * @function kqueue_scan
 *
 * @brief
 * Scan and wait for events in a kqueue (used by poll & kevent).
 *
 * @discussion
 * Process the triggered events in a kqueue.
 *
 * If there are no events triggered arrange to wait for them:
 * - unless KEVENT_FLAG_IMMEDIATE is set in kectx->kec_process_flags
 * - possibly until kectx->kec_deadline expires
 *
 * When it waits, and that neither KEVENT_FLAG_POLL nor KEVENT_FLAG_KERNEL
 * are set, then it will wait in the kqueue_scan_continue continuation.
 *
 * poll() will block in place, and KEVENT_FLAG_KERNEL calls
 * all pass KEVENT_FLAG_IMMEDIATE and will not wait.
 *
 * @param kqu
 * The kqueue being scanned.
 *
 * @param flags
 * The KEVENT_FLAG_* flags for this call.
 *
 * @param kectx
 * The context used for this scan.
 * The uthread_t::uu_save.uus_kevent storage is used for this purpose.
 *
 * @param callback
 * The callback to be called on events sucessfully processed.
 * (Either kevent_legacy_callback, kevent_modern_callback or poll_callback)
 */
int
kqueue_scan(kqueue_t kqu, int flags, kevent_ctx_t kectx,
    kevent_callback_t callback)
{
	int error;

	for (;;) {
		kqlock(kqu);
		error = kqueue_process(kqu, flags, kectx, callback);

		/*
		 * If we got an error, events returned (EWOULDBLOCK)
		 * or blocking was disallowed (KEVENT_FLAG_IMMEDIATE),
		 * just return.
		 */
		if (__probable(error || (flags & KEVENT_FLAG_IMMEDIATE))) {
			kqunlock(kqu);
			return error == EWOULDBLOCK ? 0 : error;
		}

		/* only kqfiles block here: workq/workloop never reach this point */
		assert((kqu.kq->kq_state & (KQ_WORKQ | KQ_WORKLOOP)) == 0);

		/* arm the sleep before dropping the lock to avoid lost wakeups */
		kqu.kqf->kqf_state |= KQ_SLEEP;
		assert_wait_deadline(&kqu.kqf->kqf_count, THREAD_ABORTSAFE,
		    kectx->kec_deadline);
		kqunlock(kqu);

		if (__probable((flags & (KEVENT_FLAG_POLL | KEVENT_FLAG_KERNEL)) == 0)) {
			/* kevent variants resume in the continuation, not here */
			thread_block_parameter(kqueue_scan_continue, kqu.kqf);
			__builtin_unreachable();
		}

		/* poll(): block in place and translate the wakeup reason */
		wait_result_t wr = thread_block(THREAD_CONTINUE_NULL);
		switch (wr) {
		case THREAD_AWAKENED:
			break;
		case THREAD_TIMED_OUT:
			return 0;
		case THREAD_INTERRUPTED:
			return EINTR;
		case THREAD_RESTART:
			return EBADF;
		default:
			panic("%s: - bad wait_result (%d)", __func__, wr);
		}
	}
}
8063
8064 /*!
8065 * @function kevent_internal
8066 *
8067 * @brief
8068 * Common kevent code.
8069 *
8070 * @discussion
8071 * Needs to be inlined to specialize for legacy or modern and
8072 * eliminate dead code.
8073 *
8074 * This is the core logic of kevent entry points, that will:
8075 * - register kevents
8076 * - optionally scan the kqueue for events
8077 *
8078 * The caller is giving kevent_internal a reference on the kqueue
8079 * or its fileproc that needs to be cleaned up by kevent_cleanup().
8080 */
8081 OS_ALWAYS_INLINE
8082 static inline int
8083 kevent_internal(kqueue_t kqu,
8084 user_addr_t changelist, int nchanges,
8085 user_addr_t ueventlist, int nevents,
8086 int flags, kevent_ctx_t kectx, int32_t *retval,
8087 bool legacy)
8088 {
8089 int error = 0, noutputs = 0, register_rc;
8090
8091 /* only bound threads can receive events on workloops */
8092 if (!legacy && (flags & KEVENT_FLAG_WORKLOOP)) {
8093 #if CONFIG_WORKLOOP_DEBUG
8094 UU_KEVENT_HISTORY_WRITE_ENTRY(current_uthread(), {
8095 .uu_kqid = kqu.kqwl->kqwl_dynamicid,
8096 .uu_kq = error ? NULL : kqu.kq,
8097 .uu_error = error,
8098 .uu_nchanges = nchanges,
8099 .uu_nevents = nevents,
8100 .uu_flags = flags,
8101 });
8102 #endif // CONFIG_WORKLOOP_DEBUG
8103
8104 if (flags & KEVENT_FLAG_KERNEL) {
8105 /* see kevent_workq_internal */
8106 error = copyout(&kqu.kqwl->kqwl_dynamicid,
8107 ueventlist - sizeof(kqueue_id_t), sizeof(kqueue_id_t));
8108 kectx->kec_data_resid -= sizeof(kqueue_id_t);
8109 if (__improbable(error)) {
8110 goto out;
8111 }
8112 }
8113
8114 if (kevent_args_requesting_events(flags, nevents)) {
8115 /*
8116 * Disable the R2K notification while doing a register, if the
8117 * caller wants events too, we don't want the AST to be set if we
8118 * will process these events soon.
8119 */
8120 kqlock(kqu);
8121 kqu.kq->kq_state &= ~KQ_R2K_ARMED;
8122 kqunlock(kqu);
8123 flags |= KEVENT_FLAG_NEEDS_END_PROCESSING;
8124 }
8125 }
8126
8127 /* register all the change requests the user provided... */
8128 while (nchanges > 0 && error == 0) {
8129 struct kevent_qos_s kev;
8130 struct knote *kn = NULL;
8131
8132 if (legacy) {
8133 error = kevent_legacy_copyin(&changelist, &kev, flags);
8134 } else {
8135 error = kevent_modern_copyin(&changelist, &kev);
8136 }
8137 if (error) {
8138 break;
8139 }
8140
8141 register_rc = kevent_register(kqu.kq, &kev, &kn);
8142 if (__improbable(!legacy && (register_rc & FILTER_REGISTER_WAIT))) {
8143 thread_t thread = current_thread();
8144
8145 kqlock_held(kqu);
8146
8147 if (act_clear_astkevent(thread, AST_KEVENT_REDRIVE_THREADREQ)) {
8148 workq_kern_threadreq_redrive(kqu.kq->kq_p, WORKQ_THREADREQ_NONE);
8149 }
8150
8151 // f_post_register_wait is meant to call a continuation and not to
8152 // return, which is why we don't support FILTER_REGISTER_WAIT if
8153 // KEVENT_FLAG_ERROR_EVENTS is not passed, or if the event that
8154 // waits isn't the last.
8155 //
8156 // It is implementable, but not used by any userspace code at the
8157 // moment, so for now return ENOTSUP if someone tries to do it.
8158 if (nchanges == 1 && noutputs < nevents &&
8159 (flags & KEVENT_FLAG_KERNEL) == 0 &&
8160 (flags & KEVENT_FLAG_PARKING) == 0 &&
8161 (flags & KEVENT_FLAG_ERROR_EVENTS) &&
8162 (flags & KEVENT_FLAG_WORKLOOP)) {
8163 uthread_t ut = get_bsdthread_info(thread);
8164
8165 /*
8166 * store the continuation/completion data in the uthread
8167 *
8168 * Note: the kectx aliases with this,
8169 * and is destroyed in the process.
8170 */
8171 ut->uu_save.uus_kevent_register = (struct _kevent_register){
8172 .kev = kev,
8173 .kqwl = kqu.kqwl,
8174 .eventout = noutputs,
8175 .ueventlist = ueventlist,
8176 };
8177 knote_fops(kn)->f_post_register_wait(ut, kn,
8178 &ut->uu_save.uus_kevent_register);
8179 __builtin_unreachable();
8180 }
8181 kqunlock(kqu);
8182
8183 kev.flags |= EV_ERROR;
8184 kev.data = ENOTSUP;
8185 } else {
8186 assert((register_rc & FILTER_REGISTER_WAIT) == 0);
8187 }
8188
8189 // keep in sync with kevent_register_wait_return()
8190 if (noutputs < nevents && (kev.flags & (EV_ERROR | EV_RECEIPT))) {
8191 if ((kev.flags & EV_ERROR) == 0) {
8192 kev.flags |= EV_ERROR;
8193 kev.data = 0;
8194 }
8195 if (legacy) {
8196 error = kevent_legacy_copyout(&kev, &ueventlist, flags);
8197 } else {
8198 error = kevent_modern_copyout(&kev, &ueventlist);
8199 }
8200 if (error == 0) {
8201 noutputs++;
8202 }
8203 } else if (kev.flags & EV_ERROR) {
8204 error = (int)kev.data;
8205 }
8206 nchanges--;
8207 }
8208
8209 if ((flags & KEVENT_FLAG_ERROR_EVENTS) == 0 &&
8210 nevents > 0 && noutputs == 0 && error == 0) {
8211 kectx->kec_process_flags = flags;
8212 kectx->kec_process_nevents = nevents;
8213 kectx->kec_process_noutputs = 0;
8214 kectx->kec_process_eventlist = ueventlist;
8215
8216 if (legacy) {
8217 error = kqueue_scan(kqu.kq, flags, kectx, kevent_legacy_callback);
8218 } else {
8219 error = kqueue_scan(kqu.kq, flags, kectx, kevent_modern_callback);
8220 }
8221
8222 noutputs = kectx->kec_process_noutputs;
8223 } else if (!legacy && (flags & KEVENT_FLAG_NEEDS_END_PROCESSING)) {
8224 /*
8225 * If we didn't through kqworkloop_end_processing(),
8226 * we need to do it here.
8227 *
8228 * kqueue_scan will call kqworkloop_end_processing(),
8229 * so we only need to do it if we didn't scan.
8230 */
8231 kqlock(kqu);
8232 kqworkloop_end_processing(kqu.kqwl, 0, 0);
8233 kqunlock(kqu);
8234 }
8235
8236 *retval = noutputs;
8237 out:
8238 return kevent_cleanup(kqu.kq, flags, error, kectx);
8239 }
8240
8241 #pragma mark modern syscalls: kevent_qos, kevent_id, kevent_workq_internal
8242
8243 /*!
8244 * @function kevent_modern_internal
8245 *
8246 * @brief
8247 * The backend of the kevent_id and kevent_workq_internal entry points.
8248 *
8249 * @discussion
8250 * Needs to be inline due to the number of arguments.
8251 */
8252 OS_NOINLINE
8253 static int
8254 kevent_modern_internal(kqueue_t kqu,
8255 user_addr_t changelist, int nchanges,
8256 user_addr_t ueventlist, int nevents,
8257 int flags, kevent_ctx_t kectx, int32_t *retval)
8258 {
8259 return kevent_internal(kqu.kq, changelist, nchanges,
8260 ueventlist, nevents, flags, kectx, retval, /*legacy*/ false);
8261 }
8262
8263 /*!
8264 * @function kevent_id
8265 *
8266 * @brief
8267 * The kevent_id() syscall.
8268 */
8269 int
8270 kevent_id(struct proc *p, struct kevent_id_args *uap, int32_t *retval)
8271 {
8272 int error, flags = uap->flags & KEVENT_FLAG_USER;
8273 uthread_t uth = current_uthread();
8274 workq_threadreq_t kqr = uth->uu_kqr_bound;
8275 kevent_ctx_t kectx = &uth->uu_save.uus_kevent;
8276 kqueue_t kqu;
8277
8278 flags = kevent_adjust_flags_for_proc(p, flags);
8279 flags |= KEVENT_FLAG_DYNAMIC_KQUEUE;
8280
8281 if (__improbable((flags & (KEVENT_FLAG_WORKQ | KEVENT_FLAG_WORKLOOP)) !=
8282 KEVENT_FLAG_WORKLOOP)) {
8283 return EINVAL;
8284 }
8285
8286 error = kevent_get_data_size(flags, uap->data_available, uap->data_out, kectx);
8287 if (__improbable(error)) {
8288 return error;
8289 }
8290
8291 kectx->kec_deadline = 0;
8292 kectx->kec_fp = NULL;
8293 kectx->kec_fd = -1;
8294 /* the kec_process_* fields are filled if kqueue_scann is called only */
8295
8296 /*
8297 * Get the kq we are going to be working on
8298 * As a fastpath, look at the currently bound workloop.
8299 */
8300 kqu.kqwl = kqr ? kqr_kqworkloop(kqr) : NULL;
8301 if (kqu.kqwl && kqu.kqwl->kqwl_dynamicid == uap->id) {
8302 if (__improbable(flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST)) {
8303 return EEXIST;
8304 }
8305 kqworkloop_retain(kqu.kqwl);
8306 } else if (__improbable(kevent_args_requesting_events(flags, uap->nevents))) {
8307 return EXDEV;
8308 } else {
8309 error = kqworkloop_get_or_create(p, uap->id, NULL, NULL,
8310 flags, &kqu.kqwl);
8311 if (__improbable(error)) {
8312 return error;
8313 }
8314 }
8315
8316 return kevent_modern_internal(kqu, uap->changelist, uap->nchanges,
8317 uap->eventlist, uap->nevents, flags, kectx, retval);
8318 }
8319
8320 /**!
8321 * @function kevent_workq_internal
8322 *
8323 * @discussion
8324 * This function is exported for the sake of the workqueue subsystem.
8325 *
8326 * It is called in two ways:
8327 * - when a thread is about to go to userspace to ask for pending event
8328 * - when a thread is returning from userspace with events back
8329 *
8330 * the workqueue subsystem will only use the following flags:
8331 * - KEVENT_FLAG_STACK_DATA (always)
8332 * - KEVENT_FLAG_IMMEDIATE (always)
8333 * - KEVENT_FLAG_PARKING (depending on whether it is going to or returning from
8334 * userspace).
8335 *
8336 * It implicitly acts on the bound kqueue, and for the case of workloops
8337 * will copyout the kqueue ID before anything else.
8338 *
8339 *
8340 * Pthread will have setup the various arguments to fit this stack layout:
8341 *
8342 * +-------....----+--------------+-----------+--------------------+
8343 * | user stack | data avail | nevents | pthread_self() |
8344 * +-------....----+--------------+-----------+--------------------+
8345 * ^ ^
8346 * data_out eventlist
8347 *
8348 * When a workloop is used, the workloop ID is copied out right before
8349 * the eventlist and is taken from the data buffer.
8350 *
8351 * @warning
8352 * This function is carefuly tailored to not make any call except the final tail
8353 * call into kevent_modern_internal. (LTO inlines current_uthread()).
8354 *
8355 * This function is performance sensitive due to the workq subsystem.
8356 */
8357 int
8358 kevent_workq_internal(struct proc *p,
8359 user_addr_t changelist, int nchanges,
8360 user_addr_t eventlist, int nevents,
8361 user_addr_t data_out, user_size_t *data_available,
8362 unsigned int flags, int32_t *retval)
8363 {
8364 uthread_t uth = current_uthread();
8365 workq_threadreq_t kqr = uth->uu_kqr_bound;
8366 kevent_ctx_t kectx = &uth->uu_save.uus_kevent;
8367 kqueue_t kqu;
8368
8369 assert(flags == (KEVENT_FLAG_STACK_DATA | KEVENT_FLAG_IMMEDIATE) ||
8370 flags == (KEVENT_FLAG_STACK_DATA | KEVENT_FLAG_IMMEDIATE | KEVENT_FLAG_PARKING));
8371
8372 kectx->kec_data_out = data_out;
8373 kectx->kec_data_avail = (uint64_t)data_available;
8374 kectx->kec_data_size = *data_available;
8375 kectx->kec_data_resid = *data_available;
8376 kectx->kec_deadline = 0;
8377 kectx->kec_fp = NULL;
8378 kectx->kec_fd = -1;
8379 /* the kec_process_* fields are filled if kqueue_scann is called only */
8380
8381 flags = kevent_adjust_flags_for_proc(p, flags);
8382
8383 if (kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
8384 kqu.kqwl = __container_of(kqr, struct kqworkloop, kqwl_request);
8385 kqworkloop_retain(kqu.kqwl);
8386
8387 flags |= KEVENT_FLAG_WORKLOOP | KEVENT_FLAG_DYNAMIC_KQUEUE |
8388 KEVENT_FLAG_KERNEL;
8389 } else {
8390 kqu.kqwq = p->p_fd.fd_wqkqueue;
8391
8392 flags |= KEVENT_FLAG_WORKQ | KEVENT_FLAG_KERNEL;
8393 }
8394
8395 return kevent_modern_internal(kqu, changelist, nchanges,
8396 eventlist, nevents, flags, kectx, retval);
8397 }
8398
8399 /*!
8400 * @function kevent_qos
8401 *
8402 * @brief
8403 * The kevent_qos() syscall.
8404 */
8405 int
8406 kevent_qos(struct proc *p, struct kevent_qos_args *uap, int32_t *retval)
8407 {
8408 uthread_t uth = current_uthread();
8409 kevent_ctx_t kectx = &uth->uu_save.uus_kevent;
8410 int error, flags = uap->flags & KEVENT_FLAG_USER;
8411 struct kqueue *kq;
8412
8413 if (__improbable(flags & KEVENT_ID_FLAG_USER)) {
8414 return EINVAL;
8415 }
8416
8417 flags = kevent_adjust_flags_for_proc(p, flags);
8418
8419 error = kevent_get_data_size(flags, uap->data_available, uap->data_out, kectx);
8420 if (__improbable(error)) {
8421 return error;
8422 }
8423
8424 kectx->kec_deadline = 0;
8425 kectx->kec_fp = NULL;
8426 kectx->kec_fd = uap->fd;
8427 /* the kec_process_* fields are filled if kqueue_scann is called only */
8428
8429 /* get the kq we are going to be working on */
8430 if (__probable(flags & KEVENT_FLAG_WORKQ)) {
8431 error = kevent_get_kqwq(p, flags, uap->nevents, &kq);
8432 } else {
8433 error = kevent_get_kqfile(p, uap->fd, flags, &kectx->kec_fp, &kq);
8434 }
8435 if (__improbable(error)) {
8436 return error;
8437 }
8438
8439 return kevent_modern_internal(kq, uap->changelist, uap->nchanges,
8440 uap->eventlist, uap->nevents, flags, kectx, retval);
8441 }
8442
8443 #pragma mark legacy syscalls: kevent, kevent64
8444
8445 /*!
8446 * @function kevent_legacy_get_deadline
8447 *
8448 * @brief
8449 * Compute the deadline for the legacy kevent syscalls.
8450 *
8451 * @discussion
8452 * This is not necessary if KEVENT_FLAG_IMMEDIATE is specified,
8453 * as this takes precedence over the deadline.
8454 *
8455 * This function will fail if utimeout is USER_ADDR_NULL
8456 * (the caller should check).
8457 */
8458 static int
8459 kevent_legacy_get_deadline(int flags, user_addr_t utimeout, uint64_t *deadline)
8460 {
8461 struct timespec ts;
8462
8463 if (flags & KEVENT_FLAG_PROC64) {
8464 struct user64_timespec ts64;
8465 int error = copyin(utimeout, &ts64, sizeof(ts64));
8466 if (__improbable(error)) {
8467 return error;
8468 }
8469 ts.tv_sec = (unsigned long)ts64.tv_sec;
8470 ts.tv_nsec = (long)ts64.tv_nsec;
8471 } else {
8472 struct user32_timespec ts32;
8473 int error = copyin(utimeout, &ts32, sizeof(ts32));
8474 if (__improbable(error)) {
8475 return error;
8476 }
8477 ts.tv_sec = ts32.tv_sec;
8478 ts.tv_nsec = ts32.tv_nsec;
8479 }
8480 if (!timespec_is_valid(&ts)) {
8481 return EINVAL;
8482 }
8483
8484 clock_absolutetime_interval_to_deadline(tstoabstime(&ts), deadline);
8485 return 0;
8486 }
8487
8488 /*!
8489 * @function kevent_legacy_internal
8490 *
8491 * @brief
8492 * The core implementation for kevent and kevent64
8493 */
8494 OS_NOINLINE
8495 static int
8496 kevent_legacy_internal(struct proc *p, struct kevent64_args *uap,
8497 int32_t *retval, int flags)
8498 {
8499 uthread_t uth = current_uthread();
8500 kevent_ctx_t kectx = &uth->uu_save.uus_kevent;
8501 struct kqueue *kq;
8502 int error;
8503
8504 if (__improbable(uap->flags & KEVENT_ID_FLAG_USER)) {
8505 return EINVAL;
8506 }
8507
8508 flags = kevent_adjust_flags_for_proc(p, flags);
8509
8510 kectx->kec_data_out = 0;
8511 kectx->kec_data_avail = 0;
8512 kectx->kec_data_size = 0;
8513 kectx->kec_data_resid = 0;
8514 kectx->kec_deadline = 0;
8515 kectx->kec_fp = NULL;
8516 kectx->kec_fd = uap->fd;
8517 /* the kec_process_* fields are filled if kqueue_scann is called only */
8518
8519 /* convert timeout to absolute - if we have one (and not immediate) */
8520 if (__improbable(uap->timeout && !(flags & KEVENT_FLAG_IMMEDIATE))) {
8521 error = kevent_legacy_get_deadline(flags, uap->timeout,
8522 &kectx->kec_deadline);
8523 if (__improbable(error)) {
8524 return error;
8525 }
8526 }
8527
8528 /* get the kq we are going to be working on */
8529 if (flags & KEVENT_FLAG_WORKQ) {
8530 error = kevent_get_kqwq(p, flags, uap->nevents, &kq);
8531 } else {
8532 error = kevent_get_kqfile(p, uap->fd, flags, &kectx->kec_fp, &kq);
8533 }
8534 if (__improbable(error)) {
8535 return error;
8536 }
8537
8538 return kevent_internal(kq, uap->changelist, uap->nchanges,
8539 uap->eventlist, uap->nevents, flags, kectx, retval,
8540 /*legacy*/ true);
8541 }
8542
8543 /*!
8544 * @function kevent
8545 *
8546 * @brief
8547 * The legacy kevent() syscall.
8548 */
8549 int
8550 kevent(struct proc *p, struct kevent_args *uap, int32_t *retval)
8551 {
8552 struct kevent64_args args = {
8553 .fd = uap->fd,
8554 .changelist = uap->changelist,
8555 .nchanges = uap->nchanges,
8556 .eventlist = uap->eventlist,
8557 .nevents = uap->nevents,
8558 .timeout = uap->timeout,
8559 };
8560
8561 return kevent_legacy_internal(p, &args, retval, KEVENT_FLAG_LEGACY32);
8562 }
8563
8564 /*!
8565 * @function kevent64
8566 *
8567 * @brief
8568 * The legacy kevent64() syscall.
8569 */
8570 int
8571 kevent64(struct proc *p, struct kevent64_args *uap, int32_t *retval)
8572 {
8573 int flags = (uap->flags & KEVENT_FLAG_USER) | KEVENT_FLAG_LEGACY64;
8574 return kevent_legacy_internal(p, uap, retval, flags);
8575 }
8576
8577 #pragma mark - socket interface
8578
8579 #if SOCKETS
8580 #include <sys/param.h>
8581 #include <sys/socket.h>
8582 #include <sys/protosw.h>
8583 #include <sys/domain.h>
8584 #include <sys/mbuf.h>
8585 #include <sys/kern_event.h>
8586 #include <sys/malloc.h>
8587 #include <sys/sys_domain.h>
8588 #include <sys/syslog.h>
8589
8590 #ifndef ROUNDUP64
8591 #define ROUNDUP64(x) P2ROUNDUP((x), sizeof (u_int64_t))
8592 #endif
8593
8594 #ifndef ADVANCE64
8595 #define ADVANCE64(p, n) (void*)((char *)(p) + ROUNDUP64(n))
8596 #endif
8597
8598 static LCK_GRP_DECLARE(kev_lck_grp, "Kernel Event Protocol");
8599 static LCK_RW_DECLARE(kev_rwlock, &kev_lck_grp);
8600
8601 static int kev_attach(struct socket *so, int proto, struct proc *p);
8602 static int kev_detach(struct socket *so);
8603 static int kev_control(struct socket *so, u_long cmd, caddr_t data,
8604 struct ifnet *ifp, struct proc *p);
8605 static lck_mtx_t * event_getlock(struct socket *, int);
8606 static int event_lock(struct socket *, int, void *);
8607 static int event_unlock(struct socket *, int, void *);
8608
8609 static int event_sofreelastref(struct socket *);
8610 static void kev_delete(struct kern_event_pcb *);
8611
/*
 * User-request vector for the kernel event protocol; unset operations
 * fall back to the pr_usrreqs defaults.
 */
static struct pr_usrreqs event_usrreqs = {
	.pru_attach = kev_attach,
	.pru_control = kev_control,
	.pru_detach = kev_detach,
	.pru_soreceive = soreceive,
};
8618
/*
 * Protocol switch table for SYSPROTO_EVENT raw sockets, registered with
 * the system domain by kern_event_init().
 */
static struct protosw eventsw[] = {
	{
		.pr_type = SOCK_RAW,
		.pr_protocol = SYSPROTO_EVENT,
		.pr_flags = PR_ATOMIC,
		.pr_usrreqs = &event_usrreqs,
		.pr_lock = event_lock,
		.pr_unlock = event_unlock,
		.pr_getlock = event_getlock,
	}
};
8630
8631 __private_extern__ int kevt_getstat SYSCTL_HANDLER_ARGS;
8632 __private_extern__ int kevt_pcblist SYSCTL_HANDLER_ARGS;
8633
8634 SYSCTL_NODE(_net_systm, OID_AUTO, kevt,
8635 CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Kernel event family");
8636
8637 struct kevtstat kevtstat;
8638 SYSCTL_PROC(_net_systm_kevt, OID_AUTO, stats,
8639 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
8640 kevt_getstat, "S,kevtstat", "");
8641
8642 SYSCTL_PROC(_net_systm_kevt, OID_AUTO, pcblist,
8643 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
8644 kevt_pcblist, "S,xkevtpcb", "");
8645
8646 SYSCTL_UINT(_net_systm_kevt, OID_AUTO, pcbcount, CTLFLAG_RD | CTLFLAG_LOCKED,
8647 (unsigned int *)&kevtstat.kes_pcbcount, 0, "");
8648
/*
 * pr_getlock hook: return the per-PCB mutex of a kernel event socket.
 * Panics instead of returning NULL when the socket has no PCB or its use
 * count has gone negative, since either indicates state corruption.
 */
static lck_mtx_t *
event_getlock(struct socket *so, int flags)
{
#pragma unused(flags)
	struct kern_event_pcb *ev_pcb = (struct kern_event_pcb *)so->so_pcb;

	if (so->so_pcb != NULL) {
		if (so->so_usecount < 0) {
			panic("%s: so=%p usecount=%d lrh= %s", __func__,
			    so, so->so_usecount, solockhistory_nr(so));
		}
		/* NOTE(review): reached when so_usecount >= 0; the historical
		 * NOTREACHED annotation here applied to the panic above */
	} else {
		panic("%s: so=%p NULL NO so_pcb %s", __func__,
		    so, solockhistory_nr(so));
		/* NOTREACHED */
	}
	return &ev_pcb->evp_mtx;
}
8668
/*
 * pr_lock hook: take the per-PCB mutex, optionally adding a use-count
 * reference, and record the caller for lock-history debugging.
 *
 * @param refcount  when non-zero, also bump so_usecount.
 * @param lr        caller address for lock history; NULL means "use our
 *                  own return address".
 */
static int
event_lock(struct socket *so, int refcount, void *lr)
{
	void *lr_saved;

	if (lr == NULL) {
		lr_saved = __builtin_return_address(0);
	} else {
		lr_saved = lr;
	}

	if (so->so_pcb != NULL) {
		lck_mtx_lock(&((struct kern_event_pcb *)so->so_pcb)->evp_mtx);
	} else {
		panic("%s: so=%p NO PCB! lr=%p lrh= %s", __func__,
		    so, lr_saved, solockhistory_nr(so));
		/* NOTREACHED */
	}

	/* checked under the lock: a negative count means corruption */
	if (so->so_usecount < 0) {
		panic("%s: so=%p so_pcb=%p lr=%p ref=%d lrh= %s", __func__,
		    so, so->so_pcb, lr_saved, so->so_usecount,
		    solockhistory_nr(so));
		/* NOTREACHED */
	}

	if (refcount) {
		so->so_usecount++;
	}

	/* record the lock site in the circular lock-history buffer */
	so->lock_lr[so->next_lock_lr] = lr_saved;
	so->next_lock_lr = (so->next_lock_lr + 1) % SO_LCKDBG_MAX;
	return 0;
}
8703
/*
 * pr_unlock hook: drop a use-count reference (when refcount != 0) and
 * release the per-PCB mutex.  When the last reference goes away, the
 * socket is torn down via event_sofreelastref(), which releases the
 * mutex itself.
 */
static int
event_unlock(struct socket *so, int refcount, void *lr)
{
	void *lr_saved;
	lck_mtx_t *mutex_held;

	if (lr == NULL) {
		lr_saved = __builtin_return_address(0);
	} else {
		lr_saved = lr;
	}

	if (refcount) {
		so->so_usecount--;
	}
	if (so->so_usecount < 0) {
		panic("%s: so=%p usecount=%d lrh= %s", __func__,
		    so, so->so_usecount, solockhistory_nr(so));
		/* NOTREACHED */
	}
	if (so->so_pcb == NULL) {
		panic("%s: so=%p NO PCB usecount=%d lr=%p lrh= %s", __func__,
		    so, so->so_usecount, (void *)lr_saved,
		    solockhistory_nr(so));
		/* NOTREACHED */
	}
	mutex_held = (&((struct kern_event_pcb *)so->so_pcb)->evp_mtx);

	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
	/* record the unlock site in the circular lock-history buffer */
	so->unlock_lr[so->next_unlock_lr] = lr_saved;
	so->next_unlock_lr = (so->next_unlock_lr + 1) % SO_LCKDBG_MAX;

	if (so->so_usecount == 0) {
		/* last reference: PCB must already be marked for clearing */
		VERIFY(so->so_flags & SOF_PCBCLEARING);
		event_sofreelastref(so);
	} else {
		lck_mtx_unlock(mutex_held);
	}

	return 0;
}
8745
/*
 * Tear down a kernel event socket on its last reference: detach the PCB
 * from the socket, unlink it from the global PCB list, and free both.
 * Called with the PCB mutex held; releases it before taking kev_rwlock
 * to preserve lock ordering.
 */
static int
event_sofreelastref(struct socket *so)
{
	struct kern_event_pcb *ev_pcb = (struct kern_event_pcb *)so->so_pcb;

	LCK_MTX_ASSERT(&(ev_pcb->evp_mtx), LCK_MTX_ASSERT_OWNED);

	so->so_pcb = NULL;

	/*
	 * Disable upcall in the event another thread is in kev_post_msg()
	 * appending record to the receive socket buffer, since sbwakeup()
	 * may release the socket lock otherwise.
	 */
	so->so_rcv.sb_flags &= ~SB_UPCALL;
	so->so_snd.sb_flags &= ~SB_UPCALL;
	so->so_event = sonullevent;
	lck_mtx_unlock(&(ev_pcb->evp_mtx));

	LCK_MTX_ASSERT(&(ev_pcb->evp_mtx), LCK_MTX_ASSERT_NOTOWNED);
	/* unlink from the global list and update the statistics counters */
	lck_rw_lock_exclusive(&kev_rwlock);
	LIST_REMOVE(ev_pcb, evp_link);
	kevtstat.kes_pcbcount--;
	kevtstat.kes_gencnt++;
	lck_rw_done(&kev_rwlock);
	kev_delete(ev_pcb);

	sofreelastref(so, 1);
	return 0;
}
8776
8777 static int event_proto_count = (sizeof(eventsw) / sizeof(struct protosw));
8778
8779 static
8780 struct kern_event_head kern_event_head;
8781
8782 static u_int32_t static_event_id = 0;
8783
8784 static KALLOC_TYPE_DEFINE(ev_pcb_zone, struct kern_event_pcb, NET_KT_DEFAULT);
8785
8786 /*
8787 * Install the protosw's for the NKE manager. Invoked at extension load time
8788 */
8789 void
8790 kern_event_init(struct domain *dp)
8791 {
8792 struct protosw *pr;
8793 int i;
8794
8795 VERIFY(!(dp->dom_flags & DOM_INITIALIZED));
8796 VERIFY(dp == systemdomain);
8797
8798 for (i = 0, pr = &eventsw[0]; i < event_proto_count; i++, pr++) {
8799 net_add_proto(pr, dp, 1);
8800 }
8801 }
8802
/*
 * pru_attach hook: reserve buffer space, allocate and initialize a
 * kern_event_pcb for the socket, and link it into the global PCB list.
 * A new socket starts with the vendor filter wide open (0xffffffff,
 * i.e. KEV_ANY_VENDOR semantics are set later via SIOCSKEVFILT).
 */
static int
kev_attach(struct socket *so, __unused int proto, __unused struct proc *p)
{
	int error = 0;
	struct kern_event_pcb *ev_pcb;

	error = soreserve(so, KEV_SNDSPACE, KEV_RECVSPACE);
	if (error != 0) {
		return error;
	}

	/* Z_WAITOK | Z_ZERO: may block, returns zero-filled memory */
	ev_pcb = zalloc_flags(ev_pcb_zone, Z_WAITOK | Z_ZERO);
	lck_mtx_init(&ev_pcb->evp_mtx, &kev_lck_grp, LCK_ATTR_NULL);

	ev_pcb->evp_socket = so;
	ev_pcb->evp_vendor_code_filter = 0xffffffff;

	so->so_pcb = (caddr_t) ev_pcb;
	/* publish the new PCB and bump the stats under the writer lock */
	lck_rw_lock_exclusive(&kev_rwlock);
	LIST_INSERT_HEAD(&kern_event_head, ev_pcb, evp_link);
	kevtstat.kes_pcbcount++;
	kevtstat.kes_gencnt++;
	lck_rw_done(&kev_rwlock);

	return error;
}
8829
/*
 * Free a kern_event_pcb: destroy its mutex then return it to the zone.
 * Caller must have already unlinked it from the global list.
 */
static void
kev_delete(struct kern_event_pcb *ev_pcb)
{
	VERIFY(ev_pcb != NULL);
	lck_mtx_destroy(&ev_pcb->evp_mtx, &kev_lck_grp);
	zfree(ev_pcb_zone, ev_pcb);
}
8837
8838 static int
8839 kev_detach(struct socket *so)
8840 {
8841 struct kern_event_pcb *ev_pcb = (struct kern_event_pcb *) so->so_pcb;
8842
8843 if (ev_pcb != NULL) {
8844 soisdisconnected(so);
8845 so->so_flags |= SOF_PCBCLEARING;
8846 }
8847
8848 return 0;
8849 }
8850
8851 /*
8852 * For now, kev_vendor_code and mbuf_tags use the same
8853 * mechanism.
8854 */
8855 errno_t
8856 kev_vendor_code_find(
8857 const char *string,
8858 u_int32_t *out_vendor_code)
8859 {
8860 if (strlen(string) >= KEV_VENDOR_CODE_MAX_STR_LEN) {
8861 return EINVAL;
8862 }
8863 return net_str_id_find_internal(string, out_vendor_code,
8864 NSI_VENDOR_CODE, 1);
8865 }
8866
8867 errno_t
8868 kev_msg_post(struct kev_msg *event_msg)
8869 {
8870 mbuf_tag_id_t min_vendor, max_vendor;
8871
8872 net_str_id_first_last(&min_vendor, &max_vendor, NSI_VENDOR_CODE);
8873
8874 if (event_msg == NULL) {
8875 return EINVAL;
8876 }
8877
8878 /*
8879 * Limit third parties to posting events for registered vendor codes
8880 * only
8881 */
8882 if (event_msg->vendor_code < min_vendor ||
8883 event_msg->vendor_code > max_vendor) {
8884 os_atomic_inc(&kevtstat.kes_badvendor, relaxed);
8885 return EINVAL;
8886 }
8887 return kev_post_msg(event_msg);
8888 }
8889
/*
 * Build a single-mbuf record from event_msg and broadcast a copy of it
 * to every kernel event socket whose vendor/class/subclass filters
 * match.  The whole message (header + up to 5 data vectors) must fit in
 * one mbuf without a cluster (<= MLEN).
 *
 * @param wait  M_WAIT or M_NOWAIT, forwarded to mbuf allocation.
 * @return 0, or EMSGSIZE / ENOMEM on failure.
 */
static int
kev_post_msg_internal(struct kev_msg *event_msg, int wait)
{
	struct mbuf *m, *m2;
	struct kern_event_pcb *ev_pcb;
	struct kern_event_msg *ev;
	char *tmp;
	u_int32_t total_size;
	int i;

#if SKYWALK && defined(XNU_TARGET_OS_OSX)
	/*
	 * Special hook for ALF state updates
	 */
	if (event_msg->vendor_code == KEV_VENDOR_APPLE &&
	    event_msg->kev_class == KEV_NKE_CLASS &&
	    event_msg->kev_subclass == KEV_NKE_ALF_SUBCLASS &&
	    event_msg->event_code == KEV_NKE_ALF_STATE_CHANGED) {
#if MACH_ASSERT
		os_log_info(OS_LOG_DEFAULT, "KEV_NKE_ALF_STATE_CHANGED posted");
#endif /* MACH_ASSERT */
		net_filter_event_mark(NET_FILTER_EVENT_ALF,
		    net_check_compatible_alf());
	}
#endif /* SKYWALK && XNU_TARGET_OS_OSX */

	/* Verify the message is small enough to fit in one mbuf w/o cluster */
	total_size = KEV_MSG_HEADER_SIZE;

	for (i = 0; i < 5; i++) {
		/* a zero-length vector terminates the list */
		if (event_msg->dv[i].data_length == 0) {
			break;
		}
		total_size += event_msg->dv[i].data_length;
	}

	if (total_size > MLEN) {
		os_atomic_inc(&kevtstat.kes_toobig, relaxed);
		return EMSGSIZE;
	}

	m = m_get(wait, MT_DATA);
	if (m == 0) {
		os_atomic_inc(&kevtstat.kes_nomem, relaxed);
		return ENOMEM;
	}
	ev = mtod(m, struct kern_event_msg *);
	total_size = KEV_MSG_HEADER_SIZE;

	/* copy the data vectors back-to-back after the header */
	tmp = (char *) &ev->event_data[0];
	for (i = 0; i < 5; i++) {
		if (event_msg->dv[i].data_length == 0) {
			break;
		}

		total_size += event_msg->dv[i].data_length;
		bcopy(event_msg->dv[i].data_ptr, tmp,
		    event_msg->dv[i].data_length);
		tmp += event_msg->dv[i].data_length;
	}

	/* NOTE(review): static_event_id is bumped without atomics; concurrent
	 * posters could observe duplicate ids — confirm whether callers
	 * serialize, or whether ids only need to be roughly monotonic */
	ev->id = ++static_event_id;
	ev->total_size = total_size;
	ev->vendor_code = event_msg->vendor_code;
	ev->kev_class = event_msg->kev_class;
	ev->kev_subclass = event_msg->kev_subclass;
	ev->event_code = event_msg->event_code;

	m->m_len = total_size;
	/* walk every event PCB under the reader lock, filtering as we go */
	lck_rw_lock_shared(&kev_rwlock);
	for (ev_pcb = LIST_FIRST(&kern_event_head);
	    ev_pcb;
	    ev_pcb = LIST_NEXT(ev_pcb, evp_link)) {
		lck_mtx_lock(&ev_pcb->evp_mtx);
		/* skip sockets already being torn down */
		if (ev_pcb->evp_socket->so_pcb == NULL) {
			lck_mtx_unlock(&ev_pcb->evp_mtx);
			continue;
		}
		/* filters nest: vendor, then class, then subclass */
		if (ev_pcb->evp_vendor_code_filter != KEV_ANY_VENDOR) {
			if (ev_pcb->evp_vendor_code_filter != ev->vendor_code) {
				lck_mtx_unlock(&ev_pcb->evp_mtx);
				continue;
			}

			if (ev_pcb->evp_class_filter != KEV_ANY_CLASS) {
				if (ev_pcb->evp_class_filter != ev->kev_class) {
					lck_mtx_unlock(&ev_pcb->evp_mtx);
					continue;
				}

				if ((ev_pcb->evp_subclass_filter !=
				    KEV_ANY_SUBCLASS) &&
				    (ev_pcb->evp_subclass_filter !=
				    ev->kev_subclass)) {
					lck_mtx_unlock(&ev_pcb->evp_mtx);
					continue;
				}
			}
		}

		/* each matching socket gets its own copy of the record */
		m2 = m_copym(m, 0, m->m_len, wait);
		if (m2 == 0) {
			os_atomic_inc(&kevtstat.kes_nomem, relaxed);
			m_free(m);
			lck_mtx_unlock(&ev_pcb->evp_mtx);
			lck_rw_done(&kev_rwlock);
			return ENOMEM;
		}
		if (sbappendrecord(&ev_pcb->evp_socket->so_rcv, m2)) {
			/*
			 * We use "m" for the socket stats as it would be
			 * unsafe to use "m2"
			 */
			so_inc_recv_data_stat(ev_pcb->evp_socket,
			    1, m->m_len);

			sorwakeup(ev_pcb->evp_socket);
			os_atomic_inc(&kevtstat.kes_posted, relaxed);
		} else {
			/* receive buffer full: drop for this socket only */
			os_atomic_inc(&kevtstat.kes_fullsock, relaxed);
		}
		lck_mtx_unlock(&ev_pcb->evp_mtx);
	}
	/* free the template mbuf; each socket received its own copy */
	m_free(m);
	lck_rw_done(&kev_rwlock);

	return 0;
}
9018
/* Post a kernel event, blocking for mbuf allocation if necessary. */
int
kev_post_msg(struct kev_msg *event_msg)
{
	return kev_post_msg_internal(event_msg, M_WAIT);
}
9024
/* Post a kernel event without blocking; may fail with ENOMEM. */
int
kev_post_msg_nowait(struct kev_msg *event_msg)
{
	return kev_post_msg_internal(event_msg, M_NOWAIT);
}
9030
/*
 * pru_control hook: implement the kernel-event socket ioctls.
 *
 * SIOCGKEVID     - read the last event id handed out.
 * SIOCSKEVFILT   - set the vendor/class/subclass delivery filters.
 * SIOCGKEVFILT   - read back the current filters.
 * SIOCGKEVVENDOR - translate a vendor string to its code (no create).
 *
 * NOTE(review): the filter cases dereference so->so_pcb without a NULL
 * check; presumably the socket layer guarantees an attached PCB here —
 * confirm before relying on it.
 */
static int
kev_control(struct socket *so,
    u_long cmd,
    caddr_t data,
    __unused struct ifnet *ifp,
    __unused struct proc *p)
{
	struct kev_request *kev_req = (struct kev_request *) data;
	struct kern_event_pcb *ev_pcb;
	struct kev_vendor_code *kev_vendor;
	u_int32_t *id_value = (u_int32_t *) data;

	switch (cmd) {
	case SIOCGKEVID:
		*id_value = static_event_id;
		break;
	case SIOCSKEVFILT:
		ev_pcb = (struct kern_event_pcb *) so->so_pcb;
		ev_pcb->evp_vendor_code_filter = kev_req->vendor_code;
		ev_pcb->evp_class_filter = kev_req->kev_class;
		ev_pcb->evp_subclass_filter = kev_req->kev_subclass;
		break;
	case SIOCGKEVFILT:
		ev_pcb = (struct kern_event_pcb *) so->so_pcb;
		kev_req->vendor_code = ev_pcb->evp_vendor_code_filter;
		kev_req->kev_class = ev_pcb->evp_class_filter;
		kev_req->kev_subclass = ev_pcb->evp_subclass_filter;
		break;
	case SIOCGKEVVENDOR:
		kev_vendor = (struct kev_vendor_code *)data;
		/* Make sure string is NULL terminated */
		kev_vendor->vendor_string[KEV_VENDOR_CODE_MAX_STR_LEN - 1] = 0;
		/* final argument 0: look up only, do not create */
		return net_str_id_find_internal(kev_vendor->vendor_string,
		           &kev_vendor->vendor_code, NSI_VENDOR_CODE, 0);
	default:
		return ENOTSUP;
	}

	return 0;
}
9071
/*
 * sysctl handler for net.systm.kevt.stats: copy out the kevtstat
 * counters.  Read-only; a size probe (NULL oldptr) just reports the
 * required buffer length.
 */
int
kevt_getstat SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	int error = 0;

	lck_rw_lock_shared(&kev_rwlock);

	/* writes are not supported */
	if (req->newptr != USER_ADDR_NULL) {
		error = EPERM;
		goto done;
	}
	/* size probe: report how much space the caller needs */
	if (req->oldptr == USER_ADDR_NULL) {
		req->oldidx = sizeof(struct kevtstat);
		goto done;
	}

	error = SYSCTL_OUT(req, &kevtstat,
	    MIN(sizeof(struct kevtstat), req->oldlen));
done:
	lck_rw_done(&kev_rwlock);

	return error;
}
9096
9097 __private_extern__ int
9098 kevt_pcblist SYSCTL_HANDLER_ARGS
9099 {
9100 #pragma unused(oidp, arg1, arg2)
9101 int error = 0;
9102 uint64_t n, i;
9103 struct xsystmgen xsg;
9104 void *buf = NULL;
9105 size_t item_size = ROUNDUP64(sizeof(struct xkevtpcb)) +
9106 ROUNDUP64(sizeof(struct xsocket_n)) +
9107 2 * ROUNDUP64(sizeof(struct xsockbuf_n)) +
9108 ROUNDUP64(sizeof(struct xsockstat_n));
9109 struct kern_event_pcb *ev_pcb;
9110
9111 buf = kalloc_data(item_size, Z_WAITOK_ZERO_NOFAIL);
9112
9113 lck_rw_lock_shared(&kev_rwlock);
9114
9115 n = kevtstat.kes_pcbcount;
9116
9117 if (req->oldptr == USER_ADDR_NULL) {
9118 req->oldidx = (size_t) ((n + n / 8) * item_size);
9119 goto done;
9120 }
9121 if (req->newptr != USER_ADDR_NULL) {
9122 error = EPERM;
9123 goto done;
9124 }
9125 bzero(&xsg, sizeof(xsg));
9126 xsg.xg_len = sizeof(xsg);
9127 xsg.xg_count = n;
9128 xsg.xg_gen = kevtstat.kes_gencnt;
9129 xsg.xg_sogen = so_gencnt;
9130 error = SYSCTL_OUT(req, &xsg, sizeof(xsg));
9131 if (error) {
9132 goto done;
9133 }
9134 /*
9135 * We are done if there is no pcb
9136 */
9137 if (n == 0) {
9138 goto done;
9139 }
9140
9141 i = 0;
9142 for (i = 0, ev_pcb = LIST_FIRST(&kern_event_head);
9143 i < n && ev_pcb != NULL;
9144 i++, ev_pcb = LIST_NEXT(ev_pcb, evp_link)) {
9145 struct xkevtpcb *xk = (struct xkevtpcb *)buf;
9146 struct xsocket_n *xso = (struct xsocket_n *)
9147 ADVANCE64(xk, sizeof(*xk));
9148 struct xsockbuf_n *xsbrcv = (struct xsockbuf_n *)
9149 ADVANCE64(xso, sizeof(*xso));
9150 struct xsockbuf_n *xsbsnd = (struct xsockbuf_n *)
9151 ADVANCE64(xsbrcv, sizeof(*xsbrcv));
9152 struct xsockstat_n *xsostats = (struct xsockstat_n *)
9153 ADVANCE64(xsbsnd, sizeof(*xsbsnd));
9154
9155 bzero(buf, item_size);
9156
9157 lck_mtx_lock(&ev_pcb->evp_mtx);
9158
9159 xk->kep_len = sizeof(struct xkevtpcb);
9160 xk->kep_kind = XSO_EVT;
9161 xk->kep_evtpcb = (uint64_t)VM_KERNEL_ADDRHASH(ev_pcb);
9162 xk->kep_vendor_code_filter = ev_pcb->evp_vendor_code_filter;
9163 xk->kep_class_filter = ev_pcb->evp_class_filter;
9164 xk->kep_subclass_filter = ev_pcb->evp_subclass_filter;
9165
9166 sotoxsocket_n(ev_pcb->evp_socket, xso);
9167 sbtoxsockbuf_n(ev_pcb->evp_socket ?
9168 &ev_pcb->evp_socket->so_rcv : NULL, xsbrcv);
9169 sbtoxsockbuf_n(ev_pcb->evp_socket ?
9170 &ev_pcb->evp_socket->so_snd : NULL, xsbsnd);
9171 sbtoxsockstat_n(ev_pcb->evp_socket, xsostats);
9172
9173 lck_mtx_unlock(&ev_pcb->evp_mtx);
9174
9175 error = SYSCTL_OUT(req, buf, item_size);
9176 }
9177
9178 if (error == 0) {
9179 /*
9180 * Give the user an updated idea of our state.
9181 * If the generation differs from what we told
9182 * her before, she knows that something happened
9183 * while we were processing this request, and it
9184 * might be necessary to retry.
9185 */
9186 bzero(&xsg, sizeof(xsg));
9187 xsg.xg_len = sizeof(xsg);
9188 xsg.xg_count = n;
9189 xsg.xg_gen = kevtstat.kes_gencnt;
9190 xsg.xg_sogen = so_gencnt;
9191 error = SYSCTL_OUT(req, &xsg, sizeof(xsg));
9192 if (error) {
9193 goto done;
9194 }
9195 }
9196
9197 done:
9198 lck_rw_done(&kev_rwlock);
9199
9200 kfree_data(buf, item_size);
9201 return error;
9202 }
9203
9204 #endif /* SOCKETS */
9205
9206
/*
 * Fill a kqueue_info snapshot for libproc: event count, per-event size,
 * dynamic id (workloops only), and the state bits exported as
 * PROC_KQUEUE_* values.
 */
int
fill_kqueueinfo(kqueue_t kqu, struct kqueue_info * kinfo)
{
	struct vinfo_stat * st;

	st = &kinfo->kq_stat;

	st->vst_size = kqu.kq->kq_count;
	/* report the event record size the kqueue was created with */
	if (kqu.kq->kq_state & KQ_KEV_QOS) {
		st->vst_blksize = sizeof(struct kevent_qos_s);
	} else if (kqu.kq->kq_state & KQ_KEV64) {
		st->vst_blksize = sizeof(struct kevent64_s);
	} else {
		st->vst_blksize = sizeof(struct kevent);
	}
	st->vst_mode = S_IFIFO;
	/* KQ_DYNAMIC implies the union member is a workloop */
	st->vst_ino = (kqu.kq->kq_state & KQ_DYNAMIC) ?
	    kqu.kqwl->kqwl_dynamicid : 0;

	/* flags exported to libproc as PROC_KQUEUE_* (sys/proc_info.h) */
#define PROC_KQUEUE_MASK (KQ_SLEEP|KQ_KEV32|KQ_KEV64|KQ_KEV_QOS|KQ_WORKQ|KQ_WORKLOOP)
	/* the export below relies on the kernel and libproc bits matching */
	static_assert(PROC_KQUEUE_SLEEP == KQ_SLEEP);
	static_assert(PROC_KQUEUE_32 == KQ_KEV32);
	static_assert(PROC_KQUEUE_64 == KQ_KEV64);
	static_assert(PROC_KQUEUE_QOS == KQ_KEV_QOS);
	static_assert(PROC_KQUEUE_WORKQ == KQ_WORKQ);
	static_assert(PROC_KQUEUE_WORKLOOP == KQ_WORKLOOP);
	kinfo->kq_state = kqu.kq->kq_state & PROC_KQUEUE_MASK;
	if ((kqu.kq->kq_state & (KQ_WORKLOOP | KQ_WORKQ)) == 0) {
		/* plain kqueue file: also report whether it is in a select set */
		if (kqu.kqf->kqf_sel.si_flags & SI_RECORDED) {
			kinfo->kq_state |= PROC_KQUEUE_SELECT;
		}
	}

	return 0;
}
9243
/*
 * Fill a kqueue_dyninfo snapshot for a workloop: the generic kqueue info
 * plus servicer/owner thread ids, thread-request state, QoS values, and
 * the creation-time scheduling parameters (priority/policy/cpupercent).
 * Returns EINVAL if the kqueue is not a workloop.
 */
static int
fill_kqueue_dyninfo(struct kqworkloop *kqwl, struct kqueue_dyninfo *kqdi)
{
	workq_threadreq_t kqr = &kqwl->kqwl_request;
	workq_threadreq_param_t trp = {};
	int err;

	if ((kqwl->kqwl_state & KQ_WORKLOOP) == 0) {
		return EINVAL;
	}

	if ((err = fill_kqueueinfo(&kqwl->kqwl_kqueue, &kqdi->kqdi_info))) {
		return err;
	}

	/* snapshot the mutable fields under the kq lock */
	kqlock(kqwl);

	kqdi->kqdi_servicer = thread_tid(kqr_thread(kqr));
	kqdi->kqdi_owner = thread_tid(kqwl->kqwl_owner);
	kqdi->kqdi_request_state = kqr->tr_state;
	kqdi->kqdi_async_qos = kqr->tr_kq_qos_index;
	kqdi->kqdi_events_qos = kqr->tr_kq_override_index;
	kqdi->kqdi_sync_waiters = 0;
	kqdi->kqdi_sync_waiter_qos = 0;

	/* decode the creation-time thread request parameters */
	trp.trp_value = kqwl->kqwl_params;
	if (trp.trp_flags & TRP_PRIORITY) {
		kqdi->kqdi_pri = trp.trp_pri;
	} else {
		kqdi->kqdi_pri = 0;
	}

	if (trp.trp_flags & TRP_POLICY) {
		kqdi->kqdi_pol = trp.trp_pol;
	} else {
		kqdi->kqdi_pol = 0;
	}

	if (trp.trp_flags & TRP_CPUPERCENT) {
		kqdi->kqdi_cpupercent = trp.trp_cpupercent;
	} else {
		kqdi->kqdi_cpupercent = 0;
	}

	kqunlock(kqwl);

	return 0;
}
9292
9293
9294 static unsigned long
9295 kevent_extinfo_emit(struct kqueue *kq, struct knote *kn, struct kevent_extinfo *buf,
9296 unsigned long buflen, unsigned long nknotes)
9297 {
9298 for (; kn; kn = SLIST_NEXT(kn, kn_link)) {
9299 if (kq == knote_get_kq(kn)) {
9300 if (nknotes < buflen) {
9301 struct kevent_extinfo *info = &buf[nknotes];
9302
9303 kqlock(kq);
9304
9305 if (knote_fops(kn)->f_sanitized_copyout) {
9306 knote_fops(kn)->f_sanitized_copyout(kn, &info->kqext_kev);
9307 } else {
9308 info->kqext_kev = *(struct kevent_qos_s *)&kn->kn_kevent;
9309 }
9310
9311 if (knote_has_qos(kn)) {
9312 info->kqext_kev.qos =
9313 _pthread_priority_thread_qos_fast(kn->kn_qos);
9314 } else {
9315 info->kqext_kev.qos = kn->kn_qos_override;
9316 }
9317 info->kqext_kev.filter |= 0xff00; /* sign extend filter */
9318 info->kqext_kev.xflags = 0; /* this is where sfflags lives */
9319 info->kqext_kev.data = 0; /* this is where sdata lives */
9320 info->kqext_sdata = kn->kn_sdata;
9321 info->kqext_status = kn->kn_status;
9322 info->kqext_sfflags = kn->kn_sfflags;
9323
9324 kqunlock(kq);
9325 }
9326
9327 /* we return total number of knotes, which may be more than requested */
9328 nknotes++;
9329 }
9330 }
9331
9332 return nknotes;
9333 }
9334
9335 int
9336 kevent_copyout_proc_dynkqids(void *proc, user_addr_t ubuf, uint32_t ubufsize,
9337 int32_t *nkqueues_out)
9338 {
9339 proc_t p = (proc_t)proc;
9340 struct filedesc *fdp = &p->p_fd;
9341 unsigned int nkqueues = 0;
9342 unsigned long ubuflen = ubufsize / sizeof(kqueue_id_t);
9343 size_t buflen, bufsize;
9344 kqueue_id_t *kq_ids = NULL;
9345 int err = 0;
9346
9347 assert(p != NULL);
9348
9349 if (ubuf == USER_ADDR_NULL && ubufsize != 0) {
9350 err = EINVAL;
9351 goto out;
9352 }
9353
9354 buflen = MIN(ubuflen, PROC_PIDDYNKQUEUES_MAX);
9355
9356 if (ubuflen != 0) {
9357 if (os_mul_overflow(sizeof(kqueue_id_t), buflen, &bufsize)) {
9358 err = ERANGE;
9359 goto out;
9360 }
9361 kq_ids = (kqueue_id_t *)kalloc_data(bufsize, Z_WAITOK | Z_ZERO);
9362 if (!kq_ids) {
9363 err = ENOMEM;
9364 goto out;
9365 }
9366 }
9367
9368 kqhash_lock(fdp);
9369
9370 u_long kqhashmask = fdp->fd_kqhashmask;
9371 if (kqhashmask > 0) {
9372 for (uint32_t i = 0; i < kqhashmask + 1; i++) {
9373 struct kqworkloop *kqwl;
9374
9375 LIST_FOREACH(kqwl, &fdp->fd_kqhash[i], kqwl_hashlink) {
9376 /* report the number of kqueues, even if they don't all fit */
9377 if (nkqueues < buflen) {
9378 kq_ids[nkqueues] = kqwl->kqwl_dynamicid;
9379 }
9380 nkqueues++;
9381 }
9382
9383 /*
9384 * Drop the kqhash lock and take it again to give some breathing room
9385 */
9386 kqhash_unlock(fdp);
9387 kqhash_lock(fdp);
9388
9389 /*
9390 * Reevaluate to see if we have raced with someone who changed this -
9391 * if we have, we should bail out with the set of info captured so far
9392 */
9393 if (fdp->fd_kqhashmask != kqhashmask) {
9394 break;
9395 }
9396 }
9397 }
9398
9399 kqhash_unlock(fdp);
9400
9401 if (kq_ids) {
9402 size_t copysize;
9403 if (os_mul_overflow(sizeof(kqueue_id_t), MIN(buflen, nkqueues), ©size)) {
9404 err = ERANGE;
9405 goto out;
9406 }
9407
9408 assert(ubufsize >= copysize);
9409 err = copyout(kq_ids, ubuf, copysize);
9410 }
9411
9412 out:
9413 if (kq_ids) {
9414 kfree_data(kq_ids, bufsize);
9415 }
9416
9417 if (!err) {
9418 *nkqueues_out = (int)min(nkqueues, PROC_PIDDYNKQUEUES_MAX);
9419 }
9420 return err;
9421 }
9422
9423 int
9424 kevent_copyout_dynkqinfo(void *proc, kqueue_id_t kq_id, user_addr_t ubuf,
9425 uint32_t ubufsize, int32_t *size_out)
9426 {
9427 proc_t p = (proc_t)proc;
9428 struct kqworkloop *kqwl;
9429 int err = 0;
9430 struct kqueue_dyninfo kqdi = { };
9431
9432 assert(p != NULL);
9433
9434 if (ubufsize < sizeof(struct kqueue_info)) {
9435 return ENOBUFS;
9436 }
9437
9438 kqwl = kqworkloop_hash_lookup_and_retain(&p->p_fd, kq_id);
9439 if (!kqwl) {
9440 return ESRCH;
9441 }
9442
9443 /*
9444 * backward compatibility: allow the argument to this call to only be
9445 * a struct kqueue_info
9446 */
9447 if (ubufsize >= sizeof(struct kqueue_dyninfo)) {
9448 ubufsize = sizeof(struct kqueue_dyninfo);
9449 err = fill_kqueue_dyninfo(kqwl, &kqdi);
9450 } else {
9451 ubufsize = sizeof(struct kqueue_info);
9452 err = fill_kqueueinfo(&kqwl->kqwl_kqueue, &kqdi.kqdi_info);
9453 }
9454 if (err == 0 && (err = copyout(&kqdi, ubuf, ubufsize)) == 0) {
9455 *size_out = ubufsize;
9456 }
9457 kqworkloop_release(kqwl);
9458 return err;
9459 }
9460
9461 int
9462 kevent_copyout_dynkqextinfo(void *proc, kqueue_id_t kq_id, user_addr_t ubuf,
9463 uint32_t ubufsize, int32_t *nknotes_out)
9464 {
9465 proc_t p = (proc_t)proc;
9466 struct kqworkloop *kqwl;
9467 int err;
9468
9469 kqwl = kqworkloop_hash_lookup_and_retain(&p->p_fd, kq_id);
9470 if (!kqwl) {
9471 return ESRCH;
9472 }
9473
9474 err = pid_kqueue_extinfo(p, &kqwl->kqwl_kqueue, ubuf, ubufsize, nknotes_out);
9475 kqworkloop_release(kqwl);
9476 return err;
9477 }
9478
/*
 * Copy out a struct kevent_extinfo record for every knote of process `p`
 * that is attached to kqueue `kq`, into the user buffer `ubuf`.
 *
 * Walks both the fd-indexed knote list (fd_knlist) and the knote hash
 * (fd_knhash), dropping and retaking the relevant lock between buckets so
 * long tables don't hold the lock too long; if a table is resized while the
 * lock is dropped, the walk either follows the grown list or bails out with
 * what has been captured so far.
 *
 * On success *retval is the total number of matching knotes found (capped at
 * PROC_PIDFDKQUEUE_KNOTES_MAX), which may exceed the number of records that
 * fit in `ubuf`.  Returns 0 or an errno (ENOMEM, or a copyout error).
 */
int
pid_kqueue_extinfo(proc_t p, struct kqueue *kq, user_addr_t ubuf,
    uint32_t bufsize, int32_t *retval)
{
	struct knote *kn;
	int i;
	int err = 0;
	struct filedesc *fdp = &p->p_fd;
	unsigned long nknotes = 0;
	/* number of records the caller's buffer can hold */
	unsigned long buflen = bufsize / sizeof(struct kevent_extinfo);
	struct kevent_extinfo *kqext = NULL;

	/* arbitrary upper limit to cap kernel memory usage, copyout size, etc. */
	buflen = MIN(buflen, PROC_PIDFDKQUEUE_KNOTES_MAX);

	kqext = (struct kevent_extinfo *)kalloc_data(buflen * sizeof(struct kevent_extinfo), Z_WAITOK | Z_ZERO);
	if (kqext == NULL) {
		err = ENOMEM;
		goto out;
	}

	proc_fdlock(p);
	/* snapshot the table geometry so a resize while unlocked is detectable */
	u_long fd_knlistsize = fdp->fd_knlistsize;
	struct klist *fd_knlist = fdp->fd_knlist;

	for (i = 0; i < fd_knlistsize; i++) {
		kn = SLIST_FIRST(&fd_knlist[i]);
		nknotes = kevent_extinfo_emit(kq, kn, kqext, buflen, nknotes);

		/* drop and retake the lock between buckets for breathing room */
		proc_fdunlock(p);
		proc_fdlock(p);
		/*
		 * Reevaluate to see if we have raced with someone who changed this -
		 * if we have, we return the set of info for fd_knlistsize we knew
		 * in the beginning except if knotes_dealloc interleaves with us.
		 * In that case, we bail out early with the set of info captured so far.
		 */
		if (fd_knlistsize != fdp->fd_knlistsize) {
			if (fdp->fd_knlistsize) {
				/* kq_add_knote might grow fdp->fd_knlist. */
				fd_knlist = fdp->fd_knlist;
			} else {
				break;
			}
		}
	}
	proc_fdunlock(p);

	/* second pass: knotes hanging off the knote hash */
	knhash_lock(fdp);
	u_long knhashmask = fdp->fd_knhashmask;

	if (knhashmask != 0) {
		for (i = 0; i < (int)knhashmask + 1; i++) {
			kn = SLIST_FIRST(&fdp->fd_knhash[i]);
			nknotes = kevent_extinfo_emit(kq, kn, kqext, buflen, nknotes);

			knhash_unlock(fdp);
			knhash_lock(fdp);

			/*
			 * Reevaluate to see if we have raced with someone who changed this -
			 * if we have, we should bail out with the set of info captured so far
			 */
			if (fdp->fd_knhashmask != knhashmask) {
				break;
			}
		}
	}
	knhash_unlock(fdp);

	/* only copy out as many records as were both found and buffered */
	assert(bufsize >= sizeof(struct kevent_extinfo) * MIN(buflen, nknotes));
	err = copyout(kqext, ubuf, sizeof(struct kevent_extinfo) * MIN(buflen, nknotes));

out:
	kfree_data(kqext, buflen * sizeof(struct kevent_extinfo));

	if (!err) {
		*retval = (int32_t)MIN(nknotes, PROC_PIDFDKQUEUE_KNOTES_MAX);
	}
	return err;
}
9560
9561 static unsigned int
9562 klist_copy_udata(struct klist *list, uint64_t *buf,
9563 unsigned int buflen, unsigned int nknotes)
9564 {
9565 struct knote *kn;
9566 SLIST_FOREACH(kn, list, kn_link) {
9567 if (nknotes < buflen) {
9568 /*
9569 * kevent_register will always set kn_udata atomically
9570 * so that we don't have to take any kqlock here.
9571 */
9572 buf[nknotes] = os_atomic_load_wide(&kn->kn_udata, relaxed);
9573 }
9574 /* we return total number of knotes, which may be more than requested */
9575 nknotes++;
9576 }
9577
9578 return nknotes;
9579 }
9580
/*
 * Gather the user-visible "uptrs" of process `proc` into `buf`: first the
 * kn_udata value of every knote (from both the fd-indexed knote list and the
 * knote hash), then the dynamic id of every workloop kqueue.  At most
 * bufsize / sizeof(uint64_t) values are stored, but all are counted; returns
 * the total number found, which may exceed what was stored.
 */
int
kevent_proc_copy_uptrs(void *proc, uint64_t *buf, uint32_t bufsize)
{
	proc_t p = (proc_t)proc;
	struct filedesc *fdp = &p->p_fd;
	unsigned int nuptrs = 0;
	unsigned int buflen = bufsize / sizeof(uint64_t);
	struct kqworkloop *kqwl;
	u_long size = 0;
	struct klist *fd_knlist = NULL;

	if (buflen > 0) {
		assert(buf != NULL);
	}

	/*
	 * Copyout the uptrs as much as possible but make sure to drop the respective
	 * locks and take them again periodically so that we don't blow through
	 * preemption disabled timeouts. Always reevaluate to see if we have raced
	 * with someone who changed size of the hash - if we have, we return info for
	 * the size of the hash we knew in the beginning except if it drops to 0.
	 * In that case, we bail out with the set of info captured so far
	 */
	proc_fdlock(p);
	/* first pass: knotes on the fd-indexed list */
	size = fdp->fd_knlistsize;
	fd_knlist = fdp->fd_knlist;

	for (int i = 0; i < size; i++) {
		nuptrs = klist_copy_udata(&fd_knlist[i], buf, buflen, nuptrs);

		proc_fdunlock(p);
		proc_fdlock(p);
		if (size != fdp->fd_knlistsize) {
			if (fdp->fd_knlistsize) {
				/* kq_add_knote might grow fdp->fd_knlist. */
				fd_knlist = fdp->fd_knlist;
			} else {
				break;
			}
		}
	}
	proc_fdunlock(p);

	/* second pass: knotes hanging off the knote hash */
	knhash_lock(fdp);
	size = fdp->fd_knhashmask;

	if (size != 0) {
		for (size_t i = 0; i < size + 1; i++) {
			nuptrs = klist_copy_udata(&fdp->fd_knhash[i], buf, buflen, nuptrs);

			knhash_unlock(fdp);
			knhash_lock(fdp);
			/* The only path that can interleave with us today is knotes_dealloc. */
			if (size != fdp->fd_knhashmask) {
				break;
			}
		}
	}
	knhash_unlock(fdp);

	/* final pass: dynamic ids of workloop kqueues in the kq hash */
	kqhash_lock(fdp);
	size = fdp->fd_kqhashmask;

	if (size != 0) {
		for (size_t i = 0; i < size + 1; i++) {
			LIST_FOREACH(kqwl, &fdp->fd_kqhash[i], kqwl_hashlink) {
				if (nuptrs < buflen) {
					buf[nuptrs] = kqwl->kqwl_dynamicid;
				}
				nuptrs++;
			}

			kqhash_unlock(fdp);
			kqhash_lock(fdp);
			if (size != fdp->fd_kqhashmask) {
				break;
			}
		}
	}
	kqhash_unlock(fdp);

	return (int)nuptrs;
}
9664
9665 static void
9666 kevent_set_return_to_kernel_user_tsd(proc_t p, thread_t thread)
9667 {
9668 uint64_t ast_addr;
9669 bool proc_is_64bit = !!(p->p_flag & P_LP64);
9670 size_t user_addr_size = proc_is_64bit ? 8 : 4;
9671 uint32_t ast_flags32 = 0;
9672 uint64_t ast_flags64 = 0;
9673 struct uthread *ut = get_bsdthread_info(thread);
9674
9675 if (ut->uu_kqr_bound != NULL) {
9676 ast_flags64 |= R2K_WORKLOOP_PENDING_EVENTS;
9677 }
9678
9679 if (ast_flags64 == 0) {
9680 return;
9681 }
9682
9683 if (!(p->p_flag & P_LP64)) {
9684 ast_flags32 = (uint32_t)ast_flags64;
9685 assert(ast_flags64 < 0x100000000ull);
9686 }
9687
9688 ast_addr = thread_rettokern_addr(thread);
9689 if (ast_addr == 0) {
9690 return;
9691 }
9692
9693 if (copyout((proc_is_64bit ? (void *)&ast_flags64 : (void *)&ast_flags32),
9694 (user_addr_t)ast_addr,
9695 user_addr_size) != 0) {
9696 printf("pid %d (tid:%llu): copyout of return_to_kernel ast flags failed with "
9697 "ast_addr = %llu\n", proc_getpid(p), thread_tid(current_thread()), ast_addr);
9698 }
9699 }
9700
9701 /*
9702 * Semantics of writing to TSD value:
9703 *
9704 * 1. It is written to by the kernel and cleared by userspace.
9705 * 2. When the userspace code clears the TSD field, it takes responsibility for
9706 * taking action on the quantum expiry action conveyed by kernel.
 * 3. The TSD value is always cleared upon entry into userspace and upon exit
 *    from userspace back to the kernel, to make sure that it is never leaked
 *    across thread requests.
9710 */
9711 void
9712 kevent_set_workq_quantum_expiry_user_tsd(proc_t p, thread_t thread,
9713 uint64_t flags)
9714 {
9715 uint64_t ast_addr;
9716 bool proc_is_64bit = !!(p->p_flag & P_LP64);
9717 uint32_t ast_flags32 = 0;
9718 uint64_t ast_flags64 = flags;
9719
9720 if (ast_flags64 == 0) {
9721 return;
9722 }
9723
9724 if (!(p->p_flag & P_LP64)) {
9725 ast_flags32 = (uint32_t)ast_flags64;
9726 assert(ast_flags64 < 0x100000000ull);
9727 }
9728
9729 ast_addr = thread_wqquantum_addr(thread);
9730 assert(ast_addr != 0);
9731
9732 if (proc_is_64bit) {
9733 if (copyout_atomic64(ast_flags64, (user_addr_t) ast_addr)) {
9734 #if DEBUG || DEVELOPMENT
9735 printf("pid %d (tid:%llu): copyout of workq quantum ast flags failed with "
9736 "ast_addr = %llu\n", proc_getpid(p), thread_tid(thread), ast_addr);
9737 #endif
9738 }
9739 } else {
9740 if (copyout_atomic32(ast_flags32, (user_addr_t) ast_addr)) {
9741 #if DEBUG || DEVELOPMENT
9742 printf("pid %d (tid:%llu): copyout of workq quantum ast flags failed with "
9743 "ast_addr = %llu\n", proc_getpid(p), thread_tid(thread), ast_addr);
9744 #endif
9745 }
9746 }
9747 }
9748
/*
 * Handle kevent-related AST work for `thread` on its way back to userspace,
 * dispatching on the pending AST_KEVENT_* `bits`.
 */
void
kevent_ast(thread_t thread, uint16_t bits)
{
	proc_t p = current_proc();


	if (bits & AST_KEVENT_REDRIVE_THREADREQ) {
		/* re-kick the workqueue thread request machinery */
		workq_kern_threadreq_redrive(p, WORKQ_THREADREQ_CAN_CREATE_THREADS);
	}
	if (bits & AST_KEVENT_RETURN_TO_KERNEL) {
		/* publish pending return-to-kernel flags into the thread's user TSD */
		kevent_set_return_to_kernel_user_tsd(p, thread);
	}

	if (bits & AST_KEVENT_WORKQ_QUANTUM_EXPIRED) {
		workq_kern_quantum_expiry_reevaluate(p, thread);
	}
}
9766
9767 #if DEVELOPMENT || DEBUG
9768
9769 #define KEVENT_SYSCTL_BOUND_ID 1
9770
9771 static int
9772 kevent_sysctl SYSCTL_HANDLER_ARGS
9773 {
9774 #pragma unused(oidp, arg2)
9775 uintptr_t type = (uintptr_t)arg1;
9776 uint64_t bound_id = 0;
9777
9778 if (type != KEVENT_SYSCTL_BOUND_ID) {
9779 return EINVAL;
9780 }
9781
9782 if (req->newptr) {
9783 return EINVAL;
9784 }
9785
9786 struct uthread *ut = current_uthread();
9787 if (!ut) {
9788 return EFAULT;
9789 }
9790
9791 workq_threadreq_t kqr = ut->uu_kqr_bound;
9792 if (kqr) {
9793 if (kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
9794 bound_id = kqr_kqworkloop(kqr)->kqwl_dynamicid;
9795 } else {
9796 bound_id = -1;
9797 }
9798 }
9799
9800 return sysctl_io_number(req, bound_id, sizeof(bound_id), NULL, NULL);
9801 }
9802
/* kern.kevent: parent node for the kevent debug sysctls below */
SYSCTL_NODE(_kern, OID_AUTO, kevent, CTLFLAG_RW | CTLFLAG_LOCKED, 0,
    "kevent information");

/* kern.kevent.bound_id: serviced by kevent_sysctl with KEVENT_SYSCTL_BOUND_ID */
SYSCTL_PROC(_kern_kevent, OID_AUTO, bound_id,
    CTLTYPE_QUAD | CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_MASKED,
    (void *)KEVENT_SYSCTL_BOUND_ID,
    sizeof(kqueue_id_t), kevent_sysctl, "Q",
    "get the ID of the bound kqueue");
9811
9812 #endif /* DEVELOPMENT || DEBUG */
9813