xref: /xnu-8796.141.3/bsd/kern/kern_event.c (revision 1b191cb58250d0705d8a51287127505aa4bc0789)
1 /*
2  * Copyright (c) 2000-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  *
28  */
29 /*-
30  * Copyright (c) 1999,2000,2001 Jonathan Lemon <[email protected]>
31  * All rights reserved.
32  *
33  * Redistribution and use in source and binary forms, with or without
34  * modification, are permitted provided that the following conditions
35  * are met:
36  * 1. Redistributions of source code must retain the above copyright
37  *    notice, this list of conditions and the following disclaimer.
38  * 2. Redistributions in binary form must reproduce the above copyright
39  *    notice, this list of conditions and the following disclaimer in the
40  *    documentation and/or other materials provided with the distribution.
41  *
42  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
43  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
46  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52  * SUCH DAMAGE.
53  */
54 /*
55  *	@(#)kern_event.c       1.0 (3/31/2000)
56  */
57 #include <stdint.h>
58 #include <machine/atomic.h>
59 
60 #include <sys/param.h>
61 #include <sys/systm.h>
62 #include <sys/filedesc.h>
63 #include <sys/kernel.h>
64 #include <sys/proc_internal.h>
65 #include <sys/kauth.h>
66 #include <sys/malloc.h>
67 #include <sys/unistd.h>
68 #include <sys/file_internal.h>
69 #include <sys/fcntl.h>
70 #include <sys/select.h>
71 #include <sys/queue.h>
72 #include <sys/event.h>
73 #include <sys/eventvar.h>
74 #include <sys/protosw.h>
75 #include <sys/socket.h>
76 #include <sys/socketvar.h>
77 #include <sys/stat.h>
78 #include <sys/syscall.h> // SYS_* constants
79 #include <sys/sysctl.h>
80 #include <sys/uio.h>
81 #include <sys/sysproto.h>
82 #include <sys/user.h>
83 #include <sys/vnode_internal.h>
84 #include <string.h>
85 #include <sys/proc_info.h>
86 #include <sys/codesign.h>
87 #include <sys/pthread_shims.h>
88 #include <sys/kdebug.h>
89 #include <os/base.h>
90 #include <pexpert/pexpert.h>
91 
92 #include <kern/thread_group.h>
93 #include <kern/locks.h>
94 #include <kern/clock.h>
95 #include <kern/cpu_data.h>
96 #include <kern/policy_internal.h>
97 #include <kern/thread_call.h>
98 #include <kern/sched_prim.h>
99 #include <kern/waitq.h>
100 #include <kern/zalloc.h>
101 #include <kern/kalloc.h>
102 #include <kern/assert.h>
103 #include <kern/ast.h>
104 #include <kern/thread.h>
105 #include <kern/kcdata.h>
106 
107 #include <pthread/priority_private.h>
108 #include <pthread/workqueue_syscalls.h>
109 #include <pthread/workqueue_internal.h>
110 #include <libkern/libkern.h>
111 
112 #include <os/log.h>
113 
114 #include "net/net_str_id.h"
115 
116 #if SKYWALK && defined(XNU_TARGET_OS_OSX)
117 #include <skywalk/lib/net_filter_event.h>
118 
119 extern bool net_check_compatible_alf(void);
120 #endif /* SKYWALK && XNU_TARGET_OS_OSX */
121 
122 #include <mach/task.h>
123 #include <libkern/section_keywords.h>
124 
125 #if CONFIG_MEMORYSTATUS
126 #include <sys/kern_memorystatus.h>
127 #endif
128 
129 #if DEVELOPMENT || DEBUG
130 #define KEVENT_PANIC_ON_WORKLOOP_OWNERSHIP_LEAK  (1U << 0)
131 #define KEVENT_PANIC_ON_NON_ENQUEUED_PROCESS     (1U << 1)
132 TUNABLE(uint32_t, kevent_debug_flags, "kevent_debug", 0);
133 #endif
134 
135 static LCK_GRP_DECLARE(kq_lck_grp, "kqueue");
136 SECURITY_READ_ONLY_EARLY(vm_packing_params_t) kn_kq_packing_params =
137     VM_PACKING_PARAMS(KNOTE_KQ_PACKED);
138 
139 extern mach_port_name_t ipc_entry_name_mask(mach_port_name_t name); /* osfmk/ipc/ipc_entry.h */
140 extern int cansignal(struct proc *, kauth_cred_t, struct proc *, int); /* bsd/kern/kern_sig.c */
141 
142 #define KEV_EVTID(code) BSDDBG_CODE(DBG_BSD_KEVENT, (code))
143 
144 static int kqueue_select(struct fileproc *fp, int which, void *wq_link_id,
145     vfs_context_t ctx);
146 static int kqueue_close(struct fileglob *fg, vfs_context_t ctx);
147 static int kqueue_kqfilter(struct fileproc *fp, struct knote *kn,
148     struct kevent_qos_s *kev);
149 static int kqueue_drain(struct fileproc *fp, vfs_context_t ctx);
150 
/*
 * File operations vector for kqueue-backed file descriptors.
 *
 * kqueues do not support read/write/ioctl: those slots are wired to the
 * generic fo_no_* stubs, while select/close/drain/kqfilter dispatch to
 * the kqueue-specific implementations declared above.
 */
static const struct fileops kqueueops = {
	.fo_type     = DTYPE_KQUEUE,
	.fo_read     = fo_no_read,
	.fo_write    = fo_no_write,
	.fo_ioctl    = fo_no_ioctl,
	.fo_select   = kqueue_select,
	.fo_close    = kqueue_close,
	.fo_drain    = kqueue_drain,
	.fo_kqfilter = kqueue_kqfilter,
};
161 
162 static inline int kevent_modern_copyout(struct kevent_qos_s *, user_addr_t *);
163 static int kevent_register_wait_prepare(struct knote *kn, struct kevent_qos_s *kev, int result);
164 static void kevent_register_wait_block(struct turnstile *ts, thread_t handoff_thread,
165     thread_continue_t cont, struct _kevent_register *cont_args) __dead2;
166 static void kevent_register_wait_return(struct _kevent_register *cont_args) __dead2;
167 static void kevent_register_wait_cleanup(struct knote *kn);
168 
169 static struct kqtailq *kqueue_get_suppressed_queue(kqueue_t kq, struct knote *kn);
170 static void kqueue_threadreq_initiate(struct kqueue *kq, workq_threadreq_t, kq_index_t qos, int flags);
171 
172 static void kqworkq_unbind(proc_t p, workq_threadreq_t);
173 static thread_qos_t kqworkq_unbind_locked(struct kqworkq *kqwq, workq_threadreq_t, thread_t thread);
174 static workq_threadreq_t kqworkq_get_request(struct kqworkq *kqwq, kq_index_t qos_index);
175 static void kqueue_update_iotier_override(kqueue_t kqu);
176 
177 static void kqworkloop_unbind(struct kqworkloop *kwql);
178 
/*
 * How kqworkloop_unbind_locked() should dispose of the unbound thread's
 * QoS override: drop it immediately, or defer the drop (see
 * kqworkloop_unbind_delayed_override_drop()).
 */
enum kqwl_unbind_locked_mode {
	KQWL_OVERRIDE_DROP_IMMEDIATELY,
	KQWL_OVERRIDE_DROP_DELAYED,
};
183 static void kqworkloop_unbind_locked(struct kqworkloop *kwql, thread_t thread,
184     enum kqwl_unbind_locked_mode how);
185 static void kqworkloop_unbind_delayed_override_drop(thread_t thread);
186 static kq_index_t kqworkloop_override(struct kqworkloop *kqwl);
187 static void kqworkloop_set_overcommit(struct kqworkloop *kqwl);
/*
 * Operation codes for kqworkloop_update_threads_qos(), selecting which
 * QoS bookkeeping update is performed on the workloop.
 */
enum {
	KQWL_UTQ_NONE,
	/*
	 * The wakeup qos is the qos of QUEUED knotes.
	 *
	 * This QoS is accounted for with the events override in the
	 * kqr_override_index field. It is raised each time a new knote is queued at
	 * a given QoS. The kqwl_wakeup_qos field is a superset of the non empty
	 * knote buckets and is recomputed after each event delivery.
	 */
	KQWL_UTQ_UPDATE_WAKEUP_QOS,
	KQWL_UTQ_RECOMPUTE_WAKEUP_QOS,
	KQWL_UTQ_UNBINDING, /* attempt to rebind */
	KQWL_UTQ_PARKING,
	/*
	 * The wakeup override is for suppressed knotes that have fired again at
	 * a higher QoS than the one for which they are suppressed already.
	 * This override is cleared when the knote suppressed list becomes empty.
	 */
	KQWL_UTQ_UPDATE_WAKEUP_OVERRIDE,
	KQWL_UTQ_RESET_WAKEUP_OVERRIDE,
	/*
	 * The QoS is the maximum QoS of an event enqueued on this workloop in
	 * userland. It is copied from the only EVFILT_WORKLOOP knote with
	 * a NOTE_WL_THREAD_REQUEST bit set allowed on this workloop. If there is no
	 * such knote, this QoS is 0.
	 */
	KQWL_UTQ_SET_QOS_INDEX,
	KQWL_UTQ_REDRIVE_EVENTS,
};
218 static void kqworkloop_update_threads_qos(struct kqworkloop *kqwl, int op, kq_index_t qos);
219 static int kqworkloop_end_processing(struct kqworkloop *kqwl, int flags, int kevent_flags);
220 
221 static struct knote *knote_alloc(void);
222 static void knote_free(struct knote *kn);
223 static int kq_add_knote(struct kqueue *kq, struct knote *kn,
224     struct knote_lock_ctx *knlc, struct proc *p);
225 static struct knote *kq_find_knote_and_kq_lock(struct kqueue *kq,
226     struct kevent_qos_s *kev, bool is_fd, struct proc *p);
227 
228 static void knote_activate(kqueue_t kqu, struct knote *kn, int result);
229 static void knote_dequeue(kqueue_t kqu, struct knote *kn);
230 
231 static void knote_apply_touch(kqueue_t kqu, struct knote *kn,
232     struct kevent_qos_s *kev, int result);
233 static void knote_suppress(kqueue_t kqu, struct knote *kn);
234 static void knote_unsuppress(kqueue_t kqu, struct knote *kn);
235 static void knote_drop(kqueue_t kqu, struct knote *kn, struct knote_lock_ctx *knlc);
236 
237 // both these functions may dequeue the knote and it is up to the caller
238 // to enqueue the knote back
239 static void knote_adjust_qos(struct kqueue *kq, struct knote *kn, int result);
240 static void knote_reset_priority(kqueue_t kqu, struct knote *kn, pthread_priority_t pp);
241 
242 static ZONE_DEFINE(knote_zone, "knote zone",
243     sizeof(struct knote), ZC_CACHING | ZC_ZFREE_CLEARMEM);
244 static ZONE_DEFINE(kqfile_zone, "kqueue file zone",
245     sizeof(struct kqfile), ZC_ZFREE_CLEARMEM | ZC_NOTBITAG);
246 static ZONE_DEFINE(kqworkq_zone, "kqueue workq zone",
247     sizeof(struct kqworkq), ZC_ZFREE_CLEARMEM | ZC_NOTBITAG);
248 static ZONE_DEFINE(kqworkloop_zone, "kqueue workloop zone",
249     sizeof(struct kqworkloop), ZC_CACHING | ZC_ZFREE_CLEARMEM | ZC_NOTBITAG);
250 
251 #define KN_HASH(val, mask)      (((val) ^ (val >> 8)) & (mask))
252 
253 static int filt_no_attach(struct knote *kn, struct kevent_qos_s *kev);
254 static void filt_no_detach(struct knote *kn);
255 static int filt_bad_event(struct knote *kn, long hint);
256 static int filt_bad_touch(struct knote *kn, struct kevent_qos_s *kev);
257 static int filt_bad_process(struct knote *kn, struct kevent_qos_s *kev);
258 
/*
 * Filter operations installed in sysfilt_ops slots that are invalid or
 * compiled out of this configuration: attach is refused (filt_no_attach)
 * and the remaining hooks are the filt_bad_* stubs, which should never
 * be reachable on a knote that failed to attach.
 */
SECURITY_READ_ONLY_EARLY(static struct filterops) bad_filtops = {
	.f_attach  = filt_no_attach,
	.f_detach  = filt_no_detach,
	.f_event   = filt_bad_event,
	.f_touch   = filt_bad_touch,
	.f_process = filt_bad_process,
};
266 
267 #if CONFIG_MEMORYSTATUS
268 extern const struct filterops memorystatus_filtops;
269 #endif /* CONFIG_MEMORYSTATUS */
270 extern const struct filterops fs_filtops;
271 extern const struct filterops sig_filtops;
272 extern const struct filterops machport_filtops;
273 extern const struct filterops pipe_nfiltops;
274 extern const struct filterops pipe_rfiltops;
275 extern const struct filterops pipe_wfiltops;
276 extern const struct filterops ptsd_kqops;
277 extern const struct filterops ptmx_kqops;
278 extern const struct filterops soread_filtops;
279 extern const struct filterops sowrite_filtops;
280 extern const struct filterops sock_filtops;
281 extern const struct filterops soexcept_filtops;
282 extern const struct filterops spec_filtops;
283 extern const struct filterops bpfread_filtops;
284 extern const struct filterops necp_fd_rfiltops;
285 #if SKYWALK
286 extern const struct filterops skywalk_channel_rfiltops;
287 extern const struct filterops skywalk_channel_wfiltops;
288 extern const struct filterops skywalk_channel_efiltops;
289 #endif /* SKYWALK */
290 extern const struct filterops fsevent_filtops;
291 extern const struct filterops vnode_filtops;
292 extern const struct filterops tty_filtops;
293 
294 const static struct filterops file_filtops;
295 const static struct filterops kqread_filtops;
296 const static struct filterops proc_filtops;
297 const static struct filterops timer_filtops;
298 const static struct filterops user_filtops;
299 const static struct filterops workloop_filtops;
300 
301 /*
302  *
303  * Rules for adding new filters to the system:
304  * Public filters:
305  * - Add a new "EVFILT_" option value to bsd/sys/event.h (typically a negative value)
306  *   in the exported section of the header
307  * - Update the EVFILT_SYSCOUNT value to reflect the new addition
308  * - Add a filterops to the sysfilt_ops array. Public filters should be added at the end
309  *   of the Public Filters section in the array.
310  * Private filters:
311  * - Add a new "EVFILT_" value to bsd/sys/event.h (typically a positive value)
312  *   in the XNU_KERNEL_PRIVATE section of the header
313  * - Update the EVFILTID_MAX value to reflect the new addition
314  * - Add a filterops to the sysfilt_ops. Private filters should be added at the end of
315  *   the Private filters section of the array.
316  */
/* kn_filtid is stored in a byte, so all filter ids must fit in uint8_t */
static_assert(EVFILTID_MAX < UINT8_MAX, "kn_filtid expects this to be true");
/*
 * Master table mapping filter ids to their filterops.
 *
 * Public EVFILT_* values are negative and are indexed via ~EVFILT_*;
 * private EVFILTID_* values index the array directly.  See the rules for
 * adding new filters in the comment above.
 */
static const struct filterops * const sysfilt_ops[EVFILTID_MAX] = {
	/* Public Filters */
	[~EVFILT_READ]                  = &file_filtops,
	[~EVFILT_WRITE]                 = &file_filtops,
	[~EVFILT_AIO]                   = &bad_filtops,
	[~EVFILT_VNODE]                 = &file_filtops,
	[~EVFILT_PROC]                  = &proc_filtops,
	[~EVFILT_SIGNAL]                = &sig_filtops,
	[~EVFILT_TIMER]                 = &timer_filtops,
	[~EVFILT_MACHPORT]              = &machport_filtops,
	[~EVFILT_FS]                    = &fs_filtops,
	[~EVFILT_USER]                  = &user_filtops,
	[~EVFILT_UNUSED_11]             = &bad_filtops,
	[~EVFILT_VM]                    = &bad_filtops,
	[~EVFILT_SOCK]                  = &file_filtops,
#if CONFIG_MEMORYSTATUS
	[~EVFILT_MEMORYSTATUS]          = &memorystatus_filtops,
#else
	[~EVFILT_MEMORYSTATUS]          = &bad_filtops,
#endif
	[~EVFILT_EXCEPT]                = &file_filtops,
#if SKYWALK
	[~EVFILT_NW_CHANNEL]            = &file_filtops,
#else /* !SKYWALK */
	[~EVFILT_NW_CHANNEL]            = &bad_filtops,
#endif /* !SKYWALK */
	[~EVFILT_WORKLOOP]              = &workloop_filtops,

	/* Private filters */
	[EVFILTID_KQREAD]               = &kqread_filtops,
	[EVFILTID_PIPE_N]               = &pipe_nfiltops,
	[EVFILTID_PIPE_R]               = &pipe_rfiltops,
	[EVFILTID_PIPE_W]               = &pipe_wfiltops,
	[EVFILTID_PTSD]                 = &ptsd_kqops,
	[EVFILTID_SOREAD]               = &soread_filtops,
	[EVFILTID_SOWRITE]              = &sowrite_filtops,
	[EVFILTID_SCK]                  = &sock_filtops,
	[EVFILTID_SOEXCEPT]             = &soexcept_filtops,
	[EVFILTID_SPEC]                 = &spec_filtops,
	[EVFILTID_BPFREAD]              = &bpfread_filtops,
	[EVFILTID_NECP_FD]              = &necp_fd_rfiltops,
#if SKYWALK
	[EVFILTID_SKYWALK_CHANNEL_W]    = &skywalk_channel_wfiltops,
	[EVFILTID_SKYWALK_CHANNEL_R]    = &skywalk_channel_rfiltops,
	[EVFILTID_SKYWALK_CHANNEL_E]    = &skywalk_channel_efiltops,
#else /* !SKYWALK */
	[EVFILTID_SKYWALK_CHANNEL_W]    = &bad_filtops,
	[EVFILTID_SKYWALK_CHANNEL_R]    = &bad_filtops,
	[EVFILTID_SKYWALK_CHANNEL_E]    = &bad_filtops,
#endif /* !SKYWALK */
	[EVFILTID_FSEVENT]              = &fsevent_filtops,
	[EVFILTID_VN]                   = &vnode_filtops,
	[EVFILTID_TTY]                  = &tty_filtops,
	[EVFILTID_PTMX]                 = &ptmx_kqops,

	/* fake filter for detached knotes, keep last */
	[EVFILTID_DETACHED]             = &bad_filtops,
};
376 
377 static inline bool
kqr_thread_bound(workq_threadreq_t kqr)378 kqr_thread_bound(workq_threadreq_t kqr)
379 {
380 	return kqr->tr_state == WORKQ_TR_STATE_BOUND;
381 }
382 
383 static inline bool
kqr_thread_requested_pending(workq_threadreq_t kqr)384 kqr_thread_requested_pending(workq_threadreq_t kqr)
385 {
386 	workq_tr_state_t tr_state = kqr->tr_state;
387 	return tr_state > WORKQ_TR_STATE_IDLE && tr_state < WORKQ_TR_STATE_BOUND;
388 }
389 
390 static inline bool
kqr_thread_requested(workq_threadreq_t kqr)391 kqr_thread_requested(workq_threadreq_t kqr)
392 {
393 	return kqr->tr_state != WORKQ_TR_STATE_IDLE;
394 }
395 
/*
 * Returns the thread bound to the thread request.
 *
 * Only valid when the caller knows the request is in the BOUND state
 * (asserted); use kqr_thread() when that is not guaranteed.
 */
static inline thread_t
kqr_thread_fast(workq_threadreq_t kqr)
{
	assert(kqr_thread_bound(kqr));
	return kqr->tr_thread;
}
402 
403 static inline thread_t
kqr_thread(workq_threadreq_t kqr)404 kqr_thread(workq_threadreq_t kqr)
405 {
406 	return kqr_thread_bound(kqr) ? kqr->tr_thread : THREAD_NULL;
407 }
408 
409 static inline struct kqworkloop *
kqr_kqworkloop(workq_threadreq_t kqr)410 kqr_kqworkloop(workq_threadreq_t kqr)
411 {
412 	if (kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
413 		return __container_of(kqr, struct kqworkloop, kqwl_request);
414 	}
415 	return NULL;
416 }
417 
/*
 * Resolves the kqueue (workloop or workq) that owns a thread request.
 *
 * Workloop requests are embedded in their kqworkloop; any other request
 * must be one of the per-QoS-bucket requests embedded in the process's
 * workq kqueue (asserted below).
 */
static inline kqueue_t
kqr_kqueue(proc_t p, workq_threadreq_t kqr)
{
	kqueue_t kqu;
	if (kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
		kqu.kqwl = kqr_kqworkloop(kqr);
	} else {
		kqu.kqwq = p->p_fd.fd_wqkqueue;
		/* the request must point into the kqwq's embedded request array */
		assert(kqr >= kqu.kqwq->kqwq_request &&
		    kqr < kqu.kqwq->kqwq_request + KQWQ_NBUCKETS);
	}
	return kqu;
}
431 
#if CONFIG_PREADOPT_TG
/* There are no guarantees about which locks are held when this is called */
/*
 * Returns a snapshot of the workloop's preadopted thread group, or NULL
 * for workq (non-workloop) requests which have none.
 */
inline thread_group_qos_t
kqr_preadopt_thread_group(workq_threadreq_t req)
{
	struct kqworkloop *kqwl = kqr_kqworkloop(req);
	/* relaxed load: callers tolerate a racy snapshot of this field */
	return kqwl ? os_atomic_load(&kqwl->kqwl_preadopt_tg, relaxed) : NULL;
}

/* There are no guarantees about which locks are held when this is called */
/*
 * Returns the address of the workloop's preadopt thread group field so
 * callers can perform their own atomic operations on it, or NULL for
 * workq (non-workloop) requests.
 */
inline _Atomic(thread_group_qos_t) *
kqr_preadopt_thread_group_addr(workq_threadreq_t req)
{
	struct kqworkloop *kqwl = kqr_kqworkloop(req);
	return kqwl ? (&kqwl->kqwl_preadopt_tg) : NULL;
}
#endif
449 
450 /*
451  * kqueue/note lock implementations
452  *
453  *	The kqueue lock guards the kq state, the state of its queues,
454  *	and the kqueue-aware status and locks of individual knotes.
455  *
456  *	The kqueue workq lock is used to protect state guarding the
457  *	interaction of the kqueue with the workq.  This state cannot
458  *	be guarded by the kq lock - as it needs to be taken when we
459  *	already have the waitq set lock held (during the waitq hook
460  *	callback).  It might be better to use the waitq lock itself
461  *	for this, but the IRQ requirements make that difficult).
462  *
463  *	Knote flags, filter flags, and associated data are protected
464  *	by the underlying object lock - and are only ever looked at
465  *	by calling the filter to get a [consistent] snapshot of that
466  *	data.
467  */
468 
/* Acquires the kqueue spinlock (guards kq state, queues, knote status). */
static inline void
kqlock(kqueue_t kqu)
{
	lck_spin_lock(&kqu.kq->kq_lock);
}
474 
/* Asserts that the current thread holds the kqueue spinlock. */
static inline void
kqlock_held(__assert_only kqueue_t kqu)
{
	LCK_SPIN_ASSERT(&kqu.kq->kq_lock, LCK_ASSERT_OWNED);
}
480 
/* Releases the kqueue spinlock. */
static inline void
kqunlock(kqueue_t kqu)
{
	lck_spin_unlock(&kqu.kq->kq_lock);
}
486 
/* Acquires the per-filedesc knote hash table mutex. */
static inline void
knhash_lock(struct filedesc *fdp)
{
	lck_mtx_lock(&fdp->fd_knhashlock);
}
492 
/* Releases the per-filedesc knote hash table mutex. */
static inline void
knhash_unlock(struct filedesc *fdp)
{
	lck_mtx_unlock(&fdp->fd_knhashlock);
}
498 
/* wait event for knote locks */
static inline event_t
knote_lock_wev(struct knote *kn)
{
	/* the address of kn_hook is used purely as a unique wait channel */
	return (event_t)(&kn->kn_hook);
}
505 
/* wait event for kevent_register_wait_* */
static inline event64_t
knote_filt_wev64(struct knote *kn)
{
	/* the knote's own address is the wait channel */
	/* kdp_workloop_sync_wait_find_owner knows about this */
	return CAST_EVENT64_T(kn);
}
513 
/* wait event for knote_post/knote_drop */
static inline event_t
knote_post_wev(struct knote *kn)
{
	/* kn_kevent's address: a channel distinct from knote_lock_wev() */
	return &kn->kn_kevent;
}
520 
521 /*!
522  * @function knote_has_qos
523  *
524  * @brief
525  * Whether the knote has a regular QoS.
526  *
527  * @discussion
528  * kn_qos_override is:
529  * - 0 on kqfiles
530  * - THREAD_QOS_LAST for special buckets (manager)
531  *
532  * Other values mean the knote participates to QoS propagation.
533  */
534 static inline bool
knote_has_qos(struct knote * kn)535 knote_has_qos(struct knote *kn)
536 {
537 	return kn->kn_qos_override > 0 && kn->kn_qos_override < THREAD_QOS_LAST;
538 }
539 
540 #pragma mark knote locks
541 
542 /*
543  * Enum used by the knote_lock_* functions.
544  *
545  * KNOTE_KQ_LOCK_ALWAYS
546  *   The function will always return with the kq lock held.
547  *
548  * KNOTE_KQ_LOCK_ON_SUCCESS
549  *   The function will return with the kq lock held if it was successful
550  *   (knote_lock() is the only function that can fail).
551  *
552  * KNOTE_KQ_LOCK_ON_FAILURE
553  *   The function will return with the kq lock held if it was unsuccessful
554  *   (knote_lock() is the only function that can fail).
555  *
556  * KNOTE_KQ_UNLOCK:
557  *   The function returns with the kq unlocked.
558  */
enum kqlocking {
	KNOTE_KQ_LOCK_ALWAYS,     /* return with the kq lock held */
	KNOTE_KQ_LOCK_ON_SUCCESS, /* kq lock held only on success */
	KNOTE_KQ_LOCK_ON_FAILURE, /* kq lock held only on failure */
	KNOTE_KQ_UNLOCK,          /* return with the kq unlocked */
};
565 
/*
 * Returns the lock context of the current owner of the knote lock for
 * `kn`, found by scanning the kqueue's list of published lock contexts.
 *
 * Callers only invoke this while `kn` is locked (KN_LOCKED), so a
 * matching context must exist; failing to find one is fatal.
 */
static struct knote_lock_ctx *
knote_lock_ctx_find(kqueue_t kqu, struct knote *kn)
{
	struct knote_lock_ctx *ctx;
	LIST_FOREACH(ctx, &kqu.kq->kq_knlocks, knlc_link) {
		if (ctx->knlc_knote == kn) {
			return ctx;
		}
	}
	panic("knote lock context not found: %p", kn);
	/* not reached: tells the compiler the panic never returns */
	__builtin_trap();
}
578 
/* slowpath of knote_lock() */
/*
 * Contended path: the knote lock is held by another context, so register
 * as a waiter and sleep until knote_unlock() hands the lock over, or
 * knote_unlock_cancel() aborts all waiters.
 *
 * Called with the kq lock held; `kqlocking` dictates whether the kq lock
 * is re-taken on the success/failure paths (see enum kqlocking).
 *
 * Returns true if the knote lock was acquired, false if the wait was
 * cancelled because the knote is being dropped.
 */
__attribute__((noinline))
static bool __result_use_check
knote_lock_slow(kqueue_t kqu, struct knote *kn,
    struct knote_lock_ctx *knlc, int kqlocking)
{
	struct knote_lock_ctx *owner_lc;
	struct uthread *uth = current_uthread();
	wait_result_t wr;

	kqlock_held(kqu);

	/* find the current owner so we can sleep with it as inheritor */
	owner_lc = knote_lock_ctx_find(kqu, kn);
#if DEBUG || DEVELOPMENT
	knlc->knlc_state = KNOTE_LOCK_CTX_WAITING;
#endif
	owner_lc->knlc_waiters++;

	/*
	 * Make our lock context visible to knote_unlock()
	 */
	uth->uu_knlock = knlc;

	/* sleep (dropping the kq lock) until woken by unlock or cancel */
	wr = lck_spin_sleep_with_inheritor(&kqu.kq->kq_lock, LCK_SLEEP_UNLOCK,
	    knote_lock_wev(kn), owner_lc->knlc_thread,
	    THREAD_UNINT | THREAD_WAIT_NOREPORT, TIMEOUT_WAIT_FOREVER);

	if (wr == THREAD_RESTART) {
		/*
		 * We haven't been woken up by knote_unlock() but knote_unlock_cancel.
		 * We need to cleanup the state since no one did.
		 */
		uth->uu_knlock = NULL;
#if DEBUG || DEVELOPMENT
		assert(knlc->knlc_state == KNOTE_LOCK_CTX_WAITING);
		knlc->knlc_state = KNOTE_LOCK_CTX_UNLOCKED;
#endif

		if (kqlocking == KNOTE_KQ_LOCK_ALWAYS ||
		    kqlocking == KNOTE_KQ_LOCK_ON_FAILURE) {
			kqlock(kqu);
		}
		return false;
	} else {
		/* woken by knote_unlock(): ownership was handed to us */
		if (kqlocking == KNOTE_KQ_LOCK_ALWAYS ||
		    kqlocking == KNOTE_KQ_LOCK_ON_SUCCESS) {
			kqlock(kqu);
#if DEBUG || DEVELOPMENT
			/*
			 * This state is set under the lock so we can't
			 * really assert this unless we hold the lock.
			 */
			assert(knlc->knlc_state == KNOTE_LOCK_CTX_LOCKED);
#endif
		}
		return true;
	}
}
637 
/*
 * Attempts to take the "knote" lock.
 *
 * Called with the kqueue lock held.
 *
 * The caller provides `knlc`, a lock context published on the kqueue's
 * kq_knlocks list for the duration of the hold; `kqlocking` dictates the
 * kq lock state on return (see enum kqlocking).
 *
 * Returns true if the knote lock is acquired, false if it has been dropped
 */
static bool __result_use_check
knote_lock(kqueue_t kqu, struct knote *kn, struct knote_lock_ctx *knlc,
    enum kqlocking kqlocking)
{
	kqlock_held(kqu);

#if DEBUG || DEVELOPMENT
	assert(knlc->knlc_state == KNOTE_LOCK_CTX_UNLOCKED);
#endif
	/* initialize our lock context before it can become visible */
	knlc->knlc_knote = kn;
	knlc->knlc_thread = current_thread();
	knlc->knlc_waiters = 0;

	/* contended: wait for the current owner to hand the lock over */
	if (__improbable(kn->kn_status & KN_LOCKED)) {
		return knote_lock_slow(kqu, kn, knlc, kqlocking);
	}

	/*
	 * When the knote will be dropped, the knote lock is taken before
	 * KN_DROPPING is set, and then the knote will be removed from any
	 * hash table that references it before the lock is canceled.
	 */
	assert((kn->kn_status & KN_DROPPING) == 0);
	/* uncontended: publish ourselves as the owner */
	LIST_INSERT_HEAD(&kqu.kq->kq_knlocks, knlc, knlc_link);
	kn->kn_status |= KN_LOCKED;
#if DEBUG || DEVELOPMENT
	knlc->knlc_state = KNOTE_LOCK_CTX_LOCKED;
#endif

	if (kqlocking == KNOTE_KQ_UNLOCK ||
	    kqlocking == KNOTE_KQ_LOCK_ON_FAILURE) {
		kqunlock(kqu);
	}
	return true;
}
680 
/*
 * Unlocks a knote successfully locked with knote_lock().
 *
 * Called with the kqueue lock held.
 *
 * If other contexts are waiting for the knote lock, ownership is handed
 * directly to one of them rather than being released.
 *
 * Returns with the kqueue lock held according to KNOTE_KQ_* mode.
 */
static void
knote_unlock(kqueue_t kqu, struct knote *kn,
    struct knote_lock_ctx *knlc, enum kqlocking kqlocking)
{
	kqlock_held(kqu);

	assert(knlc->knlc_knote == kn);
	assert(kn->kn_status & KN_LOCKED);
#if DEBUG || DEVELOPMENT
	assert(knlc->knlc_state == KNOTE_LOCK_CTX_LOCKED);
#endif

	/* we are no longer the published owner */
	LIST_REMOVE(knlc, knlc_link);

	if (knlc->knlc_waiters) {
		thread_t thread = THREAD_NULL;

		/* wake exactly one waiter: it becomes the new lock owner */
		wakeup_one_with_inheritor(knote_lock_wev(kn), THREAD_AWAKENED,
		    LCK_WAKE_DEFAULT, &thread);

		/*
		 * knote_lock_slow() publishes the lock context of waiters
		 * in uthread::uu_knlock.
		 *
		 * Reach out and make this context the new owner.
		 */
		struct uthread *ut = get_bsdthread_info(thread);
		struct knote_lock_ctx *next_owner_lc = ut->uu_knlock;

		assert(next_owner_lc->knlc_knote == kn);
		/* the woken waiter inherits the remaining waiters */
		next_owner_lc->knlc_waiters = knlc->knlc_waiters - 1;
		LIST_INSERT_HEAD(&kqu.kq->kq_knlocks, next_owner_lc, knlc_link);
#if DEBUG || DEVELOPMENT
		next_owner_lc->knlc_state = KNOTE_LOCK_CTX_LOCKED;
#endif
		ut->uu_knlock = NULL;
		/* release the thread reference wakeup_one_with_inheritor() returned */
		thread_deallocate_safe(thread);
	} else {
		/* no waiters: the knote lock is simply released */
		kn->kn_status &= ~KN_LOCKED;
	}

	if ((kn->kn_status & KN_MERGE_QOS) && !(kn->kn_status & KN_POSTING)) {
		/*
		 * No f_event() in flight anymore, we can leave QoS "Merge" mode
		 *
		 * See knote_adjust_qos()
		 */
		kn->kn_status &= ~KN_MERGE_QOS;
	}
	if (kqlocking == KNOTE_KQ_UNLOCK) {
		kqunlock(kqu);
	}
#if DEBUG || DEVELOPMENT
	knlc->knlc_state = KNOTE_LOCK_CTX_UNLOCKED;
#endif
}
744 
/*
 * Aborts all waiters for a knote lock, and unlock the knote.
 *
 * Only used on the drop path (KN_DROPPING asserted): waiters are woken
 * with THREAD_RESTART, which knote_lock_slow() translates into a failed
 * lock attempt.
 *
 * Called with the kqueue lock held.
 *
 * Returns with the kqueue unlocked.
 */
static void
knote_unlock_cancel(struct kqueue *kq, struct knote *kn,
    struct knote_lock_ctx *knlc)
{
	kqlock_held(kq);

	assert(knlc->knlc_knote == kn);
	assert(kn->kn_status & KN_LOCKED);
	assert(kn->kn_status & KN_DROPPING);

	LIST_REMOVE(knlc, knlc_link);
	kn->kn_status &= ~KN_LOCKED;
	kqunlock(kq);

	if (knlc->knlc_waiters) {
		/* THREAD_RESTART makes every waiter bail out of knote_lock_slow() */
		wakeup_all_with_inheritor(knote_lock_wev(kn), THREAD_RESTART);
	}
#if DEBUG || DEVELOPMENT
	knlc->knlc_state = KNOTE_LOCK_CTX_UNLOCKED;
#endif
}
773 
/*
 * Call the f_event hook of a given filter.
 *
 * Takes a use count to protect against concurrent drops.
 * Called with the object lock held.
 */
static void
knote_post(struct knote *kn, long hint)
{
	struct kqueue *kq = knote_get_kq(kn);
	int dropping, result;

	kqlock(kq);

	/* nothing to post on a knote that is already going away */
	if (__improbable(kn->kn_status & (KN_DROPPING | KN_VANISHED))) {
		return kqunlock(kq);
	}

	if (__improbable(kn->kn_status & KN_POSTING)) {
		panic("KNOTE() called concurrently on knote %p", kn);
	}

	/* KN_POSTING is the "use count": see knote_wait_for_post() */
	kn->kn_status |= KN_POSTING;

	/* call the filter hook outside the kq spinlock */
	kqunlock(kq);
	result = filter_call(knote_fops(kn), f_event(kn, hint));
	kqlock(kq);

	/* Someone dropped the knote/the monitored object vanished while we
	 * were in f_event, swallow the side effects of the post.
	 */
	dropping = (kn->kn_status & (KN_DROPPING | KN_VANISHED));

	if (!dropping && (result & FILTER_ADJUST_EVENT_IOTIER_BIT)) {
		kqueue_update_iotier_override(kq);
	}

	if (!dropping && (result & FILTER_ACTIVE)) {
		knote_activate(kq, kn, result);
	}

	if ((kn->kn_status & KN_LOCKED) == 0) {
		/*
		 * There's no other f_* call in flight, we can leave QoS "Merge" mode.
		 *
		 * See knote_adjust_qos()
		 */
		kn->kn_status &= ~(KN_POSTING | KN_MERGE_QOS);
	} else {
		kn->kn_status &= ~KN_POSTING;
	}

	/* a dropper may be sleeping in knote_wait_for_post(): wake it */
	if (__improbable(dropping)) {
		thread_wakeup(knote_post_wev(kn));
	}

	kqunlock(kq);
}
832 
/*
 * Called by knote_drop() and knote_fdclose() to wait for the last f_event()
 * caller to be done.
 *
 *	- kq locked at entry
 *	- kq unlocked at exit
 */
static void
knote_wait_for_post(struct kqueue *kq, struct knote *kn)
{
	kqlock_held(kq);

	assert(kn->kn_status & (KN_DROPPING | KN_VANISHED));

	if (kn->kn_status & KN_POSTING) {
		/* knote_post() issues the matching wakeup once it sees dropping */
		lck_spin_sleep(&kq->kq_lock, LCK_SLEEP_UNLOCK, knote_post_wev(kn),
		    THREAD_UNINT | THREAD_WAIT_NOREPORT);
	} else {
		kqunlock(kq);
	}
}
854 
855 #pragma mark knote helpers for filters
856 
/* Marks the knote as carrying an error to report to userspace. */
OS_ALWAYS_INLINE
void
knote_set_error(struct knote *kn, int error)
{
	kn->kn_flags |= EV_ERROR;
	/* kn_sdata aliases the kevent data field (see the alias asserts below) */
	kn->kn_sdata = error;
}
864 
865 OS_ALWAYS_INLINE
866 int64_t
knote_low_watermark(const struct knote * kn)867 knote_low_watermark(const struct knote *kn)
868 {
869 	return (kn->kn_sfflags & NOTE_LOWAT) ? kn->kn_sdata : 1;
870 }
871 
872 /*!
873  * @function knote_fill_kevent_with_sdata
874  *
875  * @brief
876  * Fills in a kevent from the current content of a knote.
877  *
878  * @discussion
879  * This is meant to be called from filter's f_event hooks.
880  * The kevent data is filled with kn->kn_sdata.
881  *
882  * kn->kn_fflags is cleared if kn->kn_flags has EV_CLEAR set.
883  *
884  * Using knote_fill_kevent is typically preferred.
885  */
OS_ALWAYS_INLINE
void
knote_fill_kevent_with_sdata(struct knote *kn, struct kevent_qos_s *kev)
{
/* Build-time proof that a kevent_qos_s field aliases a kevent_internal_s one. */
#define knote_assert_aliases(name1, offs1, name2) \
	static_assert(offsetof(struct kevent_qos_s, name1) + offs1 == \
	    offsetof(struct kevent_internal_s, name2), \
	        "kevent_qos_s::" #name1 " and kevent_internal_s::" #name2 "need to alias")
	/*
	 * All the code makes assumptions on these aliasing,
	 * so make sure we fail the build if we ever ever ever break them.
	 */
	knote_assert_aliases(ident, 0, kei_ident);
#ifdef __LITTLE_ENDIAN__
	knote_assert_aliases(filter, 0, kei_filter);  // non trivial overlap
	knote_assert_aliases(filter, 1, kei_filtid);  // non trivial overlap
#else
	knote_assert_aliases(filter, 0, kei_filtid);  // non trivial overlap
	knote_assert_aliases(filter, 1, kei_filter);  // non trivial overlap
#endif
	knote_assert_aliases(flags, 0, kei_flags);
	knote_assert_aliases(qos, 0, kei_qos);
	knote_assert_aliases(udata, 0, kei_udata);
	knote_assert_aliases(fflags, 0, kei_fflags);
	knote_assert_aliases(xflags, 0, kei_sfflags); // non trivial overlap
	knote_assert_aliases(data, 0, kei_sdata);     // non trivial overlap
	knote_assert_aliases(ext, 0, kei_ext);
#undef knote_assert_aliases

	/*
	 * Fix the differences between kevent_qos_s and kevent_internal_s:
	 * - xflags is where kn_sfflags lives, we need to zero it
	 * - fixup the high bits of `filter` where kn_filtid lives
	 *   (filters are negative, so the high byte must read as 0xff)
	 */
	*kev = *(struct kevent_qos_s *)&kn->kn_kevent;
	kev->xflags = 0;
	kev->filter |= 0xff00;
	if (kn->kn_flags & EV_CLEAR) {
		/* EV_CLEAR consumes the accumulated fflags on delivery */
		kn->kn_fflags = 0;
	}
}
927 
928 /*!
929  * @function knote_fill_kevent
930  *
931  * @brief
932  * Fills in a kevent from the current content of a knote.
933  *
934  * @discussion
935  * This is meant to be called from filter's f_event hooks.
936  * The kevent data is filled with the passed in data.
937  *
938  * kn->kn_fflags is cleared if kn->kn_flags has EV_CLEAR set.
939  */
940 OS_ALWAYS_INLINE
941 void
knote_fill_kevent(struct knote * kn,struct kevent_qos_s * kev,int64_t data)942 knote_fill_kevent(struct knote *kn, struct kevent_qos_s *kev, int64_t data)
943 {
944 	knote_fill_kevent_with_sdata(kn, kev);
945 	kev->filter = kn->kn_filter;
946 	kev->data = data;
947 }
948 
949 
950 #pragma mark file_filtops
951 
952 static int
filt_fileattach(struct knote * kn,struct kevent_qos_s * kev)953 filt_fileattach(struct knote *kn, struct kevent_qos_s *kev)
954 {
955 	return fo_kqfilter(kn->kn_fp, kn, kev);
956 }
957 
/* Generic fd-backed filter: attach dispatches to the file's own kqfilter. */
SECURITY_READ_ONLY_EARLY(static struct filterops) file_filtops = {
	.f_isfd = 1,
	.f_attach = filt_fileattach,
};
962 
963 #pragma mark kqread_filtops
964 
/* Shorthand accessors into the fileglob behind a fileproc. */
#define f_flag fp_glob->fg_flag
#define f_ops fp_glob->fg_ops
#define f_lflags fp_glob->fg_lflags
968 
static void
filt_kqdetach(struct knote *kn)
{
	struct kqfile *kqf = (struct kqfile *)fp_get_data(kn->kn_fp);
	struct kqueue *kq = &kqf->kqf_kqueue;

	/* Unhook the knote from the watched kqueue's selinfo klist. */
	kqlock(kq);
	KNOTE_DETACH(&kqf->kqf_sel.si_note, kn);
	kqunlock(kq);
}
979 
980 static int
filt_kqueue(struct knote * kn,__unused long hint)981 filt_kqueue(struct knote *kn, __unused long hint)
982 {
983 	struct kqueue *kq = (struct kqueue *)fp_get_data(kn->kn_fp);
984 
985 	return kq->kq_count > 0;
986 }
987 
988 static int
filt_kqtouch(struct knote * kn,struct kevent_qos_s * kev)989 filt_kqtouch(struct knote *kn, struct kevent_qos_s *kev)
990 {
991 #pragma unused(kev)
992 	struct kqueue *kq = (struct kqueue *)fp_get_data(kn->kn_fp);
993 	int res;
994 
995 	kqlock(kq);
996 	res = (kq->kq_count > 0);
997 	kqunlock(kq);
998 
999 	return res;
1000 }
1001 
1002 static int
filt_kqprocess(struct knote * kn,struct kevent_qos_s * kev)1003 filt_kqprocess(struct knote *kn, struct kevent_qos_s *kev)
1004 {
1005 	struct kqueue *kq = (struct kqueue *)fp_get_data(kn->kn_fp);
1006 	int res = 0;
1007 
1008 	kqlock(kq);
1009 	if (kq->kq_count) {
1010 		knote_fill_kevent(kn, kev, kq->kq_count);
1011 		res = 1;
1012 	}
1013 	kqunlock(kq);
1014 
1015 	return res;
1016 }
1017 
/* EVFILT_READ on a kqueue fd: fires while the watched kqueue has events. */
SECURITY_READ_ONLY_EARLY(static struct filterops) kqread_filtops = {
	.f_isfd = 1,
	.f_detach = filt_kqdetach,
	.f_event = filt_kqueue,
	.f_touch = filt_kqtouch,
	.f_process = filt_kqprocess,
};
1025 
1026 #pragma mark proc_filtops
1027 
/*
 * EVFILT_PROC attach: validate flags, resolve the target pid, perform the
 * NOTE_EXITSTATUS permission check, and hook the knote onto the process
 * klist.  On failure, the error is recorded on the knote via
 * knote_set_error() and 0 is returned.
 */
static int
filt_procattach(struct knote *kn, __unused struct kevent_qos_s *kev)
{
	struct proc *p;

	/* pid values must fit in the NOTE_PDATAMASK data bits of hint */
	assert(PID_MAX < NOTE_PDATAMASK);

	if ((kn->kn_sfflags & (NOTE_TRACK | NOTE_TRACKERR | NOTE_CHILD)) != 0) {
		/* fork-tracking flags are not supported on this platform */
		knote_set_error(kn, ENOTSUP);
		return 0;
	}

	p = proc_find((int)kn->kn_id);
	if (p == NULL) {
		knote_set_error(kn, ESRCH);
		return 0;
	}

	const uint32_t NoteExitStatusBits = NOTE_EXIT | NOTE_EXITSTATUS;

	/*
	 * Observing another process's exit status is a privilege: allow it
	 * only to the parent, the tracing (ptrace) parent, or a process that
	 * could deliver SIGKILL to the target anyway.
	 */
	if ((kn->kn_sfflags & NoteExitStatusBits) == NoteExitStatusBits) {
		do {
			pid_t selfpid = proc_selfpid();

			if (p->p_ppid == selfpid) {
				break;  /* parent => ok */
			}
			if ((p->p_lflag & P_LTRACED) != 0 &&
			    (p->p_oppid == selfpid)) {
				break;  /* parent-in-waiting => ok */
			}
			if (cansignal(current_proc(), kauth_cred_get(), p, SIGKILL)) {
				break; /* allowed to signal => ok */
			}
			proc_rele(p);
			knote_set_error(kn, EACCES);
			return 0;
		} while (0);
	}

	kn->kn_proc = p;
	kn->kn_flags |= EV_CLEAR;       /* automatically set */
	kn->kn_sdata = 0;               /* incoming data is ignored */

	proc_klist_lock();

	KNOTE_ATTACH(&p->p_klist, kn);

	proc_klist_unlock();

	/* klist holds no ref; drop the one taken by proc_find() */
	proc_rele(p);

	/*
	 * only captures edge-triggered events after this point
	 * so it can't already be fired.
	 */
	return 0;
}
1086 
1087 
1088 /*
1089  * The knote may be attached to a different process, which may exit,
1090  * leaving nothing for the knote to be attached to.  In that case,
1091  * the pointer to the process will have already been nulled out.
1092  */
static void
filt_procdetach(struct knote *kn)
{
	struct proc *p;

	proc_klist_lock();

	/*
	 * kn_proc may already have been nulled out if the target process
	 * exited first (see the comment above); only detach if it is still
	 * attached.
	 */
	p = kn->kn_proc;
	if (p != PROC_NULL) {
		kn->kn_proc = PROC_NULL;
		KNOTE_DETACH(&p->p_klist, kn);
	}

	proc_klist_unlock();
}
1108 
/*
 * EVFILT_PROC f_event: decode the NOTE_* event from hint, accumulate any
 * interesting bits into kn_fflags, and collect exit detail into kn_hook32.
 * Returns non-zero to activate the knote when any matching state exists.
 */
static int
filt_procevent(struct knote *kn, long hint)
{
	u_int event;

	/* ALWAYS CALLED WITH proc_klist_lock */

	/*
	 * Note: a lot of bits in hint may be obtained from the knote
	 * To free some of those bits, see <rdar://problem/12592988> Freeing up
	 * bits in hint for filt_procevent
	 *
	 * mask off extra data
	 */
	event = (u_int)hint & NOTE_PCTRLMASK;

	/*
	 * termination lifecycle events can happen while a debugger
	 * has reparented a process, in which case notifications
	 * should be quashed except to the tracing parent. When
	 * the debugger reaps the child (either via wait4(2) or
	 * process exit), the child will be reparented to the original
	 * parent and these knotes re-fired.
	 */
	if (event & NOTE_EXIT) {
		if ((kn->kn_proc->p_oppid != 0)
		    && (proc_getpid(knote_get_kq(kn)->kq_p) != kn->kn_proc->p_ppid)) {
			/*
			 * This knote is not for the current ptrace(2) parent, ignore.
			 */
			return 0;
		}
	}

	/*
	 * if the user is interested in this event, record it.
	 */
	if (kn->kn_sfflags & event) {
		kn->kn_fflags |= event;
	}

#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wdeprecated-declarations"
	/* NOTE_REAP (deprecated) and unreaped NOTE_EXIT terminate the knote */
	if ((event == NOTE_REAP) || ((event == NOTE_EXIT) && !(kn->kn_sfflags & NOTE_REAP))) {
		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
	}
#pragma clang diagnostic pop


	/*
	 * The kernel has a wrapper in place that returns the same data
	 * as is collected here, in kn_hook32.  Any changes to how
	 * NOTE_EXITSTATUS and NOTE_EXIT_DETAIL are collected
	 * should also be reflected in the proc_pidnoteexit() wrapper.
	 */
	if (event == NOTE_EXIT) {
		kn->kn_hook32 = 0;
		if ((kn->kn_sfflags & NOTE_EXITSTATUS) != 0) {
			/* low bits of hint carry the wait(2) status */
			kn->kn_fflags |= NOTE_EXITSTATUS;
			kn->kn_hook32 |= (hint & NOTE_PDATAMASK);
		}
		if ((kn->kn_sfflags & NOTE_EXIT_DETAIL) != 0) {
			kn->kn_fflags |= NOTE_EXIT_DETAIL;
			if ((kn->kn_proc->p_lflag &
			    P_LTERM_DECRYPTFAIL) != 0) {
				kn->kn_hook32 |= NOTE_EXIT_DECRYPTFAIL;
			}
			if ((kn->kn_proc->p_lflag &
			    P_LTERM_JETSAM) != 0) {
				/* jetsam kill: record which memorystatus cause applied */
				kn->kn_hook32 |= NOTE_EXIT_MEMORY;
				switch (kn->kn_proc->p_lflag & P_JETSAM_MASK) {
				case P_JETSAM_VMPAGESHORTAGE:
					kn->kn_hook32 |= NOTE_EXIT_MEMORY_VMPAGESHORTAGE;
					break;
				case P_JETSAM_VMTHRASHING:
					kn->kn_hook32 |= NOTE_EXIT_MEMORY_VMTHRASHING;
					break;
				case P_JETSAM_FCTHRASHING:
					kn->kn_hook32 |= NOTE_EXIT_MEMORY_FCTHRASHING;
					break;
				case P_JETSAM_VNODE:
					kn->kn_hook32 |= NOTE_EXIT_MEMORY_VNODE;
					break;
				case P_JETSAM_HIWAT:
					kn->kn_hook32 |= NOTE_EXIT_MEMORY_HIWAT;
					break;
				case P_JETSAM_PID:
					kn->kn_hook32 |= NOTE_EXIT_MEMORY_PID;
					break;
				case P_JETSAM_IDLEEXIT:
					kn->kn_hook32 |= NOTE_EXIT_MEMORY_IDLE;
					break;
				}
			}
			if ((proc_getcsflags(kn->kn_proc) &
			    CS_KILLED) != 0) {
				kn->kn_hook32 |= NOTE_EXIT_CSERROR;
			}
		}
	}

	/* if we have any matching state, activate the knote */
	return kn->kn_fflags != 0;
}
1213 
/*
 * EVFILT_PROC f_touch: absorb the new interest set from userspace and
 * report whether any previously-recorded events remain deliverable.
 */
static int
filt_proctouch(struct knote *kn, struct kevent_qos_s *kev)
{
	int res;

	proc_klist_lock();

	/* accept new filter flags and mask off output events no longer interesting */
	kn->kn_sfflags = kev->fflags;

	/* restrict the current results to the (smaller?) set of new interest */
	/*
	 * For compatibility with previous implementations, we leave kn_fflags
	 * as they were before.
	 */
	//kn->kn_fflags &= kn->kn_sfflags;

	res = (kn->kn_fflags != 0);

	proc_klist_unlock();

	return res;
}
1237 
/*
 * EVFILT_PROC f_process: deliver accumulated events, reporting the exit
 * detail collected in kn_hook32 as the kevent data and resetting it.
 */
static int
filt_procprocess(struct knote *kn, struct kevent_qos_s *kev)
{
	int res = 0;

	proc_klist_lock();
	if (kn->kn_fflags) {
		knote_fill_kevent(kn, kev, kn->kn_hook32);
		kn->kn_hook32 = 0;
		res = 1;
	}
	proc_klist_unlock();
	return res;
}
1252 
/* EVFILT_PROC: process lifecycle events, serialized by proc_klist_lock. */
SECURITY_READ_ONLY_EARLY(static struct filterops) proc_filtops = {
	.f_attach  = filt_procattach,
	.f_detach  = filt_procdetach,
	.f_event   = filt_procevent,
	.f_touch   = filt_proctouch,
	.f_process = filt_procprocess,
};
1260 
1261 #pragma mark timer_filtops
1262 
/* Validated timer parameters produced by filt_timervalidate(). */
struct filt_timer_params {
	uint64_t deadline; /* deadline in abs/cont time,
	                    * or 0 if NOTE_ABSOLUTE and the deadline is in the past */
	uint64_t leeway;   /* leeway in abstime, or 0 if none */
	uint64_t interval; /* interval in abstime, or 0 if non-repeating timer */
};
1269 
1270 /*
1271  * Values stored in the knote at rest (using Mach absolute time units)
1272  *
1273  * kn->kn_thcall        where the thread_call object is stored
1274  * kn->kn_ext[0]        next deadline or 0 if immediate expiration
1275  * kn->kn_ext[1]        leeway value
1276  * kn->kn_sdata         interval timer: the interval
1277  *                      absolute/deadline timer: 0
1278  * kn->kn_hook32        timer state (with gencount)
1279  *
1280  * TIMER_IDLE:
1281  *   The timer has either never been scheduled or been cancelled.
1282  *   It is safe to schedule a new one in this state.
1283  *
1284  * TIMER_ARMED:
1285  *   The timer has been scheduled
1286  *
1287  * TIMER_FIRED
1288  *   The timer has fired and an event needs to be delivered.
1289  *   When in this state, the callout may still be running.
1290  *
1291  * TIMER_IMMEDIATE
1292  *   The timer has fired at registration time, and the callout was never
1293  *   dispatched.
1294  */
#define TIMER_IDLE       0x0 /* never scheduled, or cancelled */
#define TIMER_ARMED      0x1 /* thread call scheduled */
#define TIMER_FIRED      0x2 /* fired; event pending delivery */
#define TIMER_IMMEDIATE  0x3 /* fired at registration; callout never dispatched */
#define TIMER_STATE_MASK 0x3 /* low two bits of kn_hook32 hold the state */
#define TIMER_GEN_INC    0x4 /* remaining bits form a generation counter */
1301 
1302 static void
filt_timer_set_params(struct knote * kn,struct filt_timer_params * params)1303 filt_timer_set_params(struct knote *kn, struct filt_timer_params *params)
1304 {
1305 	kn->kn_ext[0] = params->deadline;
1306 	kn->kn_ext[1] = params->leeway;
1307 	kn->kn_sdata  = params->interval;
1308 }
1309 
1310 /*
1311  * filt_timervalidate - process data from user
1312  *
1313  * Sets up the deadline, interval, and leeway from the provided user data
1314  *
1315  * Input:
1316  *      kn_sdata        timer deadline or interval time
1317  *      kn_sfflags      style of timer, unit of measurement
1318  *
1319  * Output:
1320  *      struct filter_timer_params to apply to the filter with
1321  *      filt_timer_set_params when changes are ready to be commited.
1322  *
1323  * Returns:
1324  *      EINVAL          Invalid user data parameters
1325  *      ERANGE          Various overflows with the parameters
1326  *
1327  * Called with timer filter lock held.
1328  */
static int
filt_timervalidate(const struct kevent_qos_s *kev,
    struct filt_timer_params *params)
{
	/*
	 * There are 5 knobs that need to be chosen for a timer registration:
	 *
	 * A) Units of time (what is the time duration of the specified number)
	 *      Absolute and interval take:
	 *              NOTE_SECONDS, NOTE_USECONDS, NOTE_NSECONDS, NOTE_MACHTIME
	 *      Defaults to milliseconds if not specified
	 *
	 * B) Clock epoch (what is the zero point of the specified number)
	 *      For interval, there is none
	 *      For absolute, defaults to the gettimeofday/calendar epoch
	 *      With NOTE_MACHTIME, uses mach_absolute_time()
	 *      With NOTE_MACHTIME and NOTE_MACH_CONTINUOUS_TIME, uses mach_continuous_time()
	 *
	 * C) The knote's behavior on delivery
	 *      Interval timer causes the knote to arm for the next interval unless one-shot is set
	 *      Absolute is a forced one-shot timer which deletes on delivery
	 *      TODO: Add a way for absolute to be not forced one-shot
	 *
	 * D) Whether the time duration is relative to now or absolute
	 *      Interval fires at now + duration when it is set up
	 *      Absolute fires at now + difference between now walltime and passed in walltime
	 *      With NOTE_MACHTIME it fires at an absolute MAT or MCT.
	 *
	 * E) Whether the timer continues to tick across sleep
	 *      By default all three do not.
	 *      For interval and absolute, NOTE_MACH_CONTINUOUS_TIME causes them to tick across sleep
	 *      With NOTE_ABSOLUTE | NOTE_MACHTIME | NOTE_MACH_CONTINUOUS_TIME:
	 *              expires when mach_continuous_time() is > the passed in value.
	 */

	/* nanoseconds per unit of kev->data; 0 when data is already abstime */
	uint64_t multiplier;

	boolean_t use_abstime = FALSE;

	/* (A) pick the unit of measurement; exactly one unit flag is allowed */
	switch (kev->fflags & (NOTE_SECONDS | NOTE_USECONDS | NOTE_NSECONDS | NOTE_MACHTIME)) {
	case NOTE_SECONDS:
		multiplier = NSEC_PER_SEC;
		break;
	case NOTE_USECONDS:
		multiplier = NSEC_PER_USEC;
		break;
	case NOTE_NSECONDS:
		multiplier = 1;
		break;
	case NOTE_MACHTIME:
		multiplier = 0;
		use_abstime = TRUE;
		break;
	case 0: /* milliseconds (default) */
		multiplier = NSEC_PER_SEC / 1000;
		break;
	default:
		return EINVAL;
	}

	/* transform the leeway in kn_ext[1] to same time scale */
	if (kev->fflags & NOTE_LEEWAY) {
		uint64_t leeway_abs;

		if (use_abstime) {
			leeway_abs = (uint64_t)kev->ext[1];
		} else {
			uint64_t leeway_ns;
			if (os_mul_overflow((uint64_t)kev->ext[1], multiplier, &leeway_ns)) {
				return ERANGE;
			}

			nanoseconds_to_absolutetime(leeway_ns, &leeway_abs);
		}

		params->leeway = leeway_abs;
	} else {
		params->leeway = 0;
	}

	if (kev->fflags & NOTE_ABSOLUTE) {
		uint64_t deadline_abs;

		if (use_abstime) {
			deadline_abs = (uint64_t)kev->data;
		} else {
			uint64_t calendar_deadline_ns;

			if (os_mul_overflow((uint64_t)kev->data, multiplier, &calendar_deadline_ns)) {
				return ERANGE;
			}

			/* calendar_deadline_ns is in nanoseconds since the epoch */

			clock_sec_t seconds;
			clock_nsec_t nanoseconds;

			/*
			 * Note that the conversion through wall-time is only done once.
			 *
			 * If the relationship between MAT and gettimeofday changes,
			 * the underlying timer does not update.
			 *
			 * TODO: build a wall-time denominated timer_call queue
			 * and a flag to request DTRTing with wall-time timers
			 */
			clock_get_calendar_nanotime(&seconds, &nanoseconds);

			uint64_t calendar_now_ns = (uint64_t)seconds * NSEC_PER_SEC + nanoseconds;

			/* if deadline is in the future */
			if (calendar_now_ns < calendar_deadline_ns) {
				uint64_t interval_ns = calendar_deadline_ns - calendar_now_ns;
				uint64_t interval_abs;

				nanoseconds_to_absolutetime(interval_ns, &interval_abs);

				/*
				 * Note that the NOTE_MACH_CONTINUOUS_TIME flag here only
				 * causes the timer to keep ticking across sleep, but
				 * it does not change the calendar timebase.
				 */

				if (kev->fflags & NOTE_MACH_CONTINUOUS_TIME) {
					clock_continuoustime_interval_to_deadline(interval_abs,
					    &deadline_abs);
				} else {
					clock_absolutetime_interval_to_deadline(interval_abs,
					    &deadline_abs);
				}
			} else {
				deadline_abs = 0; /* cause immediate expiration */
			}
		}

		params->deadline = deadline_abs;
		params->interval = 0; /* NOTE_ABSOLUTE is non-repeating */
	} else if (kev->data < 0) {
		/*
		 * Negative interval timers fire immediately, once.
		 *
		 * Ideally a negative interval would be an error, but certain clients
		 * pass negative values on accident, and expect an event back.
		 *
		 * In the old implementation the timer would repeat with no delay
		 * N times until mach_absolute_time() + (N * interval) underflowed,
		 * then it would wait ~forever by accidentally arming a timer for the far future.
		 *
		 * We now skip the power-wasting hot spin phase and go straight to the idle phase.
		 */

		params->deadline = 0; /* expire immediately */
		params->interval = 0; /* non-repeating */
	} else {
		/* plain interval timer: first deadline is now + interval */
		uint64_t interval_abs = 0;

		if (use_abstime) {
			interval_abs = (uint64_t)kev->data;
		} else {
			uint64_t interval_ns;
			if (os_mul_overflow((uint64_t)kev->data, multiplier, &interval_ns)) {
				return ERANGE;
			}

			nanoseconds_to_absolutetime(interval_ns, &interval_abs);
		}

		uint64_t deadline = 0;

		if (kev->fflags & NOTE_MACH_CONTINUOUS_TIME) {
			clock_continuoustime_interval_to_deadline(interval_abs, &deadline);
		} else {
			clock_absolutetime_interval_to_deadline(interval_abs, &deadline);
		}

		params->deadline = deadline;
		params->interval = interval_abs;
	}

	return 0;
}
1510 
1511 /*
1512  * filt_timerexpire - the timer callout routine
1513  */
static void
filt_timerexpire(void *knx, void *state_on_arm)
{
	struct knote *kn = knx;

	/* state_on_arm carries the gencount+state the timer was armed with */
	uint32_t state = (uint32_t)(uintptr_t)state_on_arm;
	uint32_t fired_state = state ^ TIMER_ARMED ^ TIMER_FIRED;

	/* ARMED -> FIRED only succeeds if nobody re-armed/cancelled since */
	if (os_atomic_cmpxchg(&kn->kn_hook32, state, fired_state, relaxed)) {
		// our f_event always would say FILTER_ACTIVE,
		// so be leaner and just do it.
		struct kqueue *kq = knote_get_kq(kn);
		kqlock(kq);
		knote_activate(kq, kn, FILTER_ACTIVE);
		kqunlock(kq);
	} else {
		/*
		 * The timer has been reprogrammed or canceled since it was armed,
		 * and this is a late firing for the timer, just ignore it.
		 */
	}
}
1536 
1537 /*
1538  * Does this deadline needs a timer armed for it, or has it expired?
1539  */
1540 static bool
filt_timer_is_ready(struct knote * kn)1541 filt_timer_is_ready(struct knote *kn)
1542 {
1543 	uint64_t now, deadline = kn->kn_ext[0];
1544 
1545 	if (deadline == 0) {
1546 		return true;
1547 	}
1548 
1549 	if (kn->kn_sfflags & NOTE_MACH_CONTINUOUS_TIME) {
1550 		now = mach_continuous_time();
1551 	} else {
1552 		now = mach_absolute_time();
1553 	}
1554 	return deadline <= now;
1555 }
1556 
1557 /*
1558  * Arm a timer
1559  *
1560  * It is the responsibility of the caller to make sure the timer call
1561  * has completed or been cancelled properly prior to arming it.
1562  */
static void
filt_timerarm(struct knote *kn)
{
	uint64_t deadline = kn->kn_ext[0];
	uint64_t leeway   = kn->kn_ext[1];
	uint32_t state;

	int filter_flags = kn->kn_sfflags;
	unsigned int timer_flags = 0;

	/* translate the NOTE_* urgency/leeway flags to thread call flags */
	if (filter_flags & NOTE_CRITICAL) {
		timer_flags |= THREAD_CALL_DELAY_USER_CRITICAL;
	} else if (filter_flags & NOTE_BACKGROUND) {
		timer_flags |= THREAD_CALL_DELAY_USER_BACKGROUND;
	} else {
		timer_flags |= THREAD_CALL_DELAY_USER_NORMAL;
	}

	if (filter_flags & NOTE_LEEWAY) {
		timer_flags |= THREAD_CALL_DELAY_LEEWAY;
	}

	if (filter_flags & NOTE_MACH_CONTINUOUS_TIME) {
		timer_flags |= THREAD_CALL_CONTINUOUS;
	}

	/*
	 * Move to ARMED.
	 *
	 * We increase the gencount, and setup the thread call with this expected
	 * state. It means that if there was a previous generation of the timer in
	 * flight that needs to be ignored, then 3 things are possible:
	 *
	 * - the timer fires first, filt_timerexpire() and sets the state to FIRED
	 *   but we clobber it with ARMED and a new gencount. The knote will still
	 *   be activated, but filt_timerprocess() which is serialized with this
	 *   call will not see the FIRED bit set and will not deliver an event.
	 *
	 * - this code runs first, but filt_timerexpire() comes second. Because it
	 *   knows an old gencount, it will debounce and not activate the knote.
	 *
	 * - filt_timerexpire() wasn't in flight yet, and thread_call_enter below
	 *   will just cancel it properly.
	 *
	 * This is important as userspace expects to never be woken up for past
	 * timers after filt_timertouch ran.
	 */
	state = os_atomic_load(&kn->kn_hook32, relaxed);
	state &= ~TIMER_STATE_MASK;
	state += TIMER_GEN_INC + TIMER_ARMED;
	os_atomic_store(&kn->kn_hook32, state, relaxed);

	/* the armed state is passed to filt_timerexpire() for the cmpxchg */
	thread_call_enter_delayed_with_leeway(kn->kn_thcall,
	    (void *)(uintptr_t)state, deadline, leeway, timer_flags);
}
1618 
1619 /*
1620  * Mark a timer as "already fired" when it is being reprogrammed
1621  *
1622  * If there is a timer in flight, this will do a best effort at canceling it,
1623  * but will not wait. If the thread call was in flight, having set the
1624  * TIMER_IMMEDIATE bit will debounce a filt_timerexpire() racing with this
1625  * cancelation.
1626  */
static void
filt_timerfire_immediate(struct knote *kn)
{
	uint32_t state;

	static_assert(TIMER_IMMEDIATE == TIMER_STATE_MASK,
	    "validate that this atomic or will transition to IMMEDIATE");
	/* any prior state OR'd with TIMER_IMMEDIATE becomes TIMER_IMMEDIATE */
	state = os_atomic_or_orig(&kn->kn_hook32, TIMER_IMMEDIATE, relaxed);

	if ((state & TIMER_STATE_MASK) == TIMER_ARMED) {
		/* best-effort cancel; a racing expiry debounces on the gencount */
		thread_call_cancel(kn->kn_thcall);
	}
}
1640 
1641 /*
1642  * Allocate a thread call for the knote's lifetime, and kick off the timer.
1643  */
static int
filt_timerattach(struct knote *kn, struct kevent_qos_s *kev)
{
	thread_call_t callout;
	struct filt_timer_params params;
	int error;

	/* parse/convert the user-supplied deadline, leeway and interval */
	if ((error = filt_timervalidate(kev, &params)) != 0) {
		knote_set_error(kn, error);
		return 0;
	}

	callout = thread_call_allocate_with_options(filt_timerexpire,
	    (thread_call_param_t)kn, THREAD_CALL_PRIORITY_HIGH,
	    THREAD_CALL_OPTIONS_ONCE);

	if (NULL == callout) {
		knote_set_error(kn, ENOMEM);
		return 0;
	}

	filt_timer_set_params(kn, &params);
	kn->kn_thcall = callout;
	kn->kn_flags |= EV_CLEAR;
	os_atomic_store(&kn->kn_hook32, TIMER_IDLE, relaxed);

	/* NOTE_ABSOLUTE implies EV_ONESHOT */
	if (kn->kn_sfflags & NOTE_ABSOLUTE) {
		kn->kn_flags |= EV_ONESHOT;
	}

	if (filt_timer_is_ready(kn)) {
		/* already expired: deliver without ever arming the callout */
		os_atomic_store(&kn->kn_hook32, TIMER_IMMEDIATE, relaxed);
		return FILTER_ACTIVE;
	} else {
		filt_timerarm(kn);
		return 0;
	}
}
1683 
1684 /*
1685  * Shut down the timer if it's running, and free the callout.
1686  */
static void
filt_timerdetach(struct knote *kn)
{
	__assert_only boolean_t freed;

	/*
	 * Unconditionally cancel to make sure there can't be any filt_timerexpire()
	 * running anymore.
	 */
	thread_call_cancel_wait(kn->kn_thcall);
	freed = thread_call_free(kn->kn_thcall);
	/* must succeed: the cancel_wait above drained any pending invocation */
	assert(freed);
}
1700 
1701 /*
1702  * filt_timertouch - update timer knote with new user input
1703  *
1704  * Cancel and restart the timer based on new user data. When
1705  * the user picks up a knote, clear the count of how many timer
1706  * pops have gone off (in kn_data).
1707  */
static int
filt_timertouch(struct knote *kn, struct kevent_qos_s *kev)
{
	struct filt_timer_params params;
	uint32_t changed_flags = (kn->kn_sfflags ^ kev->fflags);
	int error;

	if (kev->qos && (knote_get_kq(kn)->kq_state & KQ_WORKLOOP) &&
	    !_pthread_priority_thread_qos(kev->qos)) {
		/* validate usage of FILTER_UPDATE_REQ_QOS */
		kev->flags |= EV_ERROR;
		kev->data = ERANGE;
		return 0;
	}

	/* NOTE_ABSOLUTE cannot be toggled after registration */
	if (changed_flags & NOTE_ABSOLUTE) {
		kev->flags |= EV_ERROR;
		kev->data = EINVAL;
		return 0;
	}

	if ((error = filt_timervalidate(kev, &params)) != 0) {
		kev->flags |= EV_ERROR;
		kev->data = error;
		return 0;
	}

	/* capture the new values used to compute deadline */
	filt_timer_set_params(kn, &params);
	kn->kn_sfflags = kev->fflags;

	if (filt_timer_is_ready(kn)) {
		/* new deadline already passed: mark fired, cancel stale callout */
		filt_timerfire_immediate(kn);
		return FILTER_ACTIVE | FILTER_UPDATE_REQ_QOS;
	} else {
		filt_timerarm(kn);
		return FILTER_UPDATE_REQ_QOS;
	}
}
1747 
1748 /*
1749  * filt_timerprocess - query state of knote and snapshot event data
1750  *
1751  * Determine if the timer has fired in the past, snapshot the state
1752  * of the kevent for returning to user-space, and clear pending event
1753  * counters for the next time.
1754  */
1755 static int
filt_timerprocess(struct knote * kn,struct kevent_qos_s * kev)1756 filt_timerprocess(struct knote *kn, struct kevent_qos_s *kev)
1757 {
1758 	uint32_t state = os_atomic_load(&kn->kn_hook32, relaxed);
1759 
1760 	/*
1761 	 * filt_timerprocess is serialized with any filter routine except for
1762 	 * filt_timerexpire which atomically does a TIMER_ARMED -> TIMER_FIRED
1763 	 * transition, and on success, activates the knote.
1764 	 *
1765 	 * Hence, we don't need atomic modifications of the state, only to peek at
1766 	 * whether we see any of the "FIRED" state, and if we do, it is safe to
1767 	 * do simple state machine transitions.
1768 	 */
1769 	switch (state & TIMER_STATE_MASK) {
1770 	case TIMER_IDLE:
1771 	case TIMER_ARMED:
1772 		/*
1773 		 * This can happen if a touch resets a timer that had fired
1774 		 * without being processed
1775 		 */
1776 		return 0;
1777 	}
1778 
1779 	os_atomic_store(&kn->kn_hook32, state & ~TIMER_STATE_MASK, relaxed);
1780 
1781 	/*
1782 	 * Copy out the interesting kevent state,
1783 	 * but don't leak out the raw time calculations.
1784 	 *
1785 	 * TODO: potential enhancements - tell the user about:
1786 	 *      - deadline to which this timer thought it was expiring
1787 	 *      - return kn_sfflags in the fflags field so the client can know
1788 	 *        under what flags the timer fired
1789 	 */
1790 	knote_fill_kevent(kn, kev, 1);
1791 	kev->ext[0] = 0;
1792 	/* kev->ext[1] = 0;  JMM - shouldn't we hide this too? */
1793 
1794 	if (kn->kn_sdata != 0) {
1795 		/*
1796 		 * This is a 'repeating' timer, so we have to emit
1797 		 * how many intervals expired between the arm
1798 		 * and the process.
1799 		 *
1800 		 * A very strange style of interface, because
1801 		 * this could easily be done in the client...
1802 		 */
1803 
1804 		uint64_t now;
1805 
1806 		if (kn->kn_sfflags & NOTE_MACH_CONTINUOUS_TIME) {
1807 			now = mach_continuous_time();
1808 		} else {
1809 			now = mach_absolute_time();
1810 		}
1811 
1812 		uint64_t first_deadline = kn->kn_ext[0];
1813 		uint64_t interval_abs   = kn->kn_sdata;
1814 		uint64_t orig_arm_time  = first_deadline - interval_abs;
1815 
1816 		assert(now > orig_arm_time);
1817 		assert(now > first_deadline);
1818 
1819 		uint64_t elapsed = now - orig_arm_time;
1820 
1821 		uint64_t num_fired = elapsed / interval_abs;
1822 
1823 		/*
1824 		 * To reach this code, we must have seen the timer pop
1825 		 * and be in repeating mode, so therefore it must have been
1826 		 * more than 'interval' time since the attach or last
1827 		 * successful touch.
1828 		 */
1829 		assert(num_fired > 0);
1830 
1831 		/* report how many intervals have elapsed to the user */
1832 		kev->data = (int64_t)num_fired;
1833 
1834 		/* We only need to re-arm the timer if it's not about to be destroyed */
1835 		if ((kn->kn_flags & EV_ONESHOT) == 0) {
1836 			/* fire at the end of the next interval */
1837 			uint64_t new_deadline = first_deadline + num_fired * interval_abs;
1838 
1839 			assert(new_deadline > now);
1840 
1841 			kn->kn_ext[0] = new_deadline;
1842 
1843 			/*
1844 			 * This can't shortcut setting up the thread call, because
1845 			 * knote_process deactivates EV_CLEAR knotes unconditionally.
1846 			 */
1847 			filt_timerarm(kn);
1848 		}
1849 	}
1850 
1851 	return FILTER_ACTIVE;
1852 }
1853 
/*
 * EVFILT_TIMER filter operations.  f_event is filt_bad_event because
 * timer events are generated internally (via filt_timerarm's thread
 * call), never delivered through knote().
 */
SECURITY_READ_ONLY_EARLY(static struct filterops) timer_filtops = {
	.f_extended_codes = true,
	.f_attach   = filt_timerattach,
	.f_detach   = filt_timerdetach,
	.f_event    = filt_bad_event,
	.f_touch    = filt_timertouch,
	.f_process  = filt_timerprocess,
};
1862 
1863 #pragma mark user_filtops
1864 
1865 static int
filt_userattach(struct knote * kn,__unused struct kevent_qos_s * kev)1866 filt_userattach(struct knote *kn, __unused struct kevent_qos_s *kev)
1867 {
1868 	if (kn->kn_sfflags & NOTE_TRIGGER) {
1869 		kn->kn_hook32 = FILTER_ACTIVE;
1870 	} else {
1871 		kn->kn_hook32 = 0;
1872 	}
1873 	return kn->kn_hook32;
1874 }
1875 
1876 static int
filt_usertouch(struct knote * kn,struct kevent_qos_s * kev)1877 filt_usertouch(struct knote *kn, struct kevent_qos_s *kev)
1878 {
1879 	uint32_t ffctrl;
1880 	int fflags;
1881 
1882 	ffctrl = kev->fflags & NOTE_FFCTRLMASK;
1883 	fflags = kev->fflags & NOTE_FFLAGSMASK;
1884 	switch (ffctrl) {
1885 	case NOTE_FFNOP:
1886 		break;
1887 	case NOTE_FFAND:
1888 		kn->kn_sfflags &= fflags;
1889 		break;
1890 	case NOTE_FFOR:
1891 		kn->kn_sfflags |= fflags;
1892 		break;
1893 	case NOTE_FFCOPY:
1894 		kn->kn_sfflags = fflags;
1895 		break;
1896 	}
1897 	kn->kn_sdata = kev->data;
1898 
1899 	if (kev->fflags & NOTE_TRIGGER) {
1900 		kn->kn_hook32 = FILTER_ACTIVE;
1901 	}
1902 	return (int)kn->kn_hook32;
1903 }
1904 
1905 static int
filt_userprocess(struct knote * kn,struct kevent_qos_s * kev)1906 filt_userprocess(struct knote *kn, struct kevent_qos_s *kev)
1907 {
1908 	int result = (int)kn->kn_hook32;
1909 
1910 	if (result) {
1911 		/* EVFILT_USER returns the data that was passed in */
1912 		knote_fill_kevent_with_sdata(kn, kev);
1913 		kev->fflags = kn->kn_sfflags;
1914 		if (kn->kn_flags & EV_CLEAR) {
1915 			/* knote_fill_kevent cleared kn_fflags */
1916 			kn->kn_hook32 = 0;
1917 		}
1918 	}
1919 
1920 	return result;
1921 }
1922 
/*
 * EVFILT_USER filter operations.  Events are triggered explicitly from
 * userspace via NOTE_TRIGGER (see filt_usertouch); there is no kernel
 * event source, hence filt_bad_event / filt_no_detach.
 */
SECURITY_READ_ONLY_EARLY(static struct filterops) user_filtops = {
	.f_extended_codes = true,
	.f_attach  = filt_userattach,
	.f_detach  = filt_no_detach,
	.f_event   = filt_bad_event,
	.f_touch   = filt_usertouch,
	.f_process = filt_userprocess,
};
1931 
1932 #pragma mark workloop_filtops
1933 
1934 #define EPREEMPTDISABLED (-1)
1935 
/* Take the workloop filter state lock (serializes filt_wlupdate & co). */
static inline void
filt_wllock(struct kqworkloop *kqwl)
{
	lck_spin_lock(&kqwl->kqwl_statelock);
}
1941 
/* Release the workloop filter state lock taken by filt_wllock(). */
static inline void
filt_wlunlock(struct kqworkloop *kqwl)
{
	lck_spin_unlock(&kqwl->kqwl_statelock);
}
1947 
/*
 * Returns true when the interlock for the turnstile is the workqueue lock
 *
 * When this is the case, all turnstiles operations are delegated
 * to the workqueue subsystem.
 *
 * This is required because kqueue_threadreq_bind_prepost only holds the
 * workqueue lock but needs to move the inheritor from the workloop turnstile
 * away from the creator thread, so that this now fulfilled request cannot be
 * picked anymore by other threads.
 */
static inline bool
filt_wlturnstile_interlock_is_workq(struct kqworkloop *kqwl)
{
	/* i.e. a thread request exists but no servicer is bound yet */
	return kqr_thread_requested_pending(&kqwl->kqwl_request);
}
1964 
/*
 * Point the workloop turnstile's inheritor at whichever thread currently
 * "owns" the workloop: the explicit owner if there is one, else the bound
 * servicer thread, else no inheritor at all.
 *
 * Must not be called while the workqueue lock is the turnstile interlock
 * (see filt_wlturnstile_interlock_is_workq); in that state the workqueue
 * subsystem owns inheritor updates.
 */
static void
filt_wlupdate_inheritor(struct kqworkloop *kqwl, struct turnstile *ts,
    turnstile_update_flags_t flags)
{
	turnstile_inheritor_t inheritor = TURNSTILE_INHERITOR_NULL;
	workq_threadreq_t kqr = &kqwl->kqwl_request;

	/*
	 * binding to the workq should always happen through
	 * workq_kern_threadreq_update_inheritor()
	 */
	assert(!filt_wlturnstile_interlock_is_workq(kqwl));

	if ((inheritor = kqwl->kqwl_owner)) {
		flags |= TURNSTILE_INHERITOR_THREAD;
	} else if ((inheritor = kqr_thread(kqr))) {
		flags |= TURNSTILE_INHERITOR_THREAD;
	}

	turnstile_update_inheritor(ts, inheritor, flags);
}
1986 
1987 #define EVFILT_WORKLOOP_EFAULT_RETRY_COUNT 100
1988 #define FILT_WLATTACH 0
1989 #define FILT_WLTOUCH  1
1990 #define FILT_WLDROP   2
1991 
/*
 * Common state-update path for EVFILT_WORKLOOP attach (FILT_WLATTACH),
 * touch (FILT_WLTOUCH) and drop (FILT_WLDROP).
 *
 * Phase 1 loads the userspace 64-bit state word (when an address is
 * provided in ext[EV_EXTIDX_WL_ADDR]) and debounces it against the
 * expected value/mask pair; it may also discover a new owner thread
 * from the loaded port name (NOTE_WL_DISCOVER_OWNER) or relinquish
 * ownership (NOTE_WL_END_OWNERSHIP).
 * Phase 2 commits ownership/QoS changes under the kq lock, updates the
 * turnstile inheritor, and wakes a SYNC_WAIT waiter if required.
 * Phase 3 drops the locks and any lingering thread/turnstile references.
 *
 * Returns 0, an errno (ESTALE when the debounce failed, EOWNERDEAD when
 * the discovered owner can't be resolved, ...), or EPREEMPTDISABLED when
 * preemption was deliberately left disabled after a wakeup — the caller
 * must translate that into FILTER_THREADREQ_NODEFEER.
 */
__result_use_check
static int
filt_wlupdate(struct kqworkloop *kqwl, struct knote *kn,
    struct kevent_qos_s *kev, kq_index_t qos_index, int op)
{
	user_addr_t uaddr = CAST_USER_ADDR_T(kev->ext[EV_EXTIDX_WL_ADDR]);
	workq_threadreq_t kqr = &kqwl->kqwl_request;
	thread_t cur_owner, new_owner, extra_thread_ref = THREAD_NULL;
	kq_index_t cur_override = THREAD_QOS_UNSPECIFIED;
	int efault_retry = EVFILT_WORKLOOP_EFAULT_RETRY_COUNT;
	int action = KQWL_UTQ_NONE, error = 0;
	bool wl_inheritor_updated = false, needs_wake = false;
	uint64_t kdata = kev->ext[EV_EXTIDX_WL_VALUE];
	uint64_t mask = kev->ext[EV_EXTIDX_WL_MASK];
	uint64_t udata = 0;
	struct turnstile *ts = TURNSTILE_NULL;

	filt_wllock(kqwl);

again:
	new_owner = cur_owner = kqwl->kqwl_owner;

	/*
	 * Phase 1:
	 *
	 * If asked, load the uint64 value at the user provided address and compare
	 * it against the passed in mask and expected value.
	 *
	 * If NOTE_WL_DISCOVER_OWNER is specified, translate the loaded name as
	 * a thread reference.
	 *
	 * If NOTE_WL_END_OWNERSHIP is specified and the currently known owner is
	 * the current thread, then end ownership.
	 *
	 * Lastly decide whether we need to perform a QoS update.
	 */
	if (uaddr) {
		/*
		 * Until <rdar://problem/24999882> exists,
		 * disabling preemption copyin forces any
		 * vm_fault we encounter to fail.
		 */
		error = copyin_atomic64(uaddr, &udata);

		/*
		 * If we get EFAULT, drop locks, and retry.
		 * If we still get an error report it,
		 * else assume the memory has been faulted
		 * and attempt to copyin under lock again.
		 */
		switch (error) {
		case 0:
			break;
		case EFAULT:
			if (efault_retry-- > 0) {
				filt_wlunlock(kqwl);
				error = copyin_atomic64(uaddr, &udata);
				filt_wllock(kqwl);
				if (error == 0) {
					/* owner may have changed while unlocked: restart */
					goto again;
				}
			}
			OS_FALLTHROUGH;
		default:
			goto out;
		}

		/* Update state as copied in.  */
		kev->ext[EV_EXTIDX_WL_VALUE] = udata;

		if ((udata & mask) != (kdata & mask)) {
			error = ESTALE;
		} else if (kev->fflags & NOTE_WL_DISCOVER_OWNER) {
			/*
			 * Decipher the owner port name, and translate accordingly.
			 * The low 2 bits were borrowed for other flags, so mask them off.
			 *
			 * Then attempt translation to a thread reference or fail.
			 */
			mach_port_name_t name = (mach_port_name_t)udata & ~0x3;
			if (name != MACH_PORT_NULL) {
				name = ipc_entry_name_mask(name);
				extra_thread_ref = port_name_to_thread(name,
				    PORT_INTRANS_THREAD_IN_CURRENT_TASK);
				if (extra_thread_ref == THREAD_NULL) {
					error = EOWNERDEAD;
					goto out;
				}
				new_owner = extra_thread_ref;
			}
		}
	}

	if ((kev->fflags & NOTE_WL_END_OWNERSHIP) && new_owner == current_thread()) {
		new_owner = THREAD_NULL;
	}

	if (error == 0) {
		if ((kev->fflags & NOTE_WL_THREAD_REQUEST) && (kev->flags & EV_DELETE)) {
			action = KQWL_UTQ_SET_QOS_INDEX;
		} else if (qos_index && kqr->tr_kq_qos_index != qos_index) {
			action = KQWL_UTQ_SET_QOS_INDEX;
		}

		if (op == FILT_WLTOUCH) {
			/*
			 * Save off any additional fflags/data we just accepted
			 * But only keep the last round of "update" bits we acted on which helps
			 * debugging a lot.
			 */
			kn->kn_sfflags &= ~NOTE_WL_UPDATES_MASK;
			kn->kn_sfflags |= kev->fflags;
			if (kev->fflags & NOTE_WL_SYNC_WAKE) {
				needs_wake = (kn->kn_thread != THREAD_NULL);
			}
		} else if (op == FILT_WLDROP) {
			if ((kn->kn_sfflags & (NOTE_WL_SYNC_WAIT | NOTE_WL_SYNC_WAKE)) ==
			    NOTE_WL_SYNC_WAIT) {
				/*
				 * When deleting a SYNC_WAIT knote that hasn't been woken up
				 * explicitly, issue a wake up.
				 */
				kn->kn_sfflags |= NOTE_WL_SYNC_WAKE;
				needs_wake = (kn->kn_thread != THREAD_NULL);
			}
		}
	}

	/*
	 * Phase 2:
	 *
	 * Commit ownership and QoS changes if any, possibly wake up waiters
	 */

	if (cur_owner == new_owner && action == KQWL_UTQ_NONE && !needs_wake) {
		goto out;
	}

	kqlock(kqwl);

	/* If already tracked as servicer, don't track as owner */
	if (new_owner == kqr_thread(kqr)) {
		new_owner = THREAD_NULL;
	}

	if (cur_owner != new_owner) {
		kqwl->kqwl_owner = new_owner;
		if (new_owner == extra_thread_ref) {
			/* we just transferred this ref to kqwl_owner */
			extra_thread_ref = THREAD_NULL;
		}
		cur_override = kqworkloop_override(kqwl);

		if (new_owner) {
			/* override it before we drop the old */
			if (cur_override != THREAD_QOS_UNSPECIFIED) {
				thread_add_kevent_override(new_owner, cur_override);
			}
			if (kqr_thread_requested_pending(kqr)) {
				if (action == KQWL_UTQ_NONE) {
					action = KQWL_UTQ_REDRIVE_EVENTS;
				}
			}
		} else if (action == KQWL_UTQ_NONE &&
		    !kqr_thread_requested(kqr) &&
		    kqwl->kqwl_wakeup_qos) {
			action = KQWL_UTQ_REDRIVE_EVENTS;
		}
	}

	if (action != KQWL_UTQ_NONE) {
		kqworkloop_update_threads_qos(kqwl, action, qos_index);
	}

	ts = kqwl->kqwl_turnstile;
	if (cur_owner != new_owner && ts) {
		if (action == KQWL_UTQ_REDRIVE_EVENTS) {
			/*
			 * Note that when action is KQWL_UTQ_REDRIVE_EVENTS,
			 * the code went through workq_kern_threadreq_initiate()
			 * and the workqueue has set the inheritor already
			 */
			assert(filt_wlturnstile_interlock_is_workq(kqwl));
		} else if (filt_wlturnstile_interlock_is_workq(kqwl)) {
			workq_kern_threadreq_lock(kqwl->kqwl_p);
			workq_kern_threadreq_update_inheritor(kqwl->kqwl_p, kqr, new_owner,
			    ts, TURNSTILE_IMMEDIATE_UPDATE);
			workq_kern_threadreq_unlock(kqwl->kqwl_p);
			if (!filt_wlturnstile_interlock_is_workq(kqwl)) {
				/*
				 * If the workq is no longer the interlock, then
				 * workq_kern_threadreq_update_inheritor() has finished a bind
				 * and we need to fallback to the regular path.
				 */
				filt_wlupdate_inheritor(kqwl, ts, TURNSTILE_IMMEDIATE_UPDATE);
			}
			wl_inheritor_updated = true;
		} else {
			filt_wlupdate_inheritor(kqwl, ts, TURNSTILE_IMMEDIATE_UPDATE);
			wl_inheritor_updated = true;
		}

		/*
		 * We need a turnstile reference because we are dropping the interlock
		 * and the caller has not called turnstile_prepare.
		 */
		if (wl_inheritor_updated) {
			turnstile_reference(ts);
		}
	}

	if (needs_wake && ts) {
		waitq_wakeup64_thread(&ts->ts_waitq, knote_filt_wev64(kn),
		    kn->kn_thread, THREAD_AWAKENED);
		if (op == FILT_WLATTACH || op == FILT_WLTOUCH) {
			disable_preemption();
			error = EPREEMPTDISABLED;
		}
	}

	kqunlock(kqwl);

out:
	/*
	 * Phase 3:
	 *
	 * Unlock and cleanup various lingering references and things.
	 */
	filt_wlunlock(kqwl);

#if CONFIG_WORKLOOP_DEBUG
	KQWL_HISTORY_WRITE_ENTRY(kqwl, {
		.updater = current_thread(),
		.servicer = kqr_thread(kqr), /* Note: racy */
		.old_owner = cur_owner,
		.new_owner = new_owner,

		.kev_ident  = kev->ident,
		.error      = (int16_t)error,
		.kev_flags  = kev->flags,
		.kev_fflags = kev->fflags,

		.kev_mask   = mask,
		.kev_value  = kdata,
		.in_value   = udata,
	});
#endif // CONFIG_WORKLOOP_DEBUG

	if (wl_inheritor_updated) {
		turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_NOT_HELD);
		turnstile_deallocate_safe(ts);
	}

	if (cur_owner && new_owner != cur_owner) {
		if (cur_override != THREAD_QOS_UNSPECIFIED) {
			thread_drop_kevent_override(cur_owner);
		}
		thread_deallocate_safe(cur_owner);
	}
	if (extra_thread_ref) {
		thread_deallocate_safe(extra_thread_ref);
	}
	return error;
}
2256 
2257 /*
2258  * Remembers the last update that came in from userspace for debugging reasons.
2259  * - fflags is mirrored from the userspace kevent
2260  * - ext[i, i != VALUE] is mirrored from the userspace kevent
2261  * - ext[VALUE] is set to what the kernel loaded atomically
2262  * - data is set to the error if any
2263  */
static inline void
filt_wlremember_last_update(struct knote *kn, struct kevent_qos_s *kev,
    int error)
{
	/* mirror the userspace kevent into the knote for post-mortem debugging */
	kn->kn_fflags = kev->fflags;
	kn->kn_sdata = error;
	memcpy(kn->kn_ext, kev->ext, sizeof(kev->ext));
}
2272 
/*
 * State-update path for NOTE_WL_SYNC_IPC knotes (attach/touch/drop).
 *
 * Performs the same userspace debounce as filt_wlupdate() (with the same
 * unlock-and-retry dance on EFAULT), but the serializing lock here is the
 * knote lock; no ownership or QoS changes are involved.
 *
 * Returns 0, an errno (ESTALE on debounce failure), or EPREEMPTDISABLED
 * after a successful FILT_WLATTACH, which the caller translates into
 * FILTER_THREADREQ_NODEFEER.
 */
static int
filt_wlupdate_sync_ipc(struct kqworkloop *kqwl, struct knote *kn,
    struct kevent_qos_s *kev, int op)
{
	user_addr_t uaddr = (user_addr_t) kev->ext[EV_EXTIDX_WL_ADDR];
	uint64_t kdata = kev->ext[EV_EXTIDX_WL_VALUE];
	uint64_t mask  = kev->ext[EV_EXTIDX_WL_MASK];
	uint64_t udata = 0;
	int efault_retry = EVFILT_WORKLOOP_EFAULT_RETRY_COUNT;
	int error = 0;

	if (op == FILT_WLATTACH) {
		/* make sure the workloop has a turnstile before we wait on it */
		(void)kqueue_alloc_turnstile(&kqwl->kqwl_kqueue);
	} else if (uaddr == 0) {
		return 0;
	}

	filt_wllock(kqwl);

again:

	/*
	 * Do the debounce thing, the lock serializing the state is the knote lock.
	 */
	if (uaddr) {
		/*
		 * Until <rdar://problem/24999882> exists,
		 * disabling preemption copyin forces any
		 * vm_fault we encounter to fail.
		 */
		error = copyin_atomic64(uaddr, &udata);

		/*
		 * If we get EFAULT, drop locks, and retry.
		 * If we still get an error report it,
		 * else assume the memory has been faulted
		 * and attempt to copyin under lock again.
		 */
		switch (error) {
		case 0:
			break;
		case EFAULT:
			if (efault_retry-- > 0) {
				filt_wlunlock(kqwl);
				error = copyin_atomic64(uaddr, &udata);
				filt_wllock(kqwl);
				if (error == 0) {
					goto again;
				}
			}
			OS_FALLTHROUGH;
		default:
			goto out;
		}

		kev->ext[EV_EXTIDX_WL_VALUE] = udata;
		kn->kn_ext[EV_EXTIDX_WL_VALUE] = udata;

		if ((udata & mask) != (kdata & mask)) {
			error = ESTALE;
			goto out;
		}
	}

	if (op == FILT_WLATTACH) {
		error = filt_wlattach_sync_ipc(kn);
		if (error == 0) {
			disable_preemption();
			error = EPREEMPTDISABLED;
		}
	}

out:
	filt_wlunlock(kqwl);
	return error;
}
2349 
/*
 * EVFILT_WORKLOOP attach.
 *
 * Validates the command encoded in kn_sfflags (NOTE_WL_THREAD_REQUEST,
 * NOTE_WL_SYNC_WAIT/WAKE, NOTE_WL_SYNC_IPC) and its flag combinations,
 * then runs the appropriate update path.  Errors are reported through
 * knote_set_error(); a successful THREAD_REQUEST attach fires the knote
 * immediately, and SYNC_WAIT parks the calling thread via
 * kevent_register_wait_prepare().
 */
static int
filt_wlattach(struct knote *kn, struct kevent_qos_s *kev)
{
	struct kqueue *kq = knote_get_kq(kn);
	struct kqworkloop *kqwl = (struct kqworkloop *)kq;
	int error = 0, result = 0;
	kq_index_t qos_index = 0;

	if (__improbable((kq->kq_state & KQ_WORKLOOP) == 0)) {
		error = ENOTSUP;
		goto out;
	}

	uint32_t command = (kn->kn_sfflags & NOTE_WL_COMMANDS_MASK);
	switch (command) {
	case NOTE_WL_THREAD_REQUEST:
		/* thread requests must be keyed on the workloop's own id */
		if (kn->kn_id != kqwl->kqwl_dynamicid) {
			error = EINVAL;
			goto out;
		}
		qos_index = _pthread_priority_thread_qos(kn->kn_qos);
		if (qos_index == THREAD_QOS_UNSPECIFIED) {
			error = ERANGE;
			goto out;
		}
		if (kqwl->kqwl_request.tr_kq_qos_index) {
			/*
			 * There already is a thread request, and well, you're only allowed
			 * one per workloop, so fail the attach.
			 */
			error = EALREADY;
			goto out;
		}
		break;
	case NOTE_WL_SYNC_WAIT:
	case NOTE_WL_SYNC_WAKE:
		/* sync knotes must NOT use the workloop id, and must be disabled */
		if (kn->kn_id == kqwl->kqwl_dynamicid) {
			error = EINVAL;
			goto out;
		}
		if ((kn->kn_flags & EV_DISABLE) == 0) {
			error = EINVAL;
			goto out;
		}
		if (kn->kn_sfflags & NOTE_WL_END_OWNERSHIP) {
			error = EINVAL;
			goto out;
		}
		break;

	case NOTE_WL_SYNC_IPC:
		if ((kn->kn_flags & EV_DISABLE) == 0) {
			error = EINVAL;
			goto out;
		}
		if (kn->kn_sfflags & (NOTE_WL_UPDATE_QOS | NOTE_WL_DISCOVER_OWNER)) {
			error = EINVAL;
			goto out;
		}
		break;
	default:
		error = EINVAL;
		goto out;
	}

	if (command == NOTE_WL_SYNC_IPC) {
		error = filt_wlupdate_sync_ipc(kqwl, kn, kev, FILT_WLATTACH);
	} else {
		error = filt_wlupdate(kqwl, kn, kev, qos_index, FILT_WLATTACH);
	}

	if (error == EPREEMPTDISABLED) {
		error = 0;
		result = FILTER_THREADREQ_NODEFEER;
	}
out:
	if (error) {
		/* If userland wants ESTALE to be hidden, fail the attach anyway */
		if (error == ESTALE && (kn->kn_sfflags & NOTE_WL_IGNORE_ESTALE)) {
			error = 0;
		}
		knote_set_error(kn, error);
		return result;
	}
	if (command == NOTE_WL_SYNC_WAIT) {
		return kevent_register_wait_prepare(kn, kev, result);
	}
	/* Just attaching the thread request successfully will fire it */
	if (command == NOTE_WL_THREAD_REQUEST) {
		/*
		 * Thread Request knotes need an explicit touch to be active again,
		 * so delivering an event needs to also consume it.
		 */
		kn->kn_flags |= EV_CLEAR;
		return result | FILTER_ACTIVE;
	}
	return result;
}
2448 
/*
 * Continuation run after a NOTE_WL_SYNC_WAIT thread wakes up (or is
 * interrupted): tears down the turnstile set up by
 * filt_wlpost_register_wait() and returns to userspace via
 * kevent_register_wait_return().  Never returns to its caller.
 */
static void __dead2
filt_wlwait_continue(void *parameter, wait_result_t wr)
{
	struct _kevent_register *cont_args = parameter;
	struct kqworkloop *kqwl = cont_args->kqwl;

	kqlock(kqwl);
	if (filt_wlturnstile_interlock_is_workq(kqwl)) {
		/* the workqueue lock is the turnstile interlock: hold it too */
		workq_kern_threadreq_lock(kqwl->kqwl_p);
		turnstile_complete((uintptr_t)kqwl, &kqwl->kqwl_turnstile, NULL, TURNSTILE_WORKLOOPS);
		workq_kern_threadreq_unlock(kqwl->kqwl_p);
	} else {
		turnstile_complete((uintptr_t)kqwl, &kqwl->kqwl_turnstile, NULL, TURNSTILE_WORKLOOPS);
	}
	kqunlock(kqwl);

	turnstile_cleanup();

	if (wr == THREAD_INTERRUPTED) {
		/* surface the interruption as EINTR on the kevent */
		cont_args->kev.flags |= EV_ERROR;
		cont_args->kev.data = EINTR;
	} else if (wr != THREAD_AWAKENED) {
		panic("Unexpected wait result: %d", wr);
	}

	kevent_register_wait_return(cont_args);
}
2476 
2477 /*
2478  * Called with the workloop mutex held, most of the time never returns as it
2479  * calls filt_wlwait_continue through a continuation.
2480  */
/*
 * Called with the workloop mutex held, most of the time never returns as it
 * calls filt_wlwait_continue through a continuation.
 */
static void __dead2
filt_wlpost_register_wait(struct uthread *uth, struct knote *kn,
    struct _kevent_register *cont_args)
{
	struct kqworkloop *kqwl = cont_args->kqwl;
	workq_threadreq_t kqr = &kqwl->kqwl_request;
	struct turnstile *ts;
	bool workq_locked = false;

	kqlock_held(kqwl);

	if (filt_wlturnstile_interlock_is_workq(kqwl)) {
		workq_kern_threadreq_lock(kqwl->kqwl_p);
		workq_locked = true;
	}

	/* set up the turnstile this thread will block on */
	ts = turnstile_prepare((uintptr_t)kqwl, &kqwl->kqwl_turnstile,
	    TURNSTILE_NULL, TURNSTILE_WORKLOOPS);

	if (workq_locked) {
		workq_kern_threadreq_update_inheritor(kqwl->kqwl_p,
		    &kqwl->kqwl_request, kqwl->kqwl_owner, ts,
		    TURNSTILE_DELAYED_UPDATE);
		if (!filt_wlturnstile_interlock_is_workq(kqwl)) {
			/*
			 * if the interlock is no longer the workqueue lock,
			 * then we don't need to hold it anymore.
			 */
			workq_kern_threadreq_unlock(kqwl->kqwl_p);
			workq_locked = false;
		}
	}
	if (!workq_locked) {
		/*
		 * If the interlock is the workloop's, then it's our responsibility to
		 * call update_inheritor, so just do it.
		 */
		filt_wlupdate_inheritor(kqwl, ts, TURNSTILE_DELAYED_UPDATE);
	}

	thread_set_pending_block_hint(get_machthread(uth), kThreadWaitWorkloopSyncWait);
	waitq_assert_wait64(&ts->ts_waitq, knote_filt_wev64(kn),
	    THREAD_ABORTSAFE, TIMEOUT_WAIT_FOREVER);

	if (workq_locked) {
		workq_kern_threadreq_unlock(kqwl->kqwl_p);
	}

	/* take a ref on the thread we are blocking behind, for the wait block */
	thread_t thread = kqwl->kqwl_owner ?: kqr_thread(kqr);
	if (thread) {
		thread_reference(thread);
	}

	kevent_register_wait_block(ts, thread, filt_wlwait_continue, cont_args);
}
2536 
2537 /* called in stackshot context to report the thread responsible for blocking this thread */
/* called in stackshot context to report the thread responsible for blocking this thread */
void
kdp_workloop_sync_wait_find_owner(__assert_only thread_t thread,
    event64_t event, thread_waitinfo_t *waitinfo)
{
	/* the wait event for a SYNC_WAIT is the knote pointer (knote_filt_wev64) */
	struct knote *kn = (struct knote *)event;

	zone_require(knote_zone, kn);

	assert(kn->kn_thread == thread);

	struct kqueue *kq = knote_get_kq(kn);

	zone_require(kqworkloop_zone, kq);
	assert(kq->kq_state & KQ_WORKLOOP);

	struct kqworkloop *kqwl = (struct kqworkloop *)kq;
	workq_threadreq_t kqr = &kqwl->kqwl_request;

	/*
	 * NOTE(review): reads kqwl state without the kq lock — acceptable only
	 * because stackshot/kdp runs with the system frozen.
	 */
	thread_t kqwl_owner = kqwl->kqwl_owner;

	if (kqwl_owner != THREAD_NULL) {
		thread_require(kqwl_owner);
		waitinfo->owner = thread_tid(kqwl->kqwl_owner);
	} else if ((kqr->tr_state >= WORKQ_TR_STATE_BINDING) && (kqr->tr_thread != NULL)) {
		thread_require(kqr->tr_thread);
		waitinfo->owner = thread_tid(kqr->tr_thread);
	} else if (kqr_thread_requested_pending(kqr)) { /* > idle, < bound */
		waitinfo->owner = STACKSHOT_WAITOWNER_THREQUESTED;
	} else {
		waitinfo->owner = 0;
	}

	waitinfo->context = kqwl->kqwl_dynamicid;
}
2572 
2573 static void
filt_wldetach(struct knote * kn)2574 filt_wldetach(struct knote *kn)
2575 {
2576 	if (kn->kn_sfflags & NOTE_WL_SYNC_IPC) {
2577 		filt_wldetach_sync_ipc(kn);
2578 	} else if (kn->kn_thread) {
2579 		kevent_register_wait_cleanup(kn);
2580 	}
2581 }
2582 
/*
 * Validate a touch/drop kevent against the saved state of the knote.
 *
 * Checks that the incoming command is compatible with the command the
 * knote was attached with, and that the flag combinations are legal.
 * On NOTE_WL_UPDATE_QOS, also parses kev->qos into *qos_index.
 * qos_index may be NULL (drop path): UPDATE_QOS is rejected with EV_DELETE
 * before it would be dereferenced.
 *
 * Returns 0, EINVAL, or ERANGE (unparseable QoS).
 */
static int
filt_wlvalidate_kev_flags(struct knote *kn, struct kevent_qos_s *kev,
    thread_qos_t *qos_index)
{
	uint32_t new_commands = kev->fflags & NOTE_WL_COMMANDS_MASK;
	uint32_t sav_commands = kn->kn_sfflags & NOTE_WL_COMMANDS_MASK;

	if ((kev->fflags & NOTE_WL_DISCOVER_OWNER) && (kev->flags & EV_DELETE)) {
		return EINVAL;
	}
	if (kev->fflags & NOTE_WL_UPDATE_QOS) {
		if (kev->flags & EV_DELETE) {
			return EINVAL;
		}
		if (sav_commands != NOTE_WL_THREAD_REQUEST) {
			return EINVAL;
		}
		if (!(*qos_index = _pthread_priority_thread_qos(kev->qos))) {
			return ERANGE;
		}
	}

	switch (new_commands) {
	case NOTE_WL_THREAD_REQUEST:
		/* thread requests can only update themselves */
		if (sav_commands != NOTE_WL_THREAD_REQUEST) {
			return EINVAL;
		}
		break;

	case NOTE_WL_SYNC_WAIT:
		if (kev->fflags & NOTE_WL_END_OWNERSHIP) {
			return EINVAL;
		}
		/* remaining checks are shared with SYNC_WAKE */
		goto sync_checks;

	case NOTE_WL_SYNC_WAKE:
sync_checks:
		if (!(sav_commands & (NOTE_WL_SYNC_WAIT | NOTE_WL_SYNC_WAKE))) {
			return EINVAL;
		}
		if ((kev->flags & (EV_ENABLE | EV_DELETE)) == EV_ENABLE) {
			return EINVAL;
		}
		break;

	case NOTE_WL_SYNC_IPC:
		if (sav_commands != NOTE_WL_SYNC_IPC) {
			return EINVAL;
		}
		if ((kev->flags & (EV_ENABLE | EV_DELETE)) == EV_ENABLE) {
			return EINVAL;
		}
		break;

	default:
		return EINVAL;
	}
	return 0;
}
2643 
/*
 * EVFILT_WORKLOOP touch: validate the new kevent, run the matching
 * update path, and translate the result into filter action bits.
 * A SYNC_WAIT touch with no pending wake parks the caller; a successful
 * THREAD_REQUEST touch fires the knote (optionally updating its QoS).
 */
static int
filt_wltouch(struct knote *kn, struct kevent_qos_s *kev)
{
	struct kqworkloop *kqwl = (struct kqworkloop *)knote_get_kq(kn);
	thread_qos_t qos_index = THREAD_QOS_UNSPECIFIED;
	int result = 0;

	int error = filt_wlvalidate_kev_flags(kn, kev, &qos_index);
	if (error) {
		goto out;
	}

	uint32_t command = kev->fflags & NOTE_WL_COMMANDS_MASK;
	if (command == NOTE_WL_SYNC_IPC) {
		error = filt_wlupdate_sync_ipc(kqwl, kn, kev, FILT_WLTOUCH);
	} else {
		error = filt_wlupdate(kqwl, kn, kev, qos_index, FILT_WLTOUCH);
		filt_wlremember_last_update(kn, kev, error);
	}
	if (error == EPREEMPTDISABLED) {
		error = 0;
		result = FILTER_THREADREQ_NODEFEER;
	}

out:
	if (error) {
		if (error == ESTALE && (kev->fflags & NOTE_WL_IGNORE_ESTALE)) {
			/* If userland wants ESTALE to be hidden, do not activate */
			return result;
		}
		kev->flags |= EV_ERROR;
		kev->data = error;
		return result;
	}
	if (command == NOTE_WL_SYNC_WAIT && !(kn->kn_sfflags & NOTE_WL_SYNC_WAKE)) {
		return kevent_register_wait_prepare(kn, kev, result);
	}
	/* Just touching the thread request successfully will fire it */
	if (command == NOTE_WL_THREAD_REQUEST) {
		if (kev->fflags & NOTE_WL_UPDATE_QOS) {
			result |= FILTER_UPDATE_REQ_QOS;
		}
		result |= FILTER_ACTIVE;
	}
	return result;
}
2690 
/*
 * EVFILT_WORKLOOP drop gate: validate the delete request and run the
 * drop update path.  Returns true when the knote may be dropped; on
 * failure the error is reported on the kevent (unless it is an ESTALE
 * userland asked to ignore).
 */
static bool
filt_wlallow_drop(struct knote *kn, struct kevent_qos_s *kev)
{
	struct kqworkloop *kqwl = (struct kqworkloop *)knote_get_kq(kn);

	int error = filt_wlvalidate_kev_flags(kn, kev, NULL);
	if (error) {
		goto out;
	}

	uint32_t command = (kev->fflags & NOTE_WL_COMMANDS_MASK);
	if (command == NOTE_WL_SYNC_IPC) {
		error = filt_wlupdate_sync_ipc(kqwl, kn, kev, FILT_WLDROP);
	} else {
		error = filt_wlupdate(kqwl, kn, kev, 0, FILT_WLDROP);
		filt_wlremember_last_update(kn, kev, error);
	}
	/* drops never leave preemption disabled (see filt_wlupdate) */
	assert(error != EPREEMPTDISABLED);

out:
	if (error) {
		if (error == ESTALE && (kev->fflags & NOTE_WL_IGNORE_ESTALE)) {
			return false;
		}
		kev->flags |= EV_ERROR;
		kev->data = error;
		return false;
	}
	return true;
}
2721 
/*
 * EVFILT_WORKLOOP process: deliver a thread-request event, unless the
 * workloop currently has an owner, in which case the event is re-armed
 * instead of being delivered (see <rdar://problem/33584321> below).
 */
static int
filt_wlprocess(struct knote *kn, struct kevent_qos_s *kev)
{
	struct kqworkloop *kqwl = (struct kqworkloop *)knote_get_kq(kn);
	int rc = 0;

	/* only thread-request knotes are ever processed */
	assert(kn->kn_sfflags & NOTE_WL_THREAD_REQUEST);

	kqlock(kqwl);

	if (kqwl->kqwl_owner) {
		/*
		 * <rdar://problem/33584321> userspace sometimes due to events being
		 * delivered but not triggering a drain session can cause a process
		 * of the thread request knote.
		 *
		 * When that happens, the automatic deactivation due to process
		 * would swallow the event, so we have to activate the knote again.
		 */
		knote_activate(kqwl, kn, FILTER_ACTIVE);
	} else {
#if DEBUG || DEVELOPMENT
		if (kevent_debug_flags & KEVENT_PANIC_ON_NON_ENQUEUED_PROCESS) {
			/*
			 * see src/queue_internal.h in libdispatch
			 */
#define DISPATCH_QUEUE_ENQUEUED 0x1ull
			user_addr_t addr = CAST_USER_ADDR_T(kn->kn_ext[EV_EXTIDX_WL_ADDR]);
			task_t t = current_task();
			uint64_t val;
			if (addr && task_is_active(t) && !task_is_halting(t) &&
			    copyin_atomic64(addr, &val) == 0 &&
			    val && (val & DISPATCH_QUEUE_ENQUEUED) == 0 &&
			    (val >> 48) != 0xdead && (val >> 48) != 0 && (val >> 48) != 0xffff) {
				panic("kevent: workloop %#016llx is not enqueued "
				    "(kn:%p dq_state:%#016llx kev.dq_state:%#016llx)",
				    kn->kn_udata, kn, val, kn->kn_ext[EV_EXTIDX_WL_VALUE]);
			}
		}
#endif
		knote_fill_kevent(kn, kev, 0);
		kev->fflags = kn->kn_sfflags;
		rc |= FILTER_ACTIVE;
	}

	kqunlock(kqwl);

	if (rc & FILTER_ACTIVE) {
		workq_thread_set_max_qos(kqwl->kqwl_p, &kqwl->kqwl_request);
	}
	return rc;
}
2774 
/*
 * EVFILT_WORKLOOP filter operations.  No kernel event source
 * (filt_bad_event); all activation comes from attach/touch, and SYNC_WAIT
 * registration parks the caller via f_post_register_wait.
 */
SECURITY_READ_ONLY_EARLY(static struct filterops) workloop_filtops = {
	.f_extended_codes = true,
	.f_attach  = filt_wlattach,
	.f_detach  = filt_wldetach,
	.f_event   = filt_bad_event,
	.f_touch   = filt_wltouch,
	.f_process = filt_wlprocess,
	.f_allow_drop = filt_wlallow_drop,
	.f_post_register_wait = filt_wlpost_register_wait,
};
2785 
2786 #pragma mark - kqueues allocation and deallocation
2787 
/* Forward declaration (definition later in this file); invoked when the
 * last retain on a workloop is released. */
OS_NOINLINE
static void
kqworkloop_dealloc(struct kqworkloop *, bool hash_remove);
2791 
/* Try to take a retain; fails (returns false) if the count already hit 0. */
static inline bool
kqworkloop_try_retain(struct kqworkloop *kqwl)
{
	return os_ref_retain_try_raw(&kqwl->kqwl_retains, NULL);
}
2797 
/* Take a retain on a workloop known to be alive. */
static inline void
kqworkloop_retain(struct kqworkloop *kqwl)
{
	return os_ref_retain_raw(&kqwl->kqwl_retains, NULL);
}
2803 
/* Retain a kqueue of any kind: only dynamic kqueues (workloops) are
 * refcounted; other kqueue types are a no-op here. */
OS_ALWAYS_INLINE
static inline void
kqueue_retain(kqueue_t kqu)
{
	if (kqu.kq->kq_state & KQ_DYNAMIC) {
		kqworkloop_retain(kqu.kqwl);
	}
}
2812 
/* Drop a retain that is known not to be the last one. */
OS_ALWAYS_INLINE
static inline void
kqworkloop_release_live(struct kqworkloop *kqwl)
{
	os_ref_release_live_raw(&kqwl->kqwl_retains, NULL);
}
2819 
/* kqueue-generic wrapper for kqworkloop_release_live (no-op for
 * non-dynamic kqueues). */
OS_ALWAYS_INLINE
static inline void
kqueue_release_live(kqueue_t kqu)
{
	if (kqu.kq->kq_state & KQ_DYNAMIC) {
		kqworkloop_release_live(kqu.kqwl);
	}
}
2828 
/* Drop a retain; deallocates the workloop when the count reaches zero. */
OS_ALWAYS_INLINE
static inline void
kqworkloop_release(struct kqworkloop *kqwl)
{
	if (os_ref_release_raw(&kqwl->kqwl_retains, NULL) == 0) {
		kqworkloop_dealloc(kqwl, true);
	}
}
2837 
/* kqueue-generic wrapper for kqworkloop_release (no-op for non-dynamic
 * kqueues). */
OS_ALWAYS_INLINE
static inline void
kqueue_release(kqueue_t kqu)
{
	if (kqu.kq->kq_state & KQ_DYNAMIC) {
		kqworkloop_release(kqu.kqwl);
	}
}
2846 
/*!
 * @function kqueue_destroy
 *
 * @brief
 * Common part to all kqueue dealloc functions.
 *
 * @discussion
 * Tears down the embedded spinlock initialized by kqueue_init() and
 * returns the memory to the caller-specified zone.
 */
OS_NOINLINE
static void
kqueue_destroy(kqueue_t kqu, zone_t zone)
{
	lck_spin_destroy(&kqu.kq->kq_lock, &kq_lck_grp);

	zfree(zone, kqu.kq);
}
2861 
2862 /*!
2863  * @function kqueue_init
2864  *
2865  * @brief
2866  * Common part to all kqueue alloc functions.
2867  */
2868 static kqueue_t
kqueue_init(kqueue_t kqu)2869 kqueue_init(kqueue_t kqu)
2870 {
2871 	lck_spin_init(&kqu.kq->kq_lock, &kq_lck_grp, LCK_ATTR_NULL);
2872 	return kqu;
2873 }
2874 
2875 #pragma mark kqfile allocation and deallocation
2876 
2877 /*!
2878  * @function kqueue_dealloc
2879  *
2880  * @brief
2881  * Detach all knotes from a kqfile and free it.
2882  *
2883  * @discussion
2884  * We walk each list looking for knotes referencing this
2885  * this kqueue.  If we find one, we try to drop it.  But
2886  * if we fail to get a drop reference, that will wait
2887  * until it is dropped.  So, we can just restart again
2888  * safe in the assumption that the list will eventually
2889  * not contain any more references to this kqueue (either
2890  * we dropped them all, or someone else did).
2891  *
2892  * Assumes no new events are being added to the kqueue.
2893  * Nothing locked on entry or exit.
2894  */
2895 void
kqueue_dealloc(struct kqueue * kq)2896 kqueue_dealloc(struct kqueue *kq)
2897 {
2898 	KNOTE_LOCK_CTX(knlc);
2899 	struct proc *p = kq->kq_p;
2900 	struct filedesc *fdp = &p->p_fd;
2901 	struct knote *kn;
2902 
2903 	assert(kq && (kq->kq_state & (KQ_WORKLOOP | KQ_WORKQ)) == 0);
2904 
2905 	proc_fdlock(p);
2906 	for (int i = 0; i < fdp->fd_knlistsize; i++) {
2907 		kn = SLIST_FIRST(&fdp->fd_knlist[i]);
2908 		while (kn != NULL) {
2909 			if (kq == knote_get_kq(kn)) {
2910 				kqlock(kq);
2911 				proc_fdunlock(p);
2912 				if (knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ON_SUCCESS)) {
2913 					knote_drop(kq, kn, &knlc);
2914 				}
2915 				proc_fdlock(p);
2916 				/* start over at beginning of list */
2917 				kn = SLIST_FIRST(&fdp->fd_knlist[i]);
2918 				continue;
2919 			}
2920 			kn = SLIST_NEXT(kn, kn_link);
2921 		}
2922 	}
2923 
2924 	knhash_lock(fdp);
2925 	proc_fdunlock(p);
2926 
2927 	if (fdp->fd_knhashmask != 0) {
2928 		for (int i = 0; i < (int)fdp->fd_knhashmask + 1; i++) {
2929 			kn = SLIST_FIRST(&fdp->fd_knhash[i]);
2930 			while (kn != NULL) {
2931 				if (kq == knote_get_kq(kn)) {
2932 					kqlock(kq);
2933 					knhash_unlock(fdp);
2934 					if (knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ON_SUCCESS)) {
2935 						knote_drop(kq, kn, &knlc);
2936 					}
2937 					knhash_lock(fdp);
2938 					/* start over at beginning of list */
2939 					kn = SLIST_FIRST(&fdp->fd_knhash[i]);
2940 					continue;
2941 				}
2942 				kn = SLIST_NEXT(kn, kn_link);
2943 			}
2944 		}
2945 	}
2946 	knhash_unlock(fdp);
2947 
2948 	kqueue_destroy(kq, kqfile_zone);
2949 }
2950 
2951 /*!
2952  * @function kqueue_alloc
2953  *
2954  * @brief
2955  * Allocate a kqfile.
2956  */
2957 struct kqueue *
kqueue_alloc(struct proc * p)2958 kqueue_alloc(struct proc *p)
2959 {
2960 	struct kqfile *kqf;
2961 
2962 	/*
2963 	 * kqfiles are created with kqueue() so we need to wait for
2964 	 * the first kevent syscall to know which bit among
2965 	 * KQ_KEV_{32,64,QOS} will be set in kqf_state
2966 	 */
2967 	kqf = zalloc_flags(kqfile_zone, Z_WAITOK | Z_ZERO);
2968 	kqf->kqf_p = p;
2969 	TAILQ_INIT_AFTER_BZERO(&kqf->kqf_queue);
2970 	TAILQ_INIT_AFTER_BZERO(&kqf->kqf_suppressed);
2971 
2972 	return kqueue_init(kqf).kq;
2973 }
2974 
2975 /*!
2976  * @function kqueue_internal
2977  *
2978  * @brief
2979  * Core implementation for kqueue and guarded_kqueue_np()
2980  */
2981 int
kqueue_internal(struct proc * p,fp_initfn_t fp_init,void * initarg,int32_t * retval)2982 kqueue_internal(struct proc *p, fp_initfn_t fp_init, void *initarg, int32_t *retval)
2983 {
2984 	struct kqueue *kq;
2985 	struct fileproc *fp;
2986 	int fd, error;
2987 
2988 	error = falloc_withinit(p, &fp, &fd, vfs_context_current(),
2989 	    fp_init, initarg);
2990 	if (error) {
2991 		return error;
2992 	}
2993 
2994 	kq = kqueue_alloc(p);
2995 	if (kq == NULL) {
2996 		fp_free(p, fd, fp);
2997 		return ENOMEM;
2998 	}
2999 
3000 	fp->fp_flags |= FP_CLOEXEC | FP_CLOFORK;
3001 	fp->f_flag = FREAD | FWRITE;
3002 	fp->f_ops = &kqueueops;
3003 	fp_set_data(fp, kq);
3004 	fp->f_lflags |= FG_CONFINED;
3005 
3006 	proc_fdlock(p);
3007 	procfdtbl_releasefd(p, fd, NULL);
3008 	fp_drop(p, fd, fp, 1);
3009 	proc_fdunlock(p);
3010 
3011 	*retval = fd;
3012 	return error;
3013 }
3014 
3015 /*!
3016  * @function kqueue
3017  *
3018  * @brief
3019  * The kqueue syscall.
3020  */
3021 int
kqueue(struct proc * p,__unused struct kqueue_args * uap,int32_t * retval)3022 kqueue(struct proc *p, __unused struct kqueue_args *uap, int32_t *retval)
3023 {
3024 	return kqueue_internal(p, NULL, NULL, retval);
3025 }
3026 
3027 #pragma mark kqworkq allocation and deallocation
3028 
3029 /*!
3030  * @function kqworkq_dealloc
3031  *
3032  * @brief
3033  * Deallocates a workqueue kqueue.
3034  *
3035  * @discussion
3036  * This only happens at process death, or for races with concurrent
3037  * kevent_get_kqwq calls, hence we don't have to care about knotes referencing
3038  * this kqueue, either there are none, or someone else took care of them.
3039  */
3040 void
kqworkq_dealloc(struct kqworkq * kqwq)3041 kqworkq_dealloc(struct kqworkq *kqwq)
3042 {
3043 	kqueue_destroy(kqwq, kqworkq_zone);
3044 }
3045 
3046 /*!
3047  * @function kqworkq_alloc
3048  *
3049  * @brief
3050  * Allocates a workqueue kqueue.
3051  *
3052  * @discussion
3053  * This is the slow path of kevent_get_kqwq.
3054  * This takes care of making sure procs have a single workq kqueue.
3055  */
3056 OS_NOINLINE
3057 static struct kqworkq *
kqworkq_alloc(struct proc * p,unsigned int flags)3058 kqworkq_alloc(struct proc *p, unsigned int flags)
3059 {
3060 	struct kqworkq *kqwq, *tmp;
3061 
3062 	kqwq = zalloc_flags(kqworkq_zone, Z_WAITOK | Z_ZERO);
3063 
3064 	assert((flags & KEVENT_FLAG_LEGACY32) == 0);
3065 	if (flags & KEVENT_FLAG_LEGACY64) {
3066 		kqwq->kqwq_state = KQ_WORKQ | KQ_KEV64;
3067 	} else {
3068 		kqwq->kqwq_state = KQ_WORKQ | KQ_KEV_QOS;
3069 	}
3070 	kqwq->kqwq_p = p;
3071 
3072 	for (int i = 0; i < KQWQ_NBUCKETS; i++) {
3073 		TAILQ_INIT_AFTER_BZERO(&kqwq->kqwq_queue[i]);
3074 		TAILQ_INIT_AFTER_BZERO(&kqwq->kqwq_suppressed[i]);
3075 	}
3076 	for (int i = 0; i < KQWQ_NBUCKETS; i++) {
3077 		/*
3078 		 * Because of how the bucketized system works, we mix overcommit
3079 		 * sources with not overcommit: each time we move a knote from
3080 		 * one bucket to the next due to overrides, we'd had to track
3081 		 * overcommitness, and it's really not worth it in the workloop
3082 		 * enabled world that track this faithfully.
3083 		 *
3084 		 * Incidentally, this behaves like the original manager-based
3085 		 * kqwq where event delivery always happened (hence is
3086 		 * "overcommit")
3087 		 */
3088 		kqwq->kqwq_request[i].tr_state = WORKQ_TR_STATE_IDLE;
3089 		kqwq->kqwq_request[i].tr_flags = WORKQ_TR_FLAG_KEVENT;
3090 		if (i != KQWQ_QOS_MANAGER) {
3091 			kqwq->kqwq_request[i].tr_flags |= WORKQ_TR_FLAG_OVERCOMMIT;
3092 		}
3093 		kqwq->kqwq_request[i].tr_kq_qos_index = (kq_index_t)i + 1;
3094 	}
3095 
3096 	kqueue_init(kqwq);
3097 
3098 	if (!os_atomic_cmpxchgv(&p->p_fd.fd_wqkqueue, NULL, kqwq, &tmp, release)) {
3099 		kqworkq_dealloc(kqwq);
3100 		return tmp;
3101 	}
3102 
3103 	return kqwq;
3104 }
3105 
3106 #pragma mark kqworkloop allocation and deallocation
3107 
3108 #define KQ_HASH(val, mask)  (((val) ^ (val >> 8)) & (mask))
3109 #define CONFIG_KQ_HASHSIZE  CONFIG_KN_HASHSIZE
3110 
/* Take the per-process workloop hash lock (spin variant of the mutex). */
OS_ALWAYS_INLINE
static inline void
kqhash_lock(struct filedesc *fdp)
{
	lck_mtx_lock_spin_always(&fdp->fd_kqhashlock);
}
3117 
/* Release the per-process workloop hash lock. */
OS_ALWAYS_INLINE
static inline void
kqhash_unlock(struct filedesc *fdp)
{
	lck_mtx_unlock(&fdp->fd_kqhashlock);
}
3124 
3125 OS_ALWAYS_INLINE
3126 static inline void
kqworkloop_hash_insert_locked(struct filedesc * fdp,kqueue_id_t id,struct kqworkloop * kqwl)3127 kqworkloop_hash_insert_locked(struct filedesc *fdp, kqueue_id_t id,
3128     struct kqworkloop *kqwl)
3129 {
3130 	struct kqwllist *list = &fdp->fd_kqhash[KQ_HASH(id, fdp->fd_kqhashmask)];
3131 	LIST_INSERT_HEAD(list, kqwl, kqwl_hashlink);
3132 }
3133 
3134 OS_ALWAYS_INLINE
3135 static inline struct kqworkloop *
kqworkloop_hash_lookup_locked(struct filedesc * fdp,kqueue_id_t id)3136 kqworkloop_hash_lookup_locked(struct filedesc *fdp, kqueue_id_t id)
3137 {
3138 	struct kqwllist *list = &fdp->fd_kqhash[KQ_HASH(id, fdp->fd_kqhashmask)];
3139 	struct kqworkloop *kqwl;
3140 
3141 	LIST_FOREACH(kqwl, list, kqwl_hashlink) {
3142 		if (kqwl->kqwl_dynamicid == id) {
3143 			return kqwl;
3144 		}
3145 	}
3146 	return NULL;
3147 }
3148 
3149 static struct kqworkloop *
kqworkloop_hash_lookup_and_retain(struct filedesc * fdp,kqueue_id_t kq_id)3150 kqworkloop_hash_lookup_and_retain(struct filedesc *fdp, kqueue_id_t kq_id)
3151 {
3152 	struct kqworkloop *kqwl = NULL;
3153 
3154 	kqhash_lock(fdp);
3155 	if (__probable(fdp->fd_kqhash)) {
3156 		kqwl = kqworkloop_hash_lookup_locked(fdp, kq_id);
3157 		if (kqwl && !kqworkloop_try_retain(kqwl)) {
3158 			kqwl = NULL;
3159 		}
3160 	}
3161 	kqhash_unlock(fdp);
3162 	return kqwl;
3163 }
3164 
/*
 * Lazily create the per-process workloop hash table.
 *
 * Called (and returns) with the kqhash lock held.  The lock is dropped
 * around the blocking hashinit() allocation, so on relock another thread
 * may already have installed a table: fd_kqhashmask != 0 means we lost
 * the race and must free our allocation (again dropping the lock, since
 * hashdestroy may block).
 */
OS_NOINLINE
static void
kqworkloop_hash_init(struct filedesc *fdp)
{
	struct kqwllist *alloc_hash;
	u_long alloc_mask;

	kqhash_unlock(fdp);
	alloc_hash = hashinit(CONFIG_KQ_HASHSIZE, M_KQUEUE, &alloc_mask);
	kqhash_lock(fdp);

	/* See if we won the race */
	if (__probable(fdp->fd_kqhashmask == 0)) {
		fdp->fd_kqhash = alloc_hash;
		fdp->fd_kqhashmask = alloc_mask;
	} else {
		kqhash_unlock(fdp);
		hashdestroy(alloc_hash, M_KQUEUE, alloc_mask);
		kqhash_lock(fdp);
	}
}
3186 
3187 /*
3188  * kqueue iotier override is only supported for kqueue that has
3189  * only one port as a mach port source. Updating the iotier
3190  * override on the mach port source will update the override
3191  * on kqueue as well. Since kqueue with iotier override will
3192  * only have one port attached, there is no logic for saturation
3193  * like qos override, the iotier override of mach port source
3194  * would be reflected in kevent iotier override.
3195  */
3196 void
kqueue_set_iotier_override(kqueue_t kqu,uint8_t iotier_override)3197 kqueue_set_iotier_override(kqueue_t kqu, uint8_t iotier_override)
3198 {
3199 	if (!(kqu.kq->kq_state & KQ_WORKLOOP)) {
3200 		return;
3201 	}
3202 
3203 	struct kqworkloop *kqwl = kqu.kqwl;
3204 	os_atomic_store(&kqwl->kqwl_iotier_override, iotier_override, relaxed);
3205 }
3206 
3207 uint8_t
kqueue_get_iotier_override(kqueue_t kqu)3208 kqueue_get_iotier_override(kqueue_t kqu)
3209 {
3210 	if (!(kqu.kq->kq_state & KQ_WORKLOOP)) {
3211 		return THROTTLE_LEVEL_END;
3212 	}
3213 
3214 	struct kqworkloop *kqwl = kqu.kqwl;
3215 	return os_atomic_load(&kqwl->kqwl_iotier_override, relaxed);
3216 }
3217 
3218 #if CONFIG_PREADOPT_TG
3219 /*
3220  * This function is called with a borrowed reference on the thread group without
3221  * kq lock held with the mqueue lock held. It may or may not have the knote lock
3222  * (called from both fevent as well as fattach/ftouch). Upon success, an
3223  * additional reference on the TG is taken
3224  */
3225 void
kqueue_set_preadopted_thread_group(kqueue_t kqu,struct thread_group * tg,thread_qos_t qos)3226 kqueue_set_preadopted_thread_group(kqueue_t kqu, struct thread_group *tg, thread_qos_t qos)
3227 {
3228 	if (!(kqu.kq->kq_state & KQ_WORKLOOP)) {
3229 		KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_THREAD_GROUP, MACH_THREAD_GROUP_PREADOPT_NA),
3230 		    (uintptr_t)thread_tid(current_thread()), 0, 0, 0);
3231 		return;
3232 	}
3233 
3234 	struct kqworkloop *kqwl = kqu.kqwl;
3235 
3236 	assert(qos < THREAD_QOS_LAST);
3237 
3238 	thread_group_retain(tg);
3239 
3240 	thread_group_qos_t old_tg; thread_group_qos_t new_tg;
3241 	int ret = os_atomic_rmw_loop(&kqwl->kqwl_preadopt_tg, old_tg, new_tg, relaxed, {
3242 		if (!KQWL_CAN_ADOPT_PREADOPT_TG(old_tg)) {
3243 		        os_atomic_rmw_loop_give_up(break);
3244 		}
3245 
3246 		if (old_tg != KQWL_PREADOPTED_TG_NULL) {
3247 		        /*
3248 		         * Note that old_tg could be a NULL TG pointer but with a QoS
3249 		         * set. See also workq_thread_reset_pri.
3250 		         *
3251 		         * Compare the QoS of existing preadopted tg with new one and
3252 		         * only overwrite the thread group if we have one with a higher
3253 		         * QoS.
3254 		         */
3255 		        thread_qos_t existing_qos = KQWL_GET_PREADOPTED_TG_QOS(old_tg);
3256 		        if (existing_qos >= qos) {
3257 		                os_atomic_rmw_loop_give_up(break);
3258 			}
3259 		}
3260 
3261 		// Transfer the ref taken earlier in the function to the kqwl
3262 		new_tg = KQWL_ENCODE_PREADOPTED_TG_QOS(tg, qos);
3263 	});
3264 
3265 	if (ret) {
3266 		KQWL_PREADOPT_TG_HISTORY_WRITE_ENTRY(kqwl, KQWL_PREADOPT_OP_INCOMING_IPC, old_tg, tg);
3267 
3268 		if (KQWL_HAS_VALID_PREADOPTED_TG(old_tg)) {
3269 			thread_group_deallocate_safe(KQWL_GET_PREADOPTED_TG(old_tg));
3270 		}
3271 
3272 		os_atomic_store(&kqwl->kqwl_preadopt_tg_needs_redrive, KQWL_PREADOPT_TG_NEEDS_REDRIVE, release);
3273 	} else {
3274 		// We failed to write to the kqwl_preadopt_tg, drop the ref we took
3275 		// earlier in the function
3276 		thread_group_deallocate_safe(tg);
3277 	}
3278 }
3279 
3280 /*
3281  * Called from fprocess of EVFILT_MACHPORT without the kqueue lock held.
3282  */
3283 bool
kqueue_process_preadopt_thread_group(thread_t thread,struct kqueue * kq,struct thread_group * tg)3284 kqueue_process_preadopt_thread_group(thread_t thread, struct kqueue *kq, struct thread_group *tg)
3285 {
3286 	bool success = false;
3287 	if (kq->kq_state & KQ_WORKLOOP) {
3288 		struct kqworkloop *kqwl = (struct kqworkloop *) kq;
3289 		thread_group_qos_t old_tg;
3290 		success = os_atomic_cmpxchgv(&kqwl->kqwl_preadopt_tg,
3291 		    KQWL_PREADOPTED_TG_SENTINEL, KQWL_PREADOPTED_TG_PROCESSED,
3292 		    &old_tg, relaxed);
3293 		if (success) {
3294 			thread_set_preadopt_thread_group(thread, tg);
3295 		}
3296 
3297 		__assert_only thread_group_qos_t preadopt_tg;
3298 		preadopt_tg = os_atomic_load(&kqwl->kqwl_preadopt_tg, relaxed);
3299 		assert(preadopt_tg == KQWL_PREADOPTED_TG_PROCESSED ||
3300 		    preadopt_tg == KQWL_PREADOPTED_TG_NEVER);
3301 	}
3302 
3303 	return success;
3304 }
3305 #endif
3306 
3307 /*!
3308  * @function kqworkloop_dealloc
3309  *
3310  * @brief
3311  * Deallocates a workloop kqueue.
3312  *
3313  * @discussion
3314  * Knotes hold references on the workloop, so we can't really reach this
3315  * function unless all of these are already gone.
3316  *
3317  * Nothing locked on entry or exit.
3318  *
3319  * @param hash_remove
3320  * Whether to remove the workloop from its hash table.
3321  */
3322 static void
kqworkloop_dealloc(struct kqworkloop * kqwl,bool hash_remove)3323 kqworkloop_dealloc(struct kqworkloop *kqwl, bool hash_remove)
3324 {
3325 	thread_t cur_owner;
3326 
3327 	cur_owner = kqwl->kqwl_owner;
3328 	if (cur_owner) {
3329 		if (kqworkloop_override(kqwl) != THREAD_QOS_UNSPECIFIED) {
3330 			thread_drop_kevent_override(cur_owner);
3331 		}
3332 		thread_deallocate(cur_owner);
3333 		kqwl->kqwl_owner = THREAD_NULL;
3334 	}
3335 
3336 	if (kqwl->kqwl_state & KQ_HAS_TURNSTILE) {
3337 		struct turnstile *ts;
3338 		turnstile_complete((uintptr_t)kqwl, &kqwl->kqwl_turnstile,
3339 		    &ts, TURNSTILE_WORKLOOPS);
3340 		turnstile_cleanup();
3341 		turnstile_deallocate(ts);
3342 	}
3343 
3344 	if (hash_remove) {
3345 		struct filedesc *fdp = &kqwl->kqwl_p->p_fd;
3346 
3347 		kqhash_lock(fdp);
3348 		LIST_REMOVE(kqwl, kqwl_hashlink);
3349 		kqhash_unlock(fdp);
3350 	}
3351 
3352 #if CONFIG_PREADOPT_TG
3353 	thread_group_qos_t tg = os_atomic_load(&kqwl->kqwl_preadopt_tg, relaxed);
3354 	if (KQWL_HAS_VALID_PREADOPTED_TG(tg)) {
3355 		thread_group_release(KQWL_GET_PREADOPTED_TG(tg));
3356 	}
3357 #endif
3358 
3359 	assert(TAILQ_EMPTY(&kqwl->kqwl_suppressed));
3360 	assert(kqwl->kqwl_owner == THREAD_NULL);
3361 	assert(kqwl->kqwl_turnstile == TURNSTILE_NULL);
3362 
3363 	lck_spin_destroy(&kqwl->kqwl_statelock, &kq_lck_grp);
3364 	kqueue_destroy(kqwl, kqworkloop_zone);
3365 }
3366 
3367 /*!
3368  * @function kqworkloop_alloc
3369  *
3370  * @brief
3371  * Allocates a workloop kqueue.
3372  */
3373 static void
kqworkloop_init(struct kqworkloop * kqwl,proc_t p,kqueue_id_t id,workq_threadreq_param_t * trp)3374 kqworkloop_init(struct kqworkloop *kqwl, proc_t p,
3375     kqueue_id_t id, workq_threadreq_param_t *trp)
3376 {
3377 	kqwl->kqwl_state     = KQ_WORKLOOP | KQ_DYNAMIC | KQ_KEV_QOS;
3378 	os_ref_init_raw(&kqwl->kqwl_retains, NULL);
3379 	kqwl->kqwl_dynamicid = id;
3380 	kqwl->kqwl_p         = p;
3381 	if (trp) {
3382 		kqwl->kqwl_params = trp->trp_value;
3383 	}
3384 
3385 	workq_tr_flags_t tr_flags = WORKQ_TR_FLAG_WORKLOOP;
3386 	if (trp) {
3387 		if (trp->trp_flags & TRP_PRIORITY) {
3388 			tr_flags |= WORKQ_TR_FLAG_WL_OUTSIDE_QOS;
3389 		}
3390 		if (trp->trp_flags) {
3391 			tr_flags |= WORKQ_TR_FLAG_WL_PARAMS;
3392 		}
3393 	}
3394 	kqwl->kqwl_request.tr_state = WORKQ_TR_STATE_IDLE;
3395 	kqwl->kqwl_request.tr_flags = tr_flags;
3396 	os_atomic_store(&kqwl->kqwl_iotier_override, (uint8_t)THROTTLE_LEVEL_END, relaxed);
3397 #if CONFIG_PREADOPT_TG
3398 	if (task_is_app(current_task())) {
3399 		/* Apps will never adopt a thread group that is not their own. This is a
3400 		 * gross hack to simulate the post-process that is done in the voucher
3401 		 * subsystem today for thread groups */
3402 		os_atomic_store(&kqwl->kqwl_preadopt_tg, KQWL_PREADOPTED_TG_NEVER, relaxed);
3403 	}
3404 #endif
3405 
3406 	for (int i = 0; i < KQWL_NBUCKETS; i++) {
3407 		TAILQ_INIT_AFTER_BZERO(&kqwl->kqwl_queue[i]);
3408 	}
3409 	TAILQ_INIT_AFTER_BZERO(&kqwl->kqwl_suppressed);
3410 
3411 	lck_spin_init(&kqwl->kqwl_statelock, &kq_lck_grp, LCK_ATTR_NULL);
3412 
3413 	kqueue_init(kqwl);
3414 }
3415 
3416 /*!
3417  * @function kqworkloop_get_or_create
3418  *
3419  * @brief
3420  * Wrapper around kqworkloop_alloc that handles the uniquing of workloops.
3421  *
3422  * @returns
3423  * 0:      success
3424  * EINVAL: invalid parameters
3425  * EEXIST: KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST is set and a collision exists.
3426  * ENOENT: KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST is set and the entry wasn't found.
3427  * ENOMEM: allocation failed
3428  */
3429 static int
kqworkloop_get_or_create(struct proc * p,kqueue_id_t id,workq_threadreq_param_t * trp,unsigned int flags,struct kqworkloop ** kqwlp)3430 kqworkloop_get_or_create(struct proc *p, kqueue_id_t id,
3431     workq_threadreq_param_t *trp, unsigned int flags, struct kqworkloop **kqwlp)
3432 {
3433 	struct filedesc *fdp = &p->p_fd;
3434 	struct kqworkloop *alloc_kqwl = NULL;
3435 	struct kqworkloop *kqwl = NULL;
3436 	int error = 0;
3437 
3438 	assert(!trp || (flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST));
3439 
3440 	if (id == 0 || id == (kqueue_id_t)-1) {
3441 		return EINVAL;
3442 	}
3443 
3444 	for (;;) {
3445 		kqhash_lock(fdp);
3446 		if (__improbable(fdp->fd_kqhash == NULL)) {
3447 			kqworkloop_hash_init(fdp);
3448 		}
3449 
3450 		kqwl = kqworkloop_hash_lookup_locked(fdp, id);
3451 		if (kqwl) {
3452 			if (__improbable(flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST)) {
3453 				/*
3454 				 * If MUST_NOT_EXIST was passed, even if we would have failed
3455 				 * the try_retain, it could have gone the other way, and
3456 				 * userspace can't tell. Let'em fix their race.
3457 				 */
3458 				error = EEXIST;
3459 				break;
3460 			}
3461 
3462 			if (__probable(kqworkloop_try_retain(kqwl))) {
3463 				/*
3464 				 * This is a valid live workloop !
3465 				 */
3466 				*kqwlp = kqwl;
3467 				error = 0;
3468 				break;
3469 			}
3470 		}
3471 
3472 		if (__improbable(flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST)) {
3473 			error = ENOENT;
3474 			break;
3475 		}
3476 
3477 		/*
3478 		 * We didn't find what we were looking for.
3479 		 *
3480 		 * If this is the second time we reach this point (alloc_kqwl != NULL),
3481 		 * then we're done.
3482 		 *
3483 		 * If this is the first time we reach this point (alloc_kqwl == NULL),
3484 		 * then try to allocate one without blocking.
3485 		 */
3486 		if (__probable(alloc_kqwl == NULL)) {
3487 			alloc_kqwl = zalloc_flags(kqworkloop_zone, Z_NOWAIT | Z_ZERO);
3488 		}
3489 		if (__probable(alloc_kqwl)) {
3490 			kqworkloop_init(alloc_kqwl, p, id, trp);
3491 			kqworkloop_hash_insert_locked(fdp, id, alloc_kqwl);
3492 			kqhash_unlock(fdp);
3493 			*kqwlp = alloc_kqwl;
3494 			return 0;
3495 		}
3496 
3497 		/*
3498 		 * We have to block to allocate a workloop, drop the lock,
3499 		 * allocate one, but then we need to retry lookups as someone
3500 		 * else could race with us.
3501 		 */
3502 		kqhash_unlock(fdp);
3503 
3504 		alloc_kqwl = zalloc_flags(kqworkloop_zone, Z_WAITOK | Z_ZERO);
3505 	}
3506 
3507 	kqhash_unlock(fdp);
3508 
3509 	if (__improbable(alloc_kqwl)) {
3510 		zfree(kqworkloop_zone, alloc_kqwl);
3511 	}
3512 
3513 	return error;
3514 }
3515 
3516 #pragma mark - knotes
3517 
/* Attach stub for filters that can't be attached: fail with ENOTSUP. */
static int
filt_no_attach(struct knote *kn, __unused struct kevent_qos_s *kev)
{
	knote_set_error(kn, ENOTSUP);
	return 0;
}
3524 
/* Detach stub for filters with nothing to tear down. */
static void
filt_no_detach(__unused struct knote *kn)
{
}
3529 
/* f_event stub for filters that must never receive events: panics. */
static int __dead2
filt_bad_event(struct knote *kn, long hint)
{
	panic("%s[%d](%p, %ld)", __func__, kn->kn_filter, kn, hint);
}
3535 
/* f_touch stub for filters that must never be touched: panics. */
static int __dead2
filt_bad_touch(struct knote *kn, struct kevent_qos_s *kev)
{
	panic("%s[%d](%p, %p)", __func__, kn->kn_filter, kn, kev);
}
3541 
/* f_process stub for filters that must never be processed: panics. */
static int __dead2
filt_bad_process(struct knote *kn, struct kevent_qos_s *kev)
{
	panic("%s[%d](%p, %p)", __func__, kn->kn_filter, kn, kev);
}
3547 
3548 /*
3549  * knotes_dealloc - detach all knotes for the process and drop them
3550  *
3551  *		Process is in such a state that it will not try to allocate
3552  *		any more knotes during this process (stopped for exit or exec).
3553  */
3554 void
knotes_dealloc(proc_t p)3555 knotes_dealloc(proc_t p)
3556 {
3557 	struct filedesc *fdp = &p->p_fd;
3558 	struct kqueue *kq;
3559 	struct knote *kn;
3560 	struct  klist *kn_hash = NULL;
3561 	u_long kn_hashmask;
3562 	int i;
3563 
3564 	proc_fdlock(p);
3565 
3566 	/* Close all the fd-indexed knotes up front */
3567 	if (fdp->fd_knlistsize > 0) {
3568 		for (i = 0; i < fdp->fd_knlistsize; i++) {
3569 			while ((kn = SLIST_FIRST(&fdp->fd_knlist[i])) != NULL) {
3570 				kq = knote_get_kq(kn);
3571 				kqlock(kq);
3572 				proc_fdunlock(p);
3573 				knote_drop(kq, kn, NULL);
3574 				proc_fdlock(p);
3575 			}
3576 		}
3577 		/* free the table */
3578 		kfree_type(struct klist, fdp->fd_knlistsize, fdp->fd_knlist);
3579 	}
3580 	fdp->fd_knlistsize = 0;
3581 
3582 	proc_fdunlock(p);
3583 
3584 	knhash_lock(fdp);
3585 
3586 	/* Clean out all the hashed knotes as well */
3587 	if (fdp->fd_knhashmask != 0) {
3588 		for (i = 0; i <= (int)fdp->fd_knhashmask; i++) {
3589 			while ((kn = SLIST_FIRST(&fdp->fd_knhash[i])) != NULL) {
3590 				kq = knote_get_kq(kn);
3591 				kqlock(kq);
3592 				knhash_unlock(fdp);
3593 				knote_drop(kq, kn, NULL);
3594 				knhash_lock(fdp);
3595 			}
3596 		}
3597 		kn_hash = fdp->fd_knhash;
3598 		kn_hashmask = fdp->fd_knhashmask;
3599 		fdp->fd_knhashmask = 0;
3600 		fdp->fd_knhash = NULL;
3601 	}
3602 
3603 	knhash_unlock(fdp);
3604 
3605 	if (kn_hash) {
3606 		hashdestroy(kn_hash, M_KQUEUE, kn_hashmask);
3607 	}
3608 }
3609 
3610 /*
3611  * kqworkloops_dealloc - rebalance retains on kqworkloops created with
3612  * scheduling parameters
3613  *
3614  *		Process is in such a state that it will not try to allocate
3615  *		any more knotes during this process (stopped for exit or exec).
3616  */
3617 void
kqworkloops_dealloc(proc_t p)3618 kqworkloops_dealloc(proc_t p)
3619 {
3620 	struct filedesc *fdp = &p->p_fd;
3621 	struct kqworkloop *kqwl, *kqwln;
3622 	struct kqwllist tofree;
3623 
3624 	if (!fdt_flag_test(fdp, FD_WORKLOOP)) {
3625 		return;
3626 	}
3627 
3628 	kqhash_lock(fdp);
3629 
3630 	if (fdp->fd_kqhashmask == 0) {
3631 		kqhash_unlock(fdp);
3632 		return;
3633 	}
3634 
3635 	LIST_INIT(&tofree);
3636 
3637 	for (size_t i = 0; i <= fdp->fd_kqhashmask; i++) {
3638 		LIST_FOREACH_SAFE(kqwl, &fdp->fd_kqhash[i], kqwl_hashlink, kqwln) {
3639 			/*
3640 			 * kqworkloops that have scheduling parameters have an
3641 			 * implicit retain from kqueue_workloop_ctl that needs
3642 			 * to be balanced on process exit.
3643 			 */
3644 			assert(kqwl->kqwl_params);
3645 			LIST_REMOVE(kqwl, kqwl_hashlink);
3646 			LIST_INSERT_HEAD(&tofree, kqwl, kqwl_hashlink);
3647 		}
3648 	}
3649 
3650 	kqhash_unlock(fdp);
3651 
3652 	LIST_FOREACH_SAFE(kqwl, &tofree, kqwl_hashlink, kqwln) {
3653 		uint32_t ref = os_ref_get_count_raw(&kqwl->kqwl_retains);
3654 		if (ref != 1) {
3655 			panic("kq(%p) invalid refcount %d", kqwl, ref);
3656 		}
3657 		kqworkloop_dealloc(kqwl, false);
3658 	}
3659 }
3660 
3661 static int
kevent_register_validate_priority(struct kqueue * kq,struct knote * kn,struct kevent_qos_s * kev)3662 kevent_register_validate_priority(struct kqueue *kq, struct knote *kn,
3663     struct kevent_qos_s *kev)
3664 {
3665 	/* We don't care about the priority of a disabled or deleted knote */
3666 	if (kev->flags & (EV_DISABLE | EV_DELETE)) {
3667 		return 0;
3668 	}
3669 
3670 	if (kq->kq_state & KQ_WORKLOOP) {
3671 		/*
3672 		 * Workloops need valid priorities with a QOS (excluding manager) for
3673 		 * any enabled knote.
3674 		 *
3675 		 * When it is pre-existing, just make sure it has a valid QoS as
3676 		 * kevent_register() will not use the incoming priority (filters who do
3677 		 * have the responsibility to validate it again, see filt_wltouch).
3678 		 *
3679 		 * If the knote is being made, validate the incoming priority.
3680 		 */
3681 		if (!_pthread_priority_thread_qos(kn ? kn->kn_qos : kev->qos)) {
3682 			return ERANGE;
3683 		}
3684 	}
3685 
3686 	return 0;
3687 }
3688 
3689 /*
3690  * Prepare a filter for waiting after register.
3691  *
3692  * The f_post_register_wait hook will be called later by kevent_register()
3693  * and should call kevent_register_wait_block()
3694  */
3695 static int
kevent_register_wait_prepare(struct knote * kn,struct kevent_qos_s * kev,int rc)3696 kevent_register_wait_prepare(struct knote *kn, struct kevent_qos_s *kev, int rc)
3697 {
3698 	thread_t thread = current_thread();
3699 
3700 	assert(knote_fops(kn)->f_extended_codes);
3701 
3702 	if (kn->kn_thread == NULL) {
3703 		thread_reference(thread);
3704 		kn->kn_thread = thread;
3705 	} else if (kn->kn_thread != thread) {
3706 		/*
3707 		 * kn_thread may be set from a previous aborted wait
3708 		 * However, it has to be from the same thread.
3709 		 */
3710 		kev->flags |= EV_ERROR;
3711 		kev->data = EXDEV;
3712 		return 0;
3713 	}
3714 
3715 	return FILTER_REGISTER_WAIT | rc;
3716 }
3717 
3718 /*
3719  * Cleanup a kevent_register_wait_prepare() effect for threads that have been
3720  * aborted instead of properly woken up with thread_wakeup_thread().
3721  */
3722 static void
kevent_register_wait_cleanup(struct knote * kn)3723 kevent_register_wait_cleanup(struct knote *kn)
3724 {
3725 	thread_t thread = kn->kn_thread;
3726 	kn->kn_thread = NULL;
3727 	thread_deallocate(thread);
3728 }
3729 
3730 /*
3731  * Must be called at the end of a f_post_register_wait call from a filter.
3732  */
3733 static void
kevent_register_wait_block(struct turnstile * ts,thread_t thread,thread_continue_t cont,struct _kevent_register * cont_args)3734 kevent_register_wait_block(struct turnstile *ts, thread_t thread,
3735     thread_continue_t cont, struct _kevent_register *cont_args)
3736 {
3737 	turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
3738 	kqunlock(cont_args->kqwl);
3739 	cont_args->handoff_thread = thread;
3740 	thread_handoff_parameter(thread, cont, cont_args, THREAD_HANDOFF_NONE);
3741 }
3742 
3743 /*
3744  * Called by Filters using a f_post_register_wait to return from their wait.
3745  */
3746 static void
kevent_register_wait_return(struct _kevent_register * cont_args)3747 kevent_register_wait_return(struct _kevent_register *cont_args)
3748 {
3749 	struct kqworkloop *kqwl = cont_args->kqwl;
3750 	struct kevent_qos_s *kev = &cont_args->kev;
3751 	int error = 0;
3752 
3753 	if (cont_args->handoff_thread) {
3754 		thread_deallocate(cont_args->handoff_thread);
3755 	}
3756 
3757 	if (kev->flags & (EV_ERROR | EV_RECEIPT)) {
3758 		if ((kev->flags & EV_ERROR) == 0) {
3759 			kev->flags |= EV_ERROR;
3760 			kev->data = 0;
3761 		}
3762 		error = kevent_modern_copyout(kev, &cont_args->ueventlist);
3763 		if (error == 0) {
3764 			cont_args->eventout++;
3765 		}
3766 	}
3767 
3768 	kqworkloop_release(kqwl);
3769 	if (error == 0) {
3770 		*(int32_t *)&current_uthread()->uu_rval = cont_args->eventout;
3771 	}
3772 	unix_syscall_return(error);
3773 }
3774 
3775 /*
3776  * kevent_register - add a new event to a kqueue
3777  *
3778  *	Creates a mapping between the event source and
3779  *	the kqueue via a knote data structure.
3780  *
3781  *	Because many/most the event sources are file
3782  *	descriptor related, the knote is linked off
3783  *	the filedescriptor table for quick access.
3784  *
3785  *	called with nothing locked
3786  *	caller holds a reference on the kqueue
3787  */
3788 
int
kevent_register(struct kqueue *kq, struct kevent_qos_s *kev,
    struct knote **kn_out)
{
	struct proc *p = kq->kq_p;
	const struct filterops *fops;
	struct knote *kn = NULL;
	int result = 0, error = 0;
	/* snapshot the caller's flags before the delete/disable simplification below */
	unsigned short kev_flags = kev->flags;
	KNOTE_LOCK_CTX(knlc);

	/* system filters are negative; ~filter maps them to a 0-based table index */
	if (__probable(kev->filter < 0 && kev->filter + EVFILT_SYSCOUNT >= 0)) {
		fops = sysfilt_ops[~kev->filter];       /* to 0-base index */
	} else {
		error = EINVAL;
		goto out;
	}

	/* restrict EV_VANISHED to adding udata-specific dispatch kevents */
	if (__improbable((kev->flags & EV_VANISHED) &&
	    (kev->flags & (EV_ADD | EV_DISPATCH2)) != (EV_ADD | EV_DISPATCH2))) {
		error = EINVAL;
		goto out;
	}

	/* Simplify the flags - delete and disable overrule */
	if (kev->flags & EV_DELETE) {
		kev->flags &= ~EV_ADD;
	}
	if (kev->flags & EV_DISABLE) {
		kev->flags &= ~EV_ENABLE;
	}

	if (kq->kq_state & KQ_WORKLOOP) {
		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_REGISTER),
		    ((struct kqworkloop *)kq)->kqwl_dynamicid,
		    kev->udata, kev->flags, kev->filter);
	} else if (kq->kq_state & KQ_WORKQ) {
		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWQ_REGISTER),
		    0, kev->udata, kev->flags, kev->filter);
	} else {
		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQ_REGISTER),
		    VM_KERNEL_UNSLIDE_OR_PERM(kq),
		    kev->udata, kev->flags, kev->filter);
	}

restart:
	/* find the matching knote from the fd tables/hashes */
	kn = kq_find_knote_and_kq_lock(kq, kev, fops->f_isfd, p);
	error = kevent_register_validate_priority(kq, kn, kev);
	result = 0;
	if (error) {
		if (kn) {
			kqunlock(kq);
		}
		goto out;
	}

	if (kn == NULL && (kev->flags & EV_ADD) == 0) {
		/*
		 * No knote found, EV_ADD wasn't specified
		 */

		if ((kev_flags & EV_ADD) && (kev_flags & EV_DELETE) &&
		    (kq->kq_state & KQ_WORKLOOP)) {
			/*
			 * For workloops, understand EV_ADD|EV_DELETE as a "soft" delete
			 * that doesn't care about ENOENT, so just pretend the deletion
			 * happened.
			 */
		} else {
			error = ENOENT;
		}
		goto out;
	} else if (kn == NULL) {
		/*
		 * No knote found, need to attach a new one (attach)
		 */

		struct fileproc *knote_fp = NULL;

		/* grab a file reference for the new knote */
		if (fops->f_isfd) {
			if ((error = fp_lookup(p, (int)kev->ident, &knote_fp, 0)) != 0) {
				goto out;
			}
		}

		kn = knote_alloc();
		kn->kn_fp = knote_fp;
		kn->kn_is_fd = fops->f_isfd;
		kn->kn_kq_packed = VM_PACK_POINTER((vm_offset_t)kq, KNOTE_KQ_PACKED);
		kn->kn_status = 0;

		/* was vanish support requested */
		if (kev->flags & EV_VANISHED) {
			kev->flags &= ~EV_VANISHED;
			kn->kn_status |= KN_REQVANISH;
		}

		/* snapshot matching/dispatching protocol flags into knote */
		if (kev->flags & EV_DISABLE) {
			kn->kn_status |= KN_DISABLED;
		}

		/*
		 * copy the kevent state into knote
		 * protocol is that fflags and data
		 * are saved off, and cleared before
		 * calling the attach routine.
		 *
		 * - kn->kn_sfflags aliases with kev->xflags
		 * - kn->kn_sdata   aliases with kev->data
		 * - kn->kn_filter  is the top 8 bits of kev->filter
		 */
		kn->kn_kevent  = *(struct kevent_internal_s *)kev;
		kn->kn_sfflags = kev->fflags;
		kn->kn_filtid  = (uint8_t)~kev->filter;
		kn->kn_fflags  = 0;
		knote_reset_priority(kq, kn, kev->qos);

		/* Add the knote for lookup thru the fd table */
		error = kq_add_knote(kq, kn, &knlc, p);
		if (error) {
			knote_free(kn);
			if (knote_fp != NULL) {
				fp_drop(p, (int)kev->ident, knote_fp, 0);
			}

			if (error == ERESTART) {
				/* lost a race inserting the knote: look it up again */
				goto restart;
			}
			goto out;
		}

		/* fp reference count now applies to knote */

		/*
		 * we can't use filter_call() because f_attach can change the filter ops
		 * for a filter that supports f_extended_codes, so we need to reload
		 * knote_fops() and not use `fops`.
		 */
		result = fops->f_attach(kn, kev);
		if (result && !knote_fops(kn)->f_extended_codes) {
			result = FILTER_ACTIVE;
		}

		kqlock(kq);

		if (result & FILTER_THREADREQ_NODEFEER) {
			enable_preemption();
		}

		if (kn->kn_flags & EV_ERROR) {
			/*
			 * Failed to attach correctly, so drop.
			 */
			kn->kn_filtid = EVFILTID_DETACHED;
			error = (int)kn->kn_sdata;
			knote_drop(kq, kn, &knlc);
			result = 0;
			goto out;
		}

		/*
		 * end "attaching" phase - now just attached
		 *
		 * Mark the thread request overcommit, if appropos
		 *
		 * If the attach routine indicated that an
		 * event is already fired, activate the knote.
		 */
		if ((kn->kn_qos & _PTHREAD_PRIORITY_OVERCOMMIT_FLAG) &&
		    (kq->kq_state & KQ_WORKLOOP)) {
			kqworkloop_set_overcommit((struct kqworkloop *)kq);
		}
	} else if (!knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ON_SUCCESS)) {
		/*
		 * The knote was dropped while we were waiting for the lock,
		 * we need to re-evaluate entirely
		 */

		goto restart;
	} else if (kev->flags & EV_DELETE) {
		/*
		 * Deletion of a knote (drop)
		 *
		 * If the filter wants to filter drop events, let it do so.
		 *
		 * defer-delete: when trying to delete a disabled EV_DISPATCH2 knote,
		 * we must wait for the knote to be re-enabled (unless it is being
		 * re-enabled atomically here).
		 */

		if (knote_fops(kn)->f_allow_drop) {
			bool drop;

			kqunlock(kq);
			drop = knote_fops(kn)->f_allow_drop(kn, kev);
			kqlock(kq);

			if (!drop) {
				goto out_unlock;
			}
		}

		if ((kev->flags & EV_ENABLE) == 0 &&
		    (kn->kn_flags & EV_DISPATCH2) == EV_DISPATCH2 &&
		    (kn->kn_status & KN_DISABLED) != 0) {
			kn->kn_status |= KN_DEFERDELETE;
			error = EINPROGRESS;
			goto out_unlock;
		}

		knote_drop(kq, kn, &knlc);
		goto out;
	} else {
		/*
		 * Regular update of a knote (touch)
		 *
		 * Call touch routine to notify filter of changes in filter values
		 * (and to re-determine if any events are fired).
		 *
		 * If the knote is in defer-delete, avoid calling the filter touch
		 * routine (it has delivered its last event already).
		 *
		 * If the touch routine had no failure,
		 * apply the requested side effects to the knote.
		 */

		if (kn->kn_status & (KN_DEFERDELETE | KN_VANISHED)) {
			if (kev->flags & EV_ENABLE) {
				result = FILTER_ACTIVE;
			}
		} else {
			kqunlock(kq);
			result = filter_call(knote_fops(kn), f_touch(kn, kev));
			kqlock(kq);
			if (result & FILTER_THREADREQ_NODEFEER) {
				enable_preemption();
			}
		}

		if (kev->flags & EV_ERROR) {
			result = 0;
			goto out_unlock;
		}

		if ((kn->kn_flags & EV_UDATA_SPECIFIC) == 0 &&
		    kn->kn_udata != kev->udata) {
			// this allows klist_copy_udata() not to take locks
			os_atomic_store_wide(&kn->kn_udata, kev->udata, relaxed);
		}
		if ((kev->flags & EV_DISABLE) && !(kn->kn_status & KN_DISABLED)) {
			kn->kn_status |= KN_DISABLED;
			knote_dequeue(kq, kn);
		}
	}

	/* accept new kevent state */
	knote_apply_touch(kq, kn, kev, result);

out_unlock:
	/*
	 * When the filter asked for a post-register wait,
	 * we leave the kqueue locked for kevent_register()
	 * to call the filter's f_post_register_wait hook.
	 */
	if (result & FILTER_REGISTER_WAIT) {
		knote_unlock(kq, kn, &knlc, KNOTE_KQ_LOCK_ALWAYS);
		*kn_out = kn;
	} else {
		knote_unlock(kq, kn, &knlc, KNOTE_KQ_UNLOCK);
	}

out:
	/* output local errors through the kevent */
	if (error) {
		kev->flags |= EV_ERROR;
		kev->data = error;
	}
	return result;
}
4072 
4073 /*
4074  * knote_process - process a triggered event
4075  *
4076  *	Validate that it is really still a triggered event
4077  *	by calling the filter routines (if necessary).  Hold
4078  *	a use reference on the knote to avoid it being detached.
4079  *
4080  *	If it is still considered triggered, we will have taken
4081  *	a copy of the state under the filter lock.  We use that
4082  *	snapshot to dispatch the knote for future processing (or
4083  *	not, if this was a lost event).
4084  *
4085  *	Our caller assures us that nobody else can be processing
4086  *	events from this knote during the whole operation. But
4087  *	others can be touching or posting events to the knote
4088  *	interspersed with our processing it.
4089  *
4090  *	caller holds a reference on the kqueue.
4091  *	kqueue locked on entry and exit - but may be dropped
4092  */
static int
knote_process(struct knote *kn, kevent_ctx_t kectx,
    kevent_callback_t callback)
{
	struct kevent_qos_s kev;
	struct kqueue *kq = knote_get_kq(kn);
	KNOTE_LOCK_CTX(knlc);
	int result = FILTER_ACTIVE;
	int error = 0;
	bool drop = false;

	/*
	 * Must be active
	 * Must be queued and not disabled/suppressed or dropping
	 */
	assert(kn->kn_status & KN_QUEUED);
	assert(kn->kn_status & KN_ACTIVE);
	assert(!(kn->kn_status & (KN_DISABLED | KN_SUPPRESSED | KN_DROPPING)));

	if (kq->kq_state & KQ_WORKLOOP) {
		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS),
		    ((struct kqworkloop *)kq)->kqwl_dynamicid,
		    kn->kn_udata, kn->kn_status | (kn->kn_id << 32),
		    kn->kn_filtid);
	} else if (kq->kq_state & KQ_WORKQ) {
		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWQ_PROCESS),
		    0, kn->kn_udata, kn->kn_status | (kn->kn_id << 32),
		    kn->kn_filtid);
	} else {
		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQ_PROCESS),
		    VM_KERNEL_UNSLIDE_OR_PERM(kq), kn->kn_udata,
		    kn->kn_status | (kn->kn_id << 32), kn->kn_filtid);
	}

	if (!knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ALWAYS)) {
		/*
		 * When the knote is dropping or has dropped,
		 * then there's nothing we want to process.
		 */
		return EJUSTRETURN;
	}

	/*
	 * While waiting for the knote lock, we may have dropped the kq lock.
	 * and a touch may have disabled and dequeued the knote.
	 */
	if (!(kn->kn_status & KN_QUEUED)) {
		knote_unlock(kq, kn, &knlc, KNOTE_KQ_LOCK_ALWAYS);
		return EJUSTRETURN;
	}

	/*
	 * For deferred-drop or vanished events, we just create a fake
	 * event to acknowledge end-of-life.  Otherwise, we call the
	 * filter's process routine to snapshot the kevent state under
	 * the filter's locking protocol.
	 *
	 * suppress knotes to avoid returning the same event multiple times in
	 * a single call.
	 */
	knote_suppress(kq, kn);

	if (kn->kn_status & (KN_DEFERDELETE | KN_VANISHED)) {
		uint16_t kev_flags = EV_DISPATCH2 | EV_ONESHOT;
		if (kn->kn_status & KN_DEFERDELETE) {
			kev_flags |= EV_DELETE;
		} else {
			kev_flags |= EV_VANISHED;
		}

		/* create fake event */
		kev = (struct kevent_qos_s){
			.filter = kn->kn_filter,
			.ident  = kn->kn_id,
			.flags  = kev_flags,
			.udata  = kn->kn_udata,
		};
	} else {
		/* drop the kq lock while the filter snapshots under its own locks */
		kqunlock(kq);
		kev = (struct kevent_qos_s) { };
		result = filter_call(knote_fops(kn), f_process(kn, &kev));
		kqlock(kq);
	}

	/*
	 * Determine how to dispatch the knote for future event handling.
	 * not-fired: just return (do not callout, leave deactivated).
	 * One-shot:  If dispatch2, enter deferred-delete mode (unless this is
	 *            is the deferred delete event delivery itself).  Otherwise,
	 *            drop it.
	 * Dispatch:  don't clear state, just mark it disabled.
	 * Cleared:   just leave it deactivated.
	 * Others:    re-activate as there may be more events to handle.
	 *            This will not wake up more handlers right now, but
	 *            at the completion of handling events it may trigger
	 *            more handler threads (TODO: optimize based on more than
	 *            just this one event being detected by the filter).
	 */
	if ((result & FILTER_ACTIVE) == 0) {
		if ((kn->kn_status & KN_ACTIVE) == 0) {
			/*
			 * Some knotes (like EVFILT_WORKLOOP) can be reactivated from
			 * within f_process() but that doesn't necessarily make them
			 * ready to process, so we should leave them be.
			 *
			 * For other knotes, since we will not return an event,
			 * there's no point keeping the knote suppressed.
			 */
			knote_unsuppress(kq, kn);
		}
		knote_unlock(kq, kn, &knlc, KNOTE_KQ_LOCK_ALWAYS);
		return EJUSTRETURN;
	}

	if (result & FILTER_ADJUST_EVENT_QOS_BIT) {
		knote_adjust_qos(kq, kn, result);
	}

	if (result & FILTER_ADJUST_EVENT_IOTIER_BIT) {
		kqueue_update_iotier_override(kq);
	}

	/* deliver at the combination of the knote's QoS and its current override */
	kev.qos = _pthread_priority_combine(kn->kn_qos, kn->kn_qos_override);

	if (kev.flags & EV_ONESHOT) {
		if ((kn->kn_flags & EV_DISPATCH2) == EV_DISPATCH2 &&
		    (kn->kn_status & KN_DEFERDELETE) == 0) {
			/* defer dropping non-delete oneshot dispatch2 events */
			kn->kn_status |= KN_DEFERDELETE | KN_DISABLED;
		} else {
			drop = true;
		}
	} else if (kn->kn_flags & EV_DISPATCH) {
		/* disable all dispatch knotes */
		kn->kn_status |= KN_DISABLED;
	} else if ((kn->kn_flags & EV_CLEAR) == 0) {
		/* re-activate in case there are more events */
		knote_activate(kq, kn, FILTER_ACTIVE);
	}

	/*
	 * callback to handle each event as we find it.
	 * If we have to detach and drop the knote, do
	 * it while we have the kq unlocked.
	 */
	if (drop) {
		knote_drop(kq, kn, &knlc);
	} else {
		knote_unlock(kq, kn, &knlc, KNOTE_KQ_UNLOCK);
	}

	if (kev.flags & EV_VANISHED) {
		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KNOTE_VANISHED),
		    kev.ident, kn->kn_udata, kn->kn_status | (kn->kn_id << 32),
		    kn->kn_filtid);
	}

	/* hand the snapshot to the caller, then retake the kq lock for exit */
	error = (callback)(&kev, kectx);
	kqlock(kq);
	return error;
}
4254 
4255 /*
4256  * Returns -1 if the kqueue was unbound and processing should not happen
4257  */
4258 #define KQWQAE_BEGIN_PROCESSING 1
4259 #define KQWQAE_END_PROCESSING   2
4260 #define KQWQAE_UNBIND           3
/*
 * kqworkq_acknowledge_events - return the suppressed knotes of one workq
 * QoS bucket to their original state, then decide whether the servicer
 * thread should be unbound from the bucket.
 *
 * kqwqae_op encodes the caller's intent (begin/end of processing, or an
 * explicit unbind).  With KEVENT_FLAG_PARKING set, the thread is unbound
 * only when the bucket's queue ran dry; if events remain after unbinding,
 * a new thread request is initiated so they still get serviced.
 *
 * Returns -1 when the thread was unbound, 0 otherwise.
 * Called with the kqworkq locked.
 */
static int
kqworkq_acknowledge_events(struct kqworkq *kqwq, workq_threadreq_t kqr,
    int kevent_flags, int kqwqae_op)
{
	struct knote *kn;
	int rc = 0;
	bool unbind;
	/* tr_kq_qos_index is 1-based; the per-QoS bucket arrays are 0-based */
	struct kqtailq *suppressq = &kqwq->kqwq_suppressed[kqr->tr_kq_qos_index - 1];
	struct kqtailq *queue = &kqwq->kqwq_queue[kqr->tr_kq_qos_index - 1];

	kqlock_held(&kqwq->kqwq_kqueue);

	/*
	 * Return suppressed knotes to their original state.
	 * For workq kqueues, suppressed ones that are still
	 * truly active (not just forced into the queue) will
	 * set flags we check below to see if anything got
	 * woken up.
	 */
	while ((kn = TAILQ_FIRST(suppressq)) != NULL) {
		knote_unsuppress(kqwq, kn);
	}

	if (kqwqae_op == KQWQAE_UNBIND) {
		unbind = true;
	} else if ((kevent_flags & KEVENT_FLAG_PARKING) == 0) {
		unbind = false;
	} else {
		unbind = TAILQ_EMPTY(queue);
	}
	if (unbind) {
		thread_t thread = kqr_thread_fast(kqr);
		thread_qos_t old_override;

#if DEBUG || DEVELOPMENT
		thread_t self = current_thread();
		struct uthread *ut = get_bsdthread_info(self);

		assert(thread == self);
		assert(ut->uu_kqr_bound == kqr);
#endif // DEBUG || DEVELOPMENT

		old_override = kqworkq_unbind_locked(kqwq, kqr, thread);
		if (!TAILQ_EMPTY(queue)) {
			/*
			 * Request a new thread if we didn't process the whole
			 * queue.
			 */
			kqueue_threadreq_initiate(&kqwq->kqwq_kqueue, kqr,
			    kqr->tr_kq_qos_index, 0);
		}
		if (old_override) {
			thread_drop_kevent_override(thread);
		}
		rc = -1;
	}

	return rc;
}
4320 
4321 /*
4322  * Return 0 to indicate that processing should proceed,
4323  * -1 if there is nothing to process.
4324  *
4325  * Called with kqueue locked and returns the same way,
4326  * but may drop lock temporarily.
4327  */
4328 static int
kqworkq_begin_processing(struct kqworkq * kqwq,workq_threadreq_t kqr,int kevent_flags)4329 kqworkq_begin_processing(struct kqworkq *kqwq, workq_threadreq_t kqr,
4330     int kevent_flags)
4331 {
4332 	int rc = 0;
4333 
4334 	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWQ_PROCESS_BEGIN) | DBG_FUNC_START,
4335 	    0, kqr->tr_kq_qos_index);
4336 
4337 	rc = kqworkq_acknowledge_events(kqwq, kqr, kevent_flags,
4338 	    KQWQAE_BEGIN_PROCESSING);
4339 
4340 	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWQ_PROCESS_BEGIN) | DBG_FUNC_END,
4341 	    thread_tid(kqr_thread(kqr)),
4342 	    !TAILQ_EMPTY(&kqwq->kqwq_queue[kqr->tr_kq_qos_index - 1]));
4343 
4344 	return rc;
4345 }
4346 
4347 static thread_qos_t
kqworkloop_acknowledge_events(struct kqworkloop * kqwl)4348 kqworkloop_acknowledge_events(struct kqworkloop *kqwl)
4349 {
4350 	kq_index_t qos = THREAD_QOS_UNSPECIFIED;
4351 	struct knote *kn, *tmp;
4352 
4353 	kqlock_held(kqwl);
4354 
4355 	TAILQ_FOREACH_SAFE(kn, &kqwl->kqwl_suppressed, kn_tqe, tmp) {
4356 		/*
4357 		 * If a knote that can adjust QoS is disabled because of the automatic
4358 		 * behavior of EV_DISPATCH, the knotes should stay suppressed so that
4359 		 * further overrides keep pushing.
4360 		 */
4361 		if (knote_fops(kn)->f_adjusts_qos &&
4362 		    (kn->kn_status & KN_DISABLED) != 0 &&
4363 		    (kn->kn_status & KN_DROPPING) == 0 &&
4364 		    (kn->kn_flags & (EV_DISPATCH | EV_DISABLE)) == EV_DISPATCH) {
4365 			qos = MAX(qos, kn->kn_qos_override);
4366 			continue;
4367 		}
4368 		knote_unsuppress(kqwl, kn);
4369 	}
4370 
4371 	return qos;
4372 }
4373 
/*
 * kqworkloop_begin_processing - mark a workloop as being processed by
 * its servicer thread.
 *
 * When parking (KEVENT_FLAG_PARKING), the servicer may be unbound here;
 * in that case KQ_PROCESSING is cleared again and -1 is returned to tell
 * the caller not to process.
 *
 * Returns 0 when processing should proceed, -1 otherwise.
 * Called with the kqueue locked.
 */
static int
kqworkloop_begin_processing(struct kqworkloop *kqwl, unsigned int kevent_flags)
{
	workq_threadreq_t kqr = &kqwl->kqwl_request;
	struct kqueue *kq = &kqwl->kqwl_kqueue;
	int rc = 0, op = KQWL_UTQ_NONE;

	kqlock_held(kq);

	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_BEGIN) | DBG_FUNC_START,
	    kqwl->kqwl_dynamicid, 0, 0);

	/* nobody else should still be processing */
	assert((kq->kq_state & KQ_PROCESSING) == 0);

	kq->kq_state |= KQ_PROCESSING;

	if (kevent_flags & KEVENT_FLAG_PARKING) {
		/*
		 * When "parking" we want to process events and if no events are found
		 * unbind.
		 *
		 * However, non overcommit threads sometimes park even when they have
		 * more work so that the pool can narrow.  For these, we need to unbind
		 * early, so that calling kqworkloop_update_threads_qos() can ask the
		 * workqueue subsystem whether the thread should park despite having
		 * pending events.
		 */
		if (kqr->tr_flags & WORKQ_TR_FLAG_OVERCOMMIT) {
			op = KQWL_UTQ_PARKING;
		} else {
			op = KQWL_UTQ_UNBINDING;
		}
	} else if (!TAILQ_EMPTY(&kqwl->kqwl_suppressed)) {
		op = KQWL_UTQ_RESET_WAKEUP_OVERRIDE;
	}

	if (op != KQWL_UTQ_NONE) {
		thread_qos_t qos_override;
		thread_t thread = kqr_thread_fast(kqr);

		qos_override = kqworkloop_acknowledge_events(kqwl);

		if (op == KQWL_UTQ_UNBINDING) {
			kqworkloop_unbind_locked(kqwl, thread,
			    KQWL_OVERRIDE_DROP_IMMEDIATELY);
			kqworkloop_release_live(kqwl);
		}
		kqworkloop_update_threads_qos(kqwl, op, qos_override);
		if (op == KQWL_UTQ_PARKING &&
		    (!kqwl->kqwl_count || kqwl->kqwl_owner)) {
			/* no events to deliver (or the loop is owned): really park */
			kqworkloop_unbind_locked(kqwl, thread,
			    KQWL_OVERRIDE_DROP_DELAYED);
			kqworkloop_release_live(kqwl);
			rc = -1;
		} else if (op == KQWL_UTQ_UNBINDING &&
		    kqr_thread(kqr) != thread) {
			/* this thread is no longer the servicer: do not process */
			rc = -1;
		}

		if (rc == -1) {
			kq->kq_state &= ~KQ_PROCESSING;
			kqworkloop_unbind_delayed_override_drop(thread);
		}
	}

	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_BEGIN) | DBG_FUNC_END,
	    kqwl->kqwl_dynamicid, 0, 0);

	return rc;
}
4445 
4446 /*
4447  * Return 0 to indicate that processing should proceed,
4448  * -1 if there is nothing to process.
4449  * EBADF if the kqueue is draining
4450  *
4451  * Called with kqueue locked and returns the same way,
4452  * but may drop lock temporarily.
4453  * May block.
4454  */
static int
kqfile_begin_processing(struct kqfile *kq)
{
	kqlock_held(kq);

	assert((kq->kqf_state & (KQ_WORKQ | KQ_WORKLOOP)) == 0);
	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) | DBG_FUNC_START,
	    VM_KERNEL_UNSLIDE_OR_PERM(kq), 0);

	/* wait to become the exclusive processing thread */
	while ((kq->kqf_state & (KQ_PROCESSING | KQ_DRAIN)) == KQ_PROCESSING) {
		kq->kqf_state |= KQ_PROCWAIT;
		/* woken by kqfile_end_processing() on the &kqf_suppressed channel */
		lck_spin_sleep(&kq->kqf_lock, LCK_SLEEP_DEFAULT,
		    &kq->kqf_suppressed, THREAD_UNINT | THREAD_WAIT_NOREPORT);
	}

	if (kq->kqf_state & KQ_DRAIN) {
		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) | DBG_FUNC_END,
		    VM_KERNEL_UNSLIDE_OR_PERM(kq), 2);
		return EBADF;
	}

	/* Nobody else processing */

	/* anything left to process? */
	if (kq->kqf_count == 0) {
		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) | DBG_FUNC_END,
		    VM_KERNEL_UNSLIDE_OR_PERM(kq), 1);
		return -1;
	}

	/* convert to processing mode */
	kq->kqf_state |= KQ_PROCESSING;

	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) | DBG_FUNC_END,
	    VM_KERNEL_UNSLIDE_OR_PERM(kq), 0);
	return 0;
}
4493 
4494 /*
4495  * Try to end the processing, only called when a workq thread is attempting to
4496  * park (KEVENT_FLAG_PARKING is set).
4497  *
4498  * When returning -1, the kqworkq is setup again so that it is ready to be
4499  * processed.
4500  */
4501 static int
kqworkq_end_processing(struct kqworkq * kqwq,workq_threadreq_t kqr,int kevent_flags)4502 kqworkq_end_processing(struct kqworkq *kqwq, workq_threadreq_t kqr,
4503     int kevent_flags)
4504 {
4505 	if (kevent_flags & KEVENT_FLAG_PARKING) {
4506 		/*
4507 		 * if acknowledge events "succeeds" it means there are events,
4508 		 * which is a failure condition for end_processing.
4509 		 */
4510 		int rc = kqworkq_acknowledge_events(kqwq, kqr, kevent_flags,
4511 		    KQWQAE_END_PROCESSING);
4512 		if (rc == 0) {
4513 			return -1;
4514 		}
4515 	}
4516 
4517 	return 0;
4518 }
4519 
4520 /*
4521  * Try to end the processing, only called when a workq thread is attempting to
4522  * park (KEVENT_FLAG_PARKING is set).
4523  *
4524  * When returning -1, the kqworkq is setup again so that it is ready to be
4525  * processed (as if kqworkloop_begin_processing had just been called).
4526  *
4527  * If successful and KEVENT_FLAG_PARKING was set in the kevent_flags,
4528  * the kqworkloop is unbound from its servicer as a side effect.
4529  */
static int
kqworkloop_end_processing(struct kqworkloop *kqwl, int flags, int kevent_flags)
{
	struct kqueue *kq = &kqwl->kqwl_kqueue;
	workq_threadreq_t kqr = &kqwl->kqwl_request;
	int rc = 0;

	kqlock_held(kq);

	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_END) | DBG_FUNC_START,
	    kqwl->kqwl_dynamicid, 0, 0);

	if (kevent_flags & KEVENT_FLAG_PARKING) {
		thread_t thread = kqr_thread_fast(kqr);
		thread_qos_t qos_override;

		/*
		 * When KEVENT_FLAG_PARKING is set, we need to attempt
		 * an unbind while still under the lock.
		 *
		 * So we do everything kqworkloop_unbind() would do, but because
		 * we're inside kqueue_process(), if the workloop actually
		 * received events while our locks were dropped, we have
		 * the opportunity to fail the end processing and loop again.
		 *
		 * This avoids going through the process-wide workqueue lock
		 * hence scales better.
		 */
		assert(flags & KQ_PROCESSING);
		qos_override = kqworkloop_acknowledge_events(kqwl);
		kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_PARKING, qos_override);

		if (kqwl->kqwl_wakeup_qos && !kqwl->kqwl_owner) {
			/* new wakeups arrived (and nobody owns the loop): fail */
			rc = -1;
		} else {
			kqworkloop_unbind_locked(kqwl, thread, KQWL_OVERRIDE_DROP_DELAYED);
			kqworkloop_release_live(kqwl);
			kq->kq_state &= ~flags;
			kqworkloop_unbind_delayed_override_drop(thread);
		}
	} else {
		kq->kq_state &= ~flags;
		/* NOTE(review): re-arms KQ_R2K_ARMED for future wakeup
		 * notifications — confirm R2K semantics against the flag's
		 * definition. */
		kq->kq_state |= KQ_R2K_ARMED;
		kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_RECOMPUTE_WAKEUP_QOS, 0);
	}

	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_END) | DBG_FUNC_END,
	    kqwl->kqwl_dynamicid, 0, 0);

	return rc;
}
4581 
4582 /*
4583  * Called with kqueue lock held.
4584  *
4585  * 0: no more events
4586  * -1: has more events
4587  * EBADF: kqueue is in draining mode
4588  */
4589 static int
kqfile_end_processing(struct kqfile * kq)4590 kqfile_end_processing(struct kqfile *kq)
4591 {
4592 	struct knote *kn;
4593 	int procwait;
4594 
4595 	kqlock_held(kq);
4596 
4597 	assert((kq->kqf_state & (KQ_WORKQ | KQ_WORKLOOP)) == 0);
4598 
4599 	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_END),
4600 	    VM_KERNEL_UNSLIDE_OR_PERM(kq), 0);
4601 
4602 	/*
4603 	 * Return suppressed knotes to their original state.
4604 	 */
4605 	while ((kn = TAILQ_FIRST(&kq->kqf_suppressed)) != NULL) {
4606 		knote_unsuppress(kq, kn);
4607 	}
4608 
4609 	procwait = (kq->kqf_state & KQ_PROCWAIT);
4610 	kq->kqf_state &= ~(KQ_PROCESSING | KQ_PROCWAIT);
4611 
4612 	if (procwait) {
4613 		/* first wake up any thread already waiting to process */
4614 		thread_wakeup(&kq->kqf_suppressed);
4615 	}
4616 
4617 	if (kq->kqf_state & KQ_DRAIN) {
4618 		return EBADF;
4619 	}
4620 	return kq->kqf_count != 0 ? -1 : 0;
4621 }
4622 
/*
 * kqueue_workloop_ctl_internal - implement the kqueue_workloop_ctl()
 * commands: create a dynamic workloop carrying scheduling parameters,
 * or release the reference a prior create took.
 *
 * Returns 0 on success or an errno; *retval is always set to 0.
 */
static int
kqueue_workloop_ctl_internal(proc_t p, uintptr_t cmd, uint64_t __unused options,
    struct kqueue_workloop_params *params, int *retval)
{
	int error = 0;
	struct kqworkloop *kqwl;
	struct filedesc *fdp = &p->p_fd;
	workq_threadreq_param_t trp = { };

	switch (cmd) {
	case KQ_WORKLOOP_CREATE:
		if (!params->kqwlp_flags) {
			error = EINVAL;
			break;
		}

		/* scheduling priority, when given, must be in [1, MAXPRI_USER] */
		if ((params->kqwlp_flags & KQ_WORKLOOP_CREATE_SCHED_PRI) &&
		    (params->kqwlp_sched_pri < 1 ||
		    params->kqwlp_sched_pri > 63 /* MAXPRI_USER */)) {
			error = EINVAL;
			break;
		}

		if ((params->kqwlp_flags & KQ_WORKLOOP_CREATE_SCHED_POL) &&
		    invalid_policy(params->kqwlp_sched_pol)) {
			error = EINVAL;
			break;
		}

		/* cpu percent must be in (0, 100], refill period in (0, 0x00ffffff] */
		if ((params->kqwlp_flags & KQ_WORKLOOP_CREATE_CPU_PERCENT) &&
		    (params->kqwlp_cpu_percent <= 0 ||
		    params->kqwlp_cpu_percent > 100 ||
		    params->kqwlp_cpu_refillms <= 0 ||
		    params->kqwlp_cpu_refillms > 0x00ffffff)) {
			error = EINVAL;
			break;
		}

		/* translate the validated params into a thread request param block */
		if (params->kqwlp_flags & KQ_WORKLOOP_CREATE_SCHED_PRI) {
			trp.trp_flags |= TRP_PRIORITY;
			trp.trp_pri = (uint8_t)params->kqwlp_sched_pri;
		}
		if (params->kqwlp_flags & KQ_WORKLOOP_CREATE_SCHED_POL) {
			trp.trp_flags |= TRP_POLICY;
			trp.trp_pol = (uint8_t)params->kqwlp_sched_pol;
		}
		if (params->kqwlp_flags & KQ_WORKLOOP_CREATE_CPU_PERCENT) {
			trp.trp_flags |= TRP_CPUPERCENT;
			trp.trp_cpupercent = (uint8_t)params->kqwlp_cpu_percent;
			trp.trp_refillms = params->kqwlp_cpu_refillms;
		}

		error = kqworkloop_get_or_create(p, params->kqwlp_id, &trp,
		    KEVENT_FLAG_DYNAMIC_KQUEUE | KEVENT_FLAG_WORKLOOP |
		    KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST, &kqwl);
		if (error) {
			break;
		}

		if (!fdt_flag_test(fdp, FD_WORKLOOP)) {
			/* FD_WORKLOOP indicates we've ever created a workloop
			 * via this syscall but its only ever added to a process, never
			 * removed.
			 */
			proc_fdlock(p);
			fdt_flag_set(fdp, FD_WORKLOOP);
			proc_fdunlock(p);
		}
		break;
	case KQ_WORKLOOP_DESTROY:
		error = kqworkloop_get_or_create(p, params->kqwlp_id, NULL,
		    KEVENT_FLAG_DYNAMIC_KQUEUE | KEVENT_FLAG_WORKLOOP |
		    KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST, &kqwl);
		if (error) {
			break;
		}
		kqlock(kqwl);
		trp.trp_value = kqwl->kqwl_params;
		if (trp.trp_flags && !(trp.trp_flags & TRP_RELEASED)) {
			/* drop the creation reference exactly once */
			trp.trp_flags |= TRP_RELEASED;
			kqwl->kqwl_params = trp.trp_value;
			kqworkloop_release_live(kqwl);
		} else {
			error = EINVAL;
		}
		kqunlock(kqwl);
		kqworkloop_release(kqwl);
		break;
	}
	*retval = 0;
	return error;
}
4715 
4716 int
kqueue_workloop_ctl(proc_t p,struct kqueue_workloop_ctl_args * uap,int * retval)4717 kqueue_workloop_ctl(proc_t p, struct kqueue_workloop_ctl_args *uap, int *retval)
4718 {
4719 	struct kqueue_workloop_params params = {
4720 		.kqwlp_id = 0,
4721 	};
4722 	if (uap->sz < sizeof(params.kqwlp_version)) {
4723 		return EINVAL;
4724 	}
4725 
4726 	size_t copyin_sz = MIN(sizeof(params), uap->sz);
4727 	int rv = copyin(uap->addr, &params, copyin_sz);
4728 	if (rv) {
4729 		return rv;
4730 	}
4731 
4732 	if (params.kqwlp_version != (int)uap->sz) {
4733 		return EINVAL;
4734 	}
4735 
4736 	return kqueue_workloop_ctl_internal(p, uap->cmd, uap->options, &params,
4737 	           retval);
4738 }
4739 
4740 static int
kqueue_select(struct fileproc * fp,int which,void * wql,__unused vfs_context_t ctx)4741 kqueue_select(struct fileproc *fp, int which, void *wql, __unused vfs_context_t ctx)
4742 {
4743 	struct kqfile *kq = (struct kqfile *)fp_get_data(fp);
4744 	int retnum = 0;
4745 
4746 	assert((kq->kqf_state & (KQ_WORKLOOP | KQ_WORKQ)) == 0);
4747 
4748 	if (which == FREAD) {
4749 		kqlock(kq);
4750 		if (kqfile_begin_processing(kq) == 0) {
4751 			retnum = kq->kqf_count;
4752 			kqfile_end_processing(kq);
4753 		} else if ((kq->kqf_state & KQ_DRAIN) == 0) {
4754 			selrecord(kq->kqf_p, &kq->kqf_sel, wql);
4755 		}
4756 		kqunlock(kq);
4757 	}
4758 	return retnum;
4759 }
4760 
4761 /*
4762  * kqueue_close -
4763  */
4764 static int
kqueue_close(struct fileglob * fg,__unused vfs_context_t ctx)4765 kqueue_close(struct fileglob *fg, __unused vfs_context_t ctx)
4766 {
4767 	struct kqfile *kqf = fg_get_data(fg);
4768 
4769 	assert((kqf->kqf_state & (KQ_WORKLOOP | KQ_WORKQ)) == 0);
4770 	kqlock(kqf);
4771 	selthreadclear(&kqf->kqf_sel);
4772 	kqunlock(kqf);
4773 	kqueue_dealloc(&kqf->kqf_kqueue);
4774 	fg_set_data(fg, NULL);
4775 	return 0;
4776 }
4777 
4778 /*
4779  * Max depth of the nested kq path that can be created.
4780  * Note that this has to be less than the size of kq_level
4781  * to avoid wrapping around and mislabeling the level. We also
4782  * want to be aggressive about this so that we don't overflow the
4783  * kernel stack while posting kevents
4784  */
4785 #define MAX_NESTED_KQ 10
4786 
4787 /*
4788  * The callers has taken a use-count reference on this kqueue and will donate it
4789  * to the kqueue we are being added to.  This keeps the kqueue from closing until
4790  * that relationship is torn down.
4791  */
static int
kqueue_kqfilter(struct fileproc *fp, struct knote *kn,
    __unused struct kevent_qos_s *kev)
{
	struct kqfile *kqf = (struct kqfile *)fp_get_data(fp);
	struct kqueue *kq = &kqf->kqf_kqueue;
	struct kqueue *parentkq = knote_get_kq(kn);

	assert((kqf->kqf_state & (KQ_WORKLOOP | KQ_WORKQ)) == 0);

	/* a kqueue may only be watched for read, and never watch itself */
	if (parentkq == kq || kn->kn_filter != EVFILT_READ) {
		knote_set_error(kn, EINVAL);
		return 0;
	}

	/*
	 * We have to avoid creating a cycle when nesting kqueues
	 * inside another.  Rather than trying to walk the whole
	 * potential DAG of nested kqueues, we just use a simple
	 * ceiling protocol.  When a kqueue is inserted into another,
	 * we check that the (future) parent is not already nested
	 * into another kqueue at a lower level than the potenial
	 * child (because it could indicate a cycle).  If that test
	 * passes, we just mark the nesting levels accordingly.
	 *
	 * Only up to MAX_NESTED_KQ can be nested.
	 *
	 * Note: kqworkq and kqworkloop cannot be nested and have reused their
	 *       kq_level field, so ignore these as parent.
	 */

	kqlock(parentkq);

	if ((parentkq->kq_state & (KQ_WORKQ | KQ_WORKLOOP)) == 0) {
		if (parentkq->kq_level > 0 &&
		    parentkq->kq_level < kq->kq_level) {
			/* parent already sits below the child: could form a cycle */
			kqunlock(parentkq);
			knote_set_error(kn, EINVAL);
			return 0;
		}

		/* set parent level appropriately */
		/* a previously un-nested parent starts at level 2 (one above a fresh child) */
		uint16_t plevel = (parentkq->kq_level == 0)? 2: parentkq->kq_level;
		if (plevel < kq->kq_level + 1) {
			if (kq->kq_level + 1 > MAX_NESTED_KQ) {
				kqunlock(parentkq);
				knote_set_error(kn, EINVAL);
				return 0;
			}
			plevel = kq->kq_level + 1;
		}

		parentkq->kq_level = plevel;
	}

	kqunlock(parentkq);

	kn->kn_filtid = EVFILTID_KQREAD;
	kqlock(kq);
	KNOTE_ATTACH(&kqf->kqf_sel.si_note, kn);
	/* indicate nesting in child, if needed */
	if (kq->kq_level == 0) {
		kq->kq_level = 1;
	}

	/* report the knote as fired when the child already has queued events */
	int count = kq->kq_count;
	kqunlock(kq);
	return count > 0;
}
4861 
/*
 * kqfile_wakeup - wake every kind of waiter parked on a file-backed kqueue.
 *
 * Called with the kqlock held.  `hint` is the knote hint being delivered;
 * NOTE_REVOKE means the kqueue is being drained/closed.  `wr` is the
 * wait_result handed to threads blocked in kqueue_scan().
 */
__attribute__((noinline))
static void
kqfile_wakeup(struct kqfile *kqf, long hint, wait_result_t wr)
{
	/* wakeup a thread waiting on this queue */
	selwakeup(&kqf->kqf_sel);

	/* wake up threads in kqueue_scan() */
	if (kqf->kqf_state & KQ_SLEEP) {
		kqf->kqf_state &= ~KQ_SLEEP;
		thread_wakeup_with_result(&kqf->kqf_count, wr);
	}

	if (hint == NOTE_REVOKE) {
		/* wakeup threads waiting their turn to process */
		if (kqf->kqf_state & KQ_PROCWAIT) {
			assert(kqf->kqf_state & KQ_PROCESSING);
			kqf->kqf_state &= ~KQ_PROCWAIT;
			thread_wakeup(&kqf->kqf_suppressed);
		}

		/* no need to KNOTE: knote_fdclose() takes care of it */
	} else {
		/* wakeup other kqueues/select sets we're inside */
		KNOTE(&kqf->kqf_sel.si_note, hint);
	}
}
4889 
/*
 * kqueue_drain - called when kq is closed
 *
 * Marks the kqueue as draining so new waiters bail out, and revokes all
 * current waiters (select, scan, and processing threads).
 */
static int
kqueue_drain(struct fileproc *fp, __unused vfs_context_t ctx)
{
	struct kqfile *kqf = (struct kqfile *)fp_get_data(fp);

	/* draining only applies to file-backed kqueues */
	assert((kqf->kqf_state & (KQ_WORKLOOP | KQ_WORKQ)) == 0);

	kqlock(kqf);
	kqf->kqf_state |= KQ_DRAIN;
	/* THREAD_RESTART tells blocked scanners to give up rather than retry */
	kqfile_wakeup(kqf, NOTE_REVOKE, THREAD_RESTART);
	kqunlock(kqf);
	return 0;
}
4906 
4907 int
kqueue_stat(struct kqueue * kq,void * ub,int isstat64,proc_t p)4908 kqueue_stat(struct kqueue *kq, void *ub, int isstat64, proc_t p)
4909 {
4910 	assert((kq->kq_state & (KQ_WORKLOOP | KQ_WORKQ)) == 0);
4911 
4912 	kqlock(kq);
4913 	if (isstat64 != 0) {
4914 		struct stat64 *sb64 = (struct stat64 *)ub;
4915 
4916 		bzero((void *)sb64, sizeof(*sb64));
4917 		sb64->st_size = kq->kq_count;
4918 		if (kq->kq_state & KQ_KEV_QOS) {
4919 			sb64->st_blksize = sizeof(struct kevent_qos_s);
4920 		} else if (kq->kq_state & KQ_KEV64) {
4921 			sb64->st_blksize = sizeof(struct kevent64_s);
4922 		} else if (IS_64BIT_PROCESS(p)) {
4923 			sb64->st_blksize = sizeof(struct user64_kevent);
4924 		} else {
4925 			sb64->st_blksize = sizeof(struct user32_kevent);
4926 		}
4927 		sb64->st_mode = S_IFIFO;
4928 	} else {
4929 		struct stat *sb = (struct stat *)ub;
4930 
4931 		bzero((void *)sb, sizeof(*sb));
4932 		sb->st_size = kq->kq_count;
4933 		if (kq->kq_state & KQ_KEV_QOS) {
4934 			sb->st_blksize = sizeof(struct kevent_qos_s);
4935 		} else if (kq->kq_state & KQ_KEV64) {
4936 			sb->st_blksize = sizeof(struct kevent64_s);
4937 		} else if (IS_64BIT_PROCESS(p)) {
4938 			sb->st_blksize = sizeof(struct user64_kevent);
4939 		} else {
4940 			sb->st_blksize = sizeof(struct user32_kevent);
4941 		}
4942 		sb->st_mode = S_IFIFO;
4943 	}
4944 	kqunlock(kq);
4945 	return 0;
4946 }
4947 
4948 static inline bool
kqueue_threadreq_can_use_ast(struct kqueue * kq)4949 kqueue_threadreq_can_use_ast(struct kqueue *kq)
4950 {
4951 	if (current_proc() == kq->kq_p) {
4952 		/*
4953 		 * Setting an AST from a non BSD syscall is unsafe: mach_msg_trap() can
4954 		 * do combined send/receive and in the case of self-IPC, the AST may bet
4955 		 * set on a thread that will not return to userspace and needs the
4956 		 * thread the AST would create to unblock itself.
4957 		 *
4958 		 * At this time, we really want to target:
4959 		 *
4960 		 * - kevent variants that can cause thread creations, and dispatch
4961 		 *   really only uses kevent_qos and kevent_id,
4962 		 *
4963 		 * - workq_kernreturn (directly about thread creations)
4964 		 *
4965 		 * - bsdthread_ctl which is used for qos changes and has direct impact
4966 		 *   on the creator thread scheduling decisions.
4967 		 */
4968 		switch (current_uthread()->syscall_code) {
4969 		case SYS_kevent_qos:
4970 		case SYS_kevent_id:
4971 		case SYS_workq_kernreturn:
4972 		case SYS_bsdthread_ctl:
4973 			return true;
4974 		}
4975 	}
4976 	return false;
4977 }
4978 
/*
 * Interact with the pthread kext to request a servicing there at a specific QoS
 * level.
 *
 * - Caller holds the kqlock
 *
 * - May be called with the kqueue's wait queue set locked,
 *   so cannot do anything that could recurse on that.
 */
static void
kqueue_threadreq_initiate(kqueue_t kqu, workq_threadreq_t kqr,
    kq_index_t qos, int flags)
{
	/* a request may only be initiated when no thread is bound or requested */
	assert(kqr_thread(kqr) == THREAD_NULL);
	assert(!kqr_thread_requested(kqr));
	struct turnstile *ts = TURNSTILE_NULL;

	/* exiting process: all kqueues are about to be torn down, don't ask */
	if (workq_is_exiting(kqu.kq->kq_p)) {
		return;
	}

	kqlock_held(kqu);

	if (kqu.kq->kq_state & KQ_WORKLOOP) {
		struct kqworkloop *kqwl = kqu.kqwl;

		/* an owned workloop never requests a servicer */
		assert(kqwl->kqwl_owner == THREAD_NULL);
		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_THREQUEST),
		    kqwl->kqwl_dynamicid, 0, qos, kqwl->kqwl_wakeup_qos);
		ts = kqwl->kqwl_turnstile;
		/* Add a thread request reference on the kqueue. */
		kqworkloop_retain(kqwl);

#if CONFIG_PREADOPT_TG
		/* This thread is the one which is ack-ing the thread group on the kqwl
		 * under the kqlock and will take action accordingly, pairs with the
		 * release barrier in kqueue_set_preadopted_thread_group */
		uint16_t tg_acknowledged;
		if (os_atomic_cmpxchgv(&kqwl->kqwl_preadopt_tg_needs_redrive,
		    KQWL_PREADOPT_TG_NEEDS_REDRIVE, KQWL_PREADOPT_TG_CLEAR_REDRIVE,
		    &tg_acknowledged, acquire)) {
			flags |= WORKQ_THREADREQ_REEVALUATE_PREADOPT_TG;
		}
#endif
	} else {
		assert(kqu.kq->kq_state & KQ_WORKQ);
		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWQ_THREQUEST), -1, 0, qos,
		    !TAILQ_EMPTY(&kqu.kqwq->kqwq_queue[kqr->tr_kq_qos_index - 1]));
	}

	/*
	 * New-style thread request supported.
	 * Provide the pthread kext a pointer to a workq_threadreq_s structure for
	 * its use until a corresponding kqueue_threadreq_bind callback.
	 */
	if (kqueue_threadreq_can_use_ast(kqu.kq)) {
		flags |= WORKQ_THREADREQ_SET_AST_ON_FAILURE;
	}
	/* translate the kqueue manager QoS token into the workq's encoding */
	if (qos == KQWQ_QOS_MANAGER) {
		qos = WORKQ_THREAD_QOS_MANAGER;
	}

	if (!workq_kern_threadreq_initiate(kqu.kq->kq_p, kqr, ts, qos, flags)) {
		/*
		 * Process is shutting down or exec'ing.
		 * All the kqueues are going to be cleaned up
		 * soon. Forget we even asked for a thread -
		 * and make sure we don't ask for more.
		 */
		kqu.kq->kq_state &= ~KQ_R2K_ARMED;
		kqueue_release_live(kqu);
	}
}
5052 
/*
 * kqueue_threadreq_bind_prepost - prepost the bind to kevent
 *
 * This is used when kqueue_threadreq_bind may cause a lock inversion.
 * Records the thread on the request in the BINDING state; the bind is
 * finished later by kqueue_threadreq_bind_commit.
 */
__attribute__((always_inline))
void
kqueue_threadreq_bind_prepost(struct proc *p __unused, workq_threadreq_t kqr,
    struct uthread *ut)
{
	/* link thread and request both ways; commit completes the transition */
	ut->uu_kqr_bound = kqr;
	kqr->tr_thread = get_machthread(ut);
	kqr->tr_state = WORKQ_TR_STATE_BINDING;
}
5067 
/*
 * kqueue_threadreq_bind_commit - commit a bind prepost
 *
 * The workq code has to commit any binding prepost before the thread has
 * a chance to come back to userspace (and do kevent syscalls) or be aborted.
 */
void
kqueue_threadreq_bind_commit(struct proc *p, thread_t thread)
{
	struct uthread *ut = get_bsdthread_info(thread);
	workq_threadreq_t kqr = ut->uu_kqr_bound;
	kqueue_t kqu = kqr_kqueue(p, kqr);

	kqlock(kqu);
	/* the bind may already have been completed (or torn down) elsewhere */
	if (kqr->tr_state == WORKQ_TR_STATE_BINDING) {
		kqueue_threadreq_bind(p, kqr, thread, 0);
	}
	kqunlock(kqu);
}
5087 
/*
 * kqueue_threadreq_modify - update the QoS of an in-flight thread request.
 *
 * Forwards the new QoS (and any preadopt thread-group redrive, when enabled)
 * to the workq subsystem.  Caller holds the kqlock and the request must be
 * pending (no thread bound yet).
 */
static void
kqueue_threadreq_modify(kqueue_t kqu, workq_threadreq_t kqr, kq_index_t qos,
    workq_kern_threadreq_flags_t flags)
{
	assert(kqr_thread_requested_pending(kqr));

	kqlock_held(kqu);

	if (kqueue_threadreq_can_use_ast(kqu.kq)) {
		flags |= WORKQ_THREADREQ_SET_AST_ON_FAILURE;
	}

#if CONFIG_PREADOPT_TG
	if (kqu.kq->kq_state & KQ_WORKLOOP) {
		uint16_t tg_ack_status;
		struct kqworkloop *kqwl = kqu.kqwl;

		/* This thread is the one which is ack-ing the thread group on the kqwl
		 * under the kqlock and will take action accordingly, needs acquire
		 * barrier */
		if (os_atomic_cmpxchgv(&kqwl->kqwl_preadopt_tg_needs_redrive, KQWL_PREADOPT_TG_NEEDS_REDRIVE,
		    KQWL_PREADOPT_TG_CLEAR_REDRIVE, &tg_ack_status, acquire)) {
			flags |= WORKQ_THREADREQ_REEVALUATE_PREADOPT_TG;
		}
	}
#endif

	workq_kern_threadreq_modify(kqu.kq->kq_p, kqr, qos, flags);
}
5117 
5118 /*
5119  * kqueue_threadreq_bind - bind thread to processing kqrequest
5120  *
5121  * The provided thread will be responsible for delivering events
5122  * associated with the given kqrequest.  Bind it and get ready for
5123  * the thread to eventually arrive.
5124  */
5125 void
kqueue_threadreq_bind(struct proc * p,workq_threadreq_t kqr,thread_t thread,unsigned int flags)5126 kqueue_threadreq_bind(struct proc *p, workq_threadreq_t kqr, thread_t thread,
5127     unsigned int flags)
5128 {
5129 	kqueue_t kqu = kqr_kqueue(p, kqr);
5130 	struct uthread *ut = get_bsdthread_info(thread);
5131 
5132 	kqlock_held(kqu);
5133 
5134 	assert(ut->uu_kqueue_override == 0);
5135 
5136 	if (kqr->tr_state == WORKQ_TR_STATE_BINDING) {
5137 		assert(ut->uu_kqr_bound == kqr);
5138 		assert(kqr->tr_thread == thread);
5139 	} else {
5140 		assert(kqr_thread_requested_pending(kqr));
5141 		assert(kqr->tr_thread == THREAD_NULL);
5142 		assert(ut->uu_kqr_bound == NULL);
5143 		ut->uu_kqr_bound = kqr;
5144 		kqr->tr_thread = thread;
5145 	}
5146 
5147 	kqr->tr_state = WORKQ_TR_STATE_BOUND;
5148 
5149 	if (kqu.kq->kq_state & KQ_WORKLOOP) {
5150 		struct turnstile *ts = kqu.kqwl->kqwl_turnstile;
5151 
5152 		if (__improbable(thread == kqu.kqwl->kqwl_owner)) {
5153 			/*
5154 			 * <rdar://problem/38626999> shows that asserting here is not ok.
5155 			 *
5156 			 * This is not supposed to happen for correct use of the interface,
5157 			 * but it is sadly possible for userspace (with the help of memory
5158 			 * corruption, such as over-release of a dispatch queue) to make
5159 			 * the creator thread the "owner" of a workloop.
5160 			 *
5161 			 * Once that happens, and that creator thread picks up the same
5162 			 * workloop as a servicer, we trip this codepath. We need to fixup
5163 			 * the state to forget about this thread being the owner, as the
5164 			 * entire workloop state machine expects servicers to never be
5165 			 * owners and everything would basically go downhill from here.
5166 			 */
5167 			kqu.kqwl->kqwl_owner = THREAD_NULL;
5168 			if (kqworkloop_override(kqu.kqwl)) {
5169 				thread_drop_kevent_override(thread);
5170 			}
5171 		}
5172 
5173 		if (ts && (flags & KQUEUE_THREADERQ_BIND_NO_INHERITOR_UPDATE) == 0) {
5174 			/*
5175 			 * Past this point, the interlock is the kq req lock again,
5176 			 * so we can fix the inheritor for good.
5177 			 */
5178 			filt_wlupdate_inheritor(kqu.kqwl, ts, TURNSTILE_IMMEDIATE_UPDATE);
5179 			turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
5180 		}
5181 
5182 		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_BIND), kqu.kqwl->kqwl_dynamicid,
5183 		    thread_tid(thread), kqr->tr_kq_qos_index,
5184 		    (kqr->tr_kq_override_index << 16) | kqwl->kqwl_wakeup_qos);
5185 
5186 		ut->uu_kqueue_override = kqr->tr_kq_override_index;
5187 		if (kqr->tr_kq_override_index) {
5188 			thread_add_servicer_override(thread, kqr->tr_kq_override_index);
5189 		}
5190 
5191 #if CONFIG_PREADOPT_TG
5192 		/* Remove reference from kqwl and mark it as bound with the SENTINEL */
5193 		thread_group_qos_t old_tg;
5194 		thread_group_qos_t new_tg;
5195 		int ret = os_atomic_rmw_loop(kqr_preadopt_thread_group_addr(kqr), old_tg, new_tg, relaxed, {
5196 			if (old_tg == KQWL_PREADOPTED_TG_NEVER) {
5197 			        os_atomic_rmw_loop_give_up(break); // It's an app, nothing to do
5198 			}
5199 			assert(old_tg != KQWL_PREADOPTED_TG_PROCESSED);
5200 			new_tg = KQWL_PREADOPTED_TG_SENTINEL;
5201 		});
5202 
5203 		if (ret) {
5204 			KQWL_PREADOPT_TG_HISTORY_WRITE_ENTRY(kqu.kqwl, KQWL_PREADOPT_OP_SERVICER_BIND, old_tg, new_tg);
5205 
5206 			if (KQWL_HAS_VALID_PREADOPTED_TG(old_tg)) {
5207 				struct thread_group *tg = KQWL_GET_PREADOPTED_TG(old_tg);
5208 				assert(tg != NULL);
5209 
5210 				thread_set_preadopt_thread_group(thread, tg);
5211 				thread_group_release_live(tg); // The thread has a reference
5212 			} else {
5213 				/*
5214 				 * The thread may already have a preadopt thread group on it -
5215 				 * we need to make sure to clear that.
5216 				 */
5217 				thread_set_preadopt_thread_group(thread, NULL);
5218 			}
5219 
5220 			/* We have taken action on the preadopted thread group set on the
5221 			 * set on the kqwl, clear any redrive requests */
5222 			os_atomic_store(&kqu.kqwl->kqwl_preadopt_tg_needs_redrive, KQWL_PREADOPT_TG_CLEAR_REDRIVE, relaxed);
5223 		}
5224 #endif
5225 		kqueue_update_iotier_override(kqu);
5226 	} else {
5227 		assert(kqr->tr_kq_override_index == 0);
5228 
5229 #if CONFIG_PREADOPT_TG
5230 		/*
5231 		 * The thread may have a preadopt thread group on it already because it
5232 		 * got tagged with it as a creator thread. So we need to make sure to
5233 		 * clear that since we don't have preadopt thread groups for non-kqwl
5234 		 * cases
5235 		 */
5236 		thread_set_preadopt_thread_group(thread, NULL);
5237 #endif
5238 		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWQ_BIND), -1,
5239 		    thread_tid(thread), kqr->tr_kq_qos_index,
5240 		    (kqr->tr_kq_override_index << 16) |
5241 		    !TAILQ_EMPTY(&kqu.kqwq->kqwq_queue[kqr->tr_kq_qos_index - 1]));
5242 	}
5243 }
5244 
5245 /*
5246  * kqueue_threadreq_cancel - abort a pending thread request
5247  *
5248  * Called when exiting/exec'ing. Forget our pending request.
5249  */
5250 void
kqueue_threadreq_cancel(struct proc * p,workq_threadreq_t kqr)5251 kqueue_threadreq_cancel(struct proc *p, workq_threadreq_t kqr)
5252 {
5253 	kqueue_release(kqr_kqueue(p, kqr));
5254 }
5255 
5256 workq_threadreq_param_t
kqueue_threadreq_workloop_param(workq_threadreq_t kqr)5257 kqueue_threadreq_workloop_param(workq_threadreq_t kqr)
5258 {
5259 	struct kqworkloop *kqwl;
5260 	workq_threadreq_param_t trp;
5261 
5262 	assert(kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP);
5263 	kqwl = __container_of(kqr, struct kqworkloop, kqwl_request);
5264 	trp.trp_value = kqwl->kqwl_params;
5265 	return trp;
5266 }
5267 
5268 /*
5269  *	kqueue_threadreq_unbind - unbind thread from processing kqueue
5270  *
5271  *	End processing the per-QoS bucket of events and allow other threads
5272  *	to be requested for future servicing.
5273  *
5274  *	caller holds a reference on the kqueue.
5275  */
5276 void
kqueue_threadreq_unbind(struct proc * p,workq_threadreq_t kqr)5277 kqueue_threadreq_unbind(struct proc *p, workq_threadreq_t kqr)
5278 {
5279 	if (kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
5280 		kqworkloop_unbind(kqr_kqworkloop(kqr));
5281 	} else {
5282 		kqworkq_unbind(p, kqr);
5283 	}
5284 }
5285 
/*
 * If we aren't already busy processing events [for this QoS],
 * request workq thread support as appropriate.
 *
 * TBD - for now, we don't segregate out processing by QoS.
 *
 * - May be called with the kqueue's wait queue set locked,
 *   so cannot do anything that could recurse on that.
 */
static void
kqworkq_wakeup(struct kqworkq *kqwq, kq_index_t qos_index)
{
	workq_threadreq_t kqr = kqworkq_get_request(kqwq, qos_index);

	/* convert to thread qos value */
	assert(qos_index > 0 && qos_index <= KQWQ_NBUCKETS);

	/* one outstanding request per bucket; don't double-request */
	if (!kqr_thread_requested(kqr)) {
		kqueue_threadreq_initiate(&kqwq->kqwq_kqueue, kqr, qos_index, 0);
	}
}
5307 
5308 /*
5309  * This represent the asynchronous QoS a given workloop contributes,
5310  * hence is the max of the current active knotes (override index)
5311  * and the workloop max qos (userspace async qos).
5312  */
5313 static kq_index_t
kqworkloop_override(struct kqworkloop * kqwl)5314 kqworkloop_override(struct kqworkloop *kqwl)
5315 {
5316 	workq_threadreq_t kqr = &kqwl->kqwl_request;
5317 	return MAX(kqr->tr_kq_qos_index, kqr->tr_kq_override_index);
5318 }
5319 
/*
 * kqworkloop_request_fire_r2k_notification - deliver a pending
 * return-to-kernel notification to the bound servicer.
 *
 * If the workloop was armed for a return-to-kernel event, disarm it and
 * set the kevent AST on the servicer thread.  Caller holds the kqlock;
 * a servicer must be bound while KQ_R2K_ARMED is set (kqr_thread_fast).
 */
static inline void
kqworkloop_request_fire_r2k_notification(struct kqworkloop *kqwl)
{
	workq_threadreq_t kqr = &kqwl->kqwl_request;

	kqlock_held(kqwl);

	if (kqwl->kqwl_state & KQ_R2K_ARMED) {
		/* one-shot: disarm before poking the servicer */
		kqwl->kqwl_state &= ~KQ_R2K_ARMED;
		act_set_astkevent(kqr_thread_fast(kqr), AST_KEVENT_RETURN_TO_KERNEL);
	}
}
5332 
/*
 * kqworkloop_update_threads_qos - central QoS state machine for a workloop.
 *
 * Applies one of the KQWL_UTQ_* operations to the workloop's wakeup QoS and
 * override index, then propagates the resulting override delta to whichever
 * party currently represents the workloop: the userspace owner thread, the
 * bound servicer, or the in-flight thread request.  May initiate a new
 * thread request when there is pending work and nobody to service it.
 *
 * Caller holds the kqlock.
 */
static void
kqworkloop_update_threads_qos(struct kqworkloop *kqwl, int op, kq_index_t qos)
{
	workq_threadreq_t kqr = &kqwl->kqwl_request;
	struct kqueue *kq = &kqwl->kqwl_kqueue;
	/* snapshot the override before mutating state so we can diff below */
	kq_index_t old_override = kqworkloop_override(kqwl);

	kqlock_held(kqwl);

	switch (op) {
	case KQWL_UTQ_UPDATE_WAKEUP_QOS:
		kqwl->kqwl_wakeup_qos = qos;
		kqworkloop_request_fire_r2k_notification(kqwl);
		goto recompute;

	case KQWL_UTQ_RESET_WAKEUP_OVERRIDE:
		kqr->tr_kq_override_index = qos;
		goto recompute;

	case KQWL_UTQ_PARKING:
	case KQWL_UTQ_UNBINDING:
		kqr->tr_kq_override_index = qos;
		OS_FALLTHROUGH;

	case KQWL_UTQ_RECOMPUTE_WAKEUP_QOS:
		if (op == KQWL_UTQ_RECOMPUTE_WAKEUP_QOS) {
			assert(qos == THREAD_QOS_UNSPECIFIED);
		}
		/* suppressed knotes may still push an override; only clear if none */
		if (TAILQ_EMPTY(&kqwl->kqwl_suppressed)) {
			kqr->tr_kq_override_index = THREAD_QOS_UNSPECIFIED;
		}
		/* rescan buckets highest-QoS first to find the new wakeup QoS */
		kqwl->kqwl_wakeup_qos = 0;
		for (kq_index_t i = KQWL_NBUCKETS; i > 0; i--) {
			if (!TAILQ_EMPTY(&kqwl->kqwl_queue[i - 1])) {
				kqwl->kqwl_wakeup_qos = i;
				kqworkloop_request_fire_r2k_notification(kqwl);
				break;
			}
		}
		OS_FALLTHROUGH;

	case KQWL_UTQ_UPDATE_WAKEUP_OVERRIDE:
recompute:
		/*
		 * When modifying the wakeup QoS or the override QoS, we always need to
		 * maintain our invariant that kqr_override_index is at least as large
		 * as the highest QoS for which an event is fired.
		 *
		 * However this override index can be larger when there is an overridden
		 * suppressed knote pushing on the kqueue.
		 */
		if (qos < kqwl->kqwl_wakeup_qos) {
			qos = kqwl->kqwl_wakeup_qos;
		}
		if (kqr->tr_kq_override_index < qos) {
			kqr->tr_kq_override_index = qos;
		}
		break;

	case KQWL_UTQ_REDRIVE_EVENTS:
		/* no state change: just re-run the propagation logic below */
		break;

	case KQWL_UTQ_SET_QOS_INDEX:
		kqr->tr_kq_qos_index = qos;
		break;

	default:
		panic("unknown kqwl thread qos update operation: %d", op);
	}

	thread_t kqwl_owner = kqwl->kqwl_owner;
	thread_t servicer = kqr_thread(kqr);
	boolean_t qos_changed = FALSE;
	kq_index_t new_override = kqworkloop_override(kqwl);

	/*
	 * Apply the diffs to the owner if applicable
	 */
	if (kqwl_owner) {
#if 0
		/* JMM - need new trace hooks for owner overrides */
		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_THADJUST),
		    kqwl->kqwl_dynamicid, thread_tid(kqwl_owner), kqr->tr_kq_qos_index,
		    (kqr->tr_kq_override_index << 16) | kqwl->kqwl_wakeup_qos);
#endif
		if (new_override == old_override) {
			// nothing to do
		} else if (old_override == THREAD_QOS_UNSPECIFIED) {
			thread_add_kevent_override(kqwl_owner, new_override);
		} else if (new_override == THREAD_QOS_UNSPECIFIED) {
			thread_drop_kevent_override(kqwl_owner);
		} else { /*  old_override != new_override */
			thread_update_kevent_override(kqwl_owner, new_override);
		}
	}

	/*
	 * apply the diffs to the servicer
	 */

	if (!kqr_thread_requested(kqr)) {
		/*
		 * No servicer, nor thread-request
		 *
		 * Make a new thread request, unless there is an owner (or the workloop
		 * is suspended in userland) or if there is no asynchronous work in the
		 * first place.
		 */

		if (kqwl_owner == NULL && kqwl->kqwl_wakeup_qos) {
			int initiate_flags = 0;
			if (op == KQWL_UTQ_UNBINDING) {
				initiate_flags = WORKQ_THREADREQ_ATTEMPT_REBIND;
			}

			/* kqueue_threadreq_initiate handles the acknowledgement of the TG
			 * if needed */
			kqueue_threadreq_initiate(kq, kqr, new_override, initiate_flags);
		}
	} else if (servicer) {
		/*
		 * Servicer in flight
		 *
		 * Just apply the diff to the servicer
		 */

#if CONFIG_PREADOPT_TG
		/* When there's a servicer for the kqwl already, then the servicer will
		 * adopt the thread group in the kqr, we don't need to poke the
		 * workqueue subsystem to make different decisions due to the thread
		 * group. Consider the current request ack-ed.
		 */
		os_atomic_store(&kqwl->kqwl_preadopt_tg_needs_redrive, KQWL_PREADOPT_TG_CLEAR_REDRIVE, relaxed);
#endif

		struct uthread *ut = get_bsdthread_info(servicer);
		if (ut->uu_kqueue_override != new_override) {
			if (ut->uu_kqueue_override == THREAD_QOS_UNSPECIFIED) {
				thread_add_servicer_override(servicer, new_override);
			} else if (new_override == THREAD_QOS_UNSPECIFIED) {
				thread_drop_servicer_override(servicer);
			} else { /* ut->uu_kqueue_override != new_override */
				thread_update_servicer_override(servicer, new_override);
			}
			ut->uu_kqueue_override = new_override;
			qos_changed = TRUE;
		}
	} else if (new_override == THREAD_QOS_UNSPECIFIED) {
		/*
		 * No events to deliver anymore.
		 *
		 * However canceling with turnstiles is challenging, so the fact that
		 * the request isn't useful will be discovered by the servicer himself
		 * later on.
		 */
	} else if (old_override != new_override) {
		/*
		 * Request is in flight
		 *
		 * Apply the diff to the thread request.
		 */
		kqueue_threadreq_modify(kq, kqr, new_override, WORKQ_THREADREQ_NONE);
		qos_changed = TRUE;
	}

	if (qos_changed) {
		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_THADJUST), kqwl->kqwl_dynamicid,
		    thread_tid(servicer), kqr->tr_kq_qos_index,
		    (kqr->tr_kq_override_index << 16) | kqwl->kqwl_wakeup_qos);
	}
}
5504 
5505 static void
kqworkloop_update_iotier_override(struct kqworkloop * kqwl)5506 kqworkloop_update_iotier_override(struct kqworkloop *kqwl)
5507 {
5508 	workq_threadreq_t kqr = &kqwl->kqwl_request;
5509 	thread_t servicer = kqr_thread(kqr);
5510 	uint8_t iotier = os_atomic_load(&kqwl->kqwl_iotier_override, relaxed);
5511 
5512 	kqlock_held(kqwl);
5513 
5514 	if (servicer) {
5515 		thread_update_servicer_iotier_override(servicer, iotier);
5516 	}
5517 }
5518 
/*
 * kqworkloop_wakeup - note that an event fired at `qos` on this workloop.
 *
 * Raises the workloop's wakeup QoS (and propagates overrides / requests a
 * servicer) unless the wakeup is redundant or the current servicer is in
 * the middle of processing and will recompute QoS itself.  Caller holds
 * the kqlock.
 */
static void
kqworkloop_wakeup(struct kqworkloop *kqwl, kq_index_t qos)
{
	if (qos <= kqwl->kqwl_wakeup_qos) {
		/*
		 * Shortcut wakeups that really do nothing useful
		 */
		return;
	}

	if ((kqwl->kqwl_state & KQ_PROCESSING) &&
	    kqr_thread(&kqwl->kqwl_request) == current_thread()) {
		/*
		 * kqworkloop_end_processing() will perform the required QoS
		 * computations when it unsets the processing mode.
		 */
		return;
	}

	kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_UPDATE_WAKEUP_QOS, qos);
}
5540 
5541 static struct kqtailq *
kqueue_get_suppressed_queue(kqueue_t kq,struct knote * kn)5542 kqueue_get_suppressed_queue(kqueue_t kq, struct knote *kn)
5543 {
5544 	if (kq.kq->kq_state & KQ_WORKLOOP) {
5545 		return &kq.kqwl->kqwl_suppressed;
5546 	} else if (kq.kq->kq_state & KQ_WORKQ) {
5547 		return &kq.kqwq->kqwq_suppressed[kn->kn_qos_index - 1];
5548 	} else {
5549 		return &kq.kqf->kqf_suppressed;
5550 	}
5551 }
5552 
/*
 * kqueue_alloc_turnstile - return the workloop's turnstile, allocating it
 * on first use.
 *
 * Fast path: if KQ_HAS_TURNSTILE is already set, the turnstile pointer is
 * published and can be read without the kqlock (dependency-ordered load).
 * Slow path: allocate a turnstile, install it under the kqlock (and the
 * workq threadreq lock when that is the current turnstile interlock), and
 * seed its inheritor.  Returns TURNSTILE_NULL for non-workloop kqueues.
 */
struct turnstile *
kqueue_alloc_turnstile(kqueue_t kqu)
{
	struct kqworkloop *kqwl = kqu.kqwl;
	kq_state_t kq_state;

	kq_state = os_atomic_load(&kqu.kq->kq_state, dependency);
	if (kq_state & KQ_HAS_TURNSTILE) {
		/* force a dependency to pair with the atomic or with release below */
		return os_atomic_load_with_dependency_on(&kqwl->kqwl_turnstile,
		           (uintptr_t)kq_state);
	}

	if (!(kq_state & KQ_WORKLOOP)) {
		return TURNSTILE_NULL;
	}

	/* allocate optimistically outside the lock; free_ts if we lost the race */
	struct turnstile *ts = turnstile_alloc(), *free_ts = TURNSTILE_NULL;
	bool workq_locked = false;

	kqlock(kqu);

	if (filt_wlturnstile_interlock_is_workq(kqwl)) {
		workq_locked = true;
		workq_kern_threadreq_lock(kqwl->kqwl_p);
	}

	if (kqwl->kqwl_state & KQ_HAS_TURNSTILE) {
		/* raced with another allocator: keep theirs, free ours below */
		free_ts = ts;
		ts = kqwl->kqwl_turnstile;
	} else {
		ts = turnstile_prepare((uintptr_t)kqwl, &kqwl->kqwl_turnstile,
		    ts, TURNSTILE_WORKLOOPS);

		/* release-barrier to pair with the unlocked load of kqwl_turnstile above */
		os_atomic_or(&kqwl->kqwl_state, KQ_HAS_TURNSTILE, release);

		if (filt_wlturnstile_interlock_is_workq(kqwl)) {
			workq_kern_threadreq_update_inheritor(kqwl->kqwl_p,
			    &kqwl->kqwl_request, kqwl->kqwl_owner,
			    ts, TURNSTILE_IMMEDIATE_UPDATE);
			/*
			 * The workq may no longer be the interlock after this.
			 * In which case the inheritor wasn't updated.
			 */
		}
		if (!filt_wlturnstile_interlock_is_workq(kqwl)) {
			filt_wlupdate_inheritor(kqwl, ts, TURNSTILE_IMMEDIATE_UPDATE);
		}
	}

	if (workq_locked) {
		workq_kern_threadreq_unlock(kqwl->kqwl_p);
	}

	kqunlock(kqu);

	if (free_ts) {
		turnstile_deallocate(free_ts);
	} else {
		/* we installed ts: complete the inheritor handoff outside the locks */
		turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_NOT_HELD);
	}
	return ts;
}
5617 
5618 __attribute__((always_inline))
5619 struct turnstile *
kqueue_turnstile(kqueue_t kqu)5620 kqueue_turnstile(kqueue_t kqu)
5621 {
5622 	kq_state_t kq_state = os_atomic_load(&kqu.kq->kq_state, relaxed);
5623 	if (kq_state & KQ_WORKLOOP) {
5624 		return os_atomic_load(&kqu.kqwl->kqwl_turnstile, relaxed);
5625 	}
5626 	return TURNSTILE_NULL;
5627 }
5628 
5629 __attribute__((always_inline))
5630 struct turnstile *
kqueue_threadreq_get_turnstile(workq_threadreq_t kqr)5631 kqueue_threadreq_get_turnstile(workq_threadreq_t kqr)
5632 {
5633 	struct kqworkloop *kqwl = kqr_kqworkloop(kqr);
5634 	if (kqwl) {
5635 		return os_atomic_load(&kqwl->kqwl_turnstile, relaxed);
5636 	}
5637 	return TURNSTILE_NULL;
5638 }
5639 
/*
 * kqworkloop_set_overcommit - mark this workloop's thread request as
 * overcommit (its servicer does not count against the workq pool limits).
 *
 * The flag is sticky: once set it is never removed.  If a request is
 * currently pending with the workq subsystem, it must be modified there;
 * otherwise the flag is recorded for the next request.  Caller holds the
 * kqlock.
 */
static void
kqworkloop_set_overcommit(struct kqworkloop *kqwl)
{
	workq_threadreq_t kqr = &kqwl->kqwl_request;

	/*
	 * This test is racy, but since we never remove this bit,
	 * it allows us to avoid taking a lock.
	 */
	if (kqr->tr_flags & WORKQ_TR_FLAG_OVERCOMMIT) {
		return;
	}

	kqlock_held(kqwl);

	if (kqr_thread_requested_pending(kqr)) {
		/* in-flight request: let the workq subsystem flip the flag */
		kqueue_threadreq_modify(kqwl, kqr, kqr->tr_qos,
		    WORKQ_THREADREQ_MAKE_OVERCOMMIT);
	} else {
		kqr->tr_flags |= WORKQ_TR_FLAG_OVERCOMMIT;
	}
}
5662 
/*
 * kqworkq_update_override - raise the override of the workq bucket a knote
 * belongs to, and propagate it to the servicing thread if one is bound.
 *
 * Only ever raises: an override at or below the knote's own bucket QoS, or
 * below the request's current effective QoS, is a no-op.  Caller holds the
 * kqlock.
 */
static void
kqworkq_update_override(struct kqworkq *kqwq, struct knote *kn,
    kq_index_t override_index)
{
	workq_threadreq_t kqr;
	kq_index_t old_override_index;
	kq_index_t queue_index = kn->kn_qos_index;

	/* an override at or below the bucket's base QoS changes nothing */
	if (override_index <= queue_index) {
		return;
	}

	kqr = kqworkq_get_request(kqwq, queue_index);

	kqlock_held(kqwq);

	old_override_index = kqr->tr_kq_override_index;
	if (override_index > MAX(kqr->tr_kq_qos_index, old_override_index)) {
		thread_t servicer = kqr_thread(kqr);
		kqr->tr_kq_override_index = override_index;

		/* apply the override to [incoming?] servicing thread */
		if (servicer) {
			if (old_override_index) {
				thread_update_kevent_override(servicer, override_index);
			} else {
				thread_add_kevent_override(servicer, override_index);
			}
		}
	}
}
5694 
5695 static void
kqueue_update_iotier_override(kqueue_t kqu)5696 kqueue_update_iotier_override(kqueue_t kqu)
5697 {
5698 	if (kqu.kq->kq_state & KQ_WORKLOOP) {
5699 		kqworkloop_update_iotier_override(kqu.kqwl);
5700 	}
5701 }
5702 
5703 static void
kqueue_update_override(kqueue_t kqu,struct knote * kn,thread_qos_t qos)5704 kqueue_update_override(kqueue_t kqu, struct knote *kn, thread_qos_t qos)
5705 {
5706 	if (kqu.kq->kq_state & KQ_WORKLOOP) {
5707 		kqworkloop_update_threads_qos(kqu.kqwl, KQWL_UTQ_UPDATE_WAKEUP_OVERRIDE,
5708 		    qos);
5709 	} else {
5710 		kqworkq_update_override(kqu.kqwq, kn, qos);
5711 	}
5712 }
5713 
/*
 * kqworkloop_unbind_locked - detach the bound servicer from a workloop.
 *
 * Severs the uthread<->request linkage, optionally drops the servicer's
 * QoS override immediately (KQWL_OVERRIDE_DROP_IMMEDIATELY) or leaves it
 * for kqworkloop_unbind_delayed_override_drop, clears the turnstile
 * inheritor when ownerless, releases any preadopted thread group, and
 * returns the request to the IDLE state.  Caller holds the kqlock.
 */
static void
kqworkloop_unbind_locked(struct kqworkloop *kqwl, thread_t thread,
    enum kqwl_unbind_locked_mode how)
{
	struct uthread *ut = get_bsdthread_info(thread);
	workq_threadreq_t kqr = &kqwl->kqwl_request;

	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_UNBIND), kqwl->kqwl_dynamicid,
	    thread_tid(thread), 0, 0);

	kqlock_held(kqwl);

	assert(ut->uu_kqr_bound == kqr);
	ut->uu_kqr_bound = NULL;
	if (how == KQWL_OVERRIDE_DROP_IMMEDIATELY &&
	    ut->uu_kqueue_override != THREAD_QOS_UNSPECIFIED) {
		thread_drop_servicer_override(thread);
		ut->uu_kqueue_override = THREAD_QOS_UNSPECIFIED;
	}

	/* with no owner and no servicer, nobody inherits the turnstile push */
	if (kqwl->kqwl_owner == NULL && kqwl->kqwl_turnstile) {
		turnstile_update_inheritor(kqwl->kqwl_turnstile,
		    TURNSTILE_INHERITOR_NULL, TURNSTILE_IMMEDIATE_UPDATE);
		turnstile_update_inheritor_complete(kqwl->kqwl_turnstile,
		    TURNSTILE_INTERLOCK_HELD);
	}

#if CONFIG_PREADOPT_TG
	/* The kqueue is able to adopt a thread group again */

	thread_group_qos_t old_tg, new_tg = NULL;
	int ret = os_atomic_rmw_loop(kqr_preadopt_thread_group_addr(kqr), old_tg, new_tg, relaxed, {
		new_tg = old_tg;
		if (old_tg == KQWL_PREADOPTED_TG_SENTINEL || old_tg == KQWL_PREADOPTED_TG_PROCESSED) {
		        new_tg = KQWL_PREADOPTED_TG_NULL;
		}
	});
	KQWL_PREADOPT_TG_HISTORY_WRITE_ENTRY(kqwl, KQWL_PREADOPT_OP_SERVICER_UNBIND, old_tg, KQWL_PREADOPTED_TG_NULL);

	if (ret) {
		// Servicer can drop any preadopt thread group it has since it has
		// unbound.
		thread_set_preadopt_thread_group(thread, NULL);
	}
#endif
	/* servicer no longer carries the workloop's IO tier override */
	thread_update_servicer_iotier_override(thread, THROTTLE_LEVEL_END);

	kqr->tr_thread = THREAD_NULL;
	kqr->tr_state = WORKQ_TR_STATE_IDLE;
	kqwl->kqwl_state &= ~KQ_R2K_ARMED;
}
5765 
5766 static void
kqworkloop_unbind_delayed_override_drop(thread_t thread)5767 kqworkloop_unbind_delayed_override_drop(thread_t thread)
5768 {
5769 	struct uthread *ut = get_bsdthread_info(thread);
5770 	assert(ut->uu_kqr_bound == NULL);
5771 	if (ut->uu_kqueue_override != THREAD_QOS_UNSPECIFIED) {
5772 		thread_drop_servicer_override(thread);
5773 		ut->uu_kqueue_override = THREAD_QOS_UNSPECIFIED;
5774 	}
5775 }
5776 
5777 /*
5778  *	kqworkloop_unbind - Unbind the servicer thread of a workloop kqueue
5779  *
5780  *	It will acknowledge events, and possibly request a new thread if:
5781  *	- there were active events left
5782  *	- we pended waitq hook callouts during processing
5783  *	- we pended wakeups while processing (or unsuppressing)
5784  *
5785  *	Called with kqueue lock held.
5786  */
static void
kqworkloop_unbind(struct kqworkloop *kqwl)
{
	struct kqueue *kq = &kqwl->kqwl_kqueue;
	workq_threadreq_t kqr = &kqwl->kqwl_request;
	thread_t thread = kqr_thread_fast(kqr);
	int op = KQWL_UTQ_PARKING;
	kq_index_t qos_override = THREAD_QOS_UNSPECIFIED;

	/* only the bound servicer may unbind itself */
	assert(thread == current_thread());

	kqlock(kqwl);

	/*
	 * Forcing the KQ_PROCESSING flag allows for QoS updates because of
	 * unsuppressing knotes not to be applied until the eventual call to
	 * kqworkloop_update_threads_qos() below.
	 */
	assert((kq->kq_state & KQ_PROCESSING) == 0);
	if (!TAILQ_EMPTY(&kqwl->kqwl_suppressed)) {
		kq->kq_state |= KQ_PROCESSING;
		qos_override = kqworkloop_acknowledge_events(kqwl);
		kq->kq_state &= ~KQ_PROCESSING;
	}

	/* override drop is delayed until after the QoS update below */
	kqworkloop_unbind_locked(kqwl, thread, KQWL_OVERRIDE_DROP_DELAYED);
	kqworkloop_update_threads_qos(kqwl, op, qos_override);

	kqunlock(kqwl);

	/*
	 * Drop the override on the current thread last, after the call to
	 * kqworkloop_update_threads_qos above.
	 */
	kqworkloop_unbind_delayed_override_drop(thread);

	/* If last reference, dealloc the workloop kq */
	kqworkloop_release(kqwl);
}
5826 
static thread_qos_t
kqworkq_unbind_locked(struct kqworkq *kqwq,
    workq_threadreq_t kqr, thread_t thread)
{
	struct uthread *ut = get_bsdthread_info(thread);
	kq_index_t old_override = kqr->tr_kq_override_index;

	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWQ_UNBIND), -1,
	    thread_tid(kqr_thread(kqr)), kqr->tr_kq_qos_index, 0);

	kqlock_held(kqwq);

	/* sever the binding and reset the request to an idle, un-overridden state */
	assert(ut->uu_kqr_bound == kqr);
	ut->uu_kqr_bound = NULL;
	kqr->tr_thread = THREAD_NULL;
	kqr->tr_state = WORKQ_TR_STATE_IDLE;
	kqr->tr_kq_override_index = THREAD_QOS_UNSPECIFIED;
	kqwq->kqwq_state &= ~KQ_R2K_ARMED;

	/* previous override is handed back to the caller — presumably so it
	 * can be dropped outside this lock; verify against callers */
	return old_override;
}
5848 
5849 /*
5850  *	kqworkq_unbind - unbind of a workq kqueue from a thread
5851  *
5852  *	We may have to request new threads.
 *	This can happen when there are no waiting processing threads and:
5854  *	- there were active events we never got to (count > 0)
5855  *	- we pended waitq hook callouts during processing
5856  *	- we pended wakeups while processing (or unsuppressing)
5857  */
static void
kqworkq_unbind(proc_t p, workq_threadreq_t kqr)
{
	struct kqworkq *kqwq = (struct kqworkq *)p->p_fd.fd_wqkqueue;
	__assert_only int rc;

	kqlock(kqwq);
	/* KQWQAE_UNBIND must always report the thread as unbound (-1) */
	rc = kqworkq_acknowledge_events(kqwq, kqr, 0, KQWQAE_UNBIND);
	assert(rc == -1);
	kqunlock(kqwq);
}
5869 
5870 workq_threadreq_t
kqworkq_get_request(struct kqworkq * kqwq,kq_index_t qos_index)5871 kqworkq_get_request(struct kqworkq *kqwq, kq_index_t qos_index)
5872 {
5873 	assert(qos_index > 0 && qos_index <= KQWQ_NBUCKETS);
5874 	return &kqwq->kqwq_request[qos_index - 1];
5875 }
5876 
static void
knote_reset_priority(kqueue_t kqu, struct knote *kn, pthread_priority_t pp)
{
	kq_index_t qos = _pthread_priority_thread_qos(pp);

	/* normalize the requested priority for the kind of kqueue it targets */
	if (kqu.kq->kq_state & KQ_WORKLOOP) {
		assert((pp & _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG) == 0);
		pp = _pthread_priority_normalize(pp);
	} else if (kqu.kq->kq_state & KQ_WORKQ) {
		if (qos == THREAD_QOS_UNSPECIFIED) {
			/* On workqueues, outside of QoS means MANAGER */
			qos = KQWQ_QOS_MANAGER;
			pp = _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG;
		} else {
			pp = _pthread_priority_normalize(pp);
		}
	} else {
		/* plain kqfiles carry no QoS */
		pp = _pthread_unspecified_priority();
		qos = THREAD_QOS_UNSPECIFIED;
	}

	kn->kn_qos = (int32_t)pp;

	if ((kn->kn_status & KN_MERGE_QOS) == 0 || qos > kn->kn_qos_override) {
		/* Never lower QoS when in "Merge" mode */
		kn->kn_qos_override = qos;
	}

	/* only adjust in-use qos index when not suppressed */
	if (kn->kn_status & KN_SUPPRESSED) {
		/*
		 * kn_qos_index must stay put while suppressed (it encodes which
		 * suppress queue the knote sits on), so push an override instead.
		 */
		kqueue_update_override(kqu, kn, qos);
	} else if (kn->kn_qos_index != qos) {
		/* moving buckets: dequeue now, caller requeues as needed */
		knote_dequeue(kqu, kn);
		kn->kn_qos_index = qos;
	}
}
5913 
static void
knote_adjust_qos(struct kqueue *kq, struct knote *kn, int result)
{
	/* the low 3 bits after the shift carry the filter-requested QoS */
	thread_qos_t qos_index = (result >> FILTER_ADJUST_EVENT_QOS_SHIFT) & 7;

	kqlock_held(kq);

	assert(result & FILTER_ADJUST_EVENT_QOS_BIT);
	assert(qos_index < THREAD_QOS_LAST);

	/*
	 * Early exit for knotes that should not change QoS
	 */
	if (__improbable(!knote_fops(kn)->f_adjusts_qos)) {
		panic("filter %d cannot change QoS", kn->kn_filtid);
	} else if (__improbable(!knote_has_qos(kn))) {
		return;
	}

	/*
	 * knotes with the FALLBACK flag will only use their registration QoS if the
	 * incoming event has no QoS, else, the registration QoS acts as a floor.
	 */
	thread_qos_t req_qos = _pthread_priority_thread_qos_fast(kn->kn_qos);
	if (kn->kn_qos & _PTHREAD_PRIORITY_FALLBACK_FLAG) {
		if (qos_index == THREAD_QOS_UNSPECIFIED) {
			qos_index = req_qos;
		}
	} else {
		if (qos_index < req_qos) {
			qos_index = req_qos;
		}
	}
	if ((kn->kn_status & KN_MERGE_QOS) && (qos_index < kn->kn_qos_override)) {
		/* Never lower QoS when in "Merge" mode */
		return;
	}

	if ((kn->kn_status & KN_LOCKED) && (kn->kn_status & KN_POSTING)) {
		/*
		 * When we're trying to update the QoS override and that both an
		 * f_event() and other f_* calls are running concurrently, any of these
		 * in flight calls may want to perform overrides that aren't properly
		 * serialized with each other.
		 *
		 * The first update that observes this racy situation enters a "Merge"
		 * mode which causes subsequent override requests to saturate the
		 * override instead of replacing its value.
		 *
		 * This mode is left when knote_unlock() or knote_post()
		 * observe that no other f_* routine is in flight.
		 */
		kn->kn_status |= KN_MERGE_QOS;
	}

	/*
	 * Now apply the override if it changed.
	 */

	if (kn->kn_qos_override == qos_index) {
		return;
	}

	kn->kn_qos_override = qos_index;

	if (kn->kn_status & KN_SUPPRESSED) {
		/*
		 * For suppressed events, the kn_qos_index field cannot be touched as it
		 * allows us to know on which suppress queue the knote is for a kqworkq.
		 *
		 * Also, there's no natural push applied on the kqueues when this field
		 * changes anyway. We hence need to apply manual overrides in this case,
		 * which will be cleared when the events are later acknowledged.
		 */
		kqueue_update_override(kq, kn, qos_index);
	} else if (kn->kn_qos_index != qos_index) {
		knote_dequeue(kq, kn);
		kn->kn_qos_index = qos_index;
	}
}
5994 
void
klist_init(struct klist *list)
{
	/* start with an empty singly-linked list of knotes */
	SLIST_INIT(list);
}
6000 
6001 
6002 /*
6003  *	Query/Post each knote in the object's list
6004  *
6005  *	The object lock protects the list. It is assumed that the filter/event
6006  *	routine for the object can determine that the object is already locked (via
6007  *	the hint) and not deadlock itself.
6008  *
6009  *	Autodetach is a specific contract which will detach all knotes from the
6010  *	object prior to posting the final event for that knote. This is done while
6011  *	under the object lock. A breadcrumb is left in the knote's next pointer to
6012  *	indicate to future calls to f_detach routines that they need not reattempt
6013  *	to knote_detach from the object's klist again. This is currently used by
6014  *	EVFILTID_SPEC, EVFILTID_TTY, EVFILTID_PTMX
6015  *
6016  */
void
knote(struct klist *list, long hint, bool autodetach)
{
	struct knote *kn;
	struct knote *tmp_kn;
	SLIST_FOREACH_SAFE(kn, list, kn_selnext, tmp_kn) {
		/*
		 * We can modify the knote's next pointer since we are holding the
		 * object lock and the list can't be concurrently modified. Anyone
		 * determining auto-detached-ness of a knote should take the primitive lock
		 * to synchronize.
		 *
		 * Note that we do this here instead of the filter's f_event since we may
		 * not even post the event if the knote is being dropped.
		 */
		if (autodetach) {
			/* leave the KNOTE_AUTODETACHED breadcrumb before posting */
			kn->kn_selnext.sle_next = KNOTE_AUTODETACHED;
		}
		knote_post(kn, hint);
	}

	/* Blast away the entire klist */
	if (autodetach) {
		klist_init(list);
	}
}
6043 
6044 /*
6045  * attach a knote to the specified list.  Return true if this is the first entry.
6046  * The list is protected by whatever lock the object it is associated with uses.
6047  */
6048 int
knote_attach(struct klist * list,struct knote * kn)6049 knote_attach(struct klist *list, struct knote *kn)
6050 {
6051 	int ret = SLIST_EMPTY(list);
6052 	SLIST_INSERT_HEAD(list, kn, kn_selnext);
6053 	return ret;
6054 }
6055 
6056 /*
6057  * detach a knote from the specified list.  Return true if that was the last
6058  * entry.  The list is protected by whatever lock the object it is associated
6059  * with uses.
6060  */
int
knote_detach(struct klist *list, struct knote *kn)
{
	/* auto-detached knotes must never come back through the klist */
	assert(!KNOTE_IS_AUTODETACHED(kn));

	SLIST_REMOVE(list, kn, knote, kn_selnext);
	/* non-zero when this was the last knote on the list */
	return SLIST_EMPTY(list);
}
6069 
6070 /*
6071  * knote_vanish - Indicate that the source has vanished
6072  *
6073  * Used only for vanishing ports - vanishing fds go
6074  * through knote_fdclose()
6075  *
6076  * If the knote has requested EV_VANISHED delivery,
6077  * arrange for that. Otherwise, deliver a NOTE_REVOKE
6078  * event for backward compatibility.
6079  *
6080  * The knote is marked as having vanished. The source's
6081  * reference to the knote is dropped by caller, but the knote's
6082  * source reference is only cleaned up later when the knote is dropped.
6083  *
6084  * Our caller already has the object lock held. Calling
6085  * the detach routine would try to take that lock
6086  * recursively - which likely is not supported.
6087  */
void
knote_vanish(struct klist *list, bool make_active)
{
	struct knote *kn;
	struct knote *kn_next;

	SLIST_FOREACH_SAFE(kn, list, kn_selnext, kn_next) {
		struct kqueue *kq = knote_get_kq(kn);

		kqlock(kq);
		if (__probable(kn->kn_status & KN_REQVANISH)) {
			/*
			 * If EV_VANISH supported - prepare to deliver one
			 */
			kn->kn_status |= KN_VANISHED;
		} else {
			/*
			 * Handle the legacy way to indicate that the port/portset was
			 * deallocated or left the current Mach portspace (modern technique
			 * is with an EV_VANISHED protocol).
			 *
			 * Deliver an EV_EOF event for these changes (hopefully it will get
			 * delivered before the port name recycles to the same generation
			 * count and someone tries to re-register a kevent for it or the
			 * events are udata-specific - avoiding a conflict).
			 */
			kn->kn_flags |= EV_EOF | EV_ONESHOT;
		}
		if (make_active) {
			/* optionally activate so the event gets delivered promptly */
			knote_activate(kq, kn, FILTER_ACTIVE);
		}
		kqunlock(kq);
	}
}
6122 
6123 /*
6124  * remove all knotes referencing a specified fd
6125  *
6126  * Entered with the proc_fd lock already held.
6127  * It returns the same way, but may drop it temporarily.
6128  */
void
knote_fdclose(struct proc *p, int fd)
{
	struct filedesc *fdt = &p->p_fd;
	struct klist *list;
	struct knote *kn;
	KNOTE_LOCK_CTX(knlc);

restart:
	/* restart the scan from the head: the fd lock is dropped per-knote below */
	list = &fdt->fd_knlist[fd];
	SLIST_FOREACH(kn, list, kn_link) {
		struct kqueue *kq = knote_get_kq(kn);

		kqlock(kq);

		if (kq->kq_p != p) {
			panic("%s: proc mismatch (kq->kq_p=%p != p=%p)",
			    __func__, kq->kq_p, p);
		}

		/*
		 * If the knote supports EV_VANISHED delivery,
		 * transition it to vanished mode (or skip over
		 * it if already vanished).
		 */
		if (kn->kn_status & KN_VANISHED) {
			kqunlock(kq);
			continue;
		}

		/* drop the fd lock before waiting for the knote lock */
		proc_fdunlock(p);
		if (!knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ON_SUCCESS)) {
			/* the knote was dropped by someone, nothing to do */
		} else if (kn->kn_status & KN_REQVANISH) {
			/*
			 * Since we have REQVANISH for this knote, we need to notify clients about
			 * the EV_VANISHED.
			 *
			 * But unlike mach ports, we want to do the detach here as well and not
			 * defer it so that we can release the iocount that is on the knote and
			 * close the fp.
			 */
			kn->kn_status |= KN_VANISHED;

			/*
			 * There may be a concurrent post happening, make sure to wait for it
			 * before we detach. knote_wait_for_post() unlocks on kq on exit
			 */
			knote_wait_for_post(kq, kn);

			knote_fops(kn)->f_detach(kn);
			if (kn->kn_is_fd) {
				fp_drop(p, (int)kn->kn_id, kn->kn_fp, 0);
			}
			/* mark the knote as already detached */
			kn->kn_filtid = EVFILTID_DETACHED;
			kqlock(kq);

			knote_activate(kq, kn, FILTER_ACTIVE);
			knote_unlock(kq, kn, &knlc, KNOTE_KQ_UNLOCK);
		} else {
			knote_drop(kq, kn, &knlc);
		}

		proc_fdlock(p);
		goto restart;
	}
}
6196 
6197 /*
6198  * knote_fdfind - lookup a knote in the fd table for process
6199  *
6200  * If the filter is file-based, lookup based on fd index.
6201  * Otherwise use a hash based on the ident.
6202  *
6203  * Matching is based on kq, filter, and ident. Optionally,
6204  * it may also be based on the udata field in the kevent -
6205  * allowing multiple event registration for the file object
6206  * per kqueue.
6207  *
6208  * fd_knhashlock or fdlock held on entry (and exit)
6209  */
6210 static struct knote *
knote_fdfind(struct kqueue * kq,const struct kevent_internal_s * kev,bool is_fd,struct proc * p)6211 knote_fdfind(struct kqueue *kq,
6212     const struct kevent_internal_s *kev,
6213     bool is_fd,
6214     struct proc *p)
6215 {
6216 	struct filedesc *fdp = &p->p_fd;
6217 	struct klist *list = NULL;
6218 	struct knote *kn = NULL;
6219 
6220 	/*
6221 	 * determine where to look for the knote
6222 	 */
6223 	if (is_fd) {
6224 		/* fd-based knotes are linked off the fd table */
6225 		if (kev->kei_ident < (u_int)fdp->fd_knlistsize) {
6226 			list = &fdp->fd_knlist[kev->kei_ident];
6227 		}
6228 	} else if (fdp->fd_knhashmask != 0) {
6229 		/* hash non-fd knotes here too */
6230 		list = &fdp->fd_knhash[KN_HASH((u_long)kev->kei_ident, fdp->fd_knhashmask)];
6231 	}
6232 
6233 	/*
6234 	 * scan the selected list looking for a match
6235 	 */
6236 	if (list != NULL) {
6237 		SLIST_FOREACH(kn, list, kn_link) {
6238 			if (kq == knote_get_kq(kn) &&
6239 			    kev->kei_ident == kn->kn_id &&
6240 			    kev->kei_filter == kn->kn_filter) {
6241 				if (kev->kei_flags & EV_UDATA_SPECIFIC) {
6242 					if ((kn->kn_flags & EV_UDATA_SPECIFIC) &&
6243 					    kev->kei_udata == kn->kn_udata) {
6244 						break; /* matching udata-specific knote */
6245 					}
6246 				} else if ((kn->kn_flags & EV_UDATA_SPECIFIC) == 0) {
6247 					break; /* matching non-udata-specific knote */
6248 				}
6249 			}
6250 		}
6251 	}
6252 	return kn;
6253 }
6254 
6255 /*
6256  * kq_add_knote- Add knote to the fd table for process
6257  * while checking for duplicates.
6258  *
6259  * All file-based filters associate a list of knotes by file
6260  * descriptor index. All other filters hash the knote by ident.
6261  *
6262  * May have to grow the table of knote lists to cover the
6263  * file descriptor index presented.
6264  *
6265  * fd_knhashlock and fdlock unheld on entry (and exit).
6266  *
6267  * Takes a rwlock boost if inserting the knote is successful.
6268  */
static int
kq_add_knote(struct kqueue *kq, struct knote *kn, struct knote_lock_ctx *knlc,
    struct proc *p)
{
	struct filedesc *fdp = &p->p_fd;
	struct klist *list = NULL;
	int ret = 0;
	bool is_fd = kn->kn_is_fd;

	/* fd knotes are protected by the proc fd lock, others by the knhash lock */
	if (is_fd) {
		proc_fdlock(p);
	} else {
		knhash_lock(fdp);
	}

	if (knote_fdfind(kq, &kn->kn_kevent, is_fd, p) != NULL) {
		/* found an existing knote: we can't add this one */
		ret = ERESTART;
		goto out_locked;
	}

	/* knote was not found: add it now */
	if (!is_fd) {
		/* lazily create the hash table for non-fd knotes */
		if (fdp->fd_knhashmask == 0) {
			u_long size = 0;

			list = hashinit(CONFIG_KN_HASHSIZE, M_KQUEUE, &size);
			if (list == NULL) {
				ret = ENOMEM;
				goto out_locked;
			}

			fdp->fd_knhash = list;
			fdp->fd_knhashmask = size;
		}

		list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];
		SLIST_INSERT_HEAD(list, kn, kn_link);
		ret = 0;
		goto out_locked;
	} else {
		/* knote is fd based */

		if ((u_int)fdp->fd_knlistsize <= kn->kn_id) {
			u_int size = 0;

			/* Make sure that fd stays below current process's soft limit AND system allowed per-process limits */
			if (kn->kn_id >= (uint64_t)proc_limitgetcur_nofile(p)) {
				ret = EINVAL;
				goto out_locked;
			}
			/* have to grow the fd_knlist */
			size = fdp->fd_knlistsize;
			while (size <= kn->kn_id) {
				size += KQEXTENT;
			}

			/* reject sizes whose byte count would overflow */
			if (size >= (UINT_MAX / sizeof(struct klist))) {
				ret = EINVAL;
				goto out_locked;
			}

			list = kalloc_type(struct klist, size, Z_WAITOK | Z_ZERO);
			if (list == NULL) {
				ret = ENOMEM;
				goto out_locked;
			}

			/* copy the old table into the new one and free the old */
			bcopy(fdp->fd_knlist, list,
			    fdp->fd_knlistsize * sizeof(struct klist));
			kfree_type(struct klist, fdp->fd_knlistsize, fdp->fd_knlist);
			fdp->fd_knlist = list;
			fdp->fd_knlistsize = size;
		}

		list = &fdp->fd_knlist[kn->kn_id];
		SLIST_INSERT_HEAD(list, kn, kn_link);
		ret = 0;
		goto out_locked;
	}

out_locked:
	/* on success, lock the knote (releasing the kqlock) and retain the kq */
	if (ret == 0) {
		kqlock(kq);
		assert((kn->kn_status & KN_LOCKED) == 0);
		(void)knote_lock(kq, kn, knlc, KNOTE_KQ_UNLOCK);
		kqueue_retain(kq); /* retain a kq ref */
	}
	if (is_fd) {
		proc_fdunlock(p);
	} else {
		knhash_unlock(fdp);
	}

	return ret;
}
6365 
6366 /*
6367  * kq_remove_knote - remove a knote from the fd table for process
6368  *
6369  * If the filter is file-based, remove based on fd index.
6370  * Otherwise remove from the hash based on the ident.
6371  *
6372  * fd_knhashlock and fdlock unheld on entry (and exit).
6373  */
static void
kq_remove_knote(struct kqueue *kq, struct knote *kn, struct proc *p,
    struct knote_lock_ctx *knlc)
{
	struct filedesc *fdp = &p->p_fd;
	struct klist *list = NULL;
	uint16_t kq_state;
	bool is_fd = kn->kn_is_fd;

	/* fd knotes are protected by the proc fd lock, others by the knhash lock */
	if (is_fd) {
		proc_fdlock(p);
	} else {
		knhash_lock(fdp);
	}

	if (is_fd) {
		assert((u_int)fdp->fd_knlistsize > kn->kn_id);
		list = &fdp->fd_knlist[kn->kn_id];
	} else {
		list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];
	}
	SLIST_REMOVE(list, kn, knote, kn_link);

	kqlock(kq);

	/* Update the servicer iotier override */
	kqueue_update_iotier_override(kq);

	/* snapshot kq_state before unlocking: kq may be freed afterwards */
	kq_state = kq->kq_state;
	if (knlc) {
		knote_unlock_cancel(kq, kn, knlc);
	} else {
		kqunlock(kq);
	}
	if (is_fd) {
		proc_fdunlock(p);
	} else {
		knhash_unlock(fdp);
	}

	/* drop the reference kq_add_knote() took on a dynamic (workloop) kqueue */
	if (kq_state & KQ_DYNAMIC) {
		kqworkloop_release((struct kqworkloop *)kq);
	}
}
6418 
6419 /*
6420  * kq_find_knote_and_kq_lock - lookup a knote in the fd table for process
6421  * and, if the knote is found, acquires the kqlock while holding the fd table lock/spinlock.
6422  *
6423  * fd_knhashlock or fdlock unheld on entry (and exit)
6424  */
6425 
static struct knote *
kq_find_knote_and_kq_lock(struct kqueue *kq, struct kevent_qos_s *kev,
    bool is_fd, struct proc *p)
{
	struct filedesc *fdp = &p->p_fd;
	struct knote *kn;

	/* fd knotes are protected by the proc fd lock, others by the knhash lock */
	if (is_fd) {
		proc_fdlock(p);
	} else {
		knhash_lock(fdp);
	}

	/*
	 * Temporary horrible hack:
	 * this cast is gross and will go away in a future change.
	 * It is OK to do because we don't look at xflags/s_fflags,
	 * and that when we cast down the kev this way,
	 * the truncated filter field works.
	 */
	kn = knote_fdfind(kq, (struct kevent_internal_s *)kev, is_fd, p);

	if (kn) {
		/* take the kqlock before the fd table lock is dropped */
		kqlock(kq);
		assert(knote_get_kq(kn) == kq);
	}

	if (is_fd) {
		proc_fdunlock(p);
	} else {
		knhash_unlock(fdp);
	}

	/* returns with the kqlock held iff a knote was found */
	return kn;
}
6461 
6462 static struct kqtailq *
knote_get_tailq(kqueue_t kqu,struct knote * kn)6463 knote_get_tailq(kqueue_t kqu, struct knote *kn)
6464 {
6465 	kq_index_t qos_index = kn->kn_qos_index;
6466 
6467 	if (kqu.kq->kq_state & KQ_WORKLOOP) {
6468 		assert(qos_index > 0 && qos_index <= KQWL_NBUCKETS);
6469 		return &kqu.kqwl->kqwl_queue[qos_index - 1];
6470 	} else if (kqu.kq->kq_state & KQ_WORKQ) {
6471 		assert(qos_index > 0 && qos_index <= KQWQ_NBUCKETS);
6472 		return &kqu.kqwq->kqwq_queue[qos_index - 1];
6473 	} else {
6474 		assert(qos_index == QOS_INDEX_KQFILE);
6475 		return &kqu.kqf->kqf_queue;
6476 	}
6477 }
6478 
static void
knote_enqueue(kqueue_t kqu, struct knote *kn)
{
	kqlock_held(kqu);

	/* only active knotes get queued */
	if ((kn->kn_status & KN_ACTIVE) == 0) {
		return;
	}

	/* these states keep an active knote off the queue (or it's already there) */
	if (kn->kn_status & (KN_DISABLED | KN_SUPPRESSED | KN_DROPPING | KN_QUEUED)) {
		return;
	}

	struct kqtailq *queue = knote_get_tailq(kqu, kn);
	bool wakeup = TAILQ_EMPTY(queue);

	TAILQ_INSERT_TAIL(queue, kn, kn_tqe);
	kn->kn_status |= KN_QUEUED;
	kqu.kq->kq_count++;

	/* wake the appropriate waiter when the queue transitions from empty */
	if (wakeup) {
		if (kqu.kq->kq_state & KQ_WORKLOOP) {
			kqworkloop_wakeup(kqu.kqwl, kn->kn_qos_index);
		} else if (kqu.kq->kq_state & KQ_WORKQ) {
			kqworkq_wakeup(kqu.kqwq, kn->kn_qos_index);
		} else {
			kqfile_wakeup(kqu.kqf, 0, THREAD_AWAKENED);
		}
	}
}
6509 
__attribute__((always_inline))
static inline void
knote_dequeue(kqueue_t kqu, struct knote *kn)
{
	if (kn->kn_status & KN_QUEUED) {
		struct kqtailq *queue = knote_get_tailq(kqu, kn);

		// attaching the knote calls knote_reset_priority() without
		// the kqlock which is fine, so we can't call kqlock_held()
		// if we're not queued.
		kqlock_held(kqu);

		TAILQ_REMOVE(queue, kn, kn_tqe);
		kn->kn_status &= ~KN_QUEUED;
		kqu.kq->kq_count--;
		// kqfiles have a single queue, so kq_count and queue emptiness
		// must agree; workq/workloop counts span multiple buckets
		if ((kqu.kq->kq_state & (KQ_WORKQ | KQ_WORKLOOP)) == 0) {
			assert((kqu.kq->kq_count == 0) ==
			    (bool)TAILQ_EMPTY(queue));
		}
	}
}
6531 
6532 /* called with kqueue lock held */
static void
knote_suppress(kqueue_t kqu, struct knote *kn)
{
	struct kqtailq *suppressq;

	kqlock_held(kqu);

	/* only a queued, not-yet-suppressed knote can be suppressed */
	assert((kn->kn_status & KN_SUPPRESSED) == 0);
	assert(kn->kn_status & KN_QUEUED);

	knote_dequeue(kqu, kn);
	/* deactivate - so new activations indicate a wakeup */
	kn->kn_status &= ~KN_ACTIVE;
	kn->kn_status |= KN_SUPPRESSED;
	suppressq = kqueue_get_suppressed_queue(kqu, kn);
	TAILQ_INSERT_TAIL(suppressq, kn, kn_tqe);
}
6550 
/*
 * Remove a knote from its suppress queue without requeueing it;
 * see knote_unsuppress() for the variant that also requeues.
 */
__attribute__((always_inline))
static inline void
knote_unsuppress_noqueue(kqueue_t kqu, struct knote *kn)
{
	struct kqtailq *suppressq;

	kqlock_held(kqu);

	assert(kn->kn_status & KN_SUPPRESSED);

	kn->kn_status &= ~KN_SUPPRESSED;
	suppressq = kqueue_get_suppressed_queue(kqu, kn);
	TAILQ_REMOVE(suppressq, kn, kn_tqe);

	/*
	 * If the knote is no longer active, reset its push,
	 * and resynchronize kn_qos_index with kn_qos_override
	 * for knotes with a real qos.
	 */
	if ((kn->kn_status & KN_ACTIVE) == 0 && knote_has_qos(kn)) {
		kn->kn_qos_override = _pthread_priority_thread_qos_fast(kn->kn_qos);
	}
	kn->kn_qos_index = kn->kn_qos_override;
}
6575 
6576 /* called with kqueue lock held */
static void
knote_unsuppress(kqueue_t kqu, struct knote *kn)
{
	/* unsuppress, then requeue the knote if it is still active */
	knote_unsuppress_noqueue(kqu, kn);
	knote_enqueue(kqu, kn);
}
6583 
__attribute__((always_inline))
static inline void
knote_mark_active(struct knote *kn)
{
	/* trace only the inactive -> active transition */
	if ((kn->kn_status & KN_ACTIVE) == 0) {
		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KNOTE_ACTIVATE),
		    kn->kn_udata, kn->kn_status | (kn->kn_id << 32),
		    kn->kn_filtid);
	}

	kn->kn_status |= KN_ACTIVE;
}
6596 
6597 /* called with kqueue lock held */
static void
knote_activate(kqueue_t kqu, struct knote *kn, int result)
{
	assert(result & FILTER_ACTIVE);
	if (result & FILTER_ADJUST_EVENT_QOS_BIT) {
		// may dequeue the knote
		knote_adjust_qos(kqu.kq, kn, result);
	}
	/* mark active, then (re)queue unless disabled/suppressed/dropping */
	knote_mark_active(kn);
	knote_enqueue(kqu, kn);
}
6609 
6610 /*
6611  * This function applies changes requested by f_attach or f_touch for
6612  * a given filter. It proceeds in a carefully chosen order to help
6613  * every single transition do the minimal amount of work possible.
6614  */
static void
knote_apply_touch(kqueue_t kqu, struct knote *kn, struct kevent_qos_s *kev,
    int result)
{
	if ((kev->flags & EV_ENABLE) && (kn->kn_status & KN_DISABLED)) {
		kn->kn_status &= ~KN_DISABLED;

		/*
		 * it is possible for userland to have knotes registered for a given
		 * workloop `wl_orig` but really handled on another workloop `wl_new`.
		 *
		 * In that case, rearming will happen from the servicer thread of
		 * `wl_new` which if `wl_orig` is no longer being serviced, would cause
		 * this knote to stay suppressed forever if we only relied on
		 * kqworkloop_acknowledge_events to be called by `wl_orig`.
		 *
		 * However if we see the KQ_PROCESSING bit on `wl_orig` set, we can't
		 * unsuppress because that would mess with the processing phase of
		 * `wl_orig`, however it also means kqworkloop_acknowledge_events()
		 * will be called.
		 */
		if (__improbable(kn->kn_status & KN_SUPPRESSED)) {
			if ((kqu.kq->kq_state & KQ_PROCESSING) == 0) {
				knote_unsuppress_noqueue(kqu, kn);
			}
		}
	}

	/* refresh the servicer iotier override when the filter asks for it */
	if (result & FILTER_ADJUST_EVENT_IOTIER_BIT) {
		kqueue_update_iotier_override(kqu);
	}

	if ((result & FILTER_UPDATE_REQ_QOS) && kev->qos && kev->qos != kn->kn_qos) {
		// may dequeue the knote
		knote_reset_priority(kqu, kn, kev->qos);
	}

	/*
	 * When we unsuppress above, or because of knote_reset_priority(),
	 * the knote may have been dequeued, we need to restore the invariant
	 * that if the knote is active it needs to be queued now that
	 * we're done applying changes.
	 */
	if (result & FILTER_ACTIVE) {
		knote_activate(kqu, kn, result);
	} else {
		knote_enqueue(kqu, kn);
	}

	/* the filter asked not to defer: redrive any pended thread request now */
	if ((result & FILTER_THREADREQ_NODEFEER) &&
	    act_clear_astkevent(current_thread(), AST_KEVENT_REDRIVE_THREADREQ)) {
		workq_kern_threadreq_redrive(kqu.kq->kq_p, WORKQ_THREADREQ_NONE);
	}
}
6669 
6670 /*
6671  * knote_drop - disconnect and drop the knote
6672  *
6673  * Called with the kqueue locked, returns with the kqueue unlocked.
6674  *
6675  * If a knote locking context is passed, it is canceled.
6676  *
6677  * The knote may have already been detached from
6678  * (or not yet attached to) its source object.
6679  */
static void
knote_drop(struct kqueue *kq, struct knote *kn, struct knote_lock_ctx *knlc)
{
	struct proc *p = kq->kq_p;

	kqlock_held(kq);

	/* Only one thread may be dropping a given knote at a time. */
	assert((kn->kn_status & KN_DROPPING) == 0);
	if (knlc == NULL) {
		assert((kn->kn_status & KN_LOCKED) == 0);
	}
	kn->kn_status |= KN_DROPPING;

	/* Pull the knote off whichever queue it currently sits on. */
	if (kn->kn_status & KN_SUPPRESSED) {
		knote_unsuppress_noqueue(kq, kn);
	} else {
		knote_dequeue(kq, kn);
	}
	/* Wait for any in-flight filter post to finish before detaching. */
	knote_wait_for_post(kq, kn);

	/* Even if we are autodetached, the filter may need to do cleanups of any
	 * stuff stashed on the knote so always make the call and let each filter
	 * handle the possibility of autodetached-ness */
	knote_fops(kn)->f_detach(kn);

	/* kq may be freed when kq_remove_knote() returns */
	kq_remove_knote(kq, kn, p, knlc);
	if (kn->kn_is_fd && ((kn->kn_status & KN_VANISHED) == 0)) {
		/* Release the file reference held by fd-based knotes. */
		fp_drop(p, (int)kn->kn_id, kn->kn_fp, 0);
	}

	knote_free(kn);
}
6713 
void
knote_init(void)
{
#if CONFIG_MEMORYSTATUS
	/* Initialize the memorystatus list lock */
	memorystatus_kevent_init(&kq_lck_grp, LCK_ATTR_NULL);
#endif
}
/* Run knote_init() during pseudo-device initialization at boot. */
SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL);
6723 
6724 const struct filterops *
knote_fops(struct knote * kn)6725 knote_fops(struct knote *kn)
6726 {
6727 	return sysfilt_ops[kn->kn_filtid];
6728 }
6729 
6730 static struct knote *
knote_alloc(void)6731 knote_alloc(void)
6732 {
6733 	return zalloc_flags(knote_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);
6734 }
6735 
static void
knote_free(struct knote *kn)
{
	/* A knote must not be locked or mid-post when returned to its zone. */
	assert((kn->kn_status & (KN_LOCKED | KN_POSTING)) == 0);
	zfree(knote_zone, kn);
}
6742 
6743 #pragma mark - syscalls: kevent, kevent64, kevent_qos, kevent_id
6744 
6745 kevent_ctx_t
kevent_get_context(thread_t thread)6746 kevent_get_context(thread_t thread)
6747 {
6748 	uthread_t ut = get_bsdthread_info(thread);
6749 	return &ut->uu_save.uus_kevent;
6750 }
6751 
6752 static inline bool
kevent_args_requesting_events(unsigned int flags,int nevents)6753 kevent_args_requesting_events(unsigned int flags, int nevents)
6754 {
6755 	return !(flags & KEVENT_FLAG_ERROR_EVENTS) && nevents > 0;
6756 }
6757 
6758 static inline int
kevent_adjust_flags_for_proc(proc_t p,int flags)6759 kevent_adjust_flags_for_proc(proc_t p, int flags)
6760 {
6761 	__builtin_assume(p);
6762 	return flags | (IS_64BIT_PROCESS(p) ? KEVENT_FLAG_PROC64 : 0);
6763 }
6764 
6765 /*!
6766  * @function kevent_get_kqfile
6767  *
6768  * @brief
6769  * Lookup a kqfile by fd.
6770  *
6771  * @discussion
6772  * Callers: kevent, kevent64, kevent_qos
6773  *
6774  * This is not assumed to be a fastpath (kqfile interfaces are legacy)
6775  */
OS_NOINLINE
static int
kevent_get_kqfile(struct proc *p, int fd, int flags,
    struct fileproc **fpp, struct kqueue **kqp)
{
	int error = 0;
	struct kqueue *kq;

	/* Takes a fileproc usecount that the caller must drop (fp_drop). */
	error = fp_get_ftype(p, fd, DTYPE_KQUEUE, EBADF, fpp);
	if (__improbable(error)) {
		return error;
	}
	kq = (struct kqueue *)fp_get_data((*fpp));

	uint16_t kq_state = os_atomic_load(&kq->kq_state, relaxed);
	if (__improbable((kq_state & (KQ_KEV32 | KQ_KEV64 | KQ_KEV_QOS)) == 0)) {
		/*
		 * First use of this kqfile: latch the event-record format it
		 * will use, based on which interface reached it first.
		 * Re-read kq_state under the kqlock to resolve the race with
		 * another thread doing the same.
		 */
		kqlock(kq);
		kq_state = kq->kq_state;
		if (!(kq_state & (KQ_KEV32 | KQ_KEV64 | KQ_KEV_QOS))) {
			if (flags & KEVENT_FLAG_LEGACY32) {
				kq_state |= KQ_KEV32;
			} else if (flags & KEVENT_FLAG_LEGACY64) {
				kq_state |= KQ_KEV64;
			} else {
				kq_state |= KQ_KEV_QOS;
			}
			kq->kq_state = kq_state;
		}
		kqunlock(kq);
	}

	/*
	 * kqfiles can't be used through the legacy kevent()
	 * and other interfaces at the same time.
	 */
	if (__improbable((bool)(flags & KEVENT_FLAG_LEGACY32) !=
	    (bool)(kq_state & KQ_KEV32))) {
		fp_drop(p, fd, *fpp, 0);
		return EINVAL;
	}

	*kqp = kq;
	return 0;
}
6820 
6821 /*!
6822  * @function kevent_get_kqwq
6823  *
6824  * @brief
 * Lookup or create the process kqwq (fastpath).
6826  *
6827  * @discussion
6828  * Callers: kevent64, kevent_qos
6829  */
6830 OS_ALWAYS_INLINE
6831 static int
kevent_get_kqwq(proc_t p,int flags,int nevents,struct kqueue ** kqp)6832 kevent_get_kqwq(proc_t p, int flags, int nevents, struct kqueue **kqp)
6833 {
6834 	struct kqworkq *kqwq = p->p_fd.fd_wqkqueue;
6835 
6836 	if (__improbable(kevent_args_requesting_events(flags, nevents))) {
6837 		return EINVAL;
6838 	}
6839 	if (__improbable(kqwq == NULL)) {
6840 		kqwq = kqworkq_alloc(p, flags);
6841 		if (__improbable(kqwq == NULL)) {
6842 			return ENOMEM;
6843 		}
6844 	}
6845 
6846 	*kqp = &kqwq->kqwq_kqueue;
6847 	return 0;
6848 }
6849 
6850 #pragma mark kevent copyio
6851 
6852 /*!
6853  * @function kevent_get_data_size
6854  *
6855  * @brief
6856  * Copies in the extra data size from user-space.
6857  */
6858 static int
kevent_get_data_size(int flags,user_addr_t data_avail,user_addr_t data_out,kevent_ctx_t kectx)6859 kevent_get_data_size(int flags, user_addr_t data_avail, user_addr_t data_out,
6860     kevent_ctx_t kectx)
6861 {
6862 	if (!data_avail || !data_out) {
6863 		kectx->kec_data_size  = 0;
6864 		kectx->kec_data_resid = 0;
6865 	} else if (flags & KEVENT_FLAG_PROC64) {
6866 		user64_size_t usize = 0;
6867 		int error = copyin((user_addr_t)data_avail, &usize, sizeof(usize));
6868 		if (__improbable(error)) {
6869 			return error;
6870 		}
6871 		kectx->kec_data_resid = kectx->kec_data_size = (user_size_t)usize;
6872 	} else {
6873 		user32_size_t usize = 0;
6874 		int error = copyin((user_addr_t)data_avail, &usize, sizeof(usize));
6875 		if (__improbable(error)) {
6876 			return error;
6877 		}
6878 		kectx->kec_data_avail = data_avail;
6879 		kectx->kec_data_resid = kectx->kec_data_size = (user_size_t)usize;
6880 	}
6881 	kectx->kec_data_out   = data_out;
6882 	kectx->kec_data_avail = data_avail;
6883 	return 0;
6884 }
6885 
6886 /*!
6887  * @function kevent_put_data_size
6888  *
6889  * @brief
6890  * Copies out the residual data size to user-space if any has been used.
6891  */
6892 static int
kevent_put_data_size(unsigned int flags,kevent_ctx_t kectx)6893 kevent_put_data_size(unsigned int flags, kevent_ctx_t kectx)
6894 {
6895 	if (kectx->kec_data_resid == kectx->kec_data_size) {
6896 		return 0;
6897 	}
6898 	if (flags & KEVENT_FLAG_KERNEL) {
6899 		*(user_size_t *)(uintptr_t)kectx->kec_data_avail = kectx->kec_data_resid;
6900 		return 0;
6901 	}
6902 	if (flags & KEVENT_FLAG_PROC64) {
6903 		user64_size_t usize = (user64_size_t)kectx->kec_data_resid;
6904 		return copyout(&usize, (user_addr_t)kectx->kec_data_avail, sizeof(usize));
6905 	} else {
6906 		user32_size_t usize = (user32_size_t)kectx->kec_data_resid;
6907 		return copyout(&usize, (user_addr_t)kectx->kec_data_avail, sizeof(usize));
6908 	}
6909 }
6910 
6911 /*!
6912  * @function kevent_legacy_copyin
6913  *
6914  * @brief
6915  * Handles the copyin of a kevent/kevent64 event.
6916  */
static int
kevent_legacy_copyin(user_addr_t *addrp, struct kevent_qos_s *kevp, unsigned int flags)
{
	int error;

	assert((flags & (KEVENT_FLAG_LEGACY32 | KEVENT_FLAG_LEGACY64)) != 0);

	if (flags & KEVENT_FLAG_LEGACY64) {
		/* kevent64(): one struct layout regardless of process size. */
		struct kevent64_s kev64;

		error = copyin(*addrp, (caddr_t)&kev64, sizeof(kev64));
		if (__improbable(error)) {
			return error;
		}
		*addrp += sizeof(kev64);
		*kevp = (struct kevent_qos_s){
			.ident  = kev64.ident,
			.filter = kev64.filter,
			/* Make sure user doesn't pass in any system flags */
			.flags  = kev64.flags & ~EV_SYSFLAGS,
			.udata  = kev64.udata,
			.fflags = kev64.fflags,
			.data   = kev64.data,
			.ext[0] = kev64.ext[0],
			.ext[1] = kev64.ext[1],
		};
	} else if (flags & KEVENT_FLAG_PROC64) {
		/* legacy kevent() from a 64-bit process */
		struct user64_kevent kev64;

		error = copyin(*addrp, (caddr_t)&kev64, sizeof(kev64));
		if (__improbable(error)) {
			return error;
		}
		*addrp += sizeof(kev64);
		*kevp = (struct kevent_qos_s){
			.ident  = kev64.ident,
			.filter = kev64.filter,
			/* Make sure user doesn't pass in any system flags */
			.flags  = kev64.flags & ~EV_SYSFLAGS,
			.udata  = kev64.udata,
			.fflags = kev64.fflags,
			.data   = kev64.data,
		};
	} else {
		/* legacy kevent() from a 32-bit process: widen to internal form */
		struct user32_kevent kev32;

		error = copyin(*addrp, (caddr_t)&kev32, sizeof(kev32));
		if (__improbable(error)) {
			return error;
		}
		*addrp += sizeof(kev32);
		*kevp = (struct kevent_qos_s){
			.ident  = (uintptr_t)kev32.ident,
			.filter = kev32.filter,
			/* Make sure user doesn't pass in any system flags */
			.flags  = kev32.flags & ~EV_SYSFLAGS,
			.udata  = CAST_USER_ADDR_T(kev32.udata),
			.fflags = kev32.fflags,
			.data   = (intptr_t)kev32.data,
		};
	}

	return 0;
}
6981 
6982 /*!
6983  * @function kevent_modern_copyin
6984  *
6985  * @brief
6986  * Handles the copyin of a kevent_qos/kevent_id event.
6987  */
6988 static int
kevent_modern_copyin(user_addr_t * addrp,struct kevent_qos_s * kevp)6989 kevent_modern_copyin(user_addr_t *addrp, struct kevent_qos_s *kevp)
6990 {
6991 	int error = copyin(*addrp, (caddr_t)kevp, sizeof(struct kevent_qos_s));
6992 	if (__probable(!error)) {
6993 		/* Make sure user doesn't pass in any system flags */
6994 		*addrp += sizeof(struct kevent_qos_s);
6995 		kevp->flags &= ~EV_SYSFLAGS;
6996 	}
6997 	return error;
6998 }
6999 
7000 /*!
7001  * @function kevent_legacy_copyout
7002  *
7003  * @brief
7004  * Handles the copyout of a kevent/kevent64 event.
7005  */
static int
kevent_legacy_copyout(struct kevent_qos_s *kevp, user_addr_t *addrp, unsigned int flags)
{
	int advance;
	int error;

	assert((flags & (KEVENT_FLAG_LEGACY32 | KEVENT_FLAG_LEGACY64)) != 0);

	/*
	 * fully initialize the different output event structure
	 * types from the internal kevent (and some universal
	 * defaults for fields not represented in the internal
	 * form).
	 *
	 * Note: these structures have no padding hence the C99
	 *       initializers below do not leak kernel info.
	 */
	if (flags & KEVENT_FLAG_LEGACY64) {
		/* kevent64(): one struct layout regardless of process size. */
		struct kevent64_s kev64 = {
			.ident  = kevp->ident,
			.filter = kevp->filter,
			.flags  = kevp->flags,
			.fflags = kevp->fflags,
			.data   = (int64_t)kevp->data,
			.udata  = kevp->udata,
			.ext[0] = kevp->ext[0],
			.ext[1] = kevp->ext[1],
		};
		advance = sizeof(struct kevent64_s);
		error = copyout((caddr_t)&kev64, *addrp, advance);
	} else if (flags & KEVENT_FLAG_PROC64) {
		/*
		 * deal with the special case of a user-supplied
		 * value of (uintptr_t)-1.
		 */
		uint64_t ident = (kevp->ident == (uintptr_t)-1) ?
		    (uint64_t)-1LL : (uint64_t)kevp->ident;
		struct user64_kevent kev64 = {
			.ident  = ident,
			.filter = kevp->filter,
			.flags  = kevp->flags,
			.fflags = kevp->fflags,
			.data   = (int64_t) kevp->data,
			.udata  = (user_addr_t) kevp->udata,
		};
		advance = sizeof(kev64);
		error = copyout((caddr_t)&kev64, *addrp, advance);
	} else {
		/* legacy kevent() to a 32-bit process: narrow the fields */
		struct user32_kevent kev32 = {
			.ident  = (uint32_t)kevp->ident,
			.filter = kevp->filter,
			.flags  = kevp->flags,
			.fflags = kevp->fflags,
			.data   = (int32_t)kevp->data,
			.udata  = (uint32_t)kevp->udata,
		};
		advance = sizeof(kev32);
		error = copyout((caddr_t)&kev32, *addrp, advance);
	}
	/* Only advance the user cursor when the copyout succeeded. */
	if (__probable(!error)) {
		*addrp += advance;
	}
	return error;
}
7070 
7071 /*!
7072  * @function kevent_modern_copyout
7073  *
7074  * @brief
7075  * Handles the copyout of a kevent_qos/kevent_id event.
7076  */
7077 OS_ALWAYS_INLINE
7078 static inline int
kevent_modern_copyout(struct kevent_qos_s * kevp,user_addr_t * addrp)7079 kevent_modern_copyout(struct kevent_qos_s *kevp, user_addr_t *addrp)
7080 {
7081 	int error = copyout((caddr_t)kevp, *addrp, sizeof(struct kevent_qos_s));
7082 	if (__probable(!error)) {
7083 		*addrp += sizeof(struct kevent_qos_s);
7084 	}
7085 	return error;
7086 }
7087 
7088 #pragma mark kevent core implementation
7089 
7090 /*!
7091  * @function kevent_callback_inline
7092  *
7093  * @brief
7094  * Callback for each individual event
7095  *
7096  * @discussion
7097  * This is meant to be inlined in kevent_modern_callback and
7098  * kevent_legacy_callback.
7099  */
7100 OS_ALWAYS_INLINE
7101 static inline int
kevent_callback_inline(struct kevent_qos_s * kevp,kevent_ctx_t kectx,bool legacy)7102 kevent_callback_inline(struct kevent_qos_s *kevp, kevent_ctx_t kectx, bool legacy)
7103 {
7104 	int error;
7105 
7106 	assert(kectx->kec_process_noutputs < kectx->kec_process_nevents);
7107 
7108 	/*
7109 	 * Copy out the appropriate amount of event data for this user.
7110 	 */
7111 	if (legacy) {
7112 		error = kevent_legacy_copyout(kevp, &kectx->kec_process_eventlist,
7113 		    kectx->kec_process_flags);
7114 	} else {
7115 		error = kevent_modern_copyout(kevp, &kectx->kec_process_eventlist);
7116 	}
7117 
7118 	/*
7119 	 * If there isn't space for additional events, return
7120 	 * a harmless error to stop the processing here
7121 	 */
7122 	if (error == 0 && ++kectx->kec_process_noutputs == kectx->kec_process_nevents) {
7123 		error = EWOULDBLOCK;
7124 	}
7125 	return error;
7126 }
7127 
7128 /*!
7129  * @function kevent_modern_callback
7130  *
7131  * @brief
7132  * Callback for each individual modern event.
7133  *
7134  * @discussion
7135  * This callback handles kevent_qos/kevent_id events.
7136  */
static int
kevent_modern_callback(struct kevent_qos_s *kevp, kevent_ctx_t kectx)
{
	/* Copy out one event in the modern (kevent_qos/kevent_id) format. */
	return kevent_callback_inline(kevp, kectx, /*legacy*/ false);
}
7142 
7143 /*!
7144  * @function kevent_legacy_callback
7145  *
7146  * @brief
7147  * Callback for each individual legacy event.
7148  *
7149  * @discussion
7150  * This callback handles kevent/kevent64 events.
7151  */
static int
kevent_legacy_callback(struct kevent_qos_s *kevp, kevent_ctx_t kectx)
{
	/* Copy out one event in the legacy (kevent/kevent64) format. */
	return kevent_callback_inline(kevp, kectx, /*legacy*/ true);
}
7157 
7158 /*!
7159  * @function kevent_cleanup
7160  *
7161  * @brief
7162  * Handles the cleanup returning from a kevent call.
7163  *
7164  * @discussion
7165  * kevent entry points will take a reference on workloops,
7166  * and a usecount on the fileglob of kqfiles.
7167  *
7168  * This function undoes this on the exit paths of kevents.
7169  *
7170  * @returns
7171  * The error to return to userspace.
7172  */
static int
kevent_cleanup(kqueue_t kqu, int flags, int error, kevent_ctx_t kectx)
{
	// poll should not call any codepath leading to this
	assert((flags & KEVENT_FLAG_POLL) == 0);

	/* Release whatever the entry point acquired for this kqueue type. */
	if (flags & KEVENT_FLAG_WORKLOOP) {
		kqworkloop_release(kqu.kqwl);
	} else if (flags & KEVENT_FLAG_WORKQ) {
		/* nothing held */
	} else {
		/* kqfile: drop the fileproc usecount taken at entry */
		fp_drop(kqu.kqf->kqf_p, kectx->kec_fd, kectx->kec_fp, 0);
	}

	/* don't restart after signals... */
	if (error == ERESTART) {
		error = EINTR;
	} else if (error == 0) {
		/* don't abandon other output just because of residual copyout failures */
		(void)kevent_put_data_size(flags, kectx);
	}

	if (flags & KEVENT_FLAG_PARKING) {
		thread_t th = current_thread();
		struct uthread *uth = get_bsdthread_info(th);
		if (uth->uu_kqr_bound) {
			/*
			 * NOTE(review): pairs with a base-pri freeze taken
			 * elsewhere for the parking path — confirm against
			 * the workq binding code.
			 */
			thread_unfreeze_base_pri(th);
		}
	}
	return error;
}
7204 
7205 /*!
7206  * @function kqueue_process
7207  *
7208  * @brief
7209  * Process the triggered events in a kqueue.
7210  *
7211  * @discussion
7212  * Walk the queued knotes and validate that they are really still triggered
7213  * events by calling the filter routines (if necessary).
7214  *
7215  * For each event that is still considered triggered, invoke the callback
7216  * routine provided.
7217  *
7218  * caller holds a reference on the kqueue.
7219  * kqueue locked on entry and exit - but may be dropped
7220  * kqueue list locked (held for duration of call)
7221  *
7222  * This is only called by kqueue_scan() so that the compiler can inline it.
7223  *
7224  * @returns
 * - 0:            no event was returned, no other error occurred
7226  * - EBADF:        the kqueue is being destroyed (KQ_DRAIN is set)
7227  * - EWOULDBLOCK:  (not an error) events have been found and we should return
7228  * - EFAULT:       copyout failed
7229  * - filter specific errors
7230  */
static int
kqueue_process(kqueue_t kqu, int flags, kevent_ctx_t kectx,
    kevent_callback_t callback)
{
	workq_threadreq_t kqr = current_uthread()->uu_kqr_bound;
	struct knote *kn;
	int error = 0, rc = 0;
	struct kqtailq *base_queue, *queue;
	uint16_t kq_type = (kqu.kq->kq_state & (KQ_WORKQ | KQ_WORKLOOP));

	/* Enter the processing phase for this kqueue flavor. */
	if (kq_type & KQ_WORKQ) {
		rc = kqworkq_begin_processing(kqu.kqwq, kqr, flags);
	} else if (kq_type & KQ_WORKLOOP) {
		rc = kqworkloop_begin_processing(kqu.kqwl, flags);
	} else {
kqfile_retry:
		rc = kqfile_begin_processing(kqu.kqf);
		if (rc == EBADF) {
			/* the kqfile is being drained/destroyed */
			return EBADF;
		}
	}

	if (rc == -1) {
		/* Nothing to process */
		return 0;
	}

	/*
	 * loop through the enqueued knotes associated with this request,
	 * processing each one. Each request may have several queues
	 * of knotes to process (depending on the type of kqueue) so we
	 * have to loop through all the queues as long as we have additional
	 * space.
	 */

process_again:
	/* Select the queue range for this kqueue flavor (highest index first). */
	if (kq_type & KQ_WORKQ) {
		base_queue = queue = &kqu.kqwq->kqwq_queue[kqr->tr_kq_qos_index - 1];
	} else if (kq_type & KQ_WORKLOOP) {
		base_queue = &kqu.kqwl->kqwl_queue[0];
		queue = &kqu.kqwl->kqwl_queue[KQWL_NBUCKETS - 1];
	} else {
		base_queue = queue = &kqu.kqf->kqf_queue;
	}

	do {
		while ((kn = TAILQ_FIRST(queue)) != NULL) {
			/* may drop and retake the kq lock */
			error = knote_process(kn, kectx, callback);
			if (error == EJUSTRETURN) {
				/* the knote was not delivered; keep going */
				error = 0;
			} else if (__improbable(error)) {
				/* error is EWOULDBLOCK when the out event array is full */
				goto stop_processing;
			}
		}
	} while (queue-- > base_queue);

	if (kectx->kec_process_noutputs) {
		/* callers will transform this into no error */
		error = EWOULDBLOCK;
	}

stop_processing:
	/*
	 * If KEVENT_FLAG_PARKING is set, and no kevents have been returned,
	 * we want to unbind the kqrequest from the thread.
	 *
	 * However, because the kq locks are dropped several times during process,
	 * new knotes may have fired again, in which case, we want to fail the end
	 * processing and process again, until it converges.
	 *
	 * If we have an error or returned events, end processing never fails.
	 */
	if (error) {
		flags &= ~KEVENT_FLAG_PARKING;
	}
	if (kq_type & KQ_WORKQ) {
		rc = kqworkq_end_processing(kqu.kqwq, kqr, flags);
	} else if (kq_type & KQ_WORKLOOP) {
		rc = kqworkloop_end_processing(kqu.kqwl, KQ_PROCESSING, flags);
	} else {
		rc = kqfile_end_processing(kqu.kqf);
	}

	/* error is the common case: EWOULDBLOCK reports delivered events */
	if (__probable(error)) {
		return error;
	}

	if (__probable(rc >= 0)) {
		assert(rc == 0 || rc == EBADF);
		return rc;
	}

	/* rc < 0: end processing failed, new knotes fired — converge */
	if (kq_type & (KQ_WORKQ | KQ_WORKLOOP)) {
		assert(flags & KEVENT_FLAG_PARKING);
		goto process_again;
	} else {
		goto kqfile_retry;
	}
}
7331 
7332 /*!
7333  * @function kqueue_scan_continue
7334  *
7335  * @brief
7336  * The continuation used by kqueue_scan for kevent entry points.
7337  *
7338  * @discussion
7339  * Assumes we inherit a use/ref count on the kq or its fileglob.
7340  *
7341  * This is called by kqueue_scan if neither KEVENT_FLAG_POLL nor
7342  * KEVENT_FLAG_KERNEL was set, and the caller had to wait.
7343  */
OS_NORETURN OS_NOINLINE
static void
kqueue_scan_continue(void *data, wait_result_t wait_result)
{
	uthread_t ut = current_uthread();
	kevent_ctx_t kectx = &ut->uu_save.uus_kevent;
	int error = 0, flags = kectx->kec_process_flags;
	struct kqueue *kq = data;

	/*
	 * only kevent variants call in here, so we know the callback is
	 * kevent_legacy_callback or kevent_modern_callback.
	 */
	assert((flags & (KEVENT_FLAG_POLL | KEVENT_FLAG_KERNEL)) == 0);

	/* Map how the wait ended onto the scan result. */
	switch (wait_result) {
	case THREAD_AWAKENED:
		/* woken by an event: scan again with the matching callback */
		if (__improbable(flags & (KEVENT_FLAG_LEGACY32 | KEVENT_FLAG_LEGACY64))) {
			error = kqueue_scan(kq, flags, kectx, kevent_legacy_callback);
		} else {
			error = kqueue_scan(kq, flags, kectx, kevent_modern_callback);
		}
		break;
	case THREAD_TIMED_OUT:
		error = 0;
		break;
	case THREAD_INTERRUPTED:
		error = EINTR;
		break;
	case THREAD_RESTART:
		error = EBADF;
		break;
	default:
		panic("%s: - invalid wait_result (%d)", __func__, wait_result);
	}


	/* Release references, then return to user space (never returns here). */
	error = kevent_cleanup(kq, flags, error, kectx);
	*(int32_t *)&ut->uu_rval = kectx->kec_process_noutputs;
	unix_syscall_return(error);
}
7385 
7386 /*!
7387  * @function kqueue_scan
7388  *
7389  * @brief
7390  * Scan and wait for events in a kqueue (used by poll & kevent).
7391  *
7392  * @discussion
7393  * Process the triggered events in a kqueue.
7394  *
7395  * If there are no events triggered arrange to wait for them:
7396  * - unless KEVENT_FLAG_IMMEDIATE is set in kectx->kec_process_flags
7397  * - possibly until kectx->kec_deadline expires
7398  *
7399  * When it waits, and that neither KEVENT_FLAG_POLL nor KEVENT_FLAG_KERNEL
7400  * are set, then it will wait in the kqueue_scan_continue continuation.
7401  *
7402  * poll() will block in place, and KEVENT_FLAG_KERNEL calls
7403  * all pass KEVENT_FLAG_IMMEDIATE and will not wait.
7404  *
7405  * @param kqu
7406  * The kqueue being scanned.
7407  *
7408  * @param flags
7409  * The KEVENT_FLAG_* flags for this call.
7410  *
7411  * @param kectx
7412  * The context used for this scan.
7413  * The uthread_t::uu_save.uus_kevent storage is used for this purpose.
7414  *
7415  * @param callback
 * The callback to be called on events successfully processed.
7417  * (Either kevent_legacy_callback, kevent_modern_callback or poll_callback)
7418  */
int
kqueue_scan(kqueue_t kqu, int flags, kevent_ctx_t kectx,
    kevent_callback_t callback)
{
	int error;

	for (;;) {
		kqlock(kqu);
		error = kqueue_process(kqu, flags, kectx, callback);

		/*
		 * If we got an error, events returned (EWOULDBLOCK)
		 * or blocking was disallowed (KEVENT_FLAG_IMMEDIATE),
		 * just return.
		 */
		if (__probable(error || (flags & KEVENT_FLAG_IMMEDIATE))) {
			kqunlock(kqu);
			return error == EWOULDBLOCK ? 0 : error;
		}

		/* only kqfiles can block in place: workq/workloop never reach here */
		assert((kqu.kq->kq_state & (KQ_WORKQ | KQ_WORKLOOP)) == 0);

		/* Arm the wakeup: wait channel is the kqfile's kqf_count. */
		kqu.kqf->kqf_state |= KQ_SLEEP;
		assert_wait_deadline(&kqu.kqf->kqf_count, THREAD_ABORTSAFE,
		    kectx->kec_deadline);
		kqunlock(kqu);

		/* kevent variants resume in the continuation, not here */
		if (__probable((flags & (KEVENT_FLAG_POLL | KEVENT_FLAG_KERNEL)) == 0)) {
			thread_block_parameter(kqueue_scan_continue, kqu.kqf);
			__builtin_unreachable();
		}

		/* poll(): block in place and interpret the wakeup reason */
		wait_result_t wr = thread_block(THREAD_CONTINUE_NULL);
		switch (wr) {
		case THREAD_AWAKENED:
			break;
		case THREAD_TIMED_OUT:
			return 0;
		case THREAD_INTERRUPTED:
			return EINTR;
		case THREAD_RESTART:
			return EBADF;
		default:
			panic("%s: - bad wait_result (%d)", __func__, wr);
		}
	}
}
7466 
7467 /*!
7468  * @function kevent_internal
7469  *
7470  * @brief
7471  * Common kevent code.
7472  *
7473  * @discussion
7474  * Needs to be inlined to specialize for legacy or modern and
7475  * eliminate dead code.
7476  *
7477  * This is the core logic of kevent entry points, that will:
7478  * - register kevents
7479  * - optionally scan the kqueue for events
7480  *
7481  * The caller is giving kevent_internal a reference on the kqueue
7482  * or its fileproc that needs to be cleaned up by kevent_cleanup().
7483  */
OS_ALWAYS_INLINE
static inline int
kevent_internal(kqueue_t kqu,
    user_addr_t changelist, int nchanges,
    user_addr_t ueventlist, int nevents,
    int flags, kevent_ctx_t kectx, int32_t *retval,
    bool legacy)
{
	int error = 0, noutputs = 0, register_rc;

	/* only bound threads can receive events on workloops */
	if (!legacy && (flags & KEVENT_FLAG_WORKLOOP)) {
#if CONFIG_WORKLOOP_DEBUG
		UU_KEVENT_HISTORY_WRITE_ENTRY(current_uthread(), {
			.uu_kqid = kqu.kqwl->kqwl_dynamicid,
			.uu_kq = error ? NULL : kqu.kq,
			.uu_error = error,
			.uu_nchanges = nchanges,
			.uu_nevents = nevents,
			.uu_flags = flags,
		});
#endif // CONFIG_WORKLOOP_DEBUG

		if (flags & KEVENT_FLAG_KERNEL) {
			/* see kevent_workq_internal */
			error = copyout(&kqu.kqwl->kqwl_dynamicid,
			    ueventlist - sizeof(kqueue_id_t), sizeof(kqueue_id_t));
			kectx->kec_data_resid -= sizeof(kqueue_id_t);
			if (__improbable(error)) {
				goto out;
			}
		}

		if (kevent_args_requesting_events(flags, nevents)) {
			/*
			 * Disable the R2K notification while doing a register, if the
			 * caller wants events too, we don't want the AST to be set if we
			 * will process these events soon.
			 */
			kqlock(kqu);
			kqu.kq->kq_state &= ~KQ_R2K_ARMED;
			kqunlock(kqu);
			flags |= KEVENT_FLAG_NEEDS_END_PROCESSING;
		}
	}

	/* register all the change requests the user provided... */
	while (nchanges > 0 && error == 0) {
		struct kevent_qos_s kev;
		struct knote *kn = NULL;

		if (legacy) {
			error = kevent_legacy_copyin(&changelist, &kev, flags);
		} else {
			error = kevent_modern_copyin(&changelist, &kev);
		}
		if (error) {
			break;
		}

		register_rc = kevent_register(kqu.kq, &kev, &kn);
		if (__improbable(!legacy && (register_rc & FILTER_REGISTER_WAIT))) {
			/* kevent_register returned with the kqlock held */
			thread_t thread = current_thread();

			kqlock_held(kqu);

			if (act_clear_astkevent(thread, AST_KEVENT_REDRIVE_THREADREQ)) {
				workq_kern_threadreq_redrive(kqu.kq->kq_p, WORKQ_THREADREQ_NONE);
			}

			// f_post_register_wait is meant to call a continuation and not to
			// return, which is why we don't support FILTER_REGISTER_WAIT if
			// KEVENT_FLAG_ERROR_EVENTS is not passed, or if the event that
			// waits isn't the last.
			//
			// It is implementable, but not used by any userspace code at the
			// moment, so for now return ENOTSUP if someone tries to do it.
			if (nchanges == 1 && noutputs < nevents &&
			    (flags & KEVENT_FLAG_KERNEL) == 0 &&
			    (flags & KEVENT_FLAG_PARKING) == 0 &&
			    (flags & KEVENT_FLAG_ERROR_EVENTS) &&
			    (flags & KEVENT_FLAG_WORKLOOP)) {
				uthread_t ut = get_bsdthread_info(thread);

				/*
				 * store the continuation/completion data in the uthread
				 *
				 * Note: the kectx aliases with this,
				 * and is destroyed in the process.
				 */
				ut->uu_save.uus_kevent_register = (struct _kevent_register){
					.kev        = kev,
					.kqwl       = kqu.kqwl,
					.eventout   = noutputs,
					.ueventlist = ueventlist,
				};
				knote_fops(kn)->f_post_register_wait(ut, kn,
				    &ut->uu_save.uus_kevent_register);
				__builtin_unreachable();
			}
			kqunlock(kqu);

			kev.flags |= EV_ERROR;
			kev.data = ENOTSUP;
		} else {
			assert((register_rc & FILTER_REGISTER_WAIT) == 0);
		}

		// keep in sync with kevent_register_wait_return()
		if (noutputs < nevents && (kev.flags & (EV_ERROR | EV_RECEIPT))) {
			/* EV_RECEIPT reports success as EV_ERROR with data 0 */
			if ((kev.flags & EV_ERROR) == 0) {
				kev.flags |= EV_ERROR;
				kev.data = 0;
			}
			if (legacy) {
				error = kevent_legacy_copyout(&kev, &ueventlist, flags);
			} else {
				error = kevent_modern_copyout(&kev, &ueventlist);
			}
			if (error == 0) {
				noutputs++;
			}
		} else if (kev.flags & EV_ERROR) {
			/* no room to report it: fail the whole syscall instead */
			error = (int)kev.data;
		}
		nchanges--;
	}

	/* Scan for events only when the caller asked for them and none
	 * of the registrations produced output or errors. */
	if ((flags & KEVENT_FLAG_ERROR_EVENTS) == 0 &&
	    nevents > 0 && noutputs == 0 && error == 0) {
		kectx->kec_process_flags = flags;
		kectx->kec_process_nevents = nevents;
		kectx->kec_process_noutputs = 0;
		kectx->kec_process_eventlist = ueventlist;

		if (legacy) {
			error = kqueue_scan(kqu.kq, flags, kectx, kevent_legacy_callback);
		} else {
			error = kqueue_scan(kqu.kq, flags, kectx, kevent_modern_callback);
		}

		noutputs = kectx->kec_process_noutputs;
	} else if (!legacy && (flags & KEVENT_FLAG_NEEDS_END_PROCESSING)) {
		/*
		 * If we didn't go through kqworkloop_end_processing(),
		 * we need to do it here.
		 *
		 * kqueue_scan will call kqworkloop_end_processing(),
		 * so we only need to do it if we didn't scan.
		 */
		kqlock(kqu);
		kqworkloop_end_processing(kqu.kqwl, 0, 0);
		kqunlock(kqu);
	}

	*retval = noutputs;
out:
	return kevent_cleanup(kqu.kq, flags, error, kectx);
}
7643 
7644 #pragma mark modern syscalls: kevent_qos, kevent_id, kevent_workq_internal
7645 
7646 /*!
7647  * @function kevent_modern_internal
7648  *
7649  * @brief
7650  * The backend of the kevent_id and kevent_workq_internal entry points.
7651  *
7652  * @discussion
7653  * Needs to be inline due to the number of arguments.
7654  */
OS_NOINLINE
static int
kevent_modern_internal(kqueue_t kqu,
    user_addr_t changelist, int nchanges,
    user_addr_t ueventlist, int nevents,
    int flags, kevent_ctx_t kectx, int32_t *retval)
{
	/* Specialize kevent_internal() for the modern (kevent_qos_s) ABI. */
	return kevent_internal(kqu.kq, changelist, nchanges,
	           ueventlist, nevents, flags, kectx, retval, /*legacy*/ false);
}
7665 
/*!
 * @function kevent_id
 *
 * @brief
 * The kevent_id() syscall.
 *
 * @discussion
 * Operates on a dynamic (workloop) kqueue identified by uap->id.
 * The flags must select KEVENT_FLAG_WORKLOOP and not KEVENT_FLAG_WORKQ.
 */
int
kevent_id(struct proc *p, struct kevent_id_args *uap, int32_t *retval)
{
	int error, flags = uap->flags & KEVENT_FLAG_USER;
	uthread_t uth = current_uthread();
	workq_threadreq_t kqr = uth->uu_kqr_bound;
	kevent_ctx_t kectx = &uth->uu_save.uus_kevent;
	kqueue_t kqu;

	flags = kevent_adjust_flags_for_proc(p, flags);
	flags |= KEVENT_FLAG_DYNAMIC_KQUEUE;

	/* kevent_id is workloop-only: exactly WORKLOOP, never WORKQ */
	if (__improbable((flags & (KEVENT_FLAG_WORKQ | KEVENT_FLAG_WORKLOOP)) !=
	    KEVENT_FLAG_WORKLOOP)) {
		return EINVAL;
	}

	error = kevent_get_data_size(flags, uap->data_available, uap->data_out, kectx);
	if (__improbable(error)) {
		return error;
	}

	kectx->kec_deadline = 0;
	kectx->kec_fp       = NULL;
	kectx->kec_fd       = -1;
	/* the kec_process_* fields are filled only if kqueue_scan is called */

	/*
	 * Get the kq we are going to be working on.
	 * As a fastpath, look at the currently bound workloop.
	 */
	kqu.kqwl = kqr ? kqr_kqworkloop(kqr) : NULL;
	if (kqu.kqwl && kqu.kqwl->kqwl_dynamicid == uap->id) {
		if (__improbable(flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST)) {
			return EEXIST;
		}
		/* reference presumably released via kevent_cleanup() — see kevent_internal */
		kqworkloop_retain(kqu.kqwl);
	} else if (__improbable(kevent_args_requesting_events(flags, uap->nevents))) {
		/* not bound to this workloop: refuse requests that would wait for events */
		return EXDEV;
	} else {
		error = kqworkloop_get_or_create(p, uap->id, NULL, flags, &kqu.kqwl);
		if (__improbable(error)) {
			return error;
		}
	}

	return kevent_modern_internal(kqu, uap->changelist, uap->nchanges,
	           uap->eventlist, uap->nevents, flags, kectx, retval);
}
7721 
/*!
7723  * @function kevent_workq_internal
7724  *
7725  * @discussion
7726  * This function is exported for the sake of the workqueue subsystem.
7727  *
7728  * It is called in two ways:
7729  * - when a thread is about to go to userspace to ask for pending event
7730  * - when a thread is returning from userspace with events back
7731  *
7732  * the workqueue subsystem will only use the following flags:
7733  * - KEVENT_FLAG_STACK_DATA (always)
7734  * - KEVENT_FLAG_IMMEDIATE (always)
7735  * - KEVENT_FLAG_PARKING (depending on whether it is going to or returning from
7736  *   userspace).
7737  *
7738  * It implicitly acts on the bound kqueue, and for the case of workloops
7739  * will copyout the kqueue ID before anything else.
7740  *
7741  *
7742  * Pthread will have setup the various arguments to fit this stack layout:
7743  *
7744  * +-------....----+--------------+-----------+--------------------+
7745  * |  user stack   |  data avail  |  nevents  |   pthread_self()   |
7746  * +-------....----+--------------+-----------+--------------------+
7747  *                 ^              ^
7748  *             data_out       eventlist
7749  *
7750  * When a workloop is used, the workloop ID is copied out right before
7751  * the eventlist and is taken from the data buffer.
7752  *
7753  * @warning
 * This function is carefully tailored to not make any call except the final tail
7755  * call into kevent_modern_internal. (LTO inlines current_uthread()).
7756  *
7757  * This function is performance sensitive due to the workq subsystem.
7758  */
int
kevent_workq_internal(struct proc *p,
    user_addr_t changelist, int nchanges,
    user_addr_t eventlist, int nevents,
    user_addr_t data_out, user_size_t *data_available,
    unsigned int flags, int32_t *retval)
{
	uthread_t uth = current_uthread();
	workq_threadreq_t kqr = uth->uu_kqr_bound;
	kevent_ctx_t kectx = &uth->uu_save.uus_kevent;
	kqueue_t kqu;

	/* only the two flag combinations documented above are valid */
	assert(flags == (KEVENT_FLAG_STACK_DATA | KEVENT_FLAG_IMMEDIATE) ||
	    flags == (KEVENT_FLAG_STACK_DATA | KEVENT_FLAG_IMMEDIATE | KEVENT_FLAG_PARKING));

	kectx->kec_data_out   = data_out;
	kectx->kec_data_avail = (uint64_t)data_available;
	kectx->kec_data_size  = *data_available;
	kectx->kec_data_resid = *data_available;
	kectx->kec_deadline   = 0;
	kectx->kec_fp         = NULL;
	kectx->kec_fd         = -1;
	/* the kec_process_* fields are filled only if kqueue_scan is called */

	flags = kevent_adjust_flags_for_proc(p, flags);

	if (kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
		/* bound to a workloop: recover the kqworkloop from its embedded request */
		kqu.kqwl = __container_of(kqr, struct kqworkloop, kqwl_request);
		kqworkloop_retain(kqu.kqwl);

		flags |= KEVENT_FLAG_WORKLOOP | KEVENT_FLAG_DYNAMIC_KQUEUE |
		    KEVENT_FLAG_KERNEL;
	} else {
		/* bound to the per-process workq kqueue */
		kqu.kqwq = p->p_fd.fd_wqkqueue;

		flags |= KEVENT_FLAG_WORKQ | KEVENT_FLAG_KERNEL;
	}

	/* must remain a tail call — see the @warning in the comment above */
	return kevent_modern_internal(kqu, changelist, nchanges,
	           eventlist, nevents, flags, kectx, retval);
}
7800 
/*!
 * @function kevent_qos
 *
 * @brief
 * The kevent_qos() syscall.
 *
 * @discussion
 * Modern kevent entry point acting either on the process workq kqueue
 * (KEVENT_FLAG_WORKQ) or on a kqueue file descriptor (uap->fd).
 */
int
kevent_qos(struct proc *p, struct kevent_qos_args *uap, int32_t *retval)
{
	uthread_t uth = current_uthread();
	kevent_ctx_t kectx = &uth->uu_save.uus_kevent;
	int error, flags = uap->flags & KEVENT_FLAG_USER;
	struct kqueue *kq;

	/* flags reserved for kevent_id() are invalid here */
	if (__improbable(flags & KEVENT_ID_FLAG_USER)) {
		return EINVAL;
	}

	flags = kevent_adjust_flags_for_proc(p, flags);

	error = kevent_get_data_size(flags, uap->data_available, uap->data_out, kectx);
	if (__improbable(error)) {
		return error;
	}

	kectx->kec_deadline = 0;
	kectx->kec_fp       = NULL;
	kectx->kec_fd       = uap->fd;
	/* the kec_process_* fields are filled only if kqueue_scan is called */

	/* get the kq we are going to be working on */
	if (__probable(flags & KEVENT_FLAG_WORKQ)) {
		error = kevent_get_kqwq(p, flags, uap->nevents, &kq);
	} else {
		error = kevent_get_kqfile(p, uap->fd, flags, &kectx->kec_fp, &kq);
	}
	if (__improbable(error)) {
		return error;
	}

	return kevent_modern_internal(kq, uap->changelist, uap->nchanges,
	           uap->eventlist, uap->nevents, flags, kectx, retval);
}
7844 
7845 #pragma mark legacy syscalls: kevent, kevent64
7846 
7847 /*!
7848  * @function kevent_legacy_get_deadline
7849  *
7850  * @brief
7851  * Compute the deadline for the legacy kevent syscalls.
7852  *
7853  * @discussion
7854  * This is not necessary if KEVENT_FLAG_IMMEDIATE is specified,
7855  * as this takes precedence over the deadline.
7856  *
7857  * This function will fail if utimeout is USER_ADDR_NULL
7858  * (the caller should check).
7859  */
7860 static int
kevent_legacy_get_deadline(int flags,user_addr_t utimeout,uint64_t * deadline)7861 kevent_legacy_get_deadline(int flags, user_addr_t utimeout, uint64_t *deadline)
7862 {
7863 	struct timespec ts;
7864 
7865 	if (flags & KEVENT_FLAG_PROC64) {
7866 		struct user64_timespec ts64;
7867 		int error = copyin(utimeout, &ts64, sizeof(ts64));
7868 		if (__improbable(error)) {
7869 			return error;
7870 		}
7871 		ts.tv_sec = (unsigned long)ts64.tv_sec;
7872 		ts.tv_nsec = (long)ts64.tv_nsec;
7873 	} else {
7874 		struct user32_timespec ts32;
7875 		int error = copyin(utimeout, &ts32, sizeof(ts32));
7876 		if (__improbable(error)) {
7877 			return error;
7878 		}
7879 		ts.tv_sec = ts32.tv_sec;
7880 		ts.tv_nsec = ts32.tv_nsec;
7881 	}
7882 	if (!timespec_is_valid(&ts)) {
7883 		return EINVAL;
7884 	}
7885 
7886 	clock_absolutetime_interval_to_deadline(tstoabstime(&ts), deadline);
7887 	return 0;
7888 }
7889 
/*!
 * @function kevent_legacy_internal
 *
 * @brief
 * The core implementation for kevent and kevent64
 *
 * @discussion
 * Initializes the per-thread kevent context (legacy syscalls carry no
 * out-of-band data buffer), converts the optional user timeout into an
 * absolute deadline, resolves the target kqueue, and funnels into
 * kevent_internal() with legacy == true.
 */
OS_NOINLINE
static int
kevent_legacy_internal(struct proc *p, struct kevent64_args *uap,
    int32_t *retval, int flags)
{
	uthread_t uth = current_uthread();
	kevent_ctx_t kectx = &uth->uu_save.uus_kevent;
	struct kqueue *kq;
	int error;

	/* flags reserved for kevent_id() are invalid here */
	if (__improbable(uap->flags & KEVENT_ID_FLAG_USER)) {
		return EINVAL;
	}

	flags = kevent_adjust_flags_for_proc(p, flags);

	/* legacy syscalls have no out-of-band data buffer */
	kectx->kec_data_out   = 0;
	kectx->kec_data_avail = 0;
	kectx->kec_data_size  = 0;
	kectx->kec_data_resid = 0;
	kectx->kec_deadline   = 0;
	kectx->kec_fp         = NULL;
	kectx->kec_fd         = uap->fd;
	/* the kec_process_* fields are filled only if kqueue_scan is called */

	/* convert timeout to absolute - if we have one (and not immediate) */
	if (__improbable(uap->timeout && !(flags & KEVENT_FLAG_IMMEDIATE))) {
		error = kevent_legacy_get_deadline(flags, uap->timeout,
		    &kectx->kec_deadline);
		if (__improbable(error)) {
			return error;
		}
	}

	/* get the kq we are going to be working on */
	if (flags & KEVENT_FLAG_WORKQ) {
		error = kevent_get_kqwq(p, flags, uap->nevents, &kq);
	} else {
		error = kevent_get_kqfile(p, uap->fd, flags, &kectx->kec_fp, &kq);
	}
	if (__improbable(error)) {
		return error;
	}

	return kevent_internal(kq, uap->changelist, uap->nchanges,
	           uap->eventlist, uap->nevents, flags, kectx, retval,
	           /*legacy*/ true);
}
7944 
7945 /*!
7946  * @function kevent
7947  *
7948  * @brief
7949  * The legacy kevent() syscall.
7950  */
7951 int
kevent(struct proc * p,struct kevent_args * uap,int32_t * retval)7952 kevent(struct proc *p, struct kevent_args *uap, int32_t *retval)
7953 {
7954 	struct kevent64_args args = {
7955 		.fd         = uap->fd,
7956 		.changelist = uap->changelist,
7957 		.nchanges   = uap->nchanges,
7958 		.eventlist  = uap->eventlist,
7959 		.nevents    = uap->nevents,
7960 		.timeout    = uap->timeout,
7961 	};
7962 
7963 	return kevent_legacy_internal(p, &args, retval, KEVENT_FLAG_LEGACY32);
7964 }
7965 
/*!
 * @function kevent64
 *
 * @brief
 * The legacy kevent64() syscall.
 */
int
kevent64(struct proc *p, struct kevent64_args *uap, int32_t *retval)
{
	/* keep only user-settable flags and mark the 64-bit legacy ABI */
	int flags = (uap->flags & KEVENT_FLAG_USER) | KEVENT_FLAG_LEGACY64;
	return kevent_legacy_internal(p, uap, retval, flags);
}
7978 
7979 #pragma mark - socket interface
7980 
7981 #if SOCKETS
7982 #include <sys/param.h>
7983 #include <sys/socket.h>
7984 #include <sys/protosw.h>
7985 #include <sys/domain.h>
7986 #include <sys/mbuf.h>
7987 #include <sys/kern_event.h>
7988 #include <sys/malloc.h>
7989 #include <sys/sys_domain.h>
7990 #include <sys/syslog.h>
7991 
7992 #ifndef ROUNDUP64
7993 #define ROUNDUP64(x) P2ROUNDUP((x), sizeof (u_int64_t))
7994 #endif
7995 
7996 #ifndef ADVANCE64
7997 #define ADVANCE64(p, n) (void*)((char *)(p) + ROUNDUP64(n))
7998 #endif
7999 
8000 static LCK_GRP_DECLARE(kev_lck_grp, "Kernel Event Protocol");
8001 static LCK_RW_DECLARE(kev_rwlock, &kev_lck_grp);
8002 
8003 static int kev_attach(struct socket *so, int proto, struct proc *p);
8004 static int kev_detach(struct socket *so);
8005 static int kev_control(struct socket *so, u_long cmd, caddr_t data,
8006     struct ifnet *ifp, struct proc *p);
8007 static lck_mtx_t * event_getlock(struct socket *, int);
8008 static int event_lock(struct socket *, int, void *);
8009 static int event_unlock(struct socket *, int, void *);
8010 
8011 static int event_sofreelastref(struct socket *);
8012 static void kev_delete(struct kern_event_pcb *);
8013 
/* user-request vector for SYSPROTO_EVENT sockets */
static struct pr_usrreqs event_usrreqs = {
	.pru_attach =           kev_attach,
	.pru_control =          kev_control,
	.pru_detach =           kev_detach,
	.pru_soreceive =        soreceive,
};
8020 
/*
 * Protocol switch for the kernel event protocol: a raw socket type
 * backed by the event_* locking callbacks and the kev_* user requests.
 */
static struct protosw eventsw[] = {
	{
		.pr_type =              SOCK_RAW,
		.pr_protocol =          SYSPROTO_EVENT,
		.pr_flags =             PR_ATOMIC,
		.pr_usrreqs =           &event_usrreqs,
		.pr_lock =              event_lock,
		.pr_unlock =            event_unlock,
		.pr_getlock =           event_getlock,
	}
};
8032 
8033 __private_extern__ int kevt_getstat SYSCTL_HANDLER_ARGS;
8034 __private_extern__ int kevt_pcblist SYSCTL_HANDLER_ARGS;
8035 
8036 SYSCTL_NODE(_net_systm, OID_AUTO, kevt,
8037     CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Kernel event family");
8038 
8039 struct kevtstat kevtstat;
8040 SYSCTL_PROC(_net_systm_kevt, OID_AUTO, stats,
8041     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
8042     kevt_getstat, "S,kevtstat", "");
8043 
8044 SYSCTL_PROC(_net_systm_kevt, OID_AUTO, pcblist,
8045     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
8046     kevt_pcblist, "S,xkevtpcb", "");
8047 
/*
 * pr_getlock callback: return the per-PCB mutex of a kernel event socket.
 * Panics if the socket has no PCB or a negative use count.
 */
static lck_mtx_t *
event_getlock(struct socket *so, int flags)
{
#pragma unused(flags)
	struct kern_event_pcb *ev_pcb = (struct kern_event_pcb *)so->so_pcb;

	if (so->so_pcb != NULL) {
		if (so->so_usecount < 0) {
			panic("%s: so=%p usecount=%d lrh= %s", __func__,
			    so, so->so_usecount, solockhistory_nr(so));
			/* NOTREACHED */
		}
	} else {
		panic("%s: so=%p NULL NO so_pcb %s", __func__,
		    so, solockhistory_nr(so));
		/* NOTREACHED */
	}
	return &ev_pcb->evp_mtx;
}
8067 
/*
 * pr_lock callback: take the per-PCB mutex, optionally bump the socket
 * use count, and record the caller's return address for lock debugging.
 */
static int
event_lock(struct socket *so, int refcount, void *lr)
{
	void *lr_saved;

	/* record who locked us, for the lock_lr debugging history */
	if (lr == NULL) {
		lr_saved = __builtin_return_address(0);
	} else {
		lr_saved = lr;
	}

	if (so->so_pcb != NULL) {
		lck_mtx_lock(&((struct kern_event_pcb *)so->so_pcb)->evp_mtx);
	} else {
		panic("%s: so=%p NO PCB! lr=%p lrh= %s", __func__,
		    so, lr_saved, solockhistory_nr(so));
		/* NOTREACHED */
	}

	if (so->so_usecount < 0) {
		panic("%s: so=%p so_pcb=%p lr=%p ref=%d lrh= %s", __func__,
		    so, so->so_pcb, lr_saved, so->so_usecount,
		    solockhistory_nr(so));
		/* NOTREACHED */
	}

	if (refcount) {
		so->so_usecount++;
	}

	so->lock_lr[so->next_lock_lr] = lr_saved;
	so->next_lock_lr = (so->next_lock_lr + 1) % SO_LCKDBG_MAX;
	return 0;
}
8102 
/*
 * pr_unlock callback: drop a use count if requested, record the unlock
 * site, and either release the per-PCB mutex or — on the last reference
 * with SOF_PCBCLEARING set — tear the socket down via
 * event_sofreelastref() (which releases the mutex itself).
 */
static int
event_unlock(struct socket *so, int refcount, void *lr)
{
	void *lr_saved;
	lck_mtx_t *mutex_held;

	if (lr == NULL) {
		lr_saved = __builtin_return_address(0);
	} else {
		lr_saved = lr;
	}

	if (refcount) {
		so->so_usecount--;
	}
	if (so->so_usecount < 0) {
		panic("%s: so=%p usecount=%d lrh= %s", __func__,
		    so, so->so_usecount, solockhistory_nr(so));
		/* NOTREACHED */
	}
	if (so->so_pcb == NULL) {
		panic("%s: so=%p NO PCB usecount=%d lr=%p lrh= %s", __func__,
		    so, so->so_usecount, (void *)lr_saved,
		    solockhistory_nr(so));
		/* NOTREACHED */
	}
	mutex_held = (&((struct kern_event_pcb *)so->so_pcb)->evp_mtx);

	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
	so->unlock_lr[so->next_unlock_lr] = lr_saved;
	so->next_unlock_lr = (so->next_unlock_lr + 1) % SO_LCKDBG_MAX;

	if (so->so_usecount == 0) {
		VERIFY(so->so_flags & SOF_PCBCLEARING);
		event_sofreelastref(so);
	} else {
		lck_mtx_unlock(mutex_held);
	}

	return 0;
}
8144 
/*
 * Free the last reference of a kernel event socket: detach the PCB from
 * the socket, unlink it from the global list (under kev_rwlock), destroy
 * it, and release the socket itself.  Called with the PCB mutex held;
 * drops it before taking kev_rwlock to respect lock ordering.
 */
static int
event_sofreelastref(struct socket *so)
{
	struct kern_event_pcb *ev_pcb = (struct kern_event_pcb *)so->so_pcb;

	LCK_MTX_ASSERT(&(ev_pcb->evp_mtx), LCK_MTX_ASSERT_OWNED);

	so->so_pcb = NULL;

	/*
	 * Disable upcall in the event another thread is in kev_post_msg()
	 * appending record to the receive socket buffer, since sbwakeup()
	 * may release the socket lock otherwise.
	 */
	so->so_rcv.sb_flags &= ~SB_UPCALL;
	so->so_snd.sb_flags &= ~SB_UPCALL;
	so->so_event = sonullevent;
	lck_mtx_unlock(&(ev_pcb->evp_mtx));

	LCK_MTX_ASSERT(&(ev_pcb->evp_mtx), LCK_MTX_ASSERT_NOTOWNED);
	lck_rw_lock_exclusive(&kev_rwlock);
	LIST_REMOVE(ev_pcb, evp_link);
	kevtstat.kes_pcbcount--;
	kevtstat.kes_gencnt++;
	lck_rw_done(&kev_rwlock);
	kev_delete(ev_pcb);

	sofreelastref(so, 1);
	return 0;
}
8175 
/* number of entries in eventsw[] registered with the system domain */
static int event_proto_count = (sizeof(eventsw) / sizeof(struct protosw));

/* global list of kernel event PCBs, protected by kev_rwlock */
static
struct kern_event_head kern_event_head;

/* id stamped on each posted message; readable via SIOCGKEVID */
static u_int32_t static_event_id = 0;

/* typed allocation zone for struct kern_event_pcb */
static KALLOC_TYPE_DEFINE(ev_pcb_zone, struct kern_event_pcb, NET_KT_DEFAULT);
8184 
8185 /*
8186  * Install the protosw's for the NKE manager.  Invoked at extension load time
8187  */
8188 void
kern_event_init(struct domain * dp)8189 kern_event_init(struct domain *dp)
8190 {
8191 	struct protosw *pr;
8192 	int i;
8193 
8194 	VERIFY(!(dp->dom_flags & DOM_INITIALIZED));
8195 	VERIFY(dp == systemdomain);
8196 
8197 	for (i = 0, pr = &eventsw[0]; i < event_proto_count; i++, pr++) {
8198 		net_add_proto(pr, dp, 1);
8199 	}
8200 }
8201 
/*
 * pru_attach: set up a new kernel event socket.  Reserves socket buffer
 * space, allocates and initializes a zeroed PCB, and links it on the
 * global kern_event_head list under kev_rwlock.
 */
static int
kev_attach(struct socket *so, __unused int proto, __unused struct proc *p)
{
	int error = 0;
	struct kern_event_pcb *ev_pcb;

	error = soreserve(so, KEV_SNDSPACE, KEV_RECVSPACE);
	if (error != 0) {
		return error;
	}

	ev_pcb = zalloc_flags(ev_pcb_zone, Z_WAITOK | Z_ZERO);
	lck_mtx_init(&ev_pcb->evp_mtx, &kev_lck_grp, LCK_ATTR_NULL);

	ev_pcb->evp_socket = so;
	/* 0xffffffff presumably equals KEV_ANY_VENDOR (match-all) — see kev_post_msg_internal */
	ev_pcb->evp_vendor_code_filter = 0xffffffff;

	so->so_pcb = (caddr_t) ev_pcb;
	lck_rw_lock_exclusive(&kev_rwlock);
	LIST_INSERT_HEAD(&kern_event_head, ev_pcb, evp_link);
	kevtstat.kes_pcbcount++;
	kevtstat.kes_gencnt++;
	lck_rw_done(&kev_rwlock);

	return error;
}
8228 
/*
 * Final teardown of a kernel event PCB after it has been unlinked:
 * destroy its mutex and return it to the zone.
 */
static void
kev_delete(struct kern_event_pcb *ev_pcb)
{
	VERIFY(ev_pcb != NULL);
	lck_mtx_destroy(&ev_pcb->evp_mtx, &kev_lck_grp);
	zfree(ev_pcb_zone, ev_pcb);
}
8236 
8237 static int
kev_detach(struct socket * so)8238 kev_detach(struct socket *so)
8239 {
8240 	struct kern_event_pcb *ev_pcb = (struct kern_event_pcb *) so->so_pcb;
8241 
8242 	if (ev_pcb != NULL) {
8243 		soisdisconnected(so);
8244 		so->so_flags |= SOF_PCBCLEARING;
8245 	}
8246 
8247 	return 0;
8248 }
8249 
8250 /*
8251  * For now, kev_vendor_code and mbuf_tags use the same
8252  * mechanism.
8253  */
8254 errno_t
kev_vendor_code_find(const char * string,u_int32_t * out_vendor_code)8255 kev_vendor_code_find(
8256 	const char      *string,
8257 	u_int32_t       *out_vendor_code)
8258 {
8259 	if (strlen(string) >= KEV_VENDOR_CODE_MAX_STR_LEN) {
8260 		return EINVAL;
8261 	}
8262 	return net_str_id_find_internal(string, out_vendor_code,
8263 	           NSI_VENDOR_CODE, 1);
8264 }
8265 
8266 errno_t
kev_msg_post(struct kev_msg * event_msg)8267 kev_msg_post(struct kev_msg *event_msg)
8268 {
8269 	mbuf_tag_id_t min_vendor, max_vendor;
8270 
8271 	net_str_id_first_last(&min_vendor, &max_vendor, NSI_VENDOR_CODE);
8272 
8273 	if (event_msg == NULL) {
8274 		return EINVAL;
8275 	}
8276 
8277 	/*
8278 	 * Limit third parties to posting events for registered vendor codes
8279 	 * only
8280 	 */
8281 	if (event_msg->vendor_code < min_vendor ||
8282 	    event_msg->vendor_code > max_vendor) {
8283 		os_atomic_inc(&kevtstat.kes_badvendor, relaxed);
8284 		return EINVAL;
8285 	}
8286 	return kev_post_msg(event_msg);
8287 }
8288 
/*
 * Build a kern_event_msg from the caller's kev_msg (header plus up to
 * five data vectors) in a single mbuf and deliver a copy of it to every
 * kernel event socket whose vendor/class/subclass filters match.
 *
 * `wait` is M_WAIT or M_NOWAIT and controls mbuf allocation blocking.
 * Returns EMSGSIZE if the message does not fit in one plain mbuf, ENOMEM
 * on allocation failure, 0 otherwise.
 */
static int
kev_post_msg_internal(struct kev_msg *event_msg, int wait)
{
	struct mbuf *m, *m2;
	struct kern_event_pcb *ev_pcb;
	struct kern_event_msg *ev;
	char *tmp;
	u_int32_t total_size;
	int i;

#if SKYWALK && defined(XNU_TARGET_OS_OSX)
	/*
	 * Special hook for ALF state updates
	 */
	if (event_msg->vendor_code == KEV_VENDOR_APPLE &&
	    event_msg->kev_class == KEV_NKE_CLASS &&
	    event_msg->kev_subclass == KEV_NKE_ALF_SUBCLASS &&
	    event_msg->event_code == KEV_NKE_ALF_STATE_CHANGED) {
#if (DEBUG || DEVELOPMENT)
		os_log_info(OS_LOG_DEFAULT, "KEV_NKE_ALF_STATE_CHANGED posted");
#endif /* DEBUG || DEVELOPMENT */
		net_filter_event_mark(NET_FILTER_EVENT_ALF,
		    net_check_compatible_alf());
	}
#endif /* SKYWALK && XNU_TARGET_OS_OSX */

	/* Verify the message is small enough to fit in one mbuf w/o cluster */
	total_size = KEV_MSG_HEADER_SIZE;

	/* NOTE(review): this u_int32_t sum is not overflow-checked; callers are
	 * kernel components, but huge data_length values could wrap — confirm. */
	for (i = 0; i < 5; i++) {
		if (event_msg->dv[i].data_length == 0) {
			break;
		}
		total_size += event_msg->dv[i].data_length;
	}

	if (total_size > MLEN) {
		os_atomic_inc(&kevtstat.kes_toobig, relaxed);
		return EMSGSIZE;
	}

	m = m_get(wait, MT_DATA);
	if (m == 0) {
		os_atomic_inc(&kevtstat.kes_nomem, relaxed);
		return ENOMEM;
	}
	ev = mtod(m, struct kern_event_msg *);
	total_size = KEV_MSG_HEADER_SIZE;

	/* copy the data vectors back-to-back after the header */
	tmp = (char *) &ev->event_data[0];
	for (i = 0; i < 5; i++) {
		if (event_msg->dv[i].data_length == 0) {
			break;
		}

		total_size += event_msg->dv[i].data_length;
		bcopy(event_msg->dv[i].data_ptr, tmp,
		    event_msg->dv[i].data_length);
		tmp += event_msg->dv[i].data_length;
	}

	/* NOTE(review): incremented under the *shared* kev_rwlock below is not
	 * taken yet here; concurrent posters may race on this id — confirm. */
	ev->id = ++static_event_id;
	ev->total_size   = total_size;
	ev->vendor_code  = event_msg->vendor_code;
	ev->kev_class    = event_msg->kev_class;
	ev->kev_subclass = event_msg->kev_subclass;
	ev->event_code   = event_msg->event_code;

	m->m_len = total_size;
	/* walk every event PCB and deliver a copy to each matching socket */
	lck_rw_lock_shared(&kev_rwlock);
	for (ev_pcb = LIST_FIRST(&kern_event_head);
	    ev_pcb;
	    ev_pcb = LIST_NEXT(ev_pcb, evp_link)) {
		lck_mtx_lock(&ev_pcb->evp_mtx);
		if (ev_pcb->evp_socket->so_pcb == NULL) {
			/* socket is being torn down, skip it */
			lck_mtx_unlock(&ev_pcb->evp_mtx);
			continue;
		}
		/* vendor, then class, then subclass filters — each may be a wildcard */
		if (ev_pcb->evp_vendor_code_filter != KEV_ANY_VENDOR) {
			if (ev_pcb->evp_vendor_code_filter != ev->vendor_code) {
				lck_mtx_unlock(&ev_pcb->evp_mtx);
				continue;
			}

			if (ev_pcb->evp_class_filter != KEV_ANY_CLASS) {
				if (ev_pcb->evp_class_filter != ev->kev_class) {
					lck_mtx_unlock(&ev_pcb->evp_mtx);
					continue;
				}

				if ((ev_pcb->evp_subclass_filter !=
				    KEV_ANY_SUBCLASS) &&
				    (ev_pcb->evp_subclass_filter !=
				    ev->kev_subclass)) {
					lck_mtx_unlock(&ev_pcb->evp_mtx);
					continue;
				}
			}
		}

		m2 = m_copym(m, 0, m->m_len, wait);
		if (m2 == 0) {
			os_atomic_inc(&kevtstat.kes_nomem, relaxed);
			m_free(m);
			lck_mtx_unlock(&ev_pcb->evp_mtx);
			lck_rw_done(&kev_rwlock);
			return ENOMEM;
		}
		if (sbappendrecord(&ev_pcb->evp_socket->so_rcv, m2)) {
			/*
			 * We use "m" for the socket stats as it would be
			 * unsafe to use "m2"
			 */
			so_inc_recv_data_stat(ev_pcb->evp_socket,
			    1, m->m_len, MBUF_TC_BE);

			sorwakeup(ev_pcb->evp_socket);
			os_atomic_inc(&kevtstat.kes_posted, relaxed);
		} else {
			/* receive buffer full: the event is dropped for this socket */
			os_atomic_inc(&kevtstat.kes_fullsock, relaxed);
		}
		lck_mtx_unlock(&ev_pcb->evp_mtx);
	}
	/* the template mbuf is ours to free; copies now belong to the sockets */
	m_free(m);
	lck_rw_done(&kev_rwlock);

	return 0;
}
8417 
/* Post a kernel event, blocking for mbuf allocation if necessary. */
int
kev_post_msg(struct kev_msg *event_msg)
{
	return kev_post_msg_internal(event_msg, M_WAIT);
}
8423 
/* Post a kernel event without blocking; may fail with ENOMEM. */
int
kev_post_msg_nowait(struct kev_msg *event_msg)
{
	return kev_post_msg_internal(event_msg, M_NOWAIT);
}
8429 
/*
 * pru_control: ioctl handler for kernel event sockets.
 * Supports reading the last event id, getting/setting the PCB's
 * vendor/class/subclass filters, and vendor-code lookup by name.
 */
static int
kev_control(struct socket *so,
    u_long cmd,
    caddr_t data,
    __unused struct ifnet *ifp,
    __unused struct proc *p)
{
	struct kev_request *kev_req = (struct kev_request *) data;
	struct kern_event_pcb  *ev_pcb;
	struct kev_vendor_code *kev_vendor;
	u_int32_t  *id_value = (u_int32_t *) data;

	switch (cmd) {
	case SIOCGKEVID:
		/* last id stamped by kev_post_msg_internal() */
		*id_value = static_event_id;
		break;
	case SIOCSKEVFILT:
		/* install new delivery filters for this socket */
		ev_pcb = (struct kern_event_pcb *) so->so_pcb;
		ev_pcb->evp_vendor_code_filter = kev_req->vendor_code;
		ev_pcb->evp_class_filter = kev_req->kev_class;
		ev_pcb->evp_subclass_filter  = kev_req->kev_subclass;
		break;
	case SIOCGKEVFILT:
		/* report the currently installed filters */
		ev_pcb = (struct kern_event_pcb *) so->so_pcb;
		kev_req->vendor_code = ev_pcb->evp_vendor_code_filter;
		kev_req->kev_class   = ev_pcb->evp_class_filter;
		kev_req->kev_subclass = ev_pcb->evp_subclass_filter;
		break;
	case SIOCGKEVVENDOR:
		kev_vendor = (struct kev_vendor_code *)data;
		/* Make sure string is NULL terminated */
		kev_vendor->vendor_string[KEV_VENDOR_CODE_MAX_STR_LEN - 1] = 0;
		return net_str_id_find_internal(kev_vendor->vendor_string,
		           &kev_vendor->vendor_code, NSI_VENDOR_CODE, 0);
	default:
		return ENOTSUP;
	}

	return 0;
}
8470 
8471 int
8472 kevt_getstat SYSCTL_HANDLER_ARGS
8473 {
8474 #pragma unused(oidp, arg1, arg2)
8475 	int error = 0;
8476 
8477 	lck_rw_lock_shared(&kev_rwlock);
8478 
8479 	if (req->newptr != USER_ADDR_NULL) {
8480 		error = EPERM;
8481 		goto done;
8482 	}
8483 	if (req->oldptr == USER_ADDR_NULL) {
8484 		req->oldidx = sizeof(struct kevtstat);
8485 		goto done;
8486 	}
8487 
8488 	error = SYSCTL_OUT(req, &kevtstat,
8489 	    MIN(sizeof(struct kevtstat), req->oldlen));
8490 done:
8491 	lck_rw_done(&kev_rwlock);
8492 
8493 	return error;
8494 }
8495 
8496 __private_extern__ int
8497 kevt_pcblist SYSCTL_HANDLER_ARGS
8498 {
8499 #pragma unused(oidp, arg1, arg2)
8500 	int error = 0;
8501 	uint64_t n, i;
8502 	struct xsystmgen xsg;
8503 	void *buf = NULL;
8504 	size_t item_size = ROUNDUP64(sizeof(struct xkevtpcb)) +
8505 	    ROUNDUP64(sizeof(struct xsocket_n)) +
8506 	    2 * ROUNDUP64(sizeof(struct xsockbuf_n)) +
8507 	    ROUNDUP64(sizeof(struct xsockstat_n));
8508 	struct kern_event_pcb  *ev_pcb;
8509 
8510 	buf = kalloc_data(item_size, Z_WAITOK | Z_ZERO);
8511 	if (buf == NULL) {
8512 		return ENOMEM;
8513 	}
8514 
8515 	lck_rw_lock_shared(&kev_rwlock);
8516 
8517 	n = kevtstat.kes_pcbcount;
8518 
8519 	if (req->oldptr == USER_ADDR_NULL) {
8520 		req->oldidx = (size_t) ((n + n / 8) * item_size);
8521 		goto done;
8522 	}
8523 	if (req->newptr != USER_ADDR_NULL) {
8524 		error = EPERM;
8525 		goto done;
8526 	}
8527 	bzero(&xsg, sizeof(xsg));
8528 	xsg.xg_len = sizeof(xsg);
8529 	xsg.xg_count = n;
8530 	xsg.xg_gen = kevtstat.kes_gencnt;
8531 	xsg.xg_sogen = so_gencnt;
8532 	error = SYSCTL_OUT(req, &xsg, sizeof(xsg));
8533 	if (error) {
8534 		goto done;
8535 	}
8536 	/*
8537 	 * We are done if there is no pcb
8538 	 */
8539 	if (n == 0) {
8540 		goto done;
8541 	}
8542 
8543 	i = 0;
8544 	for (i = 0, ev_pcb = LIST_FIRST(&kern_event_head);
8545 	    i < n && ev_pcb != NULL;
8546 	    i++, ev_pcb = LIST_NEXT(ev_pcb, evp_link)) {
8547 		struct xkevtpcb *xk = (struct xkevtpcb *)buf;
8548 		struct xsocket_n *xso = (struct xsocket_n *)
8549 		    ADVANCE64(xk, sizeof(*xk));
8550 		struct xsockbuf_n *xsbrcv = (struct xsockbuf_n *)
8551 		    ADVANCE64(xso, sizeof(*xso));
8552 		struct xsockbuf_n *xsbsnd = (struct xsockbuf_n *)
8553 		    ADVANCE64(xsbrcv, sizeof(*xsbrcv));
8554 		struct xsockstat_n *xsostats = (struct xsockstat_n *)
8555 		    ADVANCE64(xsbsnd, sizeof(*xsbsnd));
8556 
8557 		bzero(buf, item_size);
8558 
8559 		lck_mtx_lock(&ev_pcb->evp_mtx);
8560 
8561 		xk->kep_len = sizeof(struct xkevtpcb);
8562 		xk->kep_kind = XSO_EVT;
8563 		xk->kep_evtpcb = (uint64_t)VM_KERNEL_ADDRPERM(ev_pcb);
8564 		xk->kep_vendor_code_filter = ev_pcb->evp_vendor_code_filter;
8565 		xk->kep_class_filter = ev_pcb->evp_class_filter;
8566 		xk->kep_subclass_filter = ev_pcb->evp_subclass_filter;
8567 
8568 		sotoxsocket_n(ev_pcb->evp_socket, xso);
8569 		sbtoxsockbuf_n(ev_pcb->evp_socket ?
8570 		    &ev_pcb->evp_socket->so_rcv : NULL, xsbrcv);
8571 		sbtoxsockbuf_n(ev_pcb->evp_socket ?
8572 		    &ev_pcb->evp_socket->so_snd : NULL, xsbsnd);
8573 		sbtoxsockstat_n(ev_pcb->evp_socket, xsostats);
8574 
8575 		lck_mtx_unlock(&ev_pcb->evp_mtx);
8576 
8577 		error = SYSCTL_OUT(req, buf, item_size);
8578 	}
8579 
8580 	if (error == 0) {
8581 		/*
8582 		 * Give the user an updated idea of our state.
8583 		 * If the generation differs from what we told
8584 		 * her before, she knows that something happened
8585 		 * while we were processing this request, and it
8586 		 * might be necessary to retry.
8587 		 */
8588 		bzero(&xsg, sizeof(xsg));
8589 		xsg.xg_len = sizeof(xsg);
8590 		xsg.xg_count = n;
8591 		xsg.xg_gen = kevtstat.kes_gencnt;
8592 		xsg.xg_sogen = so_gencnt;
8593 		error = SYSCTL_OUT(req, &xsg, sizeof(xsg));
8594 		if (error) {
8595 			goto done;
8596 		}
8597 	}
8598 
8599 done:
8600 	lck_rw_done(&kev_rwlock);
8601 
8602 	kfree_data(buf, item_size);
8603 	return error;
8604 }
8605 
8606 #endif /* SOCKETS */
8607 
8608 
/*
 * Fill a struct kqueue_info (proc_info interface) from a kqueue:
 * stat-like fields in kq_stat, plus the KQ_* state bits exported as
 * PROC_KQUEUE_* flags.
 */
int
fill_kqueueinfo(kqueue_t kqu, struct kqueue_info * kinfo)
{
	struct vinfo_stat * st;

	st = &kinfo->kq_stat;

	/* queued event count reported as the "size" of the pseudo-file */
	st->vst_size = kqu.kq->kq_count;
	/* block size mirrors the kevent structure flavor this kq uses */
	if (kqu.kq->kq_state & KQ_KEV_QOS) {
		st->vst_blksize = sizeof(struct kevent_qos_s);
	} else if (kqu.kq->kq_state & KQ_KEV64) {
		st->vst_blksize = sizeof(struct kevent64_s);
	} else {
		st->vst_blksize = sizeof(struct kevent);
	}
	st->vst_mode = S_IFIFO;
	/* dynamic (workloop) kqueues expose their dynamic id as the inode */
	st->vst_ino = (kqu.kq->kq_state & KQ_DYNAMIC) ?
	    kqu.kqwl->kqwl_dynamicid : 0;

	/* flags exported to libproc as PROC_KQUEUE_* (sys/proc_info.h) */
#define PROC_KQUEUE_MASK (KQ_SLEEP|KQ_KEV32|KQ_KEV64|KQ_KEV_QOS|KQ_WORKQ|KQ_WORKLOOP)
	static_assert(PROC_KQUEUE_SLEEP == KQ_SLEEP);
	static_assert(PROC_KQUEUE_32 == KQ_KEV32);
	static_assert(PROC_KQUEUE_64 == KQ_KEV64);
	static_assert(PROC_KQUEUE_QOS == KQ_KEV_QOS);
	static_assert(PROC_KQUEUE_WORKQ == KQ_WORKQ);
	static_assert(PROC_KQUEUE_WORKLOOP == KQ_WORKLOOP);
	kinfo->kq_state = kqu.kq->kq_state & PROC_KQUEUE_MASK;
	if ((kqu.kq->kq_state & (KQ_WORKLOOP | KQ_WORKQ)) == 0) {
		/* only kqfiles participate in select(); report recorded waiters */
		if (kqu.kqf->kqf_sel.si_flags & SI_RECORDED) {
			kinfo->kq_state |= PROC_KQUEUE_SELECT;
		}
	}

	return 0;
}
8645 
/*
 * Fill a struct kqueue_dyninfo for a workloop kqueue: the generic
 * kqueue_info plus servicer/owner identities, thread request state, and
 * the workloop's creation-time scheduling parameters (priority, policy,
 * cpu percent) when present.  Returns EINVAL for non-workloop kqueues.
 */
static int
fill_kqueue_dyninfo(struct kqworkloop *kqwl, struct kqueue_dyninfo *kqdi)
{
	workq_threadreq_t kqr = &kqwl->kqwl_request;
	workq_threadreq_param_t trp = {};
	int err;

	if ((kqwl->kqwl_state & KQ_WORKLOOP) == 0) {
		return EINVAL;
	}

	if ((err = fill_kqueueinfo(&kqwl->kqwl_kqueue, &kqdi->kqdi_info))) {
		return err;
	}

	/* the remaining fields are read under the kq lock for consistency */
	kqlock(kqwl);

	kqdi->kqdi_servicer = thread_tid(kqr_thread(kqr));
	kqdi->kqdi_owner = thread_tid(kqwl->kqwl_owner);
	kqdi->kqdi_request_state = kqr->tr_state;
	kqdi->kqdi_async_qos = kqr->tr_kq_qos_index;
	kqdi->kqdi_events_qos = kqr->tr_kq_override_index;
	kqdi->kqdi_sync_waiters = 0;
	kqdi->kqdi_sync_waiter_qos = 0;

	/* report optional creation parameters, zero when absent */
	trp.trp_value = kqwl->kqwl_params;
	if (trp.trp_flags & TRP_PRIORITY) {
		kqdi->kqdi_pri = trp.trp_pri;
	} else {
		kqdi->kqdi_pri = 0;
	}

	if (trp.trp_flags & TRP_POLICY) {
		kqdi->kqdi_pol = trp.trp_pol;
	} else {
		kqdi->kqdi_pol = 0;
	}

	if (trp.trp_flags & TRP_CPUPERCENT) {
		kqdi->kqdi_cpupercent = trp.trp_cpupercent;
	} else {
		kqdi->kqdi_cpupercent = 0;
	}

	kqunlock(kqwl);

	return 0;
}
8694 
8695 
8696 static unsigned long
kevent_extinfo_emit(struct kqueue * kq,struct knote * kn,struct kevent_extinfo * buf,unsigned long buflen,unsigned long nknotes)8697 kevent_extinfo_emit(struct kqueue *kq, struct knote *kn, struct kevent_extinfo *buf,
8698     unsigned long buflen, unsigned long nknotes)
8699 {
8700 	for (; kn; kn = SLIST_NEXT(kn, kn_link)) {
8701 		if (kq == knote_get_kq(kn)) {
8702 			if (nknotes < buflen) {
8703 				struct kevent_extinfo *info = &buf[nknotes];
8704 
8705 				kqlock(kq);
8706 
8707 				if (knote_fops(kn)->f_sanitized_copyout) {
8708 					knote_fops(kn)->f_sanitized_copyout(kn, &info->kqext_kev);
8709 				} else {
8710 					info->kqext_kev         = *(struct kevent_qos_s *)&kn->kn_kevent;
8711 				}
8712 
8713 				if (knote_has_qos(kn)) {
8714 					info->kqext_kev.qos =
8715 					    _pthread_priority_thread_qos_fast(kn->kn_qos);
8716 				} else {
8717 					info->kqext_kev.qos = kn->kn_qos_override;
8718 				}
8719 				info->kqext_kev.filter |= 0xff00; /* sign extend filter */
8720 				info->kqext_kev.xflags  = 0; /* this is where sfflags lives */
8721 				info->kqext_kev.data    = 0; /* this is where sdata lives */
8722 				info->kqext_sdata       = kn->kn_sdata;
8723 				info->kqext_status      = kn->kn_status;
8724 				info->kqext_sfflags     = kn->kn_sfflags;
8725 
8726 				kqunlock(kq);
8727 			}
8728 
8729 			/* we return total number of knotes, which may be more than requested */
8730 			nknotes++;
8731 		}
8732 	}
8733 
8734 	return nknotes;
8735 }
8736 
8737 int
kevent_copyout_proc_dynkqids(void * proc,user_addr_t ubuf,uint32_t ubufsize,int32_t * nkqueues_out)8738 kevent_copyout_proc_dynkqids(void *proc, user_addr_t ubuf, uint32_t ubufsize,
8739     int32_t *nkqueues_out)
8740 {
8741 	proc_t p = (proc_t)proc;
8742 	struct filedesc *fdp = &p->p_fd;
8743 	unsigned int nkqueues = 0;
8744 	unsigned long ubuflen = ubufsize / sizeof(kqueue_id_t);
8745 	size_t buflen, bufsize;
8746 	kqueue_id_t *kq_ids = NULL;
8747 	int err = 0;
8748 
8749 	assert(p != NULL);
8750 
8751 	if (ubuf == USER_ADDR_NULL && ubufsize != 0) {
8752 		err = EINVAL;
8753 		goto out;
8754 	}
8755 
8756 	buflen = MIN(ubuflen, PROC_PIDDYNKQUEUES_MAX);
8757 
8758 	if (ubuflen != 0) {
8759 		if (os_mul_overflow(sizeof(kqueue_id_t), buflen, &bufsize)) {
8760 			err = ERANGE;
8761 			goto out;
8762 		}
8763 		kq_ids = (kqueue_id_t *)kalloc_data(bufsize, Z_WAITOK | Z_ZERO);
8764 		if (!kq_ids) {
8765 			err = ENOMEM;
8766 			goto out;
8767 		}
8768 	}
8769 
8770 	kqhash_lock(fdp);
8771 
8772 	if (fdp->fd_kqhashmask > 0) {
8773 		for (uint32_t i = 0; i < fdp->fd_kqhashmask + 1; i++) {
8774 			struct kqworkloop *kqwl;
8775 
8776 			LIST_FOREACH(kqwl, &fdp->fd_kqhash[i], kqwl_hashlink) {
8777 				/* report the number of kqueues, even if they don't all fit */
8778 				if (nkqueues < buflen) {
8779 					kq_ids[nkqueues] = kqwl->kqwl_dynamicid;
8780 				}
8781 				nkqueues++;
8782 			}
8783 		}
8784 	}
8785 
8786 	kqhash_unlock(fdp);
8787 
8788 	if (kq_ids) {
8789 		size_t copysize;
8790 		if (os_mul_overflow(sizeof(kqueue_id_t), MIN(buflen, nkqueues), &copysize)) {
8791 			err = ERANGE;
8792 			goto out;
8793 		}
8794 
8795 		assert(ubufsize >= copysize);
8796 		err = copyout(kq_ids, ubuf, copysize);
8797 	}
8798 
8799 out:
8800 	if (kq_ids) {
8801 		kfree_data(kq_ids, bufsize);
8802 	}
8803 
8804 	if (!err) {
8805 		*nkqueues_out = (int)min(nkqueues, PROC_PIDDYNKQUEUES_MAX);
8806 	}
8807 	return err;
8808 }
8809 
8810 int
kevent_copyout_dynkqinfo(void * proc,kqueue_id_t kq_id,user_addr_t ubuf,uint32_t ubufsize,int32_t * size_out)8811 kevent_copyout_dynkqinfo(void *proc, kqueue_id_t kq_id, user_addr_t ubuf,
8812     uint32_t ubufsize, int32_t *size_out)
8813 {
8814 	proc_t p = (proc_t)proc;
8815 	struct kqworkloop *kqwl;
8816 	int err = 0;
8817 	struct kqueue_dyninfo kqdi = { };
8818 
8819 	assert(p != NULL);
8820 
8821 	if (ubufsize < sizeof(struct kqueue_info)) {
8822 		return ENOBUFS;
8823 	}
8824 
8825 	kqwl = kqworkloop_hash_lookup_and_retain(&p->p_fd, kq_id);
8826 	if (!kqwl) {
8827 		return ESRCH;
8828 	}
8829 
8830 	/*
8831 	 * backward compatibility: allow the argument to this call to only be
8832 	 * a struct kqueue_info
8833 	 */
8834 	if (ubufsize >= sizeof(struct kqueue_dyninfo)) {
8835 		ubufsize = sizeof(struct kqueue_dyninfo);
8836 		err = fill_kqueue_dyninfo(kqwl, &kqdi);
8837 	} else {
8838 		ubufsize = sizeof(struct kqueue_info);
8839 		err = fill_kqueueinfo(&kqwl->kqwl_kqueue, &kqdi.kqdi_info);
8840 	}
8841 	if (err == 0 && (err = copyout(&kqdi, ubuf, ubufsize)) == 0) {
8842 		*size_out = ubufsize;
8843 	}
8844 	kqworkloop_release(kqwl);
8845 	return err;
8846 }
8847 
8848 int
kevent_copyout_dynkqextinfo(void * proc,kqueue_id_t kq_id,user_addr_t ubuf,uint32_t ubufsize,int32_t * nknotes_out)8849 kevent_copyout_dynkqextinfo(void *proc, kqueue_id_t kq_id, user_addr_t ubuf,
8850     uint32_t ubufsize, int32_t *nknotes_out)
8851 {
8852 	proc_t p = (proc_t)proc;
8853 	struct kqworkloop *kqwl;
8854 	int err;
8855 
8856 	kqwl = kqworkloop_hash_lookup_and_retain(&p->p_fd, kq_id);
8857 	if (!kqwl) {
8858 		return ESRCH;
8859 	}
8860 
8861 	err = pid_kqueue_extinfo(p, &kqwl->kqwl_kqueue, ubuf, ubufsize, nknotes_out);
8862 	kqworkloop_release(kqwl);
8863 	return err;
8864 }
8865 
/*
 * Copy out a kevent_extinfo record for every knote of `kq` found in the
 * process' fd-indexed knote lists and knote hash.  *retval receives the
 * total number of matching knotes (capped at PROC_PIDFDKQUEUE_KNOTES_MAX),
 * which may exceed what fit in the user buffer.
 */
int
pid_kqueue_extinfo(proc_t p, struct kqueue *kq, user_addr_t ubuf,
    uint32_t bufsize, int32_t *retval)
{
	struct knote *kn;
	int i;
	int err = 0;
	struct filedesc *fdp = &p->p_fd;
	unsigned long nknotes = 0;
	unsigned long buflen = bufsize / sizeof(struct kevent_extinfo);
	struct kevent_extinfo *kqext = NULL;

	/* arbitrary upper limit to cap kernel memory usage, copyout size, etc. */
	buflen = MIN(buflen, PROC_PIDFDKQUEUE_KNOTES_MAX);

	kqext = (struct kevent_extinfo *)kalloc_data(buflen * sizeof(struct kevent_extinfo), Z_WAITOK | Z_ZERO);
	if (kqext == NULL) {
		err = ENOMEM;
		goto out;
	}

	/* fd-attached knotes are protected by the proc fd lock */
	proc_fdlock(p);
	for (i = 0; i < fdp->fd_knlistsize; i++) {
		kn = SLIST_FIRST(&fdp->fd_knlist[i]);
		nknotes = kevent_extinfo_emit(kq, kn, kqext, buflen, nknotes);
	}
	proc_fdunlock(p);

	/*
	 * Hashed (non-fd) knotes: the hash lock is taken per bucket.
	 * NOTE(review): fd_knhashmask is read here without the knhash lock —
	 * presumably stable for the life of the process once set; confirm.
	 */
	if (fdp->fd_knhashmask != 0) {
		for (i = 0; i < (int)fdp->fd_knhashmask + 1; i++) {
			knhash_lock(fdp);
			kn = SLIST_FIRST(&fdp->fd_knhash[i]);
			nknotes = kevent_extinfo_emit(kq, kn, kqext, buflen, nknotes);
			knhash_unlock(fdp);
		}
	}

	/* copy out only as many records as were actually emitted into kqext */
	assert(bufsize >= sizeof(struct kevent_extinfo) * MIN(buflen, nknotes));
	err = copyout(kqext, ubuf, sizeof(struct kevent_extinfo) * MIN(buflen, nknotes));

out:
	kfree_data(kqext, buflen * sizeof(struct kevent_extinfo));

	if (!err) {
		*retval = (int32_t)MIN(nknotes, PROC_PIDFDKQUEUE_KNOTES_MAX);
	}
	return err;
}
8914 
8915 static unsigned int
klist_copy_udata(struct klist * list,uint64_t * buf,unsigned int buflen,unsigned int nknotes)8916 klist_copy_udata(struct klist *list, uint64_t *buf,
8917     unsigned int buflen, unsigned int nknotes)
8918 {
8919 	struct knote *kn;
8920 	SLIST_FOREACH(kn, list, kn_link) {
8921 		if (nknotes < buflen) {
8922 			/*
8923 			 * kevent_register will always set kn_udata atomically
8924 			 * so that we don't have to take any kqlock here.
8925 			 */
8926 			buf[nknotes] = os_atomic_load_wide(&kn->kn_udata, relaxed);
8927 		}
8928 		/* we return total number of knotes, which may be more than requested */
8929 		nknotes++;
8930 	}
8931 
8932 	return nknotes;
8933 }
8934 
8935 int
kevent_proc_copy_uptrs(void * proc,uint64_t * buf,uint32_t bufsize)8936 kevent_proc_copy_uptrs(void *proc, uint64_t *buf, uint32_t bufsize)
8937 {
8938 	proc_t p = (proc_t)proc;
8939 	struct filedesc *fdp = &p->p_fd;
8940 	unsigned int nuptrs = 0;
8941 	unsigned int buflen = bufsize / sizeof(uint64_t);
8942 	struct kqworkloop *kqwl;
8943 
8944 	if (buflen > 0) {
8945 		assert(buf != NULL);
8946 	}
8947 
8948 	proc_fdlock(p);
8949 	for (int i = 0; i < fdp->fd_knlistsize; i++) {
8950 		nuptrs = klist_copy_udata(&fdp->fd_knlist[i], buf, buflen, nuptrs);
8951 	}
8952 	proc_fdunlock(p);
8953 
8954 	knhash_lock(fdp);
8955 	if (fdp->fd_knhashmask != 0) {
8956 		for (size_t i = 0; i < fdp->fd_knhashmask + 1; i++) {
8957 			nuptrs = klist_copy_udata(&fdp->fd_knhash[i], buf, buflen, nuptrs);
8958 		}
8959 	}
8960 	knhash_unlock(fdp);
8961 
8962 	kqhash_lock(fdp);
8963 	if (fdp->fd_kqhashmask != 0) {
8964 		for (size_t i = 0; i < fdp->fd_kqhashmask + 1; i++) {
8965 			LIST_FOREACH(kqwl, &fdp->fd_kqhash[i], kqwl_hashlink) {
8966 				if (nuptrs < buflen) {
8967 					buf[nuptrs] = kqwl->kqwl_dynamicid;
8968 				}
8969 				nuptrs++;
8970 			}
8971 		}
8972 	}
8973 	kqhash_unlock(fdp);
8974 
8975 	return (int)nuptrs;
8976 }
8977 
8978 static void
kevent_set_return_to_kernel_user_tsd(proc_t p,thread_t thread)8979 kevent_set_return_to_kernel_user_tsd(proc_t p, thread_t thread)
8980 {
8981 	uint64_t ast_addr;
8982 	bool proc_is_64bit = !!(p->p_flag & P_LP64);
8983 	size_t user_addr_size = proc_is_64bit ? 8 : 4;
8984 	uint32_t ast_flags32 = 0;
8985 	uint64_t ast_flags64 = 0;
8986 	struct uthread *ut = get_bsdthread_info(thread);
8987 
8988 	if (ut->uu_kqr_bound != NULL) {
8989 		ast_flags64 |= R2K_WORKLOOP_PENDING_EVENTS;
8990 	}
8991 
8992 	if (ast_flags64 == 0) {
8993 		return;
8994 	}
8995 
8996 	if (!(p->p_flag & P_LP64)) {
8997 		ast_flags32 = (uint32_t)ast_flags64;
8998 		assert(ast_flags64 < 0x100000000ull);
8999 	}
9000 
9001 	ast_addr = thread_rettokern_addr(thread);
9002 	if (ast_addr == 0) {
9003 		return;
9004 	}
9005 
9006 	if (copyout((proc_is_64bit ? (void *)&ast_flags64 : (void *)&ast_flags32),
9007 	    (user_addr_t)ast_addr,
9008 	    user_addr_size) != 0) {
9009 		printf("pid %d (tid:%llu): copyout of return_to_kernel ast flags failed with "
9010 		    "ast_addr = %llu\n", proc_getpid(p), thread_tid(current_thread()), ast_addr);
9011 	}
9012 }
9013 
9014 /*
9015  * Semantics of writing to TSD value:
9016  *
9017  * 1. It is written to by the kernel and cleared by userspace.
9018  * 2. When the userspace code clears the TSD field, it takes responsibility for
9019  * taking action on the quantum expiry action conveyed by kernel.
9020  * 3. The TSD value is always cleared upon entry into userspace and upon exit of
9021  * userspace back to kernel to make sure that it is never leaked across thread
9022  * requests.
9023  */
9024 void
kevent_set_workq_quantum_expiry_user_tsd(proc_t p,thread_t thread,uint64_t flags)9025 kevent_set_workq_quantum_expiry_user_tsd(proc_t p, thread_t thread,
9026     uint64_t flags)
9027 {
9028 	uint64_t ast_addr;
9029 	bool proc_is_64bit = !!(p->p_flag & P_LP64);
9030 	uint32_t ast_flags32 = 0;
9031 	uint64_t ast_flags64 = flags;
9032 
9033 	if (ast_flags64 == 0) {
9034 		return;
9035 	}
9036 
9037 	if (!(p->p_flag & P_LP64)) {
9038 		ast_flags32 = (uint32_t)ast_flags64;
9039 		assert(ast_flags64 < 0x100000000ull);
9040 	}
9041 
9042 	ast_addr = thread_wqquantum_addr(thread);
9043 	assert(ast_addr != 0);
9044 
9045 	if (proc_is_64bit) {
9046 		if (copyout_atomic64(ast_flags64, (user_addr_t) ast_addr)) {
9047 #if DEBUG || DEVELOPMENT
9048 			printf("pid %d (tid:%llu): copyout of workq quantum ast flags failed with "
9049 			    "ast_addr = %llu\n", proc_getpid(p), thread_tid(thread), ast_addr);
9050 #endif
9051 		}
9052 	} else {
9053 		if (copyout_atomic32(ast_flags32, (user_addr_t) ast_addr)) {
9054 #if DEBUG || DEVELOPMENT
9055 			printf("pid %d (tid:%llu): copyout of workq quantum ast flags failed with "
9056 			    "ast_addr = %llu\n", proc_getpid(p), thread_tid(thread), ast_addr);
9057 #endif
9058 		}
9059 	}
9060 }
9061 
/*
 * Handle kevent-related AST bits for the current thread on its way back to
 * userspace: redrive pending workqueue thread requests, publish the
 * return-to-kernel TSD flags, and re-evaluate an expired workq quantum.
 */
void
kevent_ast(thread_t thread, uint16_t bits)
{
	proc_t p = current_proc();


	if (bits & AST_KEVENT_REDRIVE_THREADREQ) {
		/* ask the workq subsystem to (re)issue thread requests */
		workq_kern_threadreq_redrive(p, WORKQ_THREADREQ_CAN_CREATE_THREADS);
	}
	if (bits & AST_KEVENT_RETURN_TO_KERNEL) {
		kevent_set_return_to_kernel_user_tsd(p, thread);
	}

	if (bits & AST_KEVENT_WORKQ_QUANTUM_EXPIRED) {
		workq_kern_quantum_expiry_reevaluate(p, thread);
	}
}
9079 
9080 #if DEVELOPMENT || DEBUG
9081 
9082 #define KEVENT_SYSCTL_BOUND_ID 1
9083 
9084 static int
9085 kevent_sysctl SYSCTL_HANDLER_ARGS
9086 {
9087 #pragma unused(oidp, arg2)
9088 	uintptr_t type = (uintptr_t)arg1;
9089 	uint64_t bound_id = 0;
9090 
9091 	if (type != KEVENT_SYSCTL_BOUND_ID) {
9092 		return EINVAL;
9093 	}
9094 
9095 	if (req->newptr) {
9096 		return EINVAL;
9097 	}
9098 
9099 	struct uthread *ut = current_uthread();
9100 	if (!ut) {
9101 		return EFAULT;
9102 	}
9103 
9104 	workq_threadreq_t kqr = ut->uu_kqr_bound;
9105 	if (kqr) {
9106 		if (kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
9107 			bound_id = kqr_kqworkloop(kqr)->kqwl_dynamicid;
9108 		} else {
9109 			bound_id = -1;
9110 		}
9111 	}
9112 
9113 	return sysctl_io_number(req, bound_id, sizeof(bound_id), NULL, NULL);
9114 }
9115 
/* kern.kevent sysctl subtree (DEVELOPMENT/DEBUG builds only) */
SYSCTL_NODE(_kern, OID_AUTO, kevent, CTLFLAG_RW | CTLFLAG_LOCKED, 0,
    "kevent information");

/* kern.kevent.bound_id: read-only quad served by kevent_sysctl above */
SYSCTL_PROC(_kern_kevent, OID_AUTO, bound_id,
    CTLTYPE_QUAD | CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_MASKED,
    (void *)KEVENT_SYSCTL_BOUND_ID,
    sizeof(kqueue_id_t), kevent_sysctl, "Q",
    "get the ID of the bound kqueue");
9124 
9125 #endif /* DEVELOPMENT || DEBUG */
9126