xref: /xnu-12377.41.6/bsd/kern/kern_event.c (revision bbb1b6f9e71b8cdde6e5cd6f4841f207dee3d828)
1 /*
2  * Copyright (c) 2000-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  *
28  */
29 /*-
30  * Copyright (c) 1999,2000,2001 Jonathan Lemon <[email protected]>
31  * All rights reserved.
32  *
33  * Redistribution and use in source and binary forms, with or without
34  * modification, are permitted provided that the following conditions
35  * are met:
36  * 1. Redistributions of source code must retain the above copyright
37  *    notice, this list of conditions and the following disclaimer.
38  * 2. Redistributions in binary form must reproduce the above copyright
39  *    notice, this list of conditions and the following disclaimer in the
40  *    documentation and/or other materials provided with the distribution.
41  *
42  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
43  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
46  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52  * SUCH DAMAGE.
53  */
54 /*
55  *	@(#)kern_event.c       1.0 (3/31/2000)
56  */
57 #include <stdint.h>
58 #include <machine/atomic.h>
59 
60 #include <sys/param.h>
61 #include <sys/systm.h>
62 #include <sys/filedesc.h>
63 #include <sys/kernel.h>
64 #include <sys/proc_internal.h>
65 #include <sys/kauth.h>
66 #include <sys/malloc.h>
67 #include <sys/unistd.h>
68 #include <sys/file_internal.h>
69 #include <sys/fcntl.h>
70 #include <sys/select.h>
71 #include <sys/queue.h>
72 #include <sys/event.h>
73 #include <sys/eventvar.h>
74 #include <sys/protosw.h>
75 #include <sys/socket.h>
76 #include <sys/socketvar.h>
77 #include <sys/stat.h>
78 #include <sys/syscall.h> // SYS_* constants
79 #include <sys/sysctl.h>
80 #include <sys/uio.h>
81 #include <sys/sysproto.h>
82 #include <sys/user.h>
83 #include <sys/vnode_internal.h>
84 #include <string.h>
85 #include <sys/proc_info.h>
86 #include <sys/codesign.h>
87 #include <sys/pthread_shims.h>
88 #include <sys/kdebug.h>
89 #include <os/base.h>
90 #include <pexpert/pexpert.h>
91 
92 #include <kern/thread_group.h>
93 #include <kern/locks.h>
94 #include <kern/clock.h>
95 #include <kern/cpu_data.h>
96 #include <kern/policy_internal.h>
97 #include <kern/thread_call.h>
98 #include <kern/sched_prim.h>
99 #include <kern/waitq.h>
100 #include <kern/zalloc.h>
101 #include <kern/kalloc.h>
102 #include <kern/assert.h>
103 #include <kern/ast.h>
104 #include <kern/thread.h>
105 #include <kern/kcdata.h>
106 #include <kern/work_interval.h>
107 
108 #include <pthread/priority_private.h>
109 #include <pthread/workqueue_syscalls.h>
110 #include <pthread/workqueue_internal.h>
111 #include <libkern/libkern.h>
112 
113 #include <os/log.h>
114 
115 #include "mach/kern_return.h"
116 #include "net/net_str_id.h"
117 
118 #if SKYWALK && defined(XNU_TARGET_OS_OSX)
119 #include <skywalk/lib/net_filter_event.h>
120 
121 extern bool net_check_compatible_alf(void);
122 #endif /* SKYWALK && XNU_TARGET_OS_OSX */
123 
124 #include <mach/task.h>
125 #include <libkern/section_keywords.h>
126 
127 #if CONFIG_MEMORYSTATUS
128 #include <sys/kern_memorystatus.h>
129 #endif
130 
131 #if DEVELOPMENT || DEBUG
132 #define KEVENT_PANIC_ON_WORKLOOP_OWNERSHIP_LEAK  (1U << 0)
133 #define KEVENT_PANIC_ON_NON_ENQUEUED_PROCESS     (1U << 1)
134 TUNABLE(uint32_t, kevent_debug_flags, "kevent_debug", 0);
135 #endif
136 
137 /* Enable bound thread support for kqworkloop. */
138 static TUNABLE(int, bootarg_thread_bound_kqwl_support_enabled,
139     "enable_thread_bound_kqwl_support", 0);
140 SYSCTL_NODE(_kern, OID_AUTO, kern_event, CTLFLAG_RD | CTLFLAG_LOCKED, 0, NULL);
141 SYSCTL_INT(_kern_kern_event, OID_AUTO, thread_bound_kqwl_support_enabled,
142     CTLFLAG_RD | CTLFLAG_LOCKED,
143     &bootarg_thread_bound_kqwl_support_enabled, 0,
144     "Whether thread bound kqwl support is enabled");
145 
146 static LCK_GRP_DECLARE(kq_lck_grp, "kqueue");
147 SECURITY_READ_ONLY_EARLY(vm_packing_params_t) kn_kq_packing_params =
148     VM_PACKING_PARAMS(KNOTE_KQ_PACKED);
149 
150 extern mach_port_name_t ipc_entry_name_mask(mach_port_name_t name); /* osfmk/ipc/ipc_entry.h */
151 extern bool cansignal(struct proc *, kauth_cred_t, struct proc *, int); /* bsd/kern/kern_sig.c */
152 
153 #define KEV_EVTID(code) BSDDBG_CODE(DBG_BSD_KEVENT, (code))
154 
155 static int kqueue_select(struct fileproc *fp, int which, void *wq_link_id,
156     vfs_context_t ctx);
157 static int kqueue_close(struct fileglob *fg, vfs_context_t ctx);
158 static int kqueue_kqfilter(struct fileproc *fp, struct knote *kn,
159     struct kevent_qos_s *kev);
160 static int kqueue_drain(struct fileproc *fp, vfs_context_t ctx);
161 
/*
 * File operations vector for kqueue file descriptors.
 *
 * kqueues cannot be read, written, or ioctl'd directly (fo_no_* stubs);
 * they support select(), close, drain, and can themselves be monitored by
 * another kqueue via kqueue_kqfilter (EVFILT_READ on a kqueue fd).
 */
static const struct fileops kqueueops = {
	.fo_type     = DTYPE_KQUEUE,
	.fo_read     = fo_no_read,
	.fo_write    = fo_no_write,
	.fo_ioctl    = fo_no_ioctl,
	.fo_select   = kqueue_select,
	.fo_close    = kqueue_close,
	.fo_drain    = kqueue_drain,
	.fo_kqfilter = kqueue_kqfilter,
};
172 
173 static inline int kevent_modern_copyout(struct kevent_qos_s *, user_addr_t *);
174 static int kevent_register_wait_prepare(struct knote *kn, struct kevent_qos_s *kev, int result);
175 static void kevent_register_wait_block(struct turnstile *ts, thread_t handoff_thread,
176     thread_continue_t cont, struct _kevent_register *cont_args) __dead2;
177 static void kevent_register_wait_return(struct _kevent_register *cont_args) __dead2;
178 static void kevent_register_wait_cleanup(struct knote *kn);
179 
180 static struct kqtailq *kqueue_get_suppressed_queue(kqueue_t kq, struct knote *kn);
181 static void kqueue_threadreq_initiate(struct kqueue *kq, workq_threadreq_t, kq_index_t qos, int flags);
182 
183 static void kqworkq_unbind(proc_t p, workq_threadreq_t);
184 static thread_qos_t kqworkq_unbind_locked(struct kqworkq *kqwq, workq_threadreq_t, thread_t thread);
185 static workq_threadreq_t kqworkq_get_request(struct kqworkq *kqwq, kq_index_t qos_index);
186 static void kqueue_update_iotier_override(kqueue_t kqu);
187 
188 static void kqworkloop_unbind(struct kqworkloop *kqwl);
189 
/*
 * Controls how kqworkloop_unbind_locked() disposes of the unbinding
 * thread's QoS override.
 *
 * NOTE(review): semantics inferred from the kqworkloop_unbind_locked()/
 * kqworkloop_unbind_delayed_override_drop() prototypes above — confirm
 * against their definitions.
 */
enum kqwl_unbind_locked_mode {
	KQWL_OVERRIDE_DROP_IMMEDIATELY, /* drop the override during the unbind */
	KQWL_OVERRIDE_DROP_DELAYED,     /* presumably deferred to kqworkloop_unbind_delayed_override_drop() */
};
194 // The soft unbinding of kqworkloop only applies to kqwls configured
195 // with a permanently bound thread.
196 #define KQUEUE_THREADREQ_UNBIND_SOFT 0x1
197 static void kqworkloop_unbind_locked(struct kqworkloop *kqwl, thread_t thread,
198     enum kqwl_unbind_locked_mode how, unsigned int flags);
199 static void kqworkloop_unbind_delayed_override_drop(thread_t thread);
200 static kq_index_t kqworkloop_override(struct kqworkloop *kqwl);
201 static void kqworkloop_set_overcommit(struct kqworkloop *kqwl);
202 static void kqworkloop_bound_thread_park(struct kqworkloop *kqwl, thread_t thread);
203 static void kqworkloop_bound_thread_wakeup(struct kqworkloop *kqwl);
204 
/*
 * Operation codes for kqworkloop_update_threads_qos() (its 'op' argument),
 * describing which aspect of a workloop's QoS state to update.
 */
enum {
	KQWL_UTQ_NONE,
	/*
	 * The wakeup qos is the qos of QUEUED knotes.
	 *
	 * This QoS is accounted for with the events override in the
	 * kqr_override_index field. It is raised each time a new knote is queued at
	 * a given QoS. The kqwl_wakeup_qos field is a superset of the non empty
	 * knote buckets and is recomputed after each event delivery.
	 */
	KQWL_UTQ_UPDATE_WAKEUP_QOS,
	KQWL_UTQ_RECOMPUTE_WAKEUP_QOS,
	KQWL_UTQ_UNBINDING, /* attempt to rebind */
	KQWL_UTQ_PARKING,
	/*
	 * The wakeup override is for suppressed knotes that have fired again at
	 * a higher QoS than the one for which they are suppressed already.
	 * This override is cleared when the knote suppressed list becomes empty.
	 */
	KQWL_UTQ_UPDATE_WAKEUP_OVERRIDE,
	KQWL_UTQ_RESET_WAKEUP_OVERRIDE,
	/*
	 * The QoS is the maximum QoS of an event enqueued on this workloop in
	 * userland. It is copied from the only EVFILT_WORKLOOP knote with
	 * a NOTE_WL_THREAD_REQUEST bit set allowed on this workloop. If there is no
	 * such knote, this QoS is 0.
	 */
	KQWL_UTQ_SET_QOS_INDEX,
	KQWL_UTQ_REDRIVE_EVENTS,
};
235 static void kqworkloop_update_threads_qos(struct kqworkloop *kqwl, int op, kq_index_t qos);
236 static int kqworkloop_end_processing(struct kqworkloop *kqwl, int flags, int kevent_flags);
237 
238 static struct knote *knote_alloc(void);
239 static void knote_free(struct knote *kn);
240 static int kq_add_knote(struct kqueue *kq, struct knote *kn,
241     struct knote_lock_ctx *knlc, struct proc *p);
242 static struct knote *kq_find_knote_and_kq_lock(struct kqueue *kq,
243     struct kevent_qos_s *kev, bool is_fd, struct proc *p);
244 
245 static void knote_activate(kqueue_t kqu, struct knote *kn, int result);
246 static void knote_dequeue(kqueue_t kqu, struct knote *kn);
247 
248 static void knote_apply_touch(kqueue_t kqu, struct knote *kn,
249     struct kevent_qos_s *kev, int result);
250 static void knote_suppress(kqueue_t kqu, struct knote *kn);
251 static void knote_unsuppress(kqueue_t kqu, struct knote *kn);
252 static void knote_drop(kqueue_t kqu, struct knote *kn, struct knote_lock_ctx *knlc);
253 
254 // both these functions may dequeue the knote and it is up to the caller
255 // to enqueue the knote back
256 static void knote_adjust_qos(struct kqueue *kq, struct knote *kn, int result);
257 static void knote_reset_priority(kqueue_t kqu, struct knote *kn, pthread_priority_t pp);
258 
259 static ZONE_DEFINE(knote_zone, "knote zone",
260     sizeof(struct knote), ZC_CACHING | ZC_ZFREE_CLEARMEM);
261 static ZONE_DEFINE(kqfile_zone, "kqueue file zone",
262     sizeof(struct kqfile), ZC_ZFREE_CLEARMEM);
263 static ZONE_DEFINE(kqworkq_zone, "kqueue workq zone",
264     sizeof(struct kqworkq), ZC_ZFREE_CLEARMEM);
265 static ZONE_DEFINE(kqworkloop_zone, "kqueue workloop zone",
266     sizeof(struct kqworkloop), ZC_CACHING | ZC_ZFREE_CLEARMEM);
267 
268 #define KN_HASH(val, mask)      (((val) ^ (val >> 8)) & (mask))
269 
270 static int filt_no_attach(struct knote *kn, struct kevent_qos_s *kev);
271 static void filt_no_detach(struct knote *kn);
272 static int filt_bad_event(struct knote *kn, long hint);
273 static int filt_bad_touch(struct knote *kn, struct kevent_qos_s *kev);
274 static int filt_bad_process(struct knote *kn, struct kevent_qos_s *kev);
275 
/*
 * Filter operations installed in sysfilt_ops slots that are invalid or
 * compiled out.  Backed by the filt_no_* / filt_bad_* stubs declared above
 * so that attaching fails cleanly and stray event/touch/process calls on
 * such a slot are caught.
 */
SECURITY_READ_ONLY_EARLY(static struct filterops) bad_filtops = {
	.f_attach  = filt_no_attach,
	.f_detach  = filt_no_detach,
	.f_event   = filt_bad_event,
	.f_touch   = filt_bad_touch,
	.f_process = filt_bad_process,
};
283 
284 #if CONFIG_MEMORYSTATUS
285 extern const struct filterops memorystatus_filtops;
286 #endif /* CONFIG_MEMORYSTATUS */
287 extern const struct filterops fs_filtops;
288 extern const struct filterops sig_filtops;
289 extern const struct filterops machport_attach_filtops;
290 extern const struct filterops mach_port_filtops;
291 extern const struct filterops mach_port_set_filtops;
292 extern const struct filterops pipe_nfiltops;
293 extern const struct filterops pipe_rfiltops;
294 extern const struct filterops pipe_wfiltops;
295 extern const struct filterops ptsd_kqops;
296 extern const struct filterops ptmx_kqops;
297 extern const struct filterops soread_filtops;
298 extern const struct filterops sowrite_filtops;
299 extern const struct filterops sock_filtops;
300 extern const struct filterops soexcept_filtops;
301 extern const struct filterops spec_filtops;
302 extern const struct filterops bpfread_filtops;
303 extern const struct filterops necp_fd_rfiltops;
304 #if SKYWALK
305 extern const struct filterops skywalk_channel_rfiltops;
306 extern const struct filterops skywalk_channel_wfiltops;
307 extern const struct filterops skywalk_channel_efiltops;
308 #endif /* SKYWALK */
309 extern const struct filterops fsevent_filtops;
310 extern const struct filterops vnode_filtops;
311 extern const struct filterops tty_filtops;
312 
313 __security_const_early static struct filterops file_filtops;
314 __security_const_early static struct filterops kqread_filtops;
315 __security_const_early static struct filterops proc_filtops;
316 __security_const_early static struct filterops timer_filtops;
317 __security_const_early static struct filterops user_filtops;
318 __security_const_early static struct filterops workloop_filtops;
319 #if CONFIG_EXCLAVES
320 extern const struct filterops exclaves_notification_filtops;
321 #endif /* CONFIG_EXCLAVES */
322 extern const struct filterops aio_filtops;
323 
324 /*
325  *
326  * Rules for adding new filters to the system:
327  * Public filters:
328  * - Add a new "EVFILT_" option value to bsd/sys/event.h (typically a negative value)
329  *   in the exported section of the header
330  * - Update the EVFILT_SYSCOUNT value to reflect the new addition
331  * - Add a filterops to the sysfilt_ops array. Public filters should be added at the end
332  *   of the Public Filters section in the array.
333  * Private filters:
334  * - Add a new "EVFILT_" value to bsd/sys/event_private.h (typically a positive value)
335  * - Update the EVFILTID_MAX value to reflect the new addition
336  * - Add a filterops to the sysfilt_ops. Private filters should be added at the end of
337  *   the Private filters section of the array.
338  */
/* kn_filtid is 8 bits wide, so every filter id must fit in a uint8_t. */
static_assert(EVFILTID_MAX < UINT8_MAX, "kn_filtid expects this to be true");
/*
 * Master table mapping filter ids to their filterops.
 *
 * Public EVFILT_* values are negative, so ~EVFILT_x turns them into small
 * non-negative indices at the front of the array; private EVFILTID_* values
 * are positive and index directly.  Slots for filters that are compiled out
 * point at bad_filtops.
 */
static const struct filterops * const sysfilt_ops[EVFILTID_MAX] = {
	/* Public Filters */
	[~EVFILT_READ]                  = &file_filtops,
	[~EVFILT_WRITE]                 = &file_filtops,
	[~EVFILT_AIO]                   = &aio_filtops,
	[~EVFILT_VNODE]                 = &file_filtops,
	[~EVFILT_PROC]                  = &proc_filtops,
	[~EVFILT_SIGNAL]                = &sig_filtops,
	[~EVFILT_TIMER]                 = &timer_filtops,
	[~EVFILT_MACHPORT]              = &machport_attach_filtops,
	[~EVFILT_FS]                    = &fs_filtops,
	[~EVFILT_USER]                  = &user_filtops,
	[~EVFILT_UNUSED_11]             = &bad_filtops,
	[~EVFILT_VM]                    = &bad_filtops,
	[~EVFILT_SOCK]                  = &file_filtops,
#if CONFIG_MEMORYSTATUS
	[~EVFILT_MEMORYSTATUS]          = &memorystatus_filtops,
#else
	[~EVFILT_MEMORYSTATUS]          = &bad_filtops,
#endif
	[~EVFILT_EXCEPT]                = &file_filtops,
#if SKYWALK
	[~EVFILT_NW_CHANNEL]            = &file_filtops,
#else /* !SKYWALK */
	[~EVFILT_NW_CHANNEL]            = &bad_filtops,
#endif /* !SKYWALK */
	[~EVFILT_WORKLOOP]              = &workloop_filtops,
#if CONFIG_EXCLAVES
	[~EVFILT_EXCLAVES_NOTIFICATION] = &exclaves_notification_filtops,
#else /* !CONFIG_EXCLAVES */
	[~EVFILT_EXCLAVES_NOTIFICATION] = &bad_filtops,
#endif /* CONFIG_EXCLAVES*/

	/* Private filters */
	[EVFILTID_KQREAD]               = &kqread_filtops,
	[EVFILTID_PIPE_N]               = &pipe_nfiltops,
	[EVFILTID_PIPE_R]               = &pipe_rfiltops,
	[EVFILTID_PIPE_W]               = &pipe_wfiltops,
	[EVFILTID_PTSD]                 = &ptsd_kqops,
	[EVFILTID_SOREAD]               = &soread_filtops,
	[EVFILTID_SOWRITE]              = &sowrite_filtops,
	[EVFILTID_SCK]                  = &sock_filtops,
	[EVFILTID_SOEXCEPT]             = &soexcept_filtops,
	[EVFILTID_SPEC]                 = &spec_filtops,
	[EVFILTID_BPFREAD]              = &bpfread_filtops,
	[EVFILTID_NECP_FD]              = &necp_fd_rfiltops,
#if SKYWALK
	[EVFILTID_SKYWALK_CHANNEL_W]    = &skywalk_channel_wfiltops,
	[EVFILTID_SKYWALK_CHANNEL_R]    = &skywalk_channel_rfiltops,
	[EVFILTID_SKYWALK_CHANNEL_E]    = &skywalk_channel_efiltops,
#else /* !SKYWALK */
	[EVFILTID_SKYWALK_CHANNEL_W]    = &bad_filtops,
	[EVFILTID_SKYWALK_CHANNEL_R]    = &bad_filtops,
	[EVFILTID_SKYWALK_CHANNEL_E]    = &bad_filtops,
#endif /* !SKYWALK */
	[EVFILTID_FSEVENT]              = &fsevent_filtops,
	[EVFILTID_VN]                   = &vnode_filtops,
	[EVFILTID_TTY]                  = &tty_filtops,
	[EVFILTID_PTMX]                 = &ptmx_kqops,
	[EVFILTID_MACH_PORT]            = &mach_port_filtops,
	[EVFILTID_MACH_PORT_SET]        = &mach_port_set_filtops,

	/* fake filter for detached knotes, keep last */
	[EVFILTID_DETACHED]             = &bad_filtops,
};
405 
406 static inline bool
kqr_thread_bound(workq_threadreq_t kqr)407 kqr_thread_bound(workq_threadreq_t kqr)
408 {
409 	return kqr->tr_state == WORKQ_TR_STATE_BOUND;
410 }
411 
412 static inline bool
kqr_thread_permanently_bound(workq_threadreq_t kqr)413 kqr_thread_permanently_bound(workq_threadreq_t kqr)
414 {
415 	return kqr_thread_bound(kqr) && (kqr->tr_flags & WORKQ_TR_FLAG_PERMANENT_BIND);
416 }
417 
418 static inline bool
kqr_thread_requested_pending(workq_threadreq_t kqr)419 kqr_thread_requested_pending(workq_threadreq_t kqr)
420 {
421 	workq_tr_state_t tr_state = kqr->tr_state;
422 	return tr_state > WORKQ_TR_STATE_IDLE && tr_state < WORKQ_TR_STATE_BOUND;
423 }
424 
425 static inline bool
kqr_thread_requested(workq_threadreq_t kqr)426 kqr_thread_requested(workq_threadreq_t kqr)
427 {
428 	return kqr->tr_state != WORKQ_TR_STATE_IDLE;
429 }
430 
/*
 * Returns the thread bound to this request without a state check in
 * release builds; the caller must already know the request is bound
 * (asserted on DEBUG/DEVELOPMENT kernels).
 */
static inline thread_t
kqr_thread_fast(workq_threadreq_t kqr)
{
	assert(kqr_thread_bound(kqr));
	return kqr->tr_thread;
}
437 
438 static inline thread_t
kqr_thread(workq_threadreq_t kqr)439 kqr_thread(workq_threadreq_t kqr)
440 {
441 	return kqr_thread_bound(kqr) ? kqr->tr_thread : THREAD_NULL;
442 }
443 
/*
 * Returns the kqworkloop embedding this thread request, or NULL when
 * the request is not a workloop's (i.e. it belongs to a kqworkq bucket).
 */
static inline struct kqworkloop *
kqr_kqworkloop(workq_threadreq_t kqr)
{
	if (kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
		/* workloop requests are embedded as the kqwl_request field */
		return __container_of(kqr, struct kqworkloop, kqwl_request);
	}
	return NULL;
}
452 
/*
 * Resolves the kqueue (workloop or workq) a thread request belongs to.
 *
 * Workloop requests map back to their containing kqworkloop; otherwise
 * the request must be one of the per-QoS buckets of the process-wide
 * workq kqueue (p->p_fd.fd_wqkqueue), which the assert validates.
 */
static inline kqueue_t
kqr_kqueue(proc_t p, workq_threadreq_t kqr)
{
	kqueue_t kqu;
	if (kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
		kqu.kqwl = kqr_kqworkloop(kqr);
	} else {
		kqu.kqwq = p->p_fd.fd_wqkqueue;
		/* kqr must point into this workq's request array */
		assert(kqr >= kqu.kqwq->kqwq_request &&
		    kqr < kqu.kqwq->kqwq_request + KQWQ_NBUCKETS);
	}
	return kqu;
}
466 
#if CONFIG_PREADOPT_TG
/* There are no guarantees about which locks are held when this is called */
/*
 * Returns the thread group pre-adopted by the workloop owning this
 * request, or NULL when the request is not a workloop's.  The load is
 * relaxed: callers must tolerate a racy snapshot.
 */
inline thread_group_qos_t
kqr_preadopt_thread_group(workq_threadreq_t req)
{
	struct kqworkloop *kqwl = kqr_kqworkloop(req);
	return kqwl ? os_atomic_load(&kqwl->kqwl_preadopt_tg, relaxed) : NULL;
}

/* There are no guarantees about which locks are held when this is called */
/*
 * Returns the address of the workloop's preadopt thread group slot (for
 * atomic updates by the caller), or NULL for non-workloop requests.
 */
inline _Atomic(thread_group_qos_t) *
kqr_preadopt_thread_group_addr(workq_threadreq_t req)
{
	struct kqworkloop *kqwl = kqr_kqworkloop(req);
	return kqwl ? (&kqwl->kqwl_preadopt_tg) : NULL;
}
#endif
484 
485 /*
486  * kqueue/note lock implementations
487  *
488  *	The kqueue lock guards the kq state, the state of its queues,
489  *	and the kqueue-aware status and locks of individual knotes.
490  *
491  *	The kqueue workq lock is used to protect state guarding the
492  *	interaction of the kqueue with the workq.  This state cannot
493  *	be guarded by the kq lock - as it needs to be taken when we
494  *	already have the waitq set lock held (during the waitq hook
495  *	callback).  It might be better to use the waitq lock itself
496  *	for this, but the IRQ requirements make that difficult).
497  *
498  *	Knote flags, filter flags, and associated data are protected
499  *	by the underlying object lock - and are only ever looked at
500  *	by calling the filter to get a [consistent] snapshot of that
501  *	data.
502  */
503 
/* Acquire the kqueue spinlock guarding kq state, queues, and knote status. */
static inline void
kqlock(kqueue_t kqu)
{
	lck_spin_lock(&kqu.kq->kq_lock);
}
509 
/* Assert (on assert-enabled kernels) that the caller owns the kq spinlock. */
static inline void
kqlock_held(__assert_only kqueue_t kqu)
{
	LCK_SPIN_ASSERT(&kqu.kq->kq_lock, LCK_ASSERT_OWNED);
}
515 
/* Release the kqueue spinlock taken by kqlock(). */
static inline void
kqunlock(kqueue_t kqu)
{
	lck_spin_unlock(&kqu.kq->kq_lock);
}
521 
/*
 * Lock the per-process knote hash mutex (fd_knhashlock), which
 * presumably guards the non-fd knote hash table — see kq_add_knote.
 */
static inline void
knhash_lock(struct filedesc *fdp)
{
	lck_mtx_lock(&fdp->fd_knhashlock);
}
527 
/* Release the per-process knote hash mutex taken by knhash_lock(). */
static inline void
knhash_unlock(struct filedesc *fdp)
{
	lck_mtx_unlock(&fdp->fd_knhashlock);
}
533 
534 /* wait event for knote locks */
/*
 * Wait event used to block/wake threads contending for the "knote" lock
 * (knote_lock_slow()/knote_unlock()); keyed off the knote's kn_hook slot.
 */
static inline event_t
knote_lock_wev(struct knote *kn)
{
	return (event_t)(&kn->kn_hook);
}
540 
541 /* wait event for kevent_register_wait_* */
/* 64-bit wait event for kevent_register_wait_*, keyed off the knote itself. */
static inline event64_t
knote_filt_wev64(struct knote *kn)
{
	/* kdp_workloop_sync_wait_find_owner knows about this */
	return CAST_EVENT64_T(kn);
}
548 
549 /* wait event for knote_post/knote_drop */
/*
 * Wait event used by knote_wait_for_post() to wait for an in-flight
 * f_event() (KN_POSTING) to finish; keyed off the knote's kn_kevent.
 */
static inline event_t
knote_post_wev(struct knote *kn)
{
	return &kn->kn_kevent;
}
555 
556 /*!
557  * @function knote_has_qos
558  *
559  * @brief
560  * Whether the knote has a regular QoS.
561  *
562  * @discussion
563  * kn_qos_override is:
564  * - 0 on kqfiles
565  * - THREAD_QOS_LAST for special buckets (manager)
566  *
567  * Other values mean the knote participates to QoS propagation.
568  */
569 static inline bool
knote_has_qos(struct knote * kn)570 knote_has_qos(struct knote *kn)
571 {
572 	return kn->kn_qos_override > 0 && kn->kn_qos_override < THREAD_QOS_LAST;
573 }
574 
575 #pragma mark knote locks
576 
577 /*
578  * Enum used by the knote_lock_* functions.
579  *
580  * KNOTE_KQ_LOCK_ALWAYS
581  *   The function will always return with the kq lock held.
582  *
583  * KNOTE_KQ_LOCK_ON_SUCCESS
584  *   The function will return with the kq lock held if it was successful
585  *   (knote_lock() is the only function that can fail).
586  *
587  * KNOTE_KQ_LOCK_ON_FAILURE
588  *   The function will return with the kq lock held if it was unsuccessful
589  *   (knote_lock() is the only function that can fail).
590  *
591  * KNOTE_KQ_UNLOCK:
592  *   The function returns with the kq unlocked.
593  */
enum kqlocking {
	KNOTE_KQ_LOCK_ALWAYS,     /* return with the kq lock held, success or failure */
	KNOTE_KQ_LOCK_ON_SUCCESS, /* hold the kq lock only on success */
	KNOTE_KQ_LOCK_ON_FAILURE, /* hold the kq lock only on failure */
	KNOTE_KQ_UNLOCK,          /* always return with the kq unlocked */
};
600 
/*
 * Finds the lock context of the current owner of the "knote" lock for kn
 * by scanning the contexts published on the kqueue's kq_knlocks list.
 *
 * Called with the kq lock held.  Panics if no context is found, since
 * KN_LOCKED being set implies an owner context must be on the list.
 */
static struct knote_lock_ctx *
knote_lock_ctx_find(kqueue_t kqu, struct knote *kn)
{
	struct knote_lock_ctx *ctx;
	LIST_FOREACH(ctx, &kqu.kq->kq_knlocks, knlc_link) {
		if (ctx->knlc_knote == kn) {
			return ctx;
		}
	}
	panic("knote lock context not found: %p", kn);
	__builtin_trap();
}
613 
614 /* slowpath of knote_lock() */
/*
 * Contended path of knote_lock(): the knote is already KN_LOCKED, so
 * register as a waiter on the owner's lock context and sleep with QoS
 * inheritance until knote_unlock() hands the lock over (returns true)
 * or knote_unlock_cancel() aborts all waiters (returns false).
 *
 * Called with the kq lock held; the kq lock is re-taken on return
 * according to the kqlocking mode.
 */
__attribute__((noinline))
static bool __result_use_check
knote_lock_slow(kqueue_t kqu, struct knote *kn,
    struct knote_lock_ctx *knlc, int kqlocking)
{
	struct knote_lock_ctx *owner_lc;
	struct uthread *uth = current_uthread();
	wait_result_t wr;

	kqlock_held(kqu);

	owner_lc = knote_lock_ctx_find(kqu, kn);
#if MACH_ASSERT
	knlc->knlc_state = KNOTE_LOCK_CTX_WAITING;
#endif
	owner_lc->knlc_waiters++;

	/*
	 * Make our lock context visible to knote_unlock()
	 */
	uth->uu_knlock = knlc;

	/*
	 * Sleep with inheritance so the current owner's thread is boosted to
	 * our QoS while we wait; this also drops the kq spinlock.
	 */
	wr = lck_spin_sleep_with_inheritor(&kqu.kq->kq_lock, LCK_SLEEP_UNLOCK,
	    knote_lock_wev(kn), owner_lc->knlc_thread,
	    THREAD_UNINT | THREAD_WAIT_NOREPORT, TIMEOUT_WAIT_FOREVER);

	if (wr == THREAD_RESTART) {
		/*
		 * We haven't been woken up by knote_unlock() but knote_unlock_cancel.
		 * We need to cleanup the state since no one did.
		 */
		uth->uu_knlock = NULL;
#if MACH_ASSERT
		assert(knlc->knlc_state == KNOTE_LOCK_CTX_WAITING);
		knlc->knlc_state = KNOTE_LOCK_CTX_UNLOCKED;
#endif

		if (kqlocking == KNOTE_KQ_LOCK_ALWAYS ||
		    kqlocking == KNOTE_KQ_LOCK_ON_FAILURE) {
			kqlock(kqu);
		}
		return false;
	} else {
		if (kqlocking == KNOTE_KQ_LOCK_ALWAYS ||
		    kqlocking == KNOTE_KQ_LOCK_ON_SUCCESS) {
			kqlock(kqu);
			/*
			 * This state is set under the lock so we can't
			 * really assert this unless we hold the lock.
			 */
			assert(knlc->knlc_state == KNOTE_LOCK_CTX_LOCKED);
		}
		return true;
	}
}
670 
671 /*
672  * Attempts to take the "knote" lock.
673  *
674  * Called with the kqueue lock held.
675  *
676  * Returns true if the knote lock is acquired, false if it has been dropped
677  */
static bool __result_use_check
knote_lock(kqueue_t kqu, struct knote *kn, struct knote_lock_ctx *knlc,
    enum kqlocking kqlocking)
{
	kqlock_held(kqu);

#if MACH_ASSERT
	assert(knlc->knlc_state == KNOTE_LOCK_CTX_UNLOCKED);
#endif
	/* initialize our context before it can become visible to others */
	knlc->knlc_knote = kn;
	knlc->knlc_thread = current_thread();
	knlc->knlc_waiters = 0;

	/* contended: someone else holds the knote lock, go wait */
	if (__improbable(kn->kn_status & KN_LOCKED)) {
		return knote_lock_slow(kqu, kn, knlc, kqlocking);
	}

	/*
	 * When the knote will be dropped, the knote lock is taken before
	 * KN_DROPPING is set, and then the knote will be removed from any
	 * hash table that references it before the lock is canceled.
	 */
	assert((kn->kn_status & KN_DROPPING) == 0);
	/* publish ourselves as the owner so knote_lock_ctx_find() can see us */
	LIST_INSERT_HEAD(&kqu.kq->kq_knlocks, knlc, knlc_link);
	kn->kn_status |= KN_LOCKED;
#if MACH_ASSERT
	knlc->knlc_state = KNOTE_LOCK_CTX_LOCKED;
#endif

	if (kqlocking == KNOTE_KQ_UNLOCK ||
	    kqlocking == KNOTE_KQ_LOCK_ON_FAILURE) {
		kqunlock(kqu);
	}
	return true;
}
713 
714 /*
715  * Unlocks a knote successfully locked with knote_lock().
716  *
717  * Called with the kqueue lock held.
718  *
719  * Returns with the kqueue lock held according to KNOTE_KQ_* mode.
720  */
static void
knote_unlock(kqueue_t kqu, struct knote *kn,
    struct knote_lock_ctx *knlc, enum kqlocking kqlocking)
{
	kqlock_held(kqu);

	assert(knlc->knlc_knote == kn);
	assert(kn->kn_status & KN_LOCKED);
	assert(knlc->knlc_state == KNOTE_LOCK_CTX_LOCKED);

	/* we are no longer the published owner */
	LIST_REMOVE(knlc, knlc_link);

	if (knlc->knlc_waiters) {
		thread_t thread = THREAD_NULL;

		/* wake exactly one waiter; it becomes the new lock owner */
		wakeup_one_with_inheritor(knote_lock_wev(kn), THREAD_AWAKENED,
		    LCK_WAKE_DEFAULT, &thread);

		/*
		 * knote_lock_slow() publishes the lock context of waiters
		 * in uthread::uu_knlock.
		 *
		 * Reach out and make this context the new owner.
		 */
		struct uthread *ut = get_bsdthread_info(thread);
		struct knote_lock_ctx *next_owner_lc = ut->uu_knlock;

		assert(next_owner_lc->knlc_knote == kn);
		/* transfer the remaining waiter count to the new owner */
		next_owner_lc->knlc_waiters = knlc->knlc_waiters - 1;
		LIST_INSERT_HEAD(&kqu.kq->kq_knlocks, next_owner_lc, knlc_link);
#if MACH_ASSERT
		next_owner_lc->knlc_state = KNOTE_LOCK_CTX_LOCKED;
#endif
		ut->uu_knlock = NULL;
		/* drop the ref returned by wakeup_one_with_inheritor */
		thread_deallocate_safe(thread);
	} else {
		/* no waiters: the knote lock is simply released */
		kn->kn_status &= ~KN_LOCKED;
	}

	if ((kn->kn_status & KN_MERGE_QOS) && !(kn->kn_status & KN_POSTING)) {
		/*
		 * No f_event() in flight anymore, we can leave QoS "Merge" mode
		 *
		 * See knote_adjust_qos()
		 */
		kn->kn_status &= ~KN_MERGE_QOS;
	}
	if (kqlocking == KNOTE_KQ_UNLOCK) {
		kqunlock(kqu);
	}
#if MACH_ASSERT
	knlc->knlc_state = KNOTE_LOCK_CTX_UNLOCKED;
#endif
}
775 
776 /*
777  * Aborts all waiters for a knote lock, and unlock the knote.
778  *
779  * Called with the kqueue lock held.
780  *
781  * Returns with the kqueue unlocked.
782  */
static void
knote_unlock_cancel(struct kqueue *kq, struct knote *kn,
    struct knote_lock_ctx *knlc)
{
	kqlock_held(kq);

	assert(knlc->knlc_knote == kn);
	assert(kn->kn_status & KN_LOCKED);
	/* only used on the drop path, hence the KN_DROPPING requirement */
	assert(kn->kn_status & KN_DROPPING);

	LIST_REMOVE(knlc, knlc_link);
	kn->kn_status &= ~KN_LOCKED;
	kqunlock(kq);

	if (knlc->knlc_waiters) {
		/*
		 * THREAD_RESTART tells knote_lock_slow() it was not handed the
		 * lock and must clean up its own waiter state.
		 */
		wakeup_all_with_inheritor(knote_lock_wev(kn), THREAD_RESTART);
	}
#if MACH_ASSERT
	knlc->knlc_state = KNOTE_LOCK_CTX_UNLOCKED;
#endif
}
804 
805 /*
806  * Call the f_event hook of a given filter.
807  *
808  * Takes a use count to protect against concurrent drops.
809  * Called with the object lock held.
810  */
static void
knote_post(struct knote *kn, long hint)
{
	struct kqueue *kq = knote_get_kq(kn);
	int dropping, result;

	kqlock(kq);

	/* knote is being dropped or its object vanished: nothing to post */
	if (__improbable(kn->kn_status & (KN_DROPPING | KN_VANISHED))) {
		return kqunlock(kq);
	}

	if (__improbable(kn->kn_status & KN_POSTING)) {
		panic("KNOTE() called concurrently on knote %p", kn);
	}

	/*
	 * KN_POSTING marks the f_event call in flight so that droppers wait
	 * for us (knote_wait_for_post) while we run without the kq lock.
	 */
	kn->kn_status |= KN_POSTING;

	kqunlock(kq);
	result = filter_call(knote_fops(kn), f_event(kn, hint));
	kqlock(kq);

	/* Someone dropped the knote/the monitored object vanished while we
	 * were in f_event, swallow the side effects of the post.
	 */
	dropping = (kn->kn_status & (KN_DROPPING | KN_VANISHED));

	if (!dropping && (result & FILTER_ADJUST_EVENT_IOTIER_BIT)) {
		kqueue_update_iotier_override(kq);
	}

	if (!dropping && (result & FILTER_ACTIVE)) {
		knote_activate(kq, kn, result);
	}

	if ((kn->kn_status & KN_LOCKED) == 0) {
		/*
		 * There's no other f_* call in flight, we can leave QoS "Merge" mode.
		 *
		 * See knote_adjust_qos()
		 */
		kn->kn_status &= ~(KN_POSTING | KN_MERGE_QOS);
	} else {
		kn->kn_status &= ~KN_POSTING;
	}

	/* wake any dropper blocked in knote_wait_for_post() */
	if (__improbable(dropping)) {
		thread_wakeup(knote_post_wev(kn));
	}

	kqunlock(kq);
}
863 
864 /*
865  * Called by knote_drop() and knote_fdclose() to wait for the last f_event()
866  * caller to be done.
867  *
868  *	- kq locked at entry
869  *	- kq unlocked at exit
870  */
static void
knote_wait_for_post(struct kqueue *kq, struct knote *kn)
{
	kqlock_held(kq);

	/* Only valid once the knote is already being torn down. */
	assert(kn->kn_status & (KN_DROPPING | KN_VANISHED));

	if (kn->kn_status & KN_POSTING) {
		/*
		 * Sleep until knote_post() wakes us; LCK_SLEEP_UNLOCK drops
		 * the kq lock on the way out, satisfying the exit contract.
		 */
		lck_spin_sleep(&kq->kq_lock, LCK_SLEEP_UNLOCK, knote_post_wev(kn),
		    THREAD_UNINT | THREAD_WAIT_NOREPORT);
	} else {
		kqunlock(kq);
	}
}
885 
886 #pragma mark knote helpers for filters
887 
OS_ALWAYS_INLINE
void *
knote_kn_hook_get_raw(struct knote *kn)
{
	/*
	 * Read the filter-private hook pointer stored in the knote.
	 * With pointer authentication, the stored value is signed; it is
	 * authenticated here before being handed back to the caller.
	 */
	uintptr_t *addr = &kn->kn_hook;

	void *hook = (void *) *addr;
#if __has_feature(ptrauth_calls)
	if (hook) {
		/* Discriminator blends filter identity with the storage address. */
		uint16_t blend = kn->kn_filter;
		blend |= (kn->kn_filtid << 8);
		blend ^= OS_PTRAUTH_DISCRIMINATOR("kn.kn_hook");

		hook = ptrauth_auth_data(hook, ptrauth_key_process_independent_data,
		    ptrauth_blend_discriminator(addr, blend));
	}
#endif

	return hook;
}
908 
OS_ALWAYS_INLINE void
knote_kn_hook_set_raw(struct knote *kn, void *kn_hook)
{
	/*
	 * Store a filter-private hook pointer in the knote.  With pointer
	 * authentication, the value is signed with the same address+filter
	 * blended discriminator that knote_kn_hook_get_raw() authenticates with.
	 */
	uintptr_t *addr = &kn->kn_hook;
#if __has_feature(ptrauth_calls)
	if (kn_hook) {
		uint16_t blend = kn->kn_filter;
		blend |= (kn->kn_filtid << 8);
		blend ^= OS_PTRAUTH_DISCRIMINATOR("kn.kn_hook");

		kn_hook = ptrauth_sign_unauthenticated(kn_hook,
		    ptrauth_key_process_independent_data,
		    ptrauth_blend_discriminator(addr, blend));
	}
#endif
	*addr = (uintptr_t) kn_hook;
}
926 
927 OS_ALWAYS_INLINE
928 void
knote_set_error(struct knote * kn,int error)929 knote_set_error(struct knote *kn, int error)
930 {
931 	kn->kn_flags |= EV_ERROR;
932 	kn->kn_sdata = error;
933 }
934 
935 OS_ALWAYS_INLINE
936 int64_t
knote_low_watermark(const struct knote * kn)937 knote_low_watermark(const struct knote *kn)
938 {
939 	return (kn->kn_sfflags & NOTE_LOWAT) ? kn->kn_sdata : 1;
940 }
941 
942 /*!
943  * @function knote_fill_kevent_with_sdata
944  *
945  * @brief
946  * Fills in a kevent from the current content of a knote.
947  *
948  * @discussion
949  * This is meant to be called from filter's f_process hooks.
950  * The kevent data is filled with kn->kn_sdata.
951  *
952  * kn->kn_fflags is cleared if kn->kn_flags has EV_CLEAR set.
953  *
954  * Using knote_fill_kevent is typically preferred.
955  */
956 OS_ALWAYS_INLINE
957 void
knote_fill_kevent_with_sdata(struct knote * kn,struct kevent_qos_s * kev)958 knote_fill_kevent_with_sdata(struct knote *kn, struct kevent_qos_s *kev)
959 {
960 #define knote_assert_aliases(name1, offs1, name2) \
961 	static_assert(offsetof(struct kevent_qos_s, name1) + offs1 == \
962 	    offsetof(struct kevent_internal_s, name2), \
963 	        "kevent_qos_s::" #name1 " and kevent_internal_s::" #name2 "need to alias")
964 	/*
965 	 * All the code makes assumptions on these aliasing,
966 	 * so make sure we fail the build if we ever ever ever break them.
967 	 */
968 	knote_assert_aliases(ident, 0, kei_ident);
969 #ifdef __LITTLE_ENDIAN__
970 	knote_assert_aliases(filter, 0, kei_filter);  // non trivial overlap
971 	knote_assert_aliases(filter, 1, kei_filtid);  // non trivial overlap
972 #else
973 	knote_assert_aliases(filter, 0, kei_filtid);  // non trivial overlap
974 	knote_assert_aliases(filter, 1, kei_filter);  // non trivial overlap
975 #endif
976 	knote_assert_aliases(flags, 0, kei_flags);
977 	knote_assert_aliases(qos, 0, kei_qos);
978 	knote_assert_aliases(udata, 0, kei_udata);
979 	knote_assert_aliases(fflags, 0, kei_fflags);
980 	knote_assert_aliases(xflags, 0, kei_sfflags); // non trivial overlap
981 	knote_assert_aliases(data, 0, kei_sdata);     // non trivial overlap
982 	knote_assert_aliases(ext, 0, kei_ext);
983 #undef knote_assert_aliases
984 
985 	/*
986 	 * Fix the differences between kevent_qos_s and kevent_internal_s:
987 	 * - xflags is where kn_sfflags lives, we need to zero it
988 	 * - fixup the high bits of `filter` where kn_filtid lives
989 	 */
990 	*kev = *(struct kevent_qos_s *)&kn->kn_kevent;
991 	kev->xflags = 0;
992 	kev->filter |= 0xff00;
993 	if (kn->kn_flags & EV_CLEAR) {
994 		kn->kn_fflags = 0;
995 	}
996 }
997 
998 /*!
999  * @function knote_fill_kevent
1000  *
1001  * @brief
1002  * Fills in a kevent from the current content of a knote.
1003  *
1004  * @discussion
1005  * This is meant to be called from filter's f_process hooks.
1006  * The kevent data is filled with the passed in data.
1007  *
1008  * kn->kn_fflags is cleared if kn->kn_flags has EV_CLEAR set.
1009  */
1010 OS_ALWAYS_INLINE
1011 void
knote_fill_kevent(struct knote * kn,struct kevent_qos_s * kev,int64_t data)1012 knote_fill_kevent(struct knote *kn, struct kevent_qos_s *kev, int64_t data)
1013 {
1014 	knote_fill_kevent_with_sdata(kn, kev);
1015 	kev->filter = kn->kn_filter;
1016 	kev->data = data;
1017 }
1018 
1019 
1020 #pragma mark file_filtops
1021 
/*
 * Generic attach for fd-based knotes: delegate to the fileops of the
 * file the knote watches and return its filter result.
 */
static int
filt_fileattach(struct knote *kn, struct kevent_qos_s *kev)
{
	return fo_kqfilter(kn->kn_fp, kn, kev);
}
1027 
/* Dispatch table for fd-based knotes; real ops come from fo_kqfilter. */
SECURITY_READ_ONLY_EARLY(static struct filterops) file_filtops = {
	.f_isfd = 1,
	.f_attach = filt_fileattach,
};
1032 
1033 #pragma mark kqread_filtops
1034 
1035 #define f_flag fp_glob->fg_flag
1036 #define f_ops fp_glob->fg_ops
1037 #define f_lflags fp_glob->fg_lflags
1038 
1039 static void
filt_kqdetach(struct knote * kn)1040 filt_kqdetach(struct knote *kn)
1041 {
1042 	struct kqfile *kqf = (struct kqfile *)fp_get_data(kn->kn_fp);
1043 	struct kqueue *kq = &kqf->kqf_kqueue;
1044 
1045 	kqlock(kq);
1046 	KNOTE_DETACH(&kqf->kqf_sel.si_note, kn);
1047 	kqunlock(kq);
1048 }
1049 
1050 static int
filt_kqueue(struct knote * kn,__unused long hint)1051 filt_kqueue(struct knote *kn, __unused long hint)
1052 {
1053 	struct kqueue *kq = (struct kqueue *)fp_get_data(kn->kn_fp);
1054 
1055 	return kq->kq_count > 0;
1056 }
1057 
1058 static int
filt_kqtouch(struct knote * kn,struct kevent_qos_s * kev)1059 filt_kqtouch(struct knote *kn, struct kevent_qos_s *kev)
1060 {
1061 #pragma unused(kev)
1062 	struct kqueue *kq = (struct kqueue *)fp_get_data(kn->kn_fp);
1063 	int res;
1064 
1065 	kqlock(kq);
1066 	res = (kq->kq_count > 0);
1067 	kqunlock(kq);
1068 
1069 	return res;
1070 }
1071 
1072 static int
filt_kqprocess(struct knote * kn,struct kevent_qos_s * kev)1073 filt_kqprocess(struct knote *kn, struct kevent_qos_s *kev)
1074 {
1075 	struct kqueue *kq = (struct kqueue *)fp_get_data(kn->kn_fp);
1076 	int res = 0;
1077 
1078 	kqlock(kq);
1079 	if (kq->kq_count) {
1080 		knote_fill_kevent(kn, kev, kq->kq_count);
1081 		res = 1;
1082 	}
1083 	kqunlock(kq);
1084 
1085 	return res;
1086 }
1087 
/* EVFILT_READ on a kqueue fd: fires when the watched kqueue has events. */
SECURITY_READ_ONLY_EARLY(static struct filterops) kqread_filtops = {
	.f_isfd = 1,
	.f_detach = filt_kqdetach,
	.f_event = filt_kqueue,
	.f_touch = filt_kqtouch,
	.f_process = filt_kqprocess,
};
1095 
1096 #pragma mark proc_filtops
1097 
/*
 * EVFILT_PROC attach: validate flags, take a transient ref on the target
 * process, enforce the exit-status privilege check, and hook the knote
 * onto the process klist.  Errors are reported via knote_set_error().
 */
static int
filt_procattach(struct knote *kn, __unused struct kevent_qos_s *kev)
{
	struct proc *p;

	/* NOTE_PDATAMASK must be wide enough to carry any pid in hints. */
	assert(PID_MAX < NOTE_PDATAMASK);

	/* fork-tracking (NOTE_TRACK family) is not supported */
	if ((kn->kn_sfflags & (NOTE_TRACK | NOTE_TRACKERR | NOTE_CHILD)) != 0) {
		knote_set_error(kn, ENOTSUP);
		return 0;
	}

	p = proc_find((int)kn->kn_id);
	if (p == NULL) {
		knote_set_error(kn, ESRCH);
		return 0;
	}

	const uint32_t NoteExitStatusBits = NOTE_EXIT | NOTE_EXITSTATUS;

	/*
	 * Observing the exit status is privileged: only the parent, the
	 * tracing (ptrace) parent-in-waiting, or a process allowed to
	 * deliver SIGKILL to the target may request it.
	 */
	if ((kn->kn_sfflags & NoteExitStatusBits) == NoteExitStatusBits) {
		do {
			pid_t selfpid = proc_selfpid();

			if (p->p_ppid == selfpid) {
				break;  /* parent => ok */
			}
			if ((p->p_lflag & P_LTRACED) != 0 &&
			    (p->p_oppid == selfpid)) {
				break;  /* parent-in-waiting => ok */
			}
			if (cansignal(current_proc(), kauth_cred_get(), p, SIGKILL)) {
				break; /* allowed to signal => ok */
			}
			proc_rele(p);
			knote_set_error(kn, EACCES);
			return 0;
		} while (0);
	}

	kn->kn_proc = p;
	kn->kn_flags |= EV_CLEAR;       /* automatically set */
	kn->kn_sdata = 0;               /* incoming data is ignored */

	proc_klist_lock();

	KNOTE_ATTACH(&p->p_klist, kn);

	proc_klist_unlock();

	/* the klist link, not this ref, keeps the knote<->proc association */
	proc_rele(p);

	/*
	 * only captures edge-triggered events after this point
	 * so it can't already be fired.
	 */
	return 0;
}
1156 
1157 
1158 /*
1159  * The knote may be attached to a different process, which may exit,
1160  * leaving nothing for the knote to be attached to.  In that case,
1161  * the pointer to the process will have already been nulled out.
1162  */
1163 static void
filt_procdetach(struct knote * kn)1164 filt_procdetach(struct knote *kn)
1165 {
1166 	struct proc *p;
1167 
1168 	proc_klist_lock();
1169 
1170 	p = kn->kn_proc;
1171 	if (p != PROC_NULL) {
1172 		kn->kn_proc = PROC_NULL;
1173 		KNOTE_DETACH(&p->p_klist, kn);
1174 	}
1175 
1176 	proc_klist_unlock();
1177 }
1178 
/*
 * EVFILT_PROC f_event: record process lifecycle events (hint) into the
 * knote.  For NOTE_EXIT, additionally collect exit status / exit detail
 * bits into kn_hook32 for delivery by filt_procprocess().
 */
static int
filt_procevent(struct knote *kn, long hint)
{
	u_int event;

	/* ALWAYS CALLED WITH proc_klist_lock */

	/*
	 * Note: a lot of bits in hint may be obtained from the knote
	 * To free some of those bits, see <rdar://problem/12592988> Freeing up
	 * bits in hint for filt_procevent
	 *
	 * mask off extra data
	 */
	event = (u_int)hint & NOTE_PCTRLMASK;

	/*
	 * termination lifecycle events can happen while a debugger
	 * has reparented a process, in which case notifications
	 * should be quashed except to the tracing parent. When
	 * the debugger reaps the child (either via wait4(2) or
	 * process exit), the child will be reparented to the original
	 * parent and these knotes re-fired.
	 */
	if (event & NOTE_EXIT) {
		if ((kn->kn_proc->p_oppid != 0)
		    && (proc_getpid(knote_get_kq(kn)->kq_p) != kn->kn_proc->p_ppid)) {
			/*
			 * This knote is not for the current ptrace(2) parent, ignore.
			 */
			return 0;
		}
	}

	/*
	 * if the user is interested in this event, record it.
	 */
	if (kn->kn_sfflags & event) {
		kn->kn_fflags |= event;
	}

#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wdeprecated-declarations"
	/* NOTE_REAP is deprecated; EOF/oneshot the knote once nothing more can fire. */
	if ((event == NOTE_REAP) || ((event == NOTE_EXIT) && !(kn->kn_sfflags & NOTE_REAP))) {
		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
	}
#pragma clang diagnostic pop


	/*
	 * The kernel has a wrapper in place that returns the same data
	 * as is collected here, in kn_hook32.  Any changes to how
	 * NOTE_EXITSTATUS and NOTE_EXIT_DETAIL are collected
	 * should also be reflected in the proc_pidnoteexit() wrapper.
	 */
	if (event == NOTE_EXIT) {
		kn->kn_hook32 = 0;
		if ((kn->kn_sfflags & NOTE_EXITSTATUS) != 0) {
			kn->kn_fflags |= NOTE_EXITSTATUS;
			kn->kn_hook32 |= (hint & NOTE_PDATAMASK);
		}
		if ((kn->kn_sfflags & NOTE_EXIT_DETAIL) != 0) {
			kn->kn_fflags |= NOTE_EXIT_DETAIL;
			if ((kn->kn_proc->p_lflag &
			    P_LTERM_DECRYPTFAIL) != 0) {
				kn->kn_hook32 |= NOTE_EXIT_DECRYPTFAIL;
			}
			if ((kn->kn_proc->p_lflag &
			    P_LTERM_JETSAM) != 0) {
				/* translate the jetsam kill reason into detail bits */
				kn->kn_hook32 |= NOTE_EXIT_MEMORY;
				switch (kn->kn_proc->p_lflag & P_JETSAM_MASK) {
				case P_JETSAM_VMPAGESHORTAGE:
					kn->kn_hook32 |= NOTE_EXIT_MEMORY_VMPAGESHORTAGE;
					break;
				case P_JETSAM_VMTHRASHING:
					kn->kn_hook32 |= NOTE_EXIT_MEMORY_VMTHRASHING;
					break;
				case P_JETSAM_FCTHRASHING:
					kn->kn_hook32 |= NOTE_EXIT_MEMORY_FCTHRASHING;
					break;
				case P_JETSAM_VNODE:
					kn->kn_hook32 |= NOTE_EXIT_MEMORY_VNODE;
					break;
				case P_JETSAM_HIWAT:
					kn->kn_hook32 |= NOTE_EXIT_MEMORY_HIWAT;
					break;
				case P_JETSAM_PID:
					kn->kn_hook32 |= NOTE_EXIT_MEMORY_PID;
					break;
				case P_JETSAM_IDLEEXIT:
					kn->kn_hook32 |= NOTE_EXIT_MEMORY_IDLE;
					break;
				}
			}
			if ((proc_getcsflags(kn->kn_proc) &
			    CS_KILLED) != 0) {
				kn->kn_hook32 |= NOTE_EXIT_CSERROR;
			}
		}
	}

	/* if we have any matching state, activate the knote */
	return kn->kn_fflags != 0;
}
1283 
1284 static int
filt_proctouch(struct knote * kn,struct kevent_qos_s * kev)1285 filt_proctouch(struct knote *kn, struct kevent_qos_s *kev)
1286 {
1287 	int res;
1288 
1289 	proc_klist_lock();
1290 
1291 	/* accept new filter flags and mask off output events no long interesting */
1292 	kn->kn_sfflags = kev->fflags;
1293 
1294 	/* restrict the current results to the (smaller?) set of new interest */
1295 	/*
1296 	 * For compatibility with previous implementations, we leave kn_fflags
1297 	 * as they were before.
1298 	 */
1299 	//kn->kn_fflags &= kn->kn_sfflags;
1300 
1301 	res = (kn->kn_fflags != 0);
1302 
1303 	proc_klist_unlock();
1304 
1305 	return res;
1306 }
1307 
1308 static int
filt_procprocess(struct knote * kn,struct kevent_qos_s * kev)1309 filt_procprocess(struct knote *kn, struct kevent_qos_s *kev)
1310 {
1311 	int res = 0;
1312 
1313 	proc_klist_lock();
1314 	if (kn->kn_fflags) {
1315 		knote_fill_kevent(kn, kev, kn->kn_hook32);
1316 		kn->kn_hook32 = 0;
1317 		res = 1;
1318 	}
1319 	proc_klist_unlock();
1320 	return res;
1321 }
1322 
/* EVFILT_PROC: monitors process lifecycle events via the proc klist. */
SECURITY_READ_ONLY_EARLY(static struct filterops) proc_filtops = {
	.f_attach  = filt_procattach,
	.f_detach  = filt_procdetach,
	.f_event   = filt_procevent,
	.f_touch   = filt_proctouch,
	.f_process = filt_procprocess,
};
1330 
1331 #pragma mark timer_filtops
1332 
/* Validated timer parameters produced by filt_timervalidate(). */
struct filt_timer_params {
	uint64_t deadline; /* deadline in abs/cont time
	                    * (or 0 if NOTE_ABSOLUTE and deadline is in past) */
	uint64_t leeway;   /* leeway in abstime, or 0 if none */
	uint64_t interval; /* interval in abstime or 0 if non-repeating timer */
};
1339 
1340 /*
1341  * Values stored in the knote at rest (using Mach absolute time units)
1342  *
1343  * kn->kn_thcall        where the thread_call object is stored
1344  * kn->kn_ext[0]        next deadline or 0 if immediate expiration
1345  * kn->kn_ext[1]        leeway value
1346  * kn->kn_sdata         interval timer: the interval
1347  *                      absolute/deadline timer: 0
1348  * kn->kn_hook32        timer state (with gencount)
1349  *
1350  * TIMER_IDLE:
1351  *   The timer has either never been scheduled or been cancelled.
1352  *   It is safe to schedule a new one in this state.
1353  *
1354  * TIMER_ARMED:
1355  *   The timer has been scheduled
1356  *
1357  * TIMER_FIRED
1358  *   The timer has fired and an event needs to be delivered.
1359  *   When in this state, the callout may still be running.
1360  *
1361  * TIMER_IMMEDIATE
1362  *   The timer has fired at registration time, and the callout was never
1363  *   dispatched.
1364  */
1365 #define TIMER_IDLE       0x0
1366 #define TIMER_ARMED      0x1
1367 #define TIMER_FIRED      0x2
1368 #define TIMER_IMMEDIATE  0x3
1369 #define TIMER_STATE_MASK 0x3
1370 #define TIMER_GEN_INC    0x4
1371 
1372 static void
filt_timer_set_params(struct knote * kn,struct filt_timer_params * params)1373 filt_timer_set_params(struct knote *kn, struct filt_timer_params *params)
1374 {
1375 	kn->kn_ext[0] = params->deadline;
1376 	kn->kn_ext[1] = params->leeway;
1377 	kn->kn_sdata  = params->interval;
1378 }
1379 
1380 /*
1381  * filt_timervalidate - process data from user
1382  *
1383  * Sets up the deadline, interval, and leeway from the provided user data
1384  *
1385  * Input:
1386  *      kn_sdata        timer deadline or interval time
1387  *      kn_sfflags      style of timer, unit of measurement
1388  *
1389  * Output:
1390  *      struct filter_timer_params to apply to the filter with
1391  *      filt_timer_set_params when changes are ready to be commited.
1392  *
1393  * Returns:
1394  *      EINVAL          Invalid user data parameters
1395  *      ERANGE          Various overflows with the parameters
1396  *
1397  * Called with timer filter lock held.
1398  */
static int
filt_timervalidate(const struct kevent_qos_s *kev,
    struct filt_timer_params *params)
{
	/*
	 * There are 5 knobs that need to be chosen for a timer registration:
	 *
	 * A) Units of time (what is the time duration of the specified number)
	 *      Absolute and interval take:
	 *              NOTE_SECONDS, NOTE_USECONDS, NOTE_NSECONDS, NOTE_MACHTIME
	 *      Defaults to milliseconds if not specified
	 *
	 * B) Clock epoch (what is the zero point of the specified number)
	 *      For interval, there is none
	 *      For absolute, defaults to the gettimeofday/calendar epoch
	 *      With NOTE_MACHTIME, uses mach_absolute_time()
	 *      With NOTE_MACHTIME and NOTE_MACH_CONTINUOUS_TIME, uses mach_continuous_time()
	 *
	 * C) The knote's behavior on delivery
	 *      Interval timer causes the knote to arm for the next interval unless one-shot is set
	 *      Absolute is a forced one-shot timer which deletes on delivery
	 *      TODO: Add a way for absolute to be not forced one-shot
	 *
	 * D) Whether the time duration is relative to now or absolute
	 *      Interval fires at now + duration when it is set up
	 *      Absolute fires at now + difference between now walltime and passed in walltime
	 *      With NOTE_MACHTIME it fires at an absolute MAT or MCT.
	 *
	 * E) Whether the timer continues to tick across sleep
	 *      By default all three do not.
	 *      For interval and absolute, NOTE_MACH_CONTINUOUS_TIME causes them to tick across sleep
	 *      With NOTE_ABSOLUTE | NOTE_MACHTIME | NOTE_MACH_CONTINUOUS_TIME:
	 *              expires when mach_continuous_time() is > the passed in value.
	 */

	/* nanoseconds per user-specified unit; 0 means "already in abstime" */
	uint64_t multiplier;

	boolean_t use_abstime = FALSE;

	switch (kev->fflags & (NOTE_SECONDS | NOTE_USECONDS | NOTE_NSECONDS | NOTE_MACHTIME)) {
	case NOTE_SECONDS:
		multiplier = NSEC_PER_SEC;
		break;
	case NOTE_USECONDS:
		multiplier = NSEC_PER_USEC;
		break;
	case NOTE_NSECONDS:
		multiplier = 1;
		break;
	case NOTE_MACHTIME:
		multiplier = 0;
		use_abstime = TRUE;
		break;
	case 0: /* milliseconds (default) */
		multiplier = NSEC_PER_SEC / 1000;
		break;
	default:
		/* more than one unit flag set, or an unknown combination */
		return EINVAL;
	}

	/* transform the leeway in kn_ext[1] to same time scale */
	if (kev->fflags & NOTE_LEEWAY) {
		uint64_t leeway_abs;

		if (use_abstime) {
			leeway_abs = (uint64_t)kev->ext[1];
		} else {
			uint64_t leeway_ns;
			if (os_mul_overflow((uint64_t)kev->ext[1], multiplier, &leeway_ns)) {
				return ERANGE;
			}

			nanoseconds_to_absolutetime(leeway_ns, &leeway_abs);
		}

		params->leeway = leeway_abs;
	} else {
		params->leeway = 0;
	}

	if (kev->fflags & NOTE_ABSOLUTE) {
		uint64_t deadline_abs;

		if (use_abstime) {
			deadline_abs = (uint64_t)kev->data;
		} else {
			uint64_t calendar_deadline_ns;

			if (os_mul_overflow((uint64_t)kev->data, multiplier, &calendar_deadline_ns)) {
				return ERANGE;
			}

			/* calendar_deadline_ns is in nanoseconds since the epoch */

			clock_sec_t seconds;
			clock_nsec_t nanoseconds;

			/*
			 * Note that the conversion through wall-time is only done once.
			 *
			 * If the relationship between MAT and gettimeofday changes,
			 * the underlying timer does not update.
			 *
			 * TODO: build a wall-time denominated timer_call queue
			 * and a flag to request DTRTing with wall-time timers
			 */
			clock_get_calendar_nanotime(&seconds, &nanoseconds);

			uint64_t calendar_now_ns = (uint64_t)seconds * NSEC_PER_SEC + nanoseconds;

			/* if deadline is in the future */
			if (calendar_now_ns < calendar_deadline_ns) {
				uint64_t interval_ns = calendar_deadline_ns - calendar_now_ns;
				uint64_t interval_abs;

				nanoseconds_to_absolutetime(interval_ns, &interval_abs);

				/*
				 * Note that the NOTE_MACH_CONTINUOUS_TIME flag here only
				 * causes the timer to keep ticking across sleep, but
				 * it does not change the calendar timebase.
				 */

				if (kev->fflags & NOTE_MACH_CONTINUOUS_TIME) {
					clock_continuoustime_interval_to_deadline(interval_abs,
					    &deadline_abs);
				} else {
					clock_absolutetime_interval_to_deadline(interval_abs,
					    &deadline_abs);
				}
			} else {
				deadline_abs = 0; /* cause immediate expiration */
			}
		}

		params->deadline = deadline_abs;
		params->interval = 0; /* NOTE_ABSOLUTE is non-repeating */
	} else if (kev->data < 0) {
		/*
		 * Negative interval timers fire immediately, once.
		 *
		 * Ideally a negative interval would be an error, but certain clients
		 * pass negative values on accident, and expect an event back.
		 *
		 * In the old implementation the timer would repeat with no delay
		 * N times until mach_absolute_time() + (N * interval) underflowed,
		 * then it would wait ~forever by accidentally arming a timer for the far future.
		 *
		 * We now skip the power-wasting hot spin phase and go straight to the idle phase.
		 */

		params->deadline = 0; /* expire immediately */
		params->interval = 0; /* non-repeating */
	} else {
		/* regular (repeating) interval timer */
		uint64_t interval_abs = 0;

		if (use_abstime) {
			interval_abs = (uint64_t)kev->data;
		} else {
			uint64_t interval_ns;
			if (os_mul_overflow((uint64_t)kev->data, multiplier, &interval_ns)) {
				return ERANGE;
			}

			nanoseconds_to_absolutetime(interval_ns, &interval_abs);
		}

		uint64_t deadline = 0;

		if (kev->fflags & NOTE_MACH_CONTINUOUS_TIME) {
			clock_continuoustime_interval_to_deadline(interval_abs, &deadline);
		} else {
			clock_absolutetime_interval_to_deadline(interval_abs, &deadline);
		}

		params->deadline = deadline;
		params->interval = interval_abs;
	}

	return 0;
}
1580 
1581 /*
1582  * filt_timerexpire - the timer callout routine
1583  */
static void
filt_timerexpire(void *knx, void *state_on_arm)
{
	struct knote *kn = knx;

	/* state_on_arm carries the gencount+ARMED state this callout was armed with */
	uint32_t state = (uint32_t)(uintptr_t)state_on_arm;
	uint32_t fired_state = state ^ TIMER_ARMED ^ TIMER_FIRED;

	/*
	 * Only the generation that armed us may transition ARMED -> FIRED;
	 * a stale generation fails the cmpxchg and the firing is dropped.
	 */
	if (os_atomic_cmpxchg(&kn->kn_hook32, state, fired_state, relaxed)) {
		// our f_event always would say FILTER_ACTIVE,
		// so be leaner and just do it.
		struct kqueue *kq = knote_get_kq(kn);
		kqlock(kq);
		knote_activate(kq, kn, FILTER_ACTIVE);
		kqunlock(kq);
	} else {
		/*
		 * The timer has been reprogrammed or canceled since it was armed,
		 * and this is a late firing for the timer, just ignore it.
		 */
	}
}
1606 
1607 /*
1608  * Does this deadline needs a timer armed for it, or has it expired?
1609  */
1610 static bool
filt_timer_is_ready(struct knote * kn)1611 filt_timer_is_ready(struct knote *kn)
1612 {
1613 	uint64_t now, deadline = kn->kn_ext[0];
1614 
1615 	if (deadline == 0) {
1616 		return true;
1617 	}
1618 
1619 	if (kn->kn_sfflags & NOTE_MACH_CONTINUOUS_TIME) {
1620 		now = mach_continuous_time();
1621 	} else {
1622 		now = mach_absolute_time();
1623 	}
1624 	return deadline <= now;
1625 }
1626 
1627 /*
1628  * Arm a timer
1629  *
1630  * It is the responsibility of the caller to make sure the timer call
1631  * has completed or been cancelled properly prior to arming it.
1632  */
static void
filt_timerarm(struct knote *kn)
{
	uint64_t deadline = kn->kn_ext[0];
	uint64_t leeway   = kn->kn_ext[1];
	uint32_t state;

	int filter_flags = kn->kn_sfflags;
	unsigned int timer_flags = 0;

	/* map the NOTE_* urgency flags to thread_call delay classes */
	if (filter_flags & NOTE_CRITICAL) {
		timer_flags |= THREAD_CALL_DELAY_USER_CRITICAL;
	} else if (filter_flags & NOTE_BACKGROUND) {
		timer_flags |= THREAD_CALL_DELAY_USER_BACKGROUND;
	} else {
		timer_flags |= THREAD_CALL_DELAY_USER_NORMAL;
	}

	if (filter_flags & NOTE_LEEWAY) {
		timer_flags |= THREAD_CALL_DELAY_LEEWAY;
	}

	if (filter_flags & NOTE_MACH_CONTINUOUS_TIME) {
		timer_flags |= THREAD_CALL_CONTINUOUS;
	}

	/*
	 * Move to ARMED.
	 *
	 * We increase the gencount, and setup the thread call with this expected
	 * state. It means that if there was a previous generation of the timer in
	 * flight that needs to be ignored, then 3 things are possible:
	 *
	 * - the timer fires first, filt_timerexpire() and sets the state to FIRED
	 *   but we clobber it with ARMED and a new gencount. The knote will still
	 *   be activated, but filt_timerprocess() which is serialized with this
	 *   call will not see the FIRED bit set and will not deliver an event.
	 *
	 * - this code runs first, but filt_timerexpire() comes second. Because it
	 *   knows an old gencount, it will debounce and not activate the knote.
	 *
	 * - filt_timerexpire() wasn't in flight yet, and thread_call_enter below
	 *   will just cancel it properly.
	 *
	 * This is important as userspace expects to never be woken up for past
	 * timers after filt_timertouch ran.
	 */
	state = os_atomic_load(&kn->kn_hook32, relaxed);
	state &= ~TIMER_STATE_MASK;
	state += TIMER_GEN_INC + TIMER_ARMED;
	os_atomic_store(&kn->kn_hook32, state, relaxed);

	/* pass the expected state so the callout can debounce stale firings */
	thread_call_enter_delayed_with_leeway(kn->kn_thcall,
	    (void *)(uintptr_t)state, deadline, leeway, timer_flags);
}
1688 
1689 /*
1690  * Mark a timer as "already fired" when it is being reprogrammed
1691  *
1692  * If there is a timer in flight, this will do a best effort at canceling it,
1693  * but will not wait. If the thread call was in flight, having set the
1694  * TIMER_IMMEDIATE bit will debounce a filt_timerexpire() racing with this
1695  * cancelation.
1696  */
static void
filt_timerfire_immediate(struct knote *kn)
{
	uint32_t state;

	/* ORing TIMER_IMMEDIATE into any state must land on IMMEDIATE */
	static_assert(TIMER_IMMEDIATE == TIMER_STATE_MASK,
	    "validate that this atomic or will transition to IMMEDIATE");
	state = os_atomic_or_orig(&kn->kn_hook32, TIMER_IMMEDIATE, relaxed);

	/* best-effort cancel of a pending callout; no need to wait for it */
	if ((state & TIMER_STATE_MASK) == TIMER_ARMED) {
		thread_call_cancel(kn->kn_thcall);
	}
}
1710 
1711 /*
1712  * Allocate a thread call for the knote's lifetime, and kick off the timer.
1713  */
static int
filt_timerattach(struct knote *kn, struct kevent_qos_s *kev)
{
	thread_call_t callout;
	struct filt_timer_params params;
	int error;

	/* convert/validate the user-specified deadline, leeway and interval */
	if ((error = filt_timervalidate(kev, &params)) != 0) {
		knote_set_error(kn, error);
		return 0;
	}

	callout = thread_call_allocate_with_options(filt_timerexpire,
	    (thread_call_param_t)kn, THREAD_CALL_PRIORITY_HIGH,
	    THREAD_CALL_OPTIONS_ONCE);

	if (NULL == callout) {
		knote_set_error(kn, ENOMEM);
		return 0;
	}

	filt_timer_set_params(kn, &params);
	kn->kn_thcall = callout;
	kn->kn_flags |= EV_CLEAR;
	os_atomic_store(&kn->kn_hook32, TIMER_IDLE, relaxed);

	/* NOTE_ABSOLUTE implies EV_ONESHOT */
	if (kn->kn_sfflags & NOTE_ABSOLUTE) {
		kn->kn_flags |= EV_ONESHOT;
	}

	if (filt_timer_is_ready(kn)) {
		/* already expired: deliver without ever arming the callout */
		os_atomic_store(&kn->kn_hook32, TIMER_IMMEDIATE, relaxed);
		return FILTER_ACTIVE;
	} else {
		filt_timerarm(kn);
		return 0;
	}
}
1753 
1754 /*
1755  * Shut down the timer if it's running, and free the callout.
1756  */
static void
filt_timerdetach(struct knote *kn)
{
	__assert_only boolean_t freed;

	/*
	 * Unconditionally cancel to make sure there can't be any filt_timerexpire()
	 * running anymore.
	 */
	thread_call_cancel_wait(kn->kn_thcall);
	/* with no callout in flight, the free must succeed immediately */
	freed = thread_call_free(kn->kn_thcall);
	assert(freed);
}
1770 
1771 /*
1772  * filt_timertouch - update timer knote with new user input
1773  *
1774  * Cancel and restart the timer based on new user data. When
1775  * the user picks up a knote, clear the count of how many timer
1776  * pops have gone off (in kn_data).
1777  */
static int
filt_timertouch(struct knote *kn, struct kevent_qos_s *kev)
{
	struct filt_timer_params params;
	uint32_t changed_flags = (kn->kn_sfflags ^ kev->fflags);
	int error;

	/* validate usage of FILTER_UPDATE_REQ_QOS: workloop kqueues require a
	 * pthread priority that encodes a valid thread QoS */
	if (kev->qos && (knote_get_kq(kn)->kq_state & KQ_WORKLOOP) &&
	    !_pthread_priority_thread_qos(kev->qos)) {
		kev->flags |= EV_ERROR;
		kev->data = ERANGE;
		return 0;
	}

	/* a timer cannot change between absolute and interval after creation */
	if (changed_flags & NOTE_ABSOLUTE) {
		kev->flags |= EV_ERROR;
		kev->data = EINVAL;
		return 0;
	}

	if ((error = filt_timervalidate(kev, &params)) != 0) {
		kev->flags |= EV_ERROR;
		kev->data = error;
		return 0;
	}

	/* capture the new values used to compute deadline */
	filt_timer_set_params(kn, &params);
	kn->kn_sfflags = kev->fflags;

	if (filt_timer_is_ready(kn)) {
		filt_timerfire_immediate(kn);
		return FILTER_ACTIVE | FILTER_UPDATE_REQ_QOS;
	} else {
		filt_timerarm(kn);
		return FILTER_UPDATE_REQ_QOS;
	}
}
1817 
/*
 * filt_timerprocess - query state of knote and snapshot event data
 *
 * Determine if the timer has fired in the past, snapshot the state
 * of the kevent for returning to user-space, and clear pending event
 * counters for the next time.
 *
 * Returns FILTER_ACTIVE when an event should be delivered, 0 when the
 * timer has not fired (or was reset by a touch before being processed).
 */
static int
filt_timerprocess(struct knote *kn, struct kevent_qos_s *kev)
{
	uint32_t state = os_atomic_load(&kn->kn_hook32, relaxed);

	/*
	 * filt_timerprocess is serialized with any filter routine except for
	 * filt_timerexpire which atomically does a TIMER_ARMED -> TIMER_FIRED
	 * transition, and on success, activates the knote.
	 *
	 * Hence, we don't need atomic modifications of the state, only to peek at
	 * whether we see any of the "FIRED" state, and if we do, it is safe to
	 * do simple state machine transitions.
	 */
	switch (state & TIMER_STATE_MASK) {
	case TIMER_IDLE:
	case TIMER_ARMED:
		/*
		 * This can happen if a touch resets a timer that had fired
		 * without being processed
		 */
		return 0;
	}

	/* Consume the FIRED state; only the non-state bits are kept. */
	os_atomic_store(&kn->kn_hook32, state & ~TIMER_STATE_MASK, relaxed);

	/*
	 * Copy out the interesting kevent state,
	 * but don't leak out the raw time calculations.
	 *
	 * TODO: potential enhancements - tell the user about:
	 *      - deadline to which this timer thought it was expiring
	 *      - return kn_sfflags in the fflags field so the client can know
	 *        under what flags the timer fired
	 */
	knote_fill_kevent(kn, kev, 1);
	kev->ext[0] = 0;
	/* kev->ext[1] = 0;  JMM - shouldn't we hide this too? */

	if (kn->kn_sdata != 0) {
		/*
		 * This is a 'repeating' timer, so we have to emit
		 * how many intervals expired between the arm
		 * and the process.
		 *
		 * A very strange style of interface, because
		 * this could easily be done in the client...
		 */

		uint64_t now;

		/* Use the same clock basis the timer was armed with. */
		if (kn->kn_sfflags & NOTE_MACH_CONTINUOUS_TIME) {
			now = mach_continuous_time();
		} else {
			now = mach_absolute_time();
		}

		uint64_t first_deadline = kn->kn_ext[0];
		uint64_t interval_abs   = kn->kn_sdata;
		uint64_t orig_arm_time  = first_deadline - interval_abs;

		assert(now > orig_arm_time);
		assert(now > first_deadline);

		uint64_t elapsed = now - orig_arm_time;

		uint64_t num_fired = elapsed / interval_abs;

		/*
		 * To reach this code, we must have seen the timer pop
		 * and be in repeating mode, so therefore it must have been
		 * more than 'interval' time since the attach or last
		 * successful touch.
		 */
		assert(num_fired > 0);

		/* report how many intervals have elapsed to the user */
		kev->data = (int64_t)num_fired;

		/* We only need to re-arm the timer if it's not about to be destroyed */
		if ((kn->kn_flags & EV_ONESHOT) == 0) {
			/* fire at the end of the next interval */
			uint64_t new_deadline = first_deadline + num_fired * interval_abs;

			assert(new_deadline > now);

			kn->kn_ext[0] = new_deadline;

			/*
			 * This can't shortcut setting up the thread call, because
			 * knote_process deactivates EV_CLEAR knotes unconditionnally.
			 */
			filt_timerarm(kn);
		}
	}

	return FILTER_ACTIVE;
}
1923 
/*
 * EVFILT_TIMER filter operations.
 * f_event is filt_bad_event: timer knotes are never posted through knote().
 */
SECURITY_READ_ONLY_EARLY(static struct filterops) timer_filtops = {
	.f_extended_codes = true,
	.f_attach   = filt_timerattach,
	.f_detach   = filt_timerdetach,
	.f_event    = filt_bad_event,
	.f_touch    = filt_timertouch,
	.f_process  = filt_timerprocess,
};
1932 
1933 #pragma mark user_filtops
1934 
1935 static int
filt_userattach(struct knote * kn,__unused struct kevent_qos_s * kev)1936 filt_userattach(struct knote *kn, __unused struct kevent_qos_s *kev)
1937 {
1938 	if (kn->kn_sfflags & NOTE_TRIGGER) {
1939 		kn->kn_hook32 = FILTER_ACTIVE;
1940 	} else {
1941 		kn->kn_hook32 = 0;
1942 	}
1943 	return kn->kn_hook32;
1944 }
1945 
1946 static int
filt_usertouch(struct knote * kn,struct kevent_qos_s * kev)1947 filt_usertouch(struct knote *kn, struct kevent_qos_s *kev)
1948 {
1949 	uint32_t ffctrl;
1950 	int fflags;
1951 
1952 	ffctrl = kev->fflags & NOTE_FFCTRLMASK;
1953 	fflags = kev->fflags & NOTE_FFLAGSMASK;
1954 	switch (ffctrl) {
1955 	case NOTE_FFNOP:
1956 		break;
1957 	case NOTE_FFAND:
1958 		kn->kn_sfflags &= fflags;
1959 		break;
1960 	case NOTE_FFOR:
1961 		kn->kn_sfflags |= fflags;
1962 		break;
1963 	case NOTE_FFCOPY:
1964 		kn->kn_sfflags = fflags;
1965 		break;
1966 	}
1967 	kn->kn_sdata = kev->data;
1968 
1969 	if (kev->fflags & NOTE_TRIGGER) {
1970 		kn->kn_hook32 = FILTER_ACTIVE;
1971 	}
1972 	return (int)kn->kn_hook32;
1973 }
1974 
1975 static int
filt_userprocess(struct knote * kn,struct kevent_qos_s * kev)1976 filt_userprocess(struct knote *kn, struct kevent_qos_s *kev)
1977 {
1978 	int result = (int)kn->kn_hook32;
1979 
1980 	if (result) {
1981 		/* EVFILT_USER returns the data that was passed in */
1982 		knote_fill_kevent_with_sdata(kn, kev);
1983 		kev->fflags = kn->kn_sfflags;
1984 		if (kn->kn_flags & EV_CLEAR) {
1985 			/* knote_fill_kevent cleared kn_fflags */
1986 			kn->kn_hook32 = 0;
1987 		}
1988 	}
1989 
1990 	return result;
1991 }
1992 
/*
 * EVFILT_USER filter operations.
 * No detach work is needed; f_event is filt_bad_event because user
 * knotes are only driven through touch/process.
 */
SECURITY_READ_ONLY_EARLY(static struct filterops) user_filtops = {
	.f_extended_codes = true,
	.f_attach  = filt_userattach,
	.f_detach  = filt_no_detach,
	.f_event   = filt_bad_event,
	.f_touch   = filt_usertouch,
	.f_process = filt_userprocess,
};
2001 
2002 #pragma mark workloop_filtops
2003 
2004 #define EPREEMPTDISABLED (-1)
2005 
/* Acquire the workloop state spinlock (kqwl_statelock). */
static inline void
filt_wllock(struct kqworkloop *kqwl)
{
	lck_spin_lock(&kqwl->kqwl_statelock);
}
2011 
/* Release the workloop state spinlock (kqwl_statelock). */
static inline void
filt_wlunlock(struct kqworkloop *kqwl)
{
	lck_spin_unlock(&kqwl->kqwl_statelock);
}
2017 
/*
 * Returns true when the interlock for the turnstile is the workqueue lock
 *
 * When this is the case, all turnstiles operations are delegated
 * to the workqueue subsystem.
 *
 * This is required because kqueue_threadreq_bind_prepost only holds the
 * workqueue lock but needs to move the inheritor from the workloop turnstile
 * away from the creator thread, so that this now fulfilled request cannot be
 * picked anymore by other threads.
 */
static inline bool
filt_wlturnstile_interlock_is_workq(struct kqworkloop *kqwl)
{
	/* a pending (not yet bound) thread request means the workq owns it */
	return kqr_thread_requested_pending(&kqwl->kqwl_request);
}
2034 
2035 static void
filt_wlupdate_inheritor(struct kqworkloop * kqwl,struct turnstile * ts,turnstile_update_flags_t flags)2036 filt_wlupdate_inheritor(struct kqworkloop *kqwl, struct turnstile *ts,
2037     turnstile_update_flags_t flags)
2038 {
2039 	turnstile_inheritor_t inheritor = TURNSTILE_INHERITOR_NULL;
2040 	workq_threadreq_t kqr = &kqwl->kqwl_request;
2041 
2042 	/*
2043 	 * binding to the workq should always happen through
2044 	 * workq_kern_threadreq_update_inheritor()
2045 	 */
2046 	assert(!filt_wlturnstile_interlock_is_workq(kqwl));
2047 
2048 	if ((inheritor = kqwl->kqwl_owner)) {
2049 		flags |= TURNSTILE_INHERITOR_THREAD;
2050 	} else if ((inheritor = kqr_thread(kqr))) {
2051 		flags |= TURNSTILE_INHERITOR_THREAD;
2052 	}
2053 
2054 	turnstile_update_inheritor(ts, inheritor, flags);
2055 }
2056 
2057 #define EVFILT_WORKLOOP_EFAULT_RETRY_COUNT 100
2058 #define FILT_WLATTACH 0
2059 #define FILT_WLTOUCH  1
2060 #define FILT_WLDROP   2
2061 
/*
 * filt_wlupdate - common state machine for workloop knote attach/touch/drop
 *
 * Optionally loads a userspace 64-bit word and debounces it against the
 * caller-supplied (value, mask) pair from kev->ext, possibly discovering a
 * new owner thread (NOTE_WL_DISCOVER_OWNER), then commits ownership and QoS
 * changes and wakes SYNC_WAIT waiters as needed.
 *
 * Returns 0 on success, an errno on failure, or EPREEMPTDISABLED when a
 * wakeup was issued on the attach/touch path and the caller must re-enable
 * preemption.
 */
__result_use_check
static int
filt_wlupdate(struct kqworkloop *kqwl, struct knote *kn,
    struct kevent_qos_s *kev, kq_index_t qos_index, int op)
{
	user_addr_t uaddr = CAST_USER_ADDR_T(kev->ext[EV_EXTIDX_WL_ADDR]);
	workq_threadreq_t kqr = &kqwl->kqwl_request;
	thread_t cur_owner, new_owner, extra_thread_ref = THREAD_NULL;
	kq_index_t cur_override = THREAD_QOS_UNSPECIFIED;
	int efault_retry = EVFILT_WORKLOOP_EFAULT_RETRY_COUNT;
	int action = KQWL_UTQ_NONE, error = 0;
	bool wl_inheritor_updated = false, needs_wake = false;
	uint64_t kdata = kev->ext[EV_EXTIDX_WL_VALUE];
	uint64_t mask = kev->ext[EV_EXTIDX_WL_MASK];
	uint64_t udata = 0;
	struct turnstile *ts = TURNSTILE_NULL;

	filt_wllock(kqwl);

again:
	new_owner = cur_owner = kqwl->kqwl_owner;

	/*
	 * Phase 1:
	 *
	 * If asked, load the uint64 value at the user provided address and compare
	 * it against the passed in mask and expected value.
	 *
	 * If NOTE_WL_DISCOVER_OWNER is specified, translate the loaded name as
	 * a thread reference.
	 *
	 * If NOTE_WL_END_OWNERSHIP is specified and the currently known owner is
	 * the current thread, then end ownership.
	 *
	 * Lastly decide whether we need to perform a QoS update.
	 */
	if (uaddr) {
		/*
		 * Until <rdar://problem/24999882> exists,
		 * disabling preemption copyin forces any
		 * vm_fault we encounter to fail.
		 */
		error = copyin_atomic64(uaddr, &udata);

		/*
		 * If we get EFAULT, drop locks, and retry.
		 * If we still get an error report it,
		 * else assume the memory has been faulted
		 * and attempt to copyin under lock again.
		 */
		switch (error) {
		case 0:
			break;
		case EFAULT:
			if (efault_retry-- > 0) {
				filt_wlunlock(kqwl);
				error = copyin_atomic64(uaddr, &udata);
				filt_wllock(kqwl);
				if (error == 0) {
					/* owner may have changed while unlocked: restart */
					goto again;
				}
			}
			OS_FALLTHROUGH;
		default:
			goto out;
		}

		/* Update state as copied in.  */
		kev->ext[EV_EXTIDX_WL_VALUE] = udata;

		if ((udata & mask) != (kdata & mask)) {
			/* userspace state changed under us: debounce failed */
			error = ESTALE;
		} else if (kev->fflags & NOTE_WL_DISCOVER_OWNER) {
			/*
			 * Decipher the owner port name, and translate accordingly.
			 * The low 2 bits were borrowed for other flags, so mask them off.
			 *
			 * Then attempt translation to a thread reference or fail.
			 */
			mach_port_name_t name = (mach_port_name_t)udata & ~0x3;
			if (name != MACH_PORT_NULL) {
				name = ipc_entry_name_mask(name);
				extra_thread_ref = port_name_to_thread(name,
				    PORT_INTRANS_THREAD_IN_CURRENT_TASK);
				if (extra_thread_ref == THREAD_NULL) {
					error = EOWNERDEAD;
					goto out;
				}
				new_owner = extra_thread_ref;
			}
		}
	}

	if ((kev->fflags & NOTE_WL_END_OWNERSHIP) && new_owner == current_thread()) {
		new_owner = THREAD_NULL;
	}

	if (error == 0) {
		if ((kev->fflags & NOTE_WL_THREAD_REQUEST) && (kev->flags & EV_DELETE)) {
			action = KQWL_UTQ_SET_QOS_INDEX;
		} else if (qos_index && kqr->tr_kq_qos_index != qos_index) {
			action = KQWL_UTQ_SET_QOS_INDEX;
		}

		if (op == FILT_WLTOUCH) {
			/*
			 * Save off any additional fflags/data we just accepted
			 * But only keep the last round of "update" bits we acted on which helps
			 * debugging a lot.
			 */
			kn->kn_sfflags &= ~NOTE_WL_UPDATES_MASK;
			kn->kn_sfflags |= kev->fflags;
			if (kev->fflags & NOTE_WL_SYNC_WAKE) {
				needs_wake = (kn->kn_thread != THREAD_NULL);
			}
		} else if (op == FILT_WLDROP) {
			if ((kn->kn_sfflags & (NOTE_WL_SYNC_WAIT | NOTE_WL_SYNC_WAKE)) ==
			    NOTE_WL_SYNC_WAIT) {
				/*
				 * When deleting a SYNC_WAIT knote that hasn't been woken up
				 * explicitly, issue a wake up.
				 */
				kn->kn_sfflags |= NOTE_WL_SYNC_WAKE;
				needs_wake = (kn->kn_thread != THREAD_NULL);
			}
		}
	}

	/*
	 * Phase 2:
	 *
	 * Commit ownership and QoS changes if any, possibly wake up waiters
	 */

	if (cur_owner == new_owner && action == KQWL_UTQ_NONE && !needs_wake) {
		goto out;
	}

	kqlock(kqwl);

	/* If already tracked as servicer, don't track as owner */
	if (new_owner == kqr_thread(kqr)) {
		new_owner = THREAD_NULL;
	}

	if (cur_owner != new_owner) {
		kqwl->kqwl_owner = new_owner;
		if (new_owner == extra_thread_ref) {
			/* we just transfered this ref to kqwl_owner */
			extra_thread_ref = THREAD_NULL;
		}
		cur_override = kqworkloop_override(kqwl);

		if (new_owner) {
			/* override it before we drop the old */
			if (cur_override != THREAD_QOS_UNSPECIFIED) {
				thread_add_kevent_override(new_owner, cur_override);
			}
			if (kqr_thread_requested_pending(kqr)) {
				if (action == KQWL_UTQ_NONE) {
					action = KQWL_UTQ_REDRIVE_EVENTS;
				}
			}
		} else if (action == KQWL_UTQ_NONE &&
		    !kqr_thread_requested(kqr) &&
		    kqwl->kqwl_wakeup_qos) {
			action = KQWL_UTQ_REDRIVE_EVENTS;
		}
	}

	if (action != KQWL_UTQ_NONE) {
		kqworkloop_update_threads_qos(kqwl, action, qos_index);
	}

	ts = kqwl->kqwl_turnstile;
	if (cur_owner != new_owner && ts) {
		if (action == KQWL_UTQ_REDRIVE_EVENTS) {
			/*
			 * Note that when action is KQWL_UTQ_REDRIVE_EVENTS,
			 * the code went through workq_kern_threadreq_initiate()
			 * and the workqueue has set the inheritor already
			 */
			assert(filt_wlturnstile_interlock_is_workq(kqwl));
		} else if (filt_wlturnstile_interlock_is_workq(kqwl)) {
			workq_kern_threadreq_lock(kqwl->kqwl_p);
			workq_kern_threadreq_update_inheritor(kqwl->kqwl_p, kqr, new_owner,
			    ts, TURNSTILE_IMMEDIATE_UPDATE);
			workq_kern_threadreq_unlock(kqwl->kqwl_p);
			if (!filt_wlturnstile_interlock_is_workq(kqwl)) {
				/*
				 * If the workq is no longer the interlock, then
				 * workq_kern_threadreq_update_inheritor() has finished a bind
				 * and we need to fallback to the regular path.
				 */
				filt_wlupdate_inheritor(kqwl, ts, TURNSTILE_IMMEDIATE_UPDATE);
			}
			wl_inheritor_updated = true;
		} else {
			filt_wlupdate_inheritor(kqwl, ts, TURNSTILE_IMMEDIATE_UPDATE);
			wl_inheritor_updated = true;
		}

		/*
		 * We need a turnstile reference because we are dropping the interlock
		 * and the caller has not called turnstile_prepare.
		 */
		if (wl_inheritor_updated) {
			turnstile_reference(ts);
		}
	}

	if (needs_wake && ts) {
		waitq_wakeup64_thread(&ts->ts_waitq, knote_filt_wev64(kn),
		    kn->kn_thread, THREAD_AWAKENED);
		if (op == FILT_WLATTACH || op == FILT_WLTOUCH) {
			disable_preemption();
			error = EPREEMPTDISABLED;
		}
	}

	kqunlock(kqwl);

out:
	/*
	 * Phase 3:
	 *
	 * Unlock and cleanup various lingering references and things.
	 */
	filt_wlunlock(kqwl);

#if CONFIG_WORKLOOP_DEBUG
	KQWL_HISTORY_WRITE_ENTRY(kqwl, {
		.updater = current_thread(),
		.servicer = kqr_thread(kqr), /* Note: racy */
		.old_owner = cur_owner,
		.new_owner = new_owner,

		.kev_ident  = kev->ident,
		.error      = (int16_t)error,
		.kev_flags  = kev->flags,
		.kev_fflags = kev->fflags,

		.kev_mask   = mask,
		.kev_value  = kdata,
		.in_value   = udata,
	});
#endif // CONFIG_WORKLOOP_DEBUG

	if (wl_inheritor_updated) {
		turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_NOT_HELD);
		turnstile_deallocate(ts);
	}

	if (cur_owner && new_owner != cur_owner) {
		if (cur_override != THREAD_QOS_UNSPECIFIED) {
			thread_drop_kevent_override(cur_owner);
		}
		thread_deallocate_safe(cur_owner);
	}
	if (extra_thread_ref) {
		thread_deallocate_safe(extra_thread_ref);
	}
	return error;
}
2326 
/*
 * Remembers the last updated that came in from userspace for debugging reasons.
 * - fflags is mirrored from the userspace kevent
 * - ext[i, i != VALUE] is mirrored from the userspace kevent
 * - ext[VALUE] is set to what the kernel loaded atomically
 * - data is set to the error if any
 */
static inline void
filt_wlremember_last_update(struct knote *kn, struct kevent_qos_s *kev,
    int error)
{
	kn->kn_fflags = kev->fflags;
	/* debugging breadcrumb: last error is stashed in kn_sdata */
	kn->kn_sdata = error;
	memcpy(kn->kn_ext, kev->ext, sizeof(kev->ext));
}
2342 
/*
 * filt_wlupdate_sync_ipc - attach/touch/drop path for NOTE_WL_SYNC_IPC knotes
 *
 * Debounces the userspace 64-bit word at kev->ext[EV_EXTIDX_WL_ADDR] against
 * the (value, mask) pair, and on FILT_WLATTACH performs the sync IPC attach.
 * Returns 0, an errno, or EPREEMPTDISABLED when the caller must re-enable
 * preemption after a successful attach.
 */
static int
filt_wlupdate_sync_ipc(struct kqworkloop *kqwl, struct knote *kn,
    struct kevent_qos_s *kev, int op)
{
	user_addr_t uaddr = (user_addr_t) kev->ext[EV_EXTIDX_WL_ADDR];
	uint64_t kdata = kev->ext[EV_EXTIDX_WL_VALUE];
	uint64_t mask  = kev->ext[EV_EXTIDX_WL_MASK];
	uint64_t udata = 0;
	int efault_retry = EVFILT_WORKLOOP_EFAULT_RETRY_COUNT;
	int error = 0;

	if (op == FILT_WLATTACH) {
		/* make sure the workloop has a turnstile before we need one */
		(void)kqueue_alloc_turnstile(&kqwl->kqwl_kqueue);
	} else if (uaddr == 0) {
		/* nothing to debounce and nothing to attach: trivially done */
		return 0;
	}

	filt_wllock(kqwl);

again:

	/*
	 * Do the debounce thing, the lock serializing the state is the knote lock.
	 */
	if (uaddr) {
		/*
		 * Until <rdar://problem/24999882> exists,
		 * disabling preemption copyin forces any
		 * vm_fault we encounter to fail.
		 */
		error = copyin_atomic64(uaddr, &udata);

		/*
		 * If we get EFAULT, drop locks, and retry.
		 * If we still get an error report it,
		 * else assume the memory has been faulted
		 * and attempt to copyin under lock again.
		 */
		switch (error) {
		case 0:
			break;
		case EFAULT:
			if (efault_retry-- > 0) {
				filt_wlunlock(kqwl);
				error = copyin_atomic64(uaddr, &udata);
				filt_wllock(kqwl);
				if (error == 0) {
					goto again;
				}
			}
			OS_FALLTHROUGH;
		default:
			goto out;
		}

		kev->ext[EV_EXTIDX_WL_VALUE] = udata;
		kn->kn_ext[EV_EXTIDX_WL_VALUE] = udata;

		if ((udata & mask) != (kdata & mask)) {
			/* userspace state changed under us: debounce failed */
			error = ESTALE;
			goto out;
		}
	}

	if (op == FILT_WLATTACH) {
		error = filt_wlattach_sync_ipc(kn);
		if (error == 0) {
			disable_preemption();
			error = EPREEMPTDISABLED;
		}
	}

out:
	filt_wlunlock(kqwl);
	return error;
}
2419 
/*
 * filt_wlattach - EVFILT_WORKLOOP attach
 *
 * Validates the command (THREAD_REQUEST, SYNC_WAIT/WAKE, SYNC_IPC) and its
 * flags, then runs the shared update state machine.  On failure the error is
 * recorded on the knote via knote_set_error() rather than returned.
 */
static int
filt_wlattach(struct knote *kn, struct kevent_qos_s *kev)
{
	struct kqueue *kq = knote_get_kq(kn);
	struct kqworkloop *kqwl = (struct kqworkloop *)kq;
	int error = 0, result = 0;
	kq_index_t qos_index = 0;

	/* workloop knotes only make sense on a workloop kqueue */
	if (__improbable((kq->kq_state & KQ_WORKLOOP) == 0)) {
		error = ENOTSUP;
		goto out;
	}

	uint32_t command = (kn->kn_sfflags & NOTE_WL_COMMANDS_MASK);
	switch (command) {
	case NOTE_WL_THREAD_REQUEST:
		/* thread requests must use the workloop's own identity */
		if (kn->kn_id != kqwl->kqwl_dynamicid) {
			error = EINVAL;
			goto out;
		}
		qos_index = _pthread_priority_thread_qos(kn->kn_qos);
		if (qos_index == THREAD_QOS_UNSPECIFIED) {
			error = ERANGE;
			goto out;
		}
		if (kqwl->kqwl_request.tr_kq_qos_index) {
			/*
			 * There already is a thread request, and well, you're only allowed
			 * one per workloop, so fail the attach.
			 */
			error = EALREADY;
			goto out;
		}
		break;
	case NOTE_WL_SYNC_WAIT:
	case NOTE_WL_SYNC_WAKE:
		/* sync knotes must NOT use the workloop's own identity */
		if (kn->kn_id == kqwl->kqwl_dynamicid) {
			error = EINVAL;
			goto out;
		}
		if ((kn->kn_flags & EV_DISABLE) == 0) {
			error = EINVAL;
			goto out;
		}
		if (kn->kn_sfflags & NOTE_WL_END_OWNERSHIP) {
			error = EINVAL;
			goto out;
		}
		break;

	case NOTE_WL_SYNC_IPC:
		if ((kn->kn_flags & EV_DISABLE) == 0) {
			error = EINVAL;
			goto out;
		}
		if (kn->kn_sfflags & (NOTE_WL_UPDATE_QOS | NOTE_WL_DISCOVER_OWNER)) {
			error = EINVAL;
			goto out;
		}
		break;
	default:
		error = EINVAL;
		goto out;
	}

	if (command == NOTE_WL_SYNC_IPC) {
		error = filt_wlupdate_sync_ipc(kqwl, kn, kev, FILT_WLATTACH);
	} else {
		error = filt_wlupdate(kqwl, kn, kev, qos_index, FILT_WLATTACH);
	}

	if (error == EPREEMPTDISABLED) {
		/* preemption stays disabled until the caller re-enables it */
		error = 0;
		result = FILTER_THREADREQ_NODEFEER;
	}
out:
	if (error) {
		/* If userland wants ESTALE to be hidden, fail the attach anyway */
		if (error == ESTALE && (kn->kn_sfflags & NOTE_WL_IGNORE_ESTALE)) {
			error = 0;
		}
		knote_set_error(kn, error);
		return result;
	}
	if (command == NOTE_WL_SYNC_WAIT) {
		return kevent_register_wait_prepare(kn, kev, result);
	}
	/* Just attaching the thread request successfully will fire it */
	if (command == NOTE_WL_THREAD_REQUEST) {
		/*
		 * Thread Request knotes need an explicit touch to be active again,
		 * so delivering an event needs to also consume it.
		 */
		kn->kn_flags |= EV_CLEAR;
		return result | FILTER_ACTIVE;
	}
	return result;
}
2518 
/*
 * Continuation run after a NOTE_WL_SYNC_WAIT knote's wait ends: tears down
 * the turnstile (via the correct interlock) and returns the wait result
 * to the kevent register path.  Never returns (__dead2).
 */
static void __dead2
filt_wlwait_continue(void *parameter, wait_result_t wr)
{
	struct _kevent_register *cont_args = parameter;
	struct kqworkloop *kqwl = cont_args->kqwl;

	kqlock(kqwl);
	if (filt_wlturnstile_interlock_is_workq(kqwl)) {
		/* the workqueue owns the interlock: complete under its lock */
		workq_kern_threadreq_lock(kqwl->kqwl_p);
		turnstile_complete((uintptr_t)kqwl, &kqwl->kqwl_turnstile, NULL, TURNSTILE_WORKLOOPS);
		workq_kern_threadreq_unlock(kqwl->kqwl_p);
	} else {
		turnstile_complete((uintptr_t)kqwl, &kqwl->kqwl_turnstile, NULL, TURNSTILE_WORKLOOPS);
	}
	kqunlock(kqwl);

	turnstile_cleanup();

	if (wr == THREAD_INTERRUPTED) {
		cont_args->kev.flags |= EV_ERROR;
		cont_args->kev.data = EINTR;
	} else if (wr != THREAD_AWAKENED) {
		panic("Unexpected wait result: %d", wr);
	}

	kevent_register_wait_return(cont_args);
}
2546 
/*
 * Called with the workloop mutex held, most of the time never returns as it
 * calls filt_wlwait_continue through a continuation.
 *
 * Prepares the workloop turnstile, pushes on the owner (or servicer) thread,
 * and blocks the registering thread until a SYNC_WAKE arrives.
 */
static void __dead2
filt_wlpost_register_wait(struct uthread *uth, struct knote *kn,
    struct _kevent_register *cont_args)
{
	struct kqworkloop *kqwl = cont_args->kqwl;
	workq_threadreq_t kqr = &kqwl->kqwl_request;
	struct turnstile *ts;
	bool workq_locked = false;

	kqlock_held(kqwl);

	if (filt_wlturnstile_interlock_is_workq(kqwl)) {
		workq_kern_threadreq_lock(kqwl->kqwl_p);
		workq_locked = true;
	}

	ts = turnstile_prepare((uintptr_t)kqwl, &kqwl->kqwl_turnstile,
	    TURNSTILE_NULL, TURNSTILE_WORKLOOPS);

	if (workq_locked) {
		workq_kern_threadreq_update_inheritor(kqwl->kqwl_p,
		    &kqwl->kqwl_request, kqwl->kqwl_owner, ts,
		    TURNSTILE_DELAYED_UPDATE);
		if (!filt_wlturnstile_interlock_is_workq(kqwl)) {
			/*
			 * if the interlock is no longer the workqueue lock,
			 * then we don't need to hold it anymore.
			 */
			workq_kern_threadreq_unlock(kqwl->kqwl_p);
			workq_locked = false;
		}
	}
	if (!workq_locked) {
		/*
		 * If the interlock is the workloop's, then it's our responsibility to
		 * call update_inheritor, so just do it.
		 */
		filt_wlupdate_inheritor(kqwl, ts, TURNSTILE_DELAYED_UPDATE);
	}

	thread_set_pending_block_hint(get_machthread(uth), kThreadWaitWorkloopSyncWait);
	waitq_assert_wait64(&ts->ts_waitq, knote_filt_wev64(kn),
	    THREAD_ABORTSAFE, TIMEOUT_WAIT_FOREVER);

	if (workq_locked) {
		workq_kern_threadreq_unlock(kqwl->kqwl_p);
	}

	/* keep a ref on whoever we are pushing on while blocked */
	thread_t thread = kqwl->kqwl_owner ?: kqr_thread(kqr);
	if (thread) {
		thread_reference(thread);
	}

	kevent_register_wait_block(ts, thread, filt_wlwait_continue, cont_args);
}
2606 
/* called in stackshot context to report the thread responsible for blocking this thread */
void
kdp_workloop_sync_wait_find_owner(__assert_only thread_t thread,
    event64_t event, thread_waitinfo_t *waitinfo)
{
	struct knote *kn = (struct knote *)event;

	/* stackshot context: validate the pointer before trusting it */
	zone_require(knote_zone, kn);

	assert(kn->kn_thread == thread);

	struct kqueue *kq = knote_get_kq(kn);

	zone_require(kqworkloop_zone, kq);
	assert(kq->kq_state & KQ_WORKLOOP);

	struct kqworkloop *kqwl = (struct kqworkloop *)kq;
	workq_threadreq_t kqr = &kqwl->kqwl_request;

	thread_t kqwl_owner = kqwl->kqwl_owner;

	/* prefer the owner, then a bound/binding servicer, then "requested" */
	if (kqwl_owner != THREAD_NULL) {
		thread_require(kqwl_owner);
		waitinfo->owner = thread_tid(kqwl->kqwl_owner);
	} else if ((kqr->tr_state >= WORKQ_TR_STATE_BINDING) && (kqr->tr_thread != NULL)) {
		thread_require(kqr->tr_thread);
		waitinfo->owner = thread_tid(kqr->tr_thread);
	} else if (kqr_thread_requested_pending(kqr)) { /* > idle, < bound */
		waitinfo->owner = STACKSHOT_WAITOWNER_THREQUESTED;
	} else {
		waitinfo->owner = 0;
	}

	waitinfo->context = kqwl->kqwl_dynamicid;
}
2642 
2643 static void
filt_wldetach(struct knote * kn)2644 filt_wldetach(struct knote *kn)
2645 {
2646 	if (kn->kn_sfflags & NOTE_WL_SYNC_IPC) {
2647 		filt_wldetach_sync_ipc(kn);
2648 	} else if (kn->kn_thread) {
2649 		kevent_register_wait_cleanup(kn);
2650 	}
2651 }
2652 
2653 static int
filt_wlvalidate_kev_flags(struct knote * kn,struct kevent_qos_s * kev,thread_qos_t * qos_index)2654 filt_wlvalidate_kev_flags(struct knote *kn, struct kevent_qos_s *kev,
2655     thread_qos_t *qos_index)
2656 {
2657 	uint32_t new_commands = kev->fflags & NOTE_WL_COMMANDS_MASK;
2658 	uint32_t sav_commands = kn->kn_sfflags & NOTE_WL_COMMANDS_MASK;
2659 
2660 	if ((kev->fflags & NOTE_WL_DISCOVER_OWNER) && (kev->flags & EV_DELETE)) {
2661 		return EINVAL;
2662 	}
2663 	if (kev->fflags & NOTE_WL_UPDATE_QOS) {
2664 		if (kev->flags & EV_DELETE) {
2665 			return EINVAL;
2666 		}
2667 		if (sav_commands != NOTE_WL_THREAD_REQUEST) {
2668 			return EINVAL;
2669 		}
2670 		if (!(*qos_index = _pthread_priority_thread_qos(kev->qos))) {
2671 			return ERANGE;
2672 		}
2673 	}
2674 
2675 	switch (new_commands) {
2676 	case NOTE_WL_THREAD_REQUEST:
2677 		/* thread requests can only update themselves */
2678 		if (sav_commands != NOTE_WL_THREAD_REQUEST) {
2679 			return EINVAL;
2680 		}
2681 		break;
2682 
2683 	case NOTE_WL_SYNC_WAIT:
2684 		if (kev->fflags & NOTE_WL_END_OWNERSHIP) {
2685 			return EINVAL;
2686 		}
2687 		goto sync_checks;
2688 
2689 	case NOTE_WL_SYNC_WAKE:
2690 sync_checks:
2691 		if (!(sav_commands & (NOTE_WL_SYNC_WAIT | NOTE_WL_SYNC_WAKE))) {
2692 			return EINVAL;
2693 		}
2694 		if ((kev->flags & (EV_ENABLE | EV_DELETE)) == EV_ENABLE) {
2695 			return EINVAL;
2696 		}
2697 		break;
2698 
2699 	case NOTE_WL_SYNC_IPC:
2700 		if (sav_commands != NOTE_WL_SYNC_IPC) {
2701 			return EINVAL;
2702 		}
2703 		if ((kev->flags & (EV_ENABLE | EV_DELETE)) == EV_ENABLE) {
2704 			return EINVAL;
2705 		}
2706 		break;
2707 
2708 	default:
2709 		return EINVAL;
2710 	}
2711 	return 0;
2712 }
2713 
/*
 * filt_wltouch - EVFILT_WORKLOOP touch
 *
 * Validates the new kevent, runs the shared update state machine, and for
 * SYNC_WAIT knotes that haven't been woken yet, parks the caller.
 */
static int
filt_wltouch(struct knote *kn, struct kevent_qos_s *kev)
{
	struct kqworkloop *kqwl = (struct kqworkloop *)knote_get_kq(kn);
	thread_qos_t qos_index = THREAD_QOS_UNSPECIFIED;
	int result = 0;

	int error = filt_wlvalidate_kev_flags(kn, kev, &qos_index);
	if (error) {
		goto out;
	}

	uint32_t command = kev->fflags & NOTE_WL_COMMANDS_MASK;
	if (command == NOTE_WL_SYNC_IPC) {
		error = filt_wlupdate_sync_ipc(kqwl, kn, kev, FILT_WLTOUCH);
	} else {
		error = filt_wlupdate(kqwl, kn, kev, qos_index, FILT_WLTOUCH);
		filt_wlremember_last_update(kn, kev, error);
	}
	if (error == EPREEMPTDISABLED) {
		/* preemption stays disabled until the caller re-enables it */
		error = 0;
		result = FILTER_THREADREQ_NODEFEER;
	}

out:
	if (error) {
		if (error == ESTALE && (kev->fflags & NOTE_WL_IGNORE_ESTALE)) {
			/* If userland wants ESTALE to be hidden, do not activate */
			return result;
		}
		kev->flags |= EV_ERROR;
		kev->data = error;
		return result;
	}
	if (command == NOTE_WL_SYNC_WAIT && !(kn->kn_sfflags & NOTE_WL_SYNC_WAKE)) {
		return kevent_register_wait_prepare(kn, kev, result);
	}
	/* Just touching the thread request successfully will fire it */
	if (command == NOTE_WL_THREAD_REQUEST) {
		if (kev->fflags & NOTE_WL_UPDATE_QOS) {
			result |= FILTER_UPDATE_REQ_QOS;
		}
		result |= FILTER_ACTIVE;
	}
	return result;
}
2760 
/*
 * filt_wlallow_drop - EVFILT_WORKLOOP drop gate
 *
 * Runs the same validation/update machinery as touch for an EV_DELETE;
 * returns false (refusing the drop) when the update fails, unless the
 * failure is an ESTALE that userland asked to ignore.
 */
static bool
filt_wlallow_drop(struct knote *kn, struct kevent_qos_s *kev)
{
	struct kqworkloop *kqwl = (struct kqworkloop *)knote_get_kq(kn);

	int error = filt_wlvalidate_kev_flags(kn, kev, NULL);
	if (error) {
		goto out;
	}

	uint32_t command = (kev->fflags & NOTE_WL_COMMANDS_MASK);
	if (command == NOTE_WL_SYNC_IPC) {
		error = filt_wlupdate_sync_ipc(kqwl, kn, kev, FILT_WLDROP);
	} else {
		error = filt_wlupdate(kqwl, kn, kev, 0, FILT_WLDROP);
		filt_wlremember_last_update(kn, kev, error);
	}
	/* the drop path never issues a preemption-disabled wakeup */
	assert(error != EPREEMPTDISABLED);

out:
	if (error) {
		if (error == ESTALE && (kev->fflags & NOTE_WL_IGNORE_ESTALE)) {
			return false;
		}
		kev->flags |= EV_ERROR;
		kev->data = error;
		return false;
	}
	return true;
}
2791 
/*
 * filt_wlprocess - EVFILT_WORKLOOP process (thread request knotes only)
 *
 * If the workloop currently has an owner, the event is deferred by
 * re-activating the knote; otherwise the kevent is filled in and delivered.
 */
static int
filt_wlprocess(struct knote *kn, struct kevent_qos_s *kev)
{
	struct kqworkloop *kqwl = (struct kqworkloop *)knote_get_kq(kn);
	int rc = 0;

	assert(kn->kn_sfflags & NOTE_WL_THREAD_REQUEST);

	kqlock(kqwl);

	if (kqwl->kqwl_owner) {
		/*
		 * <rdar://problem/33584321> userspace sometimes due to events being
		 * delivered but not triggering a drain session can cause a process
		 * of the thread request knote.
		 *
		 * When that happens, the automatic deactivation due to process
		 * would swallow the event, so we have to activate the knote again.
		 */
		knote_activate(kqwl, kn, FILTER_ACTIVE);
	} else {
#if DEBUG || DEVELOPMENT
		if (kevent_debug_flags & KEVENT_PANIC_ON_NON_ENQUEUED_PROCESS) {
			/*
			 * see src/queue_internal.h in libdispatch
			 */
#define DISPATCH_QUEUE_ENQUEUED 0x1ull
			user_addr_t addr = CAST_USER_ADDR_T(kn->kn_ext[EV_EXTIDX_WL_ADDR]);
			task_t t = current_task();
			uint64_t val;
			if (addr && task_is_active(t) && !task_is_halting(t) &&
			    copyin_atomic64(addr, &val) == 0 &&
			    val && (val & DISPATCH_QUEUE_ENQUEUED) == 0 &&
			    (val >> 48) != 0xdead && (val >> 48) != 0 && (val >> 48) != 0xffff) {
				panic("kevent: workloop %#016llx is not enqueued "
				    "(kn:%p dq_state:%#016llx kev.dq_state:%#016llx)",
				    kn->kn_udata, kn, val, kn->kn_ext[EV_EXTIDX_WL_VALUE]);
			}
		}
#endif
		knote_fill_kevent(kn, kev, 0);
		kev->fflags = kn->kn_sfflags;
		rc |= FILTER_ACTIVE;
	}

	kqunlock(kqwl);

	if (rc & FILTER_ACTIVE) {
		workq_thread_set_max_qos(kqwl->kqwl_p, &kqwl->kqwl_request);
	}
	return rc;
}
2844 
/*
 * Filter operations for EVFILT_WORKLOOP.
 * f_event is filt_bad_event because workloop knotes are never posted
 * through knote(); all state changes flow through touch/process.
 */
SECURITY_READ_ONLY_EARLY(static struct filterops) workloop_filtops = {
	.f_extended_codes = true,
	.f_attach  = filt_wlattach,
	.f_detach  = filt_wldetach,
	.f_event   = filt_bad_event,
	.f_touch   = filt_wltouch,
	.f_process = filt_wlprocess,
	.f_allow_drop = filt_wlallow_drop,
	.f_post_register_wait = filt_wlpost_register_wait,
};
2855 
2856 #pragma mark - kqueues allocation and deallocation
2857 
2858 OS_NOINLINE
2859 static void
2860 kqworkloop_dealloc(struct kqworkloop *, bool hash_remove);
2861 
2862 static inline bool
kqworkloop_try_retain(struct kqworkloop * kqwl)2863 kqworkloop_try_retain(struct kqworkloop *kqwl)
2864 {
2865 	return os_ref_retain_try_raw(&kqwl->kqwl_retains, NULL);
2866 }
2867 
2868 static inline void
kqworkloop_retain(struct kqworkloop * kqwl)2869 kqworkloop_retain(struct kqworkloop *kqwl)
2870 {
2871 	return os_ref_retain_raw(&kqwl->kqwl_retains, NULL);
2872 }
2873 
2874 OS_ALWAYS_INLINE
2875 static inline void
kqueue_retain(kqueue_t kqu)2876 kqueue_retain(kqueue_t kqu)
2877 {
2878 	if (kqu.kq->kq_state & KQ_DYNAMIC) {
2879 		kqworkloop_retain(kqu.kqwl);
2880 	}
2881 }
2882 
/*
 * Release a reference that is known not to be the last one
 * (asserted by os_ref_release_live_raw).
 */
OS_ALWAYS_INLINE
static inline void
kqworkloop_release_live(struct kqworkloop *kqwl)
{
	os_ref_release_live_raw(&kqwl->kqwl_retains, NULL);
}
2889 
2890 OS_ALWAYS_INLINE
2891 static inline void
kqueue_release_live(kqueue_t kqu)2892 kqueue_release_live(kqueue_t kqu)
2893 {
2894 	if (kqu.kq->kq_state & KQ_DYNAMIC) {
2895 		kqworkloop_release_live(kqu.kqwl);
2896 	}
2897 }
2898 
2899 OS_ALWAYS_INLINE
2900 static inline void
kqworkloop_release(struct kqworkloop * kqwl)2901 kqworkloop_release(struct kqworkloop *kqwl)
2902 {
2903 	if (os_ref_release_raw(&kqwl->kqwl_retains, NULL) == 0) {
2904 		kqworkloop_dealloc(kqwl, true);
2905 	}
2906 }
2907 
2908 OS_ALWAYS_INLINE
2909 static inline void
kqueue_release(kqueue_t kqu)2910 kqueue_release(kqueue_t kqu)
2911 {
2912 	if (kqu.kq->kq_state & KQ_DYNAMIC) {
2913 		kqworkloop_release(kqu.kqwl);
2914 	}
2915 }
2916 
2917 /*!
2918  * @function kqueue_destroy
2919  *
2920  * @brief
2921  * Common part to all kqueue dealloc functions.
2922  */
2923 OS_NOINLINE
2924 static void
kqueue_destroy(kqueue_t kqu,zone_t zone)2925 kqueue_destroy(kqueue_t kqu, zone_t zone)
2926 {
2927 	lck_spin_destroy(&kqu.kq->kq_lock, &kq_lck_grp);
2928 
2929 	zfree(zone, kqu.kq);
2930 }
2931 
2932 /*!
2933  * @function kqueue_init
2934  *
2935  * @brief
2936  * Common part to all kqueue alloc functions.
2937  */
2938 static kqueue_t
kqueue_init(kqueue_t kqu)2939 kqueue_init(kqueue_t kqu)
2940 {
2941 	lck_spin_init(&kqu.kq->kq_lock, &kq_lck_grp, LCK_ATTR_NULL);
2942 	return kqu;
2943 }
2944 
2945 #pragma mark kqfile allocation and deallocation
2946 
/*!
 * @function kqueue_dealloc
 *
 * @brief
 * Detach all knotes from a kqfile and free it.
 *
 * @discussion
 * We walk each list looking for knotes referencing this
 * kqueue.  If we find one, we try to drop it.  But
 * if we fail to get a drop reference, that will wait
 * until it is dropped.  So, we can just restart again
 * safe in the assumption that the list will eventually
 * not contain any more references to this kqueue (either
 * we dropped them all, or someone else did).
 *
 * Assumes no new events are being added to the kqueue.
 * Nothing locked on entry or exit.
 */
void
kqueue_dealloc(struct kqueue *kq)
{
	KNOTE_LOCK_CTX(knlc);
	struct proc *p = kq->kq_p;
	struct filedesc *fdp = &p->p_fd;
	struct knote *kn;

	/* only plain kqfiles come through here, never kqworkq/kqworkloop */
	assert(kq && (kq->kq_state & (KQ_WORKLOOP | KQ_WORKQ)) == 0);

	proc_fdlock(p);
	for (int i = 0; i < fdp->fd_knlistsize; i++) {
		kn = SLIST_FIRST(&fdp->fd_knlist[i]);
		while (kn != NULL) {
			if (kq == knote_get_kq(kn)) {
				/*
				 * Drop the fdlock while dropping the knote: knote_lock
				 * may block, and knote_drop re-takes the fdlock itself.
				 */
				kqlock(kq);
				proc_fdunlock(p);
				if (knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ON_SUCCESS)) {
					knote_drop(kq, kn, &knlc);
				}
				proc_fdlock(p);
				/* start over at beginning of list */
				kn = SLIST_FIRST(&fdp->fd_knlist[i]);
				continue;
			}
			kn = SLIST_NEXT(kn, kn_link);
		}
	}

	knhash_lock(fdp);
	proc_fdunlock(p);

	/* same dance for the hashed (non-fd) knotes */
	if (fdp->fd_knhashmask != 0) {
		for (int i = 0; i < (int)fdp->fd_knhashmask + 1; i++) {
			kn = SLIST_FIRST(&fdp->fd_knhash[i]);
			while (kn != NULL) {
				if (kq == knote_get_kq(kn)) {
					kqlock(kq);
					knhash_unlock(fdp);
					if (knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ON_SUCCESS)) {
						knote_drop(kq, kn, &knlc);
					}
					knhash_lock(fdp);
					/* start over at beginning of list */
					kn = SLIST_FIRST(&fdp->fd_knhash[i]);
					continue;
				}
				kn = SLIST_NEXT(kn, kn_link);
			}
		}
	}
	knhash_unlock(fdp);

	kqueue_destroy(kq, kqfile_zone);
}
3020 
/*!
 * @function kqueue_alloc
 *
 * @brief
 * Allocate a kqfile.
 *
 * @discussion
 * Zone allocation may block (Z_WAITOK) but the zone is assumed to be
 * non-exhaustible, so the return value is effectively never NULL here;
 * callers still check for NULL defensively.
 */
struct kqueue *
kqueue_alloc(struct proc *p)
{
	struct kqfile *kqf;

	/*
	 * kqfiles are created with kqueue() so we need to wait for
	 * the first kevent syscall to know which bit among
	 * KQ_KEV_{32,64,QOS} will be set in kqf_state
	 */
	kqf = zalloc_flags(kqfile_zone, Z_WAITOK | Z_ZERO);
	kqf->kqf_p = p;
	TAILQ_INIT_AFTER_BZERO(&kqf->kqf_queue);
	TAILQ_INIT_AFTER_BZERO(&kqf->kqf_suppressed);

	return kqueue_init(kqf).kq;
}
3044 
/*!
 * @function kqueue_internal
 *
 * @brief
 * Core implementation for kqueue and guarded_kqueue_np()
 *
 * @param p        process creating the kqueue
 * @param fp_init  optional fileproc initializer (used by the guarded variant)
 * @param initarg  opaque argument passed to fp_init
 * @param retval   out: the new file descriptor
 *
 * @returns 0 on success, or an errno (falloc failure, ENOMEM).
 */
int
kqueue_internal(struct proc *p, fp_initfn_t fp_init, void *initarg, int32_t *retval)
{
	struct kqueue *kq;
	struct fileproc *fp;
	int fd, error;

	error = falloc_withinit(p, current_cached_proc_cred(p),
	    vfs_context_current(), &fp, &fd, fp_init, initarg);
	if (error) {
		return error;
	}

	kq = kqueue_alloc(p);
	if (kq == NULL) {
		/* undo the fd reservation made by falloc_withinit */
		fp_free(p, fd, fp);
		return ENOMEM;
	}

	/* kqueue fds don't survive exec/fork and can't be sent over sockets */
	fp->fp_flags |= FP_CLOEXEC | FP_CLOFORK;
	fp->f_flag = FREAD | FWRITE;
	fp->f_ops = &kqueueops;
	fp_set_data(fp, kq);
	fp->f_lflags |= FG_CONFINED;

	/* publish the fd in the fd table only once fully initialized */
	proc_fdlock(p);
	procfdtbl_releasefd(p, fd, NULL);
	fp_drop(p, fd, fp, 1);
	proc_fdunlock(p);

	*retval = fd;
	return error;
}
3084 
/*!
 * @function kqueue
 *
 * @brief
 * The kqueue syscall: a plain kqueue with no fileproc initializer.
 */
int
kqueue(struct proc *p, __unused struct kqueue_args *uap, int32_t *retval)
{
	return kqueue_internal(p, NULL, NULL, retval);
}
3096 
3097 #pragma mark kqworkq allocation and deallocation
3098 
/*!
 * @function kqworkq_dealloc
 *
 * @brief
 * Deallocates a workqueue kqueue.
 *
 * @discussion
 * This only happens at process death, or for races with concurrent
 * kevent_get_kqwq calls, hence we don't have to care about knotes referencing
 * this kqueue, either there are none, or someone else took care of them.
 */
void
kqworkq_dealloc(struct kqworkq *kqwq)
{
	kqueue_destroy(kqwq, kqworkq_zone);
}
3115 
/*!
 * @function kqworkq_alloc
 *
 * @brief
 * Allocates a workqueue kqueue.
 *
 * @discussion
 * This is the slow path of kevent_get_kqwq.
 * This takes care of making sure procs have a single workq kqueue:
 * the fully initialized kqwq is published with a compare-exchange on
 * p_fd.fd_wqkqueue; the loser of a race frees its copy and returns
 * the winner's.
 */
OS_NOINLINE
static struct kqworkq *
kqworkq_alloc(struct proc *p, unsigned int flags)
{
	struct kqworkq *kqwq, *tmp;

	kqwq = zalloc_flags(kqworkq_zone, Z_WAITOK | Z_ZERO);

	/* LEGACY32 callers never reach the kqwq path */
	assert((flags & KEVENT_FLAG_LEGACY32) == 0);
	if (flags & KEVENT_FLAG_LEGACY64) {
		kqwq->kqwq_state = KQ_WORKQ | KQ_KEV64;
	} else {
		kqwq->kqwq_state = KQ_WORKQ | KQ_KEV_QOS;
	}
	kqwq->kqwq_p = p;

	for (int i = 0; i < KQWQ_NBUCKETS; i++) {
		TAILQ_INIT_AFTER_BZERO(&kqwq->kqwq_queue[i]);
		TAILQ_INIT_AFTER_BZERO(&kqwq->kqwq_suppressed[i]);
	}
	for (int i = 0; i < KQWQ_NBUCKETS; i++) {
		/*
		 * Because of how the bucketized system works, we mix overcommit
		 * sources with not overcommit: each time we move a knote from
		 * one bucket to the next due to overrides, we'd had to track
		 * overcommitness, and it's really not worth it in the workloop
		 * enabled world that track this faithfully.
		 *
		 * Incidentally, this behaves like the original manager-based
		 * kqwq where event delivery always happened (hence is
		 * "overcommit")
		 */
		kqwq->kqwq_request[i].tr_state = WORKQ_TR_STATE_IDLE;
		kqwq->kqwq_request[i].tr_flags = WORKQ_TR_FLAG_KEVENT;
		if (i != KQWQ_QOS_MANAGER) {
			kqwq->kqwq_request[i].tr_flags |= WORKQ_TR_FLAG_OVERCOMMIT;
		}
		kqwq->kqwq_request[i].tr_kq_qos_index = (kq_index_t)i + 1;
	}

	kqueue_init(kqwq);

	/*
	 * Publish with release ordering so a racing reader observing the
	 * pointer also sees the fully initialized kqwq. On failure someone
	 * else won the race: free ours and return theirs.
	 */
	if (!os_atomic_cmpxchgv(&p->p_fd.fd_wqkqueue, NULL, kqwq, &tmp, release)) {
		kqworkq_dealloc(kqwq);
		return tmp;
	}

	return kqwq;
}
3175 
3176 #pragma mark kqworkloop allocation and deallocation
3177 
3178 #define KQ_HASH(val, mask)  (((val) ^ (val >> 8)) & (mask))
3179 #define CONFIG_KQ_HASHSIZE  CONFIG_KN_HASHSIZE
3180 
/* Take the per-process kqworkloop hash lock (spin variant). */
OS_ALWAYS_INLINE
static inline void
kqhash_lock(struct filedesc *fdp)
{
	lck_mtx_lock_spin_always(&fdp->fd_kqhashlock);
}
3187 
/* Release the per-process kqworkloop hash lock. */
OS_ALWAYS_INLINE
static inline void
kqhash_unlock(struct filedesc *fdp)
{
	lck_mtx_unlock(&fdp->fd_kqhashlock);
}
3194 
/*
 * Insert a workloop into the per-process hash, keyed by its dynamic id.
 * Caller holds the kqhash lock.
 */
OS_ALWAYS_INLINE
static inline void
kqworkloop_hash_insert_locked(struct filedesc *fdp, kqueue_id_t id,
    struct kqworkloop *kqwl)
{
	struct kqwllist *list = &fdp->fd_kqhash[KQ_HASH(id, fdp->fd_kqhashmask)];
	LIST_INSERT_HEAD(list, kqwl, kqwl_hashlink);
}
3203 
3204 OS_ALWAYS_INLINE
3205 static inline struct kqworkloop *
kqworkloop_hash_lookup_locked(struct filedesc * fdp,kqueue_id_t id)3206 kqworkloop_hash_lookup_locked(struct filedesc *fdp, kqueue_id_t id)
3207 {
3208 	struct kqwllist *list = &fdp->fd_kqhash[KQ_HASH(id, fdp->fd_kqhashmask)];
3209 	struct kqworkloop *kqwl;
3210 
3211 	LIST_FOREACH(kqwl, list, kqwl_hashlink) {
3212 		if (kqwl->kqwl_dynamicid == id) {
3213 			return kqwl;
3214 		}
3215 	}
3216 	return NULL;
3217 }
3218 
/*
 * Look up a workloop by id and take a reference on it.
 * Returns NULL if the hash doesn't exist yet, the id is absent, or the
 * workloop was found but is already on its way to deallocation
 * (try_retain failed).
 */
static struct kqworkloop *
kqworkloop_hash_lookup_and_retain(struct filedesc *fdp, kqueue_id_t kq_id)
{
	struct kqworkloop *kqwl = NULL;

	kqhash_lock(fdp);
	if (__probable(fdp->fd_kqhash)) {
		kqwl = kqworkloop_hash_lookup_locked(fdp, kq_id);
		if (kqwl && !kqworkloop_try_retain(kqwl)) {
			kqwl = NULL;
		}
	}
	kqhash_unlock(fdp);
	return kqwl;
}
3234 
/*
 * Lazily allocate the per-process kqworkloop hash.
 * Called and returns with the kqhash lock held; the lock is dropped
 * around the (blocking) hashinit call, so a concurrent initializer may
 * win the race — in that case our table is discarded.
 */
OS_NOINLINE
static void
kqworkloop_hash_init(struct filedesc *fdp)
{
	struct kqwllist *alloc_hash;
	u_long alloc_mask;

	kqhash_unlock(fdp);
	alloc_hash = hashinit(CONFIG_KQ_HASHSIZE, M_KQUEUE, &alloc_mask);
	kqhash_lock(fdp);

	/* See if we won the race */
	if (__probable(fdp->fd_kqhashmask == 0)) {
		fdp->fd_kqhash = alloc_hash;
		fdp->fd_kqhashmask = alloc_mask;
	} else {
		/* lost: free our table outside the lock, then re-take it */
		kqhash_unlock(fdp);
		hashdestroy(alloc_hash, M_KQUEUE, alloc_mask);
		kqhash_lock(fdp);
	}
}
3256 
3257 /*
3258  * kqueue iotier override is only supported for kqueue that has
3259  * only one port as a mach port source. Updating the iotier
3260  * override on the mach port source will update the override
3261  * on kqueue as well. Since kqueue with iotier override will
3262  * only have one port attached, there is no logic for saturation
3263  * like qos override, the iotier override of mach port source
3264  * would be reflected in kevent iotier override.
3265  */
3266 void
kqueue_set_iotier_override(kqueue_t kqu,uint8_t iotier_override)3267 kqueue_set_iotier_override(kqueue_t kqu, uint8_t iotier_override)
3268 {
3269 	if (!(kqu.kq->kq_state & KQ_WORKLOOP)) {
3270 		return;
3271 	}
3272 
3273 	struct kqworkloop *kqwl = kqu.kqwl;
3274 	os_atomic_store(&kqwl->kqwl_iotier_override, iotier_override, relaxed);
3275 }
3276 
3277 uint8_t
kqueue_get_iotier_override(kqueue_t kqu)3278 kqueue_get_iotier_override(kqueue_t kqu)
3279 {
3280 	if (!(kqu.kq->kq_state & KQ_WORKLOOP)) {
3281 		return THROTTLE_LEVEL_END;
3282 	}
3283 
3284 	struct kqworkloop *kqwl = kqu.kqwl;
3285 	return os_atomic_load(&kqwl->kqwl_iotier_override, relaxed);
3286 }
3287 
3288 #if CONFIG_PREADOPT_TG
/*
 * This function is called with a borrowed reference on the thread group without
 * kq lock held with the mqueue lock held. It may or may not have the knote lock
 * (called from both fevent as well as fattach/ftouch). Upon success, an
 * additional reference on the TG is taken
 */
void
kqueue_set_preadopted_thread_group(kqueue_t kqu, struct thread_group *tg, thread_qos_t qos)
{
	if (!(kqu.kq->kq_state & KQ_WORKLOOP)) {
		KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_THREAD_GROUP, MACH_THREAD_GROUP_PREADOPT_NA),
		    (uintptr_t)thread_tid(current_thread()), 0, 0, 0);
		return;
	}

	struct kqworkloop *kqwl = kqu.kqwl;

	assert(qos < THREAD_QOS_LAST);

	/* taken up front; transferred to the kqwl on success, dropped on failure */
	thread_group_retain(tg);

	thread_group_qos_t old_tg; thread_group_qos_t new_tg;
	int ret = os_atomic_rmw_loop(&kqwl->kqwl_preadopt_tg, old_tg, new_tg, relaxed, {
		/* permanently configured / processed / "never" states are not replaced */
		if (!KQWL_CAN_ADOPT_PREADOPT_TG(old_tg)) {
		        os_atomic_rmw_loop_give_up(break);
		}

		if (old_tg != KQWL_PREADOPTED_TG_NULL) {
		        /*
		         * Note that old_tg could be a NULL TG pointer but with a QoS
		         * set. See also workq_thread_reset_pri.
		         *
		         * Compare the QoS of existing preadopted tg with new one and
		         * only overwrite the thread group if we have one with a higher
		         * QoS.
		         */
		        thread_qos_t existing_qos = KQWL_GET_PREADOPTED_TG_QOS(old_tg);
		        if (existing_qos >= qos) {
		                os_atomic_rmw_loop_give_up(break);
			}
		}

		// Transfer the ref taken earlier in the function to the kqwl
		new_tg = KQWL_ENCODE_PREADOPTED_TG_QOS(tg, qos);
	});

	if (ret) {
		KQWL_PREADOPT_TG_HISTORY_WRITE_ENTRY(kqwl, KQWL_PREADOPT_OP_INCOMING_IPC, old_tg, tg);

		/* drop the ref the kqwl held on the thread group we displaced */
		if (KQWL_HAS_VALID_PREADOPTED_TG(old_tg)) {
			thread_group_deallocate_safe(KQWL_GET_PREADOPTED_TG(old_tg));
		}

		os_atomic_store(&kqwl->kqwl_preadopt_tg_needs_redrive, KQWL_PREADOPT_TG_NEEDS_REDRIVE, release);
	} else {
		// We failed to write to the kqwl_preadopt_tg, drop the ref we took
		// earlier in the function
		thread_group_deallocate_safe(tg);
	}
}
3349 
/*
 * Called from fprocess of EVFILT_MACHPORT without the kqueue lock held.
 *
 * Transitions kqwl_preadopt_tg from SENTINEL to PROCESSED exactly once;
 * the winner makes the servicer thread preadopt the supplied thread group.
 * Returns true only on that winning transition.
 */
bool
kqueue_process_preadopt_thread_group(thread_t thread, struct kqueue *kq, struct thread_group *tg)
{
	bool success = false;
	if (kq->kq_state & KQ_WORKLOOP) {
		struct kqworkloop *kqwl = (struct kqworkloop *) kq;
		thread_group_qos_t old_tg;
		success = os_atomic_cmpxchgv(&kqwl->kqwl_preadopt_tg,
		    KQWL_PREADOPTED_TG_SENTINEL, KQWL_PREADOPTED_TG_PROCESSED,
		    &old_tg, relaxed);
		if (success) {
			thread_set_preadopt_thread_group(thread, tg);
		} else if (KQWL_HAS_PERMANENT_PREADOPTED_TG(old_tg)) {
			/*
			 * Technically the following set_preadopt should be a no-op since this
			 * servicer thread preadopts kqwl's permanent tg at bind time.
			 * See kqueue_threadreq_bind.
			 */
			thread_set_preadopt_thread_group(thread, KQWL_GET_PREADOPTED_TG(old_tg));
		} else {
			/* only remaining legal states once the cmpxchg failed */
			assert(old_tg == KQWL_PREADOPTED_TG_PROCESSED ||
			    old_tg == KQWL_PREADOPTED_TG_NEVER);
		}
	}
	return success;
}
3379 #endif
3380 
/*!
 * @function kqworkloop_dealloc
 *
 * @brief
 * Deallocates a workloop kqueue.
 *
 * @discussion
 * Knotes hold references on the workloop, so we can't really reach this
 * function unless all of these are already gone.
 *
 * Nothing locked on entry or exit.
 *
 * @param hash_remove
 * Whether to remove the workloop from its hash table.
 */
static void
kqworkloop_dealloc(struct kqworkloop *kqwl, bool hash_remove)
{
	thread_t cur_owner;

	/* drop the owner's thread ref and any kevent override pushed on it */
	cur_owner = kqwl->kqwl_owner;
	if (cur_owner) {
		if (kqworkloop_override(kqwl) != THREAD_QOS_UNSPECIFIED) {
			thread_drop_kevent_override(cur_owner);
		}
		thread_deallocate(cur_owner);
		kqwl->kqwl_owner = THREAD_NULL;
	}

	if (kqwl->kqwl_state & KQ_HAS_TURNSTILE) {
		struct turnstile *ts;
		turnstile_complete((uintptr_t)kqwl, &kqwl->kqwl_turnstile,
		    &ts, TURNSTILE_WORKLOOPS);
		turnstile_cleanup();
		turnstile_deallocate(ts);
	}

	if (hash_remove) {
		struct filedesc *fdp = &kqwl->kqwl_p->p_fd;

		kqhash_lock(fdp);
		LIST_REMOVE(kqwl, kqwl_hashlink);
#if CONFIG_PROC_RESOURCE_LIMITS
		fdp->num_kqwls--;
#endif
		kqhash_unlock(fdp);
	}

#if CONFIG_PREADOPT_TG
	/* release the preadopted thread group ref the kqwl may still hold */
	thread_group_qos_t tg = os_atomic_load(&kqwl->kqwl_preadopt_tg, relaxed);
	if (KQWL_HAS_VALID_PREADOPTED_TG(tg)) {
		thread_group_release(KQWL_GET_PREADOPTED_TG(tg));
	}
#endif

	/* bound-thread workloops may own a work interval ref */
	workq_threadreq_t kqr = &kqwl->kqwl_request;
	if ((kqr->tr_flags & WORKQ_TR_FLAG_PERMANENT_BIND) && kqr->tr_work_interval) {
		kern_work_interval_release(kqr->tr_work_interval);
	}

	assert(TAILQ_EMPTY(&kqwl->kqwl_suppressed));
	assert(kqwl->kqwl_owner == THREAD_NULL);
	assert(kqwl->kqwl_turnstile == TURNSTILE_NULL);

	lck_spin_destroy(&kqwl->kqwl_statelock, &kq_lck_grp);
	kqueue_destroy(kqwl, kqworkloop_zone);
}
3448 
/*!
 * @function kqworkloop_init
 *
 * @brief
 * Initializes an allocated kqworkloop.
 *
 * @discussion
 * The new workloop starts with a retain count of 1. When trp_extended
 * carries a permanent preadopt thread group or a work interval, their
 * +1 refs are transferred to the kqwl here.
 */
static void
kqworkloop_init(struct kqworkloop *kqwl, proc_t p,
    kqueue_id_t id, workq_threadreq_param_t *trp,
    struct workq_threadreq_extended_param_s *trp_extended)
{
	kqwl->kqwl_state     = KQ_WORKLOOP | KQ_DYNAMIC | KQ_KEV_QOS;
	os_ref_init_raw(&kqwl->kqwl_retains, NULL);
	kqwl->kqwl_dynamicid = id;
	kqwl->kqwl_p         = p;
	if (trp) {
		kqwl->kqwl_params = trp->trp_value;
	}

	/* translate the userspace scheduling params into thread request flags */
	workq_tr_flags_t tr_flags = WORKQ_TR_FLAG_WORKLOOP;
	if (trp) {
		if (trp->trp_flags & TRP_PRIORITY) {
			tr_flags |= WORKQ_TR_FLAG_WL_OUTSIDE_QOS;
		}
		if (trp->trp_flags & TRP_BOUND_THREAD) {
			tr_flags |= WORKQ_TR_FLAG_PERMANENT_BIND;
		}
		if (trp->trp_flags) {
			tr_flags |= WORKQ_TR_FLAG_WL_PARAMS;
		}
	}
	kqwl->kqwl_request.tr_state = WORKQ_TR_STATE_IDLE;
	kqwl->kqwl_request.tr_flags = tr_flags;
	os_atomic_store(&kqwl->kqwl_iotier_override, (uint8_t)THROTTLE_LEVEL_END, relaxed);
#if CONFIG_PREADOPT_TG
	if (trp_extended && trp_extended->trp_permanent_preadopt_tg) {
		/*
		 * This kqwl is permanently configured with a thread group.
		 * By using THREAD_QOS_LAST, we make sure kqueue_set_preadopted_thread_group
		 * has no effect on kqwl_preadopt_tg. At this point, +1 ref on
		 * trp_extended->trp_permanent_preadopt_tg is transferred to the kqwl.
		 */
		thread_group_qos_t kqwl_preadopt_tg;
		kqwl_preadopt_tg = KQWL_ENCODE_PERMANENT_PREADOPTED_TG(trp_extended->trp_permanent_preadopt_tg);
		os_atomic_store(&kqwl->kqwl_preadopt_tg, kqwl_preadopt_tg, relaxed);
	} else if (task_is_app(current_task())) {
		/*
		 * Not a specially preconfigured kqwl so it is open to participate in sync IPC
		 * thread group preadoption; but, apps will never adopt a thread group that
		 * is not their own. This is a gross hack to simulate the post-process that
		 * is done in the voucher subsystem today for thread groups.
		 */
		os_atomic_store(&kqwl->kqwl_preadopt_tg, KQWL_PREADOPTED_TG_NEVER, relaxed);
	}
#endif
	if (trp_extended) {
		if (trp_extended->trp_work_interval) {
			/*
			 * The +1 ref on the work interval is transferred to the kqwl.
			 */
			assert(tr_flags & WORKQ_TR_FLAG_PERMANENT_BIND);
			kqwl->kqwl_request.tr_work_interval = trp_extended->trp_work_interval;
		}
	}
	for (int i = 0; i < KQWL_NBUCKETS; i++) {
		TAILQ_INIT_AFTER_BZERO(&kqwl->kqwl_queue[i]);
	}
	TAILQ_INIT_AFTER_BZERO(&kqwl->kqwl_suppressed);

	lck_spin_init(&kqwl->kqwl_statelock, &kq_lck_grp, LCK_ATTR_NULL);

	kqueue_init(kqwl);
}
3522 
3523 #if CONFIG_PROC_RESOURCE_LIMITS
/*
 * Check the process's dynamic kqworkloop count against its soft/hard
 * limits and, on first crossing of either, arm the resource-notification
 * AST on the current thread. Each limit notifies at most once.
 * Caller holds the kqhash lock (num_kqwls is updated under it).
 */
void
kqworkloop_check_limit_exceeded(struct filedesc *fdp)
{
	int num_kqwls = fdp->num_kqwls;
	if (!kqwl_above_soft_limit_notified(fdp) && fdp->kqwl_dyn_soft_limit > 0 &&
	    num_kqwls > fdp->kqwl_dyn_soft_limit) {
		kqwl_above_soft_limit_send_notification(fdp);
		act_set_astproc_resource(current_thread());
	} else if (!kqwl_above_hard_limit_notified(fdp) && fdp->kqwl_dyn_hard_limit > 0
	    && num_kqwls > fdp->kqwl_dyn_hard_limit) {
		kqwl_above_hard_limit_send_notification(fdp);
		act_set_astproc_resource(current_thread());
	}
}
3538 #endif
3539 
/*!
 * @function kqworkloop_get_or_create
 *
 * @brief
 * Wrapper around kqworkloop_init that handles the uniquing of workloops.
 *
 * @returns
 * 0:      success
 * EINVAL: invalid parameters
 * EEXIST: KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST is set and a collision exists.
 * ENOENT: KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST is set and the entry wasn't found.
 * ENOMEM: allocation failed
 */
static int
kqworkloop_get_or_create(struct proc *p, kqueue_id_t id,
    workq_threadreq_param_t *trp,
    struct workq_threadreq_extended_param_s *trp_extended,
    unsigned int flags, struct kqworkloop **kqwlp)
{
	struct filedesc *fdp = &p->p_fd;
	struct kqworkloop *alloc_kqwl = NULL;
	struct kqworkloop *kqwl = NULL;
	int error = 0;

	/* scheduling params are only accepted when creating a new workloop */
	assert(!trp || (flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST));

	if (id == 0 || id == (kqueue_id_t)-1) {
		return EINVAL;
	}

	for (;;) {
		kqhash_lock(fdp);
		if (__improbable(fdp->fd_kqhash == NULL)) {
			kqworkloop_hash_init(fdp);
		}

		kqwl = kqworkloop_hash_lookup_locked(fdp, id);
		if (kqwl) {
			if (__improbable(flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST)) {
				/*
				 * If MUST_NOT_EXIST was passed, even if we would have failed
				 * the try_retain, it could have gone the other way, and
				 * userspace can't tell. Let'em fix their race.
				 */
				error = EEXIST;
				break;
			}

			if (__probable(kqworkloop_try_retain(kqwl))) {
				/*
				 * This is a valid live workloop !
				 */
				*kqwlp = kqwl;
				error = 0;
				break;
			}
			/* found but dying: fall through and create a fresh one */
		}

		if (__improbable(flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST)) {
			error = ENOENT;
			break;
		}

		/*
		 * We didn't find what we were looking for.
		 *
		 * If this is the second time we reach this point (alloc_kqwl != NULL),
		 * then we're done.
		 *
		 * If this is the first time we reach this point (alloc_kqwl == NULL),
		 * then try to allocate one without blocking.
		 */
		if (__probable(alloc_kqwl == NULL)) {
			alloc_kqwl = zalloc_flags(kqworkloop_zone, Z_NOWAIT | Z_ZERO);
		}
		if (__probable(alloc_kqwl)) {
#if CONFIG_PROC_RESOURCE_LIMITS
			fdp->num_kqwls++;
			kqworkloop_check_limit_exceeded(fdp);
#endif
			kqworkloop_init(alloc_kqwl, p, id, trp, trp_extended);
			/*
			 * The newly allocated and initialized kqwl has a retain count of 1.
			 */
			kqworkloop_hash_insert_locked(fdp, id, alloc_kqwl);
			if (trp && (trp->trp_flags & TRP_BOUND_THREAD)) {
				/*
				 * If this kqworkloop is configured to be permanently bound to
				 * a thread, we take +1 ref on that thread's behalf before we
				 * unlock the kqhash below. The reason being this new kqwl is
				 * findable in the hash table as soon as we unlock the kqhash
				 * and we want to make sure this kqwl does not get deleted from
				 * under us by the time we create a new thread and bind to it.
				 *
				 * This ref is released when the bound thread unbinds itself
				 * from the kqwl on its way to termination.
				 * See uthread_cleanup -> kqueue_threadreq_unbind.
				 *
				 * The kqwl now has a retain count of 2.
				 */
				kqworkloop_retain(alloc_kqwl);
			}
			kqhash_unlock(fdp);
			/*
			 * We do not want to keep holding kqhash lock when workq is
			 * busy creating and initializing a new thread to bind to this
			 * kqworkloop.
			 */
			if (trp && (trp->trp_flags & TRP_BOUND_THREAD)) {
				error = workq_kern_threadreq_permanent_bind(p, &alloc_kqwl->kqwl_request);
				if (error != KERN_SUCCESS) {
					/*
					 * The kqwl we just created and initialized has a retain
					 * count of 2 at this point i.e. 1 from kqworkloop_init and
					 * 1 on behalf of the bound thread. We need to release
					 * both the references here to successfully deallocate this
					 * kqwl before we return an error.
					 *
					 * The latter release should take care of deallocating
					 * the kqwl itself and removing it from the kqhash.
					 */
					kqworkloop_release(alloc_kqwl);
					kqworkloop_release(alloc_kqwl);
					alloc_kqwl = NULL;
					if (trp_extended) {
						/*
						 * Since we transferred these refs to kqwl during
						 * kqworkloop_init, the kqwl takes care of releasing them.
						 * We don't have any refs to return to our caller
						 * in this case.
						 */
#if CONFIG_PREADOPT_TG
						if (trp_extended->trp_permanent_preadopt_tg) {
							trp_extended->trp_permanent_preadopt_tg = NULL;
						}
#endif
						if (trp_extended->trp_work_interval) {
							trp_extended->trp_work_interval = NULL;
						}
					}
					return error;
				} else {
					/*
					 * For kqwl configured with a bound thread, KQ_SLEEP is used
					 * to track whether the bound thread needs to be woken up
					 * when such a kqwl is woken up.
					 *
					 * See kqworkloop_bound_thread_wakeup and
					 * kqworkloop_bound_thread_park_prepost.
					 *
					 * Once the kqwl is initialized, this state
					 * should always be manipulated under kqlock.
					 */
					kqlock(alloc_kqwl);
					alloc_kqwl->kqwl_state |= KQ_SLEEP;
					kqunlock(alloc_kqwl);
				}
			}
			*kqwlp = alloc_kqwl;
			return 0;
		}

		/*
		 * We have to block to allocate a workloop, drop the lock,
		 * allocate one, but then we need to retry lookups as someone
		 * else could race with us.
		 */
		kqhash_unlock(fdp);

		alloc_kqwl = zalloc_flags(kqworkloop_zone, Z_WAITOK | Z_ZERO);
	}

	kqhash_unlock(fdp);

	/* an unused pre-allocation survives the break paths; return it */
	if (__improbable(alloc_kqwl)) {
		zfree(kqworkloop_zone, alloc_kqwl);
	}

	return error;
}
3720 
3721 #pragma mark - knotes
3722 
/* f_attach stub for unsupported filters: fail the knote with ENOTSUP. */
static int
filt_no_attach(struct knote *kn, __unused struct kevent_qos_s *kev)
{
	knote_set_error(kn, ENOTSUP);
	return 0;
}
3729 
/* f_detach stub for filters with nothing to tear down. */
static void
filt_no_detach(__unused struct knote *kn)
{
}
3734 
/* f_event for filters that must never be posted through knote(). */
static int __dead2
filt_bad_event(struct knote *kn, long hint)
{
	panic("%s[%d](%p, %ld)", __func__, kn->kn_filter, kn, hint);
}
3740 
/* f_touch for filters that must never be touched. */
static int __dead2
filt_bad_touch(struct knote *kn, struct kevent_qos_s *kev)
{
	panic("%s[%d](%p, %p)", __func__, kn->kn_filter, kn, kev);
}
3746 
/* f_process for filters that must never be processed. */
static int __dead2
filt_bad_process(struct knote *kn, struct kevent_qos_s *kev)
{
	panic("%s[%d](%p, %p)", __func__, kn->kn_filter, kn, kev);
}
3752 
/*
 * knotes_dealloc - detach all knotes for the process and drop them
 *
 *		Process is in such a state that it will not try to allocate
 *		any more knotes during this process (stopped for exit or exec).
 */
void
knotes_dealloc(proc_t p)
{
	struct filedesc *fdp = &p->p_fd;
	struct kqueue *kq;
	struct knote *kn;
	struct  klist *kn_hash = NULL;
	u_long kn_hashmask;
	int i;

	proc_fdlock(p);

	/* Close all the fd-indexed knotes up front */
	if (fdp->fd_knlistsize > 0) {
		for (i = 0; i < fdp->fd_knlistsize; i++) {
			while ((kn = SLIST_FIRST(&fdp->fd_knlist[i])) != NULL) {
				kq = knote_get_kq(kn);
				kqlock(kq);
				/* knote_drop needs the fdlock dropped; it unlinks kn itself */
				proc_fdunlock(p);
				knote_drop(kq, kn, NULL);
				proc_fdlock(p);
			}
		}
		/* free the table */
		kfree_type(struct klist, fdp->fd_knlistsize, fdp->fd_knlist);
	}
	fdp->fd_knlistsize = 0;

	proc_fdunlock(p);

	knhash_lock(fdp);

	/* Clean out all the hashed knotes as well */
	if (fdp->fd_knhashmask != 0) {
		for (i = 0; i <= (int)fdp->fd_knhashmask; i++) {
			while ((kn = SLIST_FIRST(&fdp->fd_knhash[i])) != NULL) {
				kq = knote_get_kq(kn);
				kqlock(kq);
				knhash_unlock(fdp);
				knote_drop(kq, kn, NULL);
				knhash_lock(fdp);
			}
		}
		/* detach the hash under the lock, destroy it outside */
		kn_hash = fdp->fd_knhash;
		kn_hashmask = fdp->fd_knhashmask;
		fdp->fd_knhashmask = 0;
		fdp->fd_knhash = NULL;
	}

	knhash_unlock(fdp);

	if (kn_hash) {
		hashdestroy(kn_hash, M_KQUEUE, kn_hashmask);
	}
}
3814 
/*
 * kqworkloops_dealloc - rebalance retains on kqworkloops created with
 * scheduling parameters
 *
 * Process is in such a state that it will not try to allocate
 * any more kqs or knotes during this process (stopped for exit or exec).
 *
 * Every hashed kqworkloop is moved to a private list under the kqhash
 * lock, then deallocated outside the lock; each must be at exactly one
 * reference (the implicit retain from kqueue_workloop_ctl) or we panic.
 */
void
kqworkloops_dealloc(proc_t p)
{
	struct filedesc *fdp = &p->p_fd;
	struct kqworkloop *kqwl, *kqwln;
	struct kqwllist tofree;

	/* nothing to do unless this process ever created a workloop */
	if (!fdt_flag_test(fdp, FD_WORKLOOP)) {
		return;
	}

	kqhash_lock(fdp);

	if (fdp->fd_kqhashmask == 0) {
		kqhash_unlock(fdp);
		return;
	}

	LIST_INIT(&tofree);

	for (size_t i = 0; i <= fdp->fd_kqhashmask; i++) {
		LIST_FOREACH_SAFE(kqwl, &fdp->fd_kqhash[i], kqwl_hashlink, kqwln) {
#if CONFIG_PREADOPT_TG
			/*
			 * kqworkloops that have scheduling parameters have an
			 * implicit retain from kqueue_workloop_ctl that needs
			 * to be balanced on process exit.
			 */
			__assert_only thread_group_qos_t preadopt_tg;
			preadopt_tg = os_atomic_load(&kqwl->kqwl_preadopt_tg, relaxed);
#endif
			assert(kqwl->kqwl_params
#if CONFIG_PREADOPT_TG
			    || KQWL_HAS_PERMANENT_PREADOPTED_TG(preadopt_tg)
#endif
			    );

			/* unhash now, free after dropping the kqhash lock */
			LIST_REMOVE(kqwl, kqwl_hashlink);
			LIST_INSERT_HEAD(&tofree, kqwl, kqwl_hashlink);
		}
	}
#if CONFIG_PROC_RESOURCE_LIMITS
	fdp->num_kqwls = 0;
#endif
	kqhash_unlock(fdp);

	LIST_FOREACH_SAFE(kqwl, &tofree, kqwl_hashlink, kqwln) {
		/* must hold exactly the one implicit retain at this point */
		uint32_t ref = os_ref_get_count_raw(&kqwl->kqwl_retains);
		if (ref != 1) {
			panic("kq(%p) invalid refcount %d", kqwl, ref);
		}
		kqworkloop_dealloc(kqwl, false);
	}
}
3876 
3877 static int
kevent_register_validate_priority(struct kqueue * kq,struct knote * kn,struct kevent_qos_s * kev)3878 kevent_register_validate_priority(struct kqueue *kq, struct knote *kn,
3879     struct kevent_qos_s *kev)
3880 {
3881 	/* We don't care about the priority of a disabled or deleted knote */
3882 	if (kev->flags & (EV_DISABLE | EV_DELETE)) {
3883 		return 0;
3884 	}
3885 
3886 	if (kq->kq_state & KQ_WORKLOOP) {
3887 		/*
3888 		 * Workloops need valid priorities with a QOS (excluding manager) for
3889 		 * any enabled knote.
3890 		 *
3891 		 * When it is pre-existing, just make sure it has a valid QoS as
3892 		 * kevent_register() will not use the incoming priority (filters who do
3893 		 * have the responsibility to validate it again, see filt_wltouch).
3894 		 *
3895 		 * If the knote is being made, validate the incoming priority.
3896 		 */
3897 		if (!_pthread_priority_thread_qos(kn ? kn->kn_qos : kev->qos)) {
3898 			return ERANGE;
3899 		}
3900 	}
3901 
3902 	return 0;
3903 }
3904 
/*
 * Prepare a filter for waiting after register.
 *
 * The f_post_register_wait hook will be called later by kevent_register()
 * and should call kevent_register_wait_block()
 *
 * Takes a +1 thread reference stashed in kn_thread the first time; a
 * pre-existing kn_thread (from a previously aborted wait) must be the
 * same thread, otherwise the kevent is failed with EXDEV.
 *
 * Returns rc with FILTER_REGISTER_WAIT OR'ed in on success, 0 on error.
 */
static int
kevent_register_wait_prepare(struct knote *kn, struct kevent_qos_s *kev, int rc)
{
	thread_t thread = current_thread();

	/* only filters using extended codes may request a post-register wait */
	assert(knote_fops(kn)->f_extended_codes);

	if (kn->kn_thread == NULL) {
		thread_reference(thread);
		kn->kn_thread = thread;
	} else if (kn->kn_thread != thread) {
		/*
		 * kn_thread may be set from a previous aborted wait
		 * However, it has to be from the same thread.
		 */
		kev->flags |= EV_ERROR;
		kev->data = EXDEV;
		return 0;
	}

	return FILTER_REGISTER_WAIT | rc;
}
3933 
/*
 * Cleanup a kevent_register_wait_prepare() effect for threads that have been
 * aborted instead of properly woken up with thread_wakeup_thread().
 *
 * Releases the thread reference stashed in kn_thread.
 */
static void
kevent_register_wait_cleanup(struct knote *kn)
{
	thread_t thread = kn->kn_thread;
	kn->kn_thread = NULL;
	thread_deallocate(thread);
}
3945 
/*
 * Must be called at the end of a f_post_register_wait call from a filter.
 *
 * Completes the turnstile inheritor update (interlock is held), drops
 * the workloop lock, records the thread being handed off so the
 * continuation can release it, and hands off directly to that thread.
 * Does not return: control resumes in `cont`.
 */
static void
kevent_register_wait_block(struct turnstile *ts, thread_t thread,
    thread_continue_t cont, struct _kevent_register *cont_args)
{
	turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
	kqunlock(cont_args->kqwl);
	cont_args->handoff_thread = thread;
	thread_handoff_parameter(thread, cont, cont_args, THREAD_HANDOFF_NONE);
}
3958 
/*
 * Called by Filters using a f_post_register_wait to return from their wait.
 *
 * Drops the handed-off thread reference (if any), copies the kevent out
 * to userspace when it carries an error or a receipt was requested,
 * releases the workloop, and returns to userspace via
 * unix_syscall_return() (does not return to the caller).
 */
static void
kevent_register_wait_return(struct _kevent_register *cont_args)
{
	struct kqworkloop *kqwl = cont_args->kqwl;
	struct kevent_qos_s *kev = &cont_args->kev;
	int error = 0;

	if (cont_args->handoff_thread) {
		thread_deallocate(cont_args->handoff_thread);
	}

	if (kev->flags & (EV_ERROR | EV_RECEIPT)) {
		/* EV_RECEIPT without an error is reported as EV_ERROR with data 0 */
		if ((kev->flags & EV_ERROR) == 0) {
			kev->flags |= EV_ERROR;
			kev->data = 0;
		}
		error = kevent_modern_copyout(kev, &cont_args->ueventlist);
		if (error == 0) {
			cont_args->eventout++;
		}
	}

	kqworkloop_release(kqwl);
	if (error == 0) {
		/* report the number of events copied out as the syscall result */
		*(int32_t *)&current_uthread()->uu_rval = cont_args->eventout;
	}
	unix_syscall_return(error);
}
3990 
/*
 * kevent_register - add a new event to a kqueue
 *
 *	Creates a mapping between the event source and
 *	the kqueue via a knote data structure.
 *
 *	Because many/most the event sources are file
 *	descriptor related, the knote is linked off
 *	the filedescriptor table for quick access.
 *
 *	called with nothing locked
 *	caller holds a reference on the kqueue
 *
 *	Returns the filter result bits.  When FILTER_REGISTER_WAIT is set,
 *	the kqueue is left locked and *kn_out points at the knote so the
 *	caller can invoke the filter's f_post_register_wait hook.  Local
 *	errors are reported through kev (EV_ERROR | data), not the return
 *	value.
 */

int
kevent_register(struct kqueue *kq, struct kevent_qos_s *kev,
    struct knote **kn_out)
{
	struct proc *p = kq->kq_p;
	const struct filterops *fops;
	struct knote *kn = NULL;
	int result = 0, error = 0;
	/* snapshot the flags: kev->flags is simplified in place below */
	unsigned short kev_flags = kev->flags;
	KNOTE_LOCK_CTX(knlc);

	/* kev->filter is negative; ~filter maps it to a 0-based table index */
	if (__probable(kev->filter < 0 && kev->filter + EVFILT_SYSCOUNT >= 0)) {
		fops = sysfilt_ops[~kev->filter];       /* to 0-base index */
	} else {
		error = EINVAL;
		goto out;
	}

	/* restrict EV_VANISHED to adding udata-specific dispatch kevents */
	if (__improbable((kev->flags & EV_VANISHED) &&
	    (kev->flags & (EV_ADD | EV_DISPATCH2)) != (EV_ADD | EV_DISPATCH2))) {
		error = EINVAL;
		goto out;
	}

	/* Simplify the flags - delete and disable overrule */
	if (kev->flags & EV_DELETE) {
		kev->flags &= ~EV_ADD;
	}
	if (kev->flags & EV_DISABLE) {
		kev->flags &= ~EV_ENABLE;
	}

	/* tracepoint, keyed by the kind of kqueue being registered against */
	if (kq->kq_state & KQ_WORKLOOP) {
		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_REGISTER),
		    ((struct kqworkloop *)kq)->kqwl_dynamicid,
		    kev->udata, kev->flags, kev->filter);
	} else if (kq->kq_state & KQ_WORKQ) {
		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWQ_REGISTER),
		    0, kev->udata, kev->flags, kev->filter);
	} else {
		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQ_REGISTER),
		    VM_KERNEL_UNSLIDE_OR_PERM(kq),
		    kev->udata, kev->flags, kev->filter);
	}

restart:
	/* find the matching knote from the fd tables/hashes */
	kn = kq_find_knote_and_kq_lock(kq, kev, fops->f_isfd, p);
	error = kevent_register_validate_priority(kq, kn, kev);
	result = 0;
	if (error) {
		/* kq is locked only when a knote was found */
		if (kn) {
			kqunlock(kq);
		}
		goto out;
	}

	if (kn == NULL && (kev->flags & EV_ADD) == 0) {
		/*
		 * No knote found, EV_ADD wasn't specified
		 */

		if ((kev_flags & EV_ADD) && (kev_flags & EV_DELETE) &&
		    (kq->kq_state & KQ_WORKLOOP)) {
			/*
			 * For workloops, understand EV_ADD|EV_DELETE as a "soft" delete
			 * that doesn't care about ENOENT, so just pretend the deletion
			 * happened.
			 */
		} else {
			error = ENOENT;
		}
		goto out;
	} else if (kn == NULL) {
		/*
		 * No knote found, need to attach a new one (attach)
		 */

		struct fileproc *knote_fp = NULL;

		/* grab a file reference for the new knote */
		if (fops->f_isfd) {
			if ((error = fp_lookup(p, (int)kev->ident, &knote_fp, 0)) != 0) {
				goto out;
			}
		}

		kn = knote_alloc();
		kn->kn_fp = knote_fp;
		kn->kn_is_fd = fops->f_isfd;
		kn->kn_kq_packed = VM_PACK_POINTER((vm_offset_t)kq, KNOTE_KQ_PACKED);
		kn->kn_status = 0;

		/* was vanish support requested */
		if (kev->flags & EV_VANISHED) {
			kev->flags &= ~EV_VANISHED;
			kn->kn_status |= KN_REQVANISH;
		}

		/* snapshot matching/dispatching protocol flags into knote */
		if (kev->flags & EV_DISABLE) {
			kn->kn_status |= KN_DISABLED;
		}

		/*
		 * copy the kevent state into knote
		 * protocol is that fflags and data
		 * are saved off, and cleared before
		 * calling the attach routine.
		 *
		 * - kn->kn_sfflags aliases with kev->xflags
		 * - kn->kn_sdata   aliases with kev->data
		 * - kn->kn_filter  is the top 8 bits of kev->filter
		 */
		kn->kn_kevent  = *(struct kevent_internal_s *)kev;
		kn->kn_sfflags = kev->fflags;
		kn->kn_filtid  = (uint8_t)~kev->filter;
		kn->kn_fflags  = 0;
		knote_reset_priority(kq, kn, kev->qos);

		/* Add the knote for lookup thru the fd table */
		error = kq_add_knote(kq, kn, &knlc, p);
		if (error) {
			/* undo the allocation and the fp reference taken above */
			knote_free(kn);
			if (knote_fp != NULL) {
				fp_drop(p, (int)kev->ident, knote_fp, 0);
			}

			if (error == ERESTART) {
				goto restart;
			}
			goto out;
		}

		/* fp reference count now applies to knote */

		/*
		 * we can't use filter_call() because f_attach can change the filter ops
		 * for a filter that supports f_extended_codes, so we need to reload
		 * knote_fops() and not use `fops`.
		 */
		result = fops->f_attach(kn, kev);
		if (result && !knote_fops(kn)->f_extended_codes) {
			/* non-extended filters return a boolean "activate" result */
			result = FILTER_ACTIVE;
		}

		kqlock(kq);

		if (result & FILTER_THREADREQ_NODEFEER) {
			enable_preemption();
		}

		if (kn->kn_flags & EV_ERROR) {
			/*
			 * Failed to attach correctly, so drop.
			 */
			kn->kn_filtid = EVFILTID_DETACHED;
			error = (int)kn->kn_sdata;
			knote_drop(kq, kn, &knlc);
			result = 0;
			goto out;
		}

		/*
		 * end "attaching" phase - now just attached
		 *
		 * Mark the thread request overcommit, if appropos
		 *
		 * If the attach routine indicated that an
		 * event is already fired, activate the knote.
		 */
		if ((kn->kn_qos & _PTHREAD_PRIORITY_OVERCOMMIT_FLAG) &&
		    (kq->kq_state & KQ_WORKLOOP)) {
			kqworkloop_set_overcommit((struct kqworkloop *)kq);
		}
	} else if (!knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ON_SUCCESS)) {
		/*
		 * The knote was dropped while we were waiting for the lock,
		 * we need to re-evaluate entirely
		 */

		goto restart;
	} else if (kev->flags & EV_DELETE) {
		/*
		 * Deletion of a knote (drop)
		 *
		 * If the filter wants to filter drop events, let it do so.
		 *
		 * defer-delete: when trying to delete a disabled EV_DISPATCH2 knote,
		 * we must wait for the knote to be re-enabled (unless it is being
		 * re-enabled atomically here).
		 */

		if (knote_fops(kn)->f_allow_drop) {
			bool drop;

			/* f_allow_drop is called with the kq unlocked */
			kqunlock(kq);
			drop = knote_fops(kn)->f_allow_drop(kn, kev);
			kqlock(kq);

			if (!drop) {
				goto out_unlock;
			}
		}

		if ((kev->flags & EV_ENABLE) == 0 &&
		    (kn->kn_flags & EV_DISPATCH2) == EV_DISPATCH2 &&
		    (kn->kn_status & KN_DISABLED) != 0) {
			kn->kn_status |= KN_DEFERDELETE;
			error = EINPROGRESS;
			goto out_unlock;
		}

		knote_drop(kq, kn, &knlc);
		goto out;
	} else {
		/*
		 * Regular update of a knote (touch)
		 *
		 * Call touch routine to notify filter of changes in filter values
		 * (and to re-determine if any events are fired).
		 *
		 * If the knote is in defer-delete, avoid calling the filter touch
		 * routine (it has delivered its last event already).
		 *
		 * If the touch routine had no failure,
		 * apply the requested side effects to the knote.
		 */

		if (kn->kn_status & (KN_DEFERDELETE | KN_VANISHED)) {
			if (kev->flags & EV_ENABLE) {
				result = FILTER_ACTIVE;
			}
		} else {
			/* f_touch is called with the kq unlocked */
			kqunlock(kq);
			result = filter_call(knote_fops(kn), f_touch(kn, kev));
			kqlock(kq);
			if (result & FILTER_THREADREQ_NODEFEER) {
				enable_preemption();
			}
		}

		if (kev->flags & EV_ERROR) {
			result = 0;
			goto out_unlock;
		}

		if ((kn->kn_flags & EV_UDATA_SPECIFIC) == 0 &&
		    kn->kn_udata != kev->udata) {
			// this allows klist_copy_udata() not to take locks
			os_atomic_store_wide(&kn->kn_udata, kev->udata, relaxed);
		}
		if ((kev->flags & EV_DISABLE) && !(kn->kn_status & KN_DISABLED)) {
			kn->kn_status |= KN_DISABLED;
			knote_dequeue(kq, kn);
		}
	}

	/* accept new kevent state */
	knote_apply_touch(kq, kn, kev, result);

out_unlock:
	/*
	 * When the filter asked for a post-register wait,
	 * we leave the kqueue locked for kevent_register()
	 * to call the filter's f_post_register_wait hook.
	 */
	if (result & FILTER_REGISTER_WAIT) {
		knote_unlock(kq, kn, &knlc, KNOTE_KQ_LOCK_ALWAYS);
		*kn_out = kn;
	} else {
		knote_unlock(kq, kn, &knlc, KNOTE_KQ_UNLOCK);
	}

out:
	/* output local errors through the kevent */
	if (error) {
		kev->flags |= EV_ERROR;
		kev->data = error;
	}
	return result;
}
4288 
/*
 * knote_process - process a triggered event
 *
 *	Validate that it is really still a triggered event
 *	by calling the filter routines (if necessary).  Hold
 *	a use reference on the knote to avoid it being detached.
 *
 *	If it is still considered triggered, we will have taken
 *	a copy of the state under the filter lock.  We use that
 *	snapshot to dispatch the knote for future processing (or
 *	not, if this was a lost event).
 *
 *	Our caller assures us that nobody else can be processing
 *	events from this knote during the whole operation. But
 *	others can be touching or posting events to the knote
 *	interspersed with our processing it.
 *
 *	caller holds a reference on the kqueue.
 *	kqueue locked on entry and exit - but may be dropped
 *
 *	Returns 0 on success (event delivered to `callback`),
 *	EJUSTRETURN when there was nothing to deliver, or the
 *	callback's error.
 */
static int
knote_process(struct knote *kn, kevent_ctx_t kectx,
    kevent_callback_t callback)
{
	struct kevent_qos_s kev;
	struct kqueue *kq = knote_get_kq(kn);
	KNOTE_LOCK_CTX(knlc);
	int result = FILTER_ACTIVE;
	int error = 0;
	bool drop = false;

	/*
	 * Must be active
	 * Must be queued and not disabled/suppressed or dropping
	 */
	assert(kn->kn_status & KN_QUEUED);
	assert(kn->kn_status & KN_ACTIVE);
	assert(!(kn->kn_status & (KN_DISABLED | KN_SUPPRESSED | KN_DROPPING)));

	/* tracepoint, keyed by the kind of kqueue being processed */
	if (kq->kq_state & KQ_WORKLOOP) {
		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS),
		    ((struct kqworkloop *)kq)->kqwl_dynamicid,
		    kn->kn_udata, kn->kn_status | (kn->kn_id << 32),
		    kn->kn_filtid);
	} else if (kq->kq_state & KQ_WORKQ) {
		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWQ_PROCESS),
		    0, kn->kn_udata, kn->kn_status | (kn->kn_id << 32),
		    kn->kn_filtid);
	} else {
		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQ_PROCESS),
		    VM_KERNEL_UNSLIDE_OR_PERM(kq), kn->kn_udata,
		    kn->kn_status | (kn->kn_id << 32), kn->kn_filtid);
	}

	if (!knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ALWAYS)) {
		/*
		 * When the knote is dropping or has dropped,
		 * then there's nothing we want to process.
		 */
		return EJUSTRETURN;
	}

	/*
	 * While waiting for the knote lock, we may have dropped the kq lock.
	 * and a touch may have disabled and dequeued the knote.
	 */
	if (!(kn->kn_status & KN_QUEUED)) {
		knote_unlock(kq, kn, &knlc, KNOTE_KQ_LOCK_ALWAYS);
		return EJUSTRETURN;
	}

	/*
	 * For deferred-drop or vanished events, we just create a fake
	 * event to acknowledge end-of-life.  Otherwise, we call the
	 * filter's process routine to snapshot the kevent state under
	 * the filter's locking protocol.
	 *
	 * suppress knotes to avoid returning the same event multiple times in
	 * a single call.
	 */
	knote_suppress(kq, kn);

	if (kn->kn_status & (KN_DEFERDELETE | KN_VANISHED)) {
		uint16_t kev_flags = EV_DISPATCH2 | EV_ONESHOT;
		if (kn->kn_status & KN_DEFERDELETE) {
			kev_flags |= EV_DELETE;
		} else {
			kev_flags |= EV_VANISHED;
		}

		/* create fake event */
		kev = (struct kevent_qos_s){
			.filter = kn->kn_filter,
			.ident  = kn->kn_id,
			.flags  = kev_flags,
			.udata  = kn->kn_udata,
		};
	} else {
		/* f_process runs with the kq unlocked, under the filter's locks */
		kqunlock(kq);
		kev = (struct kevent_qos_s) { };
		result = filter_call(knote_fops(kn), f_process(kn, &kev));
		kqlock(kq);
	}

	/*
	 * Determine how to dispatch the knote for future event handling.
	 * not-fired: just return (do not callout, leave deactivated).
	 * One-shot:  If dispatch2, enter deferred-delete mode (unless this is
	 *            is the deferred delete event delivery itself).  Otherwise,
	 *            drop it.
	 * Dispatch:  don't clear state, just mark it disabled.
	 * Cleared:   just leave it deactivated.
	 * Others:    re-activate as there may be more events to handle.
	 *            This will not wake up more handlers right now, but
	 *            at the completion of handling events it may trigger
	 *            more handler threads (TODO: optimize based on more than
	 *            just this one event being detected by the filter).
	 */
	if ((result & FILTER_ACTIVE) == 0) {
		if ((kn->kn_status & KN_ACTIVE) == 0) {
			/*
			 * Some knotes (like EVFILT_WORKLOOP) can be reactivated from
			 * within f_process() but that doesn't necessarily make them
			 * ready to process, so we should leave them be.
			 *
			 * For other knotes, since we will not return an event,
			 * there's no point keeping the knote suppressed.
			 */
			knote_unsuppress(kq, kn);
		}
		knote_unlock(kq, kn, &knlc, KNOTE_KQ_LOCK_ALWAYS);
		return EJUSTRETURN;
	}

	if (result & FILTER_ADJUST_EVENT_QOS_BIT) {
		knote_adjust_qos(kq, kn, result);
	}

	if (result & FILTER_ADJUST_EVENT_IOTIER_BIT) {
		kqueue_update_iotier_override(kq);
	}

	/* deliver the event at the knote's base QoS combined with overrides */
	kev.qos = _pthread_priority_combine(kn->kn_qos, kn->kn_qos_override);

	if (kev.flags & EV_ONESHOT) {
		if ((kn->kn_flags & EV_DISPATCH2) == EV_DISPATCH2 &&
		    (kn->kn_status & KN_DEFERDELETE) == 0) {
			/* defer dropping non-delete oneshot dispatch2 events */
			kn->kn_status |= KN_DEFERDELETE | KN_DISABLED;
		} else {
			drop = true;
		}
	} else if (kn->kn_flags & EV_DISPATCH) {
		/* disable all dispatch knotes */
		kn->kn_status |= KN_DISABLED;
	} else if ((kn->kn_flags & EV_CLEAR) == 0) {
		/* re-activate in case there are more events */
		knote_activate(kq, kn, FILTER_ACTIVE);
	}

	/*
	 * callback to handle each event as we find it.
	 * If we have to detach and drop the knote, do
	 * it while we have the kq unlocked.
	 */
	if (drop) {
		knote_drop(kq, kn, &knlc);
	} else {
		knote_unlock(kq, kn, &knlc, KNOTE_KQ_UNLOCK);
	}

	if (kev.flags & EV_VANISHED) {
		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KNOTE_VANISHED),
		    kev.ident, kn->kn_udata, kn->kn_status | (kn->kn_id << 32),
		    kn->kn_filtid);
	}

	/* deliver the snapshot to the caller with the kq unlocked */
	error = (callback)(&kev, kectx);
	kqlock(kq);
	return error;
}
4470 
/*
 * Returns -1 if the kqueue was unbound and processing should not happen
 */
#define KQWQAE_BEGIN_PROCESSING 1
#define KQWQAE_END_PROCESSING   2
#define KQWQAE_UNBIND           3
/*
 * Return suppressed knotes for one QoS bucket of a workq kqueue to
 * their original state, and decide whether the servicer thread should
 * unbind (always for KQWQAE_UNBIND, otherwise only when parking with an
 * empty queue).  If unbinding leaves events pending, a new thread
 * request is initiated.
 *
 * Called and returns with the kqwq locked.
 */
static int
kqworkq_acknowledge_events(struct kqworkq *kqwq, workq_threadreq_t kqr,
    int kevent_flags, int kqwqae_op)
{
	struct knote *kn;
	int rc = 0;
	bool unbind;
	/* tr_kq_qos_index is 1-based; bucket arrays are 0-based */
	struct kqtailq *suppressq = &kqwq->kqwq_suppressed[kqr->tr_kq_qos_index - 1];
	struct kqtailq *queue = &kqwq->kqwq_queue[kqr->tr_kq_qos_index - 1];

	kqlock_held(&kqwq->kqwq_kqueue);

	/*
	 * Return suppressed knotes to their original state.
	 * For workq kqueues, suppressed ones that are still
	 * truly active (not just forced into the queue) will
	 * set flags we check below to see if anything got
	 * woken up.
	 */
	while ((kn = TAILQ_FIRST(suppressq)) != NULL) {
		knote_unsuppress(kqwq, kn);
	}

	if (kqwqae_op == KQWQAE_UNBIND) {
		unbind = true;
	} else if ((kevent_flags & KEVENT_FLAG_PARKING) == 0) {
		unbind = false;
	} else {
		/* parking: unbind only if nothing is left queued */
		unbind = TAILQ_EMPTY(queue);
	}
	if (unbind) {
		thread_t thread = kqr_thread_fast(kqr);
		thread_qos_t old_override;

#if MACH_ASSERT
		thread_t self = current_thread();
		struct uthread *ut = get_bsdthread_info(self);

		assert(thread == self);
		assert(ut->uu_kqr_bound == kqr);
#endif // MACH_ASSERT

		old_override = kqworkq_unbind_locked(kqwq, kqr, thread);
		if (!TAILQ_EMPTY(queue)) {
			/*
			 * Request a new thread if we didn't process the whole
			 * queue.
			 */
			kqueue_threadreq_initiate(&kqwq->kqwq_kqueue, kqr,
			    kqr->tr_kq_qos_index, 0);
		}
		if (old_override) {
			/* drop the kevent override the unbind returned */
			thread_drop_kevent_override(thread);
		}
		rc = -1;
	}

	return rc;
}
4536 
4537 /*
4538  * Return 0 to indicate that processing should proceed,
4539  * -1 if there is nothing to process.
4540  *
4541  * Called with kqueue locked and returns the same way,
4542  * but may drop lock temporarily.
4543  */
4544 static int
kqworkq_begin_processing(struct kqworkq * kqwq,workq_threadreq_t kqr,int kevent_flags)4545 kqworkq_begin_processing(struct kqworkq *kqwq, workq_threadreq_t kqr,
4546     int kevent_flags)
4547 {
4548 	int rc = 0;
4549 
4550 	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWQ_PROCESS_BEGIN) | DBG_FUNC_START,
4551 	    0, kqr->tr_kq_qos_index);
4552 
4553 	rc = kqworkq_acknowledge_events(kqwq, kqr, kevent_flags,
4554 	    KQWQAE_BEGIN_PROCESSING);
4555 
4556 	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWQ_PROCESS_BEGIN) | DBG_FUNC_END,
4557 	    thread_tid(kqr_thread(kqr)),
4558 	    !TAILQ_EMPTY(&kqwq->kqwq_queue[kqr->tr_kq_qos_index - 1]));
4559 
4560 	return rc;
4561 }
4562 
/*
 * Walk the workloop's suppressed knotes, unsuppressing those that are
 * done, and return the maximum QoS override among the knotes that must
 * stay suppressed (QoS-adjusting knotes auto-disabled by EV_DISPATCH)
 * so their overrides keep pushing on the servicer.
 *
 * Called with the kqworkloop locked.
 */
static thread_qos_t
kqworkloop_acknowledge_events(struct kqworkloop *kqwl)
{
	kq_index_t qos = THREAD_QOS_UNSPECIFIED;
	struct knote *kn, *tmp;

	kqlock_held(kqwl);

	TAILQ_FOREACH_SAFE(kn, &kqwl->kqwl_suppressed, kn_tqe, tmp) {
		/*
		 * If a knote that can adjust QoS is disabled because of the automatic
		 * behavior of EV_DISPATCH, the knotes should stay suppressed so that
		 * further overrides keep pushing.
		 */
		if (knote_fops(kn)->f_adjusts_qos &&
		    (kn->kn_status & KN_DISABLED) != 0 &&
		    (kn->kn_status & KN_DROPPING) == 0 &&
		    (kn->kn_flags & (EV_DISPATCH | EV_DISABLE)) == EV_DISPATCH) {
			qos = MAX(qos, kn->kn_qos_override);
			continue;
		}
		knote_unsuppress(kqwl, kn);
	}

	return qos;
}
4589 
/*
 * Begin processing a workloop kqueue: mark it KQ_PROCESSING and, when
 * parking or when suppressed knotes exist, acknowledge events and
 * update thread requests/QoS — possibly unbinding the servicer thread.
 *
 * Returns 0 to proceed with processing, -1 to stop (the thread was or
 * will be unbound, or an owner/empty-queue condition means there is
 * nothing to do).  Called with the kq locked; returns the same way.
 */
static int
kqworkloop_begin_processing(struct kqworkloop *kqwl, unsigned int kevent_flags)
{
	workq_threadreq_t kqr = &kqwl->kqwl_request;
	struct kqueue *kq = &kqwl->kqwl_kqueue;
	int rc = 0, op = KQWL_UTQ_NONE;

	kqlock_held(kq);

	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_BEGIN) | DBG_FUNC_START,
	    kqwl->kqwl_dynamicid, 0, 0);

	/* nobody else should still be processing */
	assert((kq->kq_state & KQ_PROCESSING) == 0);

	kq->kq_state |= KQ_PROCESSING;

	if (kevent_flags & KEVENT_FLAG_PARKING) {
		/*
		 * When "parking" we want to process events and if no events are found
		 * unbind. (Except for WORKQ_TR_FLAG_PERMANENT_BIND where the soft unbind
		 * and bound thread park happen in the caller.)
		 *
		 * However, non overcommit threads sometimes park even when they have
		 * more work so that the pool can narrow.  For these, we need to unbind
		 * early, so that calling kqworkloop_update_threads_qos() can ask the
		 * workqueue subsystem whether the thread should park despite having
		 * pending events.
		 *
		 */
		if (kqr->tr_flags & (WORKQ_TR_FLAG_OVERCOMMIT | WORKQ_TR_FLAG_PERMANENT_BIND)) {
			op = KQWL_UTQ_PARKING;
		} else {
			op = KQWL_UTQ_UNBINDING;
		}
	} else if (!TAILQ_EMPTY(&kqwl->kqwl_suppressed)) {
		op = KQWL_UTQ_RESET_WAKEUP_OVERRIDE;
	}

	if (op != KQWL_UTQ_NONE) {
		thread_qos_t qos_override;
		thread_t thread = kqr_thread_fast(kqr);

		qos_override = kqworkloop_acknowledge_events(kqwl);

		if (op == KQWL_UTQ_UNBINDING) {
			/* early unbind: drop the servicer's +1 on the workloop */
			kqworkloop_unbind_locked(kqwl, thread,
			    KQWL_OVERRIDE_DROP_IMMEDIATELY, 0);
			kqworkloop_release_live(kqwl);
		}
		kqworkloop_update_threads_qos(kqwl, op, qos_override);
		if (op == KQWL_UTQ_PARKING &&
		    (!kqwl->kqwl_count || kqwl->kqwl_owner)) {
			/* nothing queued (or owned workloop): unbind unless permanently bound */
			if ((kqr->tr_flags & WORKQ_TR_FLAG_OVERCOMMIT) &&
			    (!(kqr->tr_flags & WORKQ_TR_FLAG_PERMANENT_BIND))) {
				kqworkloop_unbind_locked(kqwl, thread,
				    KQWL_OVERRIDE_DROP_DELAYED, 0);
				kqworkloop_release_live(kqwl);
			}
			rc = -1; /* To indicate stop begin processing. */
		} else if (op == KQWL_UTQ_UNBINDING &&
		    kqr_thread(kqr) != thread) {
			/* the request was re-bound to another thread while unlocked */
			rc = -1; /* To indicate stop begin processing. */
		}

		if (rc == -1) {
			kq->kq_state &= ~KQ_PROCESSING;
			if (kqr->tr_flags & WORKQ_TR_FLAG_PERMANENT_BIND) {
				goto done;
			}
			kqworkloop_unbind_delayed_override_drop(thread);
		}
	}
done:
	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_BEGIN) | DBG_FUNC_END,
	    kqwl->kqwl_dynamicid, 0, 0);

	return rc;
}
4669 
/*
 * Return 0 to indicate that processing should proceed,
 * -1 if there is nothing to process.
 * EBADF if the kqueue is draining
 *
 * Called with kqueue locked and returns the same way,
 * but may drop lock temporarily.
 * May block.
 */
static int
kqfile_begin_processing(struct kqfile *kq)
{
	kqlock_held(kq);

	/* only plain kqfiles come through here */
	assert((kq->kqf_state & (KQ_WORKQ | KQ_WORKLOOP)) == 0);
	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) | DBG_FUNC_START,
	    VM_KERNEL_UNSLIDE_OR_PERM(kq), 0);

	/* wait to become the exclusive processing thread */
	while ((kq->kqf_state & (KQ_PROCESSING | KQ_DRAIN)) == KQ_PROCESSING) {
		kq->kqf_state |= KQ_PROCWAIT;
		/* the suppressed queue's address is used as the wait channel */
		lck_spin_sleep(&kq->kqf_lock, LCK_SLEEP_DEFAULT,
		    &kq->kqf_suppressed, THREAD_UNINT | THREAD_WAIT_NOREPORT);
	}

	if (kq->kqf_state & KQ_DRAIN) {
		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) | DBG_FUNC_END,
		    VM_KERNEL_UNSLIDE_OR_PERM(kq), 2);
		return EBADF;
	}

	/* Nobody else processing */

	/* anything left to process? */
	if (kq->kqf_count == 0) {
		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) | DBG_FUNC_END,
		    VM_KERNEL_UNSLIDE_OR_PERM(kq), 1);
		return -1;
	}

	/* convert to processing mode */
	kq->kqf_state |= KQ_PROCESSING;

	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) | DBG_FUNC_END,
	    VM_KERNEL_UNSLIDE_OR_PERM(kq), 0);
	return 0;
}
4717 
4718 /*
4719  * Try to end the processing, only called when a workq thread is attempting to
4720  * park (KEVENT_FLAG_PARKING is set).
4721  *
4722  * When returning -1, the kqworkq is setup again so that it is ready to be
4723  * processed.
4724  */
4725 static int
kqworkq_end_processing(struct kqworkq * kqwq,workq_threadreq_t kqr,int kevent_flags)4726 kqworkq_end_processing(struct kqworkq *kqwq, workq_threadreq_t kqr,
4727     int kevent_flags)
4728 {
4729 	if (kevent_flags & KEVENT_FLAG_PARKING) {
4730 		/*
4731 		 * if acknowledge events "succeeds" it means there are events,
4732 		 * which is a failure condition for end_processing.
4733 		 */
4734 		int rc = kqworkq_acknowledge_events(kqwq, kqr, kevent_flags,
4735 		    KQWQAE_END_PROCESSING);
4736 		if (rc == 0) {
4737 			return -1;
4738 		}
4739 	}
4740 
4741 	return 0;
4742 }
4743 
4744 /*
4745  * Try to end the processing, only called when a workq thread is attempting to
4746  * park (KEVENT_FLAG_PARKING is set).
4747  *
4748  * When returning -1, the kqworkq is setup again so that it is ready to be
4749  * processed (as if kqworkloop_begin_processing had just been called).
4750  *
4751  * If successful and KEVENT_FLAG_PARKING was set in the kevent_flags,
4752  * the kqworkloop is unbound from its servicer as a side effect.
4753  */
static int
kqworkloop_end_processing(struct kqworkloop *kqwl, int flags, int kevent_flags)
{
	struct kqueue *kq = &kqwl->kqwl_kqueue;
	workq_threadreq_t kqr = &kqwl->kqwl_request;
	int rc = 0;

	kqlock_held(kq);

	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_END) | DBG_FUNC_START,
	    kqwl->kqwl_dynamicid, 0, 0);

	if (kevent_flags & KEVENT_FLAG_PARKING) {
		thread_t thread = kqr_thread_fast(kqr);
		thread_qos_t qos_override;

		/*
		 * When KEVENT_FLAG_PARKING is set, we need to attempt
		 * an unbind while still under the lock.
		 *
		 * So we do everything kqworkloop_unbind() would do, but because
		 * we're inside kqueue_process(), if the workloop actually
		 * received events while our locks were dropped, we have
		 * the opportunity to fail the end processing and loop again.
		 *
		 * This avoids going through the process-wide workqueue lock
		 * hence scales better.
		 */
		assert(flags & KQ_PROCESSING);
		qos_override = kqworkloop_acknowledge_events(kqwl);
		kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_PARKING, qos_override);

		/*
		 * More events became pending (and nobody owns the workloop):
		 * fail end-processing so the caller loops instead of parking.
		 */
		if (kqwl->kqwl_wakeup_qos && !kqwl->kqwl_owner) {
			rc = -1; /* To indicate we should continue processing. */
		} else {
			if (kqr_thread_permanently_bound(kqr)) {
				/*
				 * For these, the actual soft unbind and bound thread park
				 * happen in the caller.
				 */
				kq->kq_state &= ~flags;
			} else {
				/*
				 * Unbind under the kqlock; the override drop is
				 * delayed until after the state is cleared.
				 */
				kqworkloop_unbind_locked(kqwl, thread, KQWL_OVERRIDE_DROP_DELAYED, 0);
				kqworkloop_release_live(kqwl);
				kq->kq_state &= ~flags;
				kqworkloop_unbind_delayed_override_drop(thread);
			}
		}
	} else {
		/* Not parking: leave processing mode and re-arm ready-to-kernel. */
		kq->kq_state &= ~flags;
		kq->kq_state |= KQ_R2K_ARMED;
		kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_RECOMPUTE_WAKEUP_QOS, 0);
	}

	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_END) | DBG_FUNC_END,
	    kqwl->kqwl_dynamicid, 0, 0);

	return rc;
}
4813 
4814 /*
4815  * Called with kqueue lock held.
4816  *
4817  * 0: no more events
4818  * -1: has more events
4819  * EBADF: kqueue is in draining mode
4820  */
static int
kqfile_end_processing(struct kqfile *kq)
{
	struct knote *kn;
	int procwait;

	kqlock_held(kq);

	/* only plain kqueue files end processing here, never workq/workloop */
	assert((kq->kqf_state & (KQ_WORKQ | KQ_WORKLOOP)) == 0);

	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_END),
	    VM_KERNEL_UNSLIDE_OR_PERM(kq), 0);

	/*
	 * Return suppressed knotes to their original state.
	 */
	while ((kn = TAILQ_FIRST(&kq->kqf_suppressed)) != NULL) {
		knote_unsuppress(kq, kn);
	}

	/* remember whether someone is waiting before we clear the flag */
	procwait = (kq->kqf_state & KQ_PROCWAIT);
	kq->kqf_state &= ~(KQ_PROCESSING | KQ_PROCWAIT);

	if (procwait) {
		/* first wake up any thread already waiting to process */
		thread_wakeup(&kq->kqf_suppressed);
	}

	/* draining kqueues report EBADF; otherwise -1 if events remain, 0 if not */
	if (kq->kqf_state & KQ_DRAIN) {
		return EBADF;
	}
	return kq->kqf_count != 0 ? -1 : 0;
}
4854 
4855 static int
kqueue_workloop_ctl_internal(proc_t p,uintptr_t cmd,uint64_t __unused options,struct kqueue_workloop_params * params,int * retval)4856 kqueue_workloop_ctl_internal(proc_t p, uintptr_t cmd, uint64_t __unused options,
4857     struct kqueue_workloop_params *params, int *retval)
4858 {
4859 	int error = 0;
4860 	struct kqworkloop *kqwl;
4861 	struct filedesc *fdp = &p->p_fd;
4862 	workq_threadreq_param_t trp = { };
4863 	struct workq_threadreq_extended_param_s trp_extended = {0};
4864 	integer_t trp_preadopt_priority = 0;
4865 	integer_t trp_preadopt_policy = 0;
4866 
4867 	switch (cmd) {
4868 	case KQ_WORKLOOP_CREATE:
4869 		if (!params->kqwlp_flags) {
4870 			error = EINVAL;
4871 			break;
4872 		}
4873 
4874 		if ((params->kqwlp_flags & KQ_WORKLOOP_CREATE_SCHED_PRI) &&
4875 		    (params->kqwlp_sched_pri < 1 ||
4876 		    params->kqwlp_sched_pri > 63 /* MAXPRI_USER */)) {
4877 			error = EINVAL;
4878 			break;
4879 		}
4880 
4881 		if ((params->kqwlp_flags & KQ_WORKLOOP_CREATE_SCHED_POL) &&
4882 		    invalid_policy(params->kqwlp_sched_pol)) {
4883 			error = EINVAL;
4884 			break;
4885 		}
4886 
4887 		if ((params->kqwlp_flags & KQ_WORKLOOP_CREATE_CPU_PERCENT) &&
4888 		    (params->kqwlp_cpu_percent <= 0 ||
4889 		    params->kqwlp_cpu_percent > 100 ||
4890 		    params->kqwlp_cpu_refillms <= 0 ||
4891 		    params->kqwlp_cpu_refillms > 0x00ffffff)) {
4892 			error = EINVAL;
4893 			break;
4894 		}
4895 
4896 		if (params->kqwlp_flags & KQ_WORKLOOP_CREATE_WITH_BOUND_THREAD) {
4897 			if (!bootarg_thread_bound_kqwl_support_enabled) {
4898 				error = ENOTSUP;
4899 				break;
4900 			}
4901 			trp.trp_flags |= TRP_BOUND_THREAD;
4902 		}
4903 
4904 		if (params->kqwlp_flags & KQ_WORKLOOP_CREATE_WORK_INTERVAL) {
4905 			/*
4906 			 * This flag serves the purpose of preadopting tg from work interval
4907 			 * on servicer/creator/bound thread at wakeup/creation time in kernel.
4908 			 *
4909 			 * Additionally, it helps the bound thread join the work interval
4910 			 * before it comes out to userspace for the first time.
4911 			 */
4912 			struct work_interval *work_interval = NULL;
4913 			kern_return_t kr;
4914 
4915 			kr = kern_port_name_to_work_interval(params->kqwl_wi_port,
4916 			    &work_interval);
4917 			if (kr != KERN_SUCCESS) {
4918 				error = EINVAL;
4919 				break;
4920 			}
4921 			/* work_interval has a +1 ref */
4922 
4923 			kr = kern_work_interval_get_policy(work_interval,
4924 			    &trp_preadopt_policy,
4925 			    &trp_preadopt_priority);
4926 			if (kr != KERN_SUCCESS) {
4927 				kern_work_interval_release(work_interval);
4928 				error = EINVAL;
4929 				break;
4930 			}
4931 			/* The work interval comes with scheduling policy. */
4932 			if (trp_preadopt_policy) {
4933 				trp.trp_flags |= TRP_POLICY;
4934 				trp.trp_pol = (uint8_t)trp_preadopt_policy;
4935 
4936 				trp.trp_flags |= TRP_PRIORITY;
4937 				trp.trp_pri = (uint8_t)trp_preadopt_priority;
4938 			}
4939 #if CONFIG_PREADOPT_TG
4940 			kr = kern_work_interval_get_thread_group(work_interval,
4941 			    &trp_extended.trp_permanent_preadopt_tg);
4942 			if (kr != KERN_SUCCESS) {
4943 				kern_work_interval_release(work_interval);
4944 				error = EINVAL;
4945 				break;
4946 			}
4947 			/*
4948 			 * In case of KERN_SUCCESS, we take
4949 			 * : +1 ref on a thread group backing this work interval
4950 			 * via kern_work_interval_get_thread_group and pass it on to kqwl.
4951 			 * If, for whatever reasons, kqworkloop_get_or_create fails and we
4952 			 * get back this ref, we release them before returning.
4953 			 */
4954 #endif
4955 			if (trp.trp_flags & TRP_BOUND_THREAD) {
4956 				/*
4957 				 * For TRP_BOUND_THREAD, we pass +1 ref on the work_interval on to
4958 				 * kqwl so the bound thread can join it before coming out to
4959 				 * userspace.
4960 				 * If, for whatever reasons, kqworkloop_get_or_create fails and we
4961 				 * get back this ref, we release them before returning.
4962 				 */
4963 				trp_extended.trp_work_interval = work_interval;
4964 			} else {
4965 				kern_work_interval_release(work_interval);
4966 			}
4967 		}
4968 
4969 		if (!(trp.trp_flags & (TRP_POLICY | TRP_PRIORITY))) {
4970 			/*
4971 			 * We always prefer scheduling policy + priority that comes with
4972 			 * a work interval. It it does not exist, we fallback to what the user
4973 			 * has asked.
4974 			 */
4975 			if (params->kqwlp_flags & KQ_WORKLOOP_CREATE_SCHED_PRI) {
4976 				trp.trp_flags |= TRP_PRIORITY;
4977 				trp.trp_pri = (uint8_t)params->kqwlp_sched_pri;
4978 			}
4979 			if (params->kqwlp_flags & KQ_WORKLOOP_CREATE_SCHED_POL) {
4980 				trp.trp_flags |= TRP_POLICY;
4981 				trp.trp_pol = (uint8_t)params->kqwlp_sched_pol;
4982 			}
4983 			if (params->kqwlp_flags & KQ_WORKLOOP_CREATE_CPU_PERCENT) {
4984 				trp.trp_flags |= TRP_CPUPERCENT;
4985 				trp.trp_cpupercent = (uint8_t)params->kqwlp_cpu_percent;
4986 				trp.trp_refillms = params->kqwlp_cpu_refillms;
4987 			}
4988 		}
4989 
4990 #if CONFIG_PREADOPT_TG
4991 		if ((trp.trp_flags == 0) &&
4992 		    (trp_extended.trp_permanent_preadopt_tg == NULL)) {
4993 #else
4994 		if (trp.trp_flags == 0) {
4995 #endif
4996 			error = EINVAL;
4997 			break;
4998 		}
4999 
5000 		error = kqworkloop_get_or_create(p, params->kqwlp_id, &trp,
5001 		    &trp_extended,
5002 		    KEVENT_FLAG_DYNAMIC_KQUEUE | KEVENT_FLAG_WORKLOOP |
5003 		    KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST, &kqwl);
5004 		if (error) {
5005 			/* kqworkloop_get_or_create did not consume these refs. */
5006 #if CONFIG_PREADOPT_TG
5007 			if (trp_extended.trp_permanent_preadopt_tg) {
5008 				thread_group_release(trp_extended.trp_permanent_preadopt_tg);
5009 			}
5010 #endif
5011 			if (trp_extended.trp_work_interval) {
5012 				kern_work_interval_release(trp_extended.trp_work_interval);
5013 			}
5014 			break;
5015 		}
5016 
5017 		if (!fdt_flag_test(fdp, FD_WORKLOOP)) {
5018 			/* FD_WORKLOOP indicates we've ever created a workloop
5019 			 * via this syscall but its only ever added to a process, never
5020 			 * removed.
5021 			 */
5022 			proc_fdlock(p);
5023 			fdt_flag_set(fdp, FD_WORKLOOP);
5024 			proc_fdunlock(p);
5025 		}
5026 		break;
5027 	case KQ_WORKLOOP_DESTROY:
5028 		error = kqworkloop_get_or_create(p, params->kqwlp_id, NULL, NULL,
5029 		    KEVENT_FLAG_DYNAMIC_KQUEUE | KEVENT_FLAG_WORKLOOP |
5030 		    KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST, &kqwl);
5031 		if (error) {
5032 			break;
5033 		}
5034 		kqlock(kqwl);
5035 		trp.trp_value = kqwl->kqwl_params;
5036 		if (trp.trp_flags && !(trp.trp_flags & TRP_RELEASED)) {
5037 			trp.trp_flags |= TRP_RELEASED;
5038 			kqwl->kqwl_params = trp.trp_value;
5039 			if (trp.trp_flags & TRP_BOUND_THREAD) {
5040 				kqworkloop_bound_thread_wakeup(kqwl);
5041 			}
5042 			kqworkloop_release_live(kqwl);
5043 		} else {
5044 			error = EINVAL;
5045 		}
5046 		kqunlock(kqwl);
5047 		kqworkloop_release(kqwl);
5048 		break;
5049 	}
5050 	*retval = 0;
5051 	return error;
5052 }
5053 
5054 int
5055 kqueue_workloop_ctl(proc_t p, struct kqueue_workloop_ctl_args *uap, int *retval)
5056 {
5057 	struct kqueue_workloop_params params = {
5058 		.kqwlp_id = 0,
5059 	};
5060 	if (uap->sz < sizeof(params.kqwlp_version)) {
5061 		return EINVAL;
5062 	}
5063 
5064 	size_t copyin_sz = MIN(sizeof(params), uap->sz);
5065 	int rv = copyin(uap->addr, &params, copyin_sz);
5066 	if (rv) {
5067 		return rv;
5068 	}
5069 
5070 	if (params.kqwlp_version != (int)uap->sz) {
5071 		return EINVAL;
5072 	}
5073 
5074 	return kqueue_workloop_ctl_internal(p, uap->cmd, uap->options, &params,
5075 	           retval);
5076 }
5077 
5078 static int
5079 kqueue_select(struct fileproc *fp, int which, void *wql, __unused vfs_context_t ctx)
5080 {
5081 	struct kqfile *kq = (struct kqfile *)fp_get_data(fp);
5082 	int retnum = 0;
5083 
5084 	assert((kq->kqf_state & (KQ_WORKLOOP | KQ_WORKQ)) == 0);
5085 
5086 	if (which == FREAD) {
5087 		kqlock(kq);
5088 		if (kqfile_begin_processing(kq) == 0) {
5089 			retnum = kq->kqf_count;
5090 			kqfile_end_processing(kq);
5091 		} else if ((kq->kqf_state & KQ_DRAIN) == 0) {
5092 			selrecord(kq->kqf_p, &kq->kqf_sel, wql);
5093 		}
5094 		kqunlock(kq);
5095 	}
5096 	return retnum;
5097 }
5098 
5099 /*
5100  * kqueue_close -
5101  */
5102 static int
5103 kqueue_close(struct fileglob *fg, __unused vfs_context_t ctx)
5104 {
5105 	struct kqfile *kqf = fg_get_data(fg);
5106 
5107 	assert((kqf->kqf_state & (KQ_WORKLOOP | KQ_WORKQ)) == 0);
5108 	kqlock(kqf);
5109 	selthreadclear(&kqf->kqf_sel);
5110 	kqunlock(kqf);
5111 	kqueue_dealloc(&kqf->kqf_kqueue);
5112 	fg_set_data(fg, NULL);
5113 	return 0;
5114 }
5115 
5116 /*
5117  * Max depth of the nested kq path that can be created.
5118  * Note that this has to be less than the size of kq_level
5119  * to avoid wrapping around and mislabeling the level. We also
5120  * want to be aggressive about this so that we don't overflow the
5121  * kernel stack while posting kevents
5122  */
5123 #define MAX_NESTED_KQ 10
5124 
5125 /*
5126  * The callers has taken a use-count reference on this kqueue and will donate it
5127  * to the kqueue we are being added to.  This keeps the kqueue from closing until
5128  * that relationship is torn down.
5129  */
static int
kqueue_kqfilter(struct fileproc *fp, struct knote *kn,
    __unused struct kevent_qos_s *kev)
{
	struct kqfile *kqf = (struct kqfile *)fp_get_data(fp);
	struct kqueue *kq = &kqf->kqf_kqueue;
	struct kqueue *parentkq = knote_get_kq(kn);

	assert((kqf->kqf_state & (KQ_WORKLOOP | KQ_WORKQ)) == 0);

	/* a kqueue cannot watch itself, and only supports EVFILT_READ */
	if (parentkq == kq || kn->kn_filter != EVFILT_READ) {
		knote_set_error(kn, EINVAL);
		return 0;
	}

	/*
	 * We have to avoid creating a cycle when nesting kqueues
	 * inside another.  Rather than trying to walk the whole
	 * potential DAG of nested kqueues, we just use a simple
	 * ceiling protocol.  When a kqueue is inserted into another,
	 * we check that the (future) parent is not already nested
	 * into another kqueue at a lower level than the potential
	 * child (because it could indicate a cycle).  If that test
	 * passes, we just mark the nesting levels accordingly.
	 *
	 * Only up to MAX_NESTED_KQ can be nested.
	 *
	 * Note: kqworkq and kqworkloop cannot be nested and have reused their
	 *       kq_level field, so ignore these as parent.
	 */

	kqlock(parentkq);

	if ((parentkq->kq_state & (KQ_WORKQ | KQ_WORKLOOP)) == 0) {
		/* parent nested at a lower level than the child: possible cycle */
		if (parentkq->kq_level > 0 &&
		    parentkq->kq_level < kq->kq_level) {
			kqunlock(parentkq);
			knote_set_error(kn, EINVAL);
			return 0;
		}

		/* set parent level appropriately */
		uint16_t plevel = (parentkq->kq_level == 0)? 2: parentkq->kq_level;
		if (plevel < kq->kq_level + 1) {
			/* raising the parent would exceed the nesting ceiling */
			if (kq->kq_level + 1 > MAX_NESTED_KQ) {
				kqunlock(parentkq);
				knote_set_error(kn, EINVAL);
				return 0;
			}
			plevel = kq->kq_level + 1;
		}

		parentkq->kq_level = plevel;
	}

	kqunlock(parentkq);

	kn->kn_filtid = EVFILTID_KQREAD;
	kqlock(kq);
	KNOTE_ATTACH(&kqf->kqf_sel.si_note, kn);
	/* indicate nesting in child, if needed */
	if (kq->kq_level == 0) {
		kq->kq_level = 1;
	}

	/* report "active" if the child kqueue currently has pending events */
	int count = kq->kq_count;
	kqunlock(kq);
	return count > 0;
}
5199 
/*
 * Wake up everything that may be waiting on this kqfile: select() waiters,
 * kqueue_scan() sleepers, processing waiters (on NOTE_REVOKE), and any
 * parent kqueues / select sets this kqueue is nested inside.
 */
__attribute__((noinline))
static void
kqfile_wakeup(struct kqfile *kqf, long hint, wait_result_t wr)
{
	/* wakeup a thread waiting on this queue */
	selwakeup(&kqf->kqf_sel);

	/* wake up threads in kqueue_scan() */
	if (kqf->kqf_state & KQ_SLEEP) {
		kqf->kqf_state &= ~KQ_SLEEP;
		/* wr propagates the wait result (e.g. THREAD_RESTART on drain) */
		thread_wakeup_with_result(&kqf->kqf_count, wr);
	}

	if (hint == NOTE_REVOKE) {
		/* wakeup threads waiting their turn to process */
		if (kqf->kqf_state & KQ_PROCWAIT) {
			assert(kqf->kqf_state & KQ_PROCESSING);
			kqf->kqf_state &= ~KQ_PROCWAIT;
			thread_wakeup(&kqf->kqf_suppressed);
		}

		/* no need to KNOTE: knote_fdclose() takes care of it */
	} else {
		/* wakeup other kqueues/select sets we're inside */
		KNOTE(&kqf->kqf_sel.si_note, hint);
	}
}
5227 
5228 /*
5229  * kqueue_drain - called when kq is closed
5230  */
5231 static int
5232 kqueue_drain(struct fileproc *fp, __unused vfs_context_t ctx)
5233 {
5234 	struct kqfile *kqf = (struct kqfile *)fp_get_data(fp);
5235 
5236 	assert((kqf->kqf_state & (KQ_WORKLOOP | KQ_WORKQ)) == 0);
5237 
5238 	kqlock(kqf);
5239 	kqf->kqf_state |= KQ_DRAIN;
5240 	kqfile_wakeup(kqf, NOTE_REVOKE, THREAD_RESTART);
5241 	kqunlock(kqf);
5242 	return 0;
5243 }
5244 
5245 int
5246 kqueue_stat(struct kqueue *kq, void *ub, int isstat64, proc_t p)
5247 {
5248 	assert((kq->kq_state & (KQ_WORKLOOP | KQ_WORKQ)) == 0);
5249 
5250 	kqlock(kq);
5251 	if (isstat64 != 0) {
5252 		struct stat64 *sb64 = (struct stat64 *)ub;
5253 
5254 		bzero((void *)sb64, sizeof(*sb64));
5255 		sb64->st_size = kq->kq_count;
5256 		if (kq->kq_state & KQ_KEV_QOS) {
5257 			sb64->st_blksize = sizeof(struct kevent_qos_s);
5258 		} else if (kq->kq_state & KQ_KEV64) {
5259 			sb64->st_blksize = sizeof(struct kevent64_s);
5260 		} else if (IS_64BIT_PROCESS(p)) {
5261 			sb64->st_blksize = sizeof(struct user64_kevent);
5262 		} else {
5263 			sb64->st_blksize = sizeof(struct user32_kevent);
5264 		}
5265 		sb64->st_mode = S_IFIFO;
5266 	} else {
5267 		struct stat *sb = (struct stat *)ub;
5268 
5269 		bzero((void *)sb, sizeof(*sb));
5270 		sb->st_size = kq->kq_count;
5271 		if (kq->kq_state & KQ_KEV_QOS) {
5272 			sb->st_blksize = sizeof(struct kevent_qos_s);
5273 		} else if (kq->kq_state & KQ_KEV64) {
5274 			sb->st_blksize = sizeof(struct kevent64_s);
5275 		} else if (IS_64BIT_PROCESS(p)) {
5276 			sb->st_blksize = sizeof(struct user64_kevent);
5277 		} else {
5278 			sb->st_blksize = sizeof(struct user32_kevent);
5279 		}
5280 		sb->st_mode = S_IFIFO;
5281 	}
5282 	kqunlock(kq);
5283 	return 0;
5284 }
5285 
5286 static inline bool
5287 kqueue_threadreq_can_use_ast(struct kqueue *kq)
5288 {
5289 	if (current_proc() == kq->kq_p) {
5290 		/*
5291 		 * Setting an AST from a non BSD syscall is unsafe: mach_msg_trap() can
5292 		 * do combined send/receive and in the case of self-IPC, the AST may bet
5293 		 * set on a thread that will not return to userspace and needs the
5294 		 * thread the AST would create to unblock itself.
5295 		 *
5296 		 * At this time, we really want to target:
5297 		 *
5298 		 * - kevent variants that can cause thread creations, and dispatch
5299 		 *   really only uses kevent_qos and kevent_id,
5300 		 *
5301 		 * - workq_kernreturn (directly about thread creations)
5302 		 *
5303 		 * - bsdthread_ctl which is used for qos changes and has direct impact
5304 		 *   on the creator thread scheduling decisions.
5305 		 */
5306 		switch (current_uthread()->syscall_code) {
5307 		case SYS_kevent_qos:
5308 		case SYS_kevent_id:
5309 		case SYS_workq_kernreturn:
5310 		case SYS_bsdthread_ctl:
5311 			return true;
5312 		}
5313 	}
5314 	return false;
5315 }
5316 
5317 /*
5318  * Interact with the pthread kext to request a servicing there at a specific QoS
5319  * level.
5320  *
5321  * - Caller holds the kqlock
5322  *
5323  * - May be called with the kqueue's wait queue set locked,
5324  *   so cannot do anything that could recurse on that.
5325  */
static void
kqueue_threadreq_initiate(kqueue_t kqu, workq_threadreq_t kqr,
    kq_index_t qos, int flags)
{
	/* there must not already be a thread bound or requested */
	assert(kqr_thread(kqr) == THREAD_NULL);
	assert(!kqr_thread_requested(kqr));
	struct turnstile *ts = TURNSTILE_NULL;

	if (workq_is_exiting(kqu.kq->kq_p)) {
		return;
	}

	kqlock_held(kqu);

	if (kqu.kq->kq_state & KQ_WORKLOOP) {
		struct kqworkloop *kqwl = kqu.kqwl;

		assert(kqwl->kqwl_owner == THREAD_NULL);
		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_THREQUEST),
		    kqwl->kqwl_dynamicid, 0, qos, kqwl->kqwl_wakeup_qos);
		ts = kqwl->kqwl_turnstile;
		/* Add a thread request reference on the kqueue. */
		kqworkloop_retain(kqwl);

#if CONFIG_PREADOPT_TG
		thread_group_qos_t kqwl_preadopt_tg = os_atomic_load(
			&kqwl->kqwl_preadopt_tg, relaxed);
		if (KQWL_HAS_PERMANENT_PREADOPTED_TG(kqwl_preadopt_tg)) {
			/*
			 * This kqwl has been permanently configured with a thread group.
			 * See kqworkloops with scheduling parameters.
			 */
			flags |= WORKQ_THREADREQ_REEVALUATE_PREADOPT_TG;
		} else {
			/*
			 * This thread is the one which is ack-ing the thread group on the kqwl
			 * under the kqlock and will take action accordingly, pairs with the
			 * release barrier in kqueue_set_preadopted_thread_group
			 */
			uint16_t tg_acknowledged;
			if (os_atomic_cmpxchgv(&kqwl->kqwl_preadopt_tg_needs_redrive,
			    KQWL_PREADOPT_TG_NEEDS_REDRIVE, KQWL_PREADOPT_TG_CLEAR_REDRIVE,
			    &tg_acknowledged, acquire)) {
				flags |= WORKQ_THREADREQ_REEVALUATE_PREADOPT_TG;
			}
		}
#endif
	} else {
		assert(kqu.kq->kq_state & KQ_WORKQ);
		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWQ_THREQUEST), -1, 0, qos,
		    !TAILQ_EMPTY(&kqu.kqwq->kqwq_queue[kqr->tr_kq_qos_index - 1]));
	}

	/*
	 * New-style thread request supported.
	 * Provide the pthread kext a pointer to a workq_threadreq_s structure for
	 * its use until a corresponding kqueue_threadreq_bind callback.
	 */
	if (kqueue_threadreq_can_use_ast(kqu.kq)) {
		flags |= WORKQ_THREADREQ_SET_AST_ON_FAILURE;
	}
	if (qos == KQWQ_QOS_MANAGER) {
		qos = WORKQ_THREAD_QOS_MANAGER;
	}

	if (!workq_kern_threadreq_initiate(kqu.kq->kq_p, kqr, ts, qos, flags)) {
		/*
		 * Process is shutting down or exec'ing.
		 * All the kqueues are going to be cleaned up
		 * soon. Forget we even asked for a thread -
		 * and make sure we don't ask for more.
		 */
		kqu.kq->kq_state &= ~KQ_R2K_ARMED;
		/* drop the thread request reference taken above */
		kqueue_release_live(kqu);
	}
}
5402 
5403 /*
5404  * kqueue_threadreq_bind_prepost - prepost the bind to kevent
5405  *
5406  * This is used when kqueue_threadreq_bind may cause a lock inversion.
5407  */
__attribute__((always_inline))
void
kqueue_threadreq_bind_prepost(struct proc *p __unused, workq_threadreq_t kqr,
    struct uthread *ut)
{
	/* record the pending binding; kqueue_threadreq_bind_commit finishes it */
	ut->uu_kqr_bound = kqr;
	kqr->tr_thread = get_machthread(ut);
	kqr->tr_state = WORKQ_TR_STATE_BINDING;
}
5417 
5418 /*
5419  * kqueue_threadreq_bind_commit - commit a bind prepost
5420  *
5421  * The workq code has to commit any binding prepost before the thread has
5422  * a chance to come back to userspace (and do kevent syscalls) or be aborted.
5423  */
5424 void
5425 kqueue_threadreq_bind_commit(struct proc *p, thread_t thread)
5426 {
5427 	struct uthread *ut = get_bsdthread_info(thread);
5428 	workq_threadreq_t kqr = ut->uu_kqr_bound;
5429 	kqueue_t kqu = kqr_kqueue(p, kqr);
5430 
5431 	kqlock(kqu);
5432 	if (kqr->tr_state == WORKQ_TR_STATE_BINDING) {
5433 		kqueue_threadreq_bind(p, kqr, thread, 0);
5434 	}
5435 	kqunlock(kqu);
5436 }
5437 
5438 void
5439 kqworkloop_bound_thread_terminate(workq_threadreq_t kqr,
5440     uint16_t *uu_workq_flags_orig)
5441 {
5442 	struct uthread *uth = get_bsdthread_info(kqr->tr_thread);
5443 	struct kqworkloop *kqwl = __container_of(kqr, struct kqworkloop, kqwl_request);
5444 
5445 	assert(uth == current_uthread());
5446 
5447 	kqlock(kqwl);
5448 
5449 	*uu_workq_flags_orig = uth->uu_workq_flags;
5450 
5451 	uth->uu_workq_flags &= ~UT_WORKQ_NEW;
5452 	uth->uu_workq_flags &= ~UT_WORKQ_WORK_INTERVAL_JOINED;
5453 	uth->uu_workq_flags &= ~UT_WORKQ_WORK_INTERVAL_FAILED;
5454 
5455 	workq_kern_bound_thread_reset_pri(NULL, uth);
5456 
5457 	kqunlock(kqwl);
5458 }
5459 
5460 /*
5461  * This is called from kqueue_process with kqlock held.
5462  */
/*
 * This is called from kqueue_process with kqlock held.
 * Parks (or terminates) the permanently bound servicer thread; never returns.
 */
__attribute__((noreturn, noinline))
static void
kqworkloop_bound_thread_park(struct kqworkloop *kqwl, thread_t thread)
{
	assert(thread == current_thread());

	kqlock_held(kqwl);

	/* no events may be pending when parking */
	assert(!kqwl->kqwl_count);

	/*
	 * kevent entry points will take a reference on workloops so we need to
	 * undo it before we park for good.
	 */
	kqworkloop_release_live(kqwl);

	workq_threadreq_t kqr = &kqwl->kqwl_request;
	workq_threadreq_param_t trp = kqueue_threadreq_workloop_param(kqr);

	if (trp.trp_flags & TRP_RELEASED) {
		/*
		 * We need this check since the kqlock is dropped and retaken
		 * multiple times during kqueue_process and because KQ_SLEEP is not
		 * set, kqworkloop_bound_thread_wakeup is going to be a no-op.
		 */
		kqunlock(kqwl);
		/* workloop was destroyed: terminate the bound thread instead */
		workq_kern_bound_thread_terminate(kqr);
	} else {
		/* soft-unbind under the kqlock, then park in the workq subsystem */
		kqworkloop_unbind_locked(kqwl,
		    thread, KQWL_OVERRIDE_DROP_DELAYED, KQUEUE_THREADREQ_UNBIND_SOFT);
		workq_kern_bound_thread_park(kqr);
	}
	__builtin_unreachable();
}
5497 
5498 /*
5499  * A helper function for pthread workqueue subsystem.
5500  *
5501  * This is used to keep things that the workq code needs to do after
5502  * the bound thread's assert_wait minimum.
5503  */
void
kqworkloop_bound_thread_park_prepost(workq_threadreq_t kqr)
{
	assert(current_thread() == kqr->tr_thread);

	struct kqworkloop *kqwl = __container_of(kqr, struct kqworkloop, kqwl_request);

	kqlock_held(kqwl);

	/* mark the workloop asleep so a later wakeup knows a parker exists */
	kqwl->kqwl_state |= KQ_SLEEP;

	/* uu_kqueue_override is protected under kqlock. */
	kqworkloop_unbind_delayed_override_drop(kqr->tr_thread);

	kqunlock(kqwl);
}
5520 
5521 /*
5522  * A helper function for pthread workqueue subsystem.
5523  *
5524  * This is used to keep things that the workq code needs to do after
5525  * the bound thread's assert_wait minimum.
5526  */
void
kqworkloop_bound_thread_park_commit(workq_threadreq_t kqr,
    event_t event,
    thread_continue_t continuation)
{
	assert(current_thread() == kqr->tr_thread);

	struct kqworkloop *kqwl = __container_of(kqr, struct kqworkloop, kqwl_request);
	struct uthread *uth = get_bsdthread_info(kqr->tr_thread);

	kqlock(kqwl);
	if (!(kqwl->kqwl_state & KQ_SLEEP)) {
		/*
		 * When we dropped the kqlock to unset the voucher, someone came
		 * around and made us runnable.  But because we weren't waiting on the
		 * event their thread_wakeup() was ineffectual.  To correct for that,
		 * we just run the continuation ourselves.
		 */
		assert((uth->uu_workq_flags & (UT_WORKQ_RUNNING | UT_WORKQ_DYING)));
		if (uth->uu_workq_flags & UT_WORKQ_DYING) {
			/* a dying bound thread implies the workloop was released */
			__assert_only workq_threadreq_param_t trp = kqueue_threadreq_workloop_param(kqr);
			assert(trp.trp_flags & TRP_RELEASED);
		}
		kqunlock(kqwl);
		continuation(NULL, THREAD_AWAKENED);
	} else {
		/* still asleep: wait on the event and block into the continuation */
		assert((uth->uu_workq_flags & (UT_WORKQ_RUNNING | UT_WORKQ_DYING)) == 0);
		thread_set_pending_block_hint(get_machthread(uth),
		    kThreadWaitParkedBoundWorkQueue);
		assert_wait(event, THREAD_INTERRUPTIBLE);
		kqunlock(kqwl);
		thread_block(continuation);
	}
}
5561 
/*
 * Modify an outstanding (still pending) thread request's QoS, forwarding
 * preadopt thread-group re-evaluation flags to the workq subsystem.
 * Called with the kqlock held.
 */
static void
kqueue_threadreq_modify(kqueue_t kqu, workq_threadreq_t kqr, kq_index_t qos,
    workq_kern_threadreq_flags_t flags)
{
	assert(kqr_thread_requested_pending(kqr));

	kqlock_held(kqu);

	if (kqueue_threadreq_can_use_ast(kqu.kq)) {
		flags |= WORKQ_THREADREQ_SET_AST_ON_FAILURE;
	}

#if CONFIG_PREADOPT_TG
	if (kqu.kq->kq_state & KQ_WORKLOOP) {
		struct kqworkloop *kqwl = kqu.kqwl;
		thread_group_qos_t kqwl_preadopt_tg = os_atomic_load(
			&kqwl->kqwl_preadopt_tg, relaxed);
		if (KQWL_HAS_PERMANENT_PREADOPTED_TG(kqwl_preadopt_tg)) {
			/*
			 * This kqwl has been permanently configured with a thread group.
			 * See kqworkloops with scheduling parameters.
			 */
			flags |= WORKQ_THREADREQ_REEVALUATE_PREADOPT_TG;
		} else {
			uint16_t tg_ack_status;
			/*
			 * This thread is the one which is ack-ing the thread group on the kqwl
			 * under the kqlock and will take action accordingly, needs acquire
			 * barrier.
			 */
			if (os_atomic_cmpxchgv(&kqwl->kqwl_preadopt_tg_needs_redrive, KQWL_PREADOPT_TG_NEEDS_REDRIVE,
			    KQWL_PREADOPT_TG_CLEAR_REDRIVE, &tg_ack_status, acquire)) {
				flags |= WORKQ_THREADREQ_REEVALUATE_PREADOPT_TG;
			}
		}
	}
#endif

	workq_kern_threadreq_modify(kqu.kq->kq_p, kqr, qos, flags);
}
5602 
5603 /*
5604  * kqueue_threadreq_bind - bind thread to processing kqrequest
5605  *
5606  * The provided thread will be responsible for delivering events
5607  * associated with the given kqrequest.  Bind it and get ready for
5608  * the thread to eventually arrive.
5609  */
5610 void
5611 kqueue_threadreq_bind(struct proc *p, workq_threadreq_t kqr, thread_t thread,
5612     unsigned int flags)
5613 {
5614 	kqueue_t kqu = kqr_kqueue(p, kqr);
5615 	struct uthread *ut = get_bsdthread_info(thread);
5616 
5617 	kqlock_held(kqu);
5618 
5619 	assert(ut->uu_kqueue_override == 0);
5620 
5621 	if (kqr->tr_state == WORKQ_TR_STATE_BINDING) {
5622 		assert(ut->uu_kqr_bound == kqr);
5623 		assert(kqr->tr_thread == thread);
5624 	} else if (kqr->tr_state == WORKQ_TR_STATE_BOUND) {
5625 		assert(flags & KQUEUE_THREADREQ_BIND_SOFT);
5626 		assert(kqr_thread_permanently_bound(kqr));
5627 	} else {
5628 		assert(kqr_thread_requested_pending(kqr));
5629 		assert(kqr->tr_thread == THREAD_NULL);
5630 		assert(ut->uu_kqr_bound == NULL);
5631 		ut->uu_kqr_bound = kqr;
5632 		kqr->tr_thread = thread;
5633 	}
5634 
5635 	kqr->tr_state = WORKQ_TR_STATE_BOUND;
5636 
5637 	if (kqu.kq->kq_state & KQ_WORKLOOP) {
5638 		struct turnstile *ts = kqu.kqwl->kqwl_turnstile;
5639 
5640 		if (__improbable(thread == kqu.kqwl->kqwl_owner)) {
5641 			/*
5642 			 * <rdar://problem/38626999> shows that asserting here is not ok.
5643 			 *
5644 			 * This is not supposed to happen for correct use of the interface,
5645 			 * but it is sadly possible for userspace (with the help of memory
5646 			 * corruption, such as over-release of a dispatch queue) to make
5647 			 * the creator thread the "owner" of a workloop.
5648 			 *
5649 			 * Once that happens, and that creator thread picks up the same
5650 			 * workloop as a servicer, we trip this codepath. We need to fixup
5651 			 * the state to forget about this thread being the owner, as the
5652 			 * entire workloop state machine expects servicers to never be
5653 			 * owners and everything would basically go downhill from here.
5654 			 */
5655 			kqu.kqwl->kqwl_owner = THREAD_NULL;
5656 			if (kqworkloop_override(kqu.kqwl)) {
5657 				thread_drop_kevent_override(thread);
5658 			}
5659 		}
5660 
5661 		if (ts && (flags & KQUEUE_THREADREQ_BIND_NO_INHERITOR_UPDATE) == 0) {
5662 			/*
5663 			 * Past this point, the interlock is the kq req lock again,
5664 			 * so we can fix the inheritor for good.
5665 			 */
5666 			filt_wlupdate_inheritor(kqu.kqwl, ts, TURNSTILE_IMMEDIATE_UPDATE);
5667 			turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
5668 		}
5669 
5670 		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_BIND), kqu.kqwl->kqwl_dynamicid,
5671 		    thread_tid(thread), kqr->tr_kq_qos_index,
5672 		    (kqr->tr_kq_override_index << 16) | kqwl->kqwl_wakeup_qos);
5673 
5674 		ut->uu_kqueue_override = kqr->tr_kq_override_index;
5675 		if (kqr->tr_kq_override_index) {
5676 			thread_add_servicer_override(thread, kqr->tr_kq_override_index);
5677 		}
5678 
5679 #if CONFIG_PREADOPT_TG
5680 		/* Remove reference from kqwl and mark it as bound with the SENTINEL */
5681 		thread_group_qos_t old_tg;
5682 		thread_group_qos_t new_tg;
5683 		int ret = os_atomic_rmw_loop(kqr_preadopt_thread_group_addr(kqr), old_tg, new_tg, relaxed, {
5684 			if ((old_tg == KQWL_PREADOPTED_TG_NEVER) || KQWL_HAS_PERMANENT_PREADOPTED_TG(old_tg)) {
5685 			        /*
5686 			         * Either an app or a kqwl permanently configured with a thread group.
5687 			         * Nothing to do.
5688 			         */
5689 			        os_atomic_rmw_loop_give_up(break);
5690 			}
5691 			assert(old_tg != KQWL_PREADOPTED_TG_PROCESSED);
5692 			new_tg = KQWL_PREADOPTED_TG_SENTINEL;
5693 		});
5694 
5695 		if (ret) {
5696 			KQWL_PREADOPT_TG_HISTORY_WRITE_ENTRY(kqu.kqwl, KQWL_PREADOPT_OP_SERVICER_BIND, old_tg, new_tg);
5697 
5698 			if (KQWL_HAS_VALID_PREADOPTED_TG(old_tg)) {
5699 				struct thread_group *tg = KQWL_GET_PREADOPTED_TG(old_tg);
5700 				assert(tg != NULL);
5701 
5702 				thread_set_preadopt_thread_group(thread, tg);
5703 				thread_group_release_live(tg); // The thread has a reference
5704 			} else {
5705 				/*
5706 				 * The thread may already have a preadopt thread group on it -
5707 				 * we need to make sure to clear that.
5708 				 */
5709 				thread_set_preadopt_thread_group(thread, NULL);
5710 			}
5711 
5712 			/* We have taken action on the preadopted thread group set on the
5713 			 * set on the kqwl, clear any redrive requests */
5714 			os_atomic_store(&kqu.kqwl->kqwl_preadopt_tg_needs_redrive, KQWL_PREADOPT_TG_CLEAR_REDRIVE, relaxed);
5715 		} else {
5716 			if (KQWL_HAS_PERMANENT_PREADOPTED_TG(old_tg)) {
5717 				struct thread_group *tg = KQWL_GET_PREADOPTED_TG(old_tg);
5718 				assert(tg != NULL);
5719 				/*
5720 				 * For KQUEUE_THREADREQ_BIND_SOFT, technically the following
5721 				 * set_preadopt should be a no-op since this bound servicer thread
5722 				 * preadopts kqwl's permanent tg at first-initial bind time and
5723 				 * never leaves it until its termination.
5724 				 */
5725 				thread_set_preadopt_thread_group(thread, tg);
5726 				/*
5727 				 * From this point on, kqwl and thread both have +1 ref on this tg.
5728 				 */
5729 			}
5730 		}
5731 #endif
5732 		kqueue_update_iotier_override(kqu);
5733 	} else {
5734 		assert(kqr->tr_kq_override_index == 0);
5735 
5736 #if CONFIG_PREADOPT_TG
5737 		/*
5738 		 * The thread may have a preadopt thread group on it already because it
5739 		 * got tagged with it as a creator thread. So we need to make sure to
5740 		 * clear that since we don't have preadopt thread groups for non-kqwl
5741 		 * cases
5742 		 */
5743 		thread_set_preadopt_thread_group(thread, NULL);
5744 #endif
5745 		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWQ_BIND), -1,
5746 		    thread_tid(thread), kqr->tr_kq_qos_index,
5747 		    (kqr->tr_kq_override_index << 16) |
5748 		    !TAILQ_EMPTY(&kqu.kqwq->kqwq_queue[kqr->tr_kq_qos_index - 1]));
5749 	}
5750 }
5751 
5752 /*
5753  * kqueue_threadreq_cancel - abort a pending thread request
5754  *
5755  * Called when exiting/exec'ing. Forget our pending request.
5756  */
5757 void
5758 kqueue_threadreq_cancel(struct proc *p, workq_threadreq_t kqr)
5759 {
5760 	kqueue_release(kqr_kqueue(p, kqr));
5761 }
5762 
5763 workq_threadreq_param_t
5764 kqueue_threadreq_workloop_param(workq_threadreq_t kqr)
5765 {
5766 	struct kqworkloop *kqwl;
5767 	workq_threadreq_param_t trp;
5768 
5769 	assert(kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP);
5770 	kqwl = __container_of(kqr, struct kqworkloop, kqwl_request);
5771 	trp.trp_value = kqwl->kqwl_params;
5772 	return trp;
5773 }
5774 
5775 /*
5776  *	kqueue_threadreq_unbind - unbind thread from processing kqueue
5777  *
5778  *	End processing the per-QoS bucket of events and allow other threads
5779  *	to be requested for future servicing.
5780  *
5781  *	caller holds a reference on the kqueue.
5782  */
5783 void
5784 kqueue_threadreq_unbind(struct proc *p, workq_threadreq_t kqr)
5785 {
5786 	if (kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
5787 		kqworkloop_unbind(kqr_kqworkloop(kqr));
5788 	} else {
5789 		kqworkq_unbind(p, kqr);
5790 	}
5791 }
5792 
5793 /*
5794  * If we aren't already busy processing events [for this QoS],
5795  * request workq thread support as appropriate.
5796  *
5797  * TBD - for now, we don't segregate out processing by QoS.
5798  *
5799  * - May be called with the kqueue's wait queue set locked,
5800  *   so cannot do anything that could recurse on that.
5801  */
5802 static void
5803 kqworkq_wakeup(struct kqworkq *kqwq, kq_index_t qos_index)
5804 {
5805 	workq_threadreq_t kqr = kqworkq_get_request(kqwq, qos_index);
5806 
5807 	/* convert to thread qos value */
5808 	assert(qos_index > 0 && qos_index <= KQWQ_NBUCKETS);
5809 
5810 	if (!kqr_thread_requested(kqr)) {
5811 		kqueue_threadreq_initiate(&kqwq->kqwq_kqueue, kqr, qos_index, 0);
5812 	}
5813 }
5814 
5815 /*
5816  * This represent the asynchronous QoS a given workloop contributes,
5817  * hence is the max of the current active knotes (override index)
5818  * and the workloop max qos (userspace async qos).
5819  */
5820 static kq_index_t
5821 kqworkloop_override(struct kqworkloop *kqwl)
5822 {
5823 	workq_threadreq_t kqr = &kqwl->kqwl_request;
5824 	return MAX(kqr->tr_kq_qos_index, kqr->tr_kq_override_index);
5825 }
5826 
5827 static inline void
5828 kqworkloop_request_fire_r2k_notification(struct kqworkloop *kqwl)
5829 {
5830 	workq_threadreq_t kqr = &kqwl->kqwl_request;
5831 
5832 	kqlock_held(kqwl);
5833 
5834 	if (kqwl->kqwl_state & KQ_R2K_ARMED) {
5835 		kqwl->kqwl_state &= ~KQ_R2K_ARMED;
5836 		act_set_astkevent(kqr_thread_fast(kqr), AST_KEVENT_RETURN_TO_KERNEL);
5837 	}
5838 }
5839 
/*
 * kqworkloop_update_threads_qos - apply a QoS state change to a workloop
 *
 * Performs the state update described by `op` (with QoS argument `qos`),
 * recomputes the workloop's contributed async QoS, and propagates any
 * resulting override change to the owner thread, the bound servicer, or
 * the pending thread request, as appropriate.
 *
 * Called with the kqueue lock held (asserted via kqlock_held).
 */
static void
kqworkloop_update_threads_qos(struct kqworkloop *kqwl, int op, kq_index_t qos)
{
	workq_threadreq_t kqr = &kqwl->kqwl_request;
	struct kqueue *kq = &kqwl->kqwl_kqueue;
	/* snapshot the override before mutation, to diff against below */
	kq_index_t old_override = kqworkloop_override(kqwl);

	kqlock_held(kqwl);

	switch (op) {
	case KQWL_UTQ_UPDATE_WAKEUP_QOS:
		kqwl->kqwl_wakeup_qos = qos;
		kqworkloop_request_fire_r2k_notification(kqwl);
		goto recompute;

	case KQWL_UTQ_RESET_WAKEUP_OVERRIDE:
		kqr->tr_kq_override_index = qos;
		goto recompute;

	case KQWL_UTQ_PARKING:
	case KQWL_UTQ_UNBINDING:
		kqr->tr_kq_override_index = qos;
		OS_FALLTHROUGH;

	case KQWL_UTQ_RECOMPUTE_WAKEUP_QOS:
		if (op == KQWL_UTQ_RECOMPUTE_WAKEUP_QOS) {
			assert(qos == THREAD_QOS_UNSPECIFIED);
		}
		/* only clear the override when no suppressed knote keeps a push */
		if (TAILQ_EMPTY(&kqwl->kqwl_suppressed)) {
			kqr->tr_kq_override_index = THREAD_QOS_UNSPECIFIED;
		}
		/* recompute the wakeup QoS from the highest non-empty bucket */
		kqwl->kqwl_wakeup_qos = 0;
		for (kq_index_t i = KQWL_NBUCKETS; i > 0; i--) {
			if (!TAILQ_EMPTY(&kqwl->kqwl_queue[i - 1])) {
				kqwl->kqwl_wakeup_qos = i;
				kqworkloop_request_fire_r2k_notification(kqwl);
				break;
			}
		}
		OS_FALLTHROUGH;

	case KQWL_UTQ_UPDATE_WAKEUP_OVERRIDE:
recompute:
		/*
		 * When modifying the wakeup QoS or the override QoS, we always need to
		 * maintain our invariant that kqr_override_index is at least as large
		 * as the highest QoS for which an event is fired.
		 *
		 * However this override index can be larger when there is an overriden
		 * suppressed knote pushing on the kqueue.
		 */
		if (qos < kqwl->kqwl_wakeup_qos) {
			qos = kqwl->kqwl_wakeup_qos;
		}
		if (kqr->tr_kq_override_index < qos) {
			kqr->tr_kq_override_index = qos;
		}
		break;

	case KQWL_UTQ_REDRIVE_EVENTS:
		/* no state change; fall out to the propagation logic below */
		break;

	case KQWL_UTQ_SET_QOS_INDEX:
		kqr->tr_kq_qos_index = qos;
		break;

	default:
		panic("unknown kqwl thread qos update operation: %d", op);
	}

	thread_t kqwl_owner = kqwl->kqwl_owner;
	thread_t servicer = kqr_thread(kqr);
	boolean_t qos_changed = FALSE;
	kq_index_t new_override = kqworkloop_override(kqwl);

	/*
	 * Apply the diffs to the owner if applicable
	 */
	if (kqwl_owner) {
#if 0
		/* JMM - need new trace hooks for owner overrides */
		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_THADJUST),
		    kqwl->kqwl_dynamicid, thread_tid(kqwl_owner), kqr->tr_kq_qos_index,
		    (kqr->tr_kq_override_index << 16) | kqwl->kqwl_wakeup_qos);
#endif
		if (new_override == old_override) {
			// nothing to do
		} else if (old_override == THREAD_QOS_UNSPECIFIED) {
			thread_add_kevent_override(kqwl_owner, new_override);
		} else if (new_override == THREAD_QOS_UNSPECIFIED) {
			thread_drop_kevent_override(kqwl_owner);
		} else { /*  old_override != new_override */
			thread_update_kevent_override(kqwl_owner, new_override);
		}
	}

	/*
	 * apply the diffs to the servicer
	 */

	if (!kqr_thread_requested(kqr)) {
		/*
		 * No servicer, nor thread-request
		 *
		 * Make a new thread request, unless there is an owner (or the workloop
		 * is suspended in userland) or if there is no asynchronous work in the
		 * first place.
		 */

		if (kqwl_owner == NULL && kqwl->kqwl_wakeup_qos) {
			int initiate_flags = 0;
			if (op == KQWL_UTQ_UNBINDING) {
				initiate_flags = WORKQ_THREADREQ_ATTEMPT_REBIND;
			}

			/* kqueue_threadreq_initiate handles the acknowledgement of the TG
			 * if needed */
			kqueue_threadreq_initiate(kq, kqr, new_override, initiate_flags);
		}
	} else if (servicer) {
		/*
		 * Servicer in flight
		 *
		 * Just apply the diff to the servicer
		 */

#if CONFIG_PREADOPT_TG
		/* When there's a servicer for the kqwl already, then the servicer will
		 * adopt the thread group in the kqr, we don't need to poke the
		 * workqueue subsystem to make different decisions due to the thread
		 * group. Consider the current request ack-ed.
		 */
		os_atomic_store(&kqwl->kqwl_preadopt_tg_needs_redrive, KQWL_PREADOPT_TG_CLEAR_REDRIVE, relaxed);
#endif

		if (kqr_thread_permanently_bound(kqr) && (kqwl->kqwl_state & KQ_SLEEP)) {
			/* sleeping bound thread: reset its priority via the workq layer */
			kqr->tr_qos = new_override;
			workq_kern_bound_thread_reset_pri(kqr, get_bsdthread_info(servicer));
		} else {
			struct uthread *ut = get_bsdthread_info(servicer);
			if (ut->uu_kqueue_override != new_override) {
				if (ut->uu_kqueue_override == THREAD_QOS_UNSPECIFIED) {
					thread_add_servicer_override(servicer, new_override);
				} else if (new_override == THREAD_QOS_UNSPECIFIED) {
					thread_drop_servicer_override(servicer);
				} else { /* ut->uu_kqueue_override != new_override */
					thread_update_servicer_override(servicer, new_override);
				}
				ut->uu_kqueue_override = new_override;
				qos_changed = TRUE;
			}
		}
	} else if (new_override == THREAD_QOS_UNSPECIFIED) {
		/*
		 * No events to deliver anymore.
		 *
		 * However canceling with turnstiles is challenging, so the fact that
		 * the request isn't useful will be discovered by the servicer himself
		 * later on.
		 */
	} else if (old_override != new_override) {
		/*
		 * Request is in flight
		 *
		 * Apply the diff to the thread request.
		 */
		kqueue_threadreq_modify(kq, kqr, new_override, WORKQ_THREADREQ_NONE);
		qos_changed = TRUE;
	}

	if (qos_changed) {
		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_THADJUST), kqwl->kqwl_dynamicid,
		    thread_tid(servicer), kqr->tr_kq_qos_index,
		    (kqr->tr_kq_override_index << 16) | kqwl->kqwl_wakeup_qos);
	}
}
6016 
6017 static void
6018 kqworkloop_update_iotier_override(struct kqworkloop *kqwl)
6019 {
6020 	workq_threadreq_t kqr = &kqwl->kqwl_request;
6021 	thread_t servicer = kqr_thread(kqr);
6022 	uint8_t iotier = os_atomic_load(&kqwl->kqwl_iotier_override, relaxed);
6023 
6024 	kqlock_held(kqwl);
6025 
6026 	if (servicer) {
6027 		thread_update_servicer_iotier_override(servicer, iotier);
6028 	}
6029 }
6030 
/*
 * kqworkloop_bound_thread_wakeup - wake the permanently bound servicer
 *
 * Only valid for workloops configured with WORKQ_TR_FLAG_PERMANENT_BIND.
 * If the bound thread parked itself (KQ_SLEEP set), clear the flag,
 * soft-bind it back to the request and ask the workq layer to wake it.
 *
 * Called with the kqueue lock held (asserted via kqlock_held).
 */
static void
kqworkloop_bound_thread_wakeup(struct kqworkloop *kqwl)
{
	workq_threadreq_t kqr = &kqwl->kqwl_request;

	kqlock_held(kqwl);

	assert(kqr->tr_flags & WORKQ_TR_FLAG_PERMANENT_BIND);

	__assert_only struct uthread *uth = get_bsdthread_info(kqr->tr_thread);
	assert(workq_thread_is_permanently_bound(uth));

	/*
	 * The bound thread takes up the responsibility of setting the KQ_SLEEP
	 * on its way to parking. See kqworkloop_bound_thread_park_prepost.
	 * This state is always manipulated under kqlock.
	 */
	if (kqwl->kqwl_state & KQ_SLEEP) {
		kqwl->kqwl_state &= ~KQ_SLEEP;
		kqueue_threadreq_bind(current_proc(),
		    kqr, kqr->tr_thread, KQUEUE_THREADREQ_BIND_SOFT);
		workq_kern_bound_thread_wakeup(kqr);
	}
}
6055 
/*
 * kqworkloop_wakeup - note that an event fired at `qos` on a workloop
 *
 * Updates the workloop's wakeup QoS (which may initiate or modify a thread
 * request) unless the wakeup is redundant or the current servicer is
 * already processing and will recompute QoS when it finishes.
 *
 * Called with the kqueue lock held.
 */
static void
kqworkloop_wakeup(struct kqworkloop *kqwl, kq_index_t qos)
{
	if (qos <= kqwl->kqwl_wakeup_qos) {
		/*
		 * Shortcut wakeups that really do nothing useful
		 */
		return;
	}

	if ((kqwl->kqwl_state & KQ_PROCESSING) &&
	    kqr_thread(&kqwl->kqwl_request) == current_thread()) {
		/*
		 * kqworkloop_end_processing() will perform the required QoS
		 * computations when it unsets the processing mode.
		 */
		return;
	}

	kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_UPDATE_WAKEUP_QOS, qos);

	/*
	 * In case of thread bound kqwl, we let the kqworkloop_update_threads_qos
	 * take care of overriding the servicer first before it waking up. This
	 * simplifies the soft bind of the parked bound thread later.
	 */
	if (kqr_thread_permanently_bound(&kqwl->kqwl_request)) {
		kqworkloop_bound_thread_wakeup(kqwl);
	}
}
6086 
6087 static struct kqtailq *
6088 kqueue_get_suppressed_queue(kqueue_t kq, struct knote *kn)
6089 {
6090 	if (kq.kq->kq_state & KQ_WORKLOOP) {
6091 		return &kq.kqwl->kqwl_suppressed;
6092 	} else if (kq.kq->kq_state & KQ_WORKQ) {
6093 		return &kq.kqwq->kqwq_suppressed[kn->kn_qos_index - 1];
6094 	} else {
6095 		return &kq.kqf->kqf_suppressed;
6096 	}
6097 }
6098 
/*
 * kqueue_alloc_turnstile - lazily allocate a workloop's turnstile
 *
 * Fast path: if KQ_HAS_TURNSTILE is already published, return the
 * existing turnstile using a load that carries a dependency on the state
 * read (pairing with the release-barrier publication below).
 *
 * Slow path: allocate a turnstile, attach it under the kqlock (and the
 * workq threadreq lock when the workq is the current filt_wl interlock),
 * then publish KQ_HAS_TURNSTILE with release semantics. A racing caller
 * that already attached one wins; our allocation is freed.
 *
 * Returns TURNSTILE_NULL for non-workloop kqueues.
 */
struct turnstile *
kqueue_alloc_turnstile(kqueue_t kqu)
{
	struct kqworkloop *kqwl = kqu.kqwl;
	kq_state_t kq_state;

	kq_state = os_atomic_load(&kqu.kq->kq_state, dependency);
	if (kq_state & KQ_HAS_TURNSTILE) {
		/* force a dependency to pair with the atomic or with release below */
		return os_atomic_load_with_dependency_on(&kqwl->kqwl_turnstile,
		           (uintptr_t)kq_state);
	}

	if (!(kq_state & KQ_WORKLOOP)) {
		return TURNSTILE_NULL;
	}

	/* allocate before taking the lock; free_ts tracks a losing allocation */
	struct turnstile *ts = turnstile_alloc(), *free_ts = TURNSTILE_NULL;
	bool workq_locked = false;

	kqlock(kqu);

	if (filt_wlturnstile_interlock_is_workq(kqwl)) {
		workq_locked = true;
		workq_kern_threadreq_lock(kqwl->kqwl_p);
	}

	if (kqwl->kqwl_state & KQ_HAS_TURNSTILE) {
		/* lost the race: another thread already attached a turnstile */
		free_ts = ts;
		ts = kqwl->kqwl_turnstile;
	} else {
		ts = turnstile_prepare((uintptr_t)kqwl, &kqwl->kqwl_turnstile,
		    ts, TURNSTILE_WORKLOOPS);

		/* release-barrier to pair with the unlocked load of kqwl_turnstile above */
		os_atomic_or(&kqwl->kqwl_state, KQ_HAS_TURNSTILE, release);

		if (filt_wlturnstile_interlock_is_workq(kqwl)) {
			workq_kern_threadreq_update_inheritor(kqwl->kqwl_p,
			    &kqwl->kqwl_request, kqwl->kqwl_owner,
			    ts, TURNSTILE_IMMEDIATE_UPDATE);
			/*
			 * The workq may no longer be the interlock after this.
			 * In which case the inheritor wasn't updated.
			 */
		}
		if (!filt_wlturnstile_interlock_is_workq(kqwl)) {
			filt_wlupdate_inheritor(kqwl, ts, TURNSTILE_IMMEDIATE_UPDATE);
		}
	}

	if (workq_locked) {
		workq_kern_threadreq_unlock(kqwl->kqwl_p);
	}

	kqunlock(kqu);

	if (free_ts) {
		turnstile_deallocate(free_ts);
	} else {
		turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_NOT_HELD);
	}
	return ts;
}
6163 
6164 __attribute__((always_inline))
6165 struct turnstile *
6166 kqueue_turnstile(kqueue_t kqu)
6167 {
6168 	kq_state_t kq_state = os_atomic_load(&kqu.kq->kq_state, relaxed);
6169 	if (kq_state & KQ_WORKLOOP) {
6170 		return os_atomic_load(&kqu.kqwl->kqwl_turnstile, relaxed);
6171 	}
6172 	return TURNSTILE_NULL;
6173 }
6174 
6175 __attribute__((always_inline))
6176 struct turnstile *
6177 kqueue_threadreq_get_turnstile(workq_threadreq_t kqr)
6178 {
6179 	struct kqworkloop *kqwl = kqr_kqworkloop(kqr);
6180 	if (kqwl) {
6181 		return os_atomic_load(&kqwl->kqwl_turnstile, relaxed);
6182 	}
6183 	return TURNSTILE_NULL;
6184 }
6185 
/*
 * kqworkloop_set_overcommit - mark a workloop's thread request overcommit
 *
 * If a thread request is currently pending it must be re-posted to the
 * workq subsystem with the new flag; otherwise the flag is simply set on
 * the request for the next time one is made.
 *
 * Requires the kqueue lock to be held past the initial unlocked check.
 */
static void
kqworkloop_set_overcommit(struct kqworkloop *kqwl)
{
	workq_threadreq_t kqr = &kqwl->kqwl_request;

	/*
	 * This test is racy, but since we never remove this bit,
	 * it allows us to avoid taking a lock.
	 */
	if (kqr->tr_flags & WORKQ_TR_FLAG_OVERCOMMIT) {
		return;
	}

	kqlock_held(kqwl);

	if (kqr_thread_requested_pending(kqr)) {
		/* request already posted: modify it so workq sees the flag */
		kqueue_threadreq_modify(kqwl, kqr, kqr->tr_qos,
		    WORKQ_THREADREQ_MAKE_OVERCOMMIT);
	} else {
		kqr->tr_flags |= WORKQ_TR_FLAG_OVERCOMMIT;
	}
}
6208 
6209 static void
6210 kqworkq_update_override(struct kqworkq *kqwq, struct knote *kn,
6211     kq_index_t override_index)
6212 {
6213 	workq_threadreq_t kqr;
6214 	kq_index_t old_override_index;
6215 	kq_index_t queue_index = kn->kn_qos_index;
6216 
6217 	if (override_index <= queue_index) {
6218 		return;
6219 	}
6220 
6221 	kqr = kqworkq_get_request(kqwq, queue_index);
6222 
6223 	kqlock_held(kqwq);
6224 
6225 	old_override_index = kqr->tr_kq_override_index;
6226 	if (override_index > MAX(kqr->tr_kq_qos_index, old_override_index)) {
6227 		thread_t servicer = kqr_thread(kqr);
6228 		kqr->tr_kq_override_index = override_index;
6229 
6230 		/* apply the override to [incoming?] servicing thread */
6231 		if (servicer) {
6232 			if (old_override_index) {
6233 				thread_update_kevent_override(servicer, override_index);
6234 			} else {
6235 				thread_add_kevent_override(servicer, override_index);
6236 			}
6237 		}
6238 	}
6239 }
6240 
6241 static void
6242 kqueue_update_iotier_override(kqueue_t kqu)
6243 {
6244 	if (kqu.kq->kq_state & KQ_WORKLOOP) {
6245 		kqworkloop_update_iotier_override(kqu.kqwl);
6246 	}
6247 }
6248 
6249 static void
6250 kqueue_update_override(kqueue_t kqu, struct knote *kn, thread_qos_t qos)
6251 {
6252 	if (kqu.kq->kq_state & KQ_WORKLOOP) {
6253 		kqworkloop_update_threads_qos(kqu.kqwl, KQWL_UTQ_UPDATE_WAKEUP_OVERRIDE,
6254 		    qos);
6255 	} else {
6256 		kqworkq_update_override(kqu.kqwq, kn, qos);
6257 	}
6258 }
6259 
/*
 * kqworkloop_unbind_locked - tear down the servicer <-> workloop binding
 *
 * Clears the uthread binding and request thread/state (unless this is a
 * soft unbind of a permanently bound thread), optionally drops the
 * servicer override immediately, resets the turnstile inheritor, and
 * releases any pre-adopted thread group.
 *
 * `how` selects whether the servicer override is dropped here
 * (KQWL_OVERRIDE_DROP_IMMEDIATELY) or later by the caller via
 * kqworkloop_unbind_delayed_override_drop.
 *
 * Called with the kqueue lock held (asserted via kqlock_held).
 */
static void
kqworkloop_unbind_locked(struct kqworkloop *kqwl, thread_t thread,
    enum kqwl_unbind_locked_mode how, unsigned int flags)
{
	struct uthread *ut = get_bsdthread_info(thread);
	workq_threadreq_t kqr = &kqwl->kqwl_request;

	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_UNBIND), kqwl->kqwl_dynamicid,
	    thread_tid(thread), 0, 0);

	kqlock_held(kqwl);

	assert(ut->uu_kqr_bound == kqr);

	/* soft unbinds (permanently bound threads) keep uu_kqr_bound */
	if ((flags & KQUEUE_THREADREQ_UNBIND_SOFT) == 0) {
		ut->uu_kqr_bound = NULL;
	}

	if (how == KQWL_OVERRIDE_DROP_IMMEDIATELY &&
	    ut->uu_kqueue_override != THREAD_QOS_UNSPECIFIED) {
		thread_drop_servicer_override(thread);
		ut->uu_kqueue_override = THREAD_QOS_UNSPECIFIED;
	}

	/* without an owner, the turnstile stops boosting on our behalf */
	if (kqwl->kqwl_owner == NULL && kqwl->kqwl_turnstile) {
		turnstile_update_inheritor(kqwl->kqwl_turnstile,
		    TURNSTILE_INHERITOR_NULL, TURNSTILE_IMMEDIATE_UPDATE);
		turnstile_update_inheritor_complete(kqwl->kqwl_turnstile,
		    TURNSTILE_INTERLOCK_HELD);
	}

#if CONFIG_PREADOPT_TG
	/* The kqueue is able to adopt a thread group again */

	thread_group_qos_t old_tg, new_tg = NULL;
	int ret = os_atomic_rmw_loop(kqr_preadopt_thread_group_addr(kqr), old_tg, new_tg, relaxed, {
		new_tg = old_tg;
		if (old_tg == KQWL_PREADOPTED_TG_SENTINEL || old_tg == KQWL_PREADOPTED_TG_PROCESSED) {
		        new_tg = KQWL_PREADOPTED_TG_NULL;
		}
	});

	if (ret) {
		if ((flags & KQUEUE_THREADREQ_UNBIND_SOFT) &&
		    KQWL_HAS_PERMANENT_PREADOPTED_TG(old_tg)) {
			// The permanently configured bound thread remains a part of the
			// thread group until its termination.
		} else {
			// Servicer can drop any preadopt thread group it has since it has
			// unbound.
			KQWL_PREADOPT_TG_HISTORY_WRITE_ENTRY(kqwl, KQWL_PREADOPT_OP_SERVICER_UNBIND, old_tg, KQWL_PREADOPTED_TG_NULL);
			thread_set_preadopt_thread_group(thread, NULL);
		}
	}
#endif
	/* the unbinding thread no longer carries the workloop's I/O tier */
	thread_update_servicer_iotier_override(thread, THROTTLE_LEVEL_END);

	if ((flags & KQUEUE_THREADREQ_UNBIND_SOFT) == 0) {
		kqr->tr_thread = THREAD_NULL;
		kqr->tr_state = WORKQ_TR_STATE_IDLE;
	}
	kqwl->kqwl_state &= ~KQ_R2K_ARMED;
}
6323 
6324 static void
6325 kqworkloop_unbind_delayed_override_drop(thread_t thread)
6326 {
6327 	struct uthread *ut = get_bsdthread_info(thread);
6328 	if (!workq_thread_is_permanently_bound(ut)) {
6329 		assert(ut->uu_kqr_bound == NULL);
6330 	}
6331 	if (ut->uu_kqueue_override != THREAD_QOS_UNSPECIFIED) {
6332 		thread_drop_servicer_override(thread);
6333 		ut->uu_kqueue_override = THREAD_QOS_UNSPECIFIED;
6334 	}
6335 }
6336 
6337 /*
6338  *	kqworkloop_unbind - Unbind the servicer thread of a workloop kqueue
6339  *
6340  *	It will acknowledge events, and possibly request a new thread if:
6341  *	- there were active events left
6342  *	- we pended waitq hook callouts during processing
6343  *	- we pended wakeups while processing (or unsuppressing)
6344  *
6345  *	Called with kqueue lock held.
6346  */
6347 static void
6348 kqworkloop_unbind(struct kqworkloop *kqwl)
6349 {
6350 	struct kqueue *kq = &kqwl->kqwl_kqueue;
6351 	workq_threadreq_t kqr = &kqwl->kqwl_request;
6352 	thread_t thread = kqr_thread_fast(kqr);
6353 	int op = KQWL_UTQ_PARKING;
6354 	kq_index_t qos_override = THREAD_QOS_UNSPECIFIED;
6355 
6356 	/*
6357 	 * For kqwl permanently bound to a thread, this path is only
6358 	 * exercised when the thread is on its way to terminate.
6359 	 * We don't care about asking for a new thread in that case.
6360 	 */
6361 	bool kqwl_had_bound_thread = kqr_thread_permanently_bound(kqr);
6362 
6363 	assert(thread == current_thread());
6364 
6365 	kqlock(kqwl);
6366 
6367 	if (!kqwl_had_bound_thread) {
6368 		/*
6369 		 * Forcing the KQ_PROCESSING flag allows for QoS updates because of
6370 		 * unsuppressing knotes not to be applied until the eventual call to
6371 		 * kqworkloop_update_threads_qos() below.
6372 		 */
6373 		assert((kq->kq_state & KQ_PROCESSING) == 0);
6374 		if (!TAILQ_EMPTY(&kqwl->kqwl_suppressed)) {
6375 			kq->kq_state |= KQ_PROCESSING;
6376 			qos_override = kqworkloop_acknowledge_events(kqwl);
6377 			kq->kq_state &= ~KQ_PROCESSING;
6378 		}
6379 	}
6380 
6381 	kqworkloop_unbind_locked(kqwl, thread, KQWL_OVERRIDE_DROP_DELAYED, 0);
6382 
6383 	if (!kqwl_had_bound_thread) {
6384 		kqworkloop_update_threads_qos(kqwl, op, qos_override);
6385 	}
6386 
6387 	kqunlock(kqwl);
6388 
6389 	/*
6390 	 * Drop the override on the current thread last, after the call to
6391 	 * kqworkloop_update_threads_qos above.
6392 	 */
6393 	kqworkloop_unbind_delayed_override_drop(thread);
6394 
6395 	/* If last reference, dealloc the workloop kq */
6396 	kqworkloop_release(kqwl);
6397 }
6398 
/*
 * kqworkq_unbind_locked - detach a workq servicer from its thread request
 *
 * Resets the uthread binding and the request back to idle, disarms the
 * return-to-kernel notification, and returns the override index the
 * request held before the reset so the caller can drop it from the
 * thread after releasing the lock.
 *
 * Called with the kqueue lock held (asserted via kqlock_held).
 */
static thread_qos_t
kqworkq_unbind_locked(struct kqworkq *kqwq,
    workq_threadreq_t kqr, thread_t thread)
{
	struct uthread *ut = get_bsdthread_info(thread);
	/* capture before the reset below; returned to the caller */
	kq_index_t old_override = kqr->tr_kq_override_index;

	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWQ_UNBIND), -1,
	    thread_tid(kqr_thread(kqr)), kqr->tr_kq_qos_index, 0);

	kqlock_held(kqwq);

	assert(ut->uu_kqr_bound == kqr);
	ut->uu_kqr_bound = NULL;
	kqr->tr_thread = THREAD_NULL;
	kqr->tr_state = WORKQ_TR_STATE_IDLE;
	kqr->tr_kq_override_index = THREAD_QOS_UNSPECIFIED;
	kqwq->kqwq_state &= ~KQ_R2K_ARMED;

	return old_override;
}
6420 
6421 /*
6422  *	kqworkq_unbind - unbind of a workq kqueue from a thread
6423  *
6424  *	We may have to request new threads.
6425  *	This can happen there are no waiting processing threads and:
6426  *	- there were active events we never got to (count > 0)
6427  *	- we pended waitq hook callouts during processing
6428  *	- we pended wakeups while processing (or unsuppressing)
6429  */
6430 static void
6431 kqworkq_unbind(proc_t p, workq_threadreq_t kqr)
6432 {
6433 	struct kqworkq *kqwq = (struct kqworkq *)p->p_fd.fd_wqkqueue;
6434 	__assert_only int rc;
6435 
6436 	kqlock(kqwq);
6437 	rc = kqworkq_acknowledge_events(kqwq, kqr, 0, KQWQAE_UNBIND);
6438 	assert(rc == -1);
6439 	kqunlock(kqwq);
6440 }
6441 
6442 workq_threadreq_t
6443 kqworkq_get_request(struct kqworkq *kqwq, kq_index_t qos_index)
6444 {
6445 	assert(qos_index > 0 && qos_index <= KQWQ_NBUCKETS);
6446 	return &kqwq->kqwq_request[qos_index - 1];
6447 }
6448 
/*
 * knote_reset_priority - (re)set a knote's requested priority
 *
 * Normalizes the pthread priority `pp` according to the kind of kqueue
 * (workloop, workq, or regular file-backed kqueue), records it in
 * kn_qos, and updates the knote's override / in-use QoS index.
 *
 * Called with the kqueue lock held (kqueue_update_override and
 * knote_dequeue below both operate on locked state).
 */
static void
knote_reset_priority(kqueue_t kqu, struct knote *kn, pthread_priority_t pp)
{
	kq_index_t qos = _pthread_priority_thread_qos(pp);

	if (kqu.kq->kq_state & KQ_WORKLOOP) {
		assert((pp & _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG) == 0);
		pp = _pthread_priority_normalize(pp);
	} else if (kqu.kq->kq_state & KQ_WORKQ) {
		if (qos == THREAD_QOS_UNSPECIFIED) {
			/* On workqueues, outside of QoS means MANAGER */
			qos = KQWQ_QOS_MANAGER;
			pp = _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG;
		} else {
			pp = _pthread_priority_normalize(pp);
		}
	} else {
		/* plain kqueues don't track priorities */
		pp = _pthread_unspecified_priority();
		qos = THREAD_QOS_UNSPECIFIED;
	}

	kn->kn_qos = (int32_t)pp;

	if ((kn->kn_status & KN_MERGE_QOS) == 0 || qos > kn->kn_qos_override) {
		/* Never lower QoS when in "Merge" mode */
		kn->kn_qos_override = qos;
	}

	/* only adjust in-use qos index when not suppressed */
	if (kn->kn_status & KN_SUPPRESSED) {
		kqueue_update_override(kqu, kn, qos);
	} else if (kn->kn_qos_index != qos) {
		knote_dequeue(kqu, kn);
		kn->kn_qos_index = qos;
	}
}
6485 
/*
 * knote_adjust_qos - adjust a knote's QoS override from a filter result
 *
 * Extracts the event QoS the filter encoded in `result`, combines it
 * with the knote's registration QoS (floor, or fallback when the event
 * has none), and applies the resulting override - either as a manual
 * override for suppressed knotes, or by requeueing the knote at the new
 * QoS index.
 *
 * Called with the kqueue lock held (asserted via kqlock_held).
 */
static void
knote_adjust_qos(struct kqueue *kq, struct knote *kn, int result)
{
	/* filter packs the event QoS in 3 bits above FILTER_ADJUST_EVENT_QOS_SHIFT */
	thread_qos_t qos_index = (result >> FILTER_ADJUST_EVENT_QOS_SHIFT) & 7;

	kqlock_held(kq);

	assert(result & FILTER_ADJUST_EVENT_QOS_BIT);
	assert(qos_index < THREAD_QOS_LAST);

	/*
	 * Early exit for knotes that should not change QoS
	 */
	if (__improbable(!knote_fops(kn)->f_adjusts_qos)) {
		panic("filter %d cannot change QoS", kn->kn_filtid);
	} else if (__improbable(!knote_has_qos(kn))) {
		return;
	}

	/*
	 * knotes with the FALLBACK flag will only use their registration QoS if the
	 * incoming event has no QoS, else, the registration QoS acts as a floor.
	 */
	thread_qos_t req_qos = _pthread_priority_thread_qos_fast(kn->kn_qos);
	if (kn->kn_qos & _PTHREAD_PRIORITY_FALLBACK_FLAG) {
		if (qos_index == THREAD_QOS_UNSPECIFIED) {
			qos_index = req_qos;
		}
	} else {
		if (qos_index < req_qos) {
			qos_index = req_qos;
		}
	}
	if ((kn->kn_status & KN_MERGE_QOS) && (qos_index < kn->kn_qos_override)) {
		/* Never lower QoS when in "Merge" mode */
		return;
	}

	if ((kn->kn_status & KN_LOCKED) && (kn->kn_status & KN_POSTING)) {
		/*
		 * When we're trying to update the QoS override and that both an
		 * f_event() and other f_* calls are running concurrently, any of these
		 * in flight calls may want to perform overrides that aren't properly
		 * serialized with each other.
		 *
		 * The first update that observes this racy situation enters a "Merge"
		 * mode which causes subsequent override requests to saturate the
		 * override instead of replacing its value.
		 *
		 * This mode is left when knote_unlock() or knote_post()
		 * observe that no other f_* routine is in flight.
		 */
		kn->kn_status |= KN_MERGE_QOS;
	}

	/*
	 * Now apply the override if it changed.
	 */

	if (kn->kn_qos_override == qos_index) {
		return;
	}

	kn->kn_qos_override = qos_index;

	if (kn->kn_status & KN_SUPPRESSED) {
		/*
		 * For suppressed events, the kn_qos_index field cannot be touched as it
		 * allows us to know on which supress queue the knote is for a kqworkq.
		 *
		 * Also, there's no natural push applied on the kqueues when this field
		 * changes anyway. We hence need to apply manual overrides in this case,
		 * which will be cleared when the events are later acknowledged.
		 */
		kqueue_update_override(kq, kn, qos_index);
	} else if (kn->kn_qos_index != qos_index) {
		knote_dequeue(kq, kn);
		kn->kn_qos_index = qos_index;
	}
}
6566 
/*
 * klist_init - initialize a knote list to the empty state
 */
void
klist_init(struct klist *list)
{
	SLIST_INIT(list);
}
6572 
6573 
6574 /*
6575  *	Query/Post each knote in the object's list
6576  *
6577  *	The object lock protects the list. It is assumed that the filter/event
6578  *	routine for the object can determine that the object is already locked (via
6579  *	the hint) and not deadlock itself.
6580  *
6581  *	Autodetach is a specific contract which will detach all knotes from the
6582  *	object prior to posting the final event for that knote. This is done while
6583  *	under the object lock. A breadcrumb is left in the knote's next pointer to
6584  *	indicate to future calls to f_detach routines that they need not reattempt
6585  *	to knote_detach from the object's klist again. This is currently used by
6586  *	EVFILTID_SPEC, EVFILTID_TTY, EVFILTID_PTMX
6587  *
6588  */
6589 void
6590 knote(struct klist *list, long hint, bool autodetach)
6591 {
6592 	struct knote *kn;
6593 	struct knote *tmp_kn;
6594 	SLIST_FOREACH_SAFE(kn, list, kn_selnext, tmp_kn) {
6595 		/*
6596 		 * We can modify the knote's next pointer since since we are holding the
6597 		 * object lock and the list can't be concurrently modified. Anyone
6598 		 * determining auto-detached-ness of a knote should take the primitive lock
6599 		 * to synchronize.
6600 		 *
6601 		 * Note that we do this here instead of the filter's f_event since we may
6602 		 * not even post the event if the knote is being dropped.
6603 		 */
6604 		if (autodetach) {
6605 			kn->kn_selnext.sle_next = KNOTE_AUTODETACHED;
6606 		}
6607 		knote_post(kn, hint);
6608 	}
6609 
6610 	/* Blast away the entire klist */
6611 	if (autodetach) {
6612 		klist_init(list);
6613 	}
6614 }
6615 
6616 /*
6617  * attach a knote to the specified list.  Return true if this is the first entry.
6618  * The list is protected by whatever lock the object it is associated with uses.
6619  */
6620 int
6621 knote_attach(struct klist *list, struct knote *kn)
6622 {
6623 	int ret = SLIST_EMPTY(list);
6624 	SLIST_INSERT_HEAD(list, kn, kn_selnext);
6625 	return ret;
6626 }
6627 
6628 /*
6629  * detach a knote from the specified list.  Return true if that was the last
6630  * entry.  The list is protected by whatever lock the object it is associated
6631  * with uses.
6632  */
6633 int
6634 knote_detach(struct klist *list, struct knote *kn)
6635 {
6636 	assert(!KNOTE_IS_AUTODETACHED(kn));
6637 
6638 	SLIST_REMOVE(list, kn, knote, kn_selnext);
6639 	return SLIST_EMPTY(list);
6640 }
6641 
6642 /*
6643  * knote_vanish - Indicate that the source has vanished
6644  *
6645  * Used only for vanishing ports - vanishing fds go
6646  * through knote_fdclose()
6647  *
6648  * If the knote has requested EV_VANISHED delivery,
6649  * arrange for that. Otherwise, deliver a NOTE_REVOKE
6650  * event for backward compatibility.
6651  *
6652  * The knote is marked as having vanished. The source's
6653  * reference to the knote is dropped by caller, but the knote's
6654  * source reference is only cleaned up later when the knote is dropped.
6655  *
6656  * Our caller already has the object lock held. Calling
6657  * the detach routine would try to take that lock
6658  * recursively - which likely is not supported.
6659  */
void
knote_vanish(struct klist *list, bool make_active)
{
	struct knote *kn;
	struct knote *kn_next;

	/*
	 * Safe traversal: each knote's kqueue lock is taken and released
	 * per-iteration while its status/flags are updated in place.
	 */
	SLIST_FOREACH_SAFE(kn, list, kn_selnext, kn_next) {
		struct kqueue *kq = knote_get_kq(kn);

		kqlock(kq);
		if (__probable(kn->kn_status & KN_REQVANISH)) {
			/*
			 * If EV_VANISH supported - prepare to deliver one
			 */
			kn->kn_status |= KN_VANISHED;
		} else {
			/*
			 * Handle the legacy way to indicate that the port/portset was
			 * deallocated or left the current Mach portspace (modern technique
			 * is with an EV_VANISHED protocol).
			 *
			 * Deliver an EV_EOF event for these changes (hopefully it will get
			 * delivered before the port name recycles to the same generation
			 * count and someone tries to re-register a kevent for it or the
			 * events are udata-specific - avoiding a conflict).
			 */
			kn->kn_flags |= EV_EOF | EV_ONESHOT;
		}
		if (make_active) {
			knote_activate(kq, kn, FILTER_ACTIVE);
		}
		kqunlock(kq);
	}
}
6694 
6695 /*
6696  * remove all knotes referencing a specified fd
6697  *
6698  * Entered with the proc_fd lock already held.
6699  * It returns the same way, but may drop it temporarily.
6700  */
void
knote_fdclose(struct proc *p, int fd)
{
	struct filedesc *fdt = &p->p_fd;
	struct klist *list;
	struct knote *kn;
	KNOTE_LOCK_CTX(knlc);

restart:
	list = &fdt->fd_knlist[fd];
	SLIST_FOREACH(kn, list, kn_link) {
		struct kqueue *kq = knote_get_kq(kn);

		kqlock(kq);

		if (kq->kq_p != p) {
			panic("%s: proc mismatch (kq->kq_p=%p != p=%p)",
			    __func__, kq->kq_p, p);
		}

		/*
		 * If the knote supports EV_VANISHED delivery,
		 * transition it to vanished mode (or skip over
		 * it if already vanished).
		 */
		if (kn->kn_status & KN_VANISHED) {
			kqunlock(kq);
			continue;
		}

		/* knote_lock() may block; the fd table lock must be dropped first */
		proc_fdunlock(p);
		if (!knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ON_SUCCESS)) {
			/* the knote was dropped by someone, nothing to do */
		} else if (kn->kn_status & KN_REQVANISH) {
			/*
			 * Since we have REQVANISH for this knote, we need to notify clients about
			 * the EV_VANISHED.
			 *
			 * But unlike mach ports, we want to do the detach here as well and not
			 * defer it so that we can release the iocount that is on the knote and
			 * close the fp.
			 */
			kn->kn_status |= KN_VANISHED;

			/*
			 * There may be a concurrent post happening, make sure to wait for it
			 * before we detach. knote_wait_for_post() unlocks on kq on exit
			 */
			knote_wait_for_post(kq, kn);

			knote_fops(kn)->f_detach(kn);
			if (kn->kn_is_fd) {
				fp_drop(p, (int)kn->kn_id, kn->kn_fp, 0);
			}
			/* the knote is detached: park it on the detached filter id */
			kn->kn_filtid = EVFILTID_DETACHED;
			kqlock(kq);

			knote_activate(kq, kn, FILTER_ACTIVE);
			knote_unlock(kq, kn, &knlc, KNOTE_KQ_UNLOCK);
		} else {
			knote_drop(kq, kn, &knlc);
		}

		/* the fd lock was dropped: the list may have changed, rescan */
		proc_fdlock(p);
		goto restart;
	}
}
6768 
6769 /*
6770  * knote_fdfind - lookup a knote in the fd table for process
6771  *
6772  * If the filter is file-based, lookup based on fd index.
6773  * Otherwise use a hash based on the ident.
6774  *
6775  * Matching is based on kq, filter, and ident. Optionally,
6776  * it may also be based on the udata field in the kevent -
6777  * allowing multiple event registration for the file object
6778  * per kqueue.
6779  *
6780  * fd_knhashlock or fdlock held on entry (and exit)
6781  */
6782 static struct knote *
6783 knote_fdfind(struct kqueue *kq,
6784     const struct kevent_internal_s *kev,
6785     bool is_fd,
6786     struct proc *p)
6787 {
6788 	struct filedesc *fdp = &p->p_fd;
6789 	struct klist *list = NULL;
6790 	struct knote *kn = NULL;
6791 
6792 	/*
6793 	 * determine where to look for the knote
6794 	 */
6795 	if (is_fd) {
6796 		/* fd-based knotes are linked off the fd table */
6797 		if (kev->kei_ident < (u_int)fdp->fd_knlistsize) {
6798 			list = &fdp->fd_knlist[kev->kei_ident];
6799 		}
6800 	} else if (fdp->fd_knhashmask != 0) {
6801 		/* hash non-fd knotes here too */
6802 		list = &fdp->fd_knhash[KN_HASH((u_long)kev->kei_ident, fdp->fd_knhashmask)];
6803 	}
6804 
6805 	/*
6806 	 * scan the selected list looking for a match
6807 	 */
6808 	if (list != NULL) {
6809 		SLIST_FOREACH(kn, list, kn_link) {
6810 			if (kq == knote_get_kq(kn) &&
6811 			    kev->kei_ident == kn->kn_id &&
6812 			    kev->kei_filter == kn->kn_filter) {
6813 				if (kev->kei_flags & EV_UDATA_SPECIFIC) {
6814 					if ((kn->kn_flags & EV_UDATA_SPECIFIC) &&
6815 					    kev->kei_udata == kn->kn_udata) {
6816 						break; /* matching udata-specific knote */
6817 					}
6818 				} else if ((kn->kn_flags & EV_UDATA_SPECIFIC) == 0) {
6819 					break; /* matching non-udata-specific knote */
6820 				}
6821 			}
6822 		}
6823 	}
6824 	return kn;
6825 }
6826 
6827 /*
6828  * kq_add_knote- Add knote to the fd table for process
6829  * while checking for duplicates.
6830  *
6831  * All file-based filters associate a list of knotes by file
6832  * descriptor index. All other filters hash the knote by ident.
6833  *
6834  * May have to grow the table of knote lists to cover the
6835  * file descriptor index presented.
6836  *
6837  * fd_knhashlock and fdlock unheld on entry (and exit).
6838  *
6839  * Takes a rwlock boost if inserting the knote is successful.
6840  */
static int
kq_add_knote(struct kqueue *kq, struct knote *kn, struct knote_lock_ctx *knlc,
    struct proc *p)
{
	struct filedesc *fdp = &p->p_fd;
	struct klist *list = NULL;
	int ret = 0;
	bool is_fd = kn->kn_is_fd;

	/* fd-based knotes are protected by the fd lock, others by the knhash lock */
	if (is_fd) {
		proc_fdlock(p);
	} else {
		knhash_lock(fdp);
	}

	if (knote_fdfind(kq, &kn->kn_kevent, is_fd, p) != NULL) {
		/* found an existing knote: we can't add this one */
		ret = ERESTART;
		goto out_locked;
	}

	/* knote was not found: add it now */
	if (!is_fd) {
		/* lazily create the hash table on first non-fd knote */
		if (fdp->fd_knhashmask == 0) {
			u_long size = 0;

			list = hashinit(CONFIG_KN_HASHSIZE, M_KQUEUE, &size);
			if (list == NULL) {
				ret = ENOMEM;
				goto out_locked;
			}

			fdp->fd_knhash = list;
			fdp->fd_knhashmask = size;
		}

		list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];
		SLIST_INSERT_HEAD(list, kn, kn_link);
		ret = 0;
		goto out_locked;
	} else {
		/* knote is fd based */

		if ((u_int)fdp->fd_knlistsize <= kn->kn_id) {
			u_int size = 0;

			/* Make sure that fd stays below current process's soft limit AND system allowed per-process limits */
			if (kn->kn_id >= (uint64_t)proc_limitgetcur_nofile(p)) {
				ret = EINVAL;
				goto out_locked;
			}
			/* have to grow the fd_knlist */
			size = fdp->fd_knlistsize;
			while (size <= kn->kn_id) {
				size += KQEXTENT;
			}

			/* reject sizes whose byte count would overflow the allocator */
			if (size >= (UINT_MAX / sizeof(struct klist))) {
				ret = EINVAL;
				goto out_locked;
			}

			list = kalloc_type(struct klist, size, Z_WAITOK | Z_ZERO);
			if (list == NULL) {
				ret = ENOMEM;
				goto out_locked;
			}

			/* copy the old table over and swap it for the larger one */
			bcopy(fdp->fd_knlist, list,
			    fdp->fd_knlistsize * sizeof(struct klist));
			kfree_type(struct klist, fdp->fd_knlistsize, fdp->fd_knlist);
			fdp->fd_knlist = list;
			fdp->fd_knlistsize = size;
		}

		list = &fdp->fd_knlist[kn->kn_id];
		SLIST_INSERT_HEAD(list, kn, kn_link);
		ret = 0;
		goto out_locked;
	}

out_locked:
	/* on success, lock the knote and retain the kq before dropping the table lock */
	if (ret == 0) {
		kqlock(kq);
		assert((kn->kn_status & KN_LOCKED) == 0);
		(void)knote_lock(kq, kn, knlc, KNOTE_KQ_UNLOCK);
		kqueue_retain(kq); /* retain a kq ref */
	}
	if (is_fd) {
		proc_fdunlock(p);
	} else {
		knhash_unlock(fdp);
	}

	return ret;
}
6937 
6938 /*
6939  * kq_remove_knote - remove a knote from the fd table for process
6940  *
6941  * If the filter is file-based, remove based on fd index.
6942  * Otherwise remove from the hash based on the ident.
6943  *
6944  * fd_knhashlock and fdlock unheld on entry (and exit).
6945  */
static void
kq_remove_knote(struct kqueue *kq, struct knote *kn, struct proc *p,
    struct knote_lock_ctx *knlc)
{
	struct filedesc *fdp = &p->p_fd;
	struct klist *list = NULL;
	uint16_t kq_state;
	bool is_fd = kn->kn_is_fd;

	/* fd-based knotes are protected by the fd lock, others by the knhash lock */
	if (is_fd) {
		proc_fdlock(p);
	} else {
		knhash_lock(fdp);
	}

	if (is_fd) {
		assert((u_int)fdp->fd_knlistsize > kn->kn_id);
		list = &fdp->fd_knlist[kn->kn_id];
	} else {
		list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];
	}
	SLIST_REMOVE(list, kn, knote, kn_link);

	kqlock(kq);

	/* Update the servicer iotier override */
	kqueue_update_iotier_override(kq);

	/* snapshot the state: kq may be freed once all locks are dropped */
	kq_state = kq->kq_state;
	if (knlc) {
		knote_unlock_cancel(kq, kn, knlc);
	} else {
		kqunlock(kq);
	}
	if (is_fd) {
		proc_fdunlock(p);
	} else {
		knhash_unlock(fdp);
	}

	/* drop the workloop reference taken when the knote was added */
	if (kq_state & KQ_DYNAMIC) {
		kqworkloop_release((struct kqworkloop *)kq);
	}
}
6990 
6991 /*
6992  * kq_find_knote_and_kq_lock - lookup a knote in the fd table for process
6993  * and, if the knote is found, acquires the kqlock while holding the fd table lock/spinlock.
6994  *
6995  * fd_knhashlock or fdlock unheld on entry (and exit)
6996  */
6997 
static struct knote *
kq_find_knote_and_kq_lock(struct kqueue *kq, struct kevent_qos_s *kev,
    bool is_fd, struct proc *p)
{
	struct filedesc *fdp = &p->p_fd;
	struct knote *kn;

	/* fd-based knotes are protected by the fd lock, others by the knhash lock */
	if (is_fd) {
		proc_fdlock(p);
	} else {
		knhash_lock(fdp);
	}

	/*
	 * Temporary horrible hack:
	 * this cast is gross and will go away in a future change.
	 * It is OK to do because we don't look at xflags/s_fflags,
	 * and that when we cast down the kev this way,
	 * the truncated filter field works.
	 */
	kn = knote_fdfind(kq, (struct kevent_internal_s *)kev, is_fd, p);

	/* take the kqlock while still holding the table lock, so kn can't go away */
	if (kn) {
		kqlock(kq);
		assert(knote_get_kq(kn) == kq);
	}

	if (is_fd) {
		proc_fdunlock(p);
	} else {
		knhash_unlock(fdp);
	}

	return kn;
}
7033 
7034 static struct kqtailq *
7035 knote_get_tailq(kqueue_t kqu, struct knote *kn)
7036 {
7037 	kq_index_t qos_index = kn->kn_qos_index;
7038 
7039 	if (kqu.kq->kq_state & KQ_WORKLOOP) {
7040 		assert(qos_index > 0 && qos_index <= KQWL_NBUCKETS);
7041 		return &kqu.kqwl->kqwl_queue[qos_index - 1];
7042 	} else if (kqu.kq->kq_state & KQ_WORKQ) {
7043 		assert(qos_index > 0 && qos_index <= KQWQ_NBUCKETS);
7044 		return &kqu.kqwq->kqwq_queue[qos_index - 1];
7045 	} else {
7046 		assert(qos_index == QOS_INDEX_KQFILE);
7047 		return &kqu.kqf->kqf_queue;
7048 	}
7049 }
7050 
static void
knote_enqueue(kqueue_t kqu, struct knote *kn)
{
	kqlock_held(kqu);

	/* only active knotes are eligible for queueing */
	if ((kn->kn_status & KN_ACTIVE) == 0) {
		return;
	}

	/* stay off the queue while disabled/suppressed/dropping, or if already queued */
	if (kn->kn_status & (KN_DISABLED | KN_SUPPRESSED | KN_DROPPING | KN_QUEUED)) {
		return;
	}

	struct kqtailq *queue = knote_get_tailq(kqu, kn);
	/* an empty -> non-empty transition is what warrants a wakeup */
	bool wakeup = TAILQ_EMPTY(queue);

	TAILQ_INSERT_TAIL(queue, kn, kn_tqe);
	kn->kn_status |= KN_QUEUED;
	kqu.kq->kq_count++;

	if (wakeup) {
		if (kqu.kq->kq_state & KQ_WORKLOOP) {
			kqworkloop_wakeup(kqu.kqwl, kn->kn_qos_index);
		} else if (kqu.kq->kq_state & KQ_WORKQ) {
			kqworkq_wakeup(kqu.kqwq, kn->kn_qos_index);
		} else {
			kqfile_wakeup(kqu.kqf, 0, THREAD_AWAKENED);
		}
	}
}
7081 
__attribute__((always_inline))
static inline void
knote_dequeue(kqueue_t kqu, struct knote *kn)
{
	/* no-op if the knote isn't on a queue */
	if (kn->kn_status & KN_QUEUED) {
		struct kqtailq *queue = knote_get_tailq(kqu, kn);

		// attaching the knote calls knote_reset_priority() without
		// the kqlock which is fine, so we can't call kqlock_held()
		// if we're not queued.
		kqlock_held(kqu);

		TAILQ_REMOVE(queue, kn, kn_tqe);
		kn->kn_status &= ~KN_QUEUED;
		kqu.kq->kq_count--;
		/* for kqfiles there is a single queue, so count and emptiness must agree */
		if ((kqu.kq->kq_state & (KQ_WORKQ | KQ_WORKLOOP)) == 0) {
			assert((kqu.kq->kq_count == 0) ==
			    (bool)TAILQ_EMPTY(queue));
		}
	}
}
7103 
7104 /* called with kqueue lock held */
static void
knote_suppress(kqueue_t kqu, struct knote *kn)
{
	struct kqtailq *suppressq;

	kqlock_held(kqu);

	/* a queued, not-yet-suppressed knote is the only valid starting state */
	assert((kn->kn_status & KN_SUPPRESSED) == 0);
	assert(kn->kn_status & KN_QUEUED);

	knote_dequeue(kqu, kn);
	/* deactivate - so new activations indicate a wakeup */
	kn->kn_status &= ~KN_ACTIVE;
	kn->kn_status |= KN_SUPPRESSED;
	/* park the knote on the suppression queue until it is acknowledged */
	suppressq = kqueue_get_suppressed_queue(kqu, kn);
	TAILQ_INSERT_TAIL(suppressq, kn, kn_tqe);
}
7122 
__attribute__((always_inline))
static inline void
knote_unsuppress_noqueue(kqueue_t kqu, struct knote *kn)
{
	struct kqtailq *suppressq;

	kqlock_held(kqu);

	assert(kn->kn_status & KN_SUPPRESSED);

	/* clear the flag and take the knote off the suppression queue;
	 * the caller is responsible for re-queueing it if needed */
	kn->kn_status &= ~KN_SUPPRESSED;
	suppressq = kqueue_get_suppressed_queue(kqu, kn);
	TAILQ_REMOVE(suppressq, kn, kn_tqe);

	/*
	 * If the knote is no longer active, reset its push,
	 * and resynchronize kn_qos_index with kn_qos_override
	 * for knotes with a real qos.
	 */
	if ((kn->kn_status & KN_ACTIVE) == 0 && knote_has_qos(kn)) {
		kn->kn_qos_override = _pthread_priority_thread_qos_fast(kn->kn_qos);
	}
	kn->kn_qos_index = kn->kn_qos_override;
}
7147 
7148 /* called with kqueue lock held */
static void
knote_unsuppress(kqueue_t kqu, struct knote *kn)
{
	/* clear the suppression state, then restore the queued invariant */
	knote_unsuppress_noqueue(kqu, kn);
	knote_enqueue(kqu, kn);
}
7155 
__attribute__((always_inline))
static inline void
knote_mark_active(struct knote *kn)
{
	/* trace only the inactive -> active edge, not redundant activations */
	if ((kn->kn_status & KN_ACTIVE) == 0) {
		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KNOTE_ACTIVATE),
		    kn->kn_udata, kn->kn_status | (kn->kn_id << 32),
		    kn->kn_filtid);
	}

	kn->kn_status |= KN_ACTIVE;
}
7168 
7169 /* called with kqueue lock held */
static void
knote_activate(kqueue_t kqu, struct knote *kn, int result)
{
	assert(result & FILTER_ACTIVE);
	/* apply any QoS adjustment the filter requested before queueing */
	if (result & FILTER_ADJUST_EVENT_QOS_BIT) {
		// may dequeue the knote
		knote_adjust_qos(kqu.kq, kn, result);
	}
	knote_mark_active(kn);
	knote_enqueue(kqu, kn);
}
7181 
7182 /*
7183  * This function applies changes requested by f_attach or f_touch for
7184  * a given filter. It proceeds in a carefully chosen order to help
7185  * every single transition do the minimal amount of work possible.
7186  */
static void
knote_apply_touch(kqueue_t kqu, struct knote *kn, struct kevent_qos_s *kev,
    int result)
{
	/* re-enable a disabled knote, possibly unsuppressing it */
	if ((kev->flags & EV_ENABLE) && (kn->kn_status & KN_DISABLED)) {
		kn->kn_status &= ~KN_DISABLED;

		/*
		 * it is possible for userland to have knotes registered for a given
		 * workloop `wl_orig` but really handled on another workloop `wl_new`.
		 *
		 * In that case, rearming will happen from the servicer thread of
		 * `wl_new` which if `wl_orig` is no longer being serviced, would cause
		 * this knote to stay suppressed forever if we only relied on
		 * kqworkloop_acknowledge_events to be called by `wl_orig`.
		 *
		 * However if we see the KQ_PROCESSING bit on `wl_orig` set, we can't
		 * unsuppress because that would mess with the processing phase of
		 * `wl_orig`, however it also means kqworkloop_acknowledge_events()
		 * will be called.
		 */
		if (__improbable(kn->kn_status & KN_SUPPRESSED)) {
			if ((kqu.kq->kq_state & KQ_PROCESSING) == 0) {
				knote_unsuppress_noqueue(kqu, kn);
			}
		}
	}

	/* propagate any filter-requested iotier change to the servicer */
	if (result & FILTER_ADJUST_EVENT_IOTIER_BIT) {
		kqueue_update_iotier_override(kqu);
	}

	if ((result & FILTER_UPDATE_REQ_QOS) && kev->qos && kev->qos != kn->kn_qos) {
		// may dequeue the knote
		knote_reset_priority(kqu, kn, kev->qos);
	}

	/*
	 * When we unsuppress above, or because of knote_reset_priority(),
	 * the knote may have been dequeued, we need to restore the invariant
	 * that if the knote is active it needs to be queued now that
	 * we're done applying changes.
	 */
	if (result & FILTER_ACTIVE) {
		knote_activate(kqu, kn, result);
	} else {
		knote_enqueue(kqu, kn);
	}

	/* redrive a deferred thread request if the filter asked not to defer */
	if ((result & FILTER_THREADREQ_NODEFEER) &&
	    act_clear_astkevent(current_thread(), AST_KEVENT_REDRIVE_THREADREQ)) {
		workq_kern_threadreq_redrive(kqu.kq->kq_p, WORKQ_THREADREQ_NONE);
	}
}
7241 
7242 /*
7243  * knote_drop - disconnect and drop the knote
7244  *
7245  * Called with the kqueue locked, returns with the kqueue unlocked.
7246  *
7247  * If a knote locking context is passed, it is canceled.
7248  *
7249  * The knote may have already been detached from
7250  * (or not yet attached to) its source object.
7251  */
static void
knote_drop(struct kqueue *kq, struct knote *kn, struct knote_lock_ctx *knlc)
{
	struct proc *p = kq->kq_p;

	kqlock_held(kq);

	/* a knote may only be dropped once */
	assert((kn->kn_status & KN_DROPPING) == 0);
	if (knlc == NULL) {
		assert((kn->kn_status & KN_LOCKED) == 0);
	}
	kn->kn_status |= KN_DROPPING;

	/* pull the knote off whichever queue it is on */
	if (kn->kn_status & KN_SUPPRESSED) {
		knote_unsuppress_noqueue(kq, kn);
	} else {
		knote_dequeue(kq, kn);
	}
	/* wait for any concurrent knote_post(); unlocks the kq on return */
	knote_wait_for_post(kq, kn);

	/* Even if we are autodetached, the filter may need to do cleanups of any
	 * stuff stashed on the knote so always make the call and let each filter
	 * handle the possibility of autodetached-ness */
	knote_fops(kn)->f_detach(kn);

	/* kq may be freed when kq_remove_knote() returns */
	kq_remove_knote(kq, kn, p, knlc);
	/* release the fileproc iocount, unless it already went away (vanished) */
	if (kn->kn_is_fd && ((kn->kn_status & KN_VANISHED) == 0)) {
		fp_drop(p, (int)kn->kn_id, kn->kn_fp, 0);
	}

	knote_free(kn);
}
7285 
/* one-time initialization of the knote subsystem, run via SYSINIT */
void
knote_init(void)
{
#if CONFIG_MEMORYSTATUS
	/* Initialize the memorystatus list lock */
	memorystatus_kevent_init(&kq_lck_grp, LCK_ATTR_NULL);
#endif
}
SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL);
7295 
/* return the filter operations vector for a knote's filter id */
const struct filterops *
knote_fops(struct knote *kn)
{
	return sysfilt_ops[kn->kn_filtid];
}
7301 
/* allocate a zeroed knote from its zone; Z_NOFAIL means this cannot return NULL */
static struct knote *
knote_alloc(void)
{
	return zalloc_flags(knote_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);
}
7307 
/* return a knote to its zone; it must not be locked or mid-post */
static void
knote_free(struct knote *kn)
{
	assert((kn->kn_status & (KN_LOCKED | KN_POSTING)) == 0);
	zfree(knote_zone, kn);
}
7314 
7315 #pragma mark - syscalls: kevent, kevent64, kevent_qos, kevent_id
7316 
/* return the per-thread kevent context stashed in the uthread save area */
kevent_ctx_t
kevent_get_context(thread_t thread)
{
	uthread_t ut = get_bsdthread_info(thread);
	return &ut->uu_save.uus_kevent;
}
7323 
7324 static inline bool
7325 kevent_args_requesting_events(unsigned int flags, int nevents)
7326 {
7327 	return !(flags & KEVENT_FLAG_ERROR_EVENTS) && nevents > 0;
7328 }
7329 
7330 static inline int
7331 kevent_adjust_flags_for_proc(proc_t p, int flags)
7332 {
7333 	__builtin_assume(p);
7334 	return flags | (IS_64BIT_PROCESS(p) ? KEVENT_FLAG_PROC64 : 0);
7335 }
7336 
7337 /*!
7338  * @function kevent_get_kqfile
7339  *
7340  * @brief
7341  * Lookup a kqfile by fd.
7342  *
7343  * @discussion
7344  * Callers: kevent, kevent64, kevent_qos
7345  *
7346  * This is not assumed to be a fastpath (kqfile interfaces are legacy)
7347  */
OS_NOINLINE
static int
kevent_get_kqfile(struct proc *p, int fd, int flags,
    struct fileproc **fpp, struct kqueue **kqp)
{
	int error = 0;
	struct kqueue *kq;

	/* take an iocount on the fd and verify it really is a kqueue */
	error = fp_get_ftype(p, fd, DTYPE_KQUEUE, EBADF, fpp);
	if (__improbable(error)) {
		return error;
	}
	kq = (struct kqueue *)fp_get_data((*fpp));

	/*
	 * First use of this kqfile: latch the event-ABI flavor (KEV32/KEV64/
	 * KEV_QOS) under the kqlock; the unlocked pre-check just avoids taking
	 * the lock on every call once the flavor is set.
	 */
	uint16_t kq_state = os_atomic_load(&kq->kq_state, relaxed);
	if (__improbable((kq_state & (KQ_KEV32 | KQ_KEV64 | KQ_KEV_QOS)) == 0)) {
		kqlock(kq);
		kq_state = kq->kq_state;
		if (!(kq_state & (KQ_KEV32 | KQ_KEV64 | KQ_KEV_QOS))) {
			if (flags & KEVENT_FLAG_LEGACY32) {
				kq_state |= KQ_KEV32;
			} else if (flags & KEVENT_FLAG_LEGACY64) {
				kq_state |= KQ_KEV64;
			} else {
				kq_state |= KQ_KEV_QOS;
			}
			kq->kq_state = kq_state;
		}
		kqunlock(kq);
	}

	/*
	 * kqfiles can't be used through the legacy kevent()
	 * and other interfaces at the same time.
	 */
	if (__improbable((bool)(flags & KEVENT_FLAG_LEGACY32) !=
	    (bool)(kq_state & KQ_KEV32))) {
		fp_drop(p, fd, *fpp, 0);
		return EINVAL;
	}

	*kqp = kq;
	return 0;
}
7392 
7393 /*!
7394  * @function kevent_get_kqwq
7395  *
7396  * @brief
 * Lookup or create the process kqwq (fastpath).
7398  *
7399  * @discussion
7400  * Callers: kevent64, kevent_qos
7401  */
OS_ALWAYS_INLINE
static int
kevent_get_kqwq(proc_t p, int flags, int nevents, struct kqueue **kqp)
{
	struct kqworkq *kqwq = p->p_fd.fd_wqkqueue;

	/* the workq kqueue can only register events, never return them */
	if (__improbable(kevent_args_requesting_events(flags, nevents))) {
		return EINVAL;
	}
	/* lazily allocate the per-process workq kqueue on first use */
	if (__improbable(kqwq == NULL)) {
		kqwq = kqworkq_alloc(p, flags);
		if (__improbable(kqwq == NULL)) {
			return ENOMEM;
		}
	}

	*kqp = &kqwq->kqwq_kqueue;
	return 0;
}
7421 
7422 #pragma mark kevent copyio
7423 
7424 /*!
7425  * @function kevent_get_data_size
7426  *
7427  * @brief
7428  * Copies in the extra data size from user-space.
7429  */
7430 static int
7431 kevent_get_data_size(int flags, user_addr_t data_avail, user_addr_t data_out,
7432     kevent_ctx_t kectx)
7433 {
7434 	if (!data_avail || !data_out) {
7435 		kectx->kec_data_size  = 0;
7436 		kectx->kec_data_resid = 0;
7437 	} else if (flags & KEVENT_FLAG_PROC64) {
7438 		user64_size_t usize = 0;
7439 		int error = copyin((user_addr_t)data_avail, &usize, sizeof(usize));
7440 		if (__improbable(error)) {
7441 			return error;
7442 		}
7443 		kectx->kec_data_resid = kectx->kec_data_size = (user_size_t)usize;
7444 	} else {
7445 		user32_size_t usize = 0;
7446 		int error = copyin((user_addr_t)data_avail, &usize, sizeof(usize));
7447 		if (__improbable(error)) {
7448 			return error;
7449 		}
7450 		kectx->kec_data_avail = data_avail;
7451 		kectx->kec_data_resid = kectx->kec_data_size = (user_size_t)usize;
7452 	}
7453 	kectx->kec_data_out   = data_out;
7454 	kectx->kec_data_avail = data_avail;
7455 	return 0;
7456 }
7457 
7458 /*!
7459  * @function kevent_put_data_size
7460  *
7461  * @brief
7462  * Copies out the residual data size to user-space if any has been used.
7463  */
7464 static int
7465 kevent_put_data_size(unsigned int flags, kevent_ctx_t kectx)
7466 {
7467 	if (kectx->kec_data_resid == kectx->kec_data_size) {
7468 		return 0;
7469 	}
7470 	if (flags & KEVENT_FLAG_KERNEL) {
7471 		*(user_size_t *)(uintptr_t)kectx->kec_data_avail = kectx->kec_data_resid;
7472 		return 0;
7473 	}
7474 	if (flags & KEVENT_FLAG_PROC64) {
7475 		user64_size_t usize = (user64_size_t)kectx->kec_data_resid;
7476 		return copyout(&usize, (user_addr_t)kectx->kec_data_avail, sizeof(usize));
7477 	} else {
7478 		user32_size_t usize = (user32_size_t)kectx->kec_data_resid;
7479 		return copyout(&usize, (user_addr_t)kectx->kec_data_avail, sizeof(usize));
7480 	}
7481 }
7482 
7483 /*!
7484  * @function kevent_legacy_copyin
7485  *
7486  * @brief
7487  * Handles the copyin of a kevent/kevent64 event.
7488  */
static int
kevent_legacy_copyin(user_addr_t *addrp, struct kevent_qos_s *kevp, unsigned int flags)
{
	int error;

	assert((flags & (KEVENT_FLAG_LEGACY32 | KEVENT_FLAG_LEGACY64)) != 0);

	if (flags & KEVENT_FLAG_LEGACY64) {
		/* kevent64(): one layout regardless of process ABI */
		struct kevent64_s kev64;

		error = copyin(*addrp, (caddr_t)&kev64, sizeof(kev64));
		if (__improbable(error)) {
			return error;
		}
		*addrp += sizeof(kev64);
		*kevp = (struct kevent_qos_s){
			.ident  = kev64.ident,
			.filter = kev64.filter,
			/* Make sure user doesn't pass in any system flags */
			.flags  = kev64.flags & ~EV_SYSFLAGS,
			.udata  = kev64.udata,
			.fflags = kev64.fflags,
			.data   = kev64.data,
			.ext[0] = kev64.ext[0],
			.ext[1] = kev64.ext[1],
		};
	} else if (flags & KEVENT_FLAG_PROC64) {
		/* legacy kevent() from a 64-bit process */
		struct user64_kevent kev64;

		error = copyin(*addrp, (caddr_t)&kev64, sizeof(kev64));
		if (__improbable(error)) {
			return error;
		}
		*addrp += sizeof(kev64);
		*kevp = (struct kevent_qos_s){
			.ident  = kev64.ident,
			.filter = kev64.filter,
			/* Make sure user doesn't pass in any system flags */
			.flags  = kev64.flags & ~EV_SYSFLAGS,
			.udata  = kev64.udata,
			.fflags = kev64.fflags,
			.data   = kev64.data,
		};
	} else {
		/* legacy kevent() from a 32-bit process */
		struct user32_kevent kev32;

		error = copyin(*addrp, (caddr_t)&kev32, sizeof(kev32));
		if (__improbable(error)) {
			return error;
		}
		*addrp += sizeof(kev32);
		*kevp = (struct kevent_qos_s){
			.ident  = (uintptr_t)kev32.ident,
			.filter = kev32.filter,
			/* Make sure user doesn't pass in any system flags */
			.flags  = kev32.flags & ~EV_SYSFLAGS,
			.udata  = CAST_USER_ADDR_T(kev32.udata),
			.fflags = kev32.fflags,
			.data   = (intptr_t)kev32.data,
		};
	}

	return 0;
}
7553 
7554 /*!
7555  * @function kevent_modern_copyin
7556  *
7557  * @brief
7558  * Handles the copyin of a kevent_qos/kevent_id event.
7559  */
7560 static int
7561 kevent_modern_copyin(user_addr_t *addrp, struct kevent_qos_s *kevp)
7562 {
7563 	int error = copyin(*addrp, (caddr_t)kevp, sizeof(struct kevent_qos_s));
7564 	if (__probable(!error)) {
7565 		/* Make sure user doesn't pass in any system flags */
7566 		*addrp += sizeof(struct kevent_qos_s);
7567 		kevp->flags &= ~EV_SYSFLAGS;
7568 	}
7569 	return error;
7570 }
7571 
7572 /*!
7573  * @function kevent_legacy_copyout
7574  *
7575  * @brief
7576  * Handles the copyout of a kevent/kevent64 event.
7577  */
static int
kevent_legacy_copyout(struct kevent_qos_s *kevp, user_addr_t *addrp, unsigned int flags)
{
	int advance;
	int error;

	assert((flags & (KEVENT_FLAG_LEGACY32 | KEVENT_FLAG_LEGACY64)) != 0);

	/*
	 * fully initialize the different output event structure
	 * types from the internal kevent (and some universal
	 * defaults for fields not represented in the internal
	 * form).
	 *
	 * Note: these structures have no padding hence the C99
	 *       initializers below do not leak kernel info.
	 */
	if (flags & KEVENT_FLAG_LEGACY64) {
		/* kevent64(): one layout regardless of process ABI */
		struct kevent64_s kev64 = {
			.ident  = kevp->ident,
			.filter = kevp->filter,
			.flags  = kevp->flags,
			.fflags = kevp->fflags,
			.data   = (int64_t)kevp->data,
			.udata  = kevp->udata,
			.ext[0] = kevp->ext[0],
			.ext[1] = kevp->ext[1],
		};
		advance = sizeof(struct kevent64_s);
		error = copyout((caddr_t)&kev64, *addrp, advance);
	} else if (flags & KEVENT_FLAG_PROC64) {
		/*
		 * deal with the special case of a user-supplied
		 * value of (uintptr_t)-1.
		 */
		uint64_t ident = (kevp->ident == (uintptr_t)-1) ?
		    (uint64_t)-1LL : (uint64_t)kevp->ident;
		struct user64_kevent kev64 = {
			.ident  = ident,
			.filter = kevp->filter,
			.flags  = kevp->flags,
			.fflags = kevp->fflags,
			.data   = (int64_t) kevp->data,
			.udata  = (user_addr_t) kevp->udata,
		};
		advance = sizeof(kev64);
		error = copyout((caddr_t)&kev64, *addrp, advance);
	} else {
		/* legacy kevent() to a 32-bit process: fields are truncated */
		struct user32_kevent kev32 = {
			.ident  = (uint32_t)kevp->ident,
			.filter = kevp->filter,
			.flags  = kevp->flags,
			.fflags = kevp->fflags,
			.data   = (int32_t)kevp->data,
			.udata  = (uint32_t)kevp->udata,
		};
		advance = sizeof(kev32);
		error = copyout((caddr_t)&kev32, *addrp, advance);
	}
	/* advance the output cursor only if the copyout succeeded */
	if (__probable(!error)) {
		*addrp += advance;
	}
	return error;
}
7642 
7643 /*!
7644  * @function kevent_modern_copyout
7645  *
7646  * @brief
7647  * Handles the copyout of a kevent_qos/kevent_id event.
7648  */
7649 OS_ALWAYS_INLINE
7650 static inline int
7651 kevent_modern_copyout(struct kevent_qos_s *kevp, user_addr_t *addrp)
7652 {
7653 	int error = copyout((caddr_t)kevp, *addrp, sizeof(struct kevent_qos_s));
7654 	if (__probable(!error)) {
7655 		*addrp += sizeof(struct kevent_qos_s);
7656 	}
7657 	return error;
7658 }
7659 
7660 #pragma mark kevent core implementation
7661 
7662 /*!
7663  * @function kevent_callback_inline
7664  *
7665  * @brief
7666  * Callback for each individual event
7667  *
7668  * @discussion
7669  * This is meant to be inlined in kevent_modern_callback and
7670  * kevent_legacy_callback.
7671  */
7672 OS_ALWAYS_INLINE
7673 static inline int
7674 kevent_callback_inline(struct kevent_qos_s *kevp, kevent_ctx_t kectx, bool legacy)
7675 {
7676 	int error;
7677 
7678 	assert(kectx->kec_process_noutputs < kectx->kec_process_nevents);
7679 
7680 	/*
7681 	 * Copy out the appropriate amount of event data for this user.
7682 	 */
7683 	if (legacy) {
7684 		error = kevent_legacy_copyout(kevp, &kectx->kec_process_eventlist,
7685 		    kectx->kec_process_flags);
7686 	} else {
7687 		error = kevent_modern_copyout(kevp, &kectx->kec_process_eventlist);
7688 	}
7689 
7690 	/*
7691 	 * If there isn't space for additional events, return
7692 	 * a harmless error to stop the processing here
7693 	 */
7694 	if (error == 0 && ++kectx->kec_process_noutputs == kectx->kec_process_nevents) {
7695 		error = EWOULDBLOCK;
7696 	}
7697 	return error;
7698 }
7699 
7700 /*!
7701  * @function kevent_modern_callback
7702  *
7703  * @brief
7704  * Callback for each individual modern event.
7705  *
7706  * @discussion
7707  * This callback handles kevent_qos/kevent_id events.
7708  */
static int
kevent_modern_callback(struct kevent_qos_s *kevp, kevent_ctx_t kectx)
{
	/* thin wrapper selecting the modern (kevent_qos_s) copyout path */
	return kevent_callback_inline(kevp, kectx, /*legacy*/ false);
}
7714 
7715 /*!
7716  * @function kevent_legacy_callback
7717  *
7718  * @brief
7719  * Callback for each individual legacy event.
7720  *
7721  * @discussion
7722  * This callback handles kevent/kevent64 events.
7723  */
7724 static int
7725 kevent_legacy_callback(struct kevent_qos_s *kevp, kevent_ctx_t kectx)
7726 {
7727 	return kevent_callback_inline(kevp, kectx, /*legacy*/ true);
7728 }
7729 
7730 /*!
7731  * @function kevent_cleanup
7732  *
7733  * @brief
7734  * Handles the cleanup returning from a kevent call.
7735  *
7736  * @discussion
7737  * kevent entry points will take a reference on workloops,
7738  * and a usecount on the fileglob of kqfiles.
7739  *
7740  * This function undoes this on the exit paths of kevents.
7741  *
7742  * @returns
7743  * The error to return to userspace.
7744  */
static int
kevent_cleanup(kqueue_t kqu, int flags, int error, kevent_ctx_t kectx)
{
	// poll should not call any codepath leading to this
	assert((flags & KEVENT_FLAG_POLL) == 0);

	/* Drop whatever reference the entry point took on the kqueue. */
	if (flags & KEVENT_FLAG_WORKLOOP) {
		/* entry points retained the workloop (kevent_id / kevent_workq_internal) */
		kqworkloop_release(kqu.kqwl);
	} else if (flags & KEVENT_FLAG_WORKQ) {
		/* nothing held */
	} else {
		/* kqfiles hold a usecount on the fileproc taken at entry */
		fp_drop(kqu.kqf->kqf_p, kectx->kec_fd, kectx->kec_fp, 0);
	}

	/* don't restart after signals... */
	if (error == ERESTART) {
		error = EINTR;
	} else if (error == 0) {
		/* don't abandon other output just because of residual copyout failures */
		(void)kevent_put_data_size(flags, kectx);
	}

	if (flags & KEVENT_FLAG_PARKING) {
		thread_t th = current_thread();
		struct uthread *uth = get_bsdthread_info(th);
		workq_threadreq_t kqr = uth->uu_kqr_bound;
		/*
		 * Permanently-bound workloop threads park elsewhere
		 * (kqworkloop_bound_thread_park) and are skipped here.
		 * NOTE(review): assumes the base-pri freeze being undone was
		 * taken on the parking path — confirm against the workq code.
		 */
		if (kqr && !(kqr->tr_flags & WORKQ_TR_FLAG_PERMANENT_BIND)) {
			thread_unfreeze_base_pri(th);
		}
	}
	return error;
}
7777 
7778 /*!
7779  * @function kqueue_process
7780  *
7781  * @brief
7782  * Process the triggered events in a kqueue.
7783  *
7784  * @discussion
7785  * Walk the queued knotes and validate that they are really still triggered
7786  * events by calling the filter routines (if necessary).
7787  *
7788  * For each event that is still considered triggered, invoke the callback
7789  * routine provided.
7790  *
7791  * caller holds a reference on the kqueue.
7792  * kqueue locked on entry and exit - but may be dropped
7793  * kqueue list locked (held for duration of call)
7794  *
7795  * This is only called by kqueue_scan() so that the compiler can inline it.
7796  *
7797  * For kqworkloops that are permanently configured with a bound thread, this
7798  * function parks the bound thread (instead of returning) if there are no events
7799  * or errors to be returned and KEVENT_FLAG_PARKING was specified.
7800  *
7801  * @returns
7802  * - 0:            no event was returned, no other error occured
7803  * - EBADF:        the kqueue is being destroyed (KQ_DRAIN is set)
7804  * - EWOULDBLOCK:  (not an error) events have been found and we should return
7805  * - EFAULT:       copyout failed
7806  * - filter specific errors
7807  */
static int
kqueue_process(kqueue_t kqu, int flags, kevent_ctx_t kectx,
    kevent_callback_t callback)
{
	workq_threadreq_t kqr = current_uthread()->uu_kqr_bound;
	struct knote *kn;
	int error = 0, rc = 0;
	struct kqtailq *base_queue, *queue;
	uint16_t kq_type = (kqu.kq->kq_state & (KQ_WORKQ | KQ_WORKLOOP));
	bool kqwl_permanently_bound = false;

	/* Type-specific prologue: mark the kqueue as being processed. */
	if (kq_type & KQ_WORKQ) {
		rc = kqworkq_begin_processing(kqu.kqwq, kqr, flags);
	} else if (kq_type & KQ_WORKLOOP) {
		kqwl_permanently_bound = kqr_thread_permanently_bound(kqr);
		rc = kqworkloop_begin_processing(kqu.kqwl, flags);
	} else {
kqfile_retry:
		rc = kqfile_begin_processing(kqu.kqf);
		if (rc == EBADF) {
			/* the kqueue is being destroyed (KQ_DRAIN) */
			return EBADF;
		}
	}

	if (rc == -1) {
		/* Nothing to process */
		if ((kq_type & KQ_WORKLOOP) && (flags & KEVENT_FLAG_PARKING) &&
		    kqwl_permanently_bound) {
			/* permanently bound workloop threads park instead of returning */
			goto kqwl_bound_thread_park;
		}
		return 0;
	}

	/*
	 * loop through the enqueued knotes associated with this request,
	 * processing each one. Each request may have several queues
	 * of knotes to process (depending on the type of kqueue) so we
	 * have to loop through all the queues as long as we have additional
	 * space.
	 */

process_again:
	/* Select the range of priority queues to drain for this kqueue type. */
	if (kq_type & KQ_WORKQ) {
		/* workq requests service exactly one QoS bucket */
		base_queue = queue = &kqu.kqwq->kqwq_queue[kqr->tr_kq_qos_index - 1];
	} else if (kq_type & KQ_WORKLOOP) {
		/* workloops drain all buckets, highest priority first */
		base_queue = &kqu.kqwl->kqwl_queue[0];
		queue = &kqu.kqwl->kqwl_queue[KQWL_NBUCKETS - 1];
	} else {
		base_queue = queue = &kqu.kqf->kqf_queue;
	}

	/* Walk queues from `queue` down to `base_queue`, draining each. */
	do {
		while ((kn = TAILQ_FIRST(queue)) != NULL) {
			error = knote_process(kn, kectx, callback);
			if (error == EJUSTRETURN) {
				/* the knote was not delivered; keep going */
				error = 0;
			} else if (__improbable(error)) {
				/* error is EWOULDBLOCK when the out event array is full */
				goto stop_processing;
			}
		}
	} while (queue-- > base_queue);

	if (kectx->kec_process_noutputs) {
		/* callers will transform this into no error */
		error = EWOULDBLOCK;
	}

stop_processing:
	/*
	 * If KEVENT_FLAG_PARKING is set, and no kevents have been returned,
	 * we want to unbind the kqrequest from the thread.
	 *
	 * However, because the kq locks are dropped several times during process,
	 * new knotes may have fired again, in which case, we want to fail the end
	 * processing and process again, until it converges.
	 *
	 * If we have an error or returned events, end processing never fails.
	 */
	if (error) {
		flags &= ~KEVENT_FLAG_PARKING;
	}
	if (kq_type & KQ_WORKQ) {
		rc = kqworkq_end_processing(kqu.kqwq, kqr, flags);
	} else if (kq_type & KQ_WORKLOOP) {
		rc = kqworkloop_end_processing(kqu.kqwl, KQ_PROCESSING, flags);
	} else {
		rc = kqfile_end_processing(kqu.kqf);
	}

	if (__probable(error)) {
		/* events were returned or a real error occurred */
		return error;
	}

	if (__probable(rc >= 0)) {
		assert(rc == 0 || rc == EBADF);
		if (rc == 0) {
			if ((kq_type & KQ_WORKLOOP) && (flags & KEVENT_FLAG_PARKING) &&
			    kqwl_permanently_bound) {
				goto kqwl_bound_thread_park;
			}
		}
		return rc;
	}

	/* rc < 0: end processing failed because new knotes fired; converge. */
	if (kq_type & (KQ_WORKQ | KQ_WORKLOOP)) {
		assert(flags & KEVENT_FLAG_PARKING);
		goto process_again;
	} else {
		goto kqfile_retry;
	}

kqwl_bound_thread_park:
/*
 * NOTE(review): `DEVELOPMENT | DEBUG` uses bitwise OR where the codebase
 * convention is `||`; both evaluate identically for 0/1-valued macros.
 */
#if DEVELOPMENT | DEBUG
	assert(current_thread() == kqr_thread_fast(kqr));
	assert(workq_thread_is_permanently_bound(current_uthread()));
#endif
	/* parks the bound thread; never returns to the caller */
	kqworkloop_bound_thread_park(kqu.kqwl, kqr_thread_fast(kqr));
	__builtin_unreachable();
}
7928 
7929 /*!
7930  * @function kqueue_scan_continue
7931  *
7932  * @brief
7933  * The continuation used by kqueue_scan for kevent entry points.
7934  *
7935  * @discussion
7936  * Assumes we inherit a use/ref count on the kq or its fileglob.
7937  *
7938  * This is called by kqueue_scan if neither KEVENT_FLAG_POLL nor
7939  * KEVENT_FLAG_KERNEL was set, and the caller had to wait.
7940  */
OS_NORETURN OS_NOINLINE
static void
kqueue_scan_continue(void *data, wait_result_t wait_result)
{
	uthread_t ut = current_uthread();
	/* the kevent context was stashed in the uthread before blocking */
	kevent_ctx_t kectx = &ut->uu_save.uus_kevent;
	int error = 0, flags = kectx->kec_process_flags;
	struct kqueue *kq = data;

	/*
	 * only kevent variants call in here, so we know the callback is
	 * kevent_legacy_callback or kevent_modern_callback.
	 */
	assert((flags & (KEVENT_FLAG_POLL | KEVENT_FLAG_KERNEL)) == 0);

	switch (wait_result) {
	case THREAD_AWAKENED:
		/* woken by an event: rescan to collect and deliver it */
		if (__improbable(flags & (KEVENT_FLAG_LEGACY32 | KEVENT_FLAG_LEGACY64))) {
			error = kqueue_scan(kq, flags, kectx, kevent_legacy_callback);
		} else {
			error = kqueue_scan(kq, flags, kectx, kevent_modern_callback);
		}
		break;
	case THREAD_TIMED_OUT:
		/* deadline expired: not an error, return with whatever we have */
		error = 0;
		break;
	case THREAD_INTERRUPTED:
		error = EINTR;
		break;
	case THREAD_RESTART:
		/* NOTE(review): mapped to EBADF — presumably the kqfile went away */
		error = EBADF;
		break;
	default:
		panic("%s: - invalid wait_result (%d)", __func__, wait_result);
	}


	/* drop references, then return to userspace with the output count */
	error = kevent_cleanup(kq, flags, error, kectx);
	*(int32_t *)&ut->uu_rval = kectx->kec_process_noutputs;
	unix_syscall_return(error);
}
7982 
7983 /*!
7984  * @function kqueue_scan
7985  *
7986  * @brief
7987  * Scan and wait for events in a kqueue (used by poll & kevent).
7988  *
7989  * @discussion
7990  * Process the triggered events in a kqueue.
7991  *
7992  * If there are no events triggered arrange to wait for them:
7993  * - unless KEVENT_FLAG_IMMEDIATE is set in kectx->kec_process_flags
7994  * - possibly until kectx->kec_deadline expires
7995  *
7996  * When it waits, and that neither KEVENT_FLAG_POLL nor KEVENT_FLAG_KERNEL
7997  * are set, then it will wait in the kqueue_scan_continue continuation.
7998  *
7999  * poll() will block in place, and KEVENT_FLAG_KERNEL calls
8000  * all pass KEVENT_FLAG_IMMEDIATE and will not wait.
8001  *
8002  * @param kqu
8003  * The kqueue being scanned.
8004  *
8005  * @param flags
8006  * The KEVENT_FLAG_* flags for this call.
8007  *
8008  * @param kectx
8009  * The context used for this scan.
8010  * The uthread_t::uu_save.uus_kevent storage is used for this purpose.
8011  *
8012  * @param callback
8013  * The callback to be called on events sucessfully processed.
8014  * (Either kevent_legacy_callback, kevent_modern_callback or poll_callback)
8015  */
int
kqueue_scan(kqueue_t kqu, int flags, kevent_ctx_t kectx,
    kevent_callback_t callback)
{
	int error;

	for (;;) {
		kqlock(kqu);
		error = kqueue_process(kqu, flags, kectx, callback);

		/*
		 * If we got an error, events returned (EWOULDBLOCK)
		 * or blocking was disallowed (KEVENT_FLAG_IMMEDIATE),
		 * just return.
		 */
		if (__probable(error || (flags & KEVENT_FLAG_IMMEDIATE))) {
			kqunlock(kqu);
			/* EWOULDBLOCK means events were delivered: success */
			return error == EWOULDBLOCK ? 0 : error;
		}

		/* only kqfiles ever block here: workq/workloop never reach this */
		assert((kqu.kq->kq_state & (KQ_WORKQ | KQ_WORKLOOP)) == 0);

		/* arrange to be woken when events arrive (or the deadline fires) */
		kqu.kqf->kqf_state |= KQ_SLEEP;
		assert_wait_deadline(&kqu.kqf->kqf_count, THREAD_ABORTSAFE,
		    kectx->kec_deadline);
		kqunlock(kqu);

		if (__probable((flags & (KEVENT_FLAG_POLL | KEVENT_FLAG_KERNEL)) == 0)) {
			/* kevent syscalls block in the continuation; never returns */
			thread_block_parameter(kqueue_scan_continue, kqu.kqf);
			__builtin_unreachable();
		}

		/* poll() blocks in place and interprets the wakeup itself */
		wait_result_t wr = thread_block(THREAD_CONTINUE_NULL);
		switch (wr) {
		case THREAD_AWAKENED:
			/* events arrived: loop back and process them */
			break;
		case THREAD_TIMED_OUT:
			return 0;
		case THREAD_INTERRUPTED:
			return EINTR;
		case THREAD_RESTART:
			return EBADF;
		default:
			panic("%s: - bad wait_result (%d)", __func__, wr);
		}
	}
}
8063 
8064 /*!
8065  * @function kevent_internal
8066  *
8067  * @brief
8068  * Common kevent code.
8069  *
8070  * @discussion
8071  * Needs to be inlined to specialize for legacy or modern and
8072  * eliminate dead code.
8073  *
8074  * This is the core logic of kevent entry points, that will:
8075  * - register kevents
8076  * - optionally scan the kqueue for events
8077  *
8078  * The caller is giving kevent_internal a reference on the kqueue
8079  * or its fileproc that needs to be cleaned up by kevent_cleanup().
8080  */
OS_ALWAYS_INLINE
static inline int
kevent_internal(kqueue_t kqu,
    user_addr_t changelist, int nchanges,
    user_addr_t ueventlist, int nevents,
    int flags, kevent_ctx_t kectx, int32_t *retval,
    bool legacy)
{
	int error = 0, noutputs = 0, register_rc;

	/* only bound threads can receive events on workloops */
	if (!legacy && (flags & KEVENT_FLAG_WORKLOOP)) {
#if CONFIG_WORKLOOP_DEBUG
		UU_KEVENT_HISTORY_WRITE_ENTRY(current_uthread(), {
			.uu_kqid = kqu.kqwl->kqwl_dynamicid,
			.uu_kq = error ? NULL : kqu.kq,
			.uu_error = error,
			.uu_nchanges = nchanges,
			.uu_nevents = nevents,
			.uu_flags = flags,
		});
#endif // CONFIG_WORKLOOP_DEBUG

		if (flags & KEVENT_FLAG_KERNEL) {
			/* see kevent_workq_internal */
			error = copyout(&kqu.kqwl->kqwl_dynamicid,
			    ueventlist - sizeof(kqueue_id_t), sizeof(kqueue_id_t));
			kectx->kec_data_resid -= sizeof(kqueue_id_t);
			if (__improbable(error)) {
				goto out;
			}
		}

		if (kevent_args_requesting_events(flags, nevents)) {
			/*
			 * Disable the R2K notification while doing a register, if the
			 * caller wants events too, we don't want the AST to be set if we
			 * will process these events soon.
			 */
			kqlock(kqu);
			kqu.kq->kq_state &= ~KQ_R2K_ARMED;
			kqunlock(kqu);
			flags |= KEVENT_FLAG_NEEDS_END_PROCESSING;
		}
	}

	/* register all the change requests the user provided... */
	while (nchanges > 0 && error == 0) {
		struct kevent_qos_s kev;
		struct knote *kn = NULL;

		/* copy in one change, in the layout matching the entry point */
		if (legacy) {
			error = kevent_legacy_copyin(&changelist, &kev, flags);
		} else {
			error = kevent_modern_copyin(&changelist, &kev);
		}
		if (error) {
			break;
		}

		register_rc = kevent_register(kqu.kq, &kev, &kn);
		if (__improbable(!legacy && (register_rc & FILTER_REGISTER_WAIT))) {
			thread_t thread = current_thread();

			/* kevent_register returns with the kq lock held in this case */
			kqlock_held(kqu);

			if (act_clear_astkevent(thread, AST_KEVENT_REDRIVE_THREADREQ)) {
				workq_kern_threadreq_redrive(kqu.kq->kq_p, WORKQ_THREADREQ_NONE);
			}

			// f_post_register_wait is meant to call a continuation and not to
			// return, which is why we don't support FILTER_REGISTER_WAIT if
			// KEVENT_FLAG_ERROR_EVENTS is not passed, or if the event that
			// waits isn't the last.
			//
			// It is implementable, but not used by any userspace code at the
			// moment, so for now return ENOTSUP if someone tries to do it.
			if (nchanges == 1 && noutputs < nevents &&
			    (flags & KEVENT_FLAG_KERNEL) == 0 &&
			    (flags & KEVENT_FLAG_PARKING) == 0 &&
			    (flags & KEVENT_FLAG_ERROR_EVENTS) &&
			    (flags & KEVENT_FLAG_WORKLOOP)) {
				uthread_t ut = get_bsdthread_info(thread);

				/*
				 * store the continuation/completion data in the uthread
				 *
				 * Note: the kectx aliases with this,
				 * and is destroyed in the process.
				 */
				ut->uu_save.uus_kevent_register = (struct _kevent_register){
					.kev        = kev,
					.kqwl       = kqu.kqwl,
					.eventout   = noutputs,
					.ueventlist = ueventlist,
				};
				knote_fops(kn)->f_post_register_wait(ut, kn,
				    &ut->uu_save.uus_kevent_register);
				__builtin_unreachable();
			}
			kqunlock(kqu);

			kev.flags |= EV_ERROR;
			kev.data = ENOTSUP;
		} else {
			assert((register_rc & FILTER_REGISTER_WAIT) == 0);
		}

		// keep in sync with kevent_register_wait_return()
		if (noutputs < nevents && (kev.flags & (EV_ERROR | EV_RECEIPT))) {
			/* EV_RECEIPT reports success as an EV_ERROR event with data 0 */
			if ((kev.flags & EV_ERROR) == 0) {
				kev.flags |= EV_ERROR;
				kev.data = 0;
			}
			if (legacy) {
				error = kevent_legacy_copyout(&kev, &ueventlist, flags);
			} else {
				error = kevent_modern_copyout(&kev, &ueventlist);
			}
			if (error == 0) {
				noutputs++;
			}
		} else if (kev.flags & EV_ERROR) {
			/* no room (or no receipt requested): fail the syscall itself */
			error = (int)kev.data;
		}
		nchanges--;
	}

	/* then scan for events, unless the caller only wanted registration */
	if ((flags & KEVENT_FLAG_ERROR_EVENTS) == 0 &&
	    nevents > 0 && noutputs == 0 && error == 0) {
		kectx->kec_process_flags = flags;
		kectx->kec_process_nevents = nevents;
		kectx->kec_process_noutputs = 0;
		kectx->kec_process_eventlist = ueventlist;

		if (legacy) {
			error = kqueue_scan(kqu.kq, flags, kectx, kevent_legacy_callback);
		} else {
			error = kqueue_scan(kqu.kq, flags, kectx, kevent_modern_callback);
		}

		noutputs = kectx->kec_process_noutputs;
	} else if (!legacy && (flags & KEVENT_FLAG_NEEDS_END_PROCESSING)) {
		/*
		 * If we didn't go through kqworkloop_end_processing(),
		 * we need to do it here.
		 *
		 * kqueue_scan will call kqworkloop_end_processing(),
		 * so we only need to do it if we didn't scan.
		 */
		kqlock(kqu);
		kqworkloop_end_processing(kqu.kqwl, 0, 0);
		kqunlock(kqu);
	}

	*retval = noutputs;
out:
	/* drops the reference the entry point gave us on the kqueue */
	return kevent_cleanup(kqu.kq, flags, error, kectx);
}
8240 
8241 #pragma mark modern syscalls: kevent_qos, kevent_id, kevent_workq_internal
8242 
8243 /*!
8244  * @function kevent_modern_internal
8245  *
8246  * @brief
8247  * The backend of the kevent_id and kevent_workq_internal entry points.
8248  *
8249  * @discussion
8250  * Needs to be inline due to the number of arguments.
8251  */
8252 OS_NOINLINE
8253 static int
8254 kevent_modern_internal(kqueue_t kqu,
8255     user_addr_t changelist, int nchanges,
8256     user_addr_t ueventlist, int nevents,
8257     int flags, kevent_ctx_t kectx, int32_t *retval)
8258 {
8259 	return kevent_internal(kqu.kq, changelist, nchanges,
8260 	           ueventlist, nevents, flags, kectx, retval, /*legacy*/ false);
8261 }
8262 
8263 /*!
8264  * @function kevent_id
8265  *
8266  * @brief
8267  * The kevent_id() syscall.
8268  */
int
kevent_id(struct proc *p, struct kevent_id_args *uap, int32_t *retval)
{
	int error, flags = uap->flags & KEVENT_FLAG_USER;
	uthread_t uth = current_uthread();
	workq_threadreq_t kqr = uth->uu_kqr_bound;
	kevent_ctx_t kectx = &uth->uu_save.uus_kevent;
	kqueue_t kqu;

	flags = kevent_adjust_flags_for_proc(p, flags);
	flags |= KEVENT_FLAG_DYNAMIC_KQUEUE;

	/* kevent_id only operates on workloops (never workqs or kqfiles) */
	if (__improbable((flags & (KEVENT_FLAG_WORKQ | KEVENT_FLAG_WORKLOOP)) !=
	    KEVENT_FLAG_WORKLOOP)) {
		return EINVAL;
	}

	error = kevent_get_data_size(flags, uap->data_available, uap->data_out, kectx);
	if (__improbable(error)) {
		return error;
	}

	kectx->kec_deadline = 0;
	kectx->kec_fp       = NULL;
	kectx->kec_fd       = -1;
	/* the kec_process_* fields are only filled if kqueue_scan is called */

	/*
	 * Get the kq we are going to be working on
	 * As a fastpath, look at the currently bound workloop.
	 */
	kqu.kqwl = kqr ? kqr_kqworkloop(kqr) : NULL;
	if (kqu.kqwl && kqu.kqwl->kqwl_dynamicid == uap->id) {
		/* fastpath hit: take a +1 on the already-bound workloop */
		if (__improbable(flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST)) {
			return EEXIST;
		}
		kqworkloop_retain(kqu.kqwl);
	} else if (__improbable(kevent_args_requesting_events(flags, uap->nevents))) {
		/* can only wait for events on the workloop bound to this thread */
		return EXDEV;
	} else {
		error = kqworkloop_get_or_create(p, uap->id, NULL, NULL,
		    flags, &kqu.kqwl);
		if (__improbable(error)) {
			return error;
		}
	}

	/* kevent_modern_internal consumes the workloop reference */
	return kevent_modern_internal(kqu, uap->changelist, uap->nchanges,
	           uap->eventlist, uap->nevents, flags, kectx, retval);
}
8319 
8320 /**!
8321  * @function kevent_workq_internal
8322  *
8323  * @discussion
8324  * This function is exported for the sake of the workqueue subsystem.
8325  *
8326  * It is called in two ways:
8327  * - when a thread is about to go to userspace to ask for pending event
8328  * - when a thread is returning from userspace with events back
8329  *
8330  * the workqueue subsystem will only use the following flags:
8331  * - KEVENT_FLAG_STACK_DATA (always)
8332  * - KEVENT_FLAG_IMMEDIATE (always)
8333  * - KEVENT_FLAG_PARKING (depending on whether it is going to or returning from
8334  *   userspace).
8335  *
8336  * It implicitly acts on the bound kqueue, and for the case of workloops
8337  * will copyout the kqueue ID before anything else.
8338  *
8339  *
8340  * Pthread will have setup the various arguments to fit this stack layout:
8341  *
8342  * +-------....----+--------------+-----------+--------------------+
8343  * |  user stack   |  data avail  |  nevents  |   pthread_self()   |
8344  * +-------....----+--------------+-----------+--------------------+
8345  *                 ^              ^
8346  *             data_out       eventlist
8347  *
8348  * When a workloop is used, the workloop ID is copied out right before
8349  * the eventlist and is taken from the data buffer.
8350  *
8351  * @warning
8352  * This function is carefuly tailored to not make any call except the final tail
8353  * call into kevent_modern_internal. (LTO inlines current_uthread()).
8354  *
8355  * This function is performance sensitive due to the workq subsystem.
8356  */
int
kevent_workq_internal(struct proc *p,
    user_addr_t changelist, int nchanges,
    user_addr_t eventlist, int nevents,
    user_addr_t data_out, user_size_t *data_available,
    unsigned int flags, int32_t *retval)
{
	uthread_t uth = current_uthread();
	workq_threadreq_t kqr = uth->uu_kqr_bound;
	kevent_ctx_t kectx = &uth->uu_save.uus_kevent;
	kqueue_t kqu;

	/* the workq subsystem only ever passes these two flag combinations */
	assert(flags == (KEVENT_FLAG_STACK_DATA | KEVENT_FLAG_IMMEDIATE) ||
	    flags == (KEVENT_FLAG_STACK_DATA | KEVENT_FLAG_IMMEDIATE | KEVENT_FLAG_PARKING));

	/* data_available is a kernel pointer here; dereference it directly */
	kectx->kec_data_out   = data_out;
	kectx->kec_data_avail = (uint64_t)data_available;
	kectx->kec_data_size  = *data_available;
	kectx->kec_data_resid = *data_available;
	kectx->kec_deadline   = 0;
	kectx->kec_fp         = NULL;
	kectx->kec_fd         = -1;
	/* the kec_process_* fields are only filled if kqueue_scan is called */

	flags = kevent_adjust_flags_for_proc(p, flags);

	if (kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
		/* bound to a workloop: act on it and copy its ID out first */
		kqu.kqwl = __container_of(kqr, struct kqworkloop, kqwl_request);
		kqworkloop_retain(kqu.kqwl);

		flags |= KEVENT_FLAG_WORKLOOP | KEVENT_FLAG_DYNAMIC_KQUEUE |
		    KEVENT_FLAG_KERNEL;
	} else {
		/* bound to the process-wide workq kqueue */
		kqu.kqwq = p->p_fd.fd_wqkqueue;

		flags |= KEVENT_FLAG_WORKQ | KEVENT_FLAG_KERNEL;
	}

	/* must stay a tail call (see the warning in the header comment) */
	return kevent_modern_internal(kqu, changelist, nchanges,
	           eventlist, nevents, flags, kectx, retval);
}
8398 
8399 /*!
8400  * @function kevent_qos
8401  *
8402  * @brief
8403  * The kevent_qos() syscall.
8404  */
int
kevent_qos(struct proc *p, struct kevent_qos_args *uap, int32_t *retval)
{
	uthread_t uth = current_uthread();
	kevent_ctx_t kectx = &uth->uu_save.uus_kevent;
	int error, flags = uap->flags & KEVENT_FLAG_USER;
	struct kqueue *kq;

	/* kevent_id-only flags are invalid for kevent_qos */
	if (__improbable(flags & KEVENT_ID_FLAG_USER)) {
		return EINVAL;
	}

	flags = kevent_adjust_flags_for_proc(p, flags);

	error = kevent_get_data_size(flags, uap->data_available, uap->data_out, kectx);
	if (__improbable(error)) {
		return error;
	}

	kectx->kec_deadline = 0;
	kectx->kec_fp       = NULL;
	kectx->kec_fd       = uap->fd;
	/* the kec_process_* fields are only filled if kqueue_scan is called */

	/* get the kq we are going to be working on */
	if (__probable(flags & KEVENT_FLAG_WORKQ)) {
		error = kevent_get_kqwq(p, flags, uap->nevents, &kq);
	} else {
		/* takes a usecount on the fileproc, dropped in kevent_cleanup() */
		error = kevent_get_kqfile(p, uap->fd, flags, &kectx->kec_fp, &kq);
	}
	if (__improbable(error)) {
		return error;
	}

	return kevent_modern_internal(kq, uap->changelist, uap->nchanges,
	           uap->eventlist, uap->nevents, flags, kectx, retval);
}
8442 
8443 #pragma mark legacy syscalls: kevent, kevent64
8444 
8445 /*!
8446  * @function kevent_legacy_get_deadline
8447  *
8448  * @brief
8449  * Compute the deadline for the legacy kevent syscalls.
8450  *
8451  * @discussion
8452  * This is not necessary if KEVENT_FLAG_IMMEDIATE is specified,
8453  * as this takes precedence over the deadline.
8454  *
8455  * This function will fail if utimeout is USER_ADDR_NULL
8456  * (the caller should check).
8457  */
8458 static int
8459 kevent_legacy_get_deadline(int flags, user_addr_t utimeout, uint64_t *deadline)
8460 {
8461 	struct timespec ts;
8462 
8463 	if (flags & KEVENT_FLAG_PROC64) {
8464 		struct user64_timespec ts64;
8465 		int error = copyin(utimeout, &ts64, sizeof(ts64));
8466 		if (__improbable(error)) {
8467 			return error;
8468 		}
8469 		ts.tv_sec = (unsigned long)ts64.tv_sec;
8470 		ts.tv_nsec = (long)ts64.tv_nsec;
8471 	} else {
8472 		struct user32_timespec ts32;
8473 		int error = copyin(utimeout, &ts32, sizeof(ts32));
8474 		if (__improbable(error)) {
8475 			return error;
8476 		}
8477 		ts.tv_sec = ts32.tv_sec;
8478 		ts.tv_nsec = ts32.tv_nsec;
8479 	}
8480 	if (!timespec_is_valid(&ts)) {
8481 		return EINVAL;
8482 	}
8483 
8484 	clock_absolutetime_interval_to_deadline(tstoabstime(&ts), deadline);
8485 	return 0;
8486 }
8487 
8488 /*!
8489  * @function kevent_legacy_internal
8490  *
8491  * @brief
8492  * The core implementation for kevent and kevent64
8493  */
OS_NOINLINE
static int
kevent_legacy_internal(struct proc *p, struct kevent64_args *uap,
    int32_t *retval, int flags)
{
	uthread_t uth = current_uthread();
	kevent_ctx_t kectx = &uth->uu_save.uus_kevent;
	struct kqueue *kq;
	int error;

	/* kevent_id-only flags are invalid for the legacy entry points */
	if (__improbable(uap->flags & KEVENT_ID_FLAG_USER)) {
		return EINVAL;
	}

	flags = kevent_adjust_flags_for_proc(p, flags);

	/* legacy syscalls have no out-of-band data buffer */
	kectx->kec_data_out   = 0;
	kectx->kec_data_avail = 0;
	kectx->kec_data_size  = 0;
	kectx->kec_data_resid = 0;
	kectx->kec_deadline   = 0;
	kectx->kec_fp         = NULL;
	kectx->kec_fd         = uap->fd;
	/* the kec_process_* fields are only filled if kqueue_scan is called */

	/* convert timeout to absolute - if we have one (and not immediate) */
	if (__improbable(uap->timeout && !(flags & KEVENT_FLAG_IMMEDIATE))) {
		error = kevent_legacy_get_deadline(flags, uap->timeout,
		    &kectx->kec_deadline);
		if (__improbable(error)) {
			return error;
		}
	}

	/* get the kq we are going to be working on */
	if (flags & KEVENT_FLAG_WORKQ) {
		error = kevent_get_kqwq(p, flags, uap->nevents, &kq);
	} else {
		/* takes a usecount on the fileproc, dropped in kevent_cleanup() */
		error = kevent_get_kqfile(p, uap->fd, flags, &kectx->kec_fp, &kq);
	}
	if (__improbable(error)) {
		return error;
	}

	return kevent_internal(kq, uap->changelist, uap->nchanges,
	           uap->eventlist, uap->nevents, flags, kectx, retval,
	           /*legacy*/ true);
}
8542 
8543 /*!
8544  * @function kevent
8545  *
8546  * @brief
8547  * The legacy kevent() syscall.
8548  */
8549 int
8550 kevent(struct proc *p, struct kevent_args *uap, int32_t *retval)
8551 {
8552 	struct kevent64_args args = {
8553 		.fd         = uap->fd,
8554 		.changelist = uap->changelist,
8555 		.nchanges   = uap->nchanges,
8556 		.eventlist  = uap->eventlist,
8557 		.nevents    = uap->nevents,
8558 		.timeout    = uap->timeout,
8559 	};
8560 
8561 	return kevent_legacy_internal(p, &args, retval, KEVENT_FLAG_LEGACY32);
8562 }
8563 
8564 /*!
8565  * @function kevent64
8566  *
8567  * @brief
8568  * The legacy kevent64() syscall.
8569  */
8570 int
8571 kevent64(struct proc *p, struct kevent64_args *uap, int32_t *retval)
8572 {
8573 	int flags = (uap->flags & KEVENT_FLAG_USER) | KEVENT_FLAG_LEGACY64;
8574 	return kevent_legacy_internal(p, uap, retval, flags);
8575 }
8576 
8577 #pragma mark - socket interface
8578 
8579 #if SOCKETS
8580 #include <sys/param.h>
8581 #include <sys/socket.h>
8582 #include <sys/protosw.h>
8583 #include <sys/domain.h>
8584 #include <sys/mbuf.h>
8585 #include <sys/kern_event.h>
8586 #include <sys/malloc.h>
8587 #include <sys/sys_domain.h>
8588 #include <sys/syslog.h>
8589 
8590 #ifndef ROUNDUP64
8591 #define ROUNDUP64(x) P2ROUNDUP((x), sizeof (u_int64_t))
8592 #endif
8593 
8594 #ifndef ADVANCE64
8595 #define ADVANCE64(p, n) (void*)((char *)(p) + ROUNDUP64(n))
8596 #endif
8597 
/* lock protecting the global list of kernel event PCBs */
static LCK_GRP_DECLARE(kev_lck_grp, "Kernel Event Protocol");
static LCK_RW_DECLARE(kev_rwlock, &kev_lck_grp);

/* SYSPROTO_EVENT socket layer entry points (defined below) */
static int kev_attach(struct socket *so, int proto, struct proc *p);
static int kev_detach(struct socket *so);
static int kev_control(struct socket *so, u_long cmd, caddr_t data,
    struct ifnet *ifp, struct proc *p);
static lck_mtx_t * event_getlock(struct socket *, int);
static int event_lock(struct socket *, int, void *);
static int event_unlock(struct socket *, int, void *);

static int event_sofreelastref(struct socket *);
static void kev_delete(struct kern_event_pcb *);

/* user-request handlers for kernel event sockets */
static struct pr_usrreqs event_usrreqs = {
	.pru_attach =           kev_attach,
	.pru_control =          kev_control,
	.pru_detach =           kev_detach,
	.pru_soreceive =        soreceive,
};

/* protocol switch entry for SYSPROTO_EVENT raw sockets */
static struct protosw eventsw[] = {
	{
		.pr_type =              SOCK_RAW,
		.pr_protocol =          SYSPROTO_EVENT,
		.pr_flags =             PR_ATOMIC,
		.pr_usrreqs =           &event_usrreqs,
		.pr_lock =              event_lock,
		.pr_unlock =            event_unlock,
		.pr_getlock =           event_getlock,
	}
};

__private_extern__ int kevt_getstat SYSCTL_HANDLER_ARGS;
__private_extern__ int kevt_pcblist SYSCTL_HANDLER_ARGS;

/* net.systm.kevt: statistics and PCB listing for kernel event sockets */
SYSCTL_NODE(_net_systm, OID_AUTO, kevt,
    CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Kernel event family");

struct kevtstat kevtstat;
SYSCTL_PROC(_net_systm_kevt, OID_AUTO, stats,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
    kevt_getstat, "S,kevtstat", "");

SYSCTL_PROC(_net_systm_kevt, OID_AUTO, pcblist,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
    kevt_pcblist, "S,xkevtpcb", "");

SYSCTL_UINT(_net_systm_kevt, OID_AUTO, pcbcount, CTLFLAG_RD | CTLFLAG_LOCKED,
    (unsigned int *)&kevtstat.kes_pcbcount, 0, "");
8648 
/* pr_getlock: return the per-PCB mutex; panics if the socket has no PCB */
static lck_mtx_t *
event_getlock(struct socket *so, int flags)
{
#pragma unused(flags)
	struct kern_event_pcb *ev_pcb = (struct kern_event_pcb *)so->so_pcb;

	if (so->so_pcb != NULL) {
		if (so->so_usecount < 0) {
			panic("%s: so=%p usecount=%d lrh= %s", __func__,
			    so, so->so_usecount, solockhistory_nr(so));
		}
		/* normal path falls through to return the PCB mutex */
	} else {
		panic("%s: so=%p NULL NO so_pcb %s", __func__,
		    so, solockhistory_nr(so));
		/* NOTREACHED */
	}
	return &ev_pcb->evp_mtx;
}
8668 
/*
 * Socket lock callback for kernel event sockets: takes the per-PCB
 * mutex, optionally adds a socket reference, and records the caller
 * in the socket's lock-history debug buffer.
 */
static int
event_lock(struct socket *so, int refcount, void *lr)
{
	void *lr_saved;

	/* Record the caller's return address if one wasn't passed in. */
	if (lr == NULL) {
		lr_saved = __builtin_return_address(0);
	} else {
		lr_saved = lr;
	}

	if (so->so_pcb != NULL) {
		lck_mtx_lock(&((struct kern_event_pcb *)so->so_pcb)->evp_mtx);
	} else {
		panic("%s: so=%p NO PCB! lr=%p lrh= %s", __func__,
		    so, lr_saved, solockhistory_nr(so));
		/* NOTREACHED */
	}

	/* A negative use count means refcounting went wrong somewhere. */
	if (so->so_usecount < 0) {
		panic("%s: so=%p so_pcb=%p lr=%p ref=%d lrh= %s", __func__,
		    so, so->so_pcb, lr_saved, so->so_usecount,
		    solockhistory_nr(so));
		/* NOTREACHED */
	}

	if (refcount) {
		so->so_usecount++;
	}

	/* Log this acquisition in the circular lock-history buffer. */
	so->lock_lr[so->next_lock_lr] = lr_saved;
	so->next_lock_lr = (so->next_lock_lr + 1) % SO_LCKDBG_MAX;
	return 0;
}
8703 
/*
 * Socket unlock callback for kernel event sockets: optionally drops a
 * socket reference, records the caller for lock-history debugging, and
 * either tears the socket down (on last reference) or releases the
 * PCB mutex.
 */
static int
event_unlock(struct socket *so, int refcount, void *lr)
{
	void *lr_saved;
	lck_mtx_t *mutex_held;

	/* Record the caller's return address if one wasn't passed in. */
	if (lr == NULL) {
		lr_saved = __builtin_return_address(0);
	} else {
		lr_saved = lr;
	}

	if (refcount) {
		so->so_usecount--;
	}
	/* A negative use count means refcounting went wrong somewhere. */
	if (so->so_usecount < 0) {
		panic("%s: so=%p usecount=%d lrh= %s", __func__,
		    so, so->so_usecount, solockhistory_nr(so));
		/* NOTREACHED */
	}
	if (so->so_pcb == NULL) {
		panic("%s: so=%p NO PCB usecount=%d lr=%p lrh= %s", __func__,
		    so, so->so_usecount, (void *)lr_saved,
		    solockhistory_nr(so));
		/* NOTREACHED */
	}
	mutex_held = (&((struct kern_event_pcb *)so->so_pcb)->evp_mtx);

	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
	/* Log this release in the circular unlock-history buffer. */
	so->unlock_lr[so->next_unlock_lr] = lr_saved;
	so->next_unlock_lr = (so->next_unlock_lr + 1) % SO_LCKDBG_MAX;

	if (so->so_usecount == 0) {
		/*
		 * Last reference: kev_detach must already have flagged the
		 * PCB for clearing.  event_sofreelastref() drops the mutex
		 * itself on this path.
		 */
		VERIFY(so->so_flags & SOF_PCBCLEARING);
		event_sofreelastref(so);
	} else {
		lck_mtx_unlock(mutex_held);
	}

	return 0;
}
8745 
/*
 * Final teardown of a kernel event socket; called from event_unlock()
 * with the PCB mutex held and so_usecount == 0.  Detaches the PCB from
 * the socket, unlinks it from the global list, and frees both the PCB
 * and the socket.
 */
static int
event_sofreelastref(struct socket *so)
{
	struct kern_event_pcb *ev_pcb = (struct kern_event_pcb *)so->so_pcb;

	LCK_MTX_ASSERT(&(ev_pcb->evp_mtx), LCK_MTX_ASSERT_OWNED);

	/* Detach the PCB so concurrent posters skip this socket. */
	so->so_pcb = NULL;

	/*
	 * Disable upcall in the event another thread is in kev_post_msg()
	 * appending record to the receive socket buffer, since sbwakeup()
	 * may release the socket lock otherwise.
	 */
	so->so_rcv.sb_flags &= ~SB_UPCALL;
	so->so_snd.sb_flags &= ~SB_UPCALL;
	so->so_event = sonullevent;
	lck_mtx_unlock(&(ev_pcb->evp_mtx));

	/*
	 * Unlink from the global PCB list under the writer lock, then
	 * destroy the PCB; its mutex must not be held at this point.
	 */
	LCK_MTX_ASSERT(&(ev_pcb->evp_mtx), LCK_MTX_ASSERT_NOTOWNED);
	lck_rw_lock_exclusive(&kev_rwlock);
	LIST_REMOVE(ev_pcb, evp_link);
	kevtstat.kes_pcbcount--;
	kevtstat.kes_gencnt++;
	lck_rw_done(&kev_rwlock);
	kev_delete(ev_pcb);

	sofreelastref(so, 1);
	return 0;
}
8776 
/* Number of protosw entries in eventsw[]. */
static int event_proto_count = (sizeof(eventsw) / sizeof(struct protosw));

/* Global list of all kernel event PCBs; protected by kev_rwlock. */
static
struct kern_event_head kern_event_head;

/* Monotonically increasing id stamped into each posted kernel event. */
static u_int32_t static_event_id = 0;

/* Typed allocation zone for struct kern_event_pcb. */
static KALLOC_TYPE_DEFINE(ev_pcb_zone, struct kern_event_pcb, NET_KT_DEFAULT);
8785 
8786 /*
8787  * Install the protosw's for the NKE manager.  Invoked at extension load time
8788  */
8789 void
8790 kern_event_init(struct domain *dp)
8791 {
8792 	struct protosw *pr;
8793 	int i;
8794 
8795 	VERIFY(!(dp->dom_flags & DOM_INITIALIZED));
8796 	VERIFY(dp == systemdomain);
8797 
8798 	for (i = 0, pr = &eventsw[0]; i < event_proto_count; i++, pr++) {
8799 		net_add_proto(pr, dp, 1);
8800 	}
8801 }
8802 
8803 static int
8804 kev_attach(struct socket *so, __unused int proto, __unused struct proc *p)
8805 {
8806 	int error = 0;
8807 	struct kern_event_pcb *ev_pcb;
8808 
8809 	error = soreserve(so, KEV_SNDSPACE, KEV_RECVSPACE);
8810 	if (error != 0) {
8811 		return error;
8812 	}
8813 
8814 	ev_pcb = zalloc_flags(ev_pcb_zone, Z_WAITOK | Z_ZERO);
8815 	lck_mtx_init(&ev_pcb->evp_mtx, &kev_lck_grp, LCK_ATTR_NULL);
8816 
8817 	ev_pcb->evp_socket = so;
8818 	ev_pcb->evp_vendor_code_filter = 0xffffffff;
8819 
8820 	so->so_pcb = (caddr_t) ev_pcb;
8821 	lck_rw_lock_exclusive(&kev_rwlock);
8822 	LIST_INSERT_HEAD(&kern_event_head, ev_pcb, evp_link);
8823 	kevtstat.kes_pcbcount++;
8824 	kevtstat.kes_gencnt++;
8825 	lck_rw_done(&kev_rwlock);
8826 
8827 	return error;
8828 }
8829 
8830 static void
8831 kev_delete(struct kern_event_pcb *ev_pcb)
8832 {
8833 	VERIFY(ev_pcb != NULL);
8834 	lck_mtx_destroy(&ev_pcb->evp_mtx, &kev_lck_grp);
8835 	zfree(ev_pcb_zone, ev_pcb);
8836 }
8837 
8838 static int
8839 kev_detach(struct socket *so)
8840 {
8841 	struct kern_event_pcb *ev_pcb = (struct kern_event_pcb *) so->so_pcb;
8842 
8843 	if (ev_pcb != NULL) {
8844 		soisdisconnected(so);
8845 		so->so_flags |= SOF_PCBCLEARING;
8846 	}
8847 
8848 	return 0;
8849 }
8850 
8851 /*
8852  * For now, kev_vendor_code and mbuf_tags use the same
8853  * mechanism.
8854  */
8855 errno_t
8856 kev_vendor_code_find(
8857 	const char      *string,
8858 	u_int32_t       *out_vendor_code)
8859 {
8860 	if (strlen(string) >= KEV_VENDOR_CODE_MAX_STR_LEN) {
8861 		return EINVAL;
8862 	}
8863 	return net_str_id_find_internal(string, out_vendor_code,
8864 	           NSI_VENDOR_CODE, 1);
8865 }
8866 
8867 errno_t
8868 kev_msg_post(struct kev_msg *event_msg)
8869 {
8870 	mbuf_tag_id_t min_vendor, max_vendor;
8871 
8872 	net_str_id_first_last(&min_vendor, &max_vendor, NSI_VENDOR_CODE);
8873 
8874 	if (event_msg == NULL) {
8875 		return EINVAL;
8876 	}
8877 
8878 	/*
8879 	 * Limit third parties to posting events for registered vendor codes
8880 	 * only
8881 	 */
8882 	if (event_msg->vendor_code < min_vendor ||
8883 	    event_msg->vendor_code > max_vendor) {
8884 		os_atomic_inc(&kevtstat.kes_badvendor, relaxed);
8885 		return EINVAL;
8886 	}
8887 	return kev_post_msg(event_msg);
8888 }
8889 
/*
 * Marshal a kernel event into a single (cluster-less) mbuf and append
 * a copy of it to the receive buffer of every kernel event socket
 * whose vendor/class/subclass filters match.  "wait" is M_WAIT or
 * M_NOWAIT and controls mbuf allocation.  Returns 0, EMSGSIZE (message
 * larger than MLEN), or ENOMEM.
 */
static int
kev_post_msg_internal(struct kev_msg *event_msg, int wait)
{
	struct mbuf *m, *m2;
	struct kern_event_pcb *ev_pcb;
	struct kern_event_msg *ev;
	char *tmp;
	u_int32_t total_size;
	int i;

#if SKYWALK && defined(XNU_TARGET_OS_OSX)
	/*
	 * Special hook for ALF state updates
	 */
	if (event_msg->vendor_code == KEV_VENDOR_APPLE &&
	    event_msg->kev_class == KEV_NKE_CLASS &&
	    event_msg->kev_subclass == KEV_NKE_ALF_SUBCLASS &&
	    event_msg->event_code == KEV_NKE_ALF_STATE_CHANGED) {
#if MACH_ASSERT
		os_log_info(OS_LOG_DEFAULT, "KEV_NKE_ALF_STATE_CHANGED posted");
#endif /* MACH_ASSERT */
		net_filter_event_mark(NET_FILTER_EVENT_ALF,
		    net_check_compatible_alf());
	}
#endif /* SKYWALK && XNU_TARGET_OS_OSX */

	/* Verify the message is small enough to fit in one mbuf w/o cluster */
	total_size = KEV_MSG_HEADER_SIZE;

	/*
	 * dv[] holds up to 5 data vectors; the first zero-length entry
	 * terminates the list.
	 *
	 * NOTE(review): the size accumulation is unchecked 32-bit
	 * arithmetic; callers are trusted kernel code, so the sum is
	 * presumed not to wrap before the MLEN comparison below — confirm
	 * if this path ever takes externally-derived lengths.
	 */
	for (i = 0; i < 5; i++) {
		if (event_msg->dv[i].data_length == 0) {
			break;
		}
		total_size += event_msg->dv[i].data_length;
	}

	if (total_size > MLEN) {
		os_atomic_inc(&kevtstat.kes_toobig, relaxed);
		return EMSGSIZE;
	}

	m = m_get(wait, MT_DATA);
	if (m == 0) {
		os_atomic_inc(&kevtstat.kes_nomem, relaxed);
		return ENOMEM;
	}
	ev = mtod(m, struct kern_event_msg *);
	total_size = KEV_MSG_HEADER_SIZE;

	/* Concatenate the data vectors right after the fixed header. */
	tmp = (char *) &ev->event_data[0];
	for (i = 0; i < 5; i++) {
		if (event_msg->dv[i].data_length == 0) {
			break;
		}

		total_size += event_msg->dv[i].data_length;
		bcopy(event_msg->dv[i].data_ptr, tmp,
		    event_msg->dv[i].data_length);
		tmp += event_msg->dv[i].data_length;
	}

	/* Stamp the header; ids increase monotonically per post. */
	ev->id = ++static_event_id;
	ev->total_size   = total_size;
	ev->vendor_code  = event_msg->vendor_code;
	ev->kev_class    = event_msg->kev_class;
	ev->kev_subclass = event_msg->kev_subclass;
	ev->event_code   = event_msg->event_code;

	m->m_len = total_size;
	/* Walk all PCBs under the reader lock; new PCBs can't be added. */
	lck_rw_lock_shared(&kev_rwlock);
	for (ev_pcb = LIST_FIRST(&kern_event_head);
	    ev_pcb;
	    ev_pcb = LIST_NEXT(ev_pcb, evp_link)) {
		lck_mtx_lock(&ev_pcb->evp_mtx);
		/* Skip sockets already detached by event_sofreelastref(). */
		if (ev_pcb->evp_socket->so_pcb == NULL) {
			lck_mtx_unlock(&ev_pcb->evp_mtx);
			continue;
		}
		/*
		 * Filters are hierarchical: subclass only applies if class
		 * is filtered, which only applies if vendor is filtered.
		 */
		if (ev_pcb->evp_vendor_code_filter != KEV_ANY_VENDOR) {
			if (ev_pcb->evp_vendor_code_filter != ev->vendor_code) {
				lck_mtx_unlock(&ev_pcb->evp_mtx);
				continue;
			}

			if (ev_pcb->evp_class_filter != KEV_ANY_CLASS) {
				if (ev_pcb->evp_class_filter != ev->kev_class) {
					lck_mtx_unlock(&ev_pcb->evp_mtx);
					continue;
				}

				if ((ev_pcb->evp_subclass_filter !=
				    KEV_ANY_SUBCLASS) &&
				    (ev_pcb->evp_subclass_filter !=
				    ev->kev_subclass)) {
					lck_mtx_unlock(&ev_pcb->evp_mtx);
					continue;
				}
			}
		}

		/* Each matching socket gets its own copy of the mbuf. */
		m2 = m_copym(m, 0, m->m_len, wait);
		if (m2 == 0) {
			os_atomic_inc(&kevtstat.kes_nomem, relaxed);
			m_free(m);
			lck_mtx_unlock(&ev_pcb->evp_mtx);
			lck_rw_done(&kev_rwlock);
			return ENOMEM;
		}
		if (sbappendrecord(&ev_pcb->evp_socket->so_rcv, m2)) {
			/*
			 * We use "m" for the socket stats as it would be
			 * unsafe to use "m2"
			 */
			so_inc_recv_data_stat(ev_pcb->evp_socket,
			    1, m->m_len);

			sorwakeup(ev_pcb->evp_socket);
			os_atomic_inc(&kevtstat.kes_posted, relaxed);
		} else {
			/* Receive buffer full: event dropped for this socket. */
			os_atomic_inc(&kevtstat.kes_fullsock, relaxed);
		}
		lck_mtx_unlock(&ev_pcb->evp_mtx);
	}
	m_free(m);
	lck_rw_done(&kev_rwlock);

	return 0;
}
9018 
/* Post a kernel event, blocking for mbuf allocation if necessary. */
int
kev_post_msg(struct kev_msg *event_msg)
{
	return kev_post_msg_internal(event_msg, M_WAIT);
}
9024 
/*
 * Post a kernel event without blocking; returns ENOMEM instead of
 * sleeping when no mbufs are available.
 */
int
kev_post_msg_nowait(struct kev_msg *event_msg)
{
	return kev_post_msg_internal(event_msg, M_NOWAIT);
}
9030 
9031 static int
9032 kev_control(struct socket *so,
9033     u_long cmd,
9034     caddr_t data,
9035     __unused struct ifnet *ifp,
9036     __unused struct proc *p)
9037 {
9038 	struct kev_request *kev_req = (struct kev_request *) data;
9039 	struct kern_event_pcb  *ev_pcb;
9040 	struct kev_vendor_code *kev_vendor;
9041 	u_int32_t  *id_value = (u_int32_t *) data;
9042 
9043 	switch (cmd) {
9044 	case SIOCGKEVID:
9045 		*id_value = static_event_id;
9046 		break;
9047 	case SIOCSKEVFILT:
9048 		ev_pcb = (struct kern_event_pcb *) so->so_pcb;
9049 		ev_pcb->evp_vendor_code_filter = kev_req->vendor_code;
9050 		ev_pcb->evp_class_filter = kev_req->kev_class;
9051 		ev_pcb->evp_subclass_filter  = kev_req->kev_subclass;
9052 		break;
9053 	case SIOCGKEVFILT:
9054 		ev_pcb = (struct kern_event_pcb *) so->so_pcb;
9055 		kev_req->vendor_code = ev_pcb->evp_vendor_code_filter;
9056 		kev_req->kev_class   = ev_pcb->evp_class_filter;
9057 		kev_req->kev_subclass = ev_pcb->evp_subclass_filter;
9058 		break;
9059 	case SIOCGKEVVENDOR:
9060 		kev_vendor = (struct kev_vendor_code *)data;
9061 		/* Make sure string is NULL terminated */
9062 		kev_vendor->vendor_string[KEV_VENDOR_CODE_MAX_STR_LEN - 1] = 0;
9063 		return net_str_id_find_internal(kev_vendor->vendor_string,
9064 		           &kev_vendor->vendor_code, NSI_VENDOR_CODE, 0);
9065 	default:
9066 		return ENOTSUP;
9067 	}
9068 
9069 	return 0;
9070 }
9071 
9072 int
9073 kevt_getstat SYSCTL_HANDLER_ARGS
9074 {
9075 #pragma unused(oidp, arg1, arg2)
9076 	int error = 0;
9077 
9078 	lck_rw_lock_shared(&kev_rwlock);
9079 
9080 	if (req->newptr != USER_ADDR_NULL) {
9081 		error = EPERM;
9082 		goto done;
9083 	}
9084 	if (req->oldptr == USER_ADDR_NULL) {
9085 		req->oldidx = sizeof(struct kevtstat);
9086 		goto done;
9087 	}
9088 
9089 	error = SYSCTL_OUT(req, &kevtstat,
9090 	    MIN(sizeof(struct kevtstat), req->oldlen));
9091 done:
9092 	lck_rw_done(&kev_rwlock);
9093 
9094 	return error;
9095 }
9096 
/*
 * sysctl handler for net.systm.kevt.pcblist: emit an xsystmgen header,
 * then one record (xkevtpcb + xsocket_n + two xsockbuf_n + xsockstat_n)
 * per kernel event PCB, then a trailing xsystmgen so userspace can
 * detect generation changes that occurred during the walk.
 */
__private_extern__ int
kevt_pcblist SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	int error = 0;
	uint64_t n, i;
	struct xsystmgen xsg;
	void *buf = NULL;
	/* One staging buffer sized for a full per-PCB record, 64-bit aligned. */
	size_t item_size = ROUNDUP64(sizeof(struct xkevtpcb)) +
	    ROUNDUP64(sizeof(struct xsocket_n)) +
	    2 * ROUNDUP64(sizeof(struct xsockbuf_n)) +
	    ROUNDUP64(sizeof(struct xsockstat_n));
	struct kern_event_pcb  *ev_pcb;

	buf = kalloc_data(item_size, Z_WAITOK_ZERO_NOFAIL);

	lck_rw_lock_shared(&kev_rwlock);

	n = kevtstat.kes_pcbcount;

	if (req->oldptr == USER_ADDR_NULL) {
		/* Size probe: report current count plus 12.5% slack. */
		req->oldidx = (size_t) ((n + n / 8) * item_size);
		goto done;
	}
	if (req->newptr != USER_ADDR_NULL) {
		error = EPERM;
		goto done;
	}
	/* Leading generation header. */
	bzero(&xsg, sizeof(xsg));
	xsg.xg_len = sizeof(xsg);
	xsg.xg_count = n;
	xsg.xg_gen = kevtstat.kes_gencnt;
	xsg.xg_sogen = so_gencnt;
	error = SYSCTL_OUT(req, &xsg, sizeof(xsg));
	if (error) {
		goto done;
	}
	/*
	 * We are done if there is no pcb
	 */
	if (n == 0) {
		goto done;
	}

	i = 0;
	for (i = 0, ev_pcb = LIST_FIRST(&kern_event_head);
	    i < n && ev_pcb != NULL;
	    i++, ev_pcb = LIST_NEXT(ev_pcb, evp_link)) {
		/* Carve the staging buffer into the record's sub-structures. */
		struct xkevtpcb *xk = (struct xkevtpcb *)buf;
		struct xsocket_n *xso = (struct xsocket_n *)
		    ADVANCE64(xk, sizeof(*xk));
		struct xsockbuf_n *xsbrcv = (struct xsockbuf_n *)
		    ADVANCE64(xso, sizeof(*xso));
		struct xsockbuf_n *xsbsnd = (struct xsockbuf_n *)
		    ADVANCE64(xsbrcv, sizeof(*xsbrcv));
		struct xsockstat_n *xsostats = (struct xsockstat_n *)
		    ADVANCE64(xsbsnd, sizeof(*xsbsnd));

		bzero(buf, item_size);

		lck_mtx_lock(&ev_pcb->evp_mtx);

		xk->kep_len = sizeof(struct xkevtpcb);
		xk->kep_kind = XSO_EVT;
		xk->kep_evtpcb = (uint64_t)VM_KERNEL_ADDRHASH(ev_pcb);
		xk->kep_vendor_code_filter = ev_pcb->evp_vendor_code_filter;
		xk->kep_class_filter = ev_pcb->evp_class_filter;
		xk->kep_subclass_filter = ev_pcb->evp_subclass_filter;

		sotoxsocket_n(ev_pcb->evp_socket, xso);
		sbtoxsockbuf_n(ev_pcb->evp_socket ?
		    &ev_pcb->evp_socket->so_rcv : NULL, xsbrcv);
		sbtoxsockbuf_n(ev_pcb->evp_socket ?
		    &ev_pcb->evp_socket->so_snd : NULL, xsbsnd);
		sbtoxsockstat_n(ev_pcb->evp_socket, xsostats);

		lck_mtx_unlock(&ev_pcb->evp_mtx);

		/*
		 * NOTE(review): an error here does not break out of the
		 * loop; the walk continues and only the last iteration's
		 * error is observed below.
		 */
		error = SYSCTL_OUT(req, buf, item_size);
	}

	if (error == 0) {
		/*
		 * Give the user an updated idea of our state.
		 * If the generation differs from what we told
		 * her before, she knows that something happened
		 * while we were processing this request, and it
		 * might be necessary to retry.
		 */
		bzero(&xsg, sizeof(xsg));
		xsg.xg_len = sizeof(xsg);
		xsg.xg_count = n;
		xsg.xg_gen = kevtstat.kes_gencnt;
		xsg.xg_sogen = so_gencnt;
		error = SYSCTL_OUT(req, &xsg, sizeof(xsg));
		if (error) {
			goto done;
		}
	}

done:
	lck_rw_done(&kev_rwlock);

	kfree_data(buf, item_size);
	return error;
}
9203 
9204 #endif /* SOCKETS */
9205 
9206 
/*
 * Fill in a kqueue_info for proc_info/libproc consumers.  The kqueue's
 * state is reported as stat-like fields (count as size, kevent struct
 * size as block size) plus the PROC_KQUEUE_* state flags.
 */
int
fill_kqueueinfo(kqueue_t kqu, struct kqueue_info * kinfo)
{
	struct vinfo_stat * st;

	st = &kinfo->kq_stat;

	/* Number of pending events doubles as the "file size". */
	st->vst_size = kqu.kq->kq_count;
	/* Block size reflects which kevent structure flavor is in use. */
	if (kqu.kq->kq_state & KQ_KEV_QOS) {
		st->vst_blksize = sizeof(struct kevent_qos_s);
	} else if (kqu.kq->kq_state & KQ_KEV64) {
		st->vst_blksize = sizeof(struct kevent64_s);
	} else {
		st->vst_blksize = sizeof(struct kevent);
	}
	st->vst_mode = S_IFIFO;
	/* Only dynamic kqueues (workloops) have an identity to report. */
	st->vst_ino = (kqu.kq->kq_state & KQ_DYNAMIC) ?
	    kqu.kqwl->kqwl_dynamicid : 0;

	/* flags exported to libproc as PROC_KQUEUE_* (sys/proc_info.h) */
#define PROC_KQUEUE_MASK (KQ_SLEEP|KQ_KEV32|KQ_KEV64|KQ_KEV_QOS|KQ_WORKQ|KQ_WORKLOOP)
	/* The mask above relies on the kernel and libproc bits matching 1:1. */
	static_assert(PROC_KQUEUE_SLEEP == KQ_SLEEP);
	static_assert(PROC_KQUEUE_32 == KQ_KEV32);
	static_assert(PROC_KQUEUE_64 == KQ_KEV64);
	static_assert(PROC_KQUEUE_QOS == KQ_KEV_QOS);
	static_assert(PROC_KQUEUE_WORKQ == KQ_WORKQ);
	static_assert(PROC_KQUEUE_WORKLOOP == KQ_WORKLOOP);
	kinfo->kq_state = kqu.kq->kq_state & PROC_KQUEUE_MASK;
	/* File-backed kqueues additionally report select() registration. */
	if ((kqu.kq->kq_state & (KQ_WORKLOOP | KQ_WORKQ)) == 0) {
		if (kqu.kqf->kqf_sel.si_flags & SI_RECORDED) {
			kinfo->kq_state |= PROC_KQUEUE_SELECT;
		}
	}

	return 0;
}
9243 
/*
 * Fill in a kqueue_dyninfo for a workloop kqueue: the base
 * kqueue_info plus servicer/owner thread ids, thread request state,
 * and the workloop's scheduling parameters.  Returns EINVAL if kqwl
 * is not actually a workloop.
 */
static int
fill_kqueue_dyninfo(struct kqworkloop *kqwl, struct kqueue_dyninfo *kqdi)
{
	workq_threadreq_t kqr = &kqwl->kqwl_request;
	workq_threadreq_param_t trp = {};
	int err;

	if ((kqwl->kqwl_state & KQ_WORKLOOP) == 0) {
		return EINVAL;
	}

	if ((err = fill_kqueueinfo(&kqwl->kqwl_kqueue, &kqdi->kqdi_info))) {
		return err;
	}

	/* Snapshot the dynamic fields under the kq lock. */
	kqlock(kqwl);

	kqdi->kqdi_servicer = thread_tid(kqr_thread(kqr));
	kqdi->kqdi_owner = thread_tid(kqwl->kqwl_owner);
	kqdi->kqdi_request_state = kqr->tr_state;
	kqdi->kqdi_async_qos = kqr->tr_kq_qos_index;
	kqdi->kqdi_events_qos = kqr->tr_kq_override_index;
	kqdi->kqdi_sync_waiters = 0;
	kqdi->kqdi_sync_waiter_qos = 0;

	/* Decode the workloop's creation-time scheduling parameters. */
	trp.trp_value = kqwl->kqwl_params;
	if (trp.trp_flags & TRP_PRIORITY) {
		kqdi->kqdi_pri = trp.trp_pri;
	} else {
		kqdi->kqdi_pri = 0;
	}

	if (trp.trp_flags & TRP_POLICY) {
		kqdi->kqdi_pol = trp.trp_pol;
	} else {
		kqdi->kqdi_pol = 0;
	}

	if (trp.trp_flags & TRP_CPUPERCENT) {
		kqdi->kqdi_cpupercent = trp.trp_cpupercent;
	} else {
		kqdi->kqdi_cpupercent = 0;
	}

	kqunlock(kqwl);

	return 0;
}
9292 
9293 
/*
 * Walk a knote list and, for each knote attached to "kq", emit a
 * kevent_extinfo record into buf (until buflen records are filled).
 * Returns the running total of matching knotes, which may exceed
 * buflen — callers use the total to report truncation.
 */
static unsigned long
kevent_extinfo_emit(struct kqueue *kq, struct knote *kn, struct kevent_extinfo *buf,
    unsigned long buflen, unsigned long nknotes)
{
	for (; kn; kn = SLIST_NEXT(kn, kn_link)) {
		if (kq == knote_get_kq(kn)) {
			if (nknotes < buflen) {
				struct kevent_extinfo *info = &buf[nknotes];

				/* Snapshot the knote's fields under the kq lock. */
				kqlock(kq);

				/* Let filters scrub fields they consider sensitive. */
				if (knote_fops(kn)->f_sanitized_copyout) {
					knote_fops(kn)->f_sanitized_copyout(kn, &info->kqext_kev);
				} else {
					info->kqext_kev         = *(struct kevent_qos_s *)&kn->kn_kevent;
				}

				if (knote_has_qos(kn)) {
					info->kqext_kev.qos =
					    _pthread_priority_thread_qos_fast(kn->kn_qos);
				} else {
					info->kqext_kev.qos = kn->kn_qos_override;
				}
				info->kqext_kev.filter |= 0xff00; /* sign extend filter */
				info->kqext_kev.xflags  = 0; /* this is where sfflags lives */
				info->kqext_kev.data    = 0; /* this is where sdata lives */
				info->kqext_sdata       = kn->kn_sdata;
				info->kqext_status      = kn->kn_status;
				info->kqext_sfflags     = kn->kn_sfflags;

				kqunlock(kq);
			}

			/* we return total number of knotes, which may be more than requested */
			nknotes++;
		}
	}

	return nknotes;
}
9334 
/*
 * Copy out the dynamic kqueue (workloop) ids of a process to a user
 * buffer.  *nkqueues_out receives the number of ids copied (capped at
 * PROC_PIDDYNKQUEUES_MAX), even if more workloops exist than fit.
 * Periodically drops and retakes the kq hash lock to bound preemption
 * disablement, bailing out early if the hash was resized meanwhile.
 */
int
kevent_copyout_proc_dynkqids(void *proc, user_addr_t ubuf, uint32_t ubufsize,
    int32_t *nkqueues_out)
{
	proc_t p = (proc_t)proc;
	struct filedesc *fdp = &p->p_fd;
	unsigned int nkqueues = 0;
	unsigned long ubuflen = ubufsize / sizeof(kqueue_id_t);
	size_t buflen, bufsize;
	kqueue_id_t *kq_ids = NULL;
	int err = 0;

	assert(p != NULL);

	if (ubuf == USER_ADDR_NULL && ubufsize != 0) {
		err = EINVAL;
		goto out;
	}

	/* Cap the kernel staging buffer regardless of the user's size. */
	buflen = MIN(ubuflen, PROC_PIDDYNKQUEUES_MAX);

	if (ubuflen != 0) {
		if (os_mul_overflow(sizeof(kqueue_id_t), buflen, &bufsize)) {
			err = ERANGE;
			goto out;
		}
		kq_ids = (kqueue_id_t *)kalloc_data(bufsize, Z_WAITOK | Z_ZERO);
		if (!kq_ids) {
			err = ENOMEM;
			goto out;
		}
	}

	kqhash_lock(fdp);

	u_long kqhashmask = fdp->fd_kqhashmask;
	if (kqhashmask > 0) {
		for (uint32_t i = 0; i < kqhashmask + 1; i++) {
			struct kqworkloop *kqwl;

			LIST_FOREACH(kqwl, &fdp->fd_kqhash[i], kqwl_hashlink) {
				/* report the number of kqueues, even if they don't all fit */
				if (nkqueues < buflen) {
					kq_ids[nkqueues] = kqwl->kqwl_dynamicid;
				}
				nkqueues++;
			}

			/*
			 * Drop the kqhash lock and take it again to give some breathing room
			 */
			kqhash_unlock(fdp);
			kqhash_lock(fdp);

			/*
			 * Reevaluate to see if we have raced with someone who changed this -
			 * if we have, we should bail out with the set of info captured so far
			 */
			if (fdp->fd_kqhashmask != kqhashmask) {
				break;
			}
		}
	}

	kqhash_unlock(fdp);

	if (kq_ids) {
		size_t copysize;
		if (os_mul_overflow(sizeof(kqueue_id_t), MIN(buflen, nkqueues), &copysize)) {
			err = ERANGE;
			goto out;
		}

		assert(ubufsize >= copysize);
		err = copyout(kq_ids, ubuf, copysize);
	}

out:
	if (kq_ids) {
		kfree_data(kq_ids, bufsize);
	}

	if (!err) {
		*nkqueues_out = (int)min(nkqueues, PROC_PIDDYNKQUEUES_MAX);
	}
	return err;
}
9422 
/*
 * Copy out info about one dynamic kqueue (workloop) identified by
 * kq_id.  Depending on the user buffer size, emits either a full
 * kqueue_dyninfo or just the embedded kqueue_info (older callers).
 * Returns ESRCH if no such workloop, ENOBUFS if the buffer is too
 * small for even the base structure.
 */
int
kevent_copyout_dynkqinfo(void *proc, kqueue_id_t kq_id, user_addr_t ubuf,
    uint32_t ubufsize, int32_t *size_out)
{
	proc_t p = (proc_t)proc;
	struct kqworkloop *kqwl;
	int err = 0;
	struct kqueue_dyninfo kqdi = { };

	assert(p != NULL);

	if (ubufsize < sizeof(struct kqueue_info)) {
		return ENOBUFS;
	}

	/* Hold a reference so the workloop can't be freed under us. */
	kqwl = kqworkloop_hash_lookup_and_retain(&p->p_fd, kq_id);
	if (!kqwl) {
		return ESRCH;
	}

	/*
	 * backward compatibility: allow the argument to this call to only be
	 * a struct kqueue_info
	 */
	if (ubufsize >= sizeof(struct kqueue_dyninfo)) {
		ubufsize = sizeof(struct kqueue_dyninfo);
		err = fill_kqueue_dyninfo(kqwl, &kqdi);
	} else {
		ubufsize = sizeof(struct kqueue_info);
		err = fill_kqueueinfo(&kqwl->kqwl_kqueue, &kqdi.kqdi_info);
	}
	if (err == 0 && (err = copyout(&kqdi, ubuf, ubufsize)) == 0) {
		*size_out = ubufsize;
	}
	kqworkloop_release(kqwl);
	return err;
}
9460 
9461 int
9462 kevent_copyout_dynkqextinfo(void *proc, kqueue_id_t kq_id, user_addr_t ubuf,
9463     uint32_t ubufsize, int32_t *nknotes_out)
9464 {
9465 	proc_t p = (proc_t)proc;
9466 	struct kqworkloop *kqwl;
9467 	int err;
9468 
9469 	kqwl = kqworkloop_hash_lookup_and_retain(&p->p_fd, kq_id);
9470 	if (!kqwl) {
9471 		return ESRCH;
9472 	}
9473 
9474 	err = pid_kqueue_extinfo(p, &kqwl->kqwl_kqueue, ubuf, ubufsize, nknotes_out);
9475 	kqworkloop_release(kqwl);
9476 	return err;
9477 }
9478 
/*
 * Copy out kevent_extinfo records for every knote of process "p" that
 * is attached to kqueue "kq".  Scans both the fd-indexed knote list
 * and the knote hash, dropping and retaking the respective lock per
 * bucket to bound preemption disablement.  *retval receives the total
 * number of matching knotes (capped), even if they didn't all fit.
 */
int
pid_kqueue_extinfo(proc_t p, struct kqueue *kq, user_addr_t ubuf,
    uint32_t bufsize, int32_t *retval)
{
	struct knote *kn;
	int i;
	int err = 0;
	struct filedesc *fdp = &p->p_fd;
	unsigned long nknotes = 0;
	unsigned long buflen = bufsize / sizeof(struct kevent_extinfo);
	struct kevent_extinfo *kqext = NULL;

	/* arbitrary upper limit to cap kernel memory usage, copyout size, etc. */
	buflen = MIN(buflen, PROC_PIDFDKQUEUE_KNOTES_MAX);

	kqext = (struct kevent_extinfo *)kalloc_data(buflen * sizeof(struct kevent_extinfo), Z_WAITOK | Z_ZERO);
	if (kqext == NULL) {
		err = ENOMEM;
		goto out;
	}

	proc_fdlock(p);
	u_long fd_knlistsize = fdp->fd_knlistsize;
	struct klist *fd_knlist = fdp->fd_knlist;

	for (i = 0; i < fd_knlistsize; i++) {
		kn = SLIST_FIRST(&fd_knlist[i]);
		nknotes = kevent_extinfo_emit(kq, kn, kqext, buflen, nknotes);

		/* Drop and retake the fd lock to give some breathing room. */
		proc_fdunlock(p);
		proc_fdlock(p);
		/*
		 * Reevaluate to see if we have raced with someone who changed this -
		 * if we have, we return the set of info for fd_knlistsize we knew
		 * in the beginning except if knotes_dealloc interleaves with us.
		 * In that case, we bail out early with the set of info captured so far.
		 */
		if (fd_knlistsize != fdp->fd_knlistsize) {
			if (fdp->fd_knlistsize) {
				/* kq_add_knote might grow fdp->fd_knlist. */
				fd_knlist = fdp->fd_knlist;
			} else {
				break;
			}
		}
	}
	proc_fdunlock(p);

	/* Second pass: knotes that live in the fileless knote hash. */
	knhash_lock(fdp);
	u_long knhashmask = fdp->fd_knhashmask;

	if (knhashmask != 0) {
		for (i = 0; i < (int)knhashmask + 1; i++) {
			kn = SLIST_FIRST(&fdp->fd_knhash[i]);
			nknotes = kevent_extinfo_emit(kq, kn, kqext, buflen, nknotes);

			knhash_unlock(fdp);
			knhash_lock(fdp);

			/*
			 * Reevaluate to see if we have raced with someone who changed this -
			 * if we have, we should bail out with the set of info captured so far
			 */
			if (fdp->fd_knhashmask != knhashmask) {
				break;
			}
		}
	}
	knhash_unlock(fdp);

	assert(bufsize >= sizeof(struct kevent_extinfo) * MIN(buflen, nknotes));
	err = copyout(kqext, ubuf, sizeof(struct kevent_extinfo) * MIN(buflen, nknotes));

out:
	kfree_data(kqext, buflen * sizeof(struct kevent_extinfo));

	if (!err) {
		*retval = (int32_t)MIN(nknotes, PROC_PIDFDKQUEUE_KNOTES_MAX);
	}
	return err;
}
9560 
9561 static unsigned int
9562 klist_copy_udata(struct klist *list, uint64_t *buf,
9563     unsigned int buflen, unsigned int nknotes)
9564 {
9565 	struct knote *kn;
9566 	SLIST_FOREACH(kn, list, kn_link) {
9567 		if (nknotes < buflen) {
9568 			/*
9569 			 * kevent_register will always set kn_udata atomically
9570 			 * so that we don't have to take any kqlock here.
9571 			 */
9572 			buf[nknotes] = os_atomic_load_wide(&kn->kn_udata, relaxed);
9573 		}
9574 		/* we return total number of knotes, which may be more than requested */
9575 		nknotes++;
9576 	}
9577 
9578 	return nknotes;
9579 }
9580 
/*
 * Gather user pointers for a process's kevent state into buf:
 * kn_udata values from the fd knote list and the knote hash, then
 * dynamic kqueue ids from the kq hash.  Returns the total number of
 * pointers found, which may exceed what fit in buf.
 */
int
kevent_proc_copy_uptrs(void *proc, uint64_t *buf, uint32_t bufsize)
{
	proc_t p = (proc_t)proc;
	struct filedesc *fdp = &p->p_fd;
	unsigned int nuptrs = 0;
	unsigned int buflen = bufsize / sizeof(uint64_t);
	struct kqworkloop *kqwl;
	u_long size = 0;
	struct klist *fd_knlist = NULL;

	if (buflen > 0) {
		assert(buf != NULL);
	}

	/*
	 * Copyout the uptrs as much as possible but make sure to drop the respective
	 * locks and take them again periodically so that we don't blow through
	 * preemption disabled timeouts. Always reevaluate to see if we have raced
	 * with someone who changed size of the hash - if we have, we return info for
	 * the size of the hash we knew in the beginning except if it drops to 0.
	 * In that case, we bail out with the set of info captured so far
	 */
	proc_fdlock(p);
	size = fdp->fd_knlistsize;
	fd_knlist = fdp->fd_knlist;

	for (int i = 0; i < size; i++) {
		nuptrs = klist_copy_udata(&fd_knlist[i], buf, buflen, nuptrs);

		proc_fdunlock(p);
		proc_fdlock(p);
		if (size != fdp->fd_knlistsize) {
			if (fdp->fd_knlistsize) {
				/* kq_add_knote might grow fdp->fd_knlist. */
				fd_knlist = fdp->fd_knlist;
			} else {
				break;
			}
		}
	}
	proc_fdunlock(p);

	/* Second domain: the fileless knote hash, under its own lock. */
	knhash_lock(fdp);
	size = fdp->fd_knhashmask;

	if (size != 0) {
		for (size_t i = 0; i < size + 1; i++) {
			nuptrs = klist_copy_udata(&fdp->fd_knhash[i], buf, buflen, nuptrs);

			knhash_unlock(fdp);
			knhash_lock(fdp);
			/* The only path that can interleave with us today is knotes_dealloc. */
			if (size != fdp->fd_knhashmask) {
				break;
			}
		}
	}
	knhash_unlock(fdp);

	/* Third domain: dynamic kqueue ids from the kq hash. */
	kqhash_lock(fdp);
	size = fdp->fd_kqhashmask;

	if (size != 0) {
		for (size_t i = 0; i < size + 1; i++) {
			LIST_FOREACH(kqwl, &fdp->fd_kqhash[i], kqwl_hashlink) {
				if (nuptrs < buflen) {
					buf[nuptrs] = kqwl->kqwl_dynamicid;
				}
				nuptrs++;
			}

			kqhash_unlock(fdp);
			kqhash_lock(fdp);
			if (size != fdp->fd_kqhashmask) {
				break;
			}
		}
	}
	kqhash_unlock(fdp);

	return (int)nuptrs;
}
9664 
/*
 * Write return-to-kernel AST flags into the thread's userspace TSD
 * slot so user code can see that a bound workloop has pending events.
 * A no-op if the thread has no bound kqueue request or no TSD address.
 */
static void
kevent_set_return_to_kernel_user_tsd(proc_t p, thread_t thread)
{
	uint64_t ast_addr;
	bool proc_is_64bit = !!(p->p_flag & P_LP64);
	size_t user_addr_size = proc_is_64bit ? 8 : 4;
	uint32_t ast_flags32 = 0;
	uint64_t ast_flags64 = 0;
	struct uthread *ut = get_bsdthread_info(thread);

	if (ut->uu_kqr_bound != NULL) {
		ast_flags64 |= R2K_WORKLOOP_PENDING_EVENTS;
	}

	/* Nothing to tell userspace. */
	if (ast_flags64 == 0) {
		return;
	}

	/* 32-bit processes get a narrowed copy of the flags. */
	if (!(p->p_flag & P_LP64)) {
		ast_flags32 = (uint32_t)ast_flags64;
		assert(ast_flags64 < 0x100000000ull);
	}

	ast_addr = thread_rettokern_addr(thread);
	if (ast_addr == 0) {
		return;
	}

	/* Best effort: a copyout failure is only logged, not propagated. */
	if (copyout((proc_is_64bit ? (void *)&ast_flags64 : (void *)&ast_flags32),
	    (user_addr_t)ast_addr,
	    user_addr_size) != 0) {
		printf("pid %d (tid:%llu): copyout of return_to_kernel ast flags failed with "
		    "ast_addr = %llu\n", proc_getpid(p), thread_tid(current_thread()), ast_addr);
	}
}
9700 
9701 /*
9702  * Semantics of writing to TSD value:
9703  *
9704  * 1. It is written to by the kernel and cleared by userspace.
9705  * 2. When the userspace code clears the TSD field, it takes responsibility for
9706  * taking action on the quantum expiry action conveyed by kernel.
9707  * 3. The TSD value is always cleared upon entry into userspace and upon exit of
9708  * userspace back to kernel to make sure that it is never leaked across thread
9709  * requests.
9710  */
/*
 * Write workqueue-quantum-expiry flags into the thread's userspace TSD
 * slot (see the semantics comment above).  Uses the atomic copyout
 * variants sized for the process's pointer width; failures are only
 * logged on DEBUG/DEVELOPMENT kernels.
 */
void
kevent_set_workq_quantum_expiry_user_tsd(proc_t p, thread_t thread,
    uint64_t flags)
{
	uint64_t ast_addr;
	bool proc_is_64bit = !!(p->p_flag & P_LP64);
	uint32_t ast_flags32 = 0;
	uint64_t ast_flags64 = flags;

	if (ast_flags64 == 0) {
		return;
	}

	/* 32-bit processes get a narrowed copy of the flags. */
	if (!(p->p_flag & P_LP64)) {
		ast_flags32 = (uint32_t)ast_flags64;
		assert(ast_flags64 < 0x100000000ull);
	}

	ast_addr = thread_wqquantum_addr(thread);
	assert(ast_addr != 0);

	if (proc_is_64bit) {
		if (copyout_atomic64(ast_flags64, (user_addr_t) ast_addr)) {
#if DEBUG || DEVELOPMENT
			printf("pid %d (tid:%llu): copyout of workq quantum ast flags failed with "
			    "ast_addr = %llu\n", proc_getpid(p), thread_tid(thread), ast_addr);
#endif
		}
	} else {
		if (copyout_atomic32(ast_flags32, (user_addr_t) ast_addr)) {
#if DEBUG || DEVELOPMENT
			printf("pid %d (tid:%llu): copyout of workq quantum ast flags failed with "
			    "ast_addr = %llu\n", proc_getpid(p), thread_tid(thread), ast_addr);
#endif
		}
	}
}
9748 
9749 void
9750 kevent_ast(thread_t thread, uint16_t bits)
9751 {
9752 	proc_t p = current_proc();
9753 
9754 
9755 	if (bits & AST_KEVENT_REDRIVE_THREADREQ) {
9756 		workq_kern_threadreq_redrive(p, WORKQ_THREADREQ_CAN_CREATE_THREADS);
9757 	}
9758 	if (bits & AST_KEVENT_RETURN_TO_KERNEL) {
9759 		kevent_set_return_to_kernel_user_tsd(p, thread);
9760 	}
9761 
9762 	if (bits & AST_KEVENT_WORKQ_QUANTUM_EXPIRED) {
9763 		workq_kern_quantum_expiry_reevaluate(p, thread);
9764 	}
9765 }
9766 
9767 #if DEVELOPMENT || DEBUG
9768 
9769 #define KEVENT_SYSCTL_BOUND_ID 1
9770 
/*
 * DEBUG/DEVELOPMENT sysctl handler (kern.kevent.bound_id): report the
 * dynamic id of the workloop bound to the calling thread, -1 for a
 * bound non-workloop request, or 0 if nothing is bound.  Read-only.
 */
static int
kevent_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg2)
	uintptr_t type = (uintptr_t)arg1;
	uint64_t bound_id = 0;

	if (type != KEVENT_SYSCTL_BOUND_ID) {
		return EINVAL;
	}

	if (req->newptr) {
		return EINVAL;
	}

	struct uthread *ut = current_uthread();
	if (!ut) {
		return EFAULT;
	}

	workq_threadreq_t kqr = ut->uu_kqr_bound;
	if (kqr) {
		if (kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
			bound_id = kqr_kqworkloop(kqr)->kqwl_dynamicid;
		} else {
			/* Bound to a workq (non-workloop) request. */
			bound_id = -1;
		}
	}

	return sysctl_io_number(req, bound_id, sizeof(bound_id), NULL, NULL);
}
9802 
/* sysctl subtree kern.kevent (DEBUG/DEVELOPMENT only). */
SYSCTL_NODE(_kern, OID_AUTO, kevent, CTLFLAG_RW | CTLFLAG_LOCKED, 0,
    "kevent information");

/* kern.kevent.bound_id: dynamic kqueue id bound to the calling thread. */
SYSCTL_PROC(_kern_kevent, OID_AUTO, bound_id,
    CTLTYPE_QUAD | CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_MASKED,
    (void *)KEVENT_SYSCTL_BOUND_ID,
    sizeof(kqueue_id_t), kevent_sysctl, "Q",
    "get the ID of the bound kqueue");
9811 
9812 #endif /* DEVELOPMENT || DEBUG */
9813