xref: /xnu-10063.101.15/bsd/kern/kern_event.c (revision 94d3b452840153a99b38a3a9659680b2a006908e)
1 /*
2  * Copyright (c) 2000-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  *
28  */
29 /*-
30  * Copyright (c) 1999,2000,2001 Jonathan Lemon <[email protected]>
31  * All rights reserved.
32  *
33  * Redistribution and use in source and binary forms, with or without
34  * modification, are permitted provided that the following conditions
35  * are met:
36  * 1. Redistributions of source code must retain the above copyright
37  *    notice, this list of conditions and the following disclaimer.
38  * 2. Redistributions in binary form must reproduce the above copyright
39  *    notice, this list of conditions and the following disclaimer in the
40  *    documentation and/or other materials provided with the distribution.
41  *
42  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
43  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
46  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52  * SUCH DAMAGE.
53  */
54 /*
55  *	@(#)kern_event.c       1.0 (3/31/2000)
56  */
57 #include <stdint.h>
58 #include <machine/atomic.h>
59 
60 #include <sys/param.h>
61 #include <sys/systm.h>
62 #include <sys/filedesc.h>
63 #include <sys/kernel.h>
64 #include <sys/proc_internal.h>
65 #include <sys/kauth.h>
66 #include <sys/malloc.h>
67 #include <sys/unistd.h>
68 #include <sys/file_internal.h>
69 #include <sys/fcntl.h>
70 #include <sys/select.h>
71 #include <sys/queue.h>
72 #include <sys/event.h>
73 #include <sys/eventvar.h>
74 #include <sys/protosw.h>
75 #include <sys/socket.h>
76 #include <sys/socketvar.h>
77 #include <sys/stat.h>
78 #include <sys/syscall.h> // SYS_* constants
79 #include <sys/sysctl.h>
80 #include <sys/uio.h>
81 #include <sys/sysproto.h>
82 #include <sys/user.h>
83 #include <sys/vnode_internal.h>
84 #include <string.h>
85 #include <sys/proc_info.h>
86 #include <sys/codesign.h>
87 #include <sys/pthread_shims.h>
88 #include <sys/kdebug.h>
89 #include <os/base.h>
90 #include <pexpert/pexpert.h>
91 
92 #include <kern/thread_group.h>
93 #include <kern/locks.h>
94 #include <kern/clock.h>
95 #include <kern/cpu_data.h>
96 #include <kern/policy_internal.h>
97 #include <kern/thread_call.h>
98 #include <kern/sched_prim.h>
99 #include <kern/waitq.h>
100 #include <kern/zalloc.h>
101 #include <kern/kalloc.h>
102 #include <kern/assert.h>
103 #include <kern/ast.h>
104 #include <kern/thread.h>
105 #include <kern/kcdata.h>
106 #include <kern/work_interval.h>
107 
108 #include <pthread/priority_private.h>
109 #include <pthread/workqueue_syscalls.h>
110 #include <pthread/workqueue_internal.h>
111 #include <libkern/libkern.h>
112 
113 #include <os/log.h>
114 
115 #include "net/net_str_id.h"
116 
117 #if SKYWALK && defined(XNU_TARGET_OS_OSX)
118 #include <skywalk/lib/net_filter_event.h>
119 
120 extern bool net_check_compatible_alf(void);
121 #endif /* SKYWALK && XNU_TARGET_OS_OSX */
122 
123 #include <mach/task.h>
124 #include <libkern/section_keywords.h>
125 
126 #if CONFIG_MEMORYSTATUS
127 #include <sys/kern_memorystatus.h>
128 #endif
129 
130 #if DEVELOPMENT || DEBUG
131 #define KEVENT_PANIC_ON_WORKLOOP_OWNERSHIP_LEAK  (1U << 0)
132 #define KEVENT_PANIC_ON_NON_ENQUEUED_PROCESS     (1U << 1)
133 TUNABLE(uint32_t, kevent_debug_flags, "kevent_debug", 0);
134 #endif
135 
136 static LCK_GRP_DECLARE(kq_lck_grp, "kqueue");
137 SECURITY_READ_ONLY_EARLY(vm_packing_params_t) kn_kq_packing_params =
138     VM_PACKING_PARAMS(KNOTE_KQ_PACKED);
139 
140 extern mach_port_name_t ipc_entry_name_mask(mach_port_name_t name); /* osfmk/ipc/ipc_entry.h */
141 extern int cansignal(struct proc *, kauth_cred_t, struct proc *, int); /* bsd/kern/kern_sig.c */
142 
143 #define KEV_EVTID(code) BSDDBG_CODE(DBG_BSD_KEVENT, (code))
144 
145 static int kqueue_select(struct fileproc *fp, int which, void *wq_link_id,
146     vfs_context_t ctx);
147 static int kqueue_close(struct fileglob *fg, vfs_context_t ctx);
148 static int kqueue_kqfilter(struct fileproc *fp, struct knote *kn,
149     struct kevent_qos_s *kev);
150 static int kqueue_drain(struct fileproc *fp, vfs_context_t ctx);
151 
152 static const struct fileops kqueueops = {
153 	.fo_type     = DTYPE_KQUEUE,
154 	.fo_read     = fo_no_read,
155 	.fo_write    = fo_no_write,
156 	.fo_ioctl    = fo_no_ioctl,
157 	.fo_select   = kqueue_select,
158 	.fo_close    = kqueue_close,
159 	.fo_drain    = kqueue_drain,
160 	.fo_kqfilter = kqueue_kqfilter,
161 };
162 
163 static inline int kevent_modern_copyout(struct kevent_qos_s *, user_addr_t *);
164 static int kevent_register_wait_prepare(struct knote *kn, struct kevent_qos_s *kev, int result);
165 static void kevent_register_wait_block(struct turnstile *ts, thread_t handoff_thread,
166     thread_continue_t cont, struct _kevent_register *cont_args) __dead2;
167 static void kevent_register_wait_return(struct _kevent_register *cont_args) __dead2;
168 static void kevent_register_wait_cleanup(struct knote *kn);
169 
170 static struct kqtailq *kqueue_get_suppressed_queue(kqueue_t kq, struct knote *kn);
171 static void kqueue_threadreq_initiate(struct kqueue *kq, workq_threadreq_t, kq_index_t qos, int flags);
172 
173 static void kqworkq_unbind(proc_t p, workq_threadreq_t);
174 static thread_qos_t kqworkq_unbind_locked(struct kqworkq *kqwq, workq_threadreq_t, thread_t thread);
175 static workq_threadreq_t kqworkq_get_request(struct kqworkq *kqwq, kq_index_t qos_index);
176 static void kqueue_update_iotier_override(kqueue_t kqu);
177 
178 static void kqworkloop_unbind(struct kqworkloop *kwql);
179 
/* Values accepted for the `how` argument of kqworkloop_unbind_locked(). */
enum kqwl_unbind_locked_mode {
	KQWL_OVERRIDE_DROP_IMMEDIATELY,
	KQWL_OVERRIDE_DROP_DELAYED,
};
184 static void kqworkloop_unbind_locked(struct kqworkloop *kwql, thread_t thread,
185     enum kqwl_unbind_locked_mode how);
186 static void kqworkloop_unbind_delayed_override_drop(thread_t thread);
187 static kq_index_t kqworkloop_override(struct kqworkloop *kqwl);
188 static void kqworkloop_set_overcommit(struct kqworkloop *kqwl);
/* Operation codes passed as `op` to kqworkloop_update_threads_qos(). */
enum {
	KQWL_UTQ_NONE,

	/*
	 * Wakeup QoS: the QoS of QUEUED knotes.
	 *
	 * It is accounted for through the events override kept in the
	 * kqr_override_index field, and is raised every time a knote gets
	 * queued at a given QoS. kqwl_wakeup_qos is a superset of the non
	 * empty knote buckets; it is recomputed after each event delivery.
	 */
	KQWL_UTQ_UPDATE_WAKEUP_QOS,
	KQWL_UTQ_RECOMPUTE_WAKEUP_QOS,

	KQWL_UTQ_UNBINDING, /* attempt to rebind */
	KQWL_UTQ_PARKING,

	/*
	 * Wakeup override: applies to suppressed knotes that fired again at
	 * a QoS higher than the one they are currently suppressed for.
	 * It is cleared once the suppressed knote list becomes empty.
	 */
	KQWL_UTQ_UPDATE_WAKEUP_OVERRIDE,
	KQWL_UTQ_RESET_WAKEUP_OVERRIDE,

	/*
	 * QoS index: the maximum QoS of an event enqueued on this workloop
	 * in userland. It is copied from the single EVFILT_WORKLOOP knote
	 * with NOTE_WL_THREAD_REQUEST set that a workloop may have; when no
	 * such knote exists this QoS is 0.
	 */
	KQWL_UTQ_SET_QOS_INDEX,
	KQWL_UTQ_REDRIVE_EVENTS,
};
219 static void kqworkloop_update_threads_qos(struct kqworkloop *kqwl, int op, kq_index_t qos);
220 static int kqworkloop_end_processing(struct kqworkloop *kqwl, int flags, int kevent_flags);
221 
222 static struct knote *knote_alloc(void);
223 static void knote_free(struct knote *kn);
224 static int kq_add_knote(struct kqueue *kq, struct knote *kn,
225     struct knote_lock_ctx *knlc, struct proc *p);
226 static struct knote *kq_find_knote_and_kq_lock(struct kqueue *kq,
227     struct kevent_qos_s *kev, bool is_fd, struct proc *p);
228 
229 static void knote_activate(kqueue_t kqu, struct knote *kn, int result);
230 static void knote_dequeue(kqueue_t kqu, struct knote *kn);
231 
232 static void knote_apply_touch(kqueue_t kqu, struct knote *kn,
233     struct kevent_qos_s *kev, int result);
234 static void knote_suppress(kqueue_t kqu, struct knote *kn);
235 static void knote_unsuppress(kqueue_t kqu, struct knote *kn);
236 static void knote_drop(kqueue_t kqu, struct knote *kn, struct knote_lock_ctx *knlc);
237 
238 // both these functions may dequeue the knote and it is up to the caller
239 // to enqueue the knote back
240 static void knote_adjust_qos(struct kqueue *kq, struct knote *kn, int result);
241 static void knote_reset_priority(kqueue_t kqu, struct knote *kn, pthread_priority_t pp);
242 
243 static ZONE_DEFINE(knote_zone, "knote zone",
244     sizeof(struct knote), ZC_CACHING | ZC_ZFREE_CLEARMEM);
245 static ZONE_DEFINE(kqfile_zone, "kqueue file zone",
246     sizeof(struct kqfile), ZC_ZFREE_CLEARMEM | ZC_NOTBITAG);
247 static ZONE_DEFINE(kqworkq_zone, "kqueue workq zone",
248     sizeof(struct kqworkq), ZC_ZFREE_CLEARMEM | ZC_NOTBITAG);
249 static ZONE_DEFINE(kqworkloop_zone, "kqueue workloop zone",
250     sizeof(struct kqworkloop), ZC_CACHING | ZC_ZFREE_CLEARMEM | ZC_NOTBITAG);
251 
/*
 * Fold `val` with itself shifted right by 8 bits, then clamp to `mask`.
 *
 * Every use of an argument is parenthesized so that expression arguments
 * (e.g. `a | b`) hash the same as the equivalent precomputed value.
 * Note that `val` is evaluated twice: do not pass expressions with side
 * effects.
 */
#define KN_HASH(val, mask)      (((val) ^ ((val) >> 8)) & (mask))
253 
254 static int filt_no_attach(struct knote *kn, struct kevent_qos_s *kev);
255 static void filt_no_detach(struct knote *kn);
256 static int filt_bad_event(struct knote *kn, long hint);
257 static int filt_bad_touch(struct knote *kn, struct kevent_qos_s *kev);
258 static int filt_bad_process(struct knote *kn, struct kevent_qos_s *kev);
259 
260 SECURITY_READ_ONLY_EARLY(static struct filterops) bad_filtops = {
261 	.f_attach  = filt_no_attach,
262 	.f_detach  = filt_no_detach,
263 	.f_event   = filt_bad_event,
264 	.f_touch   = filt_bad_touch,
265 	.f_process = filt_bad_process,
266 };
267 
268 #if CONFIG_MEMORYSTATUS
269 extern const struct filterops memorystatus_filtops;
270 #endif /* CONFIG_MEMORYSTATUS */
271 extern const struct filterops fs_filtops;
272 extern const struct filterops sig_filtops;
273 extern const struct filterops machport_attach_filtops;
274 extern const struct filterops mach_port_filtops;
275 extern const struct filterops mach_port_set_filtops;
276 extern const struct filterops pipe_nfiltops;
277 extern const struct filterops pipe_rfiltops;
278 extern const struct filterops pipe_wfiltops;
279 extern const struct filterops ptsd_kqops;
280 extern const struct filterops ptmx_kqops;
281 extern const struct filterops soread_filtops;
282 extern const struct filterops sowrite_filtops;
283 extern const struct filterops sock_filtops;
284 extern const struct filterops soexcept_filtops;
285 extern const struct filterops spec_filtops;
286 extern const struct filterops bpfread_filtops;
287 extern const struct filterops necp_fd_rfiltops;
288 #if SKYWALK
289 extern const struct filterops skywalk_channel_rfiltops;
290 extern const struct filterops skywalk_channel_wfiltops;
291 extern const struct filterops skywalk_channel_efiltops;
292 #endif /* SKYWALK */
293 extern const struct filterops fsevent_filtops;
294 extern const struct filterops vnode_filtops;
295 extern const struct filterops tty_filtops;
296 
297 const static struct filterops file_filtops;
298 const static struct filterops kqread_filtops;
299 const static struct filterops proc_filtops;
300 const static struct filterops timer_filtops;
301 const static struct filterops user_filtops;
302 const static struct filterops workloop_filtops;
303 #if CONFIG_EXCLAVES
304 extern const struct filterops exclaves_notification_filtops;
305 #endif /* CONFIG_EXCLAVES */
306 
307 /*
308  *
309  * Rules for adding new filters to the system:
310  * Public filters:
311  * - Add a new "EVFILT_" option value to bsd/sys/event.h (typically a negative value)
312  *   in the exported section of the header
313  * - Update the EVFILT_SYSCOUNT value to reflect the new addition
314  * - Add a filterops to the sysfilt_ops array. Public filters should be added at the end
315  *   of the Public Filters section in the array.
316  * Private filters:
317  * - Add a new "EVFILT_" value to bsd/sys/event_private.h (typically a positive value)
318  * - Update the EVFILTID_MAX value to reflect the new addition
319  * - Add a filterops to the sysfilt_ops. Private filters should be added at the end of
320  *   the Private filters section of the array.
321  */
322 static_assert(EVFILTID_MAX < UINT8_MAX, "kn_filtid expects this to be true");
323 static const struct filterops * const sysfilt_ops[EVFILTID_MAX] = {
324 	/* Public Filters */
325 	[~EVFILT_READ]                  = &file_filtops,
326 	[~EVFILT_WRITE]                 = &file_filtops,
327 	[~EVFILT_AIO]                   = &bad_filtops,
328 	[~EVFILT_VNODE]                 = &file_filtops,
329 	[~EVFILT_PROC]                  = &proc_filtops,
330 	[~EVFILT_SIGNAL]                = &sig_filtops,
331 	[~EVFILT_TIMER]                 = &timer_filtops,
332 	[~EVFILT_MACHPORT]              = &machport_attach_filtops,
333 	[~EVFILT_FS]                    = &fs_filtops,
334 	[~EVFILT_USER]                  = &user_filtops,
335 	[~EVFILT_UNUSED_11]             = &bad_filtops,
336 	[~EVFILT_VM]                    = &bad_filtops,
337 	[~EVFILT_SOCK]                  = &file_filtops,
338 #if CONFIG_MEMORYSTATUS
339 	[~EVFILT_MEMORYSTATUS]          = &memorystatus_filtops,
340 #else
341 	[~EVFILT_MEMORYSTATUS]          = &bad_filtops,
342 #endif
343 	[~EVFILT_EXCEPT]                = &file_filtops,
344 #if SKYWALK
345 	[~EVFILT_NW_CHANNEL]            = &file_filtops,
346 #else /* !SKYWALK */
347 	[~EVFILT_NW_CHANNEL]            = &bad_filtops,
348 #endif /* !SKYWALK */
349 	[~EVFILT_WORKLOOP]              = &workloop_filtops,
350 #if CONFIG_EXCLAVES
351 	[~EVFILT_EXCLAVES_NOTIFICATION] = &exclaves_notification_filtops,
352 #else /* !CONFIG_EXCLAVES */
353 	[~EVFILT_EXCLAVES_NOTIFICATION] = &bad_filtops,
354 #endif /* CONFIG_EXCLAVES*/
355 
356 	/* Private filters */
357 	[EVFILTID_KQREAD]               = &kqread_filtops,
358 	[EVFILTID_PIPE_N]               = &pipe_nfiltops,
359 	[EVFILTID_PIPE_R]               = &pipe_rfiltops,
360 	[EVFILTID_PIPE_W]               = &pipe_wfiltops,
361 	[EVFILTID_PTSD]                 = &ptsd_kqops,
362 	[EVFILTID_SOREAD]               = &soread_filtops,
363 	[EVFILTID_SOWRITE]              = &sowrite_filtops,
364 	[EVFILTID_SCK]                  = &sock_filtops,
365 	[EVFILTID_SOEXCEPT]             = &soexcept_filtops,
366 	[EVFILTID_SPEC]                 = &spec_filtops,
367 	[EVFILTID_BPFREAD]              = &bpfread_filtops,
368 	[EVFILTID_NECP_FD]              = &necp_fd_rfiltops,
369 #if SKYWALK
370 	[EVFILTID_SKYWALK_CHANNEL_W]    = &skywalk_channel_wfiltops,
371 	[EVFILTID_SKYWALK_CHANNEL_R]    = &skywalk_channel_rfiltops,
372 	[EVFILTID_SKYWALK_CHANNEL_E]    = &skywalk_channel_efiltops,
373 #else /* !SKYWALK */
374 	[EVFILTID_SKYWALK_CHANNEL_W]    = &bad_filtops,
375 	[EVFILTID_SKYWALK_CHANNEL_R]    = &bad_filtops,
376 	[EVFILTID_SKYWALK_CHANNEL_E]    = &bad_filtops,
377 #endif /* !SKYWALK */
378 	[EVFILTID_FSEVENT]              = &fsevent_filtops,
379 	[EVFILTID_VN]                   = &vnode_filtops,
380 	[EVFILTID_TTY]                  = &tty_filtops,
381 	[EVFILTID_PTMX]                 = &ptmx_kqops,
382 	[EVFILTID_MACH_PORT]            = &mach_port_filtops,
383 	[EVFILTID_MACH_PORT_SET]        = &mach_port_set_filtops,
384 
385 	/* fake filter for detached knotes, keep last */
386 	[EVFILTID_DETACHED]             = &bad_filtops,
387 };
388 
389 static inline bool
kqr_thread_bound(workq_threadreq_t kqr)390 kqr_thread_bound(workq_threadreq_t kqr)
391 {
392 	return kqr->tr_state == WORKQ_TR_STATE_BOUND;
393 }
394 
395 static inline bool
kqr_thread_requested_pending(workq_threadreq_t kqr)396 kqr_thread_requested_pending(workq_threadreq_t kqr)
397 {
398 	workq_tr_state_t tr_state = kqr->tr_state;
399 	return tr_state > WORKQ_TR_STATE_IDLE && tr_state < WORKQ_TR_STATE_BOUND;
400 }
401 
402 static inline bool
kqr_thread_requested(workq_threadreq_t kqr)403 kqr_thread_requested(workq_threadreq_t kqr)
404 {
405 	return kqr->tr_state != WORKQ_TR_STATE_IDLE;
406 }
407 
408 static inline thread_t
kqr_thread_fast(workq_threadreq_t kqr)409 kqr_thread_fast(workq_threadreq_t kqr)
410 {
411 	assert(kqr_thread_bound(kqr));
412 	return kqr->tr_thread;
413 }
414 
415 static inline thread_t
kqr_thread(workq_threadreq_t kqr)416 kqr_thread(workq_threadreq_t kqr)
417 {
418 	return kqr_thread_bound(kqr) ? kqr->tr_thread : THREAD_NULL;
419 }
420 
421 static inline struct kqworkloop *
kqr_kqworkloop(workq_threadreq_t kqr)422 kqr_kqworkloop(workq_threadreq_t kqr)
423 {
424 	if (kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
425 		return __container_of(kqr, struct kqworkloop, kqwl_request);
426 	}
427 	return NULL;
428 }
429 
430 static inline kqueue_t
kqr_kqueue(proc_t p,workq_threadreq_t kqr)431 kqr_kqueue(proc_t p, workq_threadreq_t kqr)
432 {
433 	kqueue_t kqu;
434 	if (kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
435 		kqu.kqwl = kqr_kqworkloop(kqr);
436 	} else {
437 		kqu.kqwq = p->p_fd.fd_wqkqueue;
438 		assert(kqr >= kqu.kqwq->kqwq_request &&
439 		    kqr < kqu.kqwq->kqwq_request + KQWQ_NBUCKETS);
440 	}
441 	return kqu;
442 }
443 
444 #if CONFIG_PREADOPT_TG
445 /* There are no guarantees about which locks are held when this is called */
446 inline thread_group_qos_t
kqr_preadopt_thread_group(workq_threadreq_t req)447 kqr_preadopt_thread_group(workq_threadreq_t req)
448 {
449 	struct kqworkloop *kqwl = kqr_kqworkloop(req);
450 	return kqwl ? os_atomic_load(&kqwl->kqwl_preadopt_tg, relaxed) : NULL;
451 }
452 
453 /* There are no guarantees about which locks are held when this is called */
_Atomic(thread_group_qos_t)454 inline _Atomic(thread_group_qos_t) *
455 kqr_preadopt_thread_group_addr(workq_threadreq_t req)
456 {
457 	struct kqworkloop *kqwl = kqr_kqworkloop(req);
458 	return kqwl ? (&kqwl->kqwl_preadopt_tg) : NULL;
459 }
460 #endif
461 
462 /*
463  * kqueue/note lock implementations
464  *
465  *	The kqueue lock guards the kq state, the state of its queues,
466  *	and the kqueue-aware status and locks of individual knotes.
467  *
468  *	The kqueue workq lock is used to protect state guarding the
469  *	interaction of the kqueue with the workq.  This state cannot
470  *	be guarded by the kq lock - as it needs to be taken when we
471  *	already have the waitq set lock held (during the waitq hook
472  *	callback).  It might be better to use the waitq lock itself
 *	for this, but the IRQ requirements make that difficult.
474  *
475  *	Knote flags, filter flags, and associated data are protected
476  *	by the underlying object lock - and are only ever looked at
477  *	by calling the filter to get a [consistent] snapshot of that
478  *	data.
479  */
480 
481 static inline void
kqlock(kqueue_t kqu)482 kqlock(kqueue_t kqu)
483 {
484 	lck_spin_lock(&kqu.kq->kq_lock);
485 }
486 
487 static inline void
kqlock_held(__assert_only kqueue_t kqu)488 kqlock_held(__assert_only kqueue_t kqu)
489 {
490 	LCK_SPIN_ASSERT(&kqu.kq->kq_lock, LCK_ASSERT_OWNED);
491 }
492 
493 static inline void
kqunlock(kqueue_t kqu)494 kqunlock(kqueue_t kqu)
495 {
496 	lck_spin_unlock(&kqu.kq->kq_lock);
497 }
498 
499 static inline void
knhash_lock(struct filedesc * fdp)500 knhash_lock(struct filedesc *fdp)
501 {
502 	lck_mtx_lock(&fdp->fd_knhashlock);
503 }
504 
505 static inline void
knhash_unlock(struct filedesc * fdp)506 knhash_unlock(struct filedesc *fdp)
507 {
508 	lck_mtx_unlock(&fdp->fd_knhashlock);
509 }
510 
511 /* wait event for knote locks */
512 static inline event_t
knote_lock_wev(struct knote * kn)513 knote_lock_wev(struct knote *kn)
514 {
515 	return (event_t)(&kn->kn_hook);
516 }
517 
518 /* wait event for kevent_register_wait_* */
519 static inline event64_t
knote_filt_wev64(struct knote * kn)520 knote_filt_wev64(struct knote *kn)
521 {
522 	/* kdp_workloop_sync_wait_find_owner knows about this */
523 	return CAST_EVENT64_T(kn);
524 }
525 
526 /* wait event for knote_post/knote_drop */
527 static inline event_t
knote_post_wev(struct knote * kn)528 knote_post_wev(struct knote *kn)
529 {
530 	return &kn->kn_kevent;
531 }
532 
533 /*!
534  * @function knote_has_qos
535  *
536  * @brief
537  * Whether the knote has a regular QoS.
538  *
539  * @discussion
540  * kn_qos_override is:
541  * - 0 on kqfiles
542  * - THREAD_QOS_LAST for special buckets (manager)
543  *
544  * Other values mean the knote participates to QoS propagation.
545  */
546 static inline bool
knote_has_qos(struct knote * kn)547 knote_has_qos(struct knote *kn)
548 {
549 	return kn->kn_qos_override > 0 && kn->kn_qos_override < THREAD_QOS_LAST;
550 }
551 
552 #pragma mark knote locks
553 
/*
 * State of the kq lock on return from the knote_lock_* functions.
 * knote_lock() is the only one of those functions that can fail.
 *
 * KNOTE_KQ_LOCK_ALWAYS
 *   The kq lock is held on return, whatever the outcome.
 *
 * KNOTE_KQ_LOCK_ON_SUCCESS
 *   The kq lock is held on return only if the function succeeded.
 *
 * KNOTE_KQ_LOCK_ON_FAILURE
 *   The kq lock is held on return only if the function failed.
 *
 * KNOTE_KQ_UNLOCK
 *   The kq is unlocked on return.
 */
enum kqlocking {
	KNOTE_KQ_LOCK_ALWAYS,
	KNOTE_KQ_LOCK_ON_SUCCESS,
	KNOTE_KQ_LOCK_ON_FAILURE,
	KNOTE_KQ_UNLOCK,
};
577 
578 static struct knote_lock_ctx *
knote_lock_ctx_find(kqueue_t kqu,struct knote * kn)579 knote_lock_ctx_find(kqueue_t kqu, struct knote *kn)
580 {
581 	struct knote_lock_ctx *ctx;
582 	LIST_FOREACH(ctx, &kqu.kq->kq_knlocks, knlc_link) {
583 		if (ctx->knlc_knote == kn) {
584 			return ctx;
585 		}
586 	}
587 	panic("knote lock context not found: %p", kn);
588 	__builtin_trap();
589 }
590 
591 /* slowpath of knote_lock() */
592 __attribute__((noinline))
593 static bool __result_use_check
knote_lock_slow(kqueue_t kqu,struct knote * kn,struct knote_lock_ctx * knlc,int kqlocking)594 knote_lock_slow(kqueue_t kqu, struct knote *kn,
595     struct knote_lock_ctx *knlc, int kqlocking)
596 {
597 	struct knote_lock_ctx *owner_lc;
598 	struct uthread *uth = current_uthread();
599 	wait_result_t wr;
600 
601 	kqlock_held(kqu);
602 
603 	owner_lc = knote_lock_ctx_find(kqu, kn);
604 #if DEBUG || DEVELOPMENT
605 	knlc->knlc_state = KNOTE_LOCK_CTX_WAITING;
606 #endif
607 	owner_lc->knlc_waiters++;
608 
609 	/*
610 	 * Make our lock context visible to knote_unlock()
611 	 */
612 	uth->uu_knlock = knlc;
613 
614 	wr = lck_spin_sleep_with_inheritor(&kqu.kq->kq_lock, LCK_SLEEP_UNLOCK,
615 	    knote_lock_wev(kn), owner_lc->knlc_thread,
616 	    THREAD_UNINT | THREAD_WAIT_NOREPORT, TIMEOUT_WAIT_FOREVER);
617 
618 	if (wr == THREAD_RESTART) {
619 		/*
620 		 * We haven't been woken up by knote_unlock() but knote_unlock_cancel.
621 		 * We need to cleanup the state since no one did.
622 		 */
623 		uth->uu_knlock = NULL;
624 #if DEBUG || DEVELOPMENT
625 		assert(knlc->knlc_state == KNOTE_LOCK_CTX_WAITING);
626 		knlc->knlc_state = KNOTE_LOCK_CTX_UNLOCKED;
627 #endif
628 
629 		if (kqlocking == KNOTE_KQ_LOCK_ALWAYS ||
630 		    kqlocking == KNOTE_KQ_LOCK_ON_FAILURE) {
631 			kqlock(kqu);
632 		}
633 		return false;
634 	} else {
635 		if (kqlocking == KNOTE_KQ_LOCK_ALWAYS ||
636 		    kqlocking == KNOTE_KQ_LOCK_ON_SUCCESS) {
637 			kqlock(kqu);
638 #if DEBUG || DEVELOPMENT
639 			/*
640 			 * This state is set under the lock so we can't
641 			 * really assert this unless we hold the lock.
642 			 */
643 			assert(knlc->knlc_state == KNOTE_LOCK_CTX_LOCKED);
644 #endif
645 		}
646 		return true;
647 	}
648 }
649 
650 /*
651  * Attempts to take the "knote" lock.
652  *
653  * Called with the kqueue lock held.
654  *
655  * Returns true if the knote lock is acquired, false if it has been dropped
656  */
657 static bool __result_use_check
knote_lock(kqueue_t kqu,struct knote * kn,struct knote_lock_ctx * knlc,enum kqlocking kqlocking)658 knote_lock(kqueue_t kqu, struct knote *kn, struct knote_lock_ctx *knlc,
659     enum kqlocking kqlocking)
660 {
661 	kqlock_held(kqu);
662 
663 #if DEBUG || DEVELOPMENT
664 	assert(knlc->knlc_state == KNOTE_LOCK_CTX_UNLOCKED);
665 #endif
666 	knlc->knlc_knote = kn;
667 	knlc->knlc_thread = current_thread();
668 	knlc->knlc_waiters = 0;
669 
670 	if (__improbable(kn->kn_status & KN_LOCKED)) {
671 		return knote_lock_slow(kqu, kn, knlc, kqlocking);
672 	}
673 
674 	/*
675 	 * When the knote will be dropped, the knote lock is taken before
676 	 * KN_DROPPING is set, and then the knote will be removed from any
677 	 * hash table that references it before the lock is canceled.
678 	 */
679 	assert((kn->kn_status & KN_DROPPING) == 0);
680 	LIST_INSERT_HEAD(&kqu.kq->kq_knlocks, knlc, knlc_link);
681 	kn->kn_status |= KN_LOCKED;
682 #if DEBUG || DEVELOPMENT
683 	knlc->knlc_state = KNOTE_LOCK_CTX_LOCKED;
684 #endif
685 
686 	if (kqlocking == KNOTE_KQ_UNLOCK ||
687 	    kqlocking == KNOTE_KQ_LOCK_ON_FAILURE) {
688 		kqunlock(kqu);
689 	}
690 	return true;
691 }
692 
693 /*
694  * Unlocks a knote successfully locked with knote_lock().
695  *
696  * Called with the kqueue lock held.
697  *
698  * Returns with the kqueue lock held according to KNOTE_KQ_* mode.
699  */
700 static void
knote_unlock(kqueue_t kqu,struct knote * kn,struct knote_lock_ctx * knlc,enum kqlocking kqlocking)701 knote_unlock(kqueue_t kqu, struct knote *kn,
702     struct knote_lock_ctx *knlc, enum kqlocking kqlocking)
703 {
704 	kqlock_held(kqu);
705 
706 	assert(knlc->knlc_knote == kn);
707 	assert(kn->kn_status & KN_LOCKED);
708 #if DEBUG || DEVELOPMENT
709 	assert(knlc->knlc_state == KNOTE_LOCK_CTX_LOCKED);
710 #endif
711 
712 	LIST_REMOVE(knlc, knlc_link);
713 
714 	if (knlc->knlc_waiters) {
715 		thread_t thread = THREAD_NULL;
716 
717 		wakeup_one_with_inheritor(knote_lock_wev(kn), THREAD_AWAKENED,
718 		    LCK_WAKE_DEFAULT, &thread);
719 
720 		/*
721 		 * knote_lock_slow() publishes the lock context of waiters
722 		 * in uthread::uu_knlock.
723 		 *
724 		 * Reach out and make this context the new owner.
725 		 */
726 		struct uthread *ut = get_bsdthread_info(thread);
727 		struct knote_lock_ctx *next_owner_lc = ut->uu_knlock;
728 
729 		assert(next_owner_lc->knlc_knote == kn);
730 		next_owner_lc->knlc_waiters = knlc->knlc_waiters - 1;
731 		LIST_INSERT_HEAD(&kqu.kq->kq_knlocks, next_owner_lc, knlc_link);
732 #if DEBUG || DEVELOPMENT
733 		next_owner_lc->knlc_state = KNOTE_LOCK_CTX_LOCKED;
734 #endif
735 		ut->uu_knlock = NULL;
736 		thread_deallocate_safe(thread);
737 	} else {
738 		kn->kn_status &= ~KN_LOCKED;
739 	}
740 
741 	if ((kn->kn_status & KN_MERGE_QOS) && !(kn->kn_status & KN_POSTING)) {
742 		/*
743 		 * No f_event() in flight anymore, we can leave QoS "Merge" mode
744 		 *
745 		 * See knote_adjust_qos()
746 		 */
747 		kn->kn_status &= ~KN_MERGE_QOS;
748 	}
749 	if (kqlocking == KNOTE_KQ_UNLOCK) {
750 		kqunlock(kqu);
751 	}
752 #if DEBUG || DEVELOPMENT
753 	knlc->knlc_state = KNOTE_LOCK_CTX_UNLOCKED;
754 #endif
755 }
756 
757 /*
758  * Aborts all waiters for a knote lock, and unlock the knote.
759  *
760  * Called with the kqueue lock held.
761  *
762  * Returns with the kqueue unlocked.
763  */
764 static void
knote_unlock_cancel(struct kqueue * kq,struct knote * kn,struct knote_lock_ctx * knlc)765 knote_unlock_cancel(struct kqueue *kq, struct knote *kn,
766     struct knote_lock_ctx *knlc)
767 {
768 	kqlock_held(kq);
769 
770 	assert(knlc->knlc_knote == kn);
771 	assert(kn->kn_status & KN_LOCKED);
772 	assert(kn->kn_status & KN_DROPPING);
773 
774 	LIST_REMOVE(knlc, knlc_link);
775 	kn->kn_status &= ~KN_LOCKED;
776 	kqunlock(kq);
777 
778 	if (knlc->knlc_waiters) {
779 		wakeup_all_with_inheritor(knote_lock_wev(kn), THREAD_RESTART);
780 	}
781 #if DEBUG || DEVELOPMENT
782 	knlc->knlc_state = KNOTE_LOCK_CTX_UNLOCKED;
783 #endif
784 }
785 
786 /*
787  * Call the f_event hook of a given filter.
788  *
789  * Takes a use count to protect against concurrent drops.
790  * Called with the object lock held.
791  */
792 static void
knote_post(struct knote * kn,long hint)793 knote_post(struct knote *kn, long hint)
794 {
795 	struct kqueue *kq = knote_get_kq(kn);
796 	int dropping, result;
797 
798 	kqlock(kq);
799 
800 	if (__improbable(kn->kn_status & (KN_DROPPING | KN_VANISHED))) {
801 		return kqunlock(kq);
802 	}
803 
804 	if (__improbable(kn->kn_status & KN_POSTING)) {
805 		panic("KNOTE() called concurrently on knote %p", kn);
806 	}
807 
808 	kn->kn_status |= KN_POSTING;
809 
810 	kqunlock(kq);
811 	result = filter_call(knote_fops(kn), f_event(kn, hint));
812 	kqlock(kq);
813 
814 	/* Someone dropped the knote/the monitored object vanished while we
815 	 * were in f_event, swallow the side effects of the post.
816 	 */
817 	dropping = (kn->kn_status & (KN_DROPPING | KN_VANISHED));
818 
819 	if (!dropping && (result & FILTER_ADJUST_EVENT_IOTIER_BIT)) {
820 		kqueue_update_iotier_override(kq);
821 	}
822 
823 	if (!dropping && (result & FILTER_ACTIVE)) {
824 		knote_activate(kq, kn, result);
825 	}
826 
827 	if ((kn->kn_status & KN_LOCKED) == 0) {
828 		/*
829 		 * There's no other f_* call in flight, we can leave QoS "Merge" mode.
830 		 *
831 		 * See knote_adjust_qos()
832 		 */
833 		kn->kn_status &= ~(KN_POSTING | KN_MERGE_QOS);
834 	} else {
835 		kn->kn_status &= ~KN_POSTING;
836 	}
837 
838 	if (__improbable(dropping)) {
839 		thread_wakeup(knote_post_wev(kn));
840 	}
841 
842 	kqunlock(kq);
843 }
844 
845 /*
846  * Called by knote_drop() and knote_fdclose() to wait for the last f_event()
847  * caller to be done.
848  *
849  *	- kq locked at entry
850  *	- kq unlocked at exit
851  */
852 static void
knote_wait_for_post(struct kqueue * kq,struct knote * kn)853 knote_wait_for_post(struct kqueue *kq, struct knote *kn)
854 {
855 	kqlock_held(kq);
856 
857 	assert(kn->kn_status & (KN_DROPPING | KN_VANISHED));
858 
859 	if (kn->kn_status & KN_POSTING) {
860 		lck_spin_sleep(&kq->kq_lock, LCK_SLEEP_UNLOCK, knote_post_wev(kn),
861 		    THREAD_UNINT | THREAD_WAIT_NOREPORT);
862 	} else {
863 		kqunlock(kq);
864 	}
865 }
866 
867 #pragma mark knote helpers for filters
868 
869 OS_ALWAYS_INLINE
870 void *
knote_kn_hook_get_raw(struct knote * kn)871 knote_kn_hook_get_raw(struct knote *kn)
872 {
873 	uintptr_t *addr = &kn->kn_hook;
874 
875 	void *hook = (void *) *addr;
876 #if __has_feature(ptrauth_calls)
877 	if (hook) {
878 		uint16_t blend = kn->kn_filter;
879 		blend |= (kn->kn_filtid << 8);
880 		blend ^= OS_PTRAUTH_DISCRIMINATOR("kn.kn_hook");
881 
882 		hook = ptrauth_auth_data(hook, ptrauth_key_process_independent_data,
883 		    ptrauth_blend_discriminator(addr, blend));
884 	}
885 #endif
886 
887 	return hook;
888 }
889 
890 OS_ALWAYS_INLINE void
knote_kn_hook_set_raw(struct knote * kn,void * kn_hook)891 knote_kn_hook_set_raw(struct knote *kn, void *kn_hook)
892 {
893 	uintptr_t *addr = &kn->kn_hook;
894 #if __has_feature(ptrauth_calls)
895 	if (kn_hook) {
896 		uint16_t blend = kn->kn_filter;
897 		blend |= (kn->kn_filtid << 8);
898 		blend ^= OS_PTRAUTH_DISCRIMINATOR("kn.kn_hook");
899 
900 		kn_hook = ptrauth_sign_unauthenticated(kn_hook,
901 		    ptrauth_key_process_independent_data,
902 		    ptrauth_blend_discriminator(addr, blend));
903 	}
904 #endif
905 	*addr = (uintptr_t) kn_hook;
906 }
907 
908 OS_ALWAYS_INLINE
909 void
knote_set_error(struct knote * kn,int error)910 knote_set_error(struct knote *kn, int error)
911 {
912 	kn->kn_flags |= EV_ERROR;
913 	kn->kn_sdata = error;
914 }
915 
916 OS_ALWAYS_INLINE
917 int64_t
knote_low_watermark(const struct knote * kn)918 knote_low_watermark(const struct knote *kn)
919 {
920 	return (kn->kn_sfflags & NOTE_LOWAT) ? kn->kn_sdata : 1;
921 }
922 
923 /*!
924  * @function knote_fill_kevent_with_sdata
925  *
926  * @brief
927  * Fills in a kevent from the current content of a knote.
928  *
929  * @discussion
930  * This is meant to be called from filter's f_process hooks.
931  * The kevent data is filled with kn->kn_sdata.
932  *
933  * kn->kn_fflags is cleared if kn->kn_flags has EV_CLEAR set.
934  *
935  * Using knote_fill_kevent is typically preferred.
936  */
937 OS_ALWAYS_INLINE
938 void
knote_fill_kevent_with_sdata(struct knote * kn,struct kevent_qos_s * kev)939 knote_fill_kevent_with_sdata(struct knote *kn, struct kevent_qos_s *kev)
940 {
941 #define knote_assert_aliases(name1, offs1, name2) \
942 	static_assert(offsetof(struct kevent_qos_s, name1) + offs1 == \
943 	    offsetof(struct kevent_internal_s, name2), \
944 	        "kevent_qos_s::" #name1 " and kevent_internal_s::" #name2 "need to alias")
945 	/*
946 	 * All the code makes assumptions on these aliasing,
947 	 * so make sure we fail the build if we ever ever ever break them.
948 	 */
949 	knote_assert_aliases(ident, 0, kei_ident);
950 #ifdef __LITTLE_ENDIAN__
951 	knote_assert_aliases(filter, 0, kei_filter);  // non trivial overlap
952 	knote_assert_aliases(filter, 1, kei_filtid);  // non trivial overlap
953 #else
954 	knote_assert_aliases(filter, 0, kei_filtid);  // non trivial overlap
955 	knote_assert_aliases(filter, 1, kei_filter);  // non trivial overlap
956 #endif
957 	knote_assert_aliases(flags, 0, kei_flags);
958 	knote_assert_aliases(qos, 0, kei_qos);
959 	knote_assert_aliases(udata, 0, kei_udata);
960 	knote_assert_aliases(fflags, 0, kei_fflags);
961 	knote_assert_aliases(xflags, 0, kei_sfflags); // non trivial overlap
962 	knote_assert_aliases(data, 0, kei_sdata);     // non trivial overlap
963 	knote_assert_aliases(ext, 0, kei_ext);
964 #undef knote_assert_aliases
965 
966 	/*
967 	 * Fix the differences between kevent_qos_s and kevent_internal_s:
968 	 * - xflags is where kn_sfflags lives, we need to zero it
969 	 * - fixup the high bits of `filter` where kn_filtid lives
970 	 */
971 	*kev = *(struct kevent_qos_s *)&kn->kn_kevent;
972 	kev->xflags = 0;
973 	kev->filter |= 0xff00;
974 	if (kn->kn_flags & EV_CLEAR) {
975 		kn->kn_fflags = 0;
976 	}
977 }
978 
979 /*!
980  * @function knote_fill_kevent
981  *
982  * @brief
983  * Fills in a kevent from the current content of a knote.
984  *
985  * @discussion
986  * This is meant to be called from filter's f_process hooks.
987  * The kevent data is filled with the passed in data.
988  *
989  * kn->kn_fflags is cleared if kn->kn_flags has EV_CLEAR set.
990  */
991 OS_ALWAYS_INLINE
992 void
knote_fill_kevent(struct knote * kn,struct kevent_qos_s * kev,int64_t data)993 knote_fill_kevent(struct knote *kn, struct kevent_qos_s *kev, int64_t data)
994 {
995 	knote_fill_kevent_with_sdata(kn, kev);
996 	kev->filter = kn->kn_filter;
997 	kev->data = data;
998 }
999 
1000 
1001 #pragma mark file_filtops
1002 
1003 static int
filt_fileattach(struct knote * kn,struct kevent_qos_s * kev)1004 filt_fileattach(struct knote *kn, struct kevent_qos_s *kev)
1005 {
1006 	return fo_kqfilter(kn->kn_fp, kn, kev);
1007 }
1008 
1009 SECURITY_READ_ONLY_EARLY(static struct filterops) file_filtops = {
1010 	.f_isfd = 1,
1011 	.f_attach = filt_fileattach,
1012 };
1013 
1014 #pragma mark kqread_filtops
1015 
/* shorthands to reach fileglob fields through a fileproc's fp_glob */
#define f_flag   fp_glob->fg_flag
#define f_ops    fp_glob->fg_ops
#define f_lflags fp_glob->fg_lflags
1019 
1020 static void
filt_kqdetach(struct knote * kn)1021 filt_kqdetach(struct knote *kn)
1022 {
1023 	struct kqfile *kqf = (struct kqfile *)fp_get_data(kn->kn_fp);
1024 	struct kqueue *kq = &kqf->kqf_kqueue;
1025 
1026 	kqlock(kq);
1027 	KNOTE_DETACH(&kqf->kqf_sel.si_note, kn);
1028 	kqunlock(kq);
1029 }
1030 
1031 static int
filt_kqueue(struct knote * kn,__unused long hint)1032 filt_kqueue(struct knote *kn, __unused long hint)
1033 {
1034 	struct kqueue *kq = (struct kqueue *)fp_get_data(kn->kn_fp);
1035 
1036 	return kq->kq_count > 0;
1037 }
1038 
1039 static int
filt_kqtouch(struct knote * kn,struct kevent_qos_s * kev)1040 filt_kqtouch(struct knote *kn, struct kevent_qos_s *kev)
1041 {
1042 #pragma unused(kev)
1043 	struct kqueue *kq = (struct kqueue *)fp_get_data(kn->kn_fp);
1044 	int res;
1045 
1046 	kqlock(kq);
1047 	res = (kq->kq_count > 0);
1048 	kqunlock(kq);
1049 
1050 	return res;
1051 }
1052 
1053 static int
filt_kqprocess(struct knote * kn,struct kevent_qos_s * kev)1054 filt_kqprocess(struct knote *kn, struct kevent_qos_s *kev)
1055 {
1056 	struct kqueue *kq = (struct kqueue *)fp_get_data(kn->kn_fp);
1057 	int res = 0;
1058 
1059 	kqlock(kq);
1060 	if (kq->kq_count) {
1061 		knote_fill_kevent(kn, kev, kq->kq_count);
1062 		res = 1;
1063 	}
1064 	kqunlock(kq);
1065 
1066 	return res;
1067 }
1068 
1069 SECURITY_READ_ONLY_EARLY(static struct filterops) kqread_filtops = {
1070 	.f_isfd = 1,
1071 	.f_detach = filt_kqdetach,
1072 	.f_event = filt_kqueue,
1073 	.f_touch = filt_kqtouch,
1074 	.f_process = filt_kqprocess,
1075 };
1076 
1077 #pragma mark proc_filtops
1078 
1079 static int
filt_procattach(struct knote * kn,__unused struct kevent_qos_s * kev)1080 filt_procattach(struct knote *kn, __unused struct kevent_qos_s *kev)
1081 {
1082 	struct proc *p;
1083 
1084 	assert(PID_MAX < NOTE_PDATAMASK);
1085 
1086 	if ((kn->kn_sfflags & (NOTE_TRACK | NOTE_TRACKERR | NOTE_CHILD)) != 0) {
1087 		knote_set_error(kn, ENOTSUP);
1088 		return 0;
1089 	}
1090 
1091 	p = proc_find((int)kn->kn_id);
1092 	if (p == NULL) {
1093 		knote_set_error(kn, ESRCH);
1094 		return 0;
1095 	}
1096 
1097 	const uint32_t NoteExitStatusBits = NOTE_EXIT | NOTE_EXITSTATUS;
1098 
1099 	if ((kn->kn_sfflags & NoteExitStatusBits) == NoteExitStatusBits) {
1100 		do {
1101 			pid_t selfpid = proc_selfpid();
1102 
1103 			if (p->p_ppid == selfpid) {
1104 				break;  /* parent => ok */
1105 			}
1106 			if ((p->p_lflag & P_LTRACED) != 0 &&
1107 			    (p->p_oppid == selfpid)) {
1108 				break;  /* parent-in-waiting => ok */
1109 			}
1110 			if (cansignal(current_proc(), kauth_cred_get(), p, SIGKILL)) {
1111 				break; /* allowed to signal => ok */
1112 			}
1113 			proc_rele(p);
1114 			knote_set_error(kn, EACCES);
1115 			return 0;
1116 		} while (0);
1117 	}
1118 
1119 	kn->kn_proc = p;
1120 	kn->kn_flags |= EV_CLEAR;       /* automatically set */
1121 	kn->kn_sdata = 0;               /* incoming data is ignored */
1122 
1123 	proc_klist_lock();
1124 
1125 	KNOTE_ATTACH(&p->p_klist, kn);
1126 
1127 	proc_klist_unlock();
1128 
1129 	proc_rele(p);
1130 
1131 	/*
1132 	 * only captures edge-triggered events after this point
1133 	 * so it can't already be fired.
1134 	 */
1135 	return 0;
1136 }
1137 
1138 
1139 /*
1140  * The knote may be attached to a different process, which may exit,
1141  * leaving nothing for the knote to be attached to.  In that case,
1142  * the pointer to the process will have already been nulled out.
1143  */
1144 static void
filt_procdetach(struct knote * kn)1145 filt_procdetach(struct knote *kn)
1146 {
1147 	struct proc *p;
1148 
1149 	proc_klist_lock();
1150 
1151 	p = kn->kn_proc;
1152 	if (p != PROC_NULL) {
1153 		kn->kn_proc = PROC_NULL;
1154 		KNOTE_DETACH(&p->p_klist, kn);
1155 	}
1156 
1157 	proc_klist_unlock();
1158 }
1159 
1160 static int
filt_procevent(struct knote * kn,long hint)1161 filt_procevent(struct knote *kn, long hint)
1162 {
1163 	u_int event;
1164 
1165 	/* ALWAYS CALLED WITH proc_klist_lock */
1166 
1167 	/*
1168 	 * Note: a lot of bits in hint may be obtained from the knote
1169 	 * To free some of those bits, see <rdar://problem/12592988> Freeing up
1170 	 * bits in hint for filt_procevent
1171 	 *
1172 	 * mask off extra data
1173 	 */
1174 	event = (u_int)hint & NOTE_PCTRLMASK;
1175 
1176 	/*
1177 	 * termination lifecycle events can happen while a debugger
1178 	 * has reparented a process, in which case notifications
1179 	 * should be quashed except to the tracing parent. When
1180 	 * the debugger reaps the child (either via wait4(2) or
1181 	 * process exit), the child will be reparented to the original
1182 	 * parent and these knotes re-fired.
1183 	 */
1184 	if (event & NOTE_EXIT) {
1185 		if ((kn->kn_proc->p_oppid != 0)
1186 		    && (proc_getpid(knote_get_kq(kn)->kq_p) != kn->kn_proc->p_ppid)) {
1187 			/*
1188 			 * This knote is not for the current ptrace(2) parent, ignore.
1189 			 */
1190 			return 0;
1191 		}
1192 	}
1193 
1194 	/*
1195 	 * if the user is interested in this event, record it.
1196 	 */
1197 	if (kn->kn_sfflags & event) {
1198 		kn->kn_fflags |= event;
1199 	}
1200 
1201 #pragma clang diagnostic push
1202 #pragma clang diagnostic ignored "-Wdeprecated-declarations"
1203 	if ((event == NOTE_REAP) || ((event == NOTE_EXIT) && !(kn->kn_sfflags & NOTE_REAP))) {
1204 		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
1205 	}
1206 #pragma clang diagnostic pop
1207 
1208 
1209 	/*
1210 	 * The kernel has a wrapper in place that returns the same data
1211 	 * as is collected here, in kn_hook32.  Any changes to how
1212 	 * NOTE_EXITSTATUS and NOTE_EXIT_DETAIL are collected
1213 	 * should also be reflected in the proc_pidnoteexit() wrapper.
1214 	 */
1215 	if (event == NOTE_EXIT) {
1216 		kn->kn_hook32 = 0;
1217 		if ((kn->kn_sfflags & NOTE_EXITSTATUS) != 0) {
1218 			kn->kn_fflags |= NOTE_EXITSTATUS;
1219 			kn->kn_hook32 |= (hint & NOTE_PDATAMASK);
1220 		}
1221 		if ((kn->kn_sfflags & NOTE_EXIT_DETAIL) != 0) {
1222 			kn->kn_fflags |= NOTE_EXIT_DETAIL;
1223 			if ((kn->kn_proc->p_lflag &
1224 			    P_LTERM_DECRYPTFAIL) != 0) {
1225 				kn->kn_hook32 |= NOTE_EXIT_DECRYPTFAIL;
1226 			}
1227 			if ((kn->kn_proc->p_lflag &
1228 			    P_LTERM_JETSAM) != 0) {
1229 				kn->kn_hook32 |= NOTE_EXIT_MEMORY;
1230 				switch (kn->kn_proc->p_lflag & P_JETSAM_MASK) {
1231 				case P_JETSAM_VMPAGESHORTAGE:
1232 					kn->kn_hook32 |= NOTE_EXIT_MEMORY_VMPAGESHORTAGE;
1233 					break;
1234 				case P_JETSAM_VMTHRASHING:
1235 					kn->kn_hook32 |= NOTE_EXIT_MEMORY_VMTHRASHING;
1236 					break;
1237 				case P_JETSAM_FCTHRASHING:
1238 					kn->kn_hook32 |= NOTE_EXIT_MEMORY_FCTHRASHING;
1239 					break;
1240 				case P_JETSAM_VNODE:
1241 					kn->kn_hook32 |= NOTE_EXIT_MEMORY_VNODE;
1242 					break;
1243 				case P_JETSAM_HIWAT:
1244 					kn->kn_hook32 |= NOTE_EXIT_MEMORY_HIWAT;
1245 					break;
1246 				case P_JETSAM_PID:
1247 					kn->kn_hook32 |= NOTE_EXIT_MEMORY_PID;
1248 					break;
1249 				case P_JETSAM_IDLEEXIT:
1250 					kn->kn_hook32 |= NOTE_EXIT_MEMORY_IDLE;
1251 					break;
1252 				}
1253 			}
1254 			if ((proc_getcsflags(kn->kn_proc) &
1255 			    CS_KILLED) != 0) {
1256 				kn->kn_hook32 |= NOTE_EXIT_CSERROR;
1257 			}
1258 		}
1259 	}
1260 
1261 	/* if we have any matching state, activate the knote */
1262 	return kn->kn_fflags != 0;
1263 }
1264 
1265 static int
filt_proctouch(struct knote * kn,struct kevent_qos_s * kev)1266 filt_proctouch(struct knote *kn, struct kevent_qos_s *kev)
1267 {
1268 	int res;
1269 
1270 	proc_klist_lock();
1271 
1272 	/* accept new filter flags and mask off output events no long interesting */
1273 	kn->kn_sfflags = kev->fflags;
1274 
1275 	/* restrict the current results to the (smaller?) set of new interest */
1276 	/*
1277 	 * For compatibility with previous implementations, we leave kn_fflags
1278 	 * as they were before.
1279 	 */
1280 	//kn->kn_fflags &= kn->kn_sfflags;
1281 
1282 	res = (kn->kn_fflags != 0);
1283 
1284 	proc_klist_unlock();
1285 
1286 	return res;
1287 }
1288 
1289 static int
filt_procprocess(struct knote * kn,struct kevent_qos_s * kev)1290 filt_procprocess(struct knote *kn, struct kevent_qos_s *kev)
1291 {
1292 	int res = 0;
1293 
1294 	proc_klist_lock();
1295 	if (kn->kn_fflags) {
1296 		knote_fill_kevent(kn, kev, kn->kn_hook32);
1297 		kn->kn_hook32 = 0;
1298 		res = 1;
1299 	}
1300 	proc_klist_unlock();
1301 	return res;
1302 }
1303 
1304 SECURITY_READ_ONLY_EARLY(static struct filterops) proc_filtops = {
1305 	.f_attach  = filt_procattach,
1306 	.f_detach  = filt_procdetach,
1307 	.f_event   = filt_procevent,
1308 	.f_touch   = filt_proctouch,
1309 	.f_process = filt_procprocess,
1310 };
1311 
1312 #pragma mark timer_filtops
1313 
/*
 * Validated timer parameters, computed by filt_timervalidate()
 * and applied to a knote with filt_timer_set_params().
 */
struct filt_timer_params {
	/*
	 * deadline in abs/cont time
	 * (or 0 if NOTE_ABSOLUTE and deadline is in past)
	 */
	uint64_t deadline;
	/* leeway in abstime, or 0 if none */
	uint64_t leeway;
	/* interval in abstime or 0 if non-repeating timer */
	uint64_t interval;
};
1320 
/*
 * Values stored in the knote at rest (using Mach absolute time units)
 *
 * kn->kn_thcall        where the thread_call object is stored
 * kn->kn_ext[0]        next deadline or 0 if immediate expiration
 * kn->kn_ext[1]        leeway value
 * kn->kn_sdata         interval timer: the interval
 *                      absolute/deadline timer: 0
 * kn->kn_hook32        timer state (with gencount)
 *
 * The timer state lives in the bits of kn->kn_hook32 covered by
 * TIMER_STATE_MASK, the generation count in the bits above them.
 *
 * TIMER_IDLE:
 *   The timer has either never been scheduled or been cancelled.
 *   It is safe to schedule a new one in this state.
 *
 * TIMER_ARMED:
 *   The timer has been scheduled
 *
 * TIMER_FIRED
 *   The timer has fired and an event needs to be delivered.
 *   When in this state, the callout may still be running.
 *
 * TIMER_IMMEDIATE
 *   The timer has fired at registration time, and the callout was never
 *   dispatched.
 */
#define TIMER_IDLE       0x0    /* never scheduled, or cancelled */
#define TIMER_ARMED      0x1    /* scheduled */
#define TIMER_FIRED      0x2    /* fired, event pending delivery */
#define TIMER_IMMEDIATE  0x3    /* fired at registration time */
#define TIMER_STATE_MASK 0x3    /* bits holding the states above */
#define TIMER_GEN_INC    0x4    /* generation count increment */
1352 
1353 static void
filt_timer_set_params(struct knote * kn,struct filt_timer_params * params)1354 filt_timer_set_params(struct knote *kn, struct filt_timer_params *params)
1355 {
1356 	kn->kn_ext[0] = params->deadline;
1357 	kn->kn_ext[1] = params->leeway;
1358 	kn->kn_sdata  = params->interval;
1359 }
1360 
1361 /*
1362  * filt_timervalidate - process data from user
1363  *
1364  * Sets up the deadline, interval, and leeway from the provided user data
1365  *
1366  * Input:
1367  *      kn_sdata        timer deadline or interval time
1368  *      kn_sfflags      style of timer, unit of measurement
1369  *
1370  * Output:
1371  *      struct filter_timer_params to apply to the filter with
1372  *      filt_timer_set_params when changes are ready to be commited.
1373  *
1374  * Returns:
1375  *      EINVAL          Invalid user data parameters
1376  *      ERANGE          Various overflows with the parameters
1377  *
1378  * Called with timer filter lock held.
1379  */
1380 static int
filt_timervalidate(const struct kevent_qos_s * kev,struct filt_timer_params * params)1381 filt_timervalidate(const struct kevent_qos_s *kev,
1382     struct filt_timer_params *params)
1383 {
1384 	/*
1385 	 * There are 5 knobs that need to be chosen for a timer registration:
1386 	 *
1387 	 * A) Units of time (what is the time duration of the specified number)
1388 	 *      Absolute and interval take:
1389 	 *              NOTE_SECONDS, NOTE_USECONDS, NOTE_NSECONDS, NOTE_MACHTIME
1390 	 *      Defaults to milliseconds if not specified
1391 	 *
1392 	 * B) Clock epoch (what is the zero point of the specified number)
1393 	 *      For interval, there is none
1394 	 *      For absolute, defaults to the gettimeofday/calendar epoch
1395 	 *      With NOTE_MACHTIME, uses mach_absolute_time()
1396 	 *      With NOTE_MACHTIME and NOTE_MACH_CONTINUOUS_TIME, uses mach_continuous_time()
1397 	 *
1398 	 * C) The knote's behavior on delivery
1399 	 *      Interval timer causes the knote to arm for the next interval unless one-shot is set
1400 	 *      Absolute is a forced one-shot timer which deletes on delivery
1401 	 *      TODO: Add a way for absolute to be not forced one-shot
1402 	 *
1403 	 * D) Whether the time duration is relative to now or absolute
1404 	 *      Interval fires at now + duration when it is set up
1405 	 *      Absolute fires at now + difference between now walltime and passed in walltime
1406 	 *      With NOTE_MACHTIME it fires at an absolute MAT or MCT.
1407 	 *
1408 	 * E) Whether the timer continues to tick across sleep
1409 	 *      By default all three do not.
1410 	 *      For interval and absolute, NOTE_MACH_CONTINUOUS_TIME causes them to tick across sleep
1411 	 *      With NOTE_ABSOLUTE | NOTE_MACHTIME | NOTE_MACH_CONTINUOUS_TIME:
1412 	 *              expires when mach_continuous_time() is > the passed in value.
1413 	 */
1414 
1415 	uint64_t multiplier;
1416 
1417 	boolean_t use_abstime = FALSE;
1418 
1419 	switch (kev->fflags & (NOTE_SECONDS | NOTE_USECONDS | NOTE_NSECONDS | NOTE_MACHTIME)) {
1420 	case NOTE_SECONDS:
1421 		multiplier = NSEC_PER_SEC;
1422 		break;
1423 	case NOTE_USECONDS:
1424 		multiplier = NSEC_PER_USEC;
1425 		break;
1426 	case NOTE_NSECONDS:
1427 		multiplier = 1;
1428 		break;
1429 	case NOTE_MACHTIME:
1430 		multiplier = 0;
1431 		use_abstime = TRUE;
1432 		break;
1433 	case 0: /* milliseconds (default) */
1434 		multiplier = NSEC_PER_SEC / 1000;
1435 		break;
1436 	default:
1437 		return EINVAL;
1438 	}
1439 
1440 	/* transform the leeway in kn_ext[1] to same time scale */
1441 	if (kev->fflags & NOTE_LEEWAY) {
1442 		uint64_t leeway_abs;
1443 
1444 		if (use_abstime) {
1445 			leeway_abs = (uint64_t)kev->ext[1];
1446 		} else {
1447 			uint64_t leeway_ns;
1448 			if (os_mul_overflow((uint64_t)kev->ext[1], multiplier, &leeway_ns)) {
1449 				return ERANGE;
1450 			}
1451 
1452 			nanoseconds_to_absolutetime(leeway_ns, &leeway_abs);
1453 		}
1454 
1455 		params->leeway = leeway_abs;
1456 	} else {
1457 		params->leeway = 0;
1458 	}
1459 
1460 	if (kev->fflags & NOTE_ABSOLUTE) {
1461 		uint64_t deadline_abs;
1462 
1463 		if (use_abstime) {
1464 			deadline_abs = (uint64_t)kev->data;
1465 		} else {
1466 			uint64_t calendar_deadline_ns;
1467 
1468 			if (os_mul_overflow((uint64_t)kev->data, multiplier, &calendar_deadline_ns)) {
1469 				return ERANGE;
1470 			}
1471 
1472 			/* calendar_deadline_ns is in nanoseconds since the epoch */
1473 
1474 			clock_sec_t seconds;
1475 			clock_nsec_t nanoseconds;
1476 
1477 			/*
1478 			 * Note that the conversion through wall-time is only done once.
1479 			 *
1480 			 * If the relationship between MAT and gettimeofday changes,
1481 			 * the underlying timer does not update.
1482 			 *
1483 			 * TODO: build a wall-time denominated timer_call queue
1484 			 * and a flag to request DTRTing with wall-time timers
1485 			 */
1486 			clock_get_calendar_nanotime(&seconds, &nanoseconds);
1487 
1488 			uint64_t calendar_now_ns = (uint64_t)seconds * NSEC_PER_SEC + nanoseconds;
1489 
1490 			/* if deadline is in the future */
1491 			if (calendar_now_ns < calendar_deadline_ns) {
1492 				uint64_t interval_ns = calendar_deadline_ns - calendar_now_ns;
1493 				uint64_t interval_abs;
1494 
1495 				nanoseconds_to_absolutetime(interval_ns, &interval_abs);
1496 
1497 				/*
1498 				 * Note that the NOTE_MACH_CONTINUOUS_TIME flag here only
1499 				 * causes the timer to keep ticking across sleep, but
1500 				 * it does not change the calendar timebase.
1501 				 */
1502 
1503 				if (kev->fflags & NOTE_MACH_CONTINUOUS_TIME) {
1504 					clock_continuoustime_interval_to_deadline(interval_abs,
1505 					    &deadline_abs);
1506 				} else {
1507 					clock_absolutetime_interval_to_deadline(interval_abs,
1508 					    &deadline_abs);
1509 				}
1510 			} else {
1511 				deadline_abs = 0; /* cause immediate expiration */
1512 			}
1513 		}
1514 
1515 		params->deadline = deadline_abs;
1516 		params->interval = 0; /* NOTE_ABSOLUTE is non-repeating */
1517 	} else if (kev->data < 0) {
1518 		/*
1519 		 * Negative interval timers fire immediately, once.
1520 		 *
1521 		 * Ideally a negative interval would be an error, but certain clients
1522 		 * pass negative values on accident, and expect an event back.
1523 		 *
1524 		 * In the old implementation the timer would repeat with no delay
1525 		 * N times until mach_absolute_time() + (N * interval) underflowed,
1526 		 * then it would wait ~forever by accidentally arming a timer for the far future.
1527 		 *
1528 		 * We now skip the power-wasting hot spin phase and go straight to the idle phase.
1529 		 */
1530 
1531 		params->deadline = 0; /* expire immediately */
1532 		params->interval = 0; /* non-repeating */
1533 	} else {
1534 		uint64_t interval_abs = 0;
1535 
1536 		if (use_abstime) {
1537 			interval_abs = (uint64_t)kev->data;
1538 		} else {
1539 			uint64_t interval_ns;
1540 			if (os_mul_overflow((uint64_t)kev->data, multiplier, &interval_ns)) {
1541 				return ERANGE;
1542 			}
1543 
1544 			nanoseconds_to_absolutetime(interval_ns, &interval_abs);
1545 		}
1546 
1547 		uint64_t deadline = 0;
1548 
1549 		if (kev->fflags & NOTE_MACH_CONTINUOUS_TIME) {
1550 			clock_continuoustime_interval_to_deadline(interval_abs, &deadline);
1551 		} else {
1552 			clock_absolutetime_interval_to_deadline(interval_abs, &deadline);
1553 		}
1554 
1555 		params->deadline = deadline;
1556 		params->interval = interval_abs;
1557 	}
1558 
1559 	return 0;
1560 }
1561 
1562 /*
1563  * filt_timerexpire - the timer callout routine
1564  */
1565 static void
filt_timerexpire(void * knx,void * state_on_arm)1566 filt_timerexpire(void *knx, void *state_on_arm)
1567 {
1568 	struct knote *kn = knx;
1569 
1570 	uint32_t state = (uint32_t)(uintptr_t)state_on_arm;
1571 	uint32_t fired_state = state ^ TIMER_ARMED ^ TIMER_FIRED;
1572 
1573 	if (os_atomic_cmpxchg(&kn->kn_hook32, state, fired_state, relaxed)) {
1574 		// our f_event always would say FILTER_ACTIVE,
1575 		// so be leaner and just do it.
1576 		struct kqueue *kq = knote_get_kq(kn);
1577 		kqlock(kq);
1578 		knote_activate(kq, kn, FILTER_ACTIVE);
1579 		kqunlock(kq);
1580 	} else {
1581 		/*
1582 		 * The timer has been reprogrammed or canceled since it was armed,
1583 		 * and this is a late firing for the timer, just ignore it.
1584 		 */
1585 	}
1586 }
1587 
1588 /*
1589  * Does this deadline needs a timer armed for it, or has it expired?
1590  */
1591 static bool
filt_timer_is_ready(struct knote * kn)1592 filt_timer_is_ready(struct knote *kn)
1593 {
1594 	uint64_t now, deadline = kn->kn_ext[0];
1595 
1596 	if (deadline == 0) {
1597 		return true;
1598 	}
1599 
1600 	if (kn->kn_sfflags & NOTE_MACH_CONTINUOUS_TIME) {
1601 		now = mach_continuous_time();
1602 	} else {
1603 		now = mach_absolute_time();
1604 	}
1605 	return deadline <= now;
1606 }
1607 
1608 /*
1609  * Arm a timer
1610  *
1611  * It is the responsibility of the caller to make sure the timer call
1612  * has completed or been cancelled properly prior to arming it.
1613  */
1614 static void
filt_timerarm(struct knote * kn)1615 filt_timerarm(struct knote *kn)
1616 {
1617 	uint64_t deadline = kn->kn_ext[0];
1618 	uint64_t leeway   = kn->kn_ext[1];
1619 	uint32_t state;
1620 
1621 	int filter_flags = kn->kn_sfflags;
1622 	unsigned int timer_flags = 0;
1623 
1624 	if (filter_flags & NOTE_CRITICAL) {
1625 		timer_flags |= THREAD_CALL_DELAY_USER_CRITICAL;
1626 	} else if (filter_flags & NOTE_BACKGROUND) {
1627 		timer_flags |= THREAD_CALL_DELAY_USER_BACKGROUND;
1628 	} else {
1629 		timer_flags |= THREAD_CALL_DELAY_USER_NORMAL;
1630 	}
1631 
1632 	if (filter_flags & NOTE_LEEWAY) {
1633 		timer_flags |= THREAD_CALL_DELAY_LEEWAY;
1634 	}
1635 
1636 	if (filter_flags & NOTE_MACH_CONTINUOUS_TIME) {
1637 		timer_flags |= THREAD_CALL_CONTINUOUS;
1638 	}
1639 
1640 	/*
1641 	 * Move to ARMED.
1642 	 *
1643 	 * We increase the gencount, and setup the thread call with this expected
1644 	 * state. It means that if there was a previous generation of the timer in
1645 	 * flight that needs to be ignored, then 3 things are possible:
1646 	 *
1647 	 * - the timer fires first, filt_timerexpire() and sets the state to FIRED
1648 	 *   but we clobber it with ARMED and a new gencount. The knote will still
1649 	 *   be activated, but filt_timerprocess() which is serialized with this
1650 	 *   call will not see the FIRED bit set and will not deliver an event.
1651 	 *
1652 	 * - this code runs first, but filt_timerexpire() comes second. Because it
1653 	 *   knows an old gencount, it will debounce and not activate the knote.
1654 	 *
1655 	 * - filt_timerexpire() wasn't in flight yet, and thread_call_enter below
1656 	 *   will just cancel it properly.
1657 	 *
1658 	 * This is important as userspace expects to never be woken up for past
1659 	 * timers after filt_timertouch ran.
1660 	 */
1661 	state = os_atomic_load(&kn->kn_hook32, relaxed);
1662 	state &= ~TIMER_STATE_MASK;
1663 	state += TIMER_GEN_INC + TIMER_ARMED;
1664 	os_atomic_store(&kn->kn_hook32, state, relaxed);
1665 
1666 	thread_call_enter_delayed_with_leeway(kn->kn_thcall,
1667 	    (void *)(uintptr_t)state, deadline, leeway, timer_flags);
1668 }
1669 
1670 /*
1671  * Mark a timer as "already fired" when it is being reprogrammed
1672  *
1673  * If there is a timer in flight, this will do a best effort at canceling it,
1674  * but will not wait. If the thread call was in flight, having set the
1675  * TIMER_IMMEDIATE bit will debounce a filt_timerexpire() racing with this
1676  * cancelation.
1677  */
1678 static void
filt_timerfire_immediate(struct knote * kn)1679 filt_timerfire_immediate(struct knote *kn)
1680 {
1681 	uint32_t state;
1682 
1683 	static_assert(TIMER_IMMEDIATE == TIMER_STATE_MASK,
1684 	    "validate that this atomic or will transition to IMMEDIATE");
1685 	state = os_atomic_or_orig(&kn->kn_hook32, TIMER_IMMEDIATE, relaxed);
1686 
1687 	if ((state & TIMER_STATE_MASK) == TIMER_ARMED) {
1688 		thread_call_cancel(kn->kn_thcall);
1689 	}
1690 }
1691 
1692 /*
1693  * Allocate a thread call for the knote's lifetime, and kick off the timer.
1694  */
1695 static int
filt_timerattach(struct knote * kn,struct kevent_qos_s * kev)1696 filt_timerattach(struct knote *kn, struct kevent_qos_s *kev)
1697 {
1698 	thread_call_t callout;
1699 	struct filt_timer_params params;
1700 	int error;
1701 
1702 	if ((error = filt_timervalidate(kev, &params)) != 0) {
1703 		knote_set_error(kn, error);
1704 		return 0;
1705 	}
1706 
1707 	callout = thread_call_allocate_with_options(filt_timerexpire,
1708 	    (thread_call_param_t)kn, THREAD_CALL_PRIORITY_HIGH,
1709 	    THREAD_CALL_OPTIONS_ONCE);
1710 
1711 	if (NULL == callout) {
1712 		knote_set_error(kn, ENOMEM);
1713 		return 0;
1714 	}
1715 
1716 	filt_timer_set_params(kn, &params);
1717 	kn->kn_thcall = callout;
1718 	kn->kn_flags |= EV_CLEAR;
1719 	os_atomic_store(&kn->kn_hook32, TIMER_IDLE, relaxed);
1720 
1721 	/* NOTE_ABSOLUTE implies EV_ONESHOT */
1722 	if (kn->kn_sfflags & NOTE_ABSOLUTE) {
1723 		kn->kn_flags |= EV_ONESHOT;
1724 	}
1725 
1726 	if (filt_timer_is_ready(kn)) {
1727 		os_atomic_store(&kn->kn_hook32, TIMER_IMMEDIATE, relaxed);
1728 		return FILTER_ACTIVE;
1729 	} else {
1730 		filt_timerarm(kn);
1731 		return 0;
1732 	}
1733 }
1734 
1735 /*
1736  * Shut down the timer if it's running, and free the callout.
1737  */
1738 static void
filt_timerdetach(struct knote * kn)1739 filt_timerdetach(struct knote *kn)
1740 {
1741 	__assert_only boolean_t freed;
1742 
1743 	/*
1744 	 * Unconditionally cancel to make sure there can't be any filt_timerexpire()
1745 	 * running anymore.
1746 	 */
1747 	thread_call_cancel_wait(kn->kn_thcall);
1748 	freed = thread_call_free(kn->kn_thcall);
1749 	assert(freed);
1750 }
1751 
1752 /*
1753  * filt_timertouch - update timer knote with new user input
1754  *
1755  * Cancel and restart the timer based on new user data. When
1756  * the user picks up a knote, clear the count of how many timer
1757  * pops have gone off (in kn_data).
1758  */
1759 static int
filt_timertouch(struct knote * kn,struct kevent_qos_s * kev)1760 filt_timertouch(struct knote *kn, struct kevent_qos_s *kev)
1761 {
1762 	struct filt_timer_params params;
1763 	uint32_t changed_flags = (kn->kn_sfflags ^ kev->fflags);
1764 	int error;
1765 
1766 	if (kev->qos && (knote_get_kq(kn)->kq_state & KQ_WORKLOOP) &&
1767 	    !_pthread_priority_thread_qos(kev->qos)) {
1768 		/* validate usage of FILTER_UPDATE_REQ_QOS */
1769 		kev->flags |= EV_ERROR;
1770 		kev->data = ERANGE;
1771 		return 0;
1772 	}
1773 
1774 	if (changed_flags & NOTE_ABSOLUTE) {
1775 		kev->flags |= EV_ERROR;
1776 		kev->data = EINVAL;
1777 		return 0;
1778 	}
1779 
1780 	if ((error = filt_timervalidate(kev, &params)) != 0) {
1781 		kev->flags |= EV_ERROR;
1782 		kev->data = error;
1783 		return 0;
1784 	}
1785 
1786 	/* capture the new values used to compute deadline */
1787 	filt_timer_set_params(kn, &params);
1788 	kn->kn_sfflags = kev->fflags;
1789 
1790 	if (filt_timer_is_ready(kn)) {
1791 		filt_timerfire_immediate(kn);
1792 		return FILTER_ACTIVE | FILTER_UPDATE_REQ_QOS;
1793 	} else {
1794 		filt_timerarm(kn);
1795 		return FILTER_UPDATE_REQ_QOS;
1796 	}
1797 }
1798 
1799 /*
1800  * filt_timerprocess - query state of knote and snapshot event data
1801  *
1802  * Determine if the timer has fired in the past, snapshot the state
1803  * of the kevent for returning to user-space, and clear pending event
1804  * counters for the next time.
1805  */
1806 static int
filt_timerprocess(struct knote * kn,struct kevent_qos_s * kev)1807 filt_timerprocess(struct knote *kn, struct kevent_qos_s *kev)
1808 {
1809 	uint32_t state = os_atomic_load(&kn->kn_hook32, relaxed);
1810 
1811 	/*
1812 	 * filt_timerprocess is serialized with any filter routine except for
1813 	 * filt_timerexpire which atomically does a TIMER_ARMED -> TIMER_FIRED
1814 	 * transition, and on success, activates the knote.
1815 	 *
1816 	 * Hence, we don't need atomic modifications of the state, only to peek at
1817 	 * whether we see any of the "FIRED" state, and if we do, it is safe to
1818 	 * do simple state machine transitions.
1819 	 */
1820 	switch (state & TIMER_STATE_MASK) {
1821 	case TIMER_IDLE:
1822 	case TIMER_ARMED:
1823 		/*
1824 		 * This can happen if a touch resets a timer that had fired
1825 		 * without being processed
1826 		 */
1827 		return 0;
1828 	}
1829 
1830 	os_atomic_store(&kn->kn_hook32, state & ~TIMER_STATE_MASK, relaxed);
1831 
1832 	/*
1833 	 * Copy out the interesting kevent state,
1834 	 * but don't leak out the raw time calculations.
1835 	 *
1836 	 * TODO: potential enhancements - tell the user about:
1837 	 *      - deadline to which this timer thought it was expiring
1838 	 *      - return kn_sfflags in the fflags field so the client can know
1839 	 *        under what flags the timer fired
1840 	 */
1841 	knote_fill_kevent(kn, kev, 1);
1842 	kev->ext[0] = 0;
1843 	/* kev->ext[1] = 0;  JMM - shouldn't we hide this too? */
1844 
1845 	if (kn->kn_sdata != 0) {
1846 		/*
1847 		 * This is a 'repeating' timer, so we have to emit
1848 		 * how many intervals expired between the arm
1849 		 * and the process.
1850 		 *
1851 		 * A very strange style of interface, because
1852 		 * this could easily be done in the client...
1853 		 */
1854 
1855 		uint64_t now;
1856 
1857 		if (kn->kn_sfflags & NOTE_MACH_CONTINUOUS_TIME) {
1858 			now = mach_continuous_time();
1859 		} else {
1860 			now = mach_absolute_time();
1861 		}
1862 
1863 		uint64_t first_deadline = kn->kn_ext[0];
1864 		uint64_t interval_abs   = kn->kn_sdata;
1865 		uint64_t orig_arm_time  = first_deadline - interval_abs;
1866 
1867 		assert(now > orig_arm_time);
1868 		assert(now > first_deadline);
1869 
1870 		uint64_t elapsed = now - orig_arm_time;
1871 
1872 		uint64_t num_fired = elapsed / interval_abs;
1873 
1874 		/*
1875 		 * To reach this code, we must have seen the timer pop
1876 		 * and be in repeating mode, so therefore it must have been
1877 		 * more than 'interval' time since the attach or last
1878 		 * successful touch.
1879 		 */
1880 		assert(num_fired > 0);
1881 
1882 		/* report how many intervals have elapsed to the user */
1883 		kev->data = (int64_t)num_fired;
1884 
1885 		/* We only need to re-arm the timer if it's not about to be destroyed */
1886 		if ((kn->kn_flags & EV_ONESHOT) == 0) {
1887 			/* fire at the end of the next interval */
1888 			uint64_t new_deadline = first_deadline + num_fired * interval_abs;
1889 
1890 			assert(new_deadline > now);
1891 
1892 			kn->kn_ext[0] = new_deadline;
1893 
1894 			/*
1895 			 * This can't shortcut setting up the thread call, because
1896 			 * knote_process deactivates EV_CLEAR knotes unconditionnally.
1897 			 */
1898 			filt_timerarm(kn);
1899 		}
1900 	}
1901 
1902 	return FILTER_ACTIVE;
1903 }
1904 
1905 SECURITY_READ_ONLY_EARLY(static struct filterops) timer_filtops = {
1906 	.f_extended_codes = true,
1907 	.f_attach   = filt_timerattach,
1908 	.f_detach   = filt_timerdetach,
1909 	.f_event    = filt_bad_event,
1910 	.f_touch    = filt_timertouch,
1911 	.f_process  = filt_timerprocess,
1912 };
1913 
1914 #pragma mark user_filtops
1915 
1916 static int
filt_userattach(struct knote * kn,__unused struct kevent_qos_s * kev)1917 filt_userattach(struct knote *kn, __unused struct kevent_qos_s *kev)
1918 {
1919 	if (kn->kn_sfflags & NOTE_TRIGGER) {
1920 		kn->kn_hook32 = FILTER_ACTIVE;
1921 	} else {
1922 		kn->kn_hook32 = 0;
1923 	}
1924 	return kn->kn_hook32;
1925 }
1926 
1927 static int
filt_usertouch(struct knote * kn,struct kevent_qos_s * kev)1928 filt_usertouch(struct knote *kn, struct kevent_qos_s *kev)
1929 {
1930 	uint32_t ffctrl;
1931 	int fflags;
1932 
1933 	ffctrl = kev->fflags & NOTE_FFCTRLMASK;
1934 	fflags = kev->fflags & NOTE_FFLAGSMASK;
1935 	switch (ffctrl) {
1936 	case NOTE_FFNOP:
1937 		break;
1938 	case NOTE_FFAND:
1939 		kn->kn_sfflags &= fflags;
1940 		break;
1941 	case NOTE_FFOR:
1942 		kn->kn_sfflags |= fflags;
1943 		break;
1944 	case NOTE_FFCOPY:
1945 		kn->kn_sfflags = fflags;
1946 		break;
1947 	}
1948 	kn->kn_sdata = kev->data;
1949 
1950 	if (kev->fflags & NOTE_TRIGGER) {
1951 		kn->kn_hook32 = FILTER_ACTIVE;
1952 	}
1953 	return (int)kn->kn_hook32;
1954 }
1955 
1956 static int
filt_userprocess(struct knote * kn,struct kevent_qos_s * kev)1957 filt_userprocess(struct knote *kn, struct kevent_qos_s *kev)
1958 {
1959 	int result = (int)kn->kn_hook32;
1960 
1961 	if (result) {
1962 		/* EVFILT_USER returns the data that was passed in */
1963 		knote_fill_kevent_with_sdata(kn, kev);
1964 		kev->fflags = kn->kn_sfflags;
1965 		if (kn->kn_flags & EV_CLEAR) {
1966 			/* knote_fill_kevent cleared kn_fflags */
1967 			kn->kn_hook32 = 0;
1968 		}
1969 	}
1970 
1971 	return result;
1972 }
1973 
1974 SECURITY_READ_ONLY_EARLY(static struct filterops) user_filtops = {
1975 	.f_extended_codes = true,
1976 	.f_attach  = filt_userattach,
1977 	.f_detach  = filt_no_detach,
1978 	.f_event   = filt_bad_event,
1979 	.f_touch   = filt_usertouch,
1980 	.f_process = filt_userprocess,
1981 };
1982 
1983 #pragma mark workloop_filtops
1984 
/*
 * Internal pseudo-error returned by filt_wlupdate() and
 * filt_wlupdate_sync_ipc() right after they call disable_preemption().
 * filt_wlattach() and filt_wltouch() translate it into a success result
 * carrying FILTER_THREADREQ_NODEFEER; it is never reported to user-space.
 */
1985 #define EPREEMPTDISABLED (-1)
1986 
1987 static inline void
filt_wllock(struct kqworkloop * kqwl)1988 filt_wllock(struct kqworkloop *kqwl)
1989 {
1990 	lck_spin_lock(&kqwl->kqwl_statelock);
1991 }
1992 
1993 static inline void
filt_wlunlock(struct kqworkloop * kqwl)1994 filt_wlunlock(struct kqworkloop *kqwl)
1995 {
1996 	lck_spin_unlock(&kqwl->kqwl_statelock);
1997 }
1998 
1999 /*
2000  * Returns true when the interlock for the turnstile is the workqueue lock
2001  *
2002  * When this is the case, all turnstiles operations are delegated
2003  * to the workqueue subsystem.
2004  *
2005  * This is required because kqueue_threadreq_bind_prepost only holds the
2006  * workqueue lock but needs to move the inheritor from the workloop turnstile
2007  * away from the creator thread, so that this now fulfilled request cannot be
2008  * picked anymore by other threads.
2009  */
2010 static inline bool
filt_wlturnstile_interlock_is_workq(struct kqworkloop * kqwl)2011 filt_wlturnstile_interlock_is_workq(struct kqworkloop *kqwl)
2012 {
2013 	return kqr_thread_requested_pending(&kqwl->kqwl_request);
2014 }
2015 
2016 static void
filt_wlupdate_inheritor(struct kqworkloop * kqwl,struct turnstile * ts,turnstile_update_flags_t flags)2017 filt_wlupdate_inheritor(struct kqworkloop *kqwl, struct turnstile *ts,
2018     turnstile_update_flags_t flags)
2019 {
2020 	turnstile_inheritor_t inheritor = TURNSTILE_INHERITOR_NULL;
2021 	workq_threadreq_t kqr = &kqwl->kqwl_request;
2022 
2023 	/*
2024 	 * binding to the workq should always happen through
2025 	 * workq_kern_threadreq_update_inheritor()
2026 	 */
2027 	assert(!filt_wlturnstile_interlock_is_workq(kqwl));
2028 
2029 	if ((inheritor = kqwl->kqwl_owner)) {
2030 		flags |= TURNSTILE_INHERITOR_THREAD;
2031 	} else if ((inheritor = kqr_thread(kqr))) {
2032 		flags |= TURNSTILE_INHERITOR_THREAD;
2033 	}
2034 
2035 	turnstile_update_inheritor(ts, inheritor, flags);
2036 }
2037 
/*
 * Initial value of the EFAULT retry budget used by filt_wlupdate() and
 * filt_wlupdate_sync_ipc(); each EFAULT from the locked copyin consumes one.
 */
2038 #define EVFILT_WORKLOOP_EFAULT_RETRY_COUNT 100
/*
 * Values of the `op` argument of filt_wlupdate() and
 * filt_wlupdate_sync_ipc(), telling which filter entry point is calling.
 */
2039 #define FILT_WLATTACH 0
2040 #define FILT_WLTOUCH  1
2041 #define FILT_WLDROP   2
2042 
2043 __result_use_check
2044 static int
filt_wlupdate(struct kqworkloop * kqwl,struct knote * kn,struct kevent_qos_s * kev,kq_index_t qos_index,int op)2045 filt_wlupdate(struct kqworkloop *kqwl, struct knote *kn,
2046     struct kevent_qos_s *kev, kq_index_t qos_index, int op)
2047 {
2048 	user_addr_t uaddr = CAST_USER_ADDR_T(kev->ext[EV_EXTIDX_WL_ADDR]);
2049 	workq_threadreq_t kqr = &kqwl->kqwl_request;
2050 	thread_t cur_owner, new_owner, extra_thread_ref = THREAD_NULL;
2051 	kq_index_t cur_override = THREAD_QOS_UNSPECIFIED;
2052 	int efault_retry = EVFILT_WORKLOOP_EFAULT_RETRY_COUNT;
2053 	int action = KQWL_UTQ_NONE, error = 0;
2054 	bool wl_inheritor_updated = false, needs_wake = false;
2055 	uint64_t kdata = kev->ext[EV_EXTIDX_WL_VALUE];
2056 	uint64_t mask = kev->ext[EV_EXTIDX_WL_MASK];
2057 	uint64_t udata = 0;
2058 	struct turnstile *ts = TURNSTILE_NULL;
2059 
2060 	filt_wllock(kqwl);
2061 
2062 again:
2063 	new_owner = cur_owner = kqwl->kqwl_owner;
2064 
2065 	/*
2066 	 * Phase 1:
2067 	 *
2068 	 * If asked, load the uint64 value at the user provided address and compare
2069 	 * it against the passed in mask and expected value.
2070 	 *
2071 	 * If NOTE_WL_DISCOVER_OWNER is specified, translate the loaded name as
2072 	 * a thread reference.
2073 	 *
2074 	 * If NOTE_WL_END_OWNERSHIP is specified and the currently known owner is
2075 	 * the current thread, then end ownership.
2076 	 *
2077 	 * Lastly decide whether we need to perform a QoS update.
2078 	 */
2079 	if (uaddr) {
2080 		/*
2081 		 * Until <rdar://problem/24999882> exists,
2082 		 * disabling preemption copyin forces any
2083 		 * vm_fault we encounter to fail.
2084 		 */
2085 		error = copyin_atomic64(uaddr, &udata);
2086 
2087 		/*
2088 		 * If we get EFAULT, drop locks, and retry.
2089 		 * If we still get an error report it,
2090 		 * else assume the memory has been faulted
2091 		 * and attempt to copyin under lock again.
2092 		 */
2093 		switch (error) {
2094 		case 0:
2095 			break;
2096 		case EFAULT:
2097 			if (efault_retry-- > 0) {
2098 				filt_wlunlock(kqwl);
2099 				error = copyin_atomic64(uaddr, &udata);
2100 				filt_wllock(kqwl);
2101 				if (error == 0) {
2102 					goto again;
2103 				}
2104 			}
2105 			OS_FALLTHROUGH;
2106 		default:
2107 			goto out;
2108 		}
2109 
2110 		/* Update state as copied in.  */
2111 		kev->ext[EV_EXTIDX_WL_VALUE] = udata;
2112 
2113 		if ((udata & mask) != (kdata & mask)) {
2114 			error = ESTALE;
2115 		} else if (kev->fflags & NOTE_WL_DISCOVER_OWNER) {
2116 			/*
2117 			 * Decipher the owner port name, and translate accordingly.
2118 			 * The low 2 bits were borrowed for other flags, so mask them off.
2119 			 *
2120 			 * Then attempt translation to a thread reference or fail.
2121 			 */
2122 			mach_port_name_t name = (mach_port_name_t)udata & ~0x3;
2123 			if (name != MACH_PORT_NULL) {
2124 				name = ipc_entry_name_mask(name);
2125 				extra_thread_ref = port_name_to_thread(name,
2126 				    PORT_INTRANS_THREAD_IN_CURRENT_TASK);
2127 				if (extra_thread_ref == THREAD_NULL) {
2128 					error = EOWNERDEAD;
2129 					goto out;
2130 				}
2131 				new_owner = extra_thread_ref;
2132 			}
2133 		}
2134 	}
2135 
2136 	if ((kev->fflags & NOTE_WL_END_OWNERSHIP) && new_owner == current_thread()) {
2137 		new_owner = THREAD_NULL;
2138 	}
2139 
2140 	if (error == 0) {
2141 		if ((kev->fflags & NOTE_WL_THREAD_REQUEST) && (kev->flags & EV_DELETE)) {
2142 			action = KQWL_UTQ_SET_QOS_INDEX;
2143 		} else if (qos_index && kqr->tr_kq_qos_index != qos_index) {
2144 			action = KQWL_UTQ_SET_QOS_INDEX;
2145 		}
2146 
2147 		if (op == FILT_WLTOUCH) {
2148 			/*
2149 			 * Save off any additional fflags/data we just accepted
2150 			 * But only keep the last round of "update" bits we acted on which helps
2151 			 * debugging a lot.
2152 			 */
2153 			kn->kn_sfflags &= ~NOTE_WL_UPDATES_MASK;
2154 			kn->kn_sfflags |= kev->fflags;
2155 			if (kev->fflags & NOTE_WL_SYNC_WAKE) {
2156 				needs_wake = (kn->kn_thread != THREAD_NULL);
2157 			}
2158 		} else if (op == FILT_WLDROP) {
2159 			if ((kn->kn_sfflags & (NOTE_WL_SYNC_WAIT | NOTE_WL_SYNC_WAKE)) ==
2160 			    NOTE_WL_SYNC_WAIT) {
2161 				/*
2162 				 * When deleting a SYNC_WAIT knote that hasn't been woken up
2163 				 * explicitly, issue a wake up.
2164 				 */
2165 				kn->kn_sfflags |= NOTE_WL_SYNC_WAKE;
2166 				needs_wake = (kn->kn_thread != THREAD_NULL);
2167 			}
2168 		}
2169 	}
2170 
2171 	/*
2172 	 * Phase 2:
2173 	 *
2174 	 * Commit ownership and QoS changes if any, possibly wake up waiters
2175 	 */
2176 
2177 	if (cur_owner == new_owner && action == KQWL_UTQ_NONE && !needs_wake) {
2178 		goto out;
2179 	}
2180 
2181 	kqlock(kqwl);
2182 
2183 	/* If already tracked as servicer, don't track as owner */
2184 	if (new_owner == kqr_thread(kqr)) {
2185 		new_owner = THREAD_NULL;
2186 	}
2187 
2188 	if (cur_owner != new_owner) {
2189 		kqwl->kqwl_owner = new_owner;
2190 		if (new_owner == extra_thread_ref) {
2191 			/* we just transfered this ref to kqwl_owner */
2192 			extra_thread_ref = THREAD_NULL;
2193 		}
2194 		cur_override = kqworkloop_override(kqwl);
2195 
2196 		if (new_owner) {
2197 			/* override it before we drop the old */
2198 			if (cur_override != THREAD_QOS_UNSPECIFIED) {
2199 				thread_add_kevent_override(new_owner, cur_override);
2200 			}
2201 			if (kqr_thread_requested_pending(kqr)) {
2202 				if (action == KQWL_UTQ_NONE) {
2203 					action = KQWL_UTQ_REDRIVE_EVENTS;
2204 				}
2205 			}
2206 		} else if (action == KQWL_UTQ_NONE &&
2207 		    !kqr_thread_requested(kqr) &&
2208 		    kqwl->kqwl_wakeup_qos) {
2209 			action = KQWL_UTQ_REDRIVE_EVENTS;
2210 		}
2211 	}
2212 
2213 	if (action != KQWL_UTQ_NONE) {
2214 		kqworkloop_update_threads_qos(kqwl, action, qos_index);
2215 	}
2216 
2217 	ts = kqwl->kqwl_turnstile;
2218 	if (cur_owner != new_owner && ts) {
2219 		if (action == KQWL_UTQ_REDRIVE_EVENTS) {
2220 			/*
2221 			 * Note that when action is KQWL_UTQ_REDRIVE_EVENTS,
2222 			 * the code went through workq_kern_threadreq_initiate()
2223 			 * and the workqueue has set the inheritor already
2224 			 */
2225 			assert(filt_wlturnstile_interlock_is_workq(kqwl));
2226 		} else if (filt_wlturnstile_interlock_is_workq(kqwl)) {
2227 			workq_kern_threadreq_lock(kqwl->kqwl_p);
2228 			workq_kern_threadreq_update_inheritor(kqwl->kqwl_p, kqr, new_owner,
2229 			    ts, TURNSTILE_IMMEDIATE_UPDATE);
2230 			workq_kern_threadreq_unlock(kqwl->kqwl_p);
2231 			if (!filt_wlturnstile_interlock_is_workq(kqwl)) {
2232 				/*
2233 				 * If the workq is no longer the interlock, then
2234 				 * workq_kern_threadreq_update_inheritor() has finished a bind
2235 				 * and we need to fallback to the regular path.
2236 				 */
2237 				filt_wlupdate_inheritor(kqwl, ts, TURNSTILE_IMMEDIATE_UPDATE);
2238 			}
2239 			wl_inheritor_updated = true;
2240 		} else {
2241 			filt_wlupdate_inheritor(kqwl, ts, TURNSTILE_IMMEDIATE_UPDATE);
2242 			wl_inheritor_updated = true;
2243 		}
2244 
2245 		/*
2246 		 * We need a turnstile reference because we are dropping the interlock
2247 		 * and the caller has not called turnstile_prepare.
2248 		 */
2249 		if (wl_inheritor_updated) {
2250 			turnstile_reference(ts);
2251 		}
2252 	}
2253 
2254 	if (needs_wake && ts) {
2255 		waitq_wakeup64_thread(&ts->ts_waitq, knote_filt_wev64(kn),
2256 		    kn->kn_thread, THREAD_AWAKENED);
2257 		if (op == FILT_WLATTACH || op == FILT_WLTOUCH) {
2258 			disable_preemption();
2259 			error = EPREEMPTDISABLED;
2260 		}
2261 	}
2262 
2263 	kqunlock(kqwl);
2264 
2265 out:
2266 	/*
2267 	 * Phase 3:
2268 	 *
2269 	 * Unlock and cleanup various lingering references and things.
2270 	 */
2271 	filt_wlunlock(kqwl);
2272 
2273 #if CONFIG_WORKLOOP_DEBUG
2274 	KQWL_HISTORY_WRITE_ENTRY(kqwl, {
2275 		.updater = current_thread(),
2276 		.servicer = kqr_thread(kqr), /* Note: racy */
2277 		.old_owner = cur_owner,
2278 		.new_owner = new_owner,
2279 
2280 		.kev_ident  = kev->ident,
2281 		.error      = (int16_t)error,
2282 		.kev_flags  = kev->flags,
2283 		.kev_fflags = kev->fflags,
2284 
2285 		.kev_mask   = mask,
2286 		.kev_value  = kdata,
2287 		.in_value   = udata,
2288 	});
2289 #endif // CONFIG_WORKLOOP_DEBUG
2290 
2291 	if (wl_inheritor_updated) {
2292 		turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_NOT_HELD);
2293 		turnstile_deallocate_safe(ts);
2294 	}
2295 
2296 	if (cur_owner && new_owner != cur_owner) {
2297 		if (cur_override != THREAD_QOS_UNSPECIFIED) {
2298 			thread_drop_kevent_override(cur_owner);
2299 		}
2300 		thread_deallocate_safe(cur_owner);
2301 	}
2302 	if (extra_thread_ref) {
2303 		thread_deallocate_safe(extra_thread_ref);
2304 	}
2305 	return error;
2306 }
2307 
2308 /*
2309  * Remembers the last updated that came in from userspace for debugging reasons.
2310  * - fflags is mirrored from the userspace kevent
2311  * - ext[i, i != VALUE] is mirrored from the userspace kevent
2312  * - ext[VALUE] is set to what the kernel loaded atomically
2313  * - data is set to the error if any
2314  */
2315 static inline void
filt_wlremember_last_update(struct knote * kn,struct kevent_qos_s * kev,int error)2316 filt_wlremember_last_update(struct knote *kn, struct kevent_qos_s *kev,
2317     int error)
2318 {
2319 	kn->kn_fflags = kev->fflags;
2320 	kn->kn_sdata = error;
2321 	memcpy(kn->kn_ext, kev->ext, sizeof(kev->ext));
2322 }
2323 
2324 static int
filt_wlupdate_sync_ipc(struct kqworkloop * kqwl,struct knote * kn,struct kevent_qos_s * kev,int op)2325 filt_wlupdate_sync_ipc(struct kqworkloop *kqwl, struct knote *kn,
2326     struct kevent_qos_s *kev, int op)
2327 {
2328 	user_addr_t uaddr = (user_addr_t) kev->ext[EV_EXTIDX_WL_ADDR];
2329 	uint64_t kdata = kev->ext[EV_EXTIDX_WL_VALUE];
2330 	uint64_t mask  = kev->ext[EV_EXTIDX_WL_MASK];
2331 	uint64_t udata = 0;
2332 	int efault_retry = EVFILT_WORKLOOP_EFAULT_RETRY_COUNT;
2333 	int error = 0;
2334 
2335 	if (op == FILT_WLATTACH) {
2336 		(void)kqueue_alloc_turnstile(&kqwl->kqwl_kqueue);
2337 	} else if (uaddr == 0) {
2338 		return 0;
2339 	}
2340 
2341 	filt_wllock(kqwl);
2342 
2343 again:
2344 
2345 	/*
2346 	 * Do the debounce thing, the lock serializing the state is the knote lock.
2347 	 */
2348 	if (uaddr) {
2349 		/*
2350 		 * Until <rdar://problem/24999882> exists,
2351 		 * disabling preemption copyin forces any
2352 		 * vm_fault we encounter to fail.
2353 		 */
2354 		error = copyin_atomic64(uaddr, &udata);
2355 
2356 		/*
2357 		 * If we get EFAULT, drop locks, and retry.
2358 		 * If we still get an error report it,
2359 		 * else assume the memory has been faulted
2360 		 * and attempt to copyin under lock again.
2361 		 */
2362 		switch (error) {
2363 		case 0:
2364 			break;
2365 		case EFAULT:
2366 			if (efault_retry-- > 0) {
2367 				filt_wlunlock(kqwl);
2368 				error = copyin_atomic64(uaddr, &udata);
2369 				filt_wllock(kqwl);
2370 				if (error == 0) {
2371 					goto again;
2372 				}
2373 			}
2374 			OS_FALLTHROUGH;
2375 		default:
2376 			goto out;
2377 		}
2378 
2379 		kev->ext[EV_EXTIDX_WL_VALUE] = udata;
2380 		kn->kn_ext[EV_EXTIDX_WL_VALUE] = udata;
2381 
2382 		if ((udata & mask) != (kdata & mask)) {
2383 			error = ESTALE;
2384 			goto out;
2385 		}
2386 	}
2387 
2388 	if (op == FILT_WLATTACH) {
2389 		error = filt_wlattach_sync_ipc(kn);
2390 		if (error == 0) {
2391 			disable_preemption();
2392 			error = EPREEMPTDISABLED;
2393 		}
2394 	}
2395 
2396 out:
2397 	filt_wlunlock(kqwl);
2398 	return error;
2399 }
2400 
2401 static int
filt_wlattach(struct knote * kn,struct kevent_qos_s * kev)2402 filt_wlattach(struct knote *kn, struct kevent_qos_s *kev)
2403 {
2404 	struct kqueue *kq = knote_get_kq(kn);
2405 	struct kqworkloop *kqwl = (struct kqworkloop *)kq;
2406 	int error = 0, result = 0;
2407 	kq_index_t qos_index = 0;
2408 
2409 	if (__improbable((kq->kq_state & KQ_WORKLOOP) == 0)) {
2410 		error = ENOTSUP;
2411 		goto out;
2412 	}
2413 
2414 	uint32_t command = (kn->kn_sfflags & NOTE_WL_COMMANDS_MASK);
2415 	switch (command) {
2416 	case NOTE_WL_THREAD_REQUEST:
2417 		if (kn->kn_id != kqwl->kqwl_dynamicid) {
2418 			error = EINVAL;
2419 			goto out;
2420 		}
2421 		qos_index = _pthread_priority_thread_qos(kn->kn_qos);
2422 		if (qos_index == THREAD_QOS_UNSPECIFIED) {
2423 			error = ERANGE;
2424 			goto out;
2425 		}
2426 		if (kqwl->kqwl_request.tr_kq_qos_index) {
2427 			/*
2428 			 * There already is a thread request, and well, you're only allowed
2429 			 * one per workloop, so fail the attach.
2430 			 */
2431 			error = EALREADY;
2432 			goto out;
2433 		}
2434 		break;
2435 	case NOTE_WL_SYNC_WAIT:
2436 	case NOTE_WL_SYNC_WAKE:
2437 		if (kn->kn_id == kqwl->kqwl_dynamicid) {
2438 			error = EINVAL;
2439 			goto out;
2440 		}
2441 		if ((kn->kn_flags & EV_DISABLE) == 0) {
2442 			error = EINVAL;
2443 			goto out;
2444 		}
2445 		if (kn->kn_sfflags & NOTE_WL_END_OWNERSHIP) {
2446 			error = EINVAL;
2447 			goto out;
2448 		}
2449 		break;
2450 
2451 	case NOTE_WL_SYNC_IPC:
2452 		if ((kn->kn_flags & EV_DISABLE) == 0) {
2453 			error = EINVAL;
2454 			goto out;
2455 		}
2456 		if (kn->kn_sfflags & (NOTE_WL_UPDATE_QOS | NOTE_WL_DISCOVER_OWNER)) {
2457 			error = EINVAL;
2458 			goto out;
2459 		}
2460 		break;
2461 	default:
2462 		error = EINVAL;
2463 		goto out;
2464 	}
2465 
2466 	if (command == NOTE_WL_SYNC_IPC) {
2467 		error = filt_wlupdate_sync_ipc(kqwl, kn, kev, FILT_WLATTACH);
2468 	} else {
2469 		error = filt_wlupdate(kqwl, kn, kev, qos_index, FILT_WLATTACH);
2470 	}
2471 
2472 	if (error == EPREEMPTDISABLED) {
2473 		error = 0;
2474 		result = FILTER_THREADREQ_NODEFEER;
2475 	}
2476 out:
2477 	if (error) {
2478 		/* If userland wants ESTALE to be hidden, fail the attach anyway */
2479 		if (error == ESTALE && (kn->kn_sfflags & NOTE_WL_IGNORE_ESTALE)) {
2480 			error = 0;
2481 		}
2482 		knote_set_error(kn, error);
2483 		return result;
2484 	}
2485 	if (command == NOTE_WL_SYNC_WAIT) {
2486 		return kevent_register_wait_prepare(kn, kev, result);
2487 	}
2488 	/* Just attaching the thread request successfully will fire it */
2489 	if (command == NOTE_WL_THREAD_REQUEST) {
2490 		/*
2491 		 * Thread Request knotes need an explicit touch to be active again,
2492 		 * so delivering an event needs to also consume it.
2493 		 */
2494 		kn->kn_flags |= EV_CLEAR;
2495 		return result | FILTER_ACTIVE;
2496 	}
2497 	return result;
2498 }
2499 
2500 static void __dead2
filt_wlwait_continue(void * parameter,wait_result_t wr)2501 filt_wlwait_continue(void *parameter, wait_result_t wr)
2502 {
2503 	struct _kevent_register *cont_args = parameter;
2504 	struct kqworkloop *kqwl = cont_args->kqwl;
2505 
2506 	kqlock(kqwl);
2507 	if (filt_wlturnstile_interlock_is_workq(kqwl)) {
2508 		workq_kern_threadreq_lock(kqwl->kqwl_p);
2509 		turnstile_complete((uintptr_t)kqwl, &kqwl->kqwl_turnstile, NULL, TURNSTILE_WORKLOOPS);
2510 		workq_kern_threadreq_unlock(kqwl->kqwl_p);
2511 	} else {
2512 		turnstile_complete((uintptr_t)kqwl, &kqwl->kqwl_turnstile, NULL, TURNSTILE_WORKLOOPS);
2513 	}
2514 	kqunlock(kqwl);
2515 
2516 	turnstile_cleanup();
2517 
2518 	if (wr == THREAD_INTERRUPTED) {
2519 		cont_args->kev.flags |= EV_ERROR;
2520 		cont_args->kev.data = EINTR;
2521 	} else if (wr != THREAD_AWAKENED) {
2522 		panic("Unexpected wait result: %d", wr);
2523 	}
2524 
2525 	kevent_register_wait_return(cont_args);
2526 }
2527 
2528 /*
2529  * Called with the workloop mutex held, most of the time never returns as it
2530  * calls filt_wlwait_continue through a continuation.
2531  */
2532 static void __dead2
filt_wlpost_register_wait(struct uthread * uth,struct knote * kn,struct _kevent_register * cont_args)2533 filt_wlpost_register_wait(struct uthread *uth, struct knote *kn,
2534     struct _kevent_register *cont_args)
2535 {
2536 	struct kqworkloop *kqwl = cont_args->kqwl;
2537 	workq_threadreq_t kqr = &kqwl->kqwl_request;
2538 	struct turnstile *ts;
2539 	bool workq_locked = false;
2540 
2541 	kqlock_held(kqwl);
2542 
2543 	if (filt_wlturnstile_interlock_is_workq(kqwl)) {
2544 		workq_kern_threadreq_lock(kqwl->kqwl_p);
2545 		workq_locked = true;
2546 	}
2547 
2548 	ts = turnstile_prepare((uintptr_t)kqwl, &kqwl->kqwl_turnstile,
2549 	    TURNSTILE_NULL, TURNSTILE_WORKLOOPS);
2550 
2551 	if (workq_locked) {
2552 		workq_kern_threadreq_update_inheritor(kqwl->kqwl_p,
2553 		    &kqwl->kqwl_request, kqwl->kqwl_owner, ts,
2554 		    TURNSTILE_DELAYED_UPDATE);
2555 		if (!filt_wlturnstile_interlock_is_workq(kqwl)) {
2556 			/*
2557 			 * if the interlock is no longer the workqueue lock,
2558 			 * then we don't need to hold it anymore.
2559 			 */
2560 			workq_kern_threadreq_unlock(kqwl->kqwl_p);
2561 			workq_locked = false;
2562 		}
2563 	}
2564 	if (!workq_locked) {
2565 		/*
2566 		 * If the interlock is the workloop's, then it's our responsibility to
2567 		 * call update_inheritor, so just do it.
2568 		 */
2569 		filt_wlupdate_inheritor(kqwl, ts, TURNSTILE_DELAYED_UPDATE);
2570 	}
2571 
2572 	thread_set_pending_block_hint(get_machthread(uth), kThreadWaitWorkloopSyncWait);
2573 	waitq_assert_wait64(&ts->ts_waitq, knote_filt_wev64(kn),
2574 	    THREAD_ABORTSAFE, TIMEOUT_WAIT_FOREVER);
2575 
2576 	if (workq_locked) {
2577 		workq_kern_threadreq_unlock(kqwl->kqwl_p);
2578 	}
2579 
2580 	thread_t thread = kqwl->kqwl_owner ?: kqr_thread(kqr);
2581 	if (thread) {
2582 		thread_reference(thread);
2583 	}
2584 
2585 	kevent_register_wait_block(ts, thread, filt_wlwait_continue, cont_args);
2586 }
2587 
2588 /* called in stackshot context to report the thread responsible for blocking this thread */
2589 void
kdp_workloop_sync_wait_find_owner(__assert_only thread_t thread,event64_t event,thread_waitinfo_t * waitinfo)2590 kdp_workloop_sync_wait_find_owner(__assert_only thread_t thread,
2591     event64_t event, thread_waitinfo_t *waitinfo)
2592 {
2593 	struct knote *kn = (struct knote *)event;
2594 
2595 	zone_require(knote_zone, kn);
2596 
2597 	assert(kn->kn_thread == thread);
2598 
2599 	struct kqueue *kq = knote_get_kq(kn);
2600 
2601 	zone_require(kqworkloop_zone, kq);
2602 	assert(kq->kq_state & KQ_WORKLOOP);
2603 
2604 	struct kqworkloop *kqwl = (struct kqworkloop *)kq;
2605 	workq_threadreq_t kqr = &kqwl->kqwl_request;
2606 
2607 	thread_t kqwl_owner = kqwl->kqwl_owner;
2608 
2609 	if (kqwl_owner != THREAD_NULL) {
2610 		thread_require(kqwl_owner);
2611 		waitinfo->owner = thread_tid(kqwl->kqwl_owner);
2612 	} else if ((kqr->tr_state >= WORKQ_TR_STATE_BINDING) && (kqr->tr_thread != NULL)) {
2613 		thread_require(kqr->tr_thread);
2614 		waitinfo->owner = thread_tid(kqr->tr_thread);
2615 	} else if (kqr_thread_requested_pending(kqr)) { /* > idle, < bound */
2616 		waitinfo->owner = STACKSHOT_WAITOWNER_THREQUESTED;
2617 	} else {
2618 		waitinfo->owner = 0;
2619 	}
2620 
2621 	waitinfo->context = kqwl->kqwl_dynamicid;
2622 }
2623 
2624 static void
filt_wldetach(struct knote * kn)2625 filt_wldetach(struct knote *kn)
2626 {
2627 	if (kn->kn_sfflags & NOTE_WL_SYNC_IPC) {
2628 		filt_wldetach_sync_ipc(kn);
2629 	} else if (kn->kn_thread) {
2630 		kevent_register_wait_cleanup(kn);
2631 	}
2632 }
2633 
2634 static int
filt_wlvalidate_kev_flags(struct knote * kn,struct kevent_qos_s * kev,thread_qos_t * qos_index)2635 filt_wlvalidate_kev_flags(struct knote *kn, struct kevent_qos_s *kev,
2636     thread_qos_t *qos_index)
2637 {
2638 	uint32_t new_commands = kev->fflags & NOTE_WL_COMMANDS_MASK;
2639 	uint32_t sav_commands = kn->kn_sfflags & NOTE_WL_COMMANDS_MASK;
2640 
2641 	if ((kev->fflags & NOTE_WL_DISCOVER_OWNER) && (kev->flags & EV_DELETE)) {
2642 		return EINVAL;
2643 	}
2644 	if (kev->fflags & NOTE_WL_UPDATE_QOS) {
2645 		if (kev->flags & EV_DELETE) {
2646 			return EINVAL;
2647 		}
2648 		if (sav_commands != NOTE_WL_THREAD_REQUEST) {
2649 			return EINVAL;
2650 		}
2651 		if (!(*qos_index = _pthread_priority_thread_qos(kev->qos))) {
2652 			return ERANGE;
2653 		}
2654 	}
2655 
2656 	switch (new_commands) {
2657 	case NOTE_WL_THREAD_REQUEST:
2658 		/* thread requests can only update themselves */
2659 		if (sav_commands != NOTE_WL_THREAD_REQUEST) {
2660 			return EINVAL;
2661 		}
2662 		break;
2663 
2664 	case NOTE_WL_SYNC_WAIT:
2665 		if (kev->fflags & NOTE_WL_END_OWNERSHIP) {
2666 			return EINVAL;
2667 		}
2668 		goto sync_checks;
2669 
2670 	case NOTE_WL_SYNC_WAKE:
2671 sync_checks:
2672 		if (!(sav_commands & (NOTE_WL_SYNC_WAIT | NOTE_WL_SYNC_WAKE))) {
2673 			return EINVAL;
2674 		}
2675 		if ((kev->flags & (EV_ENABLE | EV_DELETE)) == EV_ENABLE) {
2676 			return EINVAL;
2677 		}
2678 		break;
2679 
2680 	case NOTE_WL_SYNC_IPC:
2681 		if (sav_commands != NOTE_WL_SYNC_IPC) {
2682 			return EINVAL;
2683 		}
2684 		if ((kev->flags & (EV_ENABLE | EV_DELETE)) == EV_ENABLE) {
2685 			return EINVAL;
2686 		}
2687 		break;
2688 
2689 	default:
2690 		return EINVAL;
2691 	}
2692 	return 0;
2693 }
2694 
2695 static int
filt_wltouch(struct knote * kn,struct kevent_qos_s * kev)2696 filt_wltouch(struct knote *kn, struct kevent_qos_s *kev)
2697 {
2698 	struct kqworkloop *kqwl = (struct kqworkloop *)knote_get_kq(kn);
2699 	thread_qos_t qos_index = THREAD_QOS_UNSPECIFIED;
2700 	int result = 0;
2701 
2702 	int error = filt_wlvalidate_kev_flags(kn, kev, &qos_index);
2703 	if (error) {
2704 		goto out;
2705 	}
2706 
2707 	uint32_t command = kev->fflags & NOTE_WL_COMMANDS_MASK;
2708 	if (command == NOTE_WL_SYNC_IPC) {
2709 		error = filt_wlupdate_sync_ipc(kqwl, kn, kev, FILT_WLTOUCH);
2710 	} else {
2711 		error = filt_wlupdate(kqwl, kn, kev, qos_index, FILT_WLTOUCH);
2712 		filt_wlremember_last_update(kn, kev, error);
2713 	}
2714 	if (error == EPREEMPTDISABLED) {
2715 		error = 0;
2716 		result = FILTER_THREADREQ_NODEFEER;
2717 	}
2718 
2719 out:
2720 	if (error) {
2721 		if (error == ESTALE && (kev->fflags & NOTE_WL_IGNORE_ESTALE)) {
2722 			/* If userland wants ESTALE to be hidden, do not activate */
2723 			return result;
2724 		}
2725 		kev->flags |= EV_ERROR;
2726 		kev->data = error;
2727 		return result;
2728 	}
2729 	if (command == NOTE_WL_SYNC_WAIT && !(kn->kn_sfflags & NOTE_WL_SYNC_WAKE)) {
2730 		return kevent_register_wait_prepare(kn, kev, result);
2731 	}
2732 	/* Just touching the thread request successfully will fire it */
2733 	if (command == NOTE_WL_THREAD_REQUEST) {
2734 		if (kev->fflags & NOTE_WL_UPDATE_QOS) {
2735 			result |= FILTER_UPDATE_REQ_QOS;
2736 		}
2737 		result |= FILTER_ACTIVE;
2738 	}
2739 	return result;
2740 }
2741 
2742 static bool
filt_wlallow_drop(struct knote * kn,struct kevent_qos_s * kev)2743 filt_wlallow_drop(struct knote *kn, struct kevent_qos_s *kev)
2744 {
2745 	struct kqworkloop *kqwl = (struct kqworkloop *)knote_get_kq(kn);
2746 
2747 	int error = filt_wlvalidate_kev_flags(kn, kev, NULL);
2748 	if (error) {
2749 		goto out;
2750 	}
2751 
2752 	uint32_t command = (kev->fflags & NOTE_WL_COMMANDS_MASK);
2753 	if (command == NOTE_WL_SYNC_IPC) {
2754 		error = filt_wlupdate_sync_ipc(kqwl, kn, kev, FILT_WLDROP);
2755 	} else {
2756 		error = filt_wlupdate(kqwl, kn, kev, 0, FILT_WLDROP);
2757 		filt_wlremember_last_update(kn, kev, error);
2758 	}
2759 	assert(error != EPREEMPTDISABLED);
2760 
2761 out:
2762 	if (error) {
2763 		if (error == ESTALE && (kev->fflags & NOTE_WL_IGNORE_ESTALE)) {
2764 			return false;
2765 		}
2766 		kev->flags |= EV_ERROR;
2767 		kev->data = error;
2768 		return false;
2769 	}
2770 	return true;
2771 }
2772 
2773 static int
filt_wlprocess(struct knote * kn,struct kevent_qos_s * kev)2774 filt_wlprocess(struct knote *kn, struct kevent_qos_s *kev)
2775 {
2776 	struct kqworkloop *kqwl = (struct kqworkloop *)knote_get_kq(kn);
2777 	int rc = 0;
2778 
2779 	assert(kn->kn_sfflags & NOTE_WL_THREAD_REQUEST);
2780 
2781 	kqlock(kqwl);
2782 
2783 	if (kqwl->kqwl_owner) {
2784 		/*
2785 		 * <rdar://problem/33584321> userspace sometimes due to events being
2786 		 * delivered but not triggering a drain session can cause a process
2787 		 * of the thread request knote.
2788 		 *
2789 		 * When that happens, the automatic deactivation due to process
2790 		 * would swallow the event, so we have to activate the knote again.
2791 		 */
2792 		knote_activate(kqwl, kn, FILTER_ACTIVE);
2793 	} else {
2794 #if DEBUG || DEVELOPMENT
2795 		if (kevent_debug_flags & KEVENT_PANIC_ON_NON_ENQUEUED_PROCESS) {
2796 			/*
2797 			 * see src/queue_internal.h in libdispatch
2798 			 */
2799 #define DISPATCH_QUEUE_ENQUEUED 0x1ull
2800 			user_addr_t addr = CAST_USER_ADDR_T(kn->kn_ext[EV_EXTIDX_WL_ADDR]);
2801 			task_t t = current_task();
2802 			uint64_t val;
2803 			if (addr && task_is_active(t) && !task_is_halting(t) &&
2804 			    copyin_atomic64(addr, &val) == 0 &&
2805 			    val && (val & DISPATCH_QUEUE_ENQUEUED) == 0 &&
2806 			    (val >> 48) != 0xdead && (val >> 48) != 0 && (val >> 48) != 0xffff) {
2807 				panic("kevent: workloop %#016llx is not enqueued "
2808 				    "(kn:%p dq_state:%#016llx kev.dq_state:%#016llx)",
2809 				    kn->kn_udata, kn, val, kn->kn_ext[EV_EXTIDX_WL_VALUE]);
2810 			}
2811 		}
2812 #endif
2813 		knote_fill_kevent(kn, kev, 0);
2814 		kev->fflags = kn->kn_sfflags;
2815 		rc |= FILTER_ACTIVE;
2816 	}
2817 
2818 	kqunlock(kqwl);
2819 
2820 	if (rc & FILTER_ACTIVE) {
2821 		workq_thread_set_max_qos(kqwl->kqwl_p, &kqwl->kqwl_request);
2822 	}
2823 	return rc;
2824 }
2825 
2826 SECURITY_READ_ONLY_EARLY(static struct filterops) workloop_filtops = {
2827 	.f_extended_codes = true,
2828 	.f_attach  = filt_wlattach,
2829 	.f_detach  = filt_wldetach,
2830 	.f_event   = filt_bad_event,
2831 	.f_touch   = filt_wltouch,
2832 	.f_process = filt_wlprocess,
2833 	.f_allow_drop = filt_wlallow_drop,
2834 	.f_post_register_wait = filt_wlpost_register_wait,
2835 };
2836 
2837 #pragma mark - kqueues allocation and deallocation
2838 
2839 OS_NOINLINE
2840 static void
2841 kqworkloop_dealloc(struct kqworkloop *, bool hash_remove);
2842 
2843 static inline bool
kqworkloop_try_retain(struct kqworkloop * kqwl)2844 kqworkloop_try_retain(struct kqworkloop *kqwl)
2845 {
2846 	return os_ref_retain_try_raw(&kqwl->kqwl_retains, NULL);
2847 }
2848 
2849 static inline void
kqworkloop_retain(struct kqworkloop * kqwl)2850 kqworkloop_retain(struct kqworkloop *kqwl)
2851 {
2852 	return os_ref_retain_raw(&kqwl->kqwl_retains, NULL);
2853 }
2854 
2855 OS_ALWAYS_INLINE
2856 static inline void
kqueue_retain(kqueue_t kqu)2857 kqueue_retain(kqueue_t kqu)
2858 {
2859 	if (kqu.kq->kq_state & KQ_DYNAMIC) {
2860 		kqworkloop_retain(kqu.kqwl);
2861 	}
2862 }
2863 
2864 OS_ALWAYS_INLINE
2865 static inline void
kqworkloop_release_live(struct kqworkloop * kqwl)2866 kqworkloop_release_live(struct kqworkloop *kqwl)
2867 {
2868 	os_ref_release_live_raw(&kqwl->kqwl_retains, NULL);
2869 }
2870 
2871 OS_ALWAYS_INLINE
2872 static inline void
kqueue_release_live(kqueue_t kqu)2873 kqueue_release_live(kqueue_t kqu)
2874 {
2875 	if (kqu.kq->kq_state & KQ_DYNAMIC) {
2876 		kqworkloop_release_live(kqu.kqwl);
2877 	}
2878 }
2879 
2880 OS_ALWAYS_INLINE
2881 static inline void
kqworkloop_release(struct kqworkloop * kqwl)2882 kqworkloop_release(struct kqworkloop *kqwl)
2883 {
2884 	if (os_ref_release_raw(&kqwl->kqwl_retains, NULL) == 0) {
2885 		kqworkloop_dealloc(kqwl, true);
2886 	}
2887 }
2888 
2889 OS_ALWAYS_INLINE
2890 static inline void
kqueue_release(kqueue_t kqu)2891 kqueue_release(kqueue_t kqu)
2892 {
2893 	if (kqu.kq->kq_state & KQ_DYNAMIC) {
2894 		kqworkloop_release(kqu.kqwl);
2895 	}
2896 }
2897 
2898 /*!
2899  * @function kqueue_destroy
2900  *
2901  * @brief
2902  * Common part to all kqueue dealloc functions.
2903  */
2904 OS_NOINLINE
2905 static void
kqueue_destroy(kqueue_t kqu,zone_t zone)2906 kqueue_destroy(kqueue_t kqu, zone_t zone)
2907 {
2908 	lck_spin_destroy(&kqu.kq->kq_lock, &kq_lck_grp);
2909 
2910 	zfree(zone, kqu.kq);
2911 }
2912 
2913 /*!
2914  * @function kqueue_init
2915  *
2916  * @brief
2917  * Common part to all kqueue alloc functions.
2918  */
2919 static kqueue_t
kqueue_init(kqueue_t kqu)2920 kqueue_init(kqueue_t kqu)
2921 {
2922 	lck_spin_init(&kqu.kq->kq_lock, &kq_lck_grp, LCK_ATTR_NULL);
2923 	return kqu;
2924 }
2925 
2926 #pragma mark kqfile allocation and deallocation
2927 
2928 /*!
2929  * @function kqueue_dealloc
2930  *
2931  * @brief
2932  * Detach all knotes from a kqfile and free it.
2933  *
2934  * @discussion
2935  * We walk each list looking for knotes referencing this
2936  * this kqueue.  If we find one, we try to drop it.  But
2937  * if we fail to get a drop reference, that will wait
2938  * until it is dropped.  So, we can just restart again
2939  * safe in the assumption that the list will eventually
2940  * not contain any more references to this kqueue (either
2941  * we dropped them all, or someone else did).
2942  *
2943  * Assumes no new events are being added to the kqueue.
2944  * Nothing locked on entry or exit.
2945  */
2946 void
kqueue_dealloc(struct kqueue * kq)2947 kqueue_dealloc(struct kqueue *kq)
2948 {
2949 	KNOTE_LOCK_CTX(knlc);
2950 	struct proc *p = kq->kq_p;
2951 	struct filedesc *fdp = &p->p_fd;
2952 	struct knote *kn;
2953 
2954 	assert(kq && (kq->kq_state & (KQ_WORKLOOP | KQ_WORKQ)) == 0);
2955 
2956 	proc_fdlock(p);
2957 	for (int i = 0; i < fdp->fd_knlistsize; i++) {
2958 		kn = SLIST_FIRST(&fdp->fd_knlist[i]);
2959 		while (kn != NULL) {
2960 			if (kq == knote_get_kq(kn)) {
2961 				kqlock(kq);
2962 				proc_fdunlock(p);
2963 				if (knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ON_SUCCESS)) {
2964 					knote_drop(kq, kn, &knlc);
2965 				}
2966 				proc_fdlock(p);
2967 				/* start over at beginning of list */
2968 				kn = SLIST_FIRST(&fdp->fd_knlist[i]);
2969 				continue;
2970 			}
2971 			kn = SLIST_NEXT(kn, kn_link);
2972 		}
2973 	}
2974 
2975 	knhash_lock(fdp);
2976 	proc_fdunlock(p);
2977 
2978 	if (fdp->fd_knhashmask != 0) {
2979 		for (int i = 0; i < (int)fdp->fd_knhashmask + 1; i++) {
2980 			kn = SLIST_FIRST(&fdp->fd_knhash[i]);
2981 			while (kn != NULL) {
2982 				if (kq == knote_get_kq(kn)) {
2983 					kqlock(kq);
2984 					knhash_unlock(fdp);
2985 					if (knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ON_SUCCESS)) {
2986 						knote_drop(kq, kn, &knlc);
2987 					}
2988 					knhash_lock(fdp);
2989 					/* start over at beginning of list */
2990 					kn = SLIST_FIRST(&fdp->fd_knhash[i]);
2991 					continue;
2992 				}
2993 				kn = SLIST_NEXT(kn, kn_link);
2994 			}
2995 		}
2996 	}
2997 	knhash_unlock(fdp);
2998 
2999 	kqueue_destroy(kq, kqfile_zone);
3000 }
3001 
3002 /*!
3003  * @function kqueue_alloc
3004  *
3005  * @brief
3006  * Allocate a kqfile.
3007  */
3008 struct kqueue *
kqueue_alloc(struct proc * p)3009 kqueue_alloc(struct proc *p)
3010 {
3011 	struct kqfile *kqf;
3012 
3013 	/*
3014 	 * kqfiles are created with kqueue() so we need to wait for
3015 	 * the first kevent syscall to know which bit among
3016 	 * KQ_KEV_{32,64,QOS} will be set in kqf_state
3017 	 */
3018 	kqf = zalloc_flags(kqfile_zone, Z_WAITOK | Z_ZERO);
3019 	kqf->kqf_p = p;
3020 	TAILQ_INIT_AFTER_BZERO(&kqf->kqf_queue);
3021 	TAILQ_INIT_AFTER_BZERO(&kqf->kqf_suppressed);
3022 
3023 	return kqueue_init(kqf).kq;
3024 }
3025 
3026 /*!
3027  * @function kqueue_internal
3028  *
3029  * @brief
3030  * Core implementation for kqueue and guarded_kqueue_np()
3031  */
3032 int
kqueue_internal(struct proc * p,fp_initfn_t fp_init,void * initarg,int32_t * retval)3033 kqueue_internal(struct proc *p, fp_initfn_t fp_init, void *initarg, int32_t *retval)
3034 {
3035 	struct kqueue *kq;
3036 	struct fileproc *fp;
3037 	int fd, error;
3038 
3039 	error = falloc_withinit(p, current_cached_proc_cred(p),
3040 	    vfs_context_current(), &fp, &fd, fp_init, initarg);
3041 	if (error) {
3042 		return error;
3043 	}
3044 
3045 	kq = kqueue_alloc(p);
3046 	if (kq == NULL) {
3047 		fp_free(p, fd, fp);
3048 		return ENOMEM;
3049 	}
3050 
3051 	fp->fp_flags |= FP_CLOEXEC | FP_CLOFORK;
3052 	fp->f_flag = FREAD | FWRITE;
3053 	fp->f_ops = &kqueueops;
3054 	fp_set_data(fp, kq);
3055 	fp->f_lflags |= FG_CONFINED;
3056 
3057 	proc_fdlock(p);
3058 	procfdtbl_releasefd(p, fd, NULL);
3059 	fp_drop(p, fd, fp, 1);
3060 	proc_fdunlock(p);
3061 
3062 	*retval = fd;
3063 	return error;
3064 }
3065 
3066 /*!
3067  * @function kqueue
3068  *
3069  * @brief
3070  * The kqueue syscall.
3071  */
3072 int
kqueue(struct proc * p,__unused struct kqueue_args * uap,int32_t * retval)3073 kqueue(struct proc *p, __unused struct kqueue_args *uap, int32_t *retval)
3074 {
3075 	return kqueue_internal(p, NULL, NULL, retval);
3076 }
3077 
3078 #pragma mark kqworkq allocation and deallocation
3079 
3080 /*!
3081  * @function kqworkq_dealloc
3082  *
3083  * @brief
3084  * Deallocates a workqueue kqueue.
3085  *
3086  * @discussion
3087  * This only happens at process death, or for races with concurrent
3088  * kevent_get_kqwq calls, hence we don't have to care about knotes referencing
3089  * this kqueue, either there are none, or someone else took care of them.
3090  */
3091 void
kqworkq_dealloc(struct kqworkq * kqwq)3092 kqworkq_dealloc(struct kqworkq *kqwq)
3093 {
3094 	kqueue_destroy(kqwq, kqworkq_zone);
3095 }
3096 
3097 /*!
3098  * @function kqworkq_alloc
3099  *
3100  * @brief
3101  * Allocates a workqueue kqueue.
3102  *
3103  * @discussion
3104  * This is the slow path of kevent_get_kqwq.
3105  * This takes care of making sure procs have a single workq kqueue.
3106  */
3107 OS_NOINLINE
3108 static struct kqworkq *
kqworkq_alloc(struct proc * p,unsigned int flags)3109 kqworkq_alloc(struct proc *p, unsigned int flags)
3110 {
3111 	struct kqworkq *kqwq, *tmp;
3112 
3113 	kqwq = zalloc_flags(kqworkq_zone, Z_WAITOK | Z_ZERO);
3114 
3115 	assert((flags & KEVENT_FLAG_LEGACY32) == 0);
3116 	if (flags & KEVENT_FLAG_LEGACY64) {
3117 		kqwq->kqwq_state = KQ_WORKQ | KQ_KEV64;
3118 	} else {
3119 		kqwq->kqwq_state = KQ_WORKQ | KQ_KEV_QOS;
3120 	}
3121 	kqwq->kqwq_p = p;
3122 
3123 	for (int i = 0; i < KQWQ_NBUCKETS; i++) {
3124 		TAILQ_INIT_AFTER_BZERO(&kqwq->kqwq_queue[i]);
3125 		TAILQ_INIT_AFTER_BZERO(&kqwq->kqwq_suppressed[i]);
3126 	}
3127 	for (int i = 0; i < KQWQ_NBUCKETS; i++) {
3128 		/*
3129 		 * Because of how the bucketized system works, we mix overcommit
3130 		 * sources with not overcommit: each time we move a knote from
3131 		 * one bucket to the next due to overrides, we'd had to track
3132 		 * overcommitness, and it's really not worth it in the workloop
3133 		 * enabled world that track this faithfully.
3134 		 *
3135 		 * Incidentally, this behaves like the original manager-based
3136 		 * kqwq where event delivery always happened (hence is
3137 		 * "overcommit")
3138 		 */
3139 		kqwq->kqwq_request[i].tr_state = WORKQ_TR_STATE_IDLE;
3140 		kqwq->kqwq_request[i].tr_flags = WORKQ_TR_FLAG_KEVENT;
3141 		if (i != KQWQ_QOS_MANAGER) {
3142 			kqwq->kqwq_request[i].tr_flags |= WORKQ_TR_FLAG_OVERCOMMIT;
3143 		}
3144 		kqwq->kqwq_request[i].tr_kq_qos_index = (kq_index_t)i + 1;
3145 	}
3146 
3147 	kqueue_init(kqwq);
3148 
3149 	if (!os_atomic_cmpxchgv(&p->p_fd.fd_wqkqueue, NULL, kqwq, &tmp, release)) {
3150 		kqworkq_dealloc(kqwq);
3151 		return tmp;
3152 	}
3153 
3154 	return kqwq;
3155 }
3156 
3157 #pragma mark kqworkloop allocation and deallocation
3158 
/*
 * Hash a workloop id into a bucket index: fold bits 8 and up onto the low
 * bits, then mask. Every use of an argument is parenthesized so compound
 * expressions hash the same as their value would (`val` is evaluated twice,
 * so it must not have side effects).
 */
#define KQ_HASH(val, mask)  (((val) ^ ((val) >> 8)) & (mask))
#define CONFIG_KQ_HASHSIZE  CONFIG_KN_HASHSIZE
3161 
3162 OS_ALWAYS_INLINE
3163 static inline void
kqhash_lock(struct filedesc * fdp)3164 kqhash_lock(struct filedesc *fdp)
3165 {
3166 	lck_mtx_lock_spin_always(&fdp->fd_kqhashlock);
3167 }
3168 
3169 OS_ALWAYS_INLINE
3170 static inline void
kqhash_unlock(struct filedesc * fdp)3171 kqhash_unlock(struct filedesc *fdp)
3172 {
3173 	lck_mtx_unlock(&fdp->fd_kqhashlock);
3174 }
3175 
3176 OS_ALWAYS_INLINE
3177 static inline void
kqworkloop_hash_insert_locked(struct filedesc * fdp,kqueue_id_t id,struct kqworkloop * kqwl)3178 kqworkloop_hash_insert_locked(struct filedesc *fdp, kqueue_id_t id,
3179     struct kqworkloop *kqwl)
3180 {
3181 	struct kqwllist *list = &fdp->fd_kqhash[KQ_HASH(id, fdp->fd_kqhashmask)];
3182 	LIST_INSERT_HEAD(list, kqwl, kqwl_hashlink);
3183 }
3184 
3185 OS_ALWAYS_INLINE
3186 static inline struct kqworkloop *
kqworkloop_hash_lookup_locked(struct filedesc * fdp,kqueue_id_t id)3187 kqworkloop_hash_lookup_locked(struct filedesc *fdp, kqueue_id_t id)
3188 {
3189 	struct kqwllist *list = &fdp->fd_kqhash[KQ_HASH(id, fdp->fd_kqhashmask)];
3190 	struct kqworkloop *kqwl;
3191 
3192 	LIST_FOREACH(kqwl, list, kqwl_hashlink) {
3193 		if (kqwl->kqwl_dynamicid == id) {
3194 			return kqwl;
3195 		}
3196 	}
3197 	return NULL;
3198 }
3199 
3200 static struct kqworkloop *
kqworkloop_hash_lookup_and_retain(struct filedesc * fdp,kqueue_id_t kq_id)3201 kqworkloop_hash_lookup_and_retain(struct filedesc *fdp, kqueue_id_t kq_id)
3202 {
3203 	struct kqworkloop *kqwl = NULL;
3204 
3205 	kqhash_lock(fdp);
3206 	if (__probable(fdp->fd_kqhash)) {
3207 		kqwl = kqworkloop_hash_lookup_locked(fdp, kq_id);
3208 		if (kqwl && !kqworkloop_try_retain(kqwl)) {
3209 			kqwl = NULL;
3210 		}
3211 	}
3212 	kqhash_unlock(fdp);
3213 	return kqwl;
3214 }
3215 
3216 OS_NOINLINE
3217 static void
kqworkloop_hash_init(struct filedesc * fdp)3218 kqworkloop_hash_init(struct filedesc *fdp)
3219 {
3220 	struct kqwllist *alloc_hash;
3221 	u_long alloc_mask;
3222 
3223 	kqhash_unlock(fdp);
3224 	alloc_hash = hashinit(CONFIG_KQ_HASHSIZE, M_KQUEUE, &alloc_mask);
3225 	kqhash_lock(fdp);
3226 
3227 	/* See if we won the race */
3228 	if (__probable(fdp->fd_kqhashmask == 0)) {
3229 		fdp->fd_kqhash = alloc_hash;
3230 		fdp->fd_kqhashmask = alloc_mask;
3231 	} else {
3232 		kqhash_unlock(fdp);
3233 		hashdestroy(alloc_hash, M_KQUEUE, alloc_mask);
3234 		kqhash_lock(fdp);
3235 	}
3236 }
3237 
3238 /*
3239  * kqueue iotier override is only supported for kqueue that has
3240  * only one port as a mach port source. Updating the iotier
3241  * override on the mach port source will update the override
3242  * on kqueue as well. Since kqueue with iotier override will
3243  * only have one port attached, there is no logic for saturation
3244  * like qos override, the iotier override of mach port source
3245  * would be reflected in kevent iotier override.
3246  */
3247 void
kqueue_set_iotier_override(kqueue_t kqu,uint8_t iotier_override)3248 kqueue_set_iotier_override(kqueue_t kqu, uint8_t iotier_override)
3249 {
3250 	if (!(kqu.kq->kq_state & KQ_WORKLOOP)) {
3251 		return;
3252 	}
3253 
3254 	struct kqworkloop *kqwl = kqu.kqwl;
3255 	os_atomic_store(&kqwl->kqwl_iotier_override, iotier_override, relaxed);
3256 }
3257 
3258 uint8_t
kqueue_get_iotier_override(kqueue_t kqu)3259 kqueue_get_iotier_override(kqueue_t kqu)
3260 {
3261 	if (!(kqu.kq->kq_state & KQ_WORKLOOP)) {
3262 		return THROTTLE_LEVEL_END;
3263 	}
3264 
3265 	struct kqworkloop *kqwl = kqu.kqwl;
3266 	return os_atomic_load(&kqwl->kqwl_iotier_override, relaxed);
3267 }
3268 
3269 #if CONFIG_PREADOPT_TG
3270 /*
3271  * This function is called with a borrowed reference on the thread group without
3272  * kq lock held with the mqueue lock held. It may or may not have the knote lock
3273  * (called from both fevent as well as fattach/ftouch). Upon success, an
3274  * additional reference on the TG is taken
 *
 * Only workloop kqueues are affected; for any other kqueue a trace point is
 * emitted and nothing else happens.
 *
 * The thread group is only installed when the current kqwl_preadopt_tg value
 * allows adoption and, if a value is already present, when `qos` is strictly
 * higher than the QoS encoded in it.
3275  */
3276 void
kqueue_set_preadopted_thread_group(kqueue_t kqu,struct thread_group * tg,thread_qos_t qos)3277 kqueue_set_preadopted_thread_group(kqueue_t kqu, struct thread_group *tg, thread_qos_t qos)
3278 {
3279 	if (!(kqu.kq->kq_state & KQ_WORKLOOP)) {
3280 		KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_THREAD_GROUP, MACH_THREAD_GROUP_PREADOPT_NA),
3281 		    (uintptr_t)thread_tid(current_thread()), 0, 0, 0);
3282 		return;
3283 	}
3284 
3285 	struct kqworkloop *kqwl = kqu.kqwl;
3286 
3287 	assert(qos < THREAD_QOS_LAST);
3288 
	/*
	 * Take our own reference up front: it is either handed to the kqwl
	 * when the update below succeeds, or dropped again when it does not.
	 */
3289 	thread_group_retain(tg);
3290 
3291 	thread_group_qos_t old_tg; thread_group_qos_t new_tg;
3292 	int ret = os_atomic_rmw_loop(&kqwl->kqwl_preadopt_tg, old_tg, new_tg, relaxed, {
		/* Current value does not permit adoption: leave it alone. */
3293 		if (!KQWL_CAN_ADOPT_PREADOPT_TG(old_tg)) {
3294 		        os_atomic_rmw_loop_give_up(break);
3295 		}
3296 
3297 		if (old_tg != KQWL_PREADOPTED_TG_NULL) {
3298 		        /*
3299 		         * Note that old_tg could be a NULL TG pointer but with a QoS
3300 		         * set. See also workq_thread_reset_pri.
3301 		         *
3302 		         * Compare the QoS of existing preadopted tg with new one and
3303 		         * only overwrite the thread group if we have one with a higher
3304 		         * QoS.
3305 		         */
3306 		        thread_qos_t existing_qos = KQWL_GET_PREADOPTED_TG_QOS(old_tg);
3307 		        if (existing_qos >= qos) {
3308 		                os_atomic_rmw_loop_give_up(break);
3309 			}
3310 		}
3311 
3312 		// Transfer the ref taken earlier in the function to the kqwl
3313 		new_tg = KQWL_ENCODE_PREADOPTED_TG_QOS(tg, qos);
3314 	});
3315 
3316 	if (ret) {
3317 		KQWL_PREADOPT_TG_HISTORY_WRITE_ENTRY(kqwl, KQWL_PREADOPT_OP_INCOMING_IPC, old_tg, tg);
3318 
		/* The value we replaced held its own TG reference: release it. */
3319 		if (KQWL_HAS_VALID_PREADOPTED_TG(old_tg)) {
3320 			thread_group_deallocate_safe(KQWL_GET_PREADOPTED_TG(old_tg));
3321 		}
3322 
		/* Flag the workloop as needing a redrive (release-ordered store). */
3323 		os_atomic_store(&kqwl->kqwl_preadopt_tg_needs_redrive, KQWL_PREADOPT_TG_NEEDS_REDRIVE, release);
3324 	} else {
3325 		// We failed to write to the kqwl_preadopt_tg, drop the ref we took
3326 		// earlier in the function
3327 		thread_group_deallocate_safe(tg);
3328 	}
3329 }
3330 
3331 /*
3332  * Called from fprocess of EVFILT_MACHPORT without the kqueue lock held.
3333  */
3334 bool
kqueue_process_preadopt_thread_group(thread_t thread,struct kqueue * kq,struct thread_group * tg)3335 kqueue_process_preadopt_thread_group(thread_t thread, struct kqueue *kq, struct thread_group *tg)
3336 {
3337 	bool success = false;
3338 	if (kq->kq_state & KQ_WORKLOOP) {
3339 		struct kqworkloop *kqwl = (struct kqworkloop *) kq;
3340 		thread_group_qos_t old_tg;
3341 		success = os_atomic_cmpxchgv(&kqwl->kqwl_preadopt_tg,
3342 		    KQWL_PREADOPTED_TG_SENTINEL, KQWL_PREADOPTED_TG_PROCESSED,
3343 		    &old_tg, relaxed);
3344 		if (success) {
3345 			thread_set_preadopt_thread_group(thread, tg);
3346 		} else if (KQWL_HAS_PERMANENT_PREADOPTED_TG(old_tg)) {
3347 			/*
3348 			 * Technically the following set_preadopt should be a no-op since this
3349 			 * servicer thread preadopts kqwl's permanent tg at bind time.
3350 			 * See kqueue_threadreq_bind.
3351 			 */
3352 			thread_set_preadopt_thread_group(thread, KQWL_GET_PREADOPTED_TG(old_tg));
3353 		} else {
3354 			assert(old_tg == KQWL_PREADOPTED_TG_PROCESSED ||
3355 			    old_tg == KQWL_PREADOPTED_TG_NEVER);
3356 		}
3357 	}
3358 	return success;
3359 }
3360 #endif
3361 
3362 /*!
3363  * @function kqworkloop_dealloc
3364  *
3365  * @brief
3366  * Deallocates a workloop kqueue.
3367  *
3368  * @discussion
3369  * Knotes hold references on the workloop, so we can't really reach this
3370  * function unless all of these are already gone.
3371  *
3372  * Nothing locked on entry or exit.
3373  *
3374  * @param hash_remove
3375  * Whether to remove the workloop from its hash table.
3376  */
3377 static void
kqworkloop_dealloc(struct kqworkloop * kqwl,bool hash_remove)3378 kqworkloop_dealloc(struct kqworkloop *kqwl, bool hash_remove)
3379 {
3380 	thread_t cur_owner;
3381 
3382 	cur_owner = kqwl->kqwl_owner;
3383 	if (cur_owner) {
3384 		if (kqworkloop_override(kqwl) != THREAD_QOS_UNSPECIFIED) {
3385 			thread_drop_kevent_override(cur_owner);
3386 		}
3387 		thread_deallocate(cur_owner);
3388 		kqwl->kqwl_owner = THREAD_NULL;
3389 	}
3390 
3391 	if (kqwl->kqwl_state & KQ_HAS_TURNSTILE) {
3392 		struct turnstile *ts;
3393 		turnstile_complete((uintptr_t)kqwl, &kqwl->kqwl_turnstile,
3394 		    &ts, TURNSTILE_WORKLOOPS);
3395 		turnstile_cleanup();
3396 		turnstile_deallocate(ts);
3397 	}
3398 
3399 	if (hash_remove) {
3400 		struct filedesc *fdp = &kqwl->kqwl_p->p_fd;
3401 
3402 		kqhash_lock(fdp);
3403 		LIST_REMOVE(kqwl, kqwl_hashlink);
3404 #if CONFIG_PROC_RESOURCE_LIMITS
3405 		fdp->num_kqwls--;
3406 #endif
3407 		kqhash_unlock(fdp);
3408 	}
3409 
3410 #if CONFIG_PREADOPT_TG
3411 	thread_group_qos_t tg = os_atomic_load(&kqwl->kqwl_preadopt_tg, relaxed);
3412 	if (KQWL_HAS_VALID_PREADOPTED_TG(tg)) {
3413 		thread_group_release(KQWL_GET_PREADOPTED_TG(tg));
3414 	}
3415 #endif
3416 
3417 	assert(TAILQ_EMPTY(&kqwl->kqwl_suppressed));
3418 	assert(kqwl->kqwl_owner == THREAD_NULL);
3419 	assert(kqwl->kqwl_turnstile == TURNSTILE_NULL);
3420 
3421 	lck_spin_destroy(&kqwl->kqwl_statelock, &kq_lck_grp);
3422 	kqueue_destroy(kqwl, kqworkloop_zone);
3423 }
3424 
3425 /*!
3426  * @function kqworkloop_init
3427  *
3428  * @brief
3429  * Initializes an allocated kqworkloop.
3430  */
3431 static void
kqworkloop_init(struct kqworkloop * kqwl,proc_t p,kqueue_id_t id,workq_threadreq_param_t * trp,struct thread_group * trp_permanent_preadopt_tg)3432 kqworkloop_init(struct kqworkloop *kqwl, proc_t p,
3433     kqueue_id_t id, workq_threadreq_param_t *trp
3434 #if CONFIG_PREADOPT_TG
3435     , struct thread_group *trp_permanent_preadopt_tg
3436 #endif
3437     )
3438 {
3439 	kqwl->kqwl_state     = KQ_WORKLOOP | KQ_DYNAMIC | KQ_KEV_QOS;
3440 	os_ref_init_raw(&kqwl->kqwl_retains, NULL);
3441 	kqwl->kqwl_dynamicid = id;
3442 	kqwl->kqwl_p         = p;
3443 	if (trp) {
3444 		kqwl->kqwl_params = trp->trp_value;
3445 	}
3446 
3447 	workq_tr_flags_t tr_flags = WORKQ_TR_FLAG_WORKLOOP;
3448 	if (trp) {
3449 		if (trp->trp_flags & TRP_PRIORITY) {
3450 			tr_flags |= WORKQ_TR_FLAG_WL_OUTSIDE_QOS;
3451 		}
3452 		if (trp->trp_flags) {
3453 			tr_flags |= WORKQ_TR_FLAG_WL_PARAMS;
3454 		}
3455 	}
3456 	kqwl->kqwl_request.tr_state = WORKQ_TR_STATE_IDLE;
3457 	kqwl->kqwl_request.tr_flags = tr_flags;
3458 	os_atomic_store(&kqwl->kqwl_iotier_override, (uint8_t)THROTTLE_LEVEL_END, relaxed);
3459 #if CONFIG_PREADOPT_TG
3460 	if (trp_permanent_preadopt_tg) {
3461 		/*
3462 		 * This kqwl is permanently configured with a thread group.
3463 		 * By using THREAD_QOS_LAST, we make sure kqueue_set_preadopted_thread_group
3464 		 * has no effect on kqwl_preadopt_tg. At this point, +1 ref on
3465 		 * trp_permanent_preadopt_tg is transferred to the kqwl.
3466 		 */
3467 		thread_group_qos_t kqwl_preadopt_tg;
3468 		kqwl_preadopt_tg = KQWL_ENCODE_PERMANENT_PREADOPTED_TG(trp_permanent_preadopt_tg);
3469 		os_atomic_store(&kqwl->kqwl_preadopt_tg, kqwl_preadopt_tg, relaxed);
3470 	} else if (task_is_app(current_task())) {
3471 		/*
3472 		 * Not a specially preconfigured kqwl so it is open to participate in sync IPC
3473 		 * thread group preadoption; but, apps will never adopt a thread group that
3474 		 * is not their own. This is a gross hack to simulate the post-process that
3475 		 * is done in the voucher subsystem today for thread groups.
3476 		 */
3477 		os_atomic_store(&kqwl->kqwl_preadopt_tg, KQWL_PREADOPTED_TG_NEVER, relaxed);
3478 	}
3479 #endif
3480 
3481 	for (int i = 0; i < KQWL_NBUCKETS; i++) {
3482 		TAILQ_INIT_AFTER_BZERO(&kqwl->kqwl_queue[i]);
3483 	}
3484 	TAILQ_INIT_AFTER_BZERO(&kqwl->kqwl_suppressed);
3485 
3486 	lck_spin_init(&kqwl->kqwl_statelock, &kq_lck_grp, LCK_ATTR_NULL);
3487 
3488 	kqueue_init(kqwl);
3489 }
3490 
#if CONFIG_PROC_RESOURCE_LIMITS
/*
 * Compare the process's workloop count against its dynamic soft and hard
 * limits and send at most one notification per call.
 *
 * The soft limit is checked first; the hard limit is only considered when
 * the soft-limit branch was not taken. A limit that is <= 0 is treated as
 * disabled, and a limit that was already notified is not notified again.
 * Whenever a notification is sent, the resource AST is also set on the
 * current thread.
 */
void
kqworkloop_check_limit_exceeded(struct filedesc *fdp)
{
	int count = fdp->num_kqwls;

	if (!kqwl_above_soft_limit_notified(fdp) &&
	    fdp->kqwl_dyn_soft_limit > 0 &&
	    count > fdp->kqwl_dyn_soft_limit) {
		kqwl_above_soft_limit_send_notification(fdp);
	} else if (!kqwl_above_hard_limit_notified(fdp) &&
	    fdp->kqwl_dyn_hard_limit > 0 &&
	    count > fdp->kqwl_dyn_hard_limit) {
		kqwl_above_hard_limit_send_notification(fdp);
	} else {
		/* no limit newly exceeded */
		return;
	}

	act_set_astproc_resource(current_thread());
}
#endif
3507 
3508 /*!
3509  * @function kqworkloop_get_or_create
3510  *
3511  * @brief
3512  * Wrapper around kqworkloop_init that handles the uniquing of workloops.
3513  *
 * @discussion
 * The lookup, the optional creation and the hash insertion all happen under
 * the kqhash lock. A non-blocking allocation is attempted first with the
 * lock held; only if that fails is the lock dropped for a blocking
 * allocation, after which the lookup is retried from the top.
 *
 * *kqwlp is written only when 0 is returned: either an existing workloop
 * on which a reference was successfully taken, or a newly initialized one.
 *
3514  * @returns
3515  * 0:      success
3516  * EINVAL: invalid parameters
3517  * EEXIST: KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST is set and a collision exists.
3518  * ENOENT: KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST is set and the entry wasn't found.
3519  * ENOMEM: allocation failed
 *
 * NOTE(review): as written, no path in this function assigns ENOMEM; the
 * only error values produced here are EINVAL, EEXIST and ENOENT.
3520  */
3521 static int
kqworkloop_get_or_create(struct proc * p,kqueue_id_t id,workq_threadreq_param_t * trp,struct thread_group * trp_permanent_preadopt_tg,unsigned int flags,struct kqworkloop ** kqwlp)3522 kqworkloop_get_or_create(struct proc *p, kqueue_id_t id,
3523     workq_threadreq_param_t *trp,
3524 #if CONFIG_PREADOPT_TG
3525     struct thread_group *trp_permanent_preadopt_tg,
3526 #endif
3527     unsigned int flags, struct kqworkloop **kqwlp)
3528 {
3529 	struct filedesc *fdp = &p->p_fd;
3530 	struct kqworkloop *alloc_kqwl = NULL;
3531 	struct kqworkloop *kqwl = NULL;
3532 	int error = 0;
3533 
	/* Thread request parameters are only accepted when creating. */
3534 	assert(!trp || (flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST));
3535 
	/* 0 and all-ones are rejected as workloop ids. */
3536 	if (id == 0 || id == (kqueue_id_t)-1) {
3537 		return EINVAL;
3538 	}
3539 
	/*
	 * Every `break` below leaves the loop with the kqhash lock held; it is
	 * released right after the loop. The creation path returns directly
	 * after unlocking.
	 */
3540 	for (;;) {
3541 		kqhash_lock(fdp);
3542 		if (__improbable(fdp->fd_kqhash == NULL)) {
3543 			kqworkloop_hash_init(fdp);
3544 		}
3545 
3546 		kqwl = kqworkloop_hash_lookup_locked(fdp, id);
3547 		if (kqwl) {
3548 			if (__improbable(flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST)) {
3549 				/*
3550 				 * If MUST_NOT_EXIST was passed, even if we would have failed
3551 				 * the try_retain, it could have gone the other way, and
3552 				 * userspace can't tell. Let'em fix their race.
3553 				 */
3554 				error = EEXIST;
3555 				break;
3556 			}
3557 
3558 			if (__probable(kqworkloop_try_retain(kqwl))) {
3559 				/*
3560 				 * This is a valid live workloop !
3561 				 */
3562 				*kqwlp = kqwl;
3563 				error = 0;
3564 				break;
3565 			}
			/* try_retain failed: fall through as if nothing was found */
3566 		}
3567 
3568 		if (__improbable(flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST)) {
3569 			error = ENOENT;
3570 			break;
3571 		}
3572 
3573 		/*
3574 		 * We didn't find what we were looking for.
3575 		 *
3576 		 * If this is the second time we reach this point (alloc_kqwl != NULL),
3577 		 * then we're done.
3578 		 *
3579 		 * If this is the first time we reach this point (alloc_kqwl == NULL),
3580 		 * then try to allocate one without blocking.
3581 		 */
3582 		if (__probable(alloc_kqwl == NULL)) {
3583 			alloc_kqwl = zalloc_flags(kqworkloop_zone, Z_NOWAIT | Z_ZERO);
3584 		}
3585 		if (__probable(alloc_kqwl)) {
3586 #if CONFIG_PROC_RESOURCE_LIMITS
			/* count the new workloop and check limits under the hash lock */
3587 			fdp->num_kqwls++;
3588 			kqworkloop_check_limit_exceeded(fdp);
3589 #endif
3590 			kqworkloop_init(alloc_kqwl, p, id, trp
3591 #if CONFIG_PREADOPT_TG
3592 			    , trp_permanent_preadopt_tg
3593 #endif
3594 			    );
3595 			kqworkloop_hash_insert_locked(fdp, id, alloc_kqwl);
3596 			kqhash_unlock(fdp);
3597 			*kqwlp = alloc_kqwl;
3598 			return 0;
3599 		}
3600 
3601 		/*
3602 		 * We have to block to allocate a workloop, drop the lock,
3603 		 * allocate one, but then we need to retry lookups as someone
3604 		 * else could race with us.
3605 		 */
3606 		kqhash_unlock(fdp);
3607 
3608 		alloc_kqwl = zalloc_flags(kqworkloop_zone, Z_WAITOK | Z_ZERO);
3609 	}
3610 
3611 	kqhash_unlock(fdp);
3612 
	/*
	 * A workloop allocated by the blocking path but not inserted (an
	 * existing one was found, or an error was hit, on the retry) is freed.
	 */
3613 	if (__improbable(alloc_kqwl)) {
3614 		zfree(kqworkloop_zone, alloc_kqwl);
3615 	}
3616 
3617 	return error;
3618 }
3619 
3620 #pragma mark - knotes
3621 
3622 static int
filt_no_attach(struct knote * kn,__unused struct kevent_qos_s * kev)3623 filt_no_attach(struct knote *kn, __unused struct kevent_qos_s *kev)
3624 {
3625 	knote_set_error(kn, ENOTSUP);
3626 	return 0;
3627 }
3628 
3629 static void
filt_no_detach(__unused struct knote * kn)3630 filt_no_detach(__unused struct knote *kn)
3631 {
3632 }
3633 
3634 static int __dead2
filt_bad_event(struct knote * kn,long hint)3635 filt_bad_event(struct knote *kn, long hint)
3636 {
3637 	panic("%s[%d](%p, %ld)", __func__, kn->kn_filter, kn, hint);
3638 }
3639 
3640 static int __dead2
filt_bad_touch(struct knote * kn,struct kevent_qos_s * kev)3641 filt_bad_touch(struct knote *kn, struct kevent_qos_s *kev)
3642 {
3643 	panic("%s[%d](%p, %p)", __func__, kn->kn_filter, kn, kev);
3644 }
3645 
3646 static int __dead2
filt_bad_process(struct knote * kn,struct kevent_qos_s * kev)3647 filt_bad_process(struct knote *kn, struct kevent_qos_s *kev)
3648 {
3649 	panic("%s[%d](%p, %p)", __func__, kn->kn_filter, kn, kev);
3650 }
3651 
3652 /*
3653  * knotes_dealloc - detach all knotes for the process and drop them
3654  *
3655  *		Process is in such a state that it will not try to allocate
3656  *		any more knotes during this process (stopped for exit or exec).
3657  */
3658 void
knotes_dealloc(proc_t p)3659 knotes_dealloc(proc_t p)
3660 {
3661 	struct filedesc *fdp = &p->p_fd;
3662 	struct kqueue *kq;
3663 	struct knote *kn;
3664 	struct  klist *kn_hash = NULL;
3665 	u_long kn_hashmask;
3666 	int i;
3667 
3668 	proc_fdlock(p);
3669 
3670 	/* Close all the fd-indexed knotes up front */
3671 	if (fdp->fd_knlistsize > 0) {
3672 		for (i = 0; i < fdp->fd_knlistsize; i++) {
3673 			while ((kn = SLIST_FIRST(&fdp->fd_knlist[i])) != NULL) {
3674 				kq = knote_get_kq(kn);
3675 				kqlock(kq);
3676 				proc_fdunlock(p);
3677 				knote_drop(kq, kn, NULL);
3678 				proc_fdlock(p);
3679 			}
3680 		}
3681 		/* free the table */
3682 		kfree_type(struct klist, fdp->fd_knlistsize, fdp->fd_knlist);
3683 	}
3684 	fdp->fd_knlistsize = 0;
3685 
3686 	proc_fdunlock(p);
3687 
3688 	knhash_lock(fdp);
3689 
3690 	/* Clean out all the hashed knotes as well */
3691 	if (fdp->fd_knhashmask != 0) {
3692 		for (i = 0; i <= (int)fdp->fd_knhashmask; i++) {
3693 			while ((kn = SLIST_FIRST(&fdp->fd_knhash[i])) != NULL) {
3694 				kq = knote_get_kq(kn);
3695 				kqlock(kq);
3696 				knhash_unlock(fdp);
3697 				knote_drop(kq, kn, NULL);
3698 				knhash_lock(fdp);
3699 			}
3700 		}
3701 		kn_hash = fdp->fd_knhash;
3702 		kn_hashmask = fdp->fd_knhashmask;
3703 		fdp->fd_knhashmask = 0;
3704 		fdp->fd_knhash = NULL;
3705 	}
3706 
3707 	knhash_unlock(fdp);
3708 
3709 	if (kn_hash) {
3710 		hashdestroy(kn_hash, M_KQUEUE, kn_hashmask);
3711 	}
3712 }
3713 
3714 /*
3715  * kqworkloops_dealloc - rebalance retains on kqworkloops created with
3716  * scheduling parameters
3717  *
3718  * Process is in such a state that it will not try to allocate
3719  * any more kqs or knotes during this process (stopped for exit or exec).
3720  */
3721 void
kqworkloops_dealloc(proc_t p)3722 kqworkloops_dealloc(proc_t p)
3723 {
3724 	struct filedesc *fdp = &p->p_fd;
3725 	struct kqworkloop *kqwl, *kqwln;
3726 	struct kqwllist tofree;
3727 
3728 	if (!fdt_flag_test(fdp, FD_WORKLOOP)) {
3729 		return;
3730 	}
3731 
3732 	kqhash_lock(fdp);
3733 
3734 	if (fdp->fd_kqhashmask == 0) {
3735 		kqhash_unlock(fdp);
3736 		return;
3737 	}
3738 
3739 	LIST_INIT(&tofree);
3740 
3741 	for (size_t i = 0; i <= fdp->fd_kqhashmask; i++) {
3742 		LIST_FOREACH_SAFE(kqwl, &fdp->fd_kqhash[i], kqwl_hashlink, kqwln) {
3743 #if CONFIG_PREADOPT_TG
3744 			/*
3745 			 * kqworkloops that have scheduling parameters have an
3746 			 * implicit retain from kqueue_workloop_ctl that needs
3747 			 * to be balanced on process exit.
3748 			 */
3749 			__assert_only thread_group_qos_t preadopt_tg;
3750 			preadopt_tg = os_atomic_load(&kqwl->kqwl_preadopt_tg, relaxed);
3751 #endif
3752 			assert(kqwl->kqwl_params
3753 #if CONFIG_PREADOPT_TG
3754 			    || KQWL_HAS_PERMANENT_PREADOPTED_TG(preadopt_tg)
3755 #endif
3756 			    );
3757 
3758 			LIST_REMOVE(kqwl, kqwl_hashlink);
3759 			LIST_INSERT_HEAD(&tofree, kqwl, kqwl_hashlink);
3760 		}
3761 	}
3762 #if CONFIG_PROC_RESOURCE_LIMITS
3763 	fdp->num_kqwls = 0;
3764 #endif
3765 	kqhash_unlock(fdp);
3766 
3767 	LIST_FOREACH_SAFE(kqwl, &tofree, kqwl_hashlink, kqwln) {
3768 		uint32_t ref = os_ref_get_count_raw(&kqwl->kqwl_retains);
3769 		if (ref != 1) {
3770 			panic("kq(%p) invalid refcount %d", kqwl, ref);
3771 		}
3772 		kqworkloop_dealloc(kqwl, false);
3773 	}
3774 }
3775 
3776 static int
kevent_register_validate_priority(struct kqueue * kq,struct knote * kn,struct kevent_qos_s * kev)3777 kevent_register_validate_priority(struct kqueue *kq, struct knote *kn,
3778     struct kevent_qos_s *kev)
3779 {
3780 	/* We don't care about the priority of a disabled or deleted knote */
3781 	if (kev->flags & (EV_DISABLE | EV_DELETE)) {
3782 		return 0;
3783 	}
3784 
3785 	if (kq->kq_state & KQ_WORKLOOP) {
3786 		/*
3787 		 * Workloops need valid priorities with a QOS (excluding manager) for
3788 		 * any enabled knote.
3789 		 *
3790 		 * When it is pre-existing, just make sure it has a valid QoS as
3791 		 * kevent_register() will not use the incoming priority (filters who do
3792 		 * have the responsibility to validate it again, see filt_wltouch).
3793 		 *
3794 		 * If the knote is being made, validate the incoming priority.
3795 		 */
3796 		if (!_pthread_priority_thread_qos(kn ? kn->kn_qos : kev->qos)) {
3797 			return ERANGE;
3798 		}
3799 	}
3800 
3801 	return 0;
3802 }
3803 
3804 /*
3805  * Prepare a filter for waiting after register.
3806  *
3807  * The f_post_register_wait hook will be called later by kevent_register()
3808  * and should call kevent_register_wait_block()
3809  */
3810 static int
kevent_register_wait_prepare(struct knote * kn,struct kevent_qos_s * kev,int rc)3811 kevent_register_wait_prepare(struct knote *kn, struct kevent_qos_s *kev, int rc)
3812 {
3813 	thread_t thread = current_thread();
3814 
3815 	assert(knote_fops(kn)->f_extended_codes);
3816 
3817 	if (kn->kn_thread == NULL) {
3818 		thread_reference(thread);
3819 		kn->kn_thread = thread;
3820 	} else if (kn->kn_thread != thread) {
3821 		/*
3822 		 * kn_thread may be set from a previous aborted wait
3823 		 * However, it has to be from the same thread.
3824 		 */
3825 		kev->flags |= EV_ERROR;
3826 		kev->data = EXDEV;
3827 		return 0;
3828 	}
3829 
3830 	return FILTER_REGISTER_WAIT | rc;
3831 }
3832 
3833 /*
3834  * Cleanup a kevent_register_wait_prepare() effect for threads that have been
3835  * aborted instead of properly woken up with thread_wakeup_thread().
3836  */
3837 static void
kevent_register_wait_cleanup(struct knote * kn)3838 kevent_register_wait_cleanup(struct knote *kn)
3839 {
3840 	thread_t thread = kn->kn_thread;
3841 	kn->kn_thread = NULL;
3842 	thread_deallocate(thread);
3843 }
3844 
3845 /*
3846  * Must be called at the end of a f_post_register_wait call from a filter.
3847  */
3848 static void
kevent_register_wait_block(struct turnstile * ts,thread_t thread,thread_continue_t cont,struct _kevent_register * cont_args)3849 kevent_register_wait_block(struct turnstile *ts, thread_t thread,
3850     thread_continue_t cont, struct _kevent_register *cont_args)
3851 {
3852 	turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
3853 	kqunlock(cont_args->kqwl);
3854 	cont_args->handoff_thread = thread;
3855 	thread_handoff_parameter(thread, cont, cont_args, THREAD_HANDOFF_NONE);
3856 }
3857 
3858 /*
3859  * Called by Filters using a f_post_register_wait to return from their wait.
3860  */
3861 static void
kevent_register_wait_return(struct _kevent_register * cont_args)3862 kevent_register_wait_return(struct _kevent_register *cont_args)
3863 {
3864 	struct kqworkloop *kqwl = cont_args->kqwl;
3865 	struct kevent_qos_s *kev = &cont_args->kev;
3866 	int error = 0;
3867 
3868 	if (cont_args->handoff_thread) {
3869 		thread_deallocate(cont_args->handoff_thread);
3870 	}
3871 
3872 	if (kev->flags & (EV_ERROR | EV_RECEIPT)) {
3873 		if ((kev->flags & EV_ERROR) == 0) {
3874 			kev->flags |= EV_ERROR;
3875 			kev->data = 0;
3876 		}
3877 		error = kevent_modern_copyout(kev, &cont_args->ueventlist);
3878 		if (error == 0) {
3879 			cont_args->eventout++;
3880 		}
3881 	}
3882 
3883 	kqworkloop_release(kqwl);
3884 	if (error == 0) {
3885 		*(int32_t *)&current_uthread()->uu_rval = cont_args->eventout;
3886 	}
3887 	unix_syscall_return(error);
3888 }
3889 
3890 /*
3891  * kevent_register - add a new event to a kqueue
3892  *
3893  *	Creates a mapping between the event source and
3894  *	the kqueue via a knote data structure.
3895  *
3896  *	Because many/most the event sources are file
3897  *	descriptor related, the knote is linked off
3898  *	the filedescriptor table for quick access.
3899  *
3900  *	called with nothing locked
3901  *	caller holds a reference on the kqueue
3902  */
3903 
3904 int
kevent_register(struct kqueue * kq,struct kevent_qos_s * kev,struct knote ** kn_out)3905 kevent_register(struct kqueue *kq, struct kevent_qos_s *kev,
3906     struct knote **kn_out)
3907 {
3908 	struct proc *p = kq->kq_p;
3909 	const struct filterops *fops;
3910 	struct knote *kn = NULL;
3911 	int result = 0, error = 0;
3912 	unsigned short kev_flags = kev->flags;
3913 	KNOTE_LOCK_CTX(knlc);
3914 
3915 	if (__probable(kev->filter < 0 && kev->filter + EVFILT_SYSCOUNT >= 0)) {
3916 		fops = sysfilt_ops[~kev->filter];       /* to 0-base index */
3917 	} else {
3918 		error = EINVAL;
3919 		goto out;
3920 	}
3921 
3922 	/* restrict EV_VANISHED to adding udata-specific dispatch kevents */
3923 	if (__improbable((kev->flags & EV_VANISHED) &&
3924 	    (kev->flags & (EV_ADD | EV_DISPATCH2)) != (EV_ADD | EV_DISPATCH2))) {
3925 		error = EINVAL;
3926 		goto out;
3927 	}
3928 
3929 	/* Simplify the flags - delete and disable overrule */
3930 	if (kev->flags & EV_DELETE) {
3931 		kev->flags &= ~EV_ADD;
3932 	}
3933 	if (kev->flags & EV_DISABLE) {
3934 		kev->flags &= ~EV_ENABLE;
3935 	}
3936 
3937 	if (kq->kq_state & KQ_WORKLOOP) {
3938 		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_REGISTER),
3939 		    ((struct kqworkloop *)kq)->kqwl_dynamicid,
3940 		    kev->udata, kev->flags, kev->filter);
3941 	} else if (kq->kq_state & KQ_WORKQ) {
3942 		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWQ_REGISTER),
3943 		    0, kev->udata, kev->flags, kev->filter);
3944 	} else {
3945 		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQ_REGISTER),
3946 		    VM_KERNEL_UNSLIDE_OR_PERM(kq),
3947 		    kev->udata, kev->flags, kev->filter);
3948 	}
3949 
3950 restart:
3951 	/* find the matching knote from the fd tables/hashes */
3952 	kn = kq_find_knote_and_kq_lock(kq, kev, fops->f_isfd, p);
3953 	error = kevent_register_validate_priority(kq, kn, kev);
3954 	result = 0;
3955 	if (error) {
3956 		if (kn) {
3957 			kqunlock(kq);
3958 		}
3959 		goto out;
3960 	}
3961 
3962 	if (kn == NULL && (kev->flags & EV_ADD) == 0) {
3963 		/*
3964 		 * No knote found, EV_ADD wasn't specified
3965 		 */
3966 
3967 		if ((kev_flags & EV_ADD) && (kev_flags & EV_DELETE) &&
3968 		    (kq->kq_state & KQ_WORKLOOP)) {
3969 			/*
3970 			 * For workloops, understand EV_ADD|EV_DELETE as a "soft" delete
3971 			 * that doesn't care about ENOENT, so just pretend the deletion
3972 			 * happened.
3973 			 */
3974 		} else {
3975 			error = ENOENT;
3976 		}
3977 		goto out;
3978 	} else if (kn == NULL) {
3979 		/*
3980 		 * No knote found, need to attach a new one (attach)
3981 		 */
3982 
3983 		struct fileproc *knote_fp = NULL;
3984 
3985 		/* grab a file reference for the new knote */
3986 		if (fops->f_isfd) {
3987 			if ((error = fp_lookup(p, (int)kev->ident, &knote_fp, 0)) != 0) {
3988 				goto out;
3989 			}
3990 		}
3991 
3992 		kn = knote_alloc();
3993 		kn->kn_fp = knote_fp;
3994 		kn->kn_is_fd = fops->f_isfd;
3995 		kn->kn_kq_packed = VM_PACK_POINTER((vm_offset_t)kq, KNOTE_KQ_PACKED);
3996 		kn->kn_status = 0;
3997 
3998 		/* was vanish support requested */
3999 		if (kev->flags & EV_VANISHED) {
4000 			kev->flags &= ~EV_VANISHED;
4001 			kn->kn_status |= KN_REQVANISH;
4002 		}
4003 
4004 		/* snapshot matching/dispatching protocol flags into knote */
4005 		if (kev->flags & EV_DISABLE) {
4006 			kn->kn_status |= KN_DISABLED;
4007 		}
4008 
4009 		/*
4010 		 * copy the kevent state into knote
4011 		 * protocol is that fflags and data
4012 		 * are saved off, and cleared before
4013 		 * calling the attach routine.
4014 		 *
4015 		 * - kn->kn_sfflags aliases with kev->xflags
4016 		 * - kn->kn_sdata   aliases with kev->data
4017 		 * - kn->kn_filter  is the top 8 bits of kev->filter
4018 		 */
4019 		kn->kn_kevent  = *(struct kevent_internal_s *)kev;
4020 		kn->kn_sfflags = kev->fflags;
4021 		kn->kn_filtid  = (uint8_t)~kev->filter;
4022 		kn->kn_fflags  = 0;
4023 		knote_reset_priority(kq, kn, kev->qos);
4024 
4025 		/* Add the knote for lookup thru the fd table */
4026 		error = kq_add_knote(kq, kn, &knlc, p);
4027 		if (error) {
4028 			knote_free(kn);
4029 			if (knote_fp != NULL) {
4030 				fp_drop(p, (int)kev->ident, knote_fp, 0);
4031 			}
4032 
4033 			if (error == ERESTART) {
4034 				goto restart;
4035 			}
4036 			goto out;
4037 		}
4038 
4039 		/* fp reference count now applies to knote */
4040 
4041 		/*
4042 		 * we can't use filter_call() because f_attach can change the filter ops
4043 		 * for a filter that supports f_extended_codes, so we need to reload
4044 		 * knote_fops() and not use `fops`.
4045 		 */
4046 		result = fops->f_attach(kn, kev);
4047 		if (result && !knote_fops(kn)->f_extended_codes) {
4048 			result = FILTER_ACTIVE;
4049 		}
4050 
4051 		kqlock(kq);
4052 
4053 		if (result & FILTER_THREADREQ_NODEFEER) {
4054 			enable_preemption();
4055 		}
4056 
4057 		if (kn->kn_flags & EV_ERROR) {
4058 			/*
4059 			 * Failed to attach correctly, so drop.
4060 			 */
4061 			kn->kn_filtid = EVFILTID_DETACHED;
4062 			error = (int)kn->kn_sdata;
4063 			knote_drop(kq, kn, &knlc);
4064 			result = 0;
4065 			goto out;
4066 		}
4067 
4068 		/*
4069 		 * end "attaching" phase - now just attached
4070 		 *
4071 		 * Mark the thread request overcommit, if appropos
4072 		 *
4073 		 * If the attach routine indicated that an
4074 		 * event is already fired, activate the knote.
4075 		 */
4076 		if ((kn->kn_qos & _PTHREAD_PRIORITY_OVERCOMMIT_FLAG) &&
4077 		    (kq->kq_state & KQ_WORKLOOP)) {
4078 			kqworkloop_set_overcommit((struct kqworkloop *)kq);
4079 		}
4080 	} else if (!knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ON_SUCCESS)) {
4081 		/*
4082 		 * The knote was dropped while we were waiting for the lock,
4083 		 * we need to re-evaluate entirely
4084 		 */
4085 
4086 		goto restart;
4087 	} else if (kev->flags & EV_DELETE) {
4088 		/*
4089 		 * Deletion of a knote (drop)
4090 		 *
4091 		 * If the filter wants to filter drop events, let it do so.
4092 		 *
4093 		 * defer-delete: when trying to delete a disabled EV_DISPATCH2 knote,
4094 		 * we must wait for the knote to be re-enabled (unless it is being
4095 		 * re-enabled atomically here).
4096 		 */
4097 
4098 		if (knote_fops(kn)->f_allow_drop) {
4099 			bool drop;
4100 
4101 			kqunlock(kq);
4102 			drop = knote_fops(kn)->f_allow_drop(kn, kev);
4103 			kqlock(kq);
4104 
4105 			if (!drop) {
4106 				goto out_unlock;
4107 			}
4108 		}
4109 
4110 		if ((kev->flags & EV_ENABLE) == 0 &&
4111 		    (kn->kn_flags & EV_DISPATCH2) == EV_DISPATCH2 &&
4112 		    (kn->kn_status & KN_DISABLED) != 0) {
4113 			kn->kn_status |= KN_DEFERDELETE;
4114 			error = EINPROGRESS;
4115 			goto out_unlock;
4116 		}
4117 
4118 		knote_drop(kq, kn, &knlc);
4119 		goto out;
4120 	} else {
4121 		/*
4122 		 * Regular update of a knote (touch)
4123 		 *
4124 		 * Call touch routine to notify filter of changes in filter values
4125 		 * (and to re-determine if any events are fired).
4126 		 *
4127 		 * If the knote is in defer-delete, avoid calling the filter touch
4128 		 * routine (it has delivered its last event already).
4129 		 *
4130 		 * If the touch routine had no failure,
4131 		 * apply the requested side effects to the knote.
4132 		 */
4133 
4134 		if (kn->kn_status & (KN_DEFERDELETE | KN_VANISHED)) {
4135 			if (kev->flags & EV_ENABLE) {
4136 				result = FILTER_ACTIVE;
4137 			}
4138 		} else {
4139 			kqunlock(kq);
4140 			result = filter_call(knote_fops(kn), f_touch(kn, kev));
4141 			kqlock(kq);
4142 			if (result & FILTER_THREADREQ_NODEFEER) {
4143 				enable_preemption();
4144 			}
4145 		}
4146 
4147 		if (kev->flags & EV_ERROR) {
4148 			result = 0;
4149 			goto out_unlock;
4150 		}
4151 
4152 		if ((kn->kn_flags & EV_UDATA_SPECIFIC) == 0 &&
4153 		    kn->kn_udata != kev->udata) {
4154 			// this allows klist_copy_udata() not to take locks
4155 			os_atomic_store_wide(&kn->kn_udata, kev->udata, relaxed);
4156 		}
4157 		if ((kev->flags & EV_DISABLE) && !(kn->kn_status & KN_DISABLED)) {
4158 			kn->kn_status |= KN_DISABLED;
4159 			knote_dequeue(kq, kn);
4160 		}
4161 	}
4162 
4163 	/* accept new kevent state */
4164 	knote_apply_touch(kq, kn, kev, result);
4165 
4166 out_unlock:
4167 	/*
4168 	 * When the filter asked for a post-register wait,
4169 	 * we leave the kqueue locked for kevent_register()
4170 	 * to call the filter's f_post_register_wait hook.
4171 	 */
4172 	if (result & FILTER_REGISTER_WAIT) {
4173 		knote_unlock(kq, kn, &knlc, KNOTE_KQ_LOCK_ALWAYS);
4174 		*kn_out = kn;
4175 	} else {
4176 		knote_unlock(kq, kn, &knlc, KNOTE_KQ_UNLOCK);
4177 	}
4178 
4179 out:
4180 	/* output local errors through the kevent */
4181 	if (error) {
4182 		kev->flags |= EV_ERROR;
4183 		kev->data = error;
4184 	}
4185 	return result;
4186 }
4187 
4188 /*
4189  * knote_process - process a triggered event
4190  *
4191  *	Validate that it is really still a triggered event
4192  *	by calling the filter routines (if necessary).  Hold
4193  *	a use reference on the knote to avoid it being detached.
4194  *
4195  *	If it is still considered triggered, we will have taken
4196  *	a copy of the state under the filter lock.  We use that
4197  *	snapshot to dispatch the knote for future processing (or
4198  *	not, if this was a lost event).
4199  *
4200  *	Our caller assures us that nobody else can be processing
4201  *	events from this knote during the whole operation. But
4202  *	others can be touching or posting events to the knote
4203  *	interspersed with our processing it.
4204  *
4205  *	caller holds a reference on the kqueue.
4206  *	kqueue locked on entry and exit - but may be dropped
4207  */
4208 static int
knote_process(struct knote * kn,kevent_ctx_t kectx,kevent_callback_t callback)4209 knote_process(struct knote *kn, kevent_ctx_t kectx,
4210     kevent_callback_t callback)
4211 {
4212 	struct kevent_qos_s kev;
4213 	struct kqueue *kq = knote_get_kq(kn);
4214 	KNOTE_LOCK_CTX(knlc);
4215 	int result = FILTER_ACTIVE;
4216 	int error = 0;
4217 	bool drop = false;
4218 
4219 	/*
4220 	 * Must be active
4221 	 * Must be queued and not disabled/suppressed or dropping
4222 	 */
4223 	assert(kn->kn_status & KN_QUEUED);
4224 	assert(kn->kn_status & KN_ACTIVE);
4225 	assert(!(kn->kn_status & (KN_DISABLED | KN_SUPPRESSED | KN_DROPPING)));
4226 
4227 	if (kq->kq_state & KQ_WORKLOOP) {
4228 		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS),
4229 		    ((struct kqworkloop *)kq)->kqwl_dynamicid,
4230 		    kn->kn_udata, kn->kn_status | (kn->kn_id << 32),
4231 		    kn->kn_filtid);
4232 	} else if (kq->kq_state & KQ_WORKQ) {
4233 		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWQ_PROCESS),
4234 		    0, kn->kn_udata, kn->kn_status | (kn->kn_id << 32),
4235 		    kn->kn_filtid);
4236 	} else {
4237 		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQ_PROCESS),
4238 		    VM_KERNEL_UNSLIDE_OR_PERM(kq), kn->kn_udata,
4239 		    kn->kn_status | (kn->kn_id << 32), kn->kn_filtid);
4240 	}
4241 
4242 	if (!knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ALWAYS)) {
4243 		/*
4244 		 * When the knote is dropping or has dropped,
4245 		 * then there's nothing we want to process.
4246 		 */
4247 		return EJUSTRETURN;
4248 	}
4249 
4250 	/*
4251 	 * While waiting for the knote lock, we may have dropped the kq lock.
4252 	 * and a touch may have disabled and dequeued the knote.
4253 	 */
4254 	if (!(kn->kn_status & KN_QUEUED)) {
4255 		knote_unlock(kq, kn, &knlc, KNOTE_KQ_LOCK_ALWAYS);
4256 		return EJUSTRETURN;
4257 	}
4258 
4259 	/*
4260 	 * For deferred-drop or vanished events, we just create a fake
4261 	 * event to acknowledge end-of-life.  Otherwise, we call the
4262 	 * filter's process routine to snapshot the kevent state under
4263 	 * the filter's locking protocol.
4264 	 *
4265 	 * suppress knotes to avoid returning the same event multiple times in
4266 	 * a single call.
4267 	 */
4268 	knote_suppress(kq, kn);
4269 
4270 	if (kn->kn_status & (KN_DEFERDELETE | KN_VANISHED)) {
4271 		uint16_t kev_flags = EV_DISPATCH2 | EV_ONESHOT;
4272 		if (kn->kn_status & KN_DEFERDELETE) {
4273 			kev_flags |= EV_DELETE;
4274 		} else {
4275 			kev_flags |= EV_VANISHED;
4276 		}
4277 
4278 		/* create fake event */
4279 		kev = (struct kevent_qos_s){
4280 			.filter = kn->kn_filter,
4281 			.ident  = kn->kn_id,
4282 			.flags  = kev_flags,
4283 			.udata  = kn->kn_udata,
4284 		};
4285 	} else {
4286 		kqunlock(kq);
4287 		kev = (struct kevent_qos_s) { };
4288 		result = filter_call(knote_fops(kn), f_process(kn, &kev));
4289 		kqlock(kq);
4290 	}
4291 
4292 	/*
4293 	 * Determine how to dispatch the knote for future event handling.
4294 	 * not-fired: just return (do not callout, leave deactivated).
4295 	 * One-shot:  If dispatch2, enter deferred-delete mode (unless this is
4296 	 *            is the deferred delete event delivery itself).  Otherwise,
4297 	 *            drop it.
4298 	 * Dispatch:  don't clear state, just mark it disabled.
4299 	 * Cleared:   just leave it deactivated.
4300 	 * Others:    re-activate as there may be more events to handle.
4301 	 *            This will not wake up more handlers right now, but
4302 	 *            at the completion of handling events it may trigger
4303 	 *            more handler threads (TODO: optimize based on more than
4304 	 *            just this one event being detected by the filter).
4305 	 */
4306 	if ((result & FILTER_ACTIVE) == 0) {
4307 		if ((kn->kn_status & KN_ACTIVE) == 0) {
4308 			/*
4309 			 * Some knotes (like EVFILT_WORKLOOP) can be reactivated from
4310 			 * within f_process() but that doesn't necessarily make them
4311 			 * ready to process, so we should leave them be.
4312 			 *
4313 			 * For other knotes, since we will not return an event,
4314 			 * there's no point keeping the knote suppressed.
4315 			 */
4316 			knote_unsuppress(kq, kn);
4317 		}
4318 		knote_unlock(kq, kn, &knlc, KNOTE_KQ_LOCK_ALWAYS);
4319 		return EJUSTRETURN;
4320 	}
4321 
4322 	if (result & FILTER_ADJUST_EVENT_QOS_BIT) {
4323 		knote_adjust_qos(kq, kn, result);
4324 	}
4325 
4326 	if (result & FILTER_ADJUST_EVENT_IOTIER_BIT) {
4327 		kqueue_update_iotier_override(kq);
4328 	}
4329 
4330 	kev.qos = _pthread_priority_combine(kn->kn_qos, kn->kn_qos_override);
4331 
4332 	if (kev.flags & EV_ONESHOT) {
4333 		if ((kn->kn_flags & EV_DISPATCH2) == EV_DISPATCH2 &&
4334 		    (kn->kn_status & KN_DEFERDELETE) == 0) {
4335 			/* defer dropping non-delete oneshot dispatch2 events */
4336 			kn->kn_status |= KN_DEFERDELETE | KN_DISABLED;
4337 		} else {
4338 			drop = true;
4339 		}
4340 	} else if (kn->kn_flags & EV_DISPATCH) {
4341 		/* disable all dispatch knotes */
4342 		kn->kn_status |= KN_DISABLED;
4343 	} else if ((kn->kn_flags & EV_CLEAR) == 0) {
4344 		/* re-activate in case there are more events */
4345 		knote_activate(kq, kn, FILTER_ACTIVE);
4346 	}
4347 
4348 	/*
4349 	 * callback to handle each event as we find it.
4350 	 * If we have to detach and drop the knote, do
4351 	 * it while we have the kq unlocked.
4352 	 */
4353 	if (drop) {
4354 		knote_drop(kq, kn, &knlc);
4355 	} else {
4356 		knote_unlock(kq, kn, &knlc, KNOTE_KQ_UNLOCK);
4357 	}
4358 
4359 	if (kev.flags & EV_VANISHED) {
4360 		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KNOTE_VANISHED),
4361 		    kev.ident, kn->kn_udata, kn->kn_status | (kn->kn_id << 32),
4362 		    kn->kn_filtid);
4363 	}
4364 
4365 	error = (callback)(&kev, kectx);
4366 	kqlock(kq);
4367 	return error;
4368 }
4369 
4370 /*
4371  * Returns -1 if the kqueue was unbound and processing should not happen
4372  */
4373 #define KQWQAE_BEGIN_PROCESSING 1
4374 #define KQWQAE_END_PROCESSING   2
4375 #define KQWQAE_UNBIND           3
4376 static int
kqworkq_acknowledge_events(struct kqworkq * kqwq,workq_threadreq_t kqr,int kevent_flags,int kqwqae_op)4377 kqworkq_acknowledge_events(struct kqworkq *kqwq, workq_threadreq_t kqr,
4378     int kevent_flags, int kqwqae_op)
4379 {
4380 	struct knote *kn;
4381 	int rc = 0;
4382 	bool unbind;
4383 	struct kqtailq *suppressq = &kqwq->kqwq_suppressed[kqr->tr_kq_qos_index - 1];
4384 	struct kqtailq *queue = &kqwq->kqwq_queue[kqr->tr_kq_qos_index - 1];
4385 
4386 	kqlock_held(&kqwq->kqwq_kqueue);
4387 
4388 	/*
4389 	 * Return suppressed knotes to their original state.
4390 	 * For workq kqueues, suppressed ones that are still
4391 	 * truly active (not just forced into the queue) will
4392 	 * set flags we check below to see if anything got
4393 	 * woken up.
4394 	 */
4395 	while ((kn = TAILQ_FIRST(suppressq)) != NULL) {
4396 		knote_unsuppress(kqwq, kn);
4397 	}
4398 
4399 	if (kqwqae_op == KQWQAE_UNBIND) {
4400 		unbind = true;
4401 	} else if ((kevent_flags & KEVENT_FLAG_PARKING) == 0) {
4402 		unbind = false;
4403 	} else {
4404 		unbind = TAILQ_EMPTY(queue);
4405 	}
4406 	if (unbind) {
4407 		thread_t thread = kqr_thread_fast(kqr);
4408 		thread_qos_t old_override;
4409 
4410 #if DEBUG || DEVELOPMENT
4411 		thread_t self = current_thread();
4412 		struct uthread *ut = get_bsdthread_info(self);
4413 
4414 		assert(thread == self);
4415 		assert(ut->uu_kqr_bound == kqr);
4416 #endif // DEBUG || DEVELOPMENT
4417 
4418 		old_override = kqworkq_unbind_locked(kqwq, kqr, thread);
4419 		if (!TAILQ_EMPTY(queue)) {
4420 			/*
4421 			 * Request a new thread if we didn't process the whole
4422 			 * queue.
4423 			 */
4424 			kqueue_threadreq_initiate(&kqwq->kqwq_kqueue, kqr,
4425 			    kqr->tr_kq_qos_index, 0);
4426 		}
4427 		if (old_override) {
4428 			thread_drop_kevent_override(thread);
4429 		}
4430 		rc = -1;
4431 	}
4432 
4433 	return rc;
4434 }
4435 
4436 /*
4437  * Return 0 to indicate that processing should proceed,
4438  * -1 if there is nothing to process.
4439  *
4440  * Called with kqueue locked and returns the same way,
4441  * but may drop lock temporarily.
4442  */
4443 static int
kqworkq_begin_processing(struct kqworkq * kqwq,workq_threadreq_t kqr,int kevent_flags)4444 kqworkq_begin_processing(struct kqworkq *kqwq, workq_threadreq_t kqr,
4445     int kevent_flags)
4446 {
4447 	int rc = 0;
4448 
4449 	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWQ_PROCESS_BEGIN) | DBG_FUNC_START,
4450 	    0, kqr->tr_kq_qos_index);
4451 
4452 	rc = kqworkq_acknowledge_events(kqwq, kqr, kevent_flags,
4453 	    KQWQAE_BEGIN_PROCESSING);
4454 
4455 	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWQ_PROCESS_BEGIN) | DBG_FUNC_END,
4456 	    thread_tid(kqr_thread(kqr)),
4457 	    !TAILQ_EMPTY(&kqwq->kqwq_queue[kqr->tr_kq_qos_index - 1]));
4458 
4459 	return rc;
4460 }
4461 
4462 static thread_qos_t
kqworkloop_acknowledge_events(struct kqworkloop * kqwl)4463 kqworkloop_acknowledge_events(struct kqworkloop *kqwl)
4464 {
4465 	kq_index_t qos = THREAD_QOS_UNSPECIFIED;
4466 	struct knote *kn, *tmp;
4467 
4468 	kqlock_held(kqwl);
4469 
4470 	TAILQ_FOREACH_SAFE(kn, &kqwl->kqwl_suppressed, kn_tqe, tmp) {
4471 		/*
4472 		 * If a knote that can adjust QoS is disabled because of the automatic
4473 		 * behavior of EV_DISPATCH, the knotes should stay suppressed so that
4474 		 * further overrides keep pushing.
4475 		 */
4476 		if (knote_fops(kn)->f_adjusts_qos &&
4477 		    (kn->kn_status & KN_DISABLED) != 0 &&
4478 		    (kn->kn_status & KN_DROPPING) == 0 &&
4479 		    (kn->kn_flags & (EV_DISPATCH | EV_DISABLE)) == EV_DISPATCH) {
4480 			qos = MAX(qos, kn->kn_qos_override);
4481 			continue;
4482 		}
4483 		knote_unsuppress(kqwl, kn);
4484 	}
4485 
4486 	return qos;
4487 }
4488 
4489 static int
kqworkloop_begin_processing(struct kqworkloop * kqwl,unsigned int kevent_flags)4490 kqworkloop_begin_processing(struct kqworkloop *kqwl, unsigned int kevent_flags)
4491 {
4492 	workq_threadreq_t kqr = &kqwl->kqwl_request;
4493 	struct kqueue *kq = &kqwl->kqwl_kqueue;
4494 	int rc = 0, op = KQWL_UTQ_NONE;
4495 
4496 	kqlock_held(kq);
4497 
4498 	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_BEGIN) | DBG_FUNC_START,
4499 	    kqwl->kqwl_dynamicid, 0, 0);
4500 
4501 	/* nobody else should still be processing */
4502 	assert((kq->kq_state & KQ_PROCESSING) == 0);
4503 
4504 	kq->kq_state |= KQ_PROCESSING;
4505 
4506 	if (kevent_flags & KEVENT_FLAG_PARKING) {
4507 		/*
4508 		 * When "parking" we want to process events and if no events are found
4509 		 * unbind.
4510 		 *
4511 		 * However, non overcommit threads sometimes park even when they have
4512 		 * more work so that the pool can narrow.  For these, we need to unbind
4513 		 * early, so that calling kqworkloop_update_threads_qos() can ask the
4514 		 * workqueue subsystem whether the thread should park despite having
4515 		 * pending events.
4516 		 */
4517 		if (kqr->tr_flags & WORKQ_TR_FLAG_OVERCOMMIT) {
4518 			op = KQWL_UTQ_PARKING;
4519 		} else {
4520 			op = KQWL_UTQ_UNBINDING;
4521 		}
4522 	} else if (!TAILQ_EMPTY(&kqwl->kqwl_suppressed)) {
4523 		op = KQWL_UTQ_RESET_WAKEUP_OVERRIDE;
4524 	}
4525 
4526 	if (op != KQWL_UTQ_NONE) {
4527 		thread_qos_t qos_override;
4528 		thread_t thread = kqr_thread_fast(kqr);
4529 
4530 		qos_override = kqworkloop_acknowledge_events(kqwl);
4531 
4532 		if (op == KQWL_UTQ_UNBINDING) {
4533 			kqworkloop_unbind_locked(kqwl, thread,
4534 			    KQWL_OVERRIDE_DROP_IMMEDIATELY);
4535 			kqworkloop_release_live(kqwl);
4536 		}
4537 		kqworkloop_update_threads_qos(kqwl, op, qos_override);
4538 		if (op == KQWL_UTQ_PARKING &&
4539 		    (!kqwl->kqwl_count || kqwl->kqwl_owner)) {
4540 			kqworkloop_unbind_locked(kqwl, thread,
4541 			    KQWL_OVERRIDE_DROP_DELAYED);
4542 			kqworkloop_release_live(kqwl);
4543 			rc = -1;
4544 		} else if (op == KQWL_UTQ_UNBINDING &&
4545 		    kqr_thread(kqr) != thread) {
4546 			rc = -1;
4547 		}
4548 
4549 		if (rc == -1) {
4550 			kq->kq_state &= ~KQ_PROCESSING;
4551 			kqworkloop_unbind_delayed_override_drop(thread);
4552 		}
4553 	}
4554 
4555 	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_BEGIN) | DBG_FUNC_END,
4556 	    kqwl->kqwl_dynamicid, 0, 0);
4557 
4558 	return rc;
4559 }
4560 
4561 /*
4562  * Return 0 to indicate that processing should proceed,
4563  * -1 if there is nothing to process.
4564  * EBADF if the kqueue is draining
4565  *
4566  * Called with kqueue locked and returns the same way,
4567  * but may drop lock temporarily.
4568  * May block.
4569  */
4570 static int
kqfile_begin_processing(struct kqfile * kq)4571 kqfile_begin_processing(struct kqfile *kq)
4572 {
4573 	kqlock_held(kq);
4574 
4575 	assert((kq->kqf_state & (KQ_WORKQ | KQ_WORKLOOP)) == 0);
4576 	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) | DBG_FUNC_START,
4577 	    VM_KERNEL_UNSLIDE_OR_PERM(kq), 0);
4578 
4579 	/* wait to become the exclusive processing thread */
4580 	while ((kq->kqf_state & (KQ_PROCESSING | KQ_DRAIN)) == KQ_PROCESSING) {
4581 		kq->kqf_state |= KQ_PROCWAIT;
4582 		lck_spin_sleep(&kq->kqf_lock, LCK_SLEEP_DEFAULT,
4583 		    &kq->kqf_suppressed, THREAD_UNINT | THREAD_WAIT_NOREPORT);
4584 	}
4585 
4586 	if (kq->kqf_state & KQ_DRAIN) {
4587 		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) | DBG_FUNC_END,
4588 		    VM_KERNEL_UNSLIDE_OR_PERM(kq), 2);
4589 		return EBADF;
4590 	}
4591 
4592 	/* Nobody else processing */
4593 
4594 	/* anything left to process? */
4595 	if (kq->kqf_count == 0) {
4596 		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) | DBG_FUNC_END,
4597 		    VM_KERNEL_UNSLIDE_OR_PERM(kq), 1);
4598 		return -1;
4599 	}
4600 
4601 	/* convert to processing mode */
4602 	kq->kqf_state |= KQ_PROCESSING;
4603 
4604 	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) | DBG_FUNC_END,
4605 	    VM_KERNEL_UNSLIDE_OR_PERM(kq), 0);
4606 	return 0;
4607 }
4608 
4609 /*
4610  * Try to end the processing, only called when a workq thread is attempting to
4611  * park (KEVENT_FLAG_PARKING is set).
4612  *
4613  * When returning -1, the kqworkq is setup again so that it is ready to be
4614  * processed.
4615  */
4616 static int
kqworkq_end_processing(struct kqworkq * kqwq,workq_threadreq_t kqr,int kevent_flags)4617 kqworkq_end_processing(struct kqworkq *kqwq, workq_threadreq_t kqr,
4618     int kevent_flags)
4619 {
4620 	if (kevent_flags & KEVENT_FLAG_PARKING) {
4621 		/*
4622 		 * if acknowledge events "succeeds" it means there are events,
4623 		 * which is a failure condition for end_processing.
4624 		 */
4625 		int rc = kqworkq_acknowledge_events(kqwq, kqr, kevent_flags,
4626 		    KQWQAE_END_PROCESSING);
4627 		if (rc == 0) {
4628 			return -1;
4629 		}
4630 	}
4631 
4632 	return 0;
4633 }
4634 
4635 /*
4636  * Try to end the processing, only called when a workq thread is attempting to
4637  * park (KEVENT_FLAG_PARKING is set).
4638  *
4639  * When returning -1, the kqworkq is setup again so that it is ready to be
4640  * processed (as if kqworkloop_begin_processing had just been called).
4641  *
4642  * If successful and KEVENT_FLAG_PARKING was set in the kevent_flags,
4643  * the kqworkloop is unbound from its servicer as a side effect.
4644  */
4645 static int
kqworkloop_end_processing(struct kqworkloop * kqwl,int flags,int kevent_flags)4646 kqworkloop_end_processing(struct kqworkloop *kqwl, int flags, int kevent_flags)
4647 {
4648 	struct kqueue *kq = &kqwl->kqwl_kqueue;
4649 	workq_threadreq_t kqr = &kqwl->kqwl_request;
4650 	int rc = 0;
4651 
4652 	kqlock_held(kq);
4653 
4654 	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_END) | DBG_FUNC_START,
4655 	    kqwl->kqwl_dynamicid, 0, 0);
4656 
4657 	if (kevent_flags & KEVENT_FLAG_PARKING) {
4658 		thread_t thread = kqr_thread_fast(kqr);
4659 		thread_qos_t qos_override;
4660 
4661 		/*
4662 		 * When KEVENT_FLAG_PARKING is set, we need to attempt
4663 		 * an unbind while still under the lock.
4664 		 *
4665 		 * So we do everything kqworkloop_unbind() would do, but because
4666 		 * we're inside kqueue_process(), if the workloop actually
4667 		 * received events while our locks were dropped, we have
4668 		 * the opportunity to fail the end processing and loop again.
4669 		 *
4670 		 * This avoids going through the process-wide workqueue lock
4671 		 * hence scales better.
4672 		 */
4673 		assert(flags & KQ_PROCESSING);
4674 		qos_override = kqworkloop_acknowledge_events(kqwl);
4675 		kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_PARKING, qos_override);
4676 
4677 		if (kqwl->kqwl_wakeup_qos && !kqwl->kqwl_owner) {
4678 			rc = -1;
4679 		} else {
4680 			kqworkloop_unbind_locked(kqwl, thread, KQWL_OVERRIDE_DROP_DELAYED);
4681 			kqworkloop_release_live(kqwl);
4682 			kq->kq_state &= ~flags;
4683 			kqworkloop_unbind_delayed_override_drop(thread);
4684 		}
4685 	} else {
4686 		kq->kq_state &= ~flags;
4687 		kq->kq_state |= KQ_R2K_ARMED;
4688 		kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_RECOMPUTE_WAKEUP_QOS, 0);
4689 	}
4690 
4691 	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_END) | DBG_FUNC_END,
4692 	    kqwl->kqwl_dynamicid, 0, 0);
4693 
4694 	return rc;
4695 }
4696 
4697 /*
4698  * Called with kqueue lock held.
4699  *
4700  * 0: no more events
4701  * -1: has more events
4702  * EBADF: kqueue is in draining mode
4703  */
4704 static int
kqfile_end_processing(struct kqfile * kq)4705 kqfile_end_processing(struct kqfile *kq)
4706 {
4707 	struct knote *kn;
4708 	int procwait;
4709 
4710 	kqlock_held(kq);
4711 
4712 	assert((kq->kqf_state & (KQ_WORKQ | KQ_WORKLOOP)) == 0);
4713 
4714 	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_END),
4715 	    VM_KERNEL_UNSLIDE_OR_PERM(kq), 0);
4716 
4717 	/*
4718 	 * Return suppressed knotes to their original state.
4719 	 */
4720 	while ((kn = TAILQ_FIRST(&kq->kqf_suppressed)) != NULL) {
4721 		knote_unsuppress(kq, kn);
4722 	}
4723 
4724 	procwait = (kq->kqf_state & KQ_PROCWAIT);
4725 	kq->kqf_state &= ~(KQ_PROCESSING | KQ_PROCWAIT);
4726 
4727 	if (procwait) {
4728 		/* first wake up any thread already waiting to process */
4729 		thread_wakeup(&kq->kqf_suppressed);
4730 	}
4731 
4732 	if (kq->kqf_state & KQ_DRAIN) {
4733 		return EBADF;
4734 	}
4735 	return kq->kqf_count != 0 ? -1 : 0;
4736 }
4737 
4738 static int
kqueue_workloop_ctl_internal(proc_t p,uintptr_t cmd,uint64_t __unused options,struct kqueue_workloop_params * params,int * retval)4739 kqueue_workloop_ctl_internal(proc_t p, uintptr_t cmd, uint64_t __unused options,
4740     struct kqueue_workloop_params *params, int *retval)
4741 {
4742 	int error = 0;
4743 	struct kqworkloop *kqwl;
4744 	struct filedesc *fdp = &p->p_fd;
4745 	workq_threadreq_param_t trp = { };
4746 #if CONFIG_PREADOPT_TG
4747 	struct thread_group *trp_permanent_preadopt_tg = NULL;
4748 	integer_t trp_preadopt_priority = 0;
4749 	integer_t trp_preadopt_policy = 0;
4750 #endif /* CONFIG_PREADOPT_TG */
4751 
4752 	switch (cmd) {
4753 	case KQ_WORKLOOP_CREATE:
4754 		if (!params->kqwlp_flags) {
4755 			error = EINVAL;
4756 			break;
4757 		}
4758 
4759 		if ((params->kqwlp_flags & KQ_WORKLOOP_CREATE_SCHED_PRI) &&
4760 		    (params->kqwlp_sched_pri < 1 ||
4761 		    params->kqwlp_sched_pri > 63 /* MAXPRI_USER */)) {
4762 			error = EINVAL;
4763 			break;
4764 		}
4765 
4766 		if ((params->kqwlp_flags & KQ_WORKLOOP_CREATE_SCHED_POL) &&
4767 		    invalid_policy(params->kqwlp_sched_pol)) {
4768 			error = EINVAL;
4769 			break;
4770 		}
4771 
4772 		if ((params->kqwlp_flags & KQ_WORKLOOP_CREATE_CPU_PERCENT) &&
4773 		    (params->kqwlp_cpu_percent <= 0 ||
4774 		    params->kqwlp_cpu_percent > 100 ||
4775 		    params->kqwlp_cpu_refillms <= 0 ||
4776 		    params->kqwlp_cpu_refillms > 0x00ffffff)) {
4777 			error = EINVAL;
4778 			break;
4779 		}
4780 
4781 		if (params->kqwlp_flags & KQ_WORKLOOP_CREATE_WORK_INTERVAL) {
4782 #if CONFIG_PREADOPT_TG
4783 			kern_return_t kr;
4784 			kr = kern_work_interval_get_policy_from_port(params->kqwl_wi_port,
4785 			    &trp_preadopt_policy,
4786 			    &trp_preadopt_priority,
4787 			    &trp_permanent_preadopt_tg);
4788 			if (kr != KERN_SUCCESS) {
4789 				error = EINVAL;
4790 				break;
4791 			}
4792 			/* The work interval comes with scheduling policy. */
4793 			if (trp_preadopt_policy) {
4794 				trp.trp_flags |= TRP_POLICY;
4795 				trp.trp_pol = (uint8_t)trp_preadopt_policy;
4796 
4797 				trp.trp_flags |= TRP_PRIORITY;
4798 				trp.trp_pri = (uint8_t)trp_preadopt_priority;
4799 			}
4800 			/*
4801 			 * We take +1 ref on a thread group backing this work interval
4802 			 * via kern_work_interval_get_policy_from_port and pass it on to kqwl.
4803 			 * If, for whatever reasons, kqworkloop_get_or_create fails, we
4804 			 * get back this ref.
4805 			 */
4806 #else
4807 			error = ENOTSUP;
4808 			break;
4809 #endif /* CONFIG_PREADOPT_TG */
4810 		}
4811 
4812 		if (!(trp.trp_flags & (TRP_POLICY | TRP_PRIORITY))) {
4813 			/*
4814 			 * We always prefer scheduling policy + priority that comes with
4815 			 * a work interval. It it does not exist, we fallback to what the user
4816 			 * has asked.
4817 			 */
4818 			if (params->kqwlp_flags & KQ_WORKLOOP_CREATE_SCHED_PRI) {
4819 				trp.trp_flags |= TRP_PRIORITY;
4820 				trp.trp_pri = (uint8_t)params->kqwlp_sched_pri;
4821 			}
4822 			if (params->kqwlp_flags & KQ_WORKLOOP_CREATE_SCHED_POL) {
4823 				trp.trp_flags |= TRP_POLICY;
4824 				trp.trp_pol = (uint8_t)params->kqwlp_sched_pol;
4825 			}
4826 			if (params->kqwlp_flags & KQ_WORKLOOP_CREATE_CPU_PERCENT) {
4827 				trp.trp_flags |= TRP_CPUPERCENT;
4828 				trp.trp_cpupercent = (uint8_t)params->kqwlp_cpu_percent;
4829 				trp.trp_refillms = params->kqwlp_cpu_refillms;
4830 			}
4831 		}
4832 
4833 		error = kqworkloop_get_or_create(p, params->kqwlp_id, &trp,
4834 #if CONFIG_PREADOPT_TG
4835 		    trp_permanent_preadopt_tg,
4836 #endif /* CONFIG_PREADOPT_TG */
4837 		    KEVENT_FLAG_DYNAMIC_KQUEUE | KEVENT_FLAG_WORKLOOP |
4838 		    KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST, &kqwl);
4839 		if (error) {
4840 #if CONFIG_PREADOPT_TG
4841 			/* In case of success, kqwl consumes this +1 ref. */
4842 			if (trp_permanent_preadopt_tg) {
4843 				thread_group_release(trp_permanent_preadopt_tg);
4844 			}
4845 #endif
4846 			break;
4847 		}
4848 
4849 		if (!fdt_flag_test(fdp, FD_WORKLOOP)) {
4850 			/* FD_WORKLOOP indicates we've ever created a workloop
4851 			 * via this syscall but its only ever added to a process, never
4852 			 * removed.
4853 			 */
4854 			proc_fdlock(p);
4855 			fdt_flag_set(fdp, FD_WORKLOOP);
4856 			proc_fdunlock(p);
4857 		}
4858 		break;
4859 	case KQ_WORKLOOP_DESTROY:
4860 		error = kqworkloop_get_or_create(p, params->kqwlp_id, NULL,
4861 #if CONFIG_PREADOPT_TG
4862 		    NULL,
4863 #endif /* CONFIG_PREADOPT_TG */
4864 		    KEVENT_FLAG_DYNAMIC_KQUEUE | KEVENT_FLAG_WORKLOOP |
4865 		    KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST, &kqwl);
4866 		if (error) {
4867 			break;
4868 		}
4869 		kqlock(kqwl);
4870 		trp.trp_value = kqwl->kqwl_params;
4871 		if (trp.trp_flags && !(trp.trp_flags & TRP_RELEASED)) {
4872 			trp.trp_flags |= TRP_RELEASED;
4873 			kqwl->kqwl_params = trp.trp_value;
4874 			kqworkloop_release_live(kqwl);
4875 		} else {
4876 			error = EINVAL;
4877 		}
4878 		kqunlock(kqwl);
4879 		kqworkloop_release(kqwl);
4880 		break;
4881 	}
4882 	*retval = 0;
4883 	return error;
4884 }
4885 
4886 int
kqueue_workloop_ctl(proc_t p,struct kqueue_workloop_ctl_args * uap,int * retval)4887 kqueue_workloop_ctl(proc_t p, struct kqueue_workloop_ctl_args *uap, int *retval)
4888 {
4889 	struct kqueue_workloop_params params = {
4890 		.kqwlp_id = 0,
4891 	};
4892 	if (uap->sz < sizeof(params.kqwlp_version)) {
4893 		return EINVAL;
4894 	}
4895 
4896 	size_t copyin_sz = MIN(sizeof(params), uap->sz);
4897 	int rv = copyin(uap->addr, &params, copyin_sz);
4898 	if (rv) {
4899 		return rv;
4900 	}
4901 
4902 	if (params.kqwlp_version != (int)uap->sz) {
4903 		return EINVAL;
4904 	}
4905 
4906 	return kqueue_workloop_ctl_internal(p, uap->cmd, uap->options, &params,
4907 	           retval);
4908 }
4909 
4910 static int
kqueue_select(struct fileproc * fp,int which,void * wql,__unused vfs_context_t ctx)4911 kqueue_select(struct fileproc *fp, int which, void *wql, __unused vfs_context_t ctx)
4912 {
4913 	struct kqfile *kq = (struct kqfile *)fp_get_data(fp);
4914 	int retnum = 0;
4915 
4916 	assert((kq->kqf_state & (KQ_WORKLOOP | KQ_WORKQ)) == 0);
4917 
4918 	if (which == FREAD) {
4919 		kqlock(kq);
4920 		if (kqfile_begin_processing(kq) == 0) {
4921 			retnum = kq->kqf_count;
4922 			kqfile_end_processing(kq);
4923 		} else if ((kq->kqf_state & KQ_DRAIN) == 0) {
4924 			selrecord(kq->kqf_p, &kq->kqf_sel, wql);
4925 		}
4926 		kqunlock(kq);
4927 	}
4928 	return retnum;
4929 }
4930 
4931 /*
4932  * kqueue_close -
4933  */
4934 static int
kqueue_close(struct fileglob * fg,__unused vfs_context_t ctx)4935 kqueue_close(struct fileglob *fg, __unused vfs_context_t ctx)
4936 {
4937 	struct kqfile *kqf = fg_get_data(fg);
4938 
4939 	assert((kqf->kqf_state & (KQ_WORKLOOP | KQ_WORKQ)) == 0);
4940 	kqlock(kqf);
4941 	selthreadclear(&kqf->kqf_sel);
4942 	kqunlock(kqf);
4943 	kqueue_dealloc(&kqf->kqf_kqueue);
4944 	fg_set_data(fg, NULL);
4945 	return 0;
4946 }
4947 
/*
 * Max depth of the nested kq path that can be created.
 * Note that this has to be less than the largest value kq_level can
 * hold, to avoid wrapping around and mislabeling the level. We also
 * want to be aggressive about this so that we don't overflow the
 * kernel stack while posting kevents.
 */
4955 #define MAX_NESTED_KQ 10
4956 
4957 /*
4958  * The callers has taken a use-count reference on this kqueue and will donate it
4959  * to the kqueue we are being added to.  This keeps the kqueue from closing until
4960  * that relationship is torn down.
4961  */
4962 static int
kqueue_kqfilter(struct fileproc * fp,struct knote * kn,__unused struct kevent_qos_s * kev)4963 kqueue_kqfilter(struct fileproc *fp, struct knote *kn,
4964     __unused struct kevent_qos_s *kev)
4965 {
4966 	struct kqfile *kqf = (struct kqfile *)fp_get_data(fp);
4967 	struct kqueue *kq = &kqf->kqf_kqueue;
4968 	struct kqueue *parentkq = knote_get_kq(kn);
4969 
4970 	assert((kqf->kqf_state & (KQ_WORKLOOP | KQ_WORKQ)) == 0);
4971 
4972 	if (parentkq == kq || kn->kn_filter != EVFILT_READ) {
4973 		knote_set_error(kn, EINVAL);
4974 		return 0;
4975 	}
4976 
4977 	/*
4978 	 * We have to avoid creating a cycle when nesting kqueues
4979 	 * inside another.  Rather than trying to walk the whole
4980 	 * potential DAG of nested kqueues, we just use a simple
4981 	 * ceiling protocol.  When a kqueue is inserted into another,
4982 	 * we check that the (future) parent is not already nested
4983 	 * into another kqueue at a lower level than the potenial
4984 	 * child (because it could indicate a cycle).  If that test
4985 	 * passes, we just mark the nesting levels accordingly.
4986 	 *
4987 	 * Only up to MAX_NESTED_KQ can be nested.
4988 	 *
4989 	 * Note: kqworkq and kqworkloop cannot be nested and have reused their
4990 	 *       kq_level field, so ignore these as parent.
4991 	 */
4992 
4993 	kqlock(parentkq);
4994 
4995 	if ((parentkq->kq_state & (KQ_WORKQ | KQ_WORKLOOP)) == 0) {
4996 		if (parentkq->kq_level > 0 &&
4997 		    parentkq->kq_level < kq->kq_level) {
4998 			kqunlock(parentkq);
4999 			knote_set_error(kn, EINVAL);
5000 			return 0;
5001 		}
5002 
5003 		/* set parent level appropriately */
5004 		uint16_t plevel = (parentkq->kq_level == 0)? 2: parentkq->kq_level;
5005 		if (plevel < kq->kq_level + 1) {
5006 			if (kq->kq_level + 1 > MAX_NESTED_KQ) {
5007 				kqunlock(parentkq);
5008 				knote_set_error(kn, EINVAL);
5009 				return 0;
5010 			}
5011 			plevel = kq->kq_level + 1;
5012 		}
5013 
5014 		parentkq->kq_level = plevel;
5015 	}
5016 
5017 	kqunlock(parentkq);
5018 
5019 	kn->kn_filtid = EVFILTID_KQREAD;
5020 	kqlock(kq);
5021 	KNOTE_ATTACH(&kqf->kqf_sel.si_note, kn);
5022 	/* indicate nesting in child, if needed */
5023 	if (kq->kq_level == 0) {
5024 		kq->kq_level = 1;
5025 	}
5026 
5027 	int count = kq->kq_count;
5028 	kqunlock(kq);
5029 	return count > 0;
5030 }
5031 
5032 __attribute__((noinline))
5033 static void
kqfile_wakeup(struct kqfile * kqf,long hint,wait_result_t wr)5034 kqfile_wakeup(struct kqfile *kqf, long hint, wait_result_t wr)
5035 {
5036 	/* wakeup a thread waiting on this queue */
5037 	selwakeup(&kqf->kqf_sel);
5038 
5039 	/* wake up threads in kqueue_scan() */
5040 	if (kqf->kqf_state & KQ_SLEEP) {
5041 		kqf->kqf_state &= ~KQ_SLEEP;
5042 		thread_wakeup_with_result(&kqf->kqf_count, wr);
5043 	}
5044 
5045 	if (hint == NOTE_REVOKE) {
5046 		/* wakeup threads waiting their turn to process */
5047 		if (kqf->kqf_state & KQ_PROCWAIT) {
5048 			assert(kqf->kqf_state & KQ_PROCESSING);
5049 			kqf->kqf_state &= ~KQ_PROCWAIT;
5050 			thread_wakeup(&kqf->kqf_suppressed);
5051 		}
5052 
5053 		/* no need to KNOTE: knote_fdclose() takes care of it */
5054 	} else {
5055 		/* wakeup other kqueues/select sets we're inside */
5056 		KNOTE(&kqf->kqf_sel.si_note, hint);
5057 	}
5058 }
5059 
5060 /*
5061  * kqueue_drain - called when kq is closed
5062  */
5063 static int
kqueue_drain(struct fileproc * fp,__unused vfs_context_t ctx)5064 kqueue_drain(struct fileproc *fp, __unused vfs_context_t ctx)
5065 {
5066 	struct kqfile *kqf = (struct kqfile *)fp_get_data(fp);
5067 
5068 	assert((kqf->kqf_state & (KQ_WORKLOOP | KQ_WORKQ)) == 0);
5069 
5070 	kqlock(kqf);
5071 	kqf->kqf_state |= KQ_DRAIN;
5072 	kqfile_wakeup(kqf, NOTE_REVOKE, THREAD_RESTART);
5073 	kqunlock(kqf);
5074 	return 0;
5075 }
5076 
5077 int
kqueue_stat(struct kqueue * kq,void * ub,int isstat64,proc_t p)5078 kqueue_stat(struct kqueue *kq, void *ub, int isstat64, proc_t p)
5079 {
5080 	assert((kq->kq_state & (KQ_WORKLOOP | KQ_WORKQ)) == 0);
5081 
5082 	kqlock(kq);
5083 	if (isstat64 != 0) {
5084 		struct stat64 *sb64 = (struct stat64 *)ub;
5085 
5086 		bzero((void *)sb64, sizeof(*sb64));
5087 		sb64->st_size = kq->kq_count;
5088 		if (kq->kq_state & KQ_KEV_QOS) {
5089 			sb64->st_blksize = sizeof(struct kevent_qos_s);
5090 		} else if (kq->kq_state & KQ_KEV64) {
5091 			sb64->st_blksize = sizeof(struct kevent64_s);
5092 		} else if (IS_64BIT_PROCESS(p)) {
5093 			sb64->st_blksize = sizeof(struct user64_kevent);
5094 		} else {
5095 			sb64->st_blksize = sizeof(struct user32_kevent);
5096 		}
5097 		sb64->st_mode = S_IFIFO;
5098 	} else {
5099 		struct stat *sb = (struct stat *)ub;
5100 
5101 		bzero((void *)sb, sizeof(*sb));
5102 		sb->st_size = kq->kq_count;
5103 		if (kq->kq_state & KQ_KEV_QOS) {
5104 			sb->st_blksize = sizeof(struct kevent_qos_s);
5105 		} else if (kq->kq_state & KQ_KEV64) {
5106 			sb->st_blksize = sizeof(struct kevent64_s);
5107 		} else if (IS_64BIT_PROCESS(p)) {
5108 			sb->st_blksize = sizeof(struct user64_kevent);
5109 		} else {
5110 			sb->st_blksize = sizeof(struct user32_kevent);
5111 		}
5112 		sb->st_mode = S_IFIFO;
5113 	}
5114 	kqunlock(kq);
5115 	return 0;
5116 }
5117 
5118 static inline bool
kqueue_threadreq_can_use_ast(struct kqueue * kq)5119 kqueue_threadreq_can_use_ast(struct kqueue *kq)
5120 {
5121 	if (current_proc() == kq->kq_p) {
5122 		/*
5123 		 * Setting an AST from a non BSD syscall is unsafe: mach_msg_trap() can
5124 		 * do combined send/receive and in the case of self-IPC, the AST may bet
5125 		 * set on a thread that will not return to userspace and needs the
5126 		 * thread the AST would create to unblock itself.
5127 		 *
5128 		 * At this time, we really want to target:
5129 		 *
5130 		 * - kevent variants that can cause thread creations, and dispatch
5131 		 *   really only uses kevent_qos and kevent_id,
5132 		 *
5133 		 * - workq_kernreturn (directly about thread creations)
5134 		 *
5135 		 * - bsdthread_ctl which is used for qos changes and has direct impact
5136 		 *   on the creator thread scheduling decisions.
5137 		 */
5138 		switch (current_uthread()->syscall_code) {
5139 		case SYS_kevent_qos:
5140 		case SYS_kevent_id:
5141 		case SYS_workq_kernreturn:
5142 		case SYS_bsdthread_ctl:
5143 			return true;
5144 		}
5145 	}
5146 	return false;
5147 }
5148 
5149 /*
5150  * Interact with the pthread kext to request a servicing there at a specific QoS
5151  * level.
5152  *
5153  * - Caller holds the kqlock
5154  *
5155  * - May be called with the kqueue's wait queue set locked,
5156  *   so cannot do anything that could recurse on that.
5157  */
5158 static void
kqueue_threadreq_initiate(kqueue_t kqu,workq_threadreq_t kqr,kq_index_t qos,int flags)5159 kqueue_threadreq_initiate(kqueue_t kqu, workq_threadreq_t kqr,
5160     kq_index_t qos, int flags)
5161 {
5162 	assert(kqr_thread(kqr) == THREAD_NULL);
5163 	assert(!kqr_thread_requested(kqr));
5164 	struct turnstile *ts = TURNSTILE_NULL;
5165 
5166 	if (workq_is_exiting(kqu.kq->kq_p)) {
5167 		return;
5168 	}
5169 
5170 	kqlock_held(kqu);
5171 
5172 	if (kqu.kq->kq_state & KQ_WORKLOOP) {
5173 		struct kqworkloop *kqwl = kqu.kqwl;
5174 
5175 		assert(kqwl->kqwl_owner == THREAD_NULL);
5176 		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_THREQUEST),
5177 		    kqwl->kqwl_dynamicid, 0, qos, kqwl->kqwl_wakeup_qos);
5178 		ts = kqwl->kqwl_turnstile;
5179 		/* Add a thread request reference on the kqueue. */
5180 		kqworkloop_retain(kqwl);
5181 
5182 #if CONFIG_PREADOPT_TG
5183 		thread_group_qos_t kqwl_preadopt_tg = os_atomic_load(
5184 			&kqwl->kqwl_preadopt_tg, relaxed);
5185 		if (KQWL_HAS_PERMANENT_PREADOPTED_TG(kqwl_preadopt_tg)) {
5186 			/*
5187 			 * This kqwl has been permanently configured with a thread group.
5188 			 * See kqworkloops with scheduling parameters.
5189 			 */
5190 			flags |= WORKQ_THREADREQ_REEVALUATE_PREADOPT_TG;
5191 		} else {
5192 			/*
5193 			 * This thread is the one which is ack-ing the thread group on the kqwl
5194 			 * under the kqlock and will take action accordingly, pairs with the
5195 			 * release barrier in kqueue_set_preadopted_thread_group
5196 			 */
5197 			uint16_t tg_acknowledged;
5198 			if (os_atomic_cmpxchgv(&kqwl->kqwl_preadopt_tg_needs_redrive,
5199 			    KQWL_PREADOPT_TG_NEEDS_REDRIVE, KQWL_PREADOPT_TG_CLEAR_REDRIVE,
5200 			    &tg_acknowledged, acquire)) {
5201 				flags |= WORKQ_THREADREQ_REEVALUATE_PREADOPT_TG;
5202 			}
5203 		}
5204 #endif
5205 	} else {
5206 		assert(kqu.kq->kq_state & KQ_WORKQ);
5207 		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWQ_THREQUEST), -1, 0, qos,
5208 		    !TAILQ_EMPTY(&kqu.kqwq->kqwq_queue[kqr->tr_kq_qos_index - 1]));
5209 	}
5210 
5211 	/*
5212 	 * New-style thread request supported.
5213 	 * Provide the pthread kext a pointer to a workq_threadreq_s structure for
5214 	 * its use until a corresponding kqueue_threadreq_bind callback.
5215 	 */
5216 	if (kqueue_threadreq_can_use_ast(kqu.kq)) {
5217 		flags |= WORKQ_THREADREQ_SET_AST_ON_FAILURE;
5218 	}
5219 	if (qos == KQWQ_QOS_MANAGER) {
5220 		qos = WORKQ_THREAD_QOS_MANAGER;
5221 	}
5222 
5223 	if (!workq_kern_threadreq_initiate(kqu.kq->kq_p, kqr, ts, qos, flags)) {
5224 		/*
5225 		 * Process is shutting down or exec'ing.
5226 		 * All the kqueues are going to be cleaned up
5227 		 * soon. Forget we even asked for a thread -
5228 		 * and make sure we don't ask for more.
5229 		 */
5230 		kqu.kq->kq_state &= ~KQ_R2K_ARMED;
5231 		kqueue_release_live(kqu);
5232 	}
5233 }
5234 
5235 /*
5236  * kqueue_threadreq_bind_prepost - prepost the bind to kevent
5237  *
5238  * This is used when kqueue_threadreq_bind may cause a lock inversion.
5239  */
5240 __attribute__((always_inline))
5241 void
kqueue_threadreq_bind_prepost(struct proc * p __unused,workq_threadreq_t kqr,struct uthread * ut)5242 kqueue_threadreq_bind_prepost(struct proc *p __unused, workq_threadreq_t kqr,
5243     struct uthread *ut)
5244 {
5245 	ut->uu_kqr_bound = kqr;
5246 	kqr->tr_thread = get_machthread(ut);
5247 	kqr->tr_state = WORKQ_TR_STATE_BINDING;
5248 }
5249 
5250 /*
5251  * kqueue_threadreq_bind_commit - commit a bind prepost
5252  *
5253  * The workq code has to commit any binding prepost before the thread has
5254  * a chance to come back to userspace (and do kevent syscalls) or be aborted.
5255  */
5256 void
kqueue_threadreq_bind_commit(struct proc * p,thread_t thread)5257 kqueue_threadreq_bind_commit(struct proc *p, thread_t thread)
5258 {
5259 	struct uthread *ut = get_bsdthread_info(thread);
5260 	workq_threadreq_t kqr = ut->uu_kqr_bound;
5261 	kqueue_t kqu = kqr_kqueue(p, kqr);
5262 
5263 	kqlock(kqu);
5264 	if (kqr->tr_state == WORKQ_TR_STATE_BINDING) {
5265 		kqueue_threadreq_bind(p, kqr, thread, 0);
5266 	}
5267 	kqunlock(kqu);
5268 }
5269 
5270 static void
kqueue_threadreq_modify(kqueue_t kqu,workq_threadreq_t kqr,kq_index_t qos,workq_kern_threadreq_flags_t flags)5271 kqueue_threadreq_modify(kqueue_t kqu, workq_threadreq_t kqr, kq_index_t qos,
5272     workq_kern_threadreq_flags_t flags)
5273 {
5274 	assert(kqr_thread_requested_pending(kqr));
5275 
5276 	kqlock_held(kqu);
5277 
5278 	if (kqueue_threadreq_can_use_ast(kqu.kq)) {
5279 		flags |= WORKQ_THREADREQ_SET_AST_ON_FAILURE;
5280 	}
5281 
5282 #if CONFIG_PREADOPT_TG
5283 	if (kqu.kq->kq_state & KQ_WORKLOOP) {
5284 		struct kqworkloop *kqwl = kqu.kqwl;
5285 		thread_group_qos_t kqwl_preadopt_tg = os_atomic_load(
5286 			&kqwl->kqwl_preadopt_tg, relaxed);
5287 		if (KQWL_HAS_PERMANENT_PREADOPTED_TG(kqwl_preadopt_tg)) {
5288 			/*
5289 			 * This kqwl has been permanently configured with a thread group.
5290 			 * See kqworkloops with scheduling parameters.
5291 			 */
5292 			flags |= WORKQ_THREADREQ_REEVALUATE_PREADOPT_TG;
5293 		} else {
5294 			uint16_t tg_ack_status;
5295 			/*
5296 			 * This thread is the one which is ack-ing the thread group on the kqwl
5297 			 * under the kqlock and will take action accordingly, needs acquire
5298 			 * barrier.
5299 			 */
5300 			if (os_atomic_cmpxchgv(&kqwl->kqwl_preadopt_tg_needs_redrive, KQWL_PREADOPT_TG_NEEDS_REDRIVE,
5301 			    KQWL_PREADOPT_TG_CLEAR_REDRIVE, &tg_ack_status, acquire)) {
5302 				flags |= WORKQ_THREADREQ_REEVALUATE_PREADOPT_TG;
5303 			}
5304 		}
5305 	}
5306 #endif
5307 
5308 	workq_kern_threadreq_modify(kqu.kq->kq_p, kqr, qos, flags);
5309 }
5310 
5311 /*
5312  * kqueue_threadreq_bind - bind thread to processing kqrequest
5313  *
5314  * The provided thread will be responsible for delivering events
5315  * associated with the given kqrequest.  Bind it and get ready for
5316  * the thread to eventually arrive.
5317  */
5318 void
kqueue_threadreq_bind(struct proc * p,workq_threadreq_t kqr,thread_t thread,unsigned int flags)5319 kqueue_threadreq_bind(struct proc *p, workq_threadreq_t kqr, thread_t thread,
5320     unsigned int flags)
5321 {
5322 	kqueue_t kqu = kqr_kqueue(p, kqr);
5323 	struct uthread *ut = get_bsdthread_info(thread);
5324 
5325 	kqlock_held(kqu);
5326 
5327 	assert(ut->uu_kqueue_override == 0);
5328 
5329 	if (kqr->tr_state == WORKQ_TR_STATE_BINDING) {
5330 		assert(ut->uu_kqr_bound == kqr);
5331 		assert(kqr->tr_thread == thread);
5332 	} else {
5333 		assert(kqr_thread_requested_pending(kqr));
5334 		assert(kqr->tr_thread == THREAD_NULL);
5335 		assert(ut->uu_kqr_bound == NULL);
5336 		ut->uu_kqr_bound = kqr;
5337 		kqr->tr_thread = thread;
5338 	}
5339 
5340 	kqr->tr_state = WORKQ_TR_STATE_BOUND;
5341 
5342 	if (kqu.kq->kq_state & KQ_WORKLOOP) {
5343 		struct turnstile *ts = kqu.kqwl->kqwl_turnstile;
5344 
5345 		if (__improbable(thread == kqu.kqwl->kqwl_owner)) {
5346 			/*
5347 			 * <rdar://problem/38626999> shows that asserting here is not ok.
5348 			 *
5349 			 * This is not supposed to happen for correct use of the interface,
5350 			 * but it is sadly possible for userspace (with the help of memory
5351 			 * corruption, such as over-release of a dispatch queue) to make
5352 			 * the creator thread the "owner" of a workloop.
5353 			 *
5354 			 * Once that happens, and that creator thread picks up the same
5355 			 * workloop as a servicer, we trip this codepath. We need to fixup
5356 			 * the state to forget about this thread being the owner, as the
5357 			 * entire workloop state machine expects servicers to never be
5358 			 * owners and everything would basically go downhill from here.
5359 			 */
5360 			kqu.kqwl->kqwl_owner = THREAD_NULL;
5361 			if (kqworkloop_override(kqu.kqwl)) {
5362 				thread_drop_kevent_override(thread);
5363 			}
5364 		}
5365 
5366 		if (ts && (flags & KQUEUE_THREADERQ_BIND_NO_INHERITOR_UPDATE) == 0) {
5367 			/*
5368 			 * Past this point, the interlock is the kq req lock again,
5369 			 * so we can fix the inheritor for good.
5370 			 */
5371 			filt_wlupdate_inheritor(kqu.kqwl, ts, TURNSTILE_IMMEDIATE_UPDATE);
5372 			turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
5373 		}
5374 
5375 		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_BIND), kqu.kqwl->kqwl_dynamicid,
5376 		    thread_tid(thread), kqr->tr_kq_qos_index,
5377 		    (kqr->tr_kq_override_index << 16) | kqwl->kqwl_wakeup_qos);
5378 
5379 		ut->uu_kqueue_override = kqr->tr_kq_override_index;
5380 		if (kqr->tr_kq_override_index) {
5381 			thread_add_servicer_override(thread, kqr->tr_kq_override_index);
5382 		}
5383 
5384 #if CONFIG_PREADOPT_TG
5385 		/* Remove reference from kqwl and mark it as bound with the SENTINEL */
5386 		thread_group_qos_t old_tg;
5387 		thread_group_qos_t new_tg;
5388 		int ret = os_atomic_rmw_loop(kqr_preadopt_thread_group_addr(kqr), old_tg, new_tg, relaxed, {
5389 			if ((old_tg == KQWL_PREADOPTED_TG_NEVER) || KQWL_HAS_PERMANENT_PREADOPTED_TG(old_tg)) {
5390 			        /*
5391 			         * Either an app or a kqwl permanently configured with a thread group.
5392 			         * Nothing to do.
5393 			         */
5394 			        os_atomic_rmw_loop_give_up(break);
5395 			}
5396 			assert(old_tg != KQWL_PREADOPTED_TG_PROCESSED);
5397 			new_tg = KQWL_PREADOPTED_TG_SENTINEL;
5398 		});
5399 
5400 		if (ret) {
5401 			KQWL_PREADOPT_TG_HISTORY_WRITE_ENTRY(kqu.kqwl, KQWL_PREADOPT_OP_SERVICER_BIND, old_tg, new_tg);
5402 
5403 			if (KQWL_HAS_VALID_PREADOPTED_TG(old_tg)) {
5404 				struct thread_group *tg = KQWL_GET_PREADOPTED_TG(old_tg);
5405 				assert(tg != NULL);
5406 
5407 				thread_set_preadopt_thread_group(thread, tg);
5408 				thread_group_release_live(tg); // The thread has a reference
5409 			} else {
5410 				/*
5411 				 * The thread may already have a preadopt thread group on it -
5412 				 * we need to make sure to clear that.
5413 				 */
5414 				thread_set_preadopt_thread_group(thread, NULL);
5415 			}
5416 
5417 			/* We have taken action on the preadopted thread group set on the
5418 			 * set on the kqwl, clear any redrive requests */
5419 			os_atomic_store(&kqu.kqwl->kqwl_preadopt_tg_needs_redrive, KQWL_PREADOPT_TG_CLEAR_REDRIVE, relaxed);
5420 		} else {
5421 			if (KQWL_HAS_PERMANENT_PREADOPTED_TG(old_tg)) {
5422 				struct thread_group *tg = KQWL_GET_PREADOPTED_TG(old_tg);
5423 				assert(tg != NULL);
5424 				thread_set_preadopt_thread_group(thread, tg);
5425 				/*
5426 				 * From this point on, kqwl and thread both have +1 ref on this tg.
5427 				 */
5428 			}
5429 		}
5430 #endif
5431 		kqueue_update_iotier_override(kqu);
5432 	} else {
5433 		assert(kqr->tr_kq_override_index == 0);
5434 
5435 #if CONFIG_PREADOPT_TG
5436 		/*
5437 		 * The thread may have a preadopt thread group on it already because it
5438 		 * got tagged with it as a creator thread. So we need to make sure to
5439 		 * clear that since we don't have preadopt thread groups for non-kqwl
5440 		 * cases
5441 		 */
5442 		thread_set_preadopt_thread_group(thread, NULL);
5443 #endif
5444 		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWQ_BIND), -1,
5445 		    thread_tid(thread), kqr->tr_kq_qos_index,
5446 		    (kqr->tr_kq_override_index << 16) |
5447 		    !TAILQ_EMPTY(&kqu.kqwq->kqwq_queue[kqr->tr_kq_qos_index - 1]));
5448 	}
5449 }
5450 
5451 /*
5452  * kqueue_threadreq_cancel - abort a pending thread request
5453  *
5454  * Called when exiting/exec'ing. Forget our pending request.
5455  */
5456 void
kqueue_threadreq_cancel(struct proc * p,workq_threadreq_t kqr)5457 kqueue_threadreq_cancel(struct proc *p, workq_threadreq_t kqr)
5458 {
5459 	kqueue_release(kqr_kqueue(p, kqr));
5460 }
5461 
5462 workq_threadreq_param_t
kqueue_threadreq_workloop_param(workq_threadreq_t kqr)5463 kqueue_threadreq_workloop_param(workq_threadreq_t kqr)
5464 {
5465 	struct kqworkloop *kqwl;
5466 	workq_threadreq_param_t trp;
5467 
5468 	assert(kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP);
5469 	kqwl = __container_of(kqr, struct kqworkloop, kqwl_request);
5470 	trp.trp_value = kqwl->kqwl_params;
5471 	return trp;
5472 }
5473 
5474 /*
5475  *	kqueue_threadreq_unbind - unbind thread from processing kqueue
5476  *
5477  *	End processing the per-QoS bucket of events and allow other threads
5478  *	to be requested for future servicing.
5479  *
5480  *	caller holds a reference on the kqueue.
5481  */
5482 void
kqueue_threadreq_unbind(struct proc * p,workq_threadreq_t kqr)5483 kqueue_threadreq_unbind(struct proc *p, workq_threadreq_t kqr)
5484 {
5485 	if (kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
5486 		kqworkloop_unbind(kqr_kqworkloop(kqr));
5487 	} else {
5488 		kqworkq_unbind(p, kqr);
5489 	}
5490 }
5491 
5492 /*
5493  * If we aren't already busy processing events [for this QoS],
5494  * request workq thread support as appropriate.
5495  *
5496  * TBD - for now, we don't segregate out processing by QoS.
5497  *
5498  * - May be called with the kqueue's wait queue set locked,
5499  *   so cannot do anything that could recurse on that.
5500  */
5501 static void
kqworkq_wakeup(struct kqworkq * kqwq,kq_index_t qos_index)5502 kqworkq_wakeup(struct kqworkq *kqwq, kq_index_t qos_index)
5503 {
5504 	workq_threadreq_t kqr = kqworkq_get_request(kqwq, qos_index);
5505 
5506 	/* convert to thread qos value */
5507 	assert(qos_index > 0 && qos_index <= KQWQ_NBUCKETS);
5508 
5509 	if (!kqr_thread_requested(kqr)) {
5510 		kqueue_threadreq_initiate(&kqwq->kqwq_kqueue, kqr, qos_index, 0);
5511 	}
5512 }
5513 
5514 /*
5515  * This represent the asynchronous QoS a given workloop contributes,
5516  * hence is the max of the current active knotes (override index)
5517  * and the workloop max qos (userspace async qos).
5518  */
5519 static kq_index_t
kqworkloop_override(struct kqworkloop * kqwl)5520 kqworkloop_override(struct kqworkloop *kqwl)
5521 {
5522 	workq_threadreq_t kqr = &kqwl->kqwl_request;
5523 	return MAX(kqr->tr_kq_qos_index, kqr->tr_kq_override_index);
5524 }
5525 
5526 static inline void
kqworkloop_request_fire_r2k_notification(struct kqworkloop * kqwl)5527 kqworkloop_request_fire_r2k_notification(struct kqworkloop *kqwl)
5528 {
5529 	workq_threadreq_t kqr = &kqwl->kqwl_request;
5530 
5531 	kqlock_held(kqwl);
5532 
5533 	if (kqwl->kqwl_state & KQ_R2K_ARMED) {
5534 		kqwl->kqwl_state &= ~KQ_R2K_ARMED;
5535 		act_set_astkevent(kqr_thread_fast(kqr), AST_KEVENT_RETURN_TO_KERNEL);
5536 	}
5537 }
5538 
5539 static void
kqworkloop_update_threads_qos(struct kqworkloop * kqwl,int op,kq_index_t qos)5540 kqworkloop_update_threads_qos(struct kqworkloop *kqwl, int op, kq_index_t qos)
5541 {
5542 	workq_threadreq_t kqr = &kqwl->kqwl_request;
5543 	struct kqueue *kq = &kqwl->kqwl_kqueue;
5544 	kq_index_t old_override = kqworkloop_override(kqwl);
5545 
5546 	kqlock_held(kqwl);
5547 
5548 	switch (op) {
5549 	case KQWL_UTQ_UPDATE_WAKEUP_QOS:
5550 		kqwl->kqwl_wakeup_qos = qos;
5551 		kqworkloop_request_fire_r2k_notification(kqwl);
5552 		goto recompute;
5553 
5554 	case KQWL_UTQ_RESET_WAKEUP_OVERRIDE:
5555 		kqr->tr_kq_override_index = qos;
5556 		goto recompute;
5557 
5558 	case KQWL_UTQ_PARKING:
5559 	case KQWL_UTQ_UNBINDING:
5560 		kqr->tr_kq_override_index = qos;
5561 		OS_FALLTHROUGH;
5562 
5563 	case KQWL_UTQ_RECOMPUTE_WAKEUP_QOS:
5564 		if (op == KQWL_UTQ_RECOMPUTE_WAKEUP_QOS) {
5565 			assert(qos == THREAD_QOS_UNSPECIFIED);
5566 		}
5567 		if (TAILQ_EMPTY(&kqwl->kqwl_suppressed)) {
5568 			kqr->tr_kq_override_index = THREAD_QOS_UNSPECIFIED;
5569 		}
5570 		kqwl->kqwl_wakeup_qos = 0;
5571 		for (kq_index_t i = KQWL_NBUCKETS; i > 0; i--) {
5572 			if (!TAILQ_EMPTY(&kqwl->kqwl_queue[i - 1])) {
5573 				kqwl->kqwl_wakeup_qos = i;
5574 				kqworkloop_request_fire_r2k_notification(kqwl);
5575 				break;
5576 			}
5577 		}
5578 		OS_FALLTHROUGH;
5579 
5580 	case KQWL_UTQ_UPDATE_WAKEUP_OVERRIDE:
5581 recompute:
5582 		/*
5583 		 * When modifying the wakeup QoS or the override QoS, we always need to
5584 		 * maintain our invariant that kqr_override_index is at least as large
5585 		 * as the highest QoS for which an event is fired.
5586 		 *
5587 		 * However this override index can be larger when there is an overriden
5588 		 * suppressed knote pushing on the kqueue.
5589 		 */
5590 		if (qos < kqwl->kqwl_wakeup_qos) {
5591 			qos = kqwl->kqwl_wakeup_qos;
5592 		}
5593 		if (kqr->tr_kq_override_index < qos) {
5594 			kqr->tr_kq_override_index = qos;
5595 		}
5596 		break;
5597 
5598 	case KQWL_UTQ_REDRIVE_EVENTS:
5599 		break;
5600 
5601 	case KQWL_UTQ_SET_QOS_INDEX:
5602 		kqr->tr_kq_qos_index = qos;
5603 		break;
5604 
5605 	default:
5606 		panic("unknown kqwl thread qos update operation: %d", op);
5607 	}
5608 
5609 	thread_t kqwl_owner = kqwl->kqwl_owner;
5610 	thread_t servicer = kqr_thread(kqr);
5611 	boolean_t qos_changed = FALSE;
5612 	kq_index_t new_override = kqworkloop_override(kqwl);
5613 
5614 	/*
5615 	 * Apply the diffs to the owner if applicable
5616 	 */
5617 	if (kqwl_owner) {
5618 #if 0
5619 		/* JMM - need new trace hooks for owner overrides */
5620 		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_THADJUST),
5621 		    kqwl->kqwl_dynamicid, thread_tid(kqwl_owner), kqr->tr_kq_qos_index,
5622 		    (kqr->tr_kq_override_index << 16) | kqwl->kqwl_wakeup_qos);
5623 #endif
5624 		if (new_override == old_override) {
5625 			// nothing to do
5626 		} else if (old_override == THREAD_QOS_UNSPECIFIED) {
5627 			thread_add_kevent_override(kqwl_owner, new_override);
5628 		} else if (new_override == THREAD_QOS_UNSPECIFIED) {
5629 			thread_drop_kevent_override(kqwl_owner);
5630 		} else { /*  old_override != new_override */
5631 			thread_update_kevent_override(kqwl_owner, new_override);
5632 		}
5633 	}
5634 
5635 	/*
5636 	 * apply the diffs to the servicer
5637 	 */
5638 
5639 	if (!kqr_thread_requested(kqr)) {
5640 		/*
5641 		 * No servicer, nor thread-request
5642 		 *
5643 		 * Make a new thread request, unless there is an owner (or the workloop
5644 		 * is suspended in userland) or if there is no asynchronous work in the
5645 		 * first place.
5646 		 */
5647 
5648 		if (kqwl_owner == NULL && kqwl->kqwl_wakeup_qos) {
5649 			int initiate_flags = 0;
5650 			if (op == KQWL_UTQ_UNBINDING) {
5651 				initiate_flags = WORKQ_THREADREQ_ATTEMPT_REBIND;
5652 			}
5653 
5654 			/* kqueue_threadreq_initiate handles the acknowledgement of the TG
5655 			 * if needed */
5656 			kqueue_threadreq_initiate(kq, kqr, new_override, initiate_flags);
5657 		}
5658 	} else if (servicer) {
5659 		/*
5660 		 * Servicer in flight
5661 		 *
5662 		 * Just apply the diff to the servicer
5663 		 */
5664 
5665 #if CONFIG_PREADOPT_TG
5666 		/* When there's a servicer for the kqwl already, then the servicer will
5667 		 * adopt the thread group in the kqr, we don't need to poke the
5668 		 * workqueue subsystem to make different decisions due to the thread
5669 		 * group. Consider the current request ack-ed.
5670 		 */
5671 		os_atomic_store(&kqwl->kqwl_preadopt_tg_needs_redrive, KQWL_PREADOPT_TG_CLEAR_REDRIVE, relaxed);
5672 #endif
5673 
5674 		struct uthread *ut = get_bsdthread_info(servicer);
5675 		if (ut->uu_kqueue_override != new_override) {
5676 			if (ut->uu_kqueue_override == THREAD_QOS_UNSPECIFIED) {
5677 				thread_add_servicer_override(servicer, new_override);
5678 			} else if (new_override == THREAD_QOS_UNSPECIFIED) {
5679 				thread_drop_servicer_override(servicer);
5680 			} else { /* ut->uu_kqueue_override != new_override */
5681 				thread_update_servicer_override(servicer, new_override);
5682 			}
5683 			ut->uu_kqueue_override = new_override;
5684 			qos_changed = TRUE;
5685 		}
5686 	} else if (new_override == THREAD_QOS_UNSPECIFIED) {
5687 		/*
5688 		 * No events to deliver anymore.
5689 		 *
5690 		 * However canceling with turnstiles is challenging, so the fact that
5691 		 * the request isn't useful will be discovered by the servicer himself
5692 		 * later on.
5693 		 */
5694 	} else if (old_override != new_override) {
5695 		/*
5696 		 * Request is in flight
5697 		 *
5698 		 * Apply the diff to the thread request.
5699 		 */
5700 		kqueue_threadreq_modify(kq, kqr, new_override, WORKQ_THREADREQ_NONE);
5701 		qos_changed = TRUE;
5702 	}
5703 
5704 	if (qos_changed) {
5705 		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_THADJUST), kqwl->kqwl_dynamicid,
5706 		    thread_tid(servicer), kqr->tr_kq_qos_index,
5707 		    (kqr->tr_kq_override_index << 16) | kqwl->kqwl_wakeup_qos);
5708 	}
5709 }
5710 
5711 static void
kqworkloop_update_iotier_override(struct kqworkloop * kqwl)5712 kqworkloop_update_iotier_override(struct kqworkloop *kqwl)
5713 {
5714 	workq_threadreq_t kqr = &kqwl->kqwl_request;
5715 	thread_t servicer = kqr_thread(kqr);
5716 	uint8_t iotier = os_atomic_load(&kqwl->kqwl_iotier_override, relaxed);
5717 
5718 	kqlock_held(kqwl);
5719 
5720 	if (servicer) {
5721 		thread_update_servicer_iotier_override(servicer, iotier);
5722 	}
5723 }
5724 
5725 static void
kqworkloop_wakeup(struct kqworkloop * kqwl,kq_index_t qos)5726 kqworkloop_wakeup(struct kqworkloop *kqwl, kq_index_t qos)
5727 {
5728 	if (qos <= kqwl->kqwl_wakeup_qos) {
5729 		/*
5730 		 * Shortcut wakeups that really do nothing useful
5731 		 */
5732 		return;
5733 	}
5734 
5735 	if ((kqwl->kqwl_state & KQ_PROCESSING) &&
5736 	    kqr_thread(&kqwl->kqwl_request) == current_thread()) {
5737 		/*
5738 		 * kqworkloop_end_processing() will perform the required QoS
5739 		 * computations when it unsets the processing mode.
5740 		 */
5741 		return;
5742 	}
5743 
5744 	kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_UPDATE_WAKEUP_QOS, qos);
5745 }
5746 
5747 static struct kqtailq *
kqueue_get_suppressed_queue(kqueue_t kq,struct knote * kn)5748 kqueue_get_suppressed_queue(kqueue_t kq, struct knote *kn)
5749 {
5750 	if (kq.kq->kq_state & KQ_WORKLOOP) {
5751 		return &kq.kqwl->kqwl_suppressed;
5752 	} else if (kq.kq->kq_state & KQ_WORKQ) {
5753 		return &kq.kqwq->kqwq_suppressed[kn->kn_qos_index - 1];
5754 	} else {
5755 		return &kq.kqf->kqf_suppressed;
5756 	}
5757 }
5758 
5759 struct turnstile *
kqueue_alloc_turnstile(kqueue_t kqu)5760 kqueue_alloc_turnstile(kqueue_t kqu)
5761 {
5762 	struct kqworkloop *kqwl = kqu.kqwl;
5763 	kq_state_t kq_state;
5764 
5765 	kq_state = os_atomic_load(&kqu.kq->kq_state, dependency);
5766 	if (kq_state & KQ_HAS_TURNSTILE) {
5767 		/* force a dependency to pair with the atomic or with release below */
5768 		return os_atomic_load_with_dependency_on(&kqwl->kqwl_turnstile,
5769 		           (uintptr_t)kq_state);
5770 	}
5771 
5772 	if (!(kq_state & KQ_WORKLOOP)) {
5773 		return TURNSTILE_NULL;
5774 	}
5775 
5776 	struct turnstile *ts = turnstile_alloc(), *free_ts = TURNSTILE_NULL;
5777 	bool workq_locked = false;
5778 
5779 	kqlock(kqu);
5780 
5781 	if (filt_wlturnstile_interlock_is_workq(kqwl)) {
5782 		workq_locked = true;
5783 		workq_kern_threadreq_lock(kqwl->kqwl_p);
5784 	}
5785 
5786 	if (kqwl->kqwl_state & KQ_HAS_TURNSTILE) {
5787 		free_ts = ts;
5788 		ts = kqwl->kqwl_turnstile;
5789 	} else {
5790 		ts = turnstile_prepare((uintptr_t)kqwl, &kqwl->kqwl_turnstile,
5791 		    ts, TURNSTILE_WORKLOOPS);
5792 
5793 		/* release-barrier to pair with the unlocked load of kqwl_turnstile above */
5794 		os_atomic_or(&kqwl->kqwl_state, KQ_HAS_TURNSTILE, release);
5795 
5796 		if (filt_wlturnstile_interlock_is_workq(kqwl)) {
5797 			workq_kern_threadreq_update_inheritor(kqwl->kqwl_p,
5798 			    &kqwl->kqwl_request, kqwl->kqwl_owner,
5799 			    ts, TURNSTILE_IMMEDIATE_UPDATE);
5800 			/*
5801 			 * The workq may no longer be the interlock after this.
5802 			 * In which case the inheritor wasn't updated.
5803 			 */
5804 		}
5805 		if (!filt_wlturnstile_interlock_is_workq(kqwl)) {
5806 			filt_wlupdate_inheritor(kqwl, ts, TURNSTILE_IMMEDIATE_UPDATE);
5807 		}
5808 	}
5809 
5810 	if (workq_locked) {
5811 		workq_kern_threadreq_unlock(kqwl->kqwl_p);
5812 	}
5813 
5814 	kqunlock(kqu);
5815 
5816 	if (free_ts) {
5817 		turnstile_deallocate(free_ts);
5818 	} else {
5819 		turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_NOT_HELD);
5820 	}
5821 	return ts;
5822 }
5823 
5824 __attribute__((always_inline))
5825 struct turnstile *
kqueue_turnstile(kqueue_t kqu)5826 kqueue_turnstile(kqueue_t kqu)
5827 {
5828 	kq_state_t kq_state = os_atomic_load(&kqu.kq->kq_state, relaxed);
5829 	if (kq_state & KQ_WORKLOOP) {
5830 		return os_atomic_load(&kqu.kqwl->kqwl_turnstile, relaxed);
5831 	}
5832 	return TURNSTILE_NULL;
5833 }
5834 
5835 __attribute__((always_inline))
5836 struct turnstile *
kqueue_threadreq_get_turnstile(workq_threadreq_t kqr)5837 kqueue_threadreq_get_turnstile(workq_threadreq_t kqr)
5838 {
5839 	struct kqworkloop *kqwl = kqr_kqworkloop(kqr);
5840 	if (kqwl) {
5841 		return os_atomic_load(&kqwl->kqwl_turnstile, relaxed);
5842 	}
5843 	return TURNSTILE_NULL;
5844 }
5845 
5846 static void
kqworkloop_set_overcommit(struct kqworkloop * kqwl)5847 kqworkloop_set_overcommit(struct kqworkloop *kqwl)
5848 {
5849 	workq_threadreq_t kqr = &kqwl->kqwl_request;
5850 
5851 	/*
5852 	 * This test is racy, but since we never remove this bit,
5853 	 * it allows us to avoid taking a lock.
5854 	 */
5855 	if (kqr->tr_flags & WORKQ_TR_FLAG_OVERCOMMIT) {
5856 		return;
5857 	}
5858 
5859 	kqlock_held(kqwl);
5860 
5861 	if (kqr_thread_requested_pending(kqr)) {
5862 		kqueue_threadreq_modify(kqwl, kqr, kqr->tr_qos,
5863 		    WORKQ_THREADREQ_MAKE_OVERCOMMIT);
5864 	} else {
5865 		kqr->tr_flags |= WORKQ_TR_FLAG_OVERCOMMIT;
5866 	}
5867 }
5868 
5869 static void
kqworkq_update_override(struct kqworkq * kqwq,struct knote * kn,kq_index_t override_index)5870 kqworkq_update_override(struct kqworkq *kqwq, struct knote *kn,
5871     kq_index_t override_index)
5872 {
5873 	workq_threadreq_t kqr;
5874 	kq_index_t old_override_index;
5875 	kq_index_t queue_index = kn->kn_qos_index;
5876 
5877 	if (override_index <= queue_index) {
5878 		return;
5879 	}
5880 
5881 	kqr = kqworkq_get_request(kqwq, queue_index);
5882 
5883 	kqlock_held(kqwq);
5884 
5885 	old_override_index = kqr->tr_kq_override_index;
5886 	if (override_index > MAX(kqr->tr_kq_qos_index, old_override_index)) {
5887 		thread_t servicer = kqr_thread(kqr);
5888 		kqr->tr_kq_override_index = override_index;
5889 
5890 		/* apply the override to [incoming?] servicing thread */
5891 		if (servicer) {
5892 			if (old_override_index) {
5893 				thread_update_kevent_override(servicer, override_index);
5894 			} else {
5895 				thread_add_kevent_override(servicer, override_index);
5896 			}
5897 		}
5898 	}
5899 }
5900 
5901 static void
kqueue_update_iotier_override(kqueue_t kqu)5902 kqueue_update_iotier_override(kqueue_t kqu)
5903 {
5904 	if (kqu.kq->kq_state & KQ_WORKLOOP) {
5905 		kqworkloop_update_iotier_override(kqu.kqwl);
5906 	}
5907 }
5908 
5909 static void
kqueue_update_override(kqueue_t kqu,struct knote * kn,thread_qos_t qos)5910 kqueue_update_override(kqueue_t kqu, struct knote *kn, thread_qos_t qos)
5911 {
5912 	if (kqu.kq->kq_state & KQ_WORKLOOP) {
5913 		kqworkloop_update_threads_qos(kqu.kqwl, KQWL_UTQ_UPDATE_WAKEUP_OVERRIDE,
5914 		    qos);
5915 	} else {
5916 		kqworkq_update_override(kqu.kqwq, kn, qos);
5917 	}
5918 }
5919 
5920 static void
kqworkloop_unbind_locked(struct kqworkloop * kqwl,thread_t thread,enum kqwl_unbind_locked_mode how)5921 kqworkloop_unbind_locked(struct kqworkloop *kqwl, thread_t thread,
5922     enum kqwl_unbind_locked_mode how)
5923 {
5924 	struct uthread *ut = get_bsdthread_info(thread);
5925 	workq_threadreq_t kqr = &kqwl->kqwl_request;
5926 
5927 	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_UNBIND), kqwl->kqwl_dynamicid,
5928 	    thread_tid(thread), 0, 0);
5929 
5930 	kqlock_held(kqwl);
5931 
5932 	assert(ut->uu_kqr_bound == kqr);
5933 	ut->uu_kqr_bound = NULL;
5934 	if (how == KQWL_OVERRIDE_DROP_IMMEDIATELY &&
5935 	    ut->uu_kqueue_override != THREAD_QOS_UNSPECIFIED) {
5936 		thread_drop_servicer_override(thread);
5937 		ut->uu_kqueue_override = THREAD_QOS_UNSPECIFIED;
5938 	}
5939 
5940 	if (kqwl->kqwl_owner == NULL && kqwl->kqwl_turnstile) {
5941 		turnstile_update_inheritor(kqwl->kqwl_turnstile,
5942 		    TURNSTILE_INHERITOR_NULL, TURNSTILE_IMMEDIATE_UPDATE);
5943 		turnstile_update_inheritor_complete(kqwl->kqwl_turnstile,
5944 		    TURNSTILE_INTERLOCK_HELD);
5945 	}
5946 
5947 #if CONFIG_PREADOPT_TG
5948 	/* The kqueue is able to adopt a thread group again */
5949 
5950 	thread_group_qos_t old_tg, new_tg = NULL;
5951 	int ret = os_atomic_rmw_loop(kqr_preadopt_thread_group_addr(kqr), old_tg, new_tg, relaxed, {
5952 		new_tg = old_tg;
5953 		if (old_tg == KQWL_PREADOPTED_TG_SENTINEL || old_tg == KQWL_PREADOPTED_TG_PROCESSED) {
5954 		        new_tg = KQWL_PREADOPTED_TG_NULL;
5955 		}
5956 	});
5957 
5958 	if (ret) {
5959 		KQWL_PREADOPT_TG_HISTORY_WRITE_ENTRY(kqwl, KQWL_PREADOPT_OP_SERVICER_UNBIND, old_tg, KQWL_PREADOPTED_TG_NULL);
5960 		// Servicer can drop any preadopt thread group it has since it has
5961 		// unbound.
5962 		thread_set_preadopt_thread_group(thread, NULL);
5963 	}
5964 #endif
5965 	thread_update_servicer_iotier_override(thread, THROTTLE_LEVEL_END);
5966 
5967 	kqr->tr_thread = THREAD_NULL;
5968 	kqr->tr_state = WORKQ_TR_STATE_IDLE;
5969 	kqwl->kqwl_state &= ~KQ_R2K_ARMED;
5970 }
5971 
5972 static void
kqworkloop_unbind_delayed_override_drop(thread_t thread)5973 kqworkloop_unbind_delayed_override_drop(thread_t thread)
5974 {
5975 	struct uthread *ut = get_bsdthread_info(thread);
5976 	assert(ut->uu_kqr_bound == NULL);
5977 	if (ut->uu_kqueue_override != THREAD_QOS_UNSPECIFIED) {
5978 		thread_drop_servicer_override(thread);
5979 		ut->uu_kqueue_override = THREAD_QOS_UNSPECIFIED;
5980 	}
5981 }
5982 
/*
 *	kqworkloop_unbind - Unbind the servicer thread of a workloop kqueue
 *
 *	It will acknowledge events, and possibly request a new thread if:
 *	- there were active events left
 *	- we pended waitq hook callouts during processing
 *	- we pended wakeups while processing (or unsuppressing)
 *
 *	Called on the servicer thread itself, with the kqueue lock NOT held:
 *	the lock is taken and dropped internally. A reference on the workloop
 *	is released before returning.
 */
5993 static void
kqworkloop_unbind(struct kqworkloop * kqwl)5994 kqworkloop_unbind(struct kqworkloop *kqwl)
5995 {
5996 	struct kqueue *kq = &kqwl->kqwl_kqueue;
5997 	workq_threadreq_t kqr = &kqwl->kqwl_request;
5998 	thread_t thread = kqr_thread_fast(kqr);
5999 	int op = KQWL_UTQ_PARKING;
6000 	kq_index_t qos_override = THREAD_QOS_UNSPECIFIED;
6001 
6002 	assert(thread == current_thread());
6003 
6004 	kqlock(kqwl);
6005 
6006 	/*
6007 	 * Forcing the KQ_PROCESSING flag allows for QoS updates because of
6008 	 * unsuppressing knotes not to be applied until the eventual call to
6009 	 * kqworkloop_update_threads_qos() below.
6010 	 */
6011 	assert((kq->kq_state & KQ_PROCESSING) == 0);
6012 	if (!TAILQ_EMPTY(&kqwl->kqwl_suppressed)) {
6013 		kq->kq_state |= KQ_PROCESSING;
6014 		qos_override = kqworkloop_acknowledge_events(kqwl);
6015 		kq->kq_state &= ~KQ_PROCESSING;
6016 	}
6017 
6018 	kqworkloop_unbind_locked(kqwl, thread, KQWL_OVERRIDE_DROP_DELAYED);
6019 	kqworkloop_update_threads_qos(kqwl, op, qos_override);
6020 
6021 	kqunlock(kqwl);
6022 
6023 	/*
6024 	 * Drop the override on the current thread last, after the call to
6025 	 * kqworkloop_update_threads_qos above.
6026 	 */
6027 	kqworkloop_unbind_delayed_override_drop(thread);
6028 
6029 	/* If last reference, dealloc the workloop kq */
6030 	kqworkloop_release(kqwl);
6031 }
6032 
6033 static thread_qos_t
kqworkq_unbind_locked(struct kqworkq * kqwq,workq_threadreq_t kqr,thread_t thread)6034 kqworkq_unbind_locked(struct kqworkq *kqwq,
6035     workq_threadreq_t kqr, thread_t thread)
6036 {
6037 	struct uthread *ut = get_bsdthread_info(thread);
6038 	kq_index_t old_override = kqr->tr_kq_override_index;
6039 
6040 	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWQ_UNBIND), -1,
6041 	    thread_tid(kqr_thread(kqr)), kqr->tr_kq_qos_index, 0);
6042 
6043 	kqlock_held(kqwq);
6044 
6045 	assert(ut->uu_kqr_bound == kqr);
6046 	ut->uu_kqr_bound = NULL;
6047 	kqr->tr_thread = THREAD_NULL;
6048 	kqr->tr_state = WORKQ_TR_STATE_IDLE;
6049 	kqr->tr_kq_override_index = THREAD_QOS_UNSPECIFIED;
6050 	kqwq->kqwq_state &= ~KQ_R2K_ARMED;
6051 
6052 	return old_override;
6053 }
6054 
/*
 *	kqworkq_unbind - unbind of a workq kqueue from a thread
 *
 *	We may have to request new threads.
 *	This can happen when there are no waiting processing threads and:
 *	- there were active events we never got to (count > 0)
 *	- we pended waitq hook callouts during processing
 *	- we pended wakeups while processing (or unsuppressing)
 */
6064 static void
kqworkq_unbind(proc_t p,workq_threadreq_t kqr)6065 kqworkq_unbind(proc_t p, workq_threadreq_t kqr)
6066 {
6067 	struct kqworkq *kqwq = (struct kqworkq *)p->p_fd.fd_wqkqueue;
6068 	__assert_only int rc;
6069 
6070 	kqlock(kqwq);
6071 	rc = kqworkq_acknowledge_events(kqwq, kqr, 0, KQWQAE_UNBIND);
6072 	assert(rc == -1);
6073 	kqunlock(kqwq);
6074 }
6075 
6076 workq_threadreq_t
kqworkq_get_request(struct kqworkq * kqwq,kq_index_t qos_index)6077 kqworkq_get_request(struct kqworkq *kqwq, kq_index_t qos_index)
6078 {
6079 	assert(qos_index > 0 && qos_index <= KQWQ_NBUCKETS);
6080 	return &kqwq->kqwq_request[qos_index - 1];
6081 }
6082 
6083 static void
knote_reset_priority(kqueue_t kqu,struct knote * kn,pthread_priority_t pp)6084 knote_reset_priority(kqueue_t kqu, struct knote *kn, pthread_priority_t pp)
6085 {
6086 	kq_index_t qos = _pthread_priority_thread_qos(pp);
6087 
6088 	if (kqu.kq->kq_state & KQ_WORKLOOP) {
6089 		assert((pp & _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG) == 0);
6090 		pp = _pthread_priority_normalize(pp);
6091 	} else if (kqu.kq->kq_state & KQ_WORKQ) {
6092 		if (qos == THREAD_QOS_UNSPECIFIED) {
6093 			/* On workqueues, outside of QoS means MANAGER */
6094 			qos = KQWQ_QOS_MANAGER;
6095 			pp = _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG;
6096 		} else {
6097 			pp = _pthread_priority_normalize(pp);
6098 		}
6099 	} else {
6100 		pp = _pthread_unspecified_priority();
6101 		qos = THREAD_QOS_UNSPECIFIED;
6102 	}
6103 
6104 	kn->kn_qos = (int32_t)pp;
6105 
6106 	if ((kn->kn_status & KN_MERGE_QOS) == 0 || qos > kn->kn_qos_override) {
6107 		/* Never lower QoS when in "Merge" mode */
6108 		kn->kn_qos_override = qos;
6109 	}
6110 
6111 	/* only adjust in-use qos index when not suppressed */
6112 	if (kn->kn_status & KN_SUPPRESSED) {
6113 		kqueue_update_override(kqu, kn, qos);
6114 	} else if (kn->kn_qos_index != qos) {
6115 		knote_dequeue(kqu, kn);
6116 		kn->kn_qos_index = qos;
6117 	}
6118 }
6119 
6120 static void
knote_adjust_qos(struct kqueue * kq,struct knote * kn,int result)6121 knote_adjust_qos(struct kqueue *kq, struct knote *kn, int result)
6122 {
6123 	thread_qos_t qos_index = (result >> FILTER_ADJUST_EVENT_QOS_SHIFT) & 7;
6124 
6125 	kqlock_held(kq);
6126 
6127 	assert(result & FILTER_ADJUST_EVENT_QOS_BIT);
6128 	assert(qos_index < THREAD_QOS_LAST);
6129 
6130 	/*
6131 	 * Early exit for knotes that should not change QoS
6132 	 */
6133 	if (__improbable(!knote_fops(kn)->f_adjusts_qos)) {
6134 		panic("filter %d cannot change QoS", kn->kn_filtid);
6135 	} else if (__improbable(!knote_has_qos(kn))) {
6136 		return;
6137 	}
6138 
6139 	/*
6140 	 * knotes with the FALLBACK flag will only use their registration QoS if the
6141 	 * incoming event has no QoS, else, the registration QoS acts as a floor.
6142 	 */
6143 	thread_qos_t req_qos = _pthread_priority_thread_qos_fast(kn->kn_qos);
6144 	if (kn->kn_qos & _PTHREAD_PRIORITY_FALLBACK_FLAG) {
6145 		if (qos_index == THREAD_QOS_UNSPECIFIED) {
6146 			qos_index = req_qos;
6147 		}
6148 	} else {
6149 		if (qos_index < req_qos) {
6150 			qos_index = req_qos;
6151 		}
6152 	}
6153 	if ((kn->kn_status & KN_MERGE_QOS) && (qos_index < kn->kn_qos_override)) {
6154 		/* Never lower QoS when in "Merge" mode */
6155 		return;
6156 	}
6157 
6158 	if ((kn->kn_status & KN_LOCKED) && (kn->kn_status & KN_POSTING)) {
6159 		/*
6160 		 * When we're trying to update the QoS override and that both an
6161 		 * f_event() and other f_* calls are running concurrently, any of these
6162 		 * in flight calls may want to perform overrides that aren't properly
6163 		 * serialized with each other.
6164 		 *
6165 		 * The first update that observes this racy situation enters a "Merge"
6166 		 * mode which causes subsequent override requests to saturate the
6167 		 * override instead of replacing its value.
6168 		 *
6169 		 * This mode is left when knote_unlock() or knote_post()
6170 		 * observe that no other f_* routine is in flight.
6171 		 */
6172 		kn->kn_status |= KN_MERGE_QOS;
6173 	}
6174 
6175 	/*
6176 	 * Now apply the override if it changed.
6177 	 */
6178 
6179 	if (kn->kn_qos_override == qos_index) {
6180 		return;
6181 	}
6182 
6183 	kn->kn_qos_override = qos_index;
6184 
6185 	if (kn->kn_status & KN_SUPPRESSED) {
6186 		/*
6187 		 * For suppressed events, the kn_qos_index field cannot be touched as it
6188 		 * allows us to know on which supress queue the knote is for a kqworkq.
6189 		 *
6190 		 * Also, there's no natural push applied on the kqueues when this field
6191 		 * changes anyway. We hence need to apply manual overrides in this case,
6192 		 * which will be cleared when the events are later acknowledged.
6193 		 */
6194 		kqueue_update_override(kq, kn, qos_index);
6195 	} else if (kn->kn_qos_index != qos_index) {
6196 		knote_dequeue(kq, kn);
6197 		kn->kn_qos_index = qos_index;
6198 	}
6199 }
6200 
/*
 * Initialize `list` as an empty knote list.
 */
void
klist_init(struct klist *list)
{
	SLIST_INIT(list);
}
6206 
6207 
6208 /*
6209  *	Query/Post each knote in the object's list
6210  *
6211  *	The object lock protects the list. It is assumed that the filter/event
6212  *	routine for the object can determine that the object is already locked (via
6213  *	the hint) and not deadlock itself.
6214  *
6215  *	Autodetach is a specific contract which will detach all knotes from the
6216  *	object prior to posting the final event for that knote. This is done while
6217  *	under the object lock. A breadcrumb is left in the knote's next pointer to
6218  *	indicate to future calls to f_detach routines that they need not reattempt
6219  *	to knote_detach from the object's klist again. This is currently used by
6220  *	EVFILTID_SPEC, EVFILTID_TTY, EVFILTID_PTMX
6221  *
6222  */
6223 void
knote(struct klist * list,long hint,bool autodetach)6224 knote(struct klist *list, long hint, bool autodetach)
6225 {
6226 	struct knote *kn;
6227 	struct knote *tmp_kn;
6228 	SLIST_FOREACH_SAFE(kn, list, kn_selnext, tmp_kn) {
6229 		/*
6230 		 * We can modify the knote's next pointer since since we are holding the
6231 		 * object lock and the list can't be concurrently modified. Anyone
6232 		 * determining auto-detached-ness of a knote should take the primitive lock
6233 		 * to synchronize.
6234 		 *
6235 		 * Note that we do this here instead of the filter's f_event since we may
6236 		 * not even post the event if the knote is being dropped.
6237 		 */
6238 		if (autodetach) {
6239 			kn->kn_selnext.sle_next = KNOTE_AUTODETACHED;
6240 		}
6241 		knote_post(kn, hint);
6242 	}
6243 
6244 	/* Blast away the entire klist */
6245 	if (autodetach) {
6246 		klist_init(list);
6247 	}
6248 }
6249 
6250 /*
6251  * attach a knote to the specified list.  Return true if this is the first entry.
6252  * The list is protected by whatever lock the object it is associated with uses.
6253  */
6254 int
knote_attach(struct klist * list,struct knote * kn)6255 knote_attach(struct klist *list, struct knote *kn)
6256 {
6257 	int ret = SLIST_EMPTY(list);
6258 	SLIST_INSERT_HEAD(list, kn, kn_selnext);
6259 	return ret;
6260 }
6261 
6262 /*
6263  * detach a knote from the specified list.  Return true if that was the last
6264  * entry.  The list is protected by whatever lock the object it is associated
6265  * with uses.
6266  */
6267 int
knote_detach(struct klist * list,struct knote * kn)6268 knote_detach(struct klist *list, struct knote *kn)
6269 {
6270 	assert(!KNOTE_IS_AUTODETACHED(kn));
6271 
6272 	SLIST_REMOVE(list, kn, knote, kn_selnext);
6273 	return SLIST_EMPTY(list);
6274 }
6275 
/*
 * knote_vanish - Indicate that the source has vanished
 *
 * Used only for vanishing ports - vanishing fds go
 * through knote_fdclose()
 *
 * If the knote has requested EV_VANISHED delivery,
 * arrange for that. Otherwise, deliver a one-shot EV_EOF
 * event for backward compatibility.
 *
 * The knote is marked as having vanished. The source's
 * reference to the knote is dropped by caller, but the knote's
 * source reference is only cleaned up later when the knote is dropped.
 *
 * Our caller already has the object lock held. Calling
 * the detach routine would try to take that lock
 * recursively - which likely is not supported.
 */
6294 void
knote_vanish(struct klist * list,bool make_active)6295 knote_vanish(struct klist *list, bool make_active)
6296 {
6297 	struct knote *kn;
6298 	struct knote *kn_next;
6299 
6300 	SLIST_FOREACH_SAFE(kn, list, kn_selnext, kn_next) {
6301 		struct kqueue *kq = knote_get_kq(kn);
6302 
6303 		kqlock(kq);
6304 		if (__probable(kn->kn_status & KN_REQVANISH)) {
6305 			/*
6306 			 * If EV_VANISH supported - prepare to deliver one
6307 			 */
6308 			kn->kn_status |= KN_VANISHED;
6309 		} else {
6310 			/*
6311 			 * Handle the legacy way to indicate that the port/portset was
6312 			 * deallocated or left the current Mach portspace (modern technique
6313 			 * is with an EV_VANISHED protocol).
6314 			 *
6315 			 * Deliver an EV_EOF event for these changes (hopefully it will get
6316 			 * delivered before the port name recycles to the same generation
6317 			 * count and someone tries to re-register a kevent for it or the
6318 			 * events are udata-specific - avoiding a conflict).
6319 			 */
6320 			kn->kn_flags |= EV_EOF | EV_ONESHOT;
6321 		}
6322 		if (make_active) {
6323 			knote_activate(kq, kn, FILTER_ACTIVE);
6324 		}
6325 		kqunlock(kq);
6326 	}
6327 }
6328 
6329 /*
6330  * remove all knotes referencing a specified fd
6331  *
6332  * Entered with the proc_fd lock already held.
6333  * It returns the same way, but may drop it temporarily.
6334  */
6335 void
knote_fdclose(struct proc * p,int fd)6336 knote_fdclose(struct proc *p, int fd)
6337 {
6338 	struct filedesc *fdt = &p->p_fd;
6339 	struct klist *list;
6340 	struct knote *kn;
6341 	KNOTE_LOCK_CTX(knlc);
6342 
6343 restart:
6344 	list = &fdt->fd_knlist[fd];
6345 	SLIST_FOREACH(kn, list, kn_link) {
6346 		struct kqueue *kq = knote_get_kq(kn);
6347 
6348 		kqlock(kq);
6349 
6350 		if (kq->kq_p != p) {
6351 			panic("%s: proc mismatch (kq->kq_p=%p != p=%p)",
6352 			    __func__, kq->kq_p, p);
6353 		}
6354 
6355 		/*
6356 		 * If the knote supports EV_VANISHED delivery,
6357 		 * transition it to vanished mode (or skip over
6358 		 * it if already vanished).
6359 		 */
6360 		if (kn->kn_status & KN_VANISHED) {
6361 			kqunlock(kq);
6362 			continue;
6363 		}
6364 
6365 		proc_fdunlock(p);
6366 		if (!knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ON_SUCCESS)) {
6367 			/* the knote was dropped by someone, nothing to do */
6368 		} else if (kn->kn_status & KN_REQVANISH) {
6369 			/*
6370 			 * Since we have REQVANISH for this knote, we need to notify clients about
6371 			 * the EV_VANISHED.
6372 			 *
6373 			 * But unlike mach ports, we want to do the detach here as well and not
6374 			 * defer it so that we can release the iocount that is on the knote and
6375 			 * close the fp.
6376 			 */
6377 			kn->kn_status |= KN_VANISHED;
6378 
6379 			/*
6380 			 * There may be a concurrent post happening, make sure to wait for it
6381 			 * before we detach. knote_wait_for_post() unlocks on kq on exit
6382 			 */
6383 			knote_wait_for_post(kq, kn);
6384 
6385 			knote_fops(kn)->f_detach(kn);
6386 			if (kn->kn_is_fd) {
6387 				fp_drop(p, (int)kn->kn_id, kn->kn_fp, 0);
6388 			}
6389 			kn->kn_filtid = EVFILTID_DETACHED;
6390 			kqlock(kq);
6391 
6392 			knote_activate(kq, kn, FILTER_ACTIVE);
6393 			knote_unlock(kq, kn, &knlc, KNOTE_KQ_UNLOCK);
6394 		} else {
6395 			knote_drop(kq, kn, &knlc);
6396 		}
6397 
6398 		proc_fdlock(p);
6399 		goto restart;
6400 	}
6401 }
6402 
6403 /*
6404  * knote_fdfind - lookup a knote in the fd table for process
6405  *
6406  * If the filter is file-based, lookup based on fd index.
6407  * Otherwise use a hash based on the ident.
6408  *
6409  * Matching is based on kq, filter, and ident. Optionally,
6410  * it may also be based on the udata field in the kevent -
6411  * allowing multiple event registration for the file object
6412  * per kqueue.
6413  *
6414  * fd_knhashlock or fdlock held on entry (and exit)
6415  */
6416 static struct knote *
knote_fdfind(struct kqueue * kq,const struct kevent_internal_s * kev,bool is_fd,struct proc * p)6417 knote_fdfind(struct kqueue *kq,
6418     const struct kevent_internal_s *kev,
6419     bool is_fd,
6420     struct proc *p)
6421 {
6422 	struct filedesc *fdp = &p->p_fd;
6423 	struct klist *list = NULL;
6424 	struct knote *kn = NULL;
6425 
6426 	/*
6427 	 * determine where to look for the knote
6428 	 */
6429 	if (is_fd) {
6430 		/* fd-based knotes are linked off the fd table */
6431 		if (kev->kei_ident < (u_int)fdp->fd_knlistsize) {
6432 			list = &fdp->fd_knlist[kev->kei_ident];
6433 		}
6434 	} else if (fdp->fd_knhashmask != 0) {
6435 		/* hash non-fd knotes here too */
6436 		list = &fdp->fd_knhash[KN_HASH((u_long)kev->kei_ident, fdp->fd_knhashmask)];
6437 	}
6438 
6439 	/*
6440 	 * scan the selected list looking for a match
6441 	 */
6442 	if (list != NULL) {
6443 		SLIST_FOREACH(kn, list, kn_link) {
6444 			if (kq == knote_get_kq(kn) &&
6445 			    kev->kei_ident == kn->kn_id &&
6446 			    kev->kei_filter == kn->kn_filter) {
6447 				if (kev->kei_flags & EV_UDATA_SPECIFIC) {
6448 					if ((kn->kn_flags & EV_UDATA_SPECIFIC) &&
6449 					    kev->kei_udata == kn->kn_udata) {
6450 						break; /* matching udata-specific knote */
6451 					}
6452 				} else if ((kn->kn_flags & EV_UDATA_SPECIFIC) == 0) {
6453 					break; /* matching non-udata-specific knote */
6454 				}
6455 			}
6456 		}
6457 	}
6458 	return kn;
6459 }
6460 
6461 /*
6462  * kq_add_knote- Add knote to the fd table for process
6463  * while checking for duplicates.
6464  *
6465  * All file-based filters associate a list of knotes by file
6466  * descriptor index. All other filters hash the knote by ident.
6467  *
6468  * May have to grow the table of knote lists to cover the
6469  * file descriptor index presented.
6470  *
6471  * fd_knhashlock and fdlock unheld on entry (and exit).
6472  *
6473  * Takes a rwlock boost if inserting the knote is successful.
6474  */
6475 static int
kq_add_knote(struct kqueue * kq,struct knote * kn,struct knote_lock_ctx * knlc,struct proc * p)6476 kq_add_knote(struct kqueue *kq, struct knote *kn, struct knote_lock_ctx *knlc,
6477     struct proc *p)
6478 {
6479 	struct filedesc *fdp = &p->p_fd;
6480 	struct klist *list = NULL;
6481 	int ret = 0;
6482 	bool is_fd = kn->kn_is_fd;
6483 
6484 	if (is_fd) {
6485 		proc_fdlock(p);
6486 	} else {
6487 		knhash_lock(fdp);
6488 	}
6489 
6490 	if (knote_fdfind(kq, &kn->kn_kevent, is_fd, p) != NULL) {
6491 		/* found an existing knote: we can't add this one */
6492 		ret = ERESTART;
6493 		goto out_locked;
6494 	}
6495 
6496 	/* knote was not found: add it now */
6497 	if (!is_fd) {
6498 		if (fdp->fd_knhashmask == 0) {
6499 			u_long size = 0;
6500 
6501 			list = hashinit(CONFIG_KN_HASHSIZE, M_KQUEUE, &size);
6502 			if (list == NULL) {
6503 				ret = ENOMEM;
6504 				goto out_locked;
6505 			}
6506 
6507 			fdp->fd_knhash = list;
6508 			fdp->fd_knhashmask = size;
6509 		}
6510 
6511 		list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];
6512 		SLIST_INSERT_HEAD(list, kn, kn_link);
6513 		ret = 0;
6514 		goto out_locked;
6515 	} else {
6516 		/* knote is fd based */
6517 
6518 		if ((u_int)fdp->fd_knlistsize <= kn->kn_id) {
6519 			u_int size = 0;
6520 
6521 			/* Make sure that fd stays below current process's soft limit AND system allowed per-process limits */
6522 			if (kn->kn_id >= (uint64_t)proc_limitgetcur_nofile(p)) {
6523 				ret = EINVAL;
6524 				goto out_locked;
6525 			}
6526 			/* have to grow the fd_knlist */
6527 			size = fdp->fd_knlistsize;
6528 			while (size <= kn->kn_id) {
6529 				size += KQEXTENT;
6530 			}
6531 
6532 			if (size >= (UINT_MAX / sizeof(struct klist))) {
6533 				ret = EINVAL;
6534 				goto out_locked;
6535 			}
6536 
6537 			list = kalloc_type(struct klist, size, Z_WAITOK | Z_ZERO);
6538 			if (list == NULL) {
6539 				ret = ENOMEM;
6540 				goto out_locked;
6541 			}
6542 
6543 			bcopy(fdp->fd_knlist, list,
6544 			    fdp->fd_knlistsize * sizeof(struct klist));
6545 			kfree_type(struct klist, fdp->fd_knlistsize, fdp->fd_knlist);
6546 			fdp->fd_knlist = list;
6547 			fdp->fd_knlistsize = size;
6548 		}
6549 
6550 		list = &fdp->fd_knlist[kn->kn_id];
6551 		SLIST_INSERT_HEAD(list, kn, kn_link);
6552 		ret = 0;
6553 		goto out_locked;
6554 	}
6555 
6556 out_locked:
6557 	if (ret == 0) {
6558 		kqlock(kq);
6559 		assert((kn->kn_status & KN_LOCKED) == 0);
6560 		(void)knote_lock(kq, kn, knlc, KNOTE_KQ_UNLOCK);
6561 		kqueue_retain(kq); /* retain a kq ref */
6562 	}
6563 	if (is_fd) {
6564 		proc_fdunlock(p);
6565 	} else {
6566 		knhash_unlock(fdp);
6567 	}
6568 
6569 	return ret;
6570 }
6571 
6572 /*
6573  * kq_remove_knote - remove a knote from the fd table for process
6574  *
6575  * If the filter is file-based, remove based on fd index.
6576  * Otherwise remove from the hash based on the ident.
6577  *
6578  * fd_knhashlock and fdlock unheld on entry (and exit).
6579  */
6580 static void
kq_remove_knote(struct kqueue * kq,struct knote * kn,struct proc * p,struct knote_lock_ctx * knlc)6581 kq_remove_knote(struct kqueue *kq, struct knote *kn, struct proc *p,
6582     struct knote_lock_ctx *knlc)
6583 {
6584 	struct filedesc *fdp = &p->p_fd;
6585 	struct klist *list = NULL;
6586 	uint16_t kq_state;
6587 	bool is_fd = kn->kn_is_fd;
6588 
6589 	if (is_fd) {
6590 		proc_fdlock(p);
6591 	} else {
6592 		knhash_lock(fdp);
6593 	}
6594 
6595 	if (is_fd) {
6596 		assert((u_int)fdp->fd_knlistsize > kn->kn_id);
6597 		list = &fdp->fd_knlist[kn->kn_id];
6598 	} else {
6599 		list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];
6600 	}
6601 	SLIST_REMOVE(list, kn, knote, kn_link);
6602 
6603 	kqlock(kq);
6604 
6605 	/* Update the servicer iotier override */
6606 	kqueue_update_iotier_override(kq);
6607 
6608 	kq_state = kq->kq_state;
6609 	if (knlc) {
6610 		knote_unlock_cancel(kq, kn, knlc);
6611 	} else {
6612 		kqunlock(kq);
6613 	}
6614 	if (is_fd) {
6615 		proc_fdunlock(p);
6616 	} else {
6617 		knhash_unlock(fdp);
6618 	}
6619 
6620 	if (kq_state & KQ_DYNAMIC) {
6621 		kqworkloop_release((struct kqworkloop *)kq);
6622 	}
6623 }
6624 
6625 /*
6626  * kq_find_knote_and_kq_lock - lookup a knote in the fd table for process
6627  * and, if the knote is found, acquires the kqlock while holding the fd table lock/spinlock.
6628  *
6629  * fd_knhashlock or fdlock unheld on entry (and exit)
6630  */
6631 
6632 static struct knote *
kq_find_knote_and_kq_lock(struct kqueue * kq,struct kevent_qos_s * kev,bool is_fd,struct proc * p)6633 kq_find_knote_and_kq_lock(struct kqueue *kq, struct kevent_qos_s *kev,
6634     bool is_fd, struct proc *p)
6635 {
6636 	struct filedesc *fdp = &p->p_fd;
6637 	struct knote *kn;
6638 
6639 	if (is_fd) {
6640 		proc_fdlock(p);
6641 	} else {
6642 		knhash_lock(fdp);
6643 	}
6644 
6645 	/*
6646 	 * Temporary horrible hack:
6647 	 * this cast is gross and will go away in a future change.
6648 	 * It is OK to do because we don't look at xflags/s_fflags,
6649 	 * and that when we cast down the kev this way,
6650 	 * the truncated filter field works.
6651 	 */
6652 	kn = knote_fdfind(kq, (struct kevent_internal_s *)kev, is_fd, p);
6653 
6654 	if (kn) {
6655 		kqlock(kq);
6656 		assert(knote_get_kq(kn) == kq);
6657 	}
6658 
6659 	if (is_fd) {
6660 		proc_fdunlock(p);
6661 	} else {
6662 		knhash_unlock(fdp);
6663 	}
6664 
6665 	return kn;
6666 }
6667 
6668 static struct kqtailq *
knote_get_tailq(kqueue_t kqu,struct knote * kn)6669 knote_get_tailq(kqueue_t kqu, struct knote *kn)
6670 {
6671 	kq_index_t qos_index = kn->kn_qos_index;
6672 
6673 	if (kqu.kq->kq_state & KQ_WORKLOOP) {
6674 		assert(qos_index > 0 && qos_index <= KQWL_NBUCKETS);
6675 		return &kqu.kqwl->kqwl_queue[qos_index - 1];
6676 	} else if (kqu.kq->kq_state & KQ_WORKQ) {
6677 		assert(qos_index > 0 && qos_index <= KQWQ_NBUCKETS);
6678 		return &kqu.kqwq->kqwq_queue[qos_index - 1];
6679 	} else {
6680 		assert(qos_index == QOS_INDEX_KQFILE);
6681 		return &kqu.kqf->kqf_queue;
6682 	}
6683 }
6684 
6685 static void
knote_enqueue(kqueue_t kqu,struct knote * kn)6686 knote_enqueue(kqueue_t kqu, struct knote *kn)
6687 {
6688 	kqlock_held(kqu);
6689 
6690 	if ((kn->kn_status & KN_ACTIVE) == 0) {
6691 		return;
6692 	}
6693 
6694 	if (kn->kn_status & (KN_DISABLED | KN_SUPPRESSED | KN_DROPPING | KN_QUEUED)) {
6695 		return;
6696 	}
6697 
6698 	struct kqtailq *queue = knote_get_tailq(kqu, kn);
6699 	bool wakeup = TAILQ_EMPTY(queue);
6700 
6701 	TAILQ_INSERT_TAIL(queue, kn, kn_tqe);
6702 	kn->kn_status |= KN_QUEUED;
6703 	kqu.kq->kq_count++;
6704 
6705 	if (wakeup) {
6706 		if (kqu.kq->kq_state & KQ_WORKLOOP) {
6707 			kqworkloop_wakeup(kqu.kqwl, kn->kn_qos_index);
6708 		} else if (kqu.kq->kq_state & KQ_WORKQ) {
6709 			kqworkq_wakeup(kqu.kqwq, kn->kn_qos_index);
6710 		} else {
6711 			kqfile_wakeup(kqu.kqf, 0, THREAD_AWAKENED);
6712 		}
6713 	}
6714 }
6715 
6716 __attribute__((always_inline))
6717 static inline void
knote_dequeue(kqueue_t kqu,struct knote * kn)6718 knote_dequeue(kqueue_t kqu, struct knote *kn)
6719 {
6720 	if (kn->kn_status & KN_QUEUED) {
6721 		struct kqtailq *queue = knote_get_tailq(kqu, kn);
6722 
6723 		// attaching the knote calls knote_reset_priority() without
6724 		// the kqlock which is fine, so we can't call kqlock_held()
6725 		// if we're not queued.
6726 		kqlock_held(kqu);
6727 
6728 		TAILQ_REMOVE(queue, kn, kn_tqe);
6729 		kn->kn_status &= ~KN_QUEUED;
6730 		kqu.kq->kq_count--;
6731 		if ((kqu.kq->kq_state & (KQ_WORKQ | KQ_WORKLOOP)) == 0) {
6732 			assert((kqu.kq->kq_count == 0) ==
6733 			    (bool)TAILQ_EMPTY(queue));
6734 		}
6735 	}
6736 }
6737 
6738 /* called with kqueue lock held */
6739 static void
knote_suppress(kqueue_t kqu,struct knote * kn)6740 knote_suppress(kqueue_t kqu, struct knote *kn)
6741 {
6742 	struct kqtailq *suppressq;
6743 
6744 	kqlock_held(kqu);
6745 
6746 	assert((kn->kn_status & KN_SUPPRESSED) == 0);
6747 	assert(kn->kn_status & KN_QUEUED);
6748 
6749 	knote_dequeue(kqu, kn);
6750 	/* deactivate - so new activations indicate a wakeup */
6751 	kn->kn_status &= ~KN_ACTIVE;
6752 	kn->kn_status |= KN_SUPPRESSED;
6753 	suppressq = kqueue_get_suppressed_queue(kqu, kn);
6754 	TAILQ_INSERT_TAIL(suppressq, kn, kn_tqe);
6755 }
6756 
6757 __attribute__((always_inline))
6758 static inline void
knote_unsuppress_noqueue(kqueue_t kqu,struct knote * kn)6759 knote_unsuppress_noqueue(kqueue_t kqu, struct knote *kn)
6760 {
6761 	struct kqtailq *suppressq;
6762 
6763 	kqlock_held(kqu);
6764 
6765 	assert(kn->kn_status & KN_SUPPRESSED);
6766 
6767 	kn->kn_status &= ~KN_SUPPRESSED;
6768 	suppressq = kqueue_get_suppressed_queue(kqu, kn);
6769 	TAILQ_REMOVE(suppressq, kn, kn_tqe);
6770 
6771 	/*
6772 	 * If the knote is no longer active, reset its push,
6773 	 * and resynchronize kn_qos_index with kn_qos_override
6774 	 * for knotes with a real qos.
6775 	 */
6776 	if ((kn->kn_status & KN_ACTIVE) == 0 && knote_has_qos(kn)) {
6777 		kn->kn_qos_override = _pthread_priority_thread_qos_fast(kn->kn_qos);
6778 	}
6779 	kn->kn_qos_index = kn->kn_qos_override;
6780 }
6781 
6782 /* called with kqueue lock held */
6783 static void
knote_unsuppress(kqueue_t kqu,struct knote * kn)6784 knote_unsuppress(kqueue_t kqu, struct knote *kn)
6785 {
6786 	knote_unsuppress_noqueue(kqu, kn);
6787 	knote_enqueue(kqu, kn);
6788 }
6789 
6790 __attribute__((always_inline))
6791 static inline void
knote_mark_active(struct knote * kn)6792 knote_mark_active(struct knote *kn)
6793 {
6794 	if ((kn->kn_status & KN_ACTIVE) == 0) {
6795 		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KNOTE_ACTIVATE),
6796 		    kn->kn_udata, kn->kn_status | (kn->kn_id << 32),
6797 		    kn->kn_filtid);
6798 	}
6799 
6800 	kn->kn_status |= KN_ACTIVE;
6801 }
6802 
6803 /* called with kqueue lock held */
6804 static void
knote_activate(kqueue_t kqu,struct knote * kn,int result)6805 knote_activate(kqueue_t kqu, struct knote *kn, int result)
6806 {
6807 	assert(result & FILTER_ACTIVE);
6808 	if (result & FILTER_ADJUST_EVENT_QOS_BIT) {
6809 		// may dequeue the knote
6810 		knote_adjust_qos(kqu.kq, kn, result);
6811 	}
6812 	knote_mark_active(kn);
6813 	knote_enqueue(kqu, kn);
6814 }
6815 
6816 /*
6817  * This function applies changes requested by f_attach or f_touch for
6818  * a given filter. It proceeds in a carefully chosen order to help
6819  * every single transition do the minimal amount of work possible.
6820  */
6821 static void
knote_apply_touch(kqueue_t kqu,struct knote * kn,struct kevent_qos_s * kev,int result)6822 knote_apply_touch(kqueue_t kqu, struct knote *kn, struct kevent_qos_s *kev,
6823     int result)
6824 {
6825 	if ((kev->flags & EV_ENABLE) && (kn->kn_status & KN_DISABLED)) {
6826 		kn->kn_status &= ~KN_DISABLED;
6827 
6828 		/*
6829 		 * it is possible for userland to have knotes registered for a given
6830 		 * workloop `wl_orig` but really handled on another workloop `wl_new`.
6831 		 *
6832 		 * In that case, rearming will happen from the servicer thread of
6833 		 * `wl_new` which if `wl_orig` is no longer being serviced, would cause
6834 		 * this knote to stay suppressed forever if we only relied on
6835 		 * kqworkloop_acknowledge_events to be called by `wl_orig`.
6836 		 *
6837 		 * However if we see the KQ_PROCESSING bit on `wl_orig` set, we can't
6838 		 * unsuppress because that would mess with the processing phase of
6839 		 * `wl_orig`, however it also means kqworkloop_acknowledge_events()
6840 		 * will be called.
6841 		 */
6842 		if (__improbable(kn->kn_status & KN_SUPPRESSED)) {
6843 			if ((kqu.kq->kq_state & KQ_PROCESSING) == 0) {
6844 				knote_unsuppress_noqueue(kqu, kn);
6845 			}
6846 		}
6847 	}
6848 
6849 	if (result & FILTER_ADJUST_EVENT_IOTIER_BIT) {
6850 		kqueue_update_iotier_override(kqu);
6851 	}
6852 
6853 	if ((result & FILTER_UPDATE_REQ_QOS) && kev->qos && kev->qos != kn->kn_qos) {
6854 		// may dequeue the knote
6855 		knote_reset_priority(kqu, kn, kev->qos);
6856 	}
6857 
6858 	/*
6859 	 * When we unsuppress above, or because of knote_reset_priority(),
6860 	 * the knote may have been dequeued, we need to restore the invariant
6861 	 * that if the knote is active it needs to be queued now that
6862 	 * we're done applying changes.
6863 	 */
6864 	if (result & FILTER_ACTIVE) {
6865 		knote_activate(kqu, kn, result);
6866 	} else {
6867 		knote_enqueue(kqu, kn);
6868 	}
6869 
6870 	if ((result & FILTER_THREADREQ_NODEFEER) &&
6871 	    act_clear_astkevent(current_thread(), AST_KEVENT_REDRIVE_THREADREQ)) {
6872 		workq_kern_threadreq_redrive(kqu.kq->kq_p, WORKQ_THREADREQ_NONE);
6873 	}
6874 }
6875 
6876 /*
6877  * knote_drop - disconnect and drop the knote
6878  *
6879  * Called with the kqueue locked, returns with the kqueue unlocked.
6880  *
6881  * If a knote locking context is passed, it is canceled.
6882  *
6883  * The knote may have already been detached from
6884  * (or not yet attached to) its source object.
6885  */
6886 static void
knote_drop(struct kqueue * kq,struct knote * kn,struct knote_lock_ctx * knlc)6887 knote_drop(struct kqueue *kq, struct knote *kn, struct knote_lock_ctx *knlc)
6888 {
6889 	struct proc *p = kq->kq_p;
6890 
6891 	kqlock_held(kq);
6892 
6893 	assert((kn->kn_status & KN_DROPPING) == 0);
6894 	if (knlc == NULL) {
6895 		assert((kn->kn_status & KN_LOCKED) == 0);
6896 	}
6897 	kn->kn_status |= KN_DROPPING;
6898 
6899 	if (kn->kn_status & KN_SUPPRESSED) {
6900 		knote_unsuppress_noqueue(kq, kn);
6901 	} else {
6902 		knote_dequeue(kq, kn);
6903 	}
6904 	knote_wait_for_post(kq, kn);
6905 
6906 	/* Even if we are autodetached, the filter may need to do cleanups of any
6907 	 * stuff stashed on the knote so always make the call and let each filter
6908 	 * handle the possibility of autodetached-ness */
6909 	knote_fops(kn)->f_detach(kn);
6910 
6911 	/* kq may be freed when kq_remove_knote() returns */
6912 	kq_remove_knote(kq, kn, p, knlc);
6913 	if (kn->kn_is_fd && ((kn->kn_status & KN_VANISHED) == 0)) {
6914 		fp_drop(p, (int)kn->kn_id, kn->kn_fp, 0);
6915 	}
6916 
6917 	knote_free(kn);
6918 }
6919 
6920 void
knote_init(void)6921 knote_init(void)
6922 {
6923 #if CONFIG_MEMORYSTATUS
6924 	/* Initialize the memorystatus list lock */
6925 	memorystatus_kevent_init(&kq_lck_grp, LCK_ATTR_NULL);
6926 #endif
6927 }
6928 SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL);
6929 
6930 const struct filterops *
knote_fops(struct knote * kn)6931 knote_fops(struct knote *kn)
6932 {
6933 	return sysfilt_ops[kn->kn_filtid];
6934 }
6935 
6936 static struct knote *
knote_alloc(void)6937 knote_alloc(void)
6938 {
6939 	return zalloc_flags(knote_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);
6940 }
6941 
6942 static void
knote_free(struct knote * kn)6943 knote_free(struct knote *kn)
6944 {
6945 	assert((kn->kn_status & (KN_LOCKED | KN_POSTING)) == 0);
6946 	zfree(knote_zone, kn);
6947 }
6948 
6949 #pragma mark - syscalls: kevent, kevent64, kevent_qos, kevent_id
6950 
6951 kevent_ctx_t
kevent_get_context(thread_t thread)6952 kevent_get_context(thread_t thread)
6953 {
6954 	uthread_t ut = get_bsdthread_info(thread);
6955 	return &ut->uu_save.uus_kevent;
6956 }
6957 
6958 static inline bool
kevent_args_requesting_events(unsigned int flags,int nevents)6959 kevent_args_requesting_events(unsigned int flags, int nevents)
6960 {
6961 	return !(flags & KEVENT_FLAG_ERROR_EVENTS) && nevents > 0;
6962 }
6963 
6964 static inline int
kevent_adjust_flags_for_proc(proc_t p,int flags)6965 kevent_adjust_flags_for_proc(proc_t p, int flags)
6966 {
6967 	__builtin_assume(p);
6968 	return flags | (IS_64BIT_PROCESS(p) ? KEVENT_FLAG_PROC64 : 0);
6969 }
6970 
6971 /*!
6972  * @function kevent_get_kqfile
6973  *
6974  * @brief
6975  * Lookup a kqfile by fd.
6976  *
6977  * @discussion
6978  * Callers: kevent, kevent64, kevent_qos
6979  *
6980  * This is not assumed to be a fastpath (kqfile interfaces are legacy)
6981  */
6982 OS_NOINLINE
6983 static int
kevent_get_kqfile(struct proc * p,int fd,int flags,struct fileproc ** fpp,struct kqueue ** kqp)6984 kevent_get_kqfile(struct proc *p, int fd, int flags,
6985     struct fileproc **fpp, struct kqueue **kqp)
6986 {
6987 	int error = 0;
6988 	struct kqueue *kq;
6989 
6990 	error = fp_get_ftype(p, fd, DTYPE_KQUEUE, EBADF, fpp);
6991 	if (__improbable(error)) {
6992 		return error;
6993 	}
6994 	kq = (struct kqueue *)fp_get_data((*fpp));
6995 
6996 	uint16_t kq_state = os_atomic_load(&kq->kq_state, relaxed);
6997 	if (__improbable((kq_state & (KQ_KEV32 | KQ_KEV64 | KQ_KEV_QOS)) == 0)) {
6998 		kqlock(kq);
6999 		kq_state = kq->kq_state;
7000 		if (!(kq_state & (KQ_KEV32 | KQ_KEV64 | KQ_KEV_QOS))) {
7001 			if (flags & KEVENT_FLAG_LEGACY32) {
7002 				kq_state |= KQ_KEV32;
7003 			} else if (flags & KEVENT_FLAG_LEGACY64) {
7004 				kq_state |= KQ_KEV64;
7005 			} else {
7006 				kq_state |= KQ_KEV_QOS;
7007 			}
7008 			kq->kq_state = kq_state;
7009 		}
7010 		kqunlock(kq);
7011 	}
7012 
7013 	/*
7014 	 * kqfiles can't be used through the legacy kevent()
7015 	 * and other interfaces at the same time.
7016 	 */
7017 	if (__improbable((bool)(flags & KEVENT_FLAG_LEGACY32) !=
7018 	    (bool)(kq_state & KQ_KEV32))) {
7019 		fp_drop(p, fd, *fpp, 0);
7020 		return EINVAL;
7021 	}
7022 
7023 	*kqp = kq;
7024 	return 0;
7025 }
7026 
7027 /*!
7028  * @function kevent_get_kqwq
7029  *
7030  * @brief
7031  * Lookup or create the process kqwq (faspath).
7032  *
7033  * @discussion
7034  * Callers: kevent64, kevent_qos
7035  */
7036 OS_ALWAYS_INLINE
7037 static int
kevent_get_kqwq(proc_t p,int flags,int nevents,struct kqueue ** kqp)7038 kevent_get_kqwq(proc_t p, int flags, int nevents, struct kqueue **kqp)
7039 {
7040 	struct kqworkq *kqwq = p->p_fd.fd_wqkqueue;
7041 
7042 	if (__improbable(kevent_args_requesting_events(flags, nevents))) {
7043 		return EINVAL;
7044 	}
7045 	if (__improbable(kqwq == NULL)) {
7046 		kqwq = kqworkq_alloc(p, flags);
7047 		if (__improbable(kqwq == NULL)) {
7048 			return ENOMEM;
7049 		}
7050 	}
7051 
7052 	*kqp = &kqwq->kqwq_kqueue;
7053 	return 0;
7054 }
7055 
7056 #pragma mark kevent copyio
7057 
7058 /*!
7059  * @function kevent_get_data_size
7060  *
7061  * @brief
7062  * Copies in the extra data size from user-space.
7063  */
7064 static int
kevent_get_data_size(int flags,user_addr_t data_avail,user_addr_t data_out,kevent_ctx_t kectx)7065 kevent_get_data_size(int flags, user_addr_t data_avail, user_addr_t data_out,
7066     kevent_ctx_t kectx)
7067 {
7068 	if (!data_avail || !data_out) {
7069 		kectx->kec_data_size  = 0;
7070 		kectx->kec_data_resid = 0;
7071 	} else if (flags & KEVENT_FLAG_PROC64) {
7072 		user64_size_t usize = 0;
7073 		int error = copyin((user_addr_t)data_avail, &usize, sizeof(usize));
7074 		if (__improbable(error)) {
7075 			return error;
7076 		}
7077 		kectx->kec_data_resid = kectx->kec_data_size = (user_size_t)usize;
7078 	} else {
7079 		user32_size_t usize = 0;
7080 		int error = copyin((user_addr_t)data_avail, &usize, sizeof(usize));
7081 		if (__improbable(error)) {
7082 			return error;
7083 		}
7084 		kectx->kec_data_avail = data_avail;
7085 		kectx->kec_data_resid = kectx->kec_data_size = (user_size_t)usize;
7086 	}
7087 	kectx->kec_data_out   = data_out;
7088 	kectx->kec_data_avail = data_avail;
7089 	return 0;
7090 }
7091 
7092 /*!
7093  * @function kevent_put_data_size
7094  *
7095  * @brief
7096  * Copies out the residual data size to user-space if any has been used.
7097  */
7098 static int
kevent_put_data_size(unsigned int flags,kevent_ctx_t kectx)7099 kevent_put_data_size(unsigned int flags, kevent_ctx_t kectx)
7100 {
7101 	if (kectx->kec_data_resid == kectx->kec_data_size) {
7102 		return 0;
7103 	}
7104 	if (flags & KEVENT_FLAG_KERNEL) {
7105 		*(user_size_t *)(uintptr_t)kectx->kec_data_avail = kectx->kec_data_resid;
7106 		return 0;
7107 	}
7108 	if (flags & KEVENT_FLAG_PROC64) {
7109 		user64_size_t usize = (user64_size_t)kectx->kec_data_resid;
7110 		return copyout(&usize, (user_addr_t)kectx->kec_data_avail, sizeof(usize));
7111 	} else {
7112 		user32_size_t usize = (user32_size_t)kectx->kec_data_resid;
7113 		return copyout(&usize, (user_addr_t)kectx->kec_data_avail, sizeof(usize));
7114 	}
7115 }
7116 
7117 /*!
7118  * @function kevent_legacy_copyin
7119  *
7120  * @brief
7121  * Handles the copyin of a kevent/kevent64 event.
7122  */
7123 static int
kevent_legacy_copyin(user_addr_t * addrp,struct kevent_qos_s * kevp,unsigned int flags)7124 kevent_legacy_copyin(user_addr_t *addrp, struct kevent_qos_s *kevp, unsigned int flags)
7125 {
7126 	int error;
7127 
7128 	assert((flags & (KEVENT_FLAG_LEGACY32 | KEVENT_FLAG_LEGACY64)) != 0);
7129 
7130 	if (flags & KEVENT_FLAG_LEGACY64) {
7131 		struct kevent64_s kev64;
7132 
7133 		error = copyin(*addrp, (caddr_t)&kev64, sizeof(kev64));
7134 		if (__improbable(error)) {
7135 			return error;
7136 		}
7137 		*addrp += sizeof(kev64);
7138 		*kevp = (struct kevent_qos_s){
7139 			.ident  = kev64.ident,
7140 			.filter = kev64.filter,
7141 			/* Make sure user doesn't pass in any system flags */
7142 			.flags  = kev64.flags & ~EV_SYSFLAGS,
7143 			.udata  = kev64.udata,
7144 			.fflags = kev64.fflags,
7145 			.data   = kev64.data,
7146 			.ext[0] = kev64.ext[0],
7147 			.ext[1] = kev64.ext[1],
7148 		};
7149 	} else if (flags & KEVENT_FLAG_PROC64) {
7150 		struct user64_kevent kev64;
7151 
7152 		error = copyin(*addrp, (caddr_t)&kev64, sizeof(kev64));
7153 		if (__improbable(error)) {
7154 			return error;
7155 		}
7156 		*addrp += sizeof(kev64);
7157 		*kevp = (struct kevent_qos_s){
7158 			.ident  = kev64.ident,
7159 			.filter = kev64.filter,
7160 			/* Make sure user doesn't pass in any system flags */
7161 			.flags  = kev64.flags & ~EV_SYSFLAGS,
7162 			.udata  = kev64.udata,
7163 			.fflags = kev64.fflags,
7164 			.data   = kev64.data,
7165 		};
7166 	} else {
7167 		struct user32_kevent kev32;
7168 
7169 		error = copyin(*addrp, (caddr_t)&kev32, sizeof(kev32));
7170 		if (__improbable(error)) {
7171 			return error;
7172 		}
7173 		*addrp += sizeof(kev32);
7174 		*kevp = (struct kevent_qos_s){
7175 			.ident  = (uintptr_t)kev32.ident,
7176 			.filter = kev32.filter,
7177 			/* Make sure user doesn't pass in any system flags */
7178 			.flags  = kev32.flags & ~EV_SYSFLAGS,
7179 			.udata  = CAST_USER_ADDR_T(kev32.udata),
7180 			.fflags = kev32.fflags,
7181 			.data   = (intptr_t)kev32.data,
7182 		};
7183 	}
7184 
7185 	return 0;
7186 }
7187 
7188 /*!
7189  * @function kevent_modern_copyin
7190  *
7191  * @brief
7192  * Handles the copyin of a kevent_qos/kevent_id event.
7193  */
7194 static int
kevent_modern_copyin(user_addr_t * addrp,struct kevent_qos_s * kevp)7195 kevent_modern_copyin(user_addr_t *addrp, struct kevent_qos_s *kevp)
7196 {
7197 	int error = copyin(*addrp, (caddr_t)kevp, sizeof(struct kevent_qos_s));
7198 	if (__probable(!error)) {
7199 		/* Make sure user doesn't pass in any system flags */
7200 		*addrp += sizeof(struct kevent_qos_s);
7201 		kevp->flags &= ~EV_SYSFLAGS;
7202 	}
7203 	return error;
7204 }
7205 
7206 /*!
7207  * @function kevent_legacy_copyout
7208  *
7209  * @brief
7210  * Handles the copyout of a kevent/kevent64 event.
7211  */
7212 static int
kevent_legacy_copyout(struct kevent_qos_s * kevp,user_addr_t * addrp,unsigned int flags)7213 kevent_legacy_copyout(struct kevent_qos_s *kevp, user_addr_t *addrp, unsigned int flags)
7214 {
7215 	int advance;
7216 	int error;
7217 
7218 	assert((flags & (KEVENT_FLAG_LEGACY32 | KEVENT_FLAG_LEGACY64)) != 0);
7219 
7220 	/*
7221 	 * fully initialize the differnt output event structure
7222 	 * types from the internal kevent (and some universal
7223 	 * defaults for fields not represented in the internal
7224 	 * form).
7225 	 *
7226 	 * Note: these structures have no padding hence the C99
7227 	 *       initializers below do not leak kernel info.
7228 	 */
7229 	if (flags & KEVENT_FLAG_LEGACY64) {
7230 		struct kevent64_s kev64 = {
7231 			.ident  = kevp->ident,
7232 			.filter = kevp->filter,
7233 			.flags  = kevp->flags,
7234 			.fflags = kevp->fflags,
7235 			.data   = (int64_t)kevp->data,
7236 			.udata  = kevp->udata,
7237 			.ext[0] = kevp->ext[0],
7238 			.ext[1] = kevp->ext[1],
7239 		};
7240 		advance = sizeof(struct kevent64_s);
7241 		error = copyout((caddr_t)&kev64, *addrp, advance);
7242 	} else if (flags & KEVENT_FLAG_PROC64) {
7243 		/*
7244 		 * deal with the special case of a user-supplied
7245 		 * value of (uintptr_t)-1.
7246 		 */
7247 		uint64_t ident = (kevp->ident == (uintptr_t)-1) ?
7248 		    (uint64_t)-1LL : (uint64_t)kevp->ident;
7249 		struct user64_kevent kev64 = {
7250 			.ident  = ident,
7251 			.filter = kevp->filter,
7252 			.flags  = kevp->flags,
7253 			.fflags = kevp->fflags,
7254 			.data   = (int64_t) kevp->data,
7255 			.udata  = (user_addr_t) kevp->udata,
7256 		};
7257 		advance = sizeof(kev64);
7258 		error = copyout((caddr_t)&kev64, *addrp, advance);
7259 	} else {
7260 		struct user32_kevent kev32 = {
7261 			.ident  = (uint32_t)kevp->ident,
7262 			.filter = kevp->filter,
7263 			.flags  = kevp->flags,
7264 			.fflags = kevp->fflags,
7265 			.data   = (int32_t)kevp->data,
7266 			.udata  = (uint32_t)kevp->udata,
7267 		};
7268 		advance = sizeof(kev32);
7269 		error = copyout((caddr_t)&kev32, *addrp, advance);
7270 	}
7271 	if (__probable(!error)) {
7272 		*addrp += advance;
7273 	}
7274 	return error;
7275 }
7276 
7277 /*!
7278  * @function kevent_modern_copyout
7279  *
7280  * @brief
7281  * Handles the copyout of a kevent_qos/kevent_id event.
7282  */
7283 OS_ALWAYS_INLINE
7284 static inline int
kevent_modern_copyout(struct kevent_qos_s * kevp,user_addr_t * addrp)7285 kevent_modern_copyout(struct kevent_qos_s *kevp, user_addr_t *addrp)
7286 {
7287 	int error = copyout((caddr_t)kevp, *addrp, sizeof(struct kevent_qos_s));
7288 	if (__probable(!error)) {
7289 		*addrp += sizeof(struct kevent_qos_s);
7290 	}
7291 	return error;
7292 }
7293 
7294 #pragma mark kevent core implementation
7295 
7296 /*!
7297  * @function kevent_callback_inline
7298  *
7299  * @brief
7300  * Callback for each individual event
7301  *
7302  * @discussion
7303  * This is meant to be inlined in kevent_modern_callback and
7304  * kevent_legacy_callback.
7305  */
7306 OS_ALWAYS_INLINE
7307 static inline int
kevent_callback_inline(struct kevent_qos_s * kevp,kevent_ctx_t kectx,bool legacy)7308 kevent_callback_inline(struct kevent_qos_s *kevp, kevent_ctx_t kectx, bool legacy)
7309 {
7310 	int error;
7311 
7312 	assert(kectx->kec_process_noutputs < kectx->kec_process_nevents);
7313 
7314 	/*
7315 	 * Copy out the appropriate amount of event data for this user.
7316 	 */
7317 	if (legacy) {
7318 		error = kevent_legacy_copyout(kevp, &kectx->kec_process_eventlist,
7319 		    kectx->kec_process_flags);
7320 	} else {
7321 		error = kevent_modern_copyout(kevp, &kectx->kec_process_eventlist);
7322 	}
7323 
7324 	/*
7325 	 * If there isn't space for additional events, return
7326 	 * a harmless error to stop the processing here
7327 	 */
7328 	if (error == 0 && ++kectx->kec_process_noutputs == kectx->kec_process_nevents) {
7329 		error = EWOULDBLOCK;
7330 	}
7331 	return error;
7332 }
7333 
7334 /*!
7335  * @function kevent_modern_callback
7336  *
7337  * @brief
7338  * Callback for each individual modern event.
7339  *
7340  * @discussion
7341  * This callback handles kevent_qos/kevent_id events.
7342  */
7343 static int
kevent_modern_callback(struct kevent_qos_s * kevp,kevent_ctx_t kectx)7344 kevent_modern_callback(struct kevent_qos_s *kevp, kevent_ctx_t kectx)
7345 {
7346 	return kevent_callback_inline(kevp, kectx, /*legacy*/ false);
7347 }
7348 
7349 /*!
7350  * @function kevent_legacy_callback
7351  *
7352  * @brief
7353  * Callback for each individual legacy event.
7354  *
7355  * @discussion
7356  * This callback handles kevent/kevent64 events.
7357  */
7358 static int
kevent_legacy_callback(struct kevent_qos_s * kevp,kevent_ctx_t kectx)7359 kevent_legacy_callback(struct kevent_qos_s *kevp, kevent_ctx_t kectx)
7360 {
7361 	return kevent_callback_inline(kevp, kectx, /*legacy*/ true);
7362 }
7363 
7364 /*!
7365  * @function kevent_cleanup
7366  *
7367  * @brief
7368  * Handles the cleanup returning from a kevent call.
7369  *
7370  * @discussion
7371  * kevent entry points will take a reference on workloops,
7372  * and a usecount on the fileglob of kqfiles.
7373  *
7374  * This function undoes this on the exit paths of kevents.
7375  *
7376  * @returns
7377  * The error to return to userspace.
7378  */
7379 static int
kevent_cleanup(kqueue_t kqu,int flags,int error,kevent_ctx_t kectx)7380 kevent_cleanup(kqueue_t kqu, int flags, int error, kevent_ctx_t kectx)
7381 {
7382 	// poll should not call any codepath leading to this
7383 	assert((flags & KEVENT_FLAG_POLL) == 0);
7384 
7385 	if (flags & KEVENT_FLAG_WORKLOOP) {
7386 		kqworkloop_release(kqu.kqwl);
7387 	} else if (flags & KEVENT_FLAG_WORKQ) {
7388 		/* nothing held */
7389 	} else {
7390 		fp_drop(kqu.kqf->kqf_p, kectx->kec_fd, kectx->kec_fp, 0);
7391 	}
7392 
7393 	/* don't restart after signals... */
7394 	if (error == ERESTART) {
7395 		error = EINTR;
7396 	} else if (error == 0) {
7397 		/* don't abandon other output just because of residual copyout failures */
7398 		(void)kevent_put_data_size(flags, kectx);
7399 	}
7400 
7401 	if (flags & KEVENT_FLAG_PARKING) {
7402 		thread_t th = current_thread();
7403 		struct uthread *uth = get_bsdthread_info(th);
7404 		if (uth->uu_kqr_bound) {
7405 			thread_unfreeze_base_pri(th);
7406 		}
7407 	}
7408 	return error;
7409 }
7410 
7411 /*!
7412  * @function kqueue_process
7413  *
7414  * @brief
7415  * Process the triggered events in a kqueue.
7416  *
7417  * @discussion
7418  * Walk the queued knotes and validate that they are really still triggered
7419  * events by calling the filter routines (if necessary).
7420  *
7421  * For each event that is still considered triggered, invoke the callback
7422  * routine provided.
7423  *
7424  * caller holds a reference on the kqueue.
7425  * kqueue locked on entry and exit - but may be dropped
7426  * kqueue list locked (held for duration of call)
7427  *
7428  * This is only called by kqueue_scan() so that the compiler can inline it.
7429  *
7430  * @returns
7431  * - 0:            no event was returned, no other error occured
7432  * - EBADF:        the kqueue is being destroyed (KQ_DRAIN is set)
7433  * - EWOULDBLOCK:  (not an error) events have been found and we should return
7434  * - EFAULT:       copyout failed
7435  * - filter specific errors
7436  */
7437 static int
kqueue_process(kqueue_t kqu,int flags,kevent_ctx_t kectx,kevent_callback_t callback)7438 kqueue_process(kqueue_t kqu, int flags, kevent_ctx_t kectx,
7439     kevent_callback_t callback)
7440 {
7441 	workq_threadreq_t kqr = current_uthread()->uu_kqr_bound;
7442 	struct knote *kn;
7443 	int error = 0, rc = 0;
7444 	struct kqtailq *base_queue, *queue;
7445 	uint16_t kq_type = (kqu.kq->kq_state & (KQ_WORKQ | KQ_WORKLOOP));
7446 
7447 	if (kq_type & KQ_WORKQ) {
7448 		rc = kqworkq_begin_processing(kqu.kqwq, kqr, flags);
7449 	} else if (kq_type & KQ_WORKLOOP) {
7450 		rc = kqworkloop_begin_processing(kqu.kqwl, flags);
7451 	} else {
7452 kqfile_retry:
7453 		rc = kqfile_begin_processing(kqu.kqf);
7454 		if (rc == EBADF) {
7455 			return EBADF;
7456 		}
7457 	}
7458 
7459 	if (rc == -1) {
7460 		/* Nothing to process */
7461 		return 0;
7462 	}
7463 
7464 	/*
7465 	 * loop through the enqueued knotes associated with this request,
7466 	 * processing each one. Each request may have several queues
7467 	 * of knotes to process (depending on the type of kqueue) so we
7468 	 * have to loop through all the queues as long as we have additional
7469 	 * space.
7470 	 */
7471 
7472 process_again:
7473 	if (kq_type & KQ_WORKQ) {
7474 		base_queue = queue = &kqu.kqwq->kqwq_queue[kqr->tr_kq_qos_index - 1];
7475 	} else if (kq_type & KQ_WORKLOOP) {
7476 		base_queue = &kqu.kqwl->kqwl_queue[0];
7477 		queue = &kqu.kqwl->kqwl_queue[KQWL_NBUCKETS - 1];
7478 	} else {
7479 		base_queue = queue = &kqu.kqf->kqf_queue;
7480 	}
7481 
7482 	do {
7483 		while ((kn = TAILQ_FIRST(queue)) != NULL) {
7484 			error = knote_process(kn, kectx, callback);
7485 			if (error == EJUSTRETURN) {
7486 				error = 0;
7487 			} else if (__improbable(error)) {
7488 				/* error is EWOULDBLOCK when the out event array is full */
7489 				goto stop_processing;
7490 			}
7491 		}
7492 	} while (queue-- > base_queue);
7493 
7494 	if (kectx->kec_process_noutputs) {
7495 		/* callers will transform this into no error */
7496 		error = EWOULDBLOCK;
7497 	}
7498 
7499 stop_processing:
7500 	/*
7501 	 * If KEVENT_FLAG_PARKING is set, and no kevents have been returned,
7502 	 * we want to unbind the kqrequest from the thread.
7503 	 *
7504 	 * However, because the kq locks are dropped several times during process,
7505 	 * new knotes may have fired again, in which case, we want to fail the end
7506 	 * processing and process again, until it converges.
7507 	 *
7508 	 * If we have an error or returned events, end processing never fails.
7509 	 */
7510 	if (error) {
7511 		flags &= ~KEVENT_FLAG_PARKING;
7512 	}
7513 	if (kq_type & KQ_WORKQ) {
7514 		rc = kqworkq_end_processing(kqu.kqwq, kqr, flags);
7515 	} else if (kq_type & KQ_WORKLOOP) {
7516 		rc = kqworkloop_end_processing(kqu.kqwl, KQ_PROCESSING, flags);
7517 	} else {
7518 		rc = kqfile_end_processing(kqu.kqf);
7519 	}
7520 
7521 	if (__probable(error)) {
7522 		return error;
7523 	}
7524 
7525 	if (__probable(rc >= 0)) {
7526 		assert(rc == 0 || rc == EBADF);
7527 		return rc;
7528 	}
7529 
7530 	if (kq_type & (KQ_WORKQ | KQ_WORKLOOP)) {
7531 		assert(flags & KEVENT_FLAG_PARKING);
7532 		goto process_again;
7533 	} else {
7534 		goto kqfile_retry;
7535 	}
7536 }
7537 
7538 /*!
7539  * @function kqueue_scan_continue
7540  *
7541  * @brief
7542  * The continuation used by kqueue_scan for kevent entry points.
7543  *
7544  * @discussion
7545  * Assumes we inherit a use/ref count on the kq or its fileglob.
7546  *
7547  * This is called by kqueue_scan if neither KEVENT_FLAG_POLL nor
7548  * KEVENT_FLAG_KERNEL was set, and the caller had to wait.
7549  */
7550 OS_NORETURN OS_NOINLINE
7551 static void
kqueue_scan_continue(void * data,wait_result_t wait_result)7552 kqueue_scan_continue(void *data, wait_result_t wait_result)
7553 {
7554 	uthread_t ut = current_uthread();
7555 	kevent_ctx_t kectx = &ut->uu_save.uus_kevent;
7556 	int error = 0, flags = kectx->kec_process_flags;
7557 	struct kqueue *kq = data;
7558 
7559 	/*
7560 	 * only kevent variants call in here, so we know the callback is
7561 	 * kevent_legacy_callback or kevent_modern_callback.
7562 	 */
7563 	assert((flags & (KEVENT_FLAG_POLL | KEVENT_FLAG_KERNEL)) == 0);
7564 
7565 	switch (wait_result) {
7566 	case THREAD_AWAKENED:
7567 		if (__improbable(flags & (KEVENT_FLAG_LEGACY32 | KEVENT_FLAG_LEGACY64))) {
7568 			error = kqueue_scan(kq, flags, kectx, kevent_legacy_callback);
7569 		} else {
7570 			error = kqueue_scan(kq, flags, kectx, kevent_modern_callback);
7571 		}
7572 		break;
7573 	case THREAD_TIMED_OUT:
7574 		error = 0;
7575 		break;
7576 	case THREAD_INTERRUPTED:
7577 		error = EINTR;
7578 		break;
7579 	case THREAD_RESTART:
7580 		error = EBADF;
7581 		break;
7582 	default:
7583 		panic("%s: - invalid wait_result (%d)", __func__, wait_result);
7584 	}
7585 
7586 
7587 	error = kevent_cleanup(kq, flags, error, kectx);
7588 	*(int32_t *)&ut->uu_rval = kectx->kec_process_noutputs;
7589 	unix_syscall_return(error);
7590 }
7591 
7592 /*!
7593  * @function kqueue_scan
7594  *
7595  * @brief
7596  * Scan and wait for events in a kqueue (used by poll & kevent).
7597  *
7598  * @discussion
7599  * Process the triggered events in a kqueue.
7600  *
7601  * If there are no events triggered arrange to wait for them:
7602  * - unless KEVENT_FLAG_IMMEDIATE is set in kectx->kec_process_flags
7603  * - possibly until kectx->kec_deadline expires
7604  *
7605  * When it waits, and that neither KEVENT_FLAG_POLL nor KEVENT_FLAG_KERNEL
7606  * are set, then it will wait in the kqueue_scan_continue continuation.
7607  *
7608  * poll() will block in place, and KEVENT_FLAG_KERNEL calls
7609  * all pass KEVENT_FLAG_IMMEDIATE and will not wait.
7610  *
7611  * @param kqu
7612  * The kqueue being scanned.
7613  *
7614  * @param flags
7615  * The KEVENT_FLAG_* flags for this call.
7616  *
7617  * @param kectx
7618  * The context used for this scan.
7619  * The uthread_t::uu_save.uus_kevent storage is used for this purpose.
7620  *
7621  * @param callback
7622  * The callback to be called on events sucessfully processed.
7623  * (Either kevent_legacy_callback, kevent_modern_callback or poll_callback)
7624  */
7625 int
kqueue_scan(kqueue_t kqu,int flags,kevent_ctx_t kectx,kevent_callback_t callback)7626 kqueue_scan(kqueue_t kqu, int flags, kevent_ctx_t kectx,
7627     kevent_callback_t callback)
7628 {
7629 	int error;
7630 
7631 	for (;;) {
7632 		kqlock(kqu);
7633 		error = kqueue_process(kqu, flags, kectx, callback);
7634 
7635 		/*
7636 		 * If we got an error, events returned (EWOULDBLOCK)
7637 		 * or blocking was disallowed (KEVENT_FLAG_IMMEDIATE),
7638 		 * just return.
7639 		 */
7640 		if (__probable(error || (flags & KEVENT_FLAG_IMMEDIATE))) {
7641 			kqunlock(kqu);
7642 			return error == EWOULDBLOCK ? 0 : error;
7643 		}
7644 
7645 		assert((kqu.kq->kq_state & (KQ_WORKQ | KQ_WORKLOOP)) == 0);
7646 
7647 		kqu.kqf->kqf_state |= KQ_SLEEP;
7648 		assert_wait_deadline(&kqu.kqf->kqf_count, THREAD_ABORTSAFE,
7649 		    kectx->kec_deadline);
7650 		kqunlock(kqu);
7651 
7652 		if (__probable((flags & (KEVENT_FLAG_POLL | KEVENT_FLAG_KERNEL)) == 0)) {
7653 			thread_block_parameter(kqueue_scan_continue, kqu.kqf);
7654 			__builtin_unreachable();
7655 		}
7656 
7657 		wait_result_t wr = thread_block(THREAD_CONTINUE_NULL);
7658 		switch (wr) {
7659 		case THREAD_AWAKENED:
7660 			break;
7661 		case THREAD_TIMED_OUT:
7662 			return 0;
7663 		case THREAD_INTERRUPTED:
7664 			return EINTR;
7665 		case THREAD_RESTART:
7666 			return EBADF;
7667 		default:
7668 			panic("%s: - bad wait_result (%d)", __func__, wr);
7669 		}
7670 	}
7671 }
7672 
7673 /*!
7674  * @function kevent_internal
7675  *
7676  * @brief
7677  * Common kevent code.
7678  *
7679  * @discussion
7680  * Needs to be inlined to specialize for legacy or modern and
7681  * eliminate dead code.
7682  *
7683  * This is the core logic of kevent entry points, that will:
7684  * - register kevents
7685  * - optionally scan the kqueue for events
7686  *
7687  * The caller is giving kevent_internal a reference on the kqueue
7688  * or its fileproc that needs to be cleaned up by kevent_cleanup().
7689  */
7690 OS_ALWAYS_INLINE
7691 static inline int
kevent_internal(kqueue_t kqu,user_addr_t changelist,int nchanges,user_addr_t ueventlist,int nevents,int flags,kevent_ctx_t kectx,int32_t * retval,bool legacy)7692 kevent_internal(kqueue_t kqu,
7693     user_addr_t changelist, int nchanges,
7694     user_addr_t ueventlist, int nevents,
7695     int flags, kevent_ctx_t kectx, int32_t *retval,
7696     bool legacy)
7697 {
7698 	int error = 0, noutputs = 0, register_rc;
7699 
7700 	/* only bound threads can receive events on workloops */
7701 	if (!legacy && (flags & KEVENT_FLAG_WORKLOOP)) {
7702 #if CONFIG_WORKLOOP_DEBUG
7703 		UU_KEVENT_HISTORY_WRITE_ENTRY(current_uthread(), {
7704 			.uu_kqid = kqu.kqwl->kqwl_dynamicid,
7705 			.uu_kq = error ? NULL : kqu.kq,
7706 			.uu_error = error,
7707 			.uu_nchanges = nchanges,
7708 			.uu_nevents = nevents,
7709 			.uu_flags = flags,
7710 		});
7711 #endif // CONFIG_WORKLOOP_DEBUG
7712 
7713 		if (flags & KEVENT_FLAG_KERNEL) {
7714 			/* see kevent_workq_internal */
7715 			error = copyout(&kqu.kqwl->kqwl_dynamicid,
7716 			    ueventlist - sizeof(kqueue_id_t), sizeof(kqueue_id_t));
7717 			kectx->kec_data_resid -= sizeof(kqueue_id_t);
7718 			if (__improbable(error)) {
7719 				goto out;
7720 			}
7721 		}
7722 
7723 		if (kevent_args_requesting_events(flags, nevents)) {
7724 			/*
7725 			 * Disable the R2K notification while doing a register, if the
7726 			 * caller wants events too, we don't want the AST to be set if we
7727 			 * will process these events soon.
7728 			 */
7729 			kqlock(kqu);
7730 			kqu.kq->kq_state &= ~KQ_R2K_ARMED;
7731 			kqunlock(kqu);
7732 			flags |= KEVENT_FLAG_NEEDS_END_PROCESSING;
7733 		}
7734 	}
7735 
7736 	/* register all the change requests the user provided... */
7737 	while (nchanges > 0 && error == 0) {
7738 		struct kevent_qos_s kev;
7739 		struct knote *kn = NULL;
7740 
7741 		if (legacy) {
7742 			error = kevent_legacy_copyin(&changelist, &kev, flags);
7743 		} else {
7744 			error = kevent_modern_copyin(&changelist, &kev);
7745 		}
7746 		if (error) {
7747 			break;
7748 		}
7749 
7750 		register_rc = kevent_register(kqu.kq, &kev, &kn);
7751 		if (__improbable(!legacy && (register_rc & FILTER_REGISTER_WAIT))) {
7752 			thread_t thread = current_thread();
7753 
7754 			kqlock_held(kqu);
7755 
7756 			if (act_clear_astkevent(thread, AST_KEVENT_REDRIVE_THREADREQ)) {
7757 				workq_kern_threadreq_redrive(kqu.kq->kq_p, WORKQ_THREADREQ_NONE);
7758 			}
7759 
7760 			// f_post_register_wait is meant to call a continuation and not to
7761 			// return, which is why we don't support FILTER_REGISTER_WAIT if
7762 			// KEVENT_FLAG_ERROR_EVENTS is not passed, or if the event that
7763 			// waits isn't the last.
7764 			//
7765 			// It is implementable, but not used by any userspace code at the
7766 			// moment, so for now return ENOTSUP if someone tries to do it.
7767 			if (nchanges == 1 && noutputs < nevents &&
7768 			    (flags & KEVENT_FLAG_KERNEL) == 0 &&
7769 			    (flags & KEVENT_FLAG_PARKING) == 0 &&
7770 			    (flags & KEVENT_FLAG_ERROR_EVENTS) &&
7771 			    (flags & KEVENT_FLAG_WORKLOOP)) {
7772 				uthread_t ut = get_bsdthread_info(thread);
7773 
7774 				/*
7775 				 * store the continuation/completion data in the uthread
7776 				 *
7777 				 * Note: the kectx aliases with this,
7778 				 * and is destroyed in the process.
7779 				 */
7780 				ut->uu_save.uus_kevent_register = (struct _kevent_register){
7781 					.kev        = kev,
7782 					.kqwl       = kqu.kqwl,
7783 					.eventout   = noutputs,
7784 					.ueventlist = ueventlist,
7785 				};
7786 				knote_fops(kn)->f_post_register_wait(ut, kn,
7787 				    &ut->uu_save.uus_kevent_register);
7788 				__builtin_unreachable();
7789 			}
7790 			kqunlock(kqu);
7791 
7792 			kev.flags |= EV_ERROR;
7793 			kev.data = ENOTSUP;
7794 		} else {
7795 			assert((register_rc & FILTER_REGISTER_WAIT) == 0);
7796 		}
7797 
7798 		// keep in sync with kevent_register_wait_return()
7799 		if (noutputs < nevents && (kev.flags & (EV_ERROR | EV_RECEIPT))) {
7800 			if ((kev.flags & EV_ERROR) == 0) {
7801 				kev.flags |= EV_ERROR;
7802 				kev.data = 0;
7803 			}
7804 			if (legacy) {
7805 				error = kevent_legacy_copyout(&kev, &ueventlist, flags);
7806 			} else {
7807 				error = kevent_modern_copyout(&kev, &ueventlist);
7808 			}
7809 			if (error == 0) {
7810 				noutputs++;
7811 			}
7812 		} else if (kev.flags & EV_ERROR) {
7813 			error = (int)kev.data;
7814 		}
7815 		nchanges--;
7816 	}
7817 
7818 	if ((flags & KEVENT_FLAG_ERROR_EVENTS) == 0 &&
7819 	    nevents > 0 && noutputs == 0 && error == 0) {
7820 		kectx->kec_process_flags = flags;
7821 		kectx->kec_process_nevents = nevents;
7822 		kectx->kec_process_noutputs = 0;
7823 		kectx->kec_process_eventlist = ueventlist;
7824 
7825 		if (legacy) {
7826 			error = kqueue_scan(kqu.kq, flags, kectx, kevent_legacy_callback);
7827 		} else {
7828 			error = kqueue_scan(kqu.kq, flags, kectx, kevent_modern_callback);
7829 		}
7830 
7831 		noutputs = kectx->kec_process_noutputs;
7832 	} else if (!legacy && (flags & KEVENT_FLAG_NEEDS_END_PROCESSING)) {
7833 		/*
7834 		 * If we didn't through kqworkloop_end_processing(),
7835 		 * we need to do it here.
7836 		 *
7837 		 * kqueue_scan will call kqworkloop_end_processing(),
7838 		 * so we only need to do it if we didn't scan.
7839 		 */
7840 		kqlock(kqu);
7841 		kqworkloop_end_processing(kqu.kqwl, 0, 0);
7842 		kqunlock(kqu);
7843 	}
7844 
7845 	*retval = noutputs;
7846 out:
7847 	return kevent_cleanup(kqu.kq, flags, error, kectx);
7848 }
7849 
7850 #pragma mark modern syscalls: kevent_qos, kevent_id, kevent_workq_internal
7851 
7852 /*!
7853  * @function kevent_modern_internal
7854  *
7855  * @brief
7856  * The backend of the kevent_id and kevent_workq_internal entry points.
7857  *
7858  * @discussion
7859  * Needs to be inline due to the number of arguments.
7860  */
7861 OS_NOINLINE
7862 static int
kevent_modern_internal(kqueue_t kqu,user_addr_t changelist,int nchanges,user_addr_t ueventlist,int nevents,int flags,kevent_ctx_t kectx,int32_t * retval)7863 kevent_modern_internal(kqueue_t kqu,
7864     user_addr_t changelist, int nchanges,
7865     user_addr_t ueventlist, int nevents,
7866     int flags, kevent_ctx_t kectx, int32_t *retval)
7867 {
7868 	return kevent_internal(kqu.kq, changelist, nchanges,
7869 	           ueventlist, nevents, flags, kectx, retval, /*legacy*/ false);
7870 }
7871 
7872 /*!
7873  * @function kevent_id
7874  *
7875  * @brief
7876  * The kevent_id() syscall.
7877  */
7878 int
kevent_id(struct proc * p,struct kevent_id_args * uap,int32_t * retval)7879 kevent_id(struct proc *p, struct kevent_id_args *uap, int32_t *retval)
7880 {
7881 	int error, flags = uap->flags & KEVENT_FLAG_USER;
7882 	uthread_t uth = current_uthread();
7883 	workq_threadreq_t kqr = uth->uu_kqr_bound;
7884 	kevent_ctx_t kectx = &uth->uu_save.uus_kevent;
7885 	kqueue_t kqu;
7886 
7887 	flags = kevent_adjust_flags_for_proc(p, flags);
7888 	flags |= KEVENT_FLAG_DYNAMIC_KQUEUE;
7889 
7890 	if (__improbable((flags & (KEVENT_FLAG_WORKQ | KEVENT_FLAG_WORKLOOP)) !=
7891 	    KEVENT_FLAG_WORKLOOP)) {
7892 		return EINVAL;
7893 	}
7894 
7895 	error = kevent_get_data_size(flags, uap->data_available, uap->data_out, kectx);
7896 	if (__improbable(error)) {
7897 		return error;
7898 	}
7899 
7900 	kectx->kec_deadline = 0;
7901 	kectx->kec_fp       = NULL;
7902 	kectx->kec_fd       = -1;
7903 	/* the kec_process_* fields are filled if kqueue_scann is called only */
7904 
7905 	/*
7906 	 * Get the kq we are going to be working on
7907 	 * As a fastpath, look at the currently bound workloop.
7908 	 */
7909 	kqu.kqwl = kqr ? kqr_kqworkloop(kqr) : NULL;
7910 	if (kqu.kqwl && kqu.kqwl->kqwl_dynamicid == uap->id) {
7911 		if (__improbable(flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST)) {
7912 			return EEXIST;
7913 		}
7914 		kqworkloop_retain(kqu.kqwl);
7915 	} else if (__improbable(kevent_args_requesting_events(flags, uap->nevents))) {
7916 		return EXDEV;
7917 	} else {
7918 		error = kqworkloop_get_or_create(p, uap->id, NULL,
7919 #if CONFIG_PREADOPT_TG
7920 		    NULL,
7921 #endif /* CONFIG_PREADOPT_TG */
7922 		    flags, &kqu.kqwl);
7923 		if (__improbable(error)) {
7924 			return error;
7925 		}
7926 	}
7927 
7928 	return kevent_modern_internal(kqu, uap->changelist, uap->nchanges,
7929 	           uap->eventlist, uap->nevents, flags, kectx, retval);
7930 }
7931 
7932 /**!
7933  * @function kevent_workq_internal
7934  *
7935  * @discussion
7936  * This function is exported for the sake of the workqueue subsystem.
7937  *
7938  * It is called in two ways:
7939  * - when a thread is about to go to userspace to ask for pending event
7940  * - when a thread is returning from userspace with events back
7941  *
7942  * the workqueue subsystem will only use the following flags:
7943  * - KEVENT_FLAG_STACK_DATA (always)
7944  * - KEVENT_FLAG_IMMEDIATE (always)
7945  * - KEVENT_FLAG_PARKING (depending on whether it is going to or returning from
7946  *   userspace).
7947  *
7948  * It implicitly acts on the bound kqueue, and for the case of workloops
7949  * will copyout the kqueue ID before anything else.
7950  *
7951  *
7952  * Pthread will have setup the various arguments to fit this stack layout:
7953  *
7954  * +-------....----+--------------+-----------+--------------------+
7955  * |  user stack   |  data avail  |  nevents  |   pthread_self()   |
7956  * +-------....----+--------------+-----------+--------------------+
7957  *                 ^              ^
7958  *             data_out       eventlist
7959  *
7960  * When a workloop is used, the workloop ID is copied out right before
7961  * the eventlist and is taken from the data buffer.
7962  *
7963  * @warning
7964  * This function is carefuly tailored to not make any call except the final tail
7965  * call into kevent_modern_internal. (LTO inlines current_uthread()).
7966  *
7967  * This function is performance sensitive due to the workq subsystem.
7968  */
7969 int
kevent_workq_internal(struct proc * p,user_addr_t changelist,int nchanges,user_addr_t eventlist,int nevents,user_addr_t data_out,user_size_t * data_available,unsigned int flags,int32_t * retval)7970 kevent_workq_internal(struct proc *p,
7971     user_addr_t changelist, int nchanges,
7972     user_addr_t eventlist, int nevents,
7973     user_addr_t data_out, user_size_t *data_available,
7974     unsigned int flags, int32_t *retval)
7975 {
7976 	uthread_t uth = current_uthread();
7977 	workq_threadreq_t kqr = uth->uu_kqr_bound;
7978 	kevent_ctx_t kectx = &uth->uu_save.uus_kevent;
7979 	kqueue_t kqu;
7980 
7981 	assert(flags == (KEVENT_FLAG_STACK_DATA | KEVENT_FLAG_IMMEDIATE) ||
7982 	    flags == (KEVENT_FLAG_STACK_DATA | KEVENT_FLAG_IMMEDIATE | KEVENT_FLAG_PARKING));
7983 
7984 	kectx->kec_data_out   = data_out;
7985 	kectx->kec_data_avail = (uint64_t)data_available;
7986 	kectx->kec_data_size  = *data_available;
7987 	kectx->kec_data_resid = *data_available;
7988 	kectx->kec_deadline   = 0;
7989 	kectx->kec_fp         = NULL;
7990 	kectx->kec_fd         = -1;
7991 	/* the kec_process_* fields are filled if kqueue_scann is called only */
7992 
7993 	flags = kevent_adjust_flags_for_proc(p, flags);
7994 
7995 	if (kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
7996 		kqu.kqwl = __container_of(kqr, struct kqworkloop, kqwl_request);
7997 		kqworkloop_retain(kqu.kqwl);
7998 
7999 		flags |= KEVENT_FLAG_WORKLOOP | KEVENT_FLAG_DYNAMIC_KQUEUE |
8000 		    KEVENT_FLAG_KERNEL;
8001 	} else {
8002 		kqu.kqwq = p->p_fd.fd_wqkqueue;
8003 
8004 		flags |= KEVENT_FLAG_WORKQ | KEVENT_FLAG_KERNEL;
8005 	}
8006 
8007 	return kevent_modern_internal(kqu, changelist, nchanges,
8008 	           eventlist, nevents, flags, kectx, retval);
8009 }
8010 
8011 /*!
8012  * @function kevent_qos
8013  *
8014  * @brief
8015  * The kevent_qos() syscall.
8016  */
8017 int
kevent_qos(struct proc * p,struct kevent_qos_args * uap,int32_t * retval)8018 kevent_qos(struct proc *p, struct kevent_qos_args *uap, int32_t *retval)
8019 {
8020 	uthread_t uth = current_uthread();
8021 	kevent_ctx_t kectx = &uth->uu_save.uus_kevent;
8022 	int error, flags = uap->flags & KEVENT_FLAG_USER;
8023 	struct kqueue *kq;
8024 
8025 	if (__improbable(flags & KEVENT_ID_FLAG_USER)) {
8026 		return EINVAL;
8027 	}
8028 
8029 	flags = kevent_adjust_flags_for_proc(p, flags);
8030 
8031 	error = kevent_get_data_size(flags, uap->data_available, uap->data_out, kectx);
8032 	if (__improbable(error)) {
8033 		return error;
8034 	}
8035 
8036 	kectx->kec_deadline = 0;
8037 	kectx->kec_fp       = NULL;
8038 	kectx->kec_fd       = uap->fd;
8039 	/* the kec_process_* fields are filled if kqueue_scann is called only */
8040 
8041 	/* get the kq we are going to be working on */
8042 	if (__probable(flags & KEVENT_FLAG_WORKQ)) {
8043 		error = kevent_get_kqwq(p, flags, uap->nevents, &kq);
8044 	} else {
8045 		error = kevent_get_kqfile(p, uap->fd, flags, &kectx->kec_fp, &kq);
8046 	}
8047 	if (__improbable(error)) {
8048 		return error;
8049 	}
8050 
8051 	return kevent_modern_internal(kq, uap->changelist, uap->nchanges,
8052 	           uap->eventlist, uap->nevents, flags, kectx, retval);
8053 }
8054 
8055 #pragma mark legacy syscalls: kevent, kevent64
8056 
8057 /*!
8058  * @function kevent_legacy_get_deadline
8059  *
8060  * @brief
8061  * Compute the deadline for the legacy kevent syscalls.
8062  *
8063  * @discussion
8064  * This is not necessary if KEVENT_FLAG_IMMEDIATE is specified,
8065  * as this takes precedence over the deadline.
8066  *
8067  * This function will fail if utimeout is USER_ADDR_NULL
8068  * (the caller should check).
8069  */
8070 static int
kevent_legacy_get_deadline(int flags,user_addr_t utimeout,uint64_t * deadline)8071 kevent_legacy_get_deadline(int flags, user_addr_t utimeout, uint64_t *deadline)
8072 {
8073 	struct timespec ts;
8074 
8075 	if (flags & KEVENT_FLAG_PROC64) {
8076 		struct user64_timespec ts64;
8077 		int error = copyin(utimeout, &ts64, sizeof(ts64));
8078 		if (__improbable(error)) {
8079 			return error;
8080 		}
8081 		ts.tv_sec = (unsigned long)ts64.tv_sec;
8082 		ts.tv_nsec = (long)ts64.tv_nsec;
8083 	} else {
8084 		struct user32_timespec ts32;
8085 		int error = copyin(utimeout, &ts32, sizeof(ts32));
8086 		if (__improbable(error)) {
8087 			return error;
8088 		}
8089 		ts.tv_sec = ts32.tv_sec;
8090 		ts.tv_nsec = ts32.tv_nsec;
8091 	}
8092 	if (!timespec_is_valid(&ts)) {
8093 		return EINVAL;
8094 	}
8095 
8096 	clock_absolutetime_interval_to_deadline(tstoabstime(&ts), deadline);
8097 	return 0;
8098 }
8099 
8100 /*!
8101  * @function kevent_legacy_internal
8102  *
8103  * @brief
8104  * The core implementation for kevent and kevent64
8105  */
8106 OS_NOINLINE
8107 static int
kevent_legacy_internal(struct proc * p,struct kevent64_args * uap,int32_t * retval,int flags)8108 kevent_legacy_internal(struct proc *p, struct kevent64_args *uap,
8109     int32_t *retval, int flags)
8110 {
8111 	uthread_t uth = current_uthread();
8112 	kevent_ctx_t kectx = &uth->uu_save.uus_kevent;
8113 	struct kqueue *kq;
8114 	int error;
8115 
8116 	if (__improbable(uap->flags & KEVENT_ID_FLAG_USER)) {
8117 		return EINVAL;
8118 	}
8119 
8120 	flags = kevent_adjust_flags_for_proc(p, flags);
8121 
8122 	kectx->kec_data_out   = 0;
8123 	kectx->kec_data_avail = 0;
8124 	kectx->kec_data_size  = 0;
8125 	kectx->kec_data_resid = 0;
8126 	kectx->kec_deadline   = 0;
8127 	kectx->kec_fp         = NULL;
8128 	kectx->kec_fd         = uap->fd;
8129 	/* the kec_process_* fields are filled if kqueue_scann is called only */
8130 
8131 	/* convert timeout to absolute - if we have one (and not immediate) */
8132 	if (__improbable(uap->timeout && !(flags & KEVENT_FLAG_IMMEDIATE))) {
8133 		error = kevent_legacy_get_deadline(flags, uap->timeout,
8134 		    &kectx->kec_deadline);
8135 		if (__improbable(error)) {
8136 			return error;
8137 		}
8138 	}
8139 
8140 	/* get the kq we are going to be working on */
8141 	if (flags & KEVENT_FLAG_WORKQ) {
8142 		error = kevent_get_kqwq(p, flags, uap->nevents, &kq);
8143 	} else {
8144 		error = kevent_get_kqfile(p, uap->fd, flags, &kectx->kec_fp, &kq);
8145 	}
8146 	if (__improbable(error)) {
8147 		return error;
8148 	}
8149 
8150 	return kevent_internal(kq, uap->changelist, uap->nchanges,
8151 	           uap->eventlist, uap->nevents, flags, kectx, retval,
8152 	           /*legacy*/ true);
8153 }
8154 
8155 /*!
8156  * @function kevent
8157  *
8158  * @brief
8159  * The legacy kevent() syscall.
8160  */
8161 int
kevent(struct proc * p,struct kevent_args * uap,int32_t * retval)8162 kevent(struct proc *p, struct kevent_args *uap, int32_t *retval)
8163 {
8164 	struct kevent64_args args = {
8165 		.fd         = uap->fd,
8166 		.changelist = uap->changelist,
8167 		.nchanges   = uap->nchanges,
8168 		.eventlist  = uap->eventlist,
8169 		.nevents    = uap->nevents,
8170 		.timeout    = uap->timeout,
8171 	};
8172 
8173 	return kevent_legacy_internal(p, &args, retval, KEVENT_FLAG_LEGACY32);
8174 }
8175 
8176 /*!
8177  * @function kevent64
8178  *
8179  * @brief
8180  * The legacy kevent64() syscall.
8181  */
8182 int
kevent64(struct proc * p,struct kevent64_args * uap,int32_t * retval)8183 kevent64(struct proc *p, struct kevent64_args *uap, int32_t *retval)
8184 {
8185 	int flags = (uap->flags & KEVENT_FLAG_USER) | KEVENT_FLAG_LEGACY64;
8186 	return kevent_legacy_internal(p, uap, retval, flags);
8187 }
8188 
8189 #pragma mark - socket interface
8190 
8191 #if SOCKETS
8192 #include <sys/param.h>
8193 #include <sys/socket.h>
8194 #include <sys/protosw.h>
8195 #include <sys/domain.h>
8196 #include <sys/mbuf.h>
8197 #include <sys/kern_event.h>
8198 #include <sys/malloc.h>
8199 #include <sys/sys_domain.h>
8200 #include <sys/syslog.h>
8201 
/* Round x up to the next multiple of sizeof(u_int64_t). */
#ifndef ROUNDUP64
#define ROUNDUP64(x) P2ROUNDUP((x), sizeof(u_int64_t))
#endif

/*
 * Address found ROUNDUP64(n) bytes past p, as a void pointer.
 * The expansion is fully parenthesized so that it stays a single
 * operand wherever it is used (e.g. under sizeof or a postfix operator).
 */
#ifndef ADVANCE64
#define ADVANCE64(p, n) ((void *)((char *)(p) + ROUNDUP64(n)))
#endif
8209 
8210 static LCK_GRP_DECLARE(kev_lck_grp, "Kernel Event Protocol");
8211 static LCK_RW_DECLARE(kev_rwlock, &kev_lck_grp);
8212 
8213 static int kev_attach(struct socket *so, int proto, struct proc *p);
8214 static int kev_detach(struct socket *so);
8215 static int kev_control(struct socket *so, u_long cmd, caddr_t data,
8216     struct ifnet *ifp, struct proc *p);
8217 static lck_mtx_t * event_getlock(struct socket *, int);
8218 static int event_lock(struct socket *, int, void *);
8219 static int event_unlock(struct socket *, int, void *);
8220 
8221 static int event_sofreelastref(struct socket *);
8222 static void kev_delete(struct kern_event_pcb *);
8223 
8224 static struct pr_usrreqs event_usrreqs = {
8225 	.pru_attach =           kev_attach,
8226 	.pru_control =          kev_control,
8227 	.pru_detach =           kev_detach,
8228 	.pru_soreceive =        soreceive,
8229 };
8230 
8231 static struct protosw eventsw[] = {
8232 	{
8233 		.pr_type =              SOCK_RAW,
8234 		.pr_protocol =          SYSPROTO_EVENT,
8235 		.pr_flags =             PR_ATOMIC,
8236 		.pr_usrreqs =           &event_usrreqs,
8237 		.pr_lock =              event_lock,
8238 		.pr_unlock =            event_unlock,
8239 		.pr_getlock =           event_getlock,
8240 	}
8241 };
8242 
8243 __private_extern__ int kevt_getstat SYSCTL_HANDLER_ARGS;
8244 __private_extern__ int kevt_pcblist SYSCTL_HANDLER_ARGS;
8245 
8246 SYSCTL_NODE(_net_systm, OID_AUTO, kevt,
8247     CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Kernel event family");
8248 
8249 struct kevtstat kevtstat;
8250 SYSCTL_PROC(_net_systm_kevt, OID_AUTO, stats,
8251     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
8252     kevt_getstat, "S,kevtstat", "");
8253 
8254 SYSCTL_PROC(_net_systm_kevt, OID_AUTO, pcblist,
8255     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
8256     kevt_pcblist, "S,xkevtpcb", "");
8257 
8258 static lck_mtx_t *
event_getlock(struct socket * so,int flags)8259 event_getlock(struct socket *so, int flags)
8260 {
8261 #pragma unused(flags)
8262 	struct kern_event_pcb *ev_pcb = (struct kern_event_pcb *)so->so_pcb;
8263 
8264 	if (so->so_pcb != NULL) {
8265 		if (so->so_usecount < 0) {
8266 			panic("%s: so=%p usecount=%d lrh= %s", __func__,
8267 			    so, so->so_usecount, solockhistory_nr(so));
8268 		}
8269 		/* NOTREACHED */
8270 	} else {
8271 		panic("%s: so=%p NULL NO so_pcb %s", __func__,
8272 		    so, solockhistory_nr(so));
8273 		/* NOTREACHED */
8274 	}
8275 	return &ev_pcb->evp_mtx;
8276 }
8277 
8278 static int
event_lock(struct socket * so,int refcount,void * lr)8279 event_lock(struct socket *so, int refcount, void *lr)
8280 {
8281 	void *lr_saved;
8282 
8283 	if (lr == NULL) {
8284 		lr_saved = __builtin_return_address(0);
8285 	} else {
8286 		lr_saved = lr;
8287 	}
8288 
8289 	if (so->so_pcb != NULL) {
8290 		lck_mtx_lock(&((struct kern_event_pcb *)so->so_pcb)->evp_mtx);
8291 	} else {
8292 		panic("%s: so=%p NO PCB! lr=%p lrh= %s", __func__,
8293 		    so, lr_saved, solockhistory_nr(so));
8294 		/* NOTREACHED */
8295 	}
8296 
8297 	if (so->so_usecount < 0) {
8298 		panic("%s: so=%p so_pcb=%p lr=%p ref=%d lrh= %s", __func__,
8299 		    so, so->so_pcb, lr_saved, so->so_usecount,
8300 		    solockhistory_nr(so));
8301 		/* NOTREACHED */
8302 	}
8303 
8304 	if (refcount) {
8305 		so->so_usecount++;
8306 	}
8307 
8308 	so->lock_lr[so->next_lock_lr] = lr_saved;
8309 	so->next_lock_lr = (so->next_lock_lr + 1) % SO_LCKDBG_MAX;
8310 	return 0;
8311 }
8312 
8313 static int
event_unlock(struct socket * so,int refcount,void * lr)8314 event_unlock(struct socket *so, int refcount, void *lr)
8315 {
8316 	void *lr_saved;
8317 	lck_mtx_t *mutex_held;
8318 
8319 	if (lr == NULL) {
8320 		lr_saved = __builtin_return_address(0);
8321 	} else {
8322 		lr_saved = lr;
8323 	}
8324 
8325 	if (refcount) {
8326 		so->so_usecount--;
8327 	}
8328 	if (so->so_usecount < 0) {
8329 		panic("%s: so=%p usecount=%d lrh= %s", __func__,
8330 		    so, so->so_usecount, solockhistory_nr(so));
8331 		/* NOTREACHED */
8332 	}
8333 	if (so->so_pcb == NULL) {
8334 		panic("%s: so=%p NO PCB usecount=%d lr=%p lrh= %s", __func__,
8335 		    so, so->so_usecount, (void *)lr_saved,
8336 		    solockhistory_nr(so));
8337 		/* NOTREACHED */
8338 	}
8339 	mutex_held = (&((struct kern_event_pcb *)so->so_pcb)->evp_mtx);
8340 
8341 	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
8342 	so->unlock_lr[so->next_unlock_lr] = lr_saved;
8343 	so->next_unlock_lr = (so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
8344 
8345 	if (so->so_usecount == 0) {
8346 		VERIFY(so->so_flags & SOF_PCBCLEARING);
8347 		event_sofreelastref(so);
8348 	} else {
8349 		lck_mtx_unlock(mutex_held);
8350 	}
8351 
8352 	return 0;
8353 }
8354 
8355 static int
event_sofreelastref(struct socket * so)8356 event_sofreelastref(struct socket *so)
8357 {
8358 	struct kern_event_pcb *ev_pcb = (struct kern_event_pcb *)so->so_pcb;
8359 
8360 	LCK_MTX_ASSERT(&(ev_pcb->evp_mtx), LCK_MTX_ASSERT_OWNED);
8361 
8362 	so->so_pcb = NULL;
8363 
8364 	/*
8365 	 * Disable upcall in the event another thread is in kev_post_msg()
8366 	 * appending record to the receive socket buffer, since sbwakeup()
8367 	 * may release the socket lock otherwise.
8368 	 */
8369 	so->so_rcv.sb_flags &= ~SB_UPCALL;
8370 	so->so_snd.sb_flags &= ~SB_UPCALL;
8371 	so->so_event = sonullevent;
8372 	lck_mtx_unlock(&(ev_pcb->evp_mtx));
8373 
8374 	LCK_MTX_ASSERT(&(ev_pcb->evp_mtx), LCK_MTX_ASSERT_NOTOWNED);
8375 	lck_rw_lock_exclusive(&kev_rwlock);
8376 	LIST_REMOVE(ev_pcb, evp_link);
8377 	kevtstat.kes_pcbcount--;
8378 	kevtstat.kes_gencnt++;
8379 	lck_rw_done(&kev_rwlock);
8380 	kev_delete(ev_pcb);
8381 
8382 	sofreelastref(so, 1);
8383 	return 0;
8384 }
8385 
8386 static int event_proto_count = (sizeof(eventsw) / sizeof(struct protosw));
8387 
8388 static
8389 struct kern_event_head kern_event_head;
8390 
8391 static u_int32_t static_event_id = 0;
8392 
8393 static KALLOC_TYPE_DEFINE(ev_pcb_zone, struct kern_event_pcb, NET_KT_DEFAULT);
8394 
8395 /*
8396  * Install the protosw's for the NKE manager.  Invoked at extension load time
8397  */
8398 void
kern_event_init(struct domain * dp)8399 kern_event_init(struct domain *dp)
8400 {
8401 	struct protosw *pr;
8402 	int i;
8403 
8404 	VERIFY(!(dp->dom_flags & DOM_INITIALIZED));
8405 	VERIFY(dp == systemdomain);
8406 
8407 	for (i = 0, pr = &eventsw[0]; i < event_proto_count; i++, pr++) {
8408 		net_add_proto(pr, dp, 1);
8409 	}
8410 }
8411 
8412 static int
kev_attach(struct socket * so,__unused int proto,__unused struct proc * p)8413 kev_attach(struct socket *so, __unused int proto, __unused struct proc *p)
8414 {
8415 	int error = 0;
8416 	struct kern_event_pcb *ev_pcb;
8417 
8418 	error = soreserve(so, KEV_SNDSPACE, KEV_RECVSPACE);
8419 	if (error != 0) {
8420 		return error;
8421 	}
8422 
8423 	ev_pcb = zalloc_flags(ev_pcb_zone, Z_WAITOK | Z_ZERO);
8424 	lck_mtx_init(&ev_pcb->evp_mtx, &kev_lck_grp, LCK_ATTR_NULL);
8425 
8426 	ev_pcb->evp_socket = so;
8427 	ev_pcb->evp_vendor_code_filter = 0xffffffff;
8428 
8429 	so->so_pcb = (caddr_t) ev_pcb;
8430 	lck_rw_lock_exclusive(&kev_rwlock);
8431 	LIST_INSERT_HEAD(&kern_event_head, ev_pcb, evp_link);
8432 	kevtstat.kes_pcbcount++;
8433 	kevtstat.kes_gencnt++;
8434 	lck_rw_done(&kev_rwlock);
8435 
8436 	return error;
8437 }
8438 
8439 static void
kev_delete(struct kern_event_pcb * ev_pcb)8440 kev_delete(struct kern_event_pcb *ev_pcb)
8441 {
8442 	VERIFY(ev_pcb != NULL);
8443 	lck_mtx_destroy(&ev_pcb->evp_mtx, &kev_lck_grp);
8444 	zfree(ev_pcb_zone, ev_pcb);
8445 }
8446 
8447 static int
kev_detach(struct socket * so)8448 kev_detach(struct socket *so)
8449 {
8450 	struct kern_event_pcb *ev_pcb = (struct kern_event_pcb *) so->so_pcb;
8451 
8452 	if (ev_pcb != NULL) {
8453 		soisdisconnected(so);
8454 		so->so_flags |= SOF_PCBCLEARING;
8455 	}
8456 
8457 	return 0;
8458 }
8459 
8460 /*
8461  * For now, kev_vendor_code and mbuf_tags use the same
8462  * mechanism.
8463  */
8464 errno_t
kev_vendor_code_find(const char * string,u_int32_t * out_vendor_code)8465 kev_vendor_code_find(
8466 	const char      *string,
8467 	u_int32_t       *out_vendor_code)
8468 {
8469 	if (strlen(string) >= KEV_VENDOR_CODE_MAX_STR_LEN) {
8470 		return EINVAL;
8471 	}
8472 	return net_str_id_find_internal(string, out_vendor_code,
8473 	           NSI_VENDOR_CODE, 1);
8474 }
8475 
8476 errno_t
kev_msg_post(struct kev_msg * event_msg)8477 kev_msg_post(struct kev_msg *event_msg)
8478 {
8479 	mbuf_tag_id_t min_vendor, max_vendor;
8480 
8481 	net_str_id_first_last(&min_vendor, &max_vendor, NSI_VENDOR_CODE);
8482 
8483 	if (event_msg == NULL) {
8484 		return EINVAL;
8485 	}
8486 
8487 	/*
8488 	 * Limit third parties to posting events for registered vendor codes
8489 	 * only
8490 	 */
8491 	if (event_msg->vendor_code < min_vendor ||
8492 	    event_msg->vendor_code > max_vendor) {
8493 		os_atomic_inc(&kevtstat.kes_badvendor, relaxed);
8494 		return EINVAL;
8495 	}
8496 	return kev_post_msg(event_msg);
8497 }
8498 
8499 static int
kev_post_msg_internal(struct kev_msg * event_msg,int wait)8500 kev_post_msg_internal(struct kev_msg *event_msg, int wait)
8501 {
8502 	struct mbuf *m, *m2;
8503 	struct kern_event_pcb *ev_pcb;
8504 	struct kern_event_msg *ev;
8505 	char *tmp;
8506 	u_int32_t total_size;
8507 	int i;
8508 
8509 #if SKYWALK && defined(XNU_TARGET_OS_OSX)
8510 	/*
8511 	 * Special hook for ALF state updates
8512 	 */
8513 	if (event_msg->vendor_code == KEV_VENDOR_APPLE &&
8514 	    event_msg->kev_class == KEV_NKE_CLASS &&
8515 	    event_msg->kev_subclass == KEV_NKE_ALF_SUBCLASS &&
8516 	    event_msg->event_code == KEV_NKE_ALF_STATE_CHANGED) {
8517 #if (DEBUG || DEVELOPMENT)
8518 		os_log_info(OS_LOG_DEFAULT, "KEV_NKE_ALF_STATE_CHANGED posted");
8519 #endif /* DEBUG || DEVELOPMENT */
8520 		net_filter_event_mark(NET_FILTER_EVENT_ALF,
8521 		    net_check_compatible_alf());
8522 	}
8523 #endif /* SKYWALK && XNU_TARGET_OS_OSX */
8524 
8525 	/* Verify the message is small enough to fit in one mbuf w/o cluster */
8526 	total_size = KEV_MSG_HEADER_SIZE;
8527 
8528 	for (i = 0; i < 5; i++) {
8529 		if (event_msg->dv[i].data_length == 0) {
8530 			break;
8531 		}
8532 		total_size += event_msg->dv[i].data_length;
8533 	}
8534 
8535 	if (total_size > MLEN) {
8536 		os_atomic_inc(&kevtstat.kes_toobig, relaxed);
8537 		return EMSGSIZE;
8538 	}
8539 
8540 	m = m_get(wait, MT_DATA);
8541 	if (m == 0) {
8542 		os_atomic_inc(&kevtstat.kes_nomem, relaxed);
8543 		return ENOMEM;
8544 	}
8545 	ev = mtod(m, struct kern_event_msg *);
8546 	total_size = KEV_MSG_HEADER_SIZE;
8547 
8548 	tmp = (char *) &ev->event_data[0];
8549 	for (i = 0; i < 5; i++) {
8550 		if (event_msg->dv[i].data_length == 0) {
8551 			break;
8552 		}
8553 
8554 		total_size += event_msg->dv[i].data_length;
8555 		bcopy(event_msg->dv[i].data_ptr, tmp,
8556 		    event_msg->dv[i].data_length);
8557 		tmp += event_msg->dv[i].data_length;
8558 	}
8559 
8560 	ev->id = ++static_event_id;
8561 	ev->total_size   = total_size;
8562 	ev->vendor_code  = event_msg->vendor_code;
8563 	ev->kev_class    = event_msg->kev_class;
8564 	ev->kev_subclass = event_msg->kev_subclass;
8565 	ev->event_code   = event_msg->event_code;
8566 
8567 	m->m_len = total_size;
8568 	lck_rw_lock_shared(&kev_rwlock);
8569 	for (ev_pcb = LIST_FIRST(&kern_event_head);
8570 	    ev_pcb;
8571 	    ev_pcb = LIST_NEXT(ev_pcb, evp_link)) {
8572 		lck_mtx_lock(&ev_pcb->evp_mtx);
8573 		if (ev_pcb->evp_socket->so_pcb == NULL) {
8574 			lck_mtx_unlock(&ev_pcb->evp_mtx);
8575 			continue;
8576 		}
8577 		if (ev_pcb->evp_vendor_code_filter != KEV_ANY_VENDOR) {
8578 			if (ev_pcb->evp_vendor_code_filter != ev->vendor_code) {
8579 				lck_mtx_unlock(&ev_pcb->evp_mtx);
8580 				continue;
8581 			}
8582 
8583 			if (ev_pcb->evp_class_filter != KEV_ANY_CLASS) {
8584 				if (ev_pcb->evp_class_filter != ev->kev_class) {
8585 					lck_mtx_unlock(&ev_pcb->evp_mtx);
8586 					continue;
8587 				}
8588 
8589 				if ((ev_pcb->evp_subclass_filter !=
8590 				    KEV_ANY_SUBCLASS) &&
8591 				    (ev_pcb->evp_subclass_filter !=
8592 				    ev->kev_subclass)) {
8593 					lck_mtx_unlock(&ev_pcb->evp_mtx);
8594 					continue;
8595 				}
8596 			}
8597 		}
8598 
8599 		m2 = m_copym(m, 0, m->m_len, wait);
8600 		if (m2 == 0) {
8601 			os_atomic_inc(&kevtstat.kes_nomem, relaxed);
8602 			m_free(m);
8603 			lck_mtx_unlock(&ev_pcb->evp_mtx);
8604 			lck_rw_done(&kev_rwlock);
8605 			return ENOMEM;
8606 		}
8607 		if (sbappendrecord(&ev_pcb->evp_socket->so_rcv, m2)) {
8608 			/*
8609 			 * We use "m" for the socket stats as it would be
8610 			 * unsafe to use "m2"
8611 			 */
8612 			so_inc_recv_data_stat(ev_pcb->evp_socket,
8613 			    1, m->m_len, MBUF_TC_BE);
8614 
8615 			sorwakeup(ev_pcb->evp_socket);
8616 			os_atomic_inc(&kevtstat.kes_posted, relaxed);
8617 		} else {
8618 			os_atomic_inc(&kevtstat.kes_fullsock, relaxed);
8619 		}
8620 		lck_mtx_unlock(&ev_pcb->evp_mtx);
8621 	}
8622 	m_free(m);
8623 	lck_rw_done(&kev_rwlock);
8624 
8625 	return 0;
8626 }
8627 
8628 int
kev_post_msg(struct kev_msg * event_msg)8629 kev_post_msg(struct kev_msg *event_msg)
8630 {
8631 	return kev_post_msg_internal(event_msg, M_WAIT);
8632 }
8633 
8634 int
kev_post_msg_nowait(struct kev_msg * event_msg)8635 kev_post_msg_nowait(struct kev_msg *event_msg)
8636 {
8637 	return kev_post_msg_internal(event_msg, M_NOWAIT);
8638 }
8639 
8640 static int
kev_control(struct socket * so,u_long cmd,caddr_t data,__unused struct ifnet * ifp,__unused struct proc * p)8641 kev_control(struct socket *so,
8642     u_long cmd,
8643     caddr_t data,
8644     __unused struct ifnet *ifp,
8645     __unused struct proc *p)
8646 {
8647 	struct kev_request *kev_req = (struct kev_request *) data;
8648 	struct kern_event_pcb  *ev_pcb;
8649 	struct kev_vendor_code *kev_vendor;
8650 	u_int32_t  *id_value = (u_int32_t *) data;
8651 
8652 	switch (cmd) {
8653 	case SIOCGKEVID:
8654 		*id_value = static_event_id;
8655 		break;
8656 	case SIOCSKEVFILT:
8657 		ev_pcb = (struct kern_event_pcb *) so->so_pcb;
8658 		ev_pcb->evp_vendor_code_filter = kev_req->vendor_code;
8659 		ev_pcb->evp_class_filter = kev_req->kev_class;
8660 		ev_pcb->evp_subclass_filter  = kev_req->kev_subclass;
8661 		break;
8662 	case SIOCGKEVFILT:
8663 		ev_pcb = (struct kern_event_pcb *) so->so_pcb;
8664 		kev_req->vendor_code = ev_pcb->evp_vendor_code_filter;
8665 		kev_req->kev_class   = ev_pcb->evp_class_filter;
8666 		kev_req->kev_subclass = ev_pcb->evp_subclass_filter;
8667 		break;
8668 	case SIOCGKEVVENDOR:
8669 		kev_vendor = (struct kev_vendor_code *)data;
8670 		/* Make sure string is NULL terminated */
8671 		kev_vendor->vendor_string[KEV_VENDOR_CODE_MAX_STR_LEN - 1] = 0;
8672 		return net_str_id_find_internal(kev_vendor->vendor_string,
8673 		           &kev_vendor->vendor_code, NSI_VENDOR_CODE, 0);
8674 	default:
8675 		return ENOTSUP;
8676 	}
8677 
8678 	return 0;
8679 }
8680 
8681 int
8682 kevt_getstat SYSCTL_HANDLER_ARGS
8683 {
8684 #pragma unused(oidp, arg1, arg2)
8685 	int error = 0;
8686 
8687 	lck_rw_lock_shared(&kev_rwlock);
8688 
8689 	if (req->newptr != USER_ADDR_NULL) {
8690 		error = EPERM;
8691 		goto done;
8692 	}
8693 	if (req->oldptr == USER_ADDR_NULL) {
8694 		req->oldidx = sizeof(struct kevtstat);
8695 		goto done;
8696 	}
8697 
8698 	error = SYSCTL_OUT(req, &kevtstat,
8699 	    MIN(sizeof(struct kevtstat), req->oldlen));
8700 done:
8701 	lck_rw_done(&kev_rwlock);
8702 
8703 	return error;
8704 }
8705 
8706 __private_extern__ int
8707 kevt_pcblist SYSCTL_HANDLER_ARGS
8708 {
8709 #pragma unused(oidp, arg1, arg2)
8710 	int error = 0;
8711 	uint64_t n, i;
8712 	struct xsystmgen xsg;
8713 	void *buf = NULL;
8714 	size_t item_size = ROUNDUP64(sizeof(struct xkevtpcb)) +
8715 	    ROUNDUP64(sizeof(struct xsocket_n)) +
8716 	    2 * ROUNDUP64(sizeof(struct xsockbuf_n)) +
8717 	    ROUNDUP64(sizeof(struct xsockstat_n));
8718 	struct kern_event_pcb  *ev_pcb;
8719 
8720 	buf = kalloc_data(item_size, Z_WAITOK | Z_ZERO);
8721 	if (buf == NULL) {
8722 		return ENOMEM;
8723 	}
8724 
8725 	lck_rw_lock_shared(&kev_rwlock);
8726 
8727 	n = kevtstat.kes_pcbcount;
8728 
8729 	if (req->oldptr == USER_ADDR_NULL) {
8730 		req->oldidx = (size_t) ((n + n / 8) * item_size);
8731 		goto done;
8732 	}
8733 	if (req->newptr != USER_ADDR_NULL) {
8734 		error = EPERM;
8735 		goto done;
8736 	}
8737 	bzero(&xsg, sizeof(xsg));
8738 	xsg.xg_len = sizeof(xsg);
8739 	xsg.xg_count = n;
8740 	xsg.xg_gen = kevtstat.kes_gencnt;
8741 	xsg.xg_sogen = so_gencnt;
8742 	error = SYSCTL_OUT(req, &xsg, sizeof(xsg));
8743 	if (error) {
8744 		goto done;
8745 	}
8746 	/*
8747 	 * We are done if there is no pcb
8748 	 */
8749 	if (n == 0) {
8750 		goto done;
8751 	}
8752 
8753 	i = 0;
8754 	for (i = 0, ev_pcb = LIST_FIRST(&kern_event_head);
8755 	    i < n && ev_pcb != NULL;
8756 	    i++, ev_pcb = LIST_NEXT(ev_pcb, evp_link)) {
8757 		struct xkevtpcb *xk = (struct xkevtpcb *)buf;
8758 		struct xsocket_n *xso = (struct xsocket_n *)
8759 		    ADVANCE64(xk, sizeof(*xk));
8760 		struct xsockbuf_n *xsbrcv = (struct xsockbuf_n *)
8761 		    ADVANCE64(xso, sizeof(*xso));
8762 		struct xsockbuf_n *xsbsnd = (struct xsockbuf_n *)
8763 		    ADVANCE64(xsbrcv, sizeof(*xsbrcv));
8764 		struct xsockstat_n *xsostats = (struct xsockstat_n *)
8765 		    ADVANCE64(xsbsnd, sizeof(*xsbsnd));
8766 
8767 		bzero(buf, item_size);
8768 
8769 		lck_mtx_lock(&ev_pcb->evp_mtx);
8770 
8771 		xk->kep_len = sizeof(struct xkevtpcb);
8772 		xk->kep_kind = XSO_EVT;
8773 		xk->kep_evtpcb = (uint64_t)VM_KERNEL_ADDRPERM(ev_pcb);
8774 		xk->kep_vendor_code_filter = ev_pcb->evp_vendor_code_filter;
8775 		xk->kep_class_filter = ev_pcb->evp_class_filter;
8776 		xk->kep_subclass_filter = ev_pcb->evp_subclass_filter;
8777 
8778 		sotoxsocket_n(ev_pcb->evp_socket, xso);
8779 		sbtoxsockbuf_n(ev_pcb->evp_socket ?
8780 		    &ev_pcb->evp_socket->so_rcv : NULL, xsbrcv);
8781 		sbtoxsockbuf_n(ev_pcb->evp_socket ?
8782 		    &ev_pcb->evp_socket->so_snd : NULL, xsbsnd);
8783 		sbtoxsockstat_n(ev_pcb->evp_socket, xsostats);
8784 
8785 		lck_mtx_unlock(&ev_pcb->evp_mtx);
8786 
8787 		error = SYSCTL_OUT(req, buf, item_size);
8788 	}
8789 
8790 	if (error == 0) {
8791 		/*
8792 		 * Give the user an updated idea of our state.
8793 		 * If the generation differs from what we told
8794 		 * her before, she knows that something happened
8795 		 * while we were processing this request, and it
8796 		 * might be necessary to retry.
8797 		 */
8798 		bzero(&xsg, sizeof(xsg));
8799 		xsg.xg_len = sizeof(xsg);
8800 		xsg.xg_count = n;
8801 		xsg.xg_gen = kevtstat.kes_gencnt;
8802 		xsg.xg_sogen = so_gencnt;
8803 		error = SYSCTL_OUT(req, &xsg, sizeof(xsg));
8804 		if (error) {
8805 			goto done;
8806 		}
8807 	}
8808 
8809 done:
8810 	lck_rw_done(&kev_rwlock);
8811 
8812 	kfree_data(buf, item_size);
8813 	return error;
8814 }
8815 
8816 #endif /* SOCKETS */
8817 
8818 
8819 int
fill_kqueueinfo(kqueue_t kqu,struct kqueue_info * kinfo)8820 fill_kqueueinfo(kqueue_t kqu, struct kqueue_info * kinfo)
8821 {
8822 	struct vinfo_stat * st;
8823 
8824 	st = &kinfo->kq_stat;
8825 
8826 	st->vst_size = kqu.kq->kq_count;
8827 	if (kqu.kq->kq_state & KQ_KEV_QOS) {
8828 		st->vst_blksize = sizeof(struct kevent_qos_s);
8829 	} else if (kqu.kq->kq_state & KQ_KEV64) {
8830 		st->vst_blksize = sizeof(struct kevent64_s);
8831 	} else {
8832 		st->vst_blksize = sizeof(struct kevent);
8833 	}
8834 	st->vst_mode = S_IFIFO;
8835 	st->vst_ino = (kqu.kq->kq_state & KQ_DYNAMIC) ?
8836 	    kqu.kqwl->kqwl_dynamicid : 0;
8837 
8838 	/* flags exported to libproc as PROC_KQUEUE_* (sys/proc_info.h) */
8839 #define PROC_KQUEUE_MASK (KQ_SLEEP|KQ_KEV32|KQ_KEV64|KQ_KEV_QOS|KQ_WORKQ|KQ_WORKLOOP)
8840 	static_assert(PROC_KQUEUE_SLEEP == KQ_SLEEP);
8841 	static_assert(PROC_KQUEUE_32 == KQ_KEV32);
8842 	static_assert(PROC_KQUEUE_64 == KQ_KEV64);
8843 	static_assert(PROC_KQUEUE_QOS == KQ_KEV_QOS);
8844 	static_assert(PROC_KQUEUE_WORKQ == KQ_WORKQ);
8845 	static_assert(PROC_KQUEUE_WORKLOOP == KQ_WORKLOOP);
8846 	kinfo->kq_state = kqu.kq->kq_state & PROC_KQUEUE_MASK;
8847 	if ((kqu.kq->kq_state & (KQ_WORKLOOP | KQ_WORKQ)) == 0) {
8848 		if (kqu.kqf->kqf_sel.si_flags & SI_RECORDED) {
8849 			kinfo->kq_state |= PROC_KQUEUE_SELECT;
8850 		}
8851 	}
8852 
8853 	return 0;
8854 }
8855 
8856 static int
fill_kqueue_dyninfo(struct kqworkloop * kqwl,struct kqueue_dyninfo * kqdi)8857 fill_kqueue_dyninfo(struct kqworkloop *kqwl, struct kqueue_dyninfo *kqdi)
8858 {
8859 	workq_threadreq_t kqr = &kqwl->kqwl_request;
8860 	workq_threadreq_param_t trp = {};
8861 	int err;
8862 
8863 	if ((kqwl->kqwl_state & KQ_WORKLOOP) == 0) {
8864 		return EINVAL;
8865 	}
8866 
8867 	if ((err = fill_kqueueinfo(&kqwl->kqwl_kqueue, &kqdi->kqdi_info))) {
8868 		return err;
8869 	}
8870 
8871 	kqlock(kqwl);
8872 
8873 	kqdi->kqdi_servicer = thread_tid(kqr_thread(kqr));
8874 	kqdi->kqdi_owner = thread_tid(kqwl->kqwl_owner);
8875 	kqdi->kqdi_request_state = kqr->tr_state;
8876 	kqdi->kqdi_async_qos = kqr->tr_kq_qos_index;
8877 	kqdi->kqdi_events_qos = kqr->tr_kq_override_index;
8878 	kqdi->kqdi_sync_waiters = 0;
8879 	kqdi->kqdi_sync_waiter_qos = 0;
8880 
8881 	trp.trp_value = kqwl->kqwl_params;
8882 	if (trp.trp_flags & TRP_PRIORITY) {
8883 		kqdi->kqdi_pri = trp.trp_pri;
8884 	} else {
8885 		kqdi->kqdi_pri = 0;
8886 	}
8887 
8888 	if (trp.trp_flags & TRP_POLICY) {
8889 		kqdi->kqdi_pol = trp.trp_pol;
8890 	} else {
8891 		kqdi->kqdi_pol = 0;
8892 	}
8893 
8894 	if (trp.trp_flags & TRP_CPUPERCENT) {
8895 		kqdi->kqdi_cpupercent = trp.trp_cpupercent;
8896 	} else {
8897 		kqdi->kqdi_cpupercent = 0;
8898 	}
8899 
8900 	kqunlock(kqwl);
8901 
8902 	return 0;
8903 }
8904 
8905 
8906 static unsigned long
kevent_extinfo_emit(struct kqueue * kq,struct knote * kn,struct kevent_extinfo * buf,unsigned long buflen,unsigned long nknotes)8907 kevent_extinfo_emit(struct kqueue *kq, struct knote *kn, struct kevent_extinfo *buf,
8908     unsigned long buflen, unsigned long nknotes)
8909 {
8910 	for (; kn; kn = SLIST_NEXT(kn, kn_link)) {
8911 		if (kq == knote_get_kq(kn)) {
8912 			if (nknotes < buflen) {
8913 				struct kevent_extinfo *info = &buf[nknotes];
8914 
8915 				kqlock(kq);
8916 
8917 				if (knote_fops(kn)->f_sanitized_copyout) {
8918 					knote_fops(kn)->f_sanitized_copyout(kn, &info->kqext_kev);
8919 				} else {
8920 					info->kqext_kev         = *(struct kevent_qos_s *)&kn->kn_kevent;
8921 				}
8922 
8923 				if (knote_has_qos(kn)) {
8924 					info->kqext_kev.qos =
8925 					    _pthread_priority_thread_qos_fast(kn->kn_qos);
8926 				} else {
8927 					info->kqext_kev.qos = kn->kn_qos_override;
8928 				}
8929 				info->kqext_kev.filter |= 0xff00; /* sign extend filter */
8930 				info->kqext_kev.xflags  = 0; /* this is where sfflags lives */
8931 				info->kqext_kev.data    = 0; /* this is where sdata lives */
8932 				info->kqext_sdata       = kn->kn_sdata;
8933 				info->kqext_status      = kn->kn_status;
8934 				info->kqext_sfflags     = kn->kn_sfflags;
8935 
8936 				kqunlock(kq);
8937 			}
8938 
8939 			/* we return total number of knotes, which may be more than requested */
8940 			nknotes++;
8941 		}
8942 	}
8943 
8944 	return nknotes;
8945 }
8946 
8947 int
kevent_copyout_proc_dynkqids(void * proc,user_addr_t ubuf,uint32_t ubufsize,int32_t * nkqueues_out)8948 kevent_copyout_proc_dynkqids(void *proc, user_addr_t ubuf, uint32_t ubufsize,
8949     int32_t *nkqueues_out)
8950 {
8951 	proc_t p = (proc_t)proc;
8952 	struct filedesc *fdp = &p->p_fd;
8953 	unsigned int nkqueues = 0;
8954 	unsigned long ubuflen = ubufsize / sizeof(kqueue_id_t);
8955 	size_t buflen, bufsize;
8956 	kqueue_id_t *kq_ids = NULL;
8957 	int err = 0;
8958 
8959 	assert(p != NULL);
8960 
8961 	if (ubuf == USER_ADDR_NULL && ubufsize != 0) {
8962 		err = EINVAL;
8963 		goto out;
8964 	}
8965 
8966 	buflen = MIN(ubuflen, PROC_PIDDYNKQUEUES_MAX);
8967 
8968 	if (ubuflen != 0) {
8969 		if (os_mul_overflow(sizeof(kqueue_id_t), buflen, &bufsize)) {
8970 			err = ERANGE;
8971 			goto out;
8972 		}
8973 		kq_ids = (kqueue_id_t *)kalloc_data(bufsize, Z_WAITOK | Z_ZERO);
8974 		if (!kq_ids) {
8975 			err = ENOMEM;
8976 			goto out;
8977 		}
8978 	}
8979 
8980 	kqhash_lock(fdp);
8981 
8982 	u_long kqhashmask = fdp->fd_kqhashmask;
8983 	if (kqhashmask > 0) {
8984 		for (uint32_t i = 0; i < kqhashmask + 1; i++) {
8985 			struct kqworkloop *kqwl;
8986 
8987 			LIST_FOREACH(kqwl, &fdp->fd_kqhash[i], kqwl_hashlink) {
8988 				/* report the number of kqueues, even if they don't all fit */
8989 				if (nkqueues < buflen) {
8990 					kq_ids[nkqueues] = kqwl->kqwl_dynamicid;
8991 				}
8992 				nkqueues++;
8993 			}
8994 
8995 			/*
8996 			 * Drop the kqhash lock and take it again to give some breathing room
8997 			 */
8998 			kqhash_unlock(fdp);
8999 			kqhash_lock(fdp);
9000 
9001 			/*
9002 			 * Reevaluate to see if we have raced with someone who changed this -
9003 			 * if we have, we should bail out with the set of info captured so far
9004 			 */
9005 			if (fdp->fd_kqhashmask != kqhashmask) {
9006 				break;
9007 			}
9008 		}
9009 	}
9010 
9011 	kqhash_unlock(fdp);
9012 
9013 	if (kq_ids) {
9014 		size_t copysize;
9015 		if (os_mul_overflow(sizeof(kqueue_id_t), MIN(buflen, nkqueues), &copysize)) {
9016 			err = ERANGE;
9017 			goto out;
9018 		}
9019 
9020 		assert(ubufsize >= copysize);
9021 		err = copyout(kq_ids, ubuf, copysize);
9022 	}
9023 
9024 out:
9025 	if (kq_ids) {
9026 		kfree_data(kq_ids, bufsize);
9027 	}
9028 
9029 	if (!err) {
9030 		*nkqueues_out = (int)min(nkqueues, PROC_PIDDYNKQUEUES_MAX);
9031 	}
9032 	return err;
9033 }
9034 
9035 int
kevent_copyout_dynkqinfo(void * proc,kqueue_id_t kq_id,user_addr_t ubuf,uint32_t ubufsize,int32_t * size_out)9036 kevent_copyout_dynkqinfo(void *proc, kqueue_id_t kq_id, user_addr_t ubuf,
9037     uint32_t ubufsize, int32_t *size_out)
9038 {
9039 	proc_t p = (proc_t)proc;
9040 	struct kqworkloop *kqwl;
9041 	int err = 0;
9042 	struct kqueue_dyninfo kqdi = { };
9043 
9044 	assert(p != NULL);
9045 
9046 	if (ubufsize < sizeof(struct kqueue_info)) {
9047 		return ENOBUFS;
9048 	}
9049 
9050 	kqwl = kqworkloop_hash_lookup_and_retain(&p->p_fd, kq_id);
9051 	if (!kqwl) {
9052 		return ESRCH;
9053 	}
9054 
9055 	/*
9056 	 * backward compatibility: allow the argument to this call to only be
9057 	 * a struct kqueue_info
9058 	 */
9059 	if (ubufsize >= sizeof(struct kqueue_dyninfo)) {
9060 		ubufsize = sizeof(struct kqueue_dyninfo);
9061 		err = fill_kqueue_dyninfo(kqwl, &kqdi);
9062 	} else {
9063 		ubufsize = sizeof(struct kqueue_info);
9064 		err = fill_kqueueinfo(&kqwl->kqwl_kqueue, &kqdi.kqdi_info);
9065 	}
9066 	if (err == 0 && (err = copyout(&kqdi, ubuf, ubufsize)) == 0) {
9067 		*size_out = ubufsize;
9068 	}
9069 	kqworkloop_release(kqwl);
9070 	return err;
9071 }
9072 
9073 int
kevent_copyout_dynkqextinfo(void * proc,kqueue_id_t kq_id,user_addr_t ubuf,uint32_t ubufsize,int32_t * nknotes_out)9074 kevent_copyout_dynkqextinfo(void *proc, kqueue_id_t kq_id, user_addr_t ubuf,
9075     uint32_t ubufsize, int32_t *nknotes_out)
9076 {
9077 	proc_t p = (proc_t)proc;
9078 	struct kqworkloop *kqwl;
9079 	int err;
9080 
9081 	kqwl = kqworkloop_hash_lookup_and_retain(&p->p_fd, kq_id);
9082 	if (!kqwl) {
9083 		return ESRCH;
9084 	}
9085 
9086 	err = pid_kqueue_extinfo(p, &kqwl->kqwl_kqueue, ubuf, ubufsize, nknotes_out);
9087 	kqworkloop_release(kqwl);
9088 	return err;
9089 }
9090 
9091 int
pid_kqueue_extinfo(proc_t p,struct kqueue * kq,user_addr_t ubuf,uint32_t bufsize,int32_t * retval)9092 pid_kqueue_extinfo(proc_t p, struct kqueue *kq, user_addr_t ubuf,
9093     uint32_t bufsize, int32_t *retval)
9094 {
9095 	struct knote *kn;
9096 	int i;
9097 	int err = 0;
9098 	struct filedesc *fdp = &p->p_fd;
9099 	unsigned long nknotes = 0;
9100 	unsigned long buflen = bufsize / sizeof(struct kevent_extinfo);
9101 	struct kevent_extinfo *kqext = NULL;
9102 
9103 	/* arbitrary upper limit to cap kernel memory usage, copyout size, etc. */
9104 	buflen = MIN(buflen, PROC_PIDFDKQUEUE_KNOTES_MAX);
9105 
9106 	kqext = (struct kevent_extinfo *)kalloc_data(buflen * sizeof(struct kevent_extinfo), Z_WAITOK | Z_ZERO);
9107 	if (kqext == NULL) {
9108 		err = ENOMEM;
9109 		goto out;
9110 	}
9111 
9112 	proc_fdlock(p);
9113 	u_long fd_knlistsize = fdp->fd_knlistsize;
9114 	struct klist *fd_knlist = fdp->fd_knlist;
9115 
9116 	for (i = 0; i < fd_knlistsize; i++) {
9117 		kn = SLIST_FIRST(&fd_knlist[i]);
9118 		nknotes = kevent_extinfo_emit(kq, kn, kqext, buflen, nknotes);
9119 
9120 		proc_fdunlock(p);
9121 		proc_fdlock(p);
9122 		/*
9123 		 * Reevaluate to see if we have raced with someone who changed this -
9124 		 * if we have, we return the set of info for fd_knlistsize we knew
9125 		 * in the beginning except if knotes_dealloc interleaves with us.
9126 		 * In that case, we bail out early with the set of info captured so far.
9127 		 */
9128 		if (fd_knlistsize != fdp->fd_knlistsize) {
9129 			if (fdp->fd_knlistsize) {
9130 				/* kq_add_knote might grow fdp->fd_knlist. */
9131 				fd_knlist = fdp->fd_knlist;
9132 			} else {
9133 				break;
9134 			}
9135 		}
9136 	}
9137 	proc_fdunlock(p);
9138 
9139 	knhash_lock(fdp);
9140 	u_long knhashmask = fdp->fd_knhashmask;
9141 
9142 	if (knhashmask != 0) {
9143 		for (i = 0; i < (int)knhashmask + 1; i++) {
9144 			kn = SLIST_FIRST(&fdp->fd_knhash[i]);
9145 			nknotes = kevent_extinfo_emit(kq, kn, kqext, buflen, nknotes);
9146 
9147 			knhash_unlock(fdp);
9148 			knhash_lock(fdp);
9149 
9150 			/*
9151 			 * Reevaluate to see if we have raced with someone who changed this -
9152 			 * if we have, we should bail out with the set of info captured so far
9153 			 */
9154 			if (fdp->fd_knhashmask != knhashmask) {
9155 				break;
9156 			}
9157 		}
9158 	}
9159 	knhash_unlock(fdp);
9160 
9161 	assert(bufsize >= sizeof(struct kevent_extinfo) * MIN(buflen, nknotes));
9162 	err = copyout(kqext, ubuf, sizeof(struct kevent_extinfo) * MIN(buflen, nknotes));
9163 
9164 out:
9165 	kfree_data(kqext, buflen * sizeof(struct kevent_extinfo));
9166 
9167 	if (!err) {
9168 		*retval = (int32_t)MIN(nknotes, PROC_PIDFDKQUEUE_KNOTES_MAX);
9169 	}
9170 	return err;
9171 }
9172 
9173 static unsigned int
klist_copy_udata(struct klist * list,uint64_t * buf,unsigned int buflen,unsigned int nknotes)9174 klist_copy_udata(struct klist *list, uint64_t *buf,
9175     unsigned int buflen, unsigned int nknotes)
9176 {
9177 	struct knote *kn;
9178 	SLIST_FOREACH(kn, list, kn_link) {
9179 		if (nknotes < buflen) {
9180 			/*
9181 			 * kevent_register will always set kn_udata atomically
9182 			 * so that we don't have to take any kqlock here.
9183 			 */
9184 			buf[nknotes] = os_atomic_load_wide(&kn->kn_udata, relaxed);
9185 		}
9186 		/* we return total number of knotes, which may be more than requested */
9187 		nknotes++;
9188 	}
9189 
9190 	return nknotes;
9191 }
9192 
9193 int
kevent_proc_copy_uptrs(void * proc,uint64_t * buf,uint32_t bufsize)9194 kevent_proc_copy_uptrs(void *proc, uint64_t *buf, uint32_t bufsize)
9195 {
9196 	proc_t p = (proc_t)proc;
9197 	struct filedesc *fdp = &p->p_fd;
9198 	unsigned int nuptrs = 0;
9199 	unsigned int buflen = bufsize / sizeof(uint64_t);
9200 	struct kqworkloop *kqwl;
9201 	u_long size = 0;
9202 	struct klist *fd_knlist = NULL;
9203 
9204 	if (buflen > 0) {
9205 		assert(buf != NULL);
9206 	}
9207 
9208 	/*
9209 	 * Copyout the uptrs as much as possible but make sure to drop the respective
9210 	 * locks and take them again periodically so that we don't blow through
9211 	 * preemption disabled timeouts. Always reevaluate to see if we have raced
9212 	 * with someone who changed size of the hash - if we have, we return info for
9213 	 * the size of the hash we knew in the beginning except if it drops to 0.
9214 	 * In that case, we bail out with the set of info captured so far
9215 	 */
9216 	proc_fdlock(p);
9217 	size = fdp->fd_knlistsize;
9218 	fd_knlist = fdp->fd_knlist;
9219 
9220 	for (int i = 0; i < size; i++) {
9221 		nuptrs = klist_copy_udata(&fd_knlist[i], buf, buflen, nuptrs);
9222 
9223 		proc_fdunlock(p);
9224 		proc_fdlock(p);
9225 		if (size != fdp->fd_knlistsize) {
9226 			if (fdp->fd_knlistsize) {
9227 				/* kq_add_knote might grow fdp->fd_knlist. */
9228 				fd_knlist = fdp->fd_knlist;
9229 			} else {
9230 				break;
9231 			}
9232 		}
9233 	}
9234 	proc_fdunlock(p);
9235 
9236 	knhash_lock(fdp);
9237 	size = fdp->fd_knhashmask;
9238 
9239 	if (size != 0) {
9240 		for (size_t i = 0; i < size + 1; i++) {
9241 			nuptrs = klist_copy_udata(&fdp->fd_knhash[i], buf, buflen, nuptrs);
9242 
9243 			knhash_unlock(fdp);
9244 			knhash_lock(fdp);
9245 			/* The only path that can interleave with us today is knotes_dealloc. */
9246 			if (size != fdp->fd_knhashmask) {
9247 				break;
9248 			}
9249 		}
9250 	}
9251 	knhash_unlock(fdp);
9252 
9253 	kqhash_lock(fdp);
9254 	size = fdp->fd_kqhashmask;
9255 
9256 	if (size != 0) {
9257 		for (size_t i = 0; i < size + 1; i++) {
9258 			LIST_FOREACH(kqwl, &fdp->fd_kqhash[i], kqwl_hashlink) {
9259 				if (nuptrs < buflen) {
9260 					buf[nuptrs] = kqwl->kqwl_dynamicid;
9261 				}
9262 				nuptrs++;
9263 			}
9264 
9265 			kqhash_unlock(fdp);
9266 			kqhash_lock(fdp);
9267 			if (size != fdp->fd_kqhashmask) {
9268 				break;
9269 			}
9270 		}
9271 	}
9272 	kqhash_unlock(fdp);
9273 
9274 	return (int)nuptrs;
9275 }
9276 
9277 static void
kevent_set_return_to_kernel_user_tsd(proc_t p,thread_t thread)9278 kevent_set_return_to_kernel_user_tsd(proc_t p, thread_t thread)
9279 {
9280 	uint64_t ast_addr;
9281 	bool proc_is_64bit = !!(p->p_flag & P_LP64);
9282 	size_t user_addr_size = proc_is_64bit ? 8 : 4;
9283 	uint32_t ast_flags32 = 0;
9284 	uint64_t ast_flags64 = 0;
9285 	struct uthread *ut = get_bsdthread_info(thread);
9286 
9287 	if (ut->uu_kqr_bound != NULL) {
9288 		ast_flags64 |= R2K_WORKLOOP_PENDING_EVENTS;
9289 	}
9290 
9291 	if (ast_flags64 == 0) {
9292 		return;
9293 	}
9294 
9295 	if (!(p->p_flag & P_LP64)) {
9296 		ast_flags32 = (uint32_t)ast_flags64;
9297 		assert(ast_flags64 < 0x100000000ull);
9298 	}
9299 
9300 	ast_addr = thread_rettokern_addr(thread);
9301 	if (ast_addr == 0) {
9302 		return;
9303 	}
9304 
9305 	if (copyout((proc_is_64bit ? (void *)&ast_flags64 : (void *)&ast_flags32),
9306 	    (user_addr_t)ast_addr,
9307 	    user_addr_size) != 0) {
9308 		printf("pid %d (tid:%llu): copyout of return_to_kernel ast flags failed with "
9309 		    "ast_addr = %llu\n", proc_getpid(p), thread_tid(current_thread()), ast_addr);
9310 	}
9311 }
9312 
9313 /*
9314  * Semantics of writing to TSD value:
9315  *
9316  * 1. It is written to by the kernel and cleared by userspace.
9317  * 2. When the userspace code clears the TSD field, it takes responsibility for
9318  * taking action on the quantum expiry action conveyed by kernel.
9319  * 3. The TSD value is always cleared upon entry into userspace and upon exit of
9320  * userspace back to kernel to make sure that it is never leaked across thread
9321  * requests.
9322  */
9323 void
kevent_set_workq_quantum_expiry_user_tsd(proc_t p,thread_t thread,uint64_t flags)9324 kevent_set_workq_quantum_expiry_user_tsd(proc_t p, thread_t thread,
9325     uint64_t flags)
9326 {
9327 	uint64_t ast_addr;
9328 	bool proc_is_64bit = !!(p->p_flag & P_LP64);
9329 	uint32_t ast_flags32 = 0;
9330 	uint64_t ast_flags64 = flags;
9331 
9332 	if (ast_flags64 == 0) {
9333 		return;
9334 	}
9335 
9336 	if (!(p->p_flag & P_LP64)) {
9337 		ast_flags32 = (uint32_t)ast_flags64;
9338 		assert(ast_flags64 < 0x100000000ull);
9339 	}
9340 
9341 	ast_addr = thread_wqquantum_addr(thread);
9342 	assert(ast_addr != 0);
9343 
9344 	if (proc_is_64bit) {
9345 		if (copyout_atomic64(ast_flags64, (user_addr_t) ast_addr)) {
9346 #if DEBUG || DEVELOPMENT
9347 			printf("pid %d (tid:%llu): copyout of workq quantum ast flags failed with "
9348 			    "ast_addr = %llu\n", proc_getpid(p), thread_tid(thread), ast_addr);
9349 #endif
9350 		}
9351 	} else {
9352 		if (copyout_atomic32(ast_flags32, (user_addr_t) ast_addr)) {
9353 #if DEBUG || DEVELOPMENT
9354 			printf("pid %d (tid:%llu): copyout of workq quantum ast flags failed with "
9355 			    "ast_addr = %llu\n", proc_getpid(p), thread_tid(thread), ast_addr);
9356 #endif
9357 		}
9358 	}
9359 }
9360 
9361 void
kevent_ast(thread_t thread,uint16_t bits)9362 kevent_ast(thread_t thread, uint16_t bits)
9363 {
9364 	proc_t p = current_proc();
9365 
9366 
9367 	if (bits & AST_KEVENT_REDRIVE_THREADREQ) {
9368 		workq_kern_threadreq_redrive(p, WORKQ_THREADREQ_CAN_CREATE_THREADS);
9369 	}
9370 	if (bits & AST_KEVENT_RETURN_TO_KERNEL) {
9371 		kevent_set_return_to_kernel_user_tsd(p, thread);
9372 	}
9373 
9374 	if (bits & AST_KEVENT_WORKQ_QUANTUM_EXPIRED) {
9375 		workq_kern_quantum_expiry_reevaluate(p, thread);
9376 	}
9377 }
9378 
9379 #if DEVELOPMENT || DEBUG
9380 
9381 #define KEVENT_SYSCTL_BOUND_ID 1
9382 
9383 static int
9384 kevent_sysctl SYSCTL_HANDLER_ARGS
9385 {
9386 #pragma unused(oidp, arg2)
9387 	uintptr_t type = (uintptr_t)arg1;
9388 	uint64_t bound_id = 0;
9389 
9390 	if (type != KEVENT_SYSCTL_BOUND_ID) {
9391 		return EINVAL;
9392 	}
9393 
9394 	if (req->newptr) {
9395 		return EINVAL;
9396 	}
9397 
9398 	struct uthread *ut = current_uthread();
9399 	if (!ut) {
9400 		return EFAULT;
9401 	}
9402 
9403 	workq_threadreq_t kqr = ut->uu_kqr_bound;
9404 	if (kqr) {
9405 		if (kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
9406 			bound_id = kqr_kqworkloop(kqr)->kqwl_dynamicid;
9407 		} else {
9408 			bound_id = -1;
9409 		}
9410 	}
9411 
9412 	return sysctl_io_number(req, bound_id, sizeof(bound_id), NULL, NULL);
9413 }
9414 
9415 SYSCTL_NODE(_kern, OID_AUTO, kevent, CTLFLAG_RW | CTLFLAG_LOCKED, 0,
9416     "kevent information");
9417 
9418 SYSCTL_PROC(_kern_kevent, OID_AUTO, bound_id,
9419     CTLTYPE_QUAD | CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_MASKED,
9420     (void *)KEVENT_SYSCTL_BOUND_ID,
9421     sizeof(kqueue_id_t), kevent_sysctl, "Q",
9422     "get the ID of the bound kqueue");
9423 
9424 #endif /* DEVELOPMENT || DEBUG */
9425