/*
 * Copyright (c) 2015-2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * @OSF_FREE_COPYRIGHT@
 */
/*
 * Mach Operating System
 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
 * All Rights Reserved.
 *
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  [email protected]
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie Mellon
 * the rights to redistribute these changes.
 */

#include <kern/ast.h>
#include <kern/backtrace.h>
#include <kern/kern_types.h>
#include <kern/mach_param.h>
#include <kern/percpu.h>
#include <kern/queue.h>
#include <kern/sched_prim.h>
#include <kern/simple_lock.h>
#include <kern/spl.h>
#include <kern/waitq.h>
#include <kern/zalloc.h>
#include <kern/policy_internal.h>
#include <kern/turnstile.h>

#include <os/hash.h>
#include <libkern/section_keywords.h>
#include <mach/sync_policy.h>
#include <vm/vm_kern.h>

#include <sys/kdebug.h>

/*!
 * @const waitq_set_unlink_batch
 *
 * @brief
 * How many links are unhooked under a single set lock hold.
 *
 * @discussion
 * Holding a waitq set lock for too long can cause
 * extreme contention (when a set is being torn down concurrently
 * with messages being sent to ports that used to belong to that set).
 *
 * In order to fight this, large wait queue sets will drop
 * and reacquire their lock for each unlinking batch.
 */
static TUNABLE(uint32_t, waitq_set_unlink_batch, "waitq_set_unlink_batch", 64);
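
/*
 * Illustrative note (not part of the original source): TUNABLE values can
 * be overridden at boot via the boot-arg of the same name. For example, a
 * hypothetical smaller batch size could be requested with:
 *
 *     waitq_set_unlink_batch=16
 */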

/*!
 * @const WQL_PREPOST_MARKER
 *
 * @brief
 * Marker set in the @c wql_wqs field of wait queue linkages to denote that
 * this linkage has preposted to its wait queue set already.
 *
 * @discussion
 * This bit is manipulated under both the wait queue and the wait queue set
 * locks, and is used for two purposes:
 *
 * - for port set queues, it denotes which circle queue the linkage
 *   is queued on (@c waitq_set::wqset_links or @c waitq_set::wqset_preposts)
 *
 * - as an optimization during pre-post to not walk sets this link already
 *   preposted to.
 */
#define WQL_PREPOST_MARKER 1ul
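
/*
 * Illustrative sketch (not part of the original source): the marker lives
 * in the low bit of the wait queue set pointer, which is free because of
 * pointer alignment. A preposted linkage is encoded/decoded as:
 *
 *     link->wql_wqs = (uintptr_t)wqs | WQL_PREPOST_MARKER;
 *     wqs = (void *)(link->wql_wqs & ~WQL_PREPOST_MARKER);
 *
 * which is what wql_wqs_mark_preposted() and wql_wqs() below implement.
 */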

#if __LP64__
/*!
 * @struct waitq_link_hdr
 *
 * @brief
 * Common "header" between all linkages, in order to find the waitq_set
 * of this linkage.
 *
 * @discussion
 * Due to unfortunate alignment constraints on @c queue_chain_t,
 * this is wildly different for LP64 and ILP32.
 *
 * Do note that @c wql_wqs must live at the same offset in every linkage
 * type (see the static asserts below).
 */
struct waitq_link_hdr {
	uintptr_t       wql_wqs;
};

/*!
 * @struct waitq_sellink
 *
 * @brief
 * Linkages used for select waitq queues to select wait queue sets.
 *
 * @discussion
 * Select linkages are one way (queue to set) for two reasons:
 *
 * 1. select doesn't use the wait queue subsystem to discover which file
 *    descriptor woke up the set (it will instead scan all fds again),
 *
 * 2. all linkages are unhooked on each syscall return, so we keep that
 *    work to a minimum, using a fast invalidation
 *    scheme based on unique identifiers and sequestering
 *    (see @c select_set_nextid()).
 */
struct waitq_sellink {
	uintptr_t       wql_wqs;
	struct waitq_link_list_entry wql_next;
	uint64_t        wql_setid;
};

/*!
 * @struct waitq_link
 *
 * @brief
 * Linkages used for port wait queues and port-set wait queue sets.
 *
 * @discussion
 * Those linkages go both ways so that receiving messages through a port-set
 * can quickly find ports that preposted to the set.
 *
 * It also means that unhooking linkages cannot be lazy.
 */
struct waitq_link {
	uintptr_t       wql_wqs;       /**< wait queue set for this link      */
	queue_chain_t   wql_qlink;     /**< linkage through the waitq list    */
	queue_chain_t   wql_slink;     /**< linkage through the wqset list    */
	struct waitq   *wql_wq;        /**< wait queue for this link          */
};
#else
struct waitq_link_hdr {
	uint64_t        __wql_padding;
	uintptr_t       wql_wqs;
};

struct waitq_sellink {
	struct waitq_link_list_entry wql_next;
	uintptr_t       __wql_padding;
	uintptr_t       wql_wqs;
	uint64_t        wql_setid;
};

struct waitq_link {
	queue_chain_t   wql_qlink;
	uintptr_t       wql_wqs;
	struct waitq   *wql_wq;
	queue_chain_t   wql_slink;
};
#endif

static_assert(offsetof(struct waitq_link_hdr, wql_wqs) ==
    offsetof(struct waitq_sellink, wql_wqs));
static_assert(offsetof(struct waitq_link_hdr, wql_wqs) ==
    offsetof(struct waitq_link, wql_wqs));
static_assert(sizeof(struct waitq) <= WQ_OPAQUE_SIZE, "waitq structure size mismatch");
static_assert(__alignof(struct waitq) == WQ_OPAQUE_ALIGN, "waitq structure alignment mismatch");

static KALLOC_TYPE_DEFINE(waitq_sellink_zone, struct waitq_sellink, KT_PRIV_ACCT);
static KALLOC_TYPE_DEFINE(waitq_link_zone, struct waitq_link, KT_PRIV_ACCT);
ZONE_DEFINE_ID(ZONE_ID_SELECT_SET, "select_set", struct select_set,
    ZC_SEQUESTER | ZC_KASAN_NOQUARANTINE | ZC_ZFREE_CLEARMEM);

static LCK_GRP_DECLARE(waitq_lck_grp, "waitq");

static uint64_t PERCPU_DATA(select_setid);
struct waitq select_conflict_queue;

#pragma mark waitq links

static inline bool
waitq_is_sellink(waitq_type_t type)
{
	return type == WQT_SELECT || type == WQT_SELECT_SET;
}

static inline bool
wql_sellink_valid(struct select_set *selset, struct waitq_sellink *link)
{
	return waitq_valid(selset) && selset->selset_id == link->wql_setid;
}

static waitq_t
wql_wqs(waitq_link_t link)
{
	return (waitq_t){ (void *)(link.wqlh->wql_wqs & ~WQL_PREPOST_MARKER) };
}

static bool
wql_wqs_preposted(waitq_link_t link)
{
	return link.wqlh->wql_wqs & WQL_PREPOST_MARKER;
}

static void
wql_wqs_mark_preposted(waitq_link_t link)
{
	assert(!wql_wqs_preposted(link));
	link.wqlh->wql_wqs |= WQL_PREPOST_MARKER;
}

static void
wql_wqs_clear_preposted(waitq_link_t link)
{
	assert(wql_wqs_preposted(link));
	link.wqlh->wql_wqs &= ~WQL_PREPOST_MARKER;
}

static circle_queue_t
wql_wqs_queue(struct waitq_set *wqs, struct waitq_link *link)
{
	return wql_wqs_preposted(link) ? &wqs->wqset_preposts : &wqs->wqset_links;
}

static void
wql_list_push(waitq_link_list_t *list, waitq_link_t link)
{
	link.wqls->wql_next.next = list->next;
	list->next = &link.wqls->wql_next;
}

static inline struct waitq_sellink *
wql_list_elem(struct waitq_link_list_entry *e)
{
	return e ? __container_of(e, struct waitq_sellink, wql_next) : NULL;
}

/*!
 * @function wql_list_next()
 *
 * @brief
 * Helper function to implement wait queue link list enumeration.
 *
 * @param e             in: pointer to the current element,
 *                      out: pointer to the next element or NULL
 * @param end           which element to stop enumeration at (NULL for lists,
 *                      or the first element enumerated for circle queues).
 * @returns true        (makes writing for(;;) based enumerators easier).
 */
static inline bool
wql_list_next(struct waitq_link_list_entry **e, struct waitq_link_list_entry *end)
{
	if (*e == NULL || (*e)->next == end) {
		*e = NULL;
	} else {
		*e = (*e)->next;
	}
	return true;
}

#define __wql_list_foreach(it, head, end) \
	for (struct waitq_link_list_entry *__it = (head)->next, *__end = end; \
	    ((it) = wql_list_elem(__it)); wql_list_next(&__it, __end))

#define wql_list_foreach(it, head) \
	__wql_list_foreach(it, head, NULL)

#define wql_list_foreach_safe(it, head) \
	for (struct waitq_link_list_entry *__it = (head)->next;                \
	    ((it) = wql_list_elem(__it)) && wql_list_next(&__it, NULL); )

/*
 * Gross hack: passing `__it` to `__wql_list_foreach` makes it stop when
 * we circle back to the first element or reach NULL (whichever comes first).
 *
 * This allows us to have a single enumeration function oblivious to whether
 * we enumerate a circle queue or a sellink list.
 */
#define waitq_link_foreach(link, waitq) \
	__wql_list_foreach((link).wqls, &(waitq).wq_q->waitq_sellinks, __it)
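
/*
 * Illustrative use (sketch, not in the original source): walking every set
 * a wait queue is linked to, regardless of linkage flavor, the way
 * do_waitq_select_n_locked_sets() does below:
 *
 *     waitq_link_t link;
 *
 *     waitq_link_foreach(link, waitq) {
 *             waitq_t wqset = wql_wqs(link);
 *             ...
 *     }
 */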

static_assert(offsetof(struct waitq, waitq_sellinks) ==
    offsetof(struct waitq, waitq_links));
static_assert(offsetof(struct waitq_sellink, wql_next) ==
    offsetof(struct waitq_link, wql_qlink.next));

static struct waitq_link *
wql_find(struct waitq *waitq, waitq_t wqset)
{
	struct waitq_link *link;

	cqe_foreach_element(link, &waitq->waitq_links, wql_qlink) {
		if (waitq_same(wql_wqs(link), wqset)) {
			return link;
		}
	}

	return NULL;
}

waitq_link_t
waitq_link_alloc(waitq_type_t type)
{
	waitq_link_t link;

	if (waitq_is_sellink(type)) {
		link.wqls = zalloc_flags(waitq_sellink_zone, Z_WAITOK | Z_ZERO);
	} else {
		link.wqll = zalloc_flags(waitq_link_zone, Z_WAITOK | Z_ZERO);
	}
	return link;
}

void
waitq_link_free(waitq_type_t type, waitq_link_t link)
{
	if (waitq_is_sellink(type)) {
		return zfree(waitq_sellink_zone, link.wqls);
	} else {
		return zfree(waitq_link_zone, link.wqll);
	}
}

void
waitq_link_free_list(waitq_type_t type, waitq_link_list_t *free_l)
{
	waitq_link_t link;

	wql_list_foreach_safe(link.wqls, free_l) {
		waitq_link_free(type, link);
	}

	free_l->next = NULL;
}


#pragma mark global wait queues

static __startup_data struct waitq g_boot_waitq;
static SECURITY_READ_ONLY_LATE(struct waitq *) global_waitqs = &g_boot_waitq;
static SECURITY_READ_ONLY_LATE(uint32_t) g_num_waitqs = 1;

/*
 * Zero out the used MSBs of the event.
 */
#define _CAST_TO_EVENT_MASK(event) \
	((waitq_flags_t)(event) & ((1ul << _EVENT_MASK_BITS) - 1ul))

static inline uint32_t
waitq_hash(char *key, size_t length)
{
	return os_hash_jenkins(key, length) & (g_num_waitqs - 1);
}

/* return a global waitq pointer corresponding to the given event */
struct waitq *
_global_eventq(char *event, size_t event_length)
{
	return &global_waitqs[waitq_hash(event, event_length)];
}
393 bool
waitq_is_valid(waitq_t waitq)394 waitq_is_valid(waitq_t waitq)
395 {
396 	return waitq_valid(waitq);
397 }
398 
399 static inline bool
waitq_is_global(waitq_t waitq)400 waitq_is_global(waitq_t waitq)
401 {
402 	if (waitq_type(waitq) != WQT_QUEUE) {
403 		return false;
404 	}
405 	return waitq.wq_q >= global_waitqs && waitq.wq_q < global_waitqs + g_num_waitqs;
406 }
407 
408 static inline bool
waitq_empty(waitq_t wq)409 waitq_empty(waitq_t wq)
410 {
411 	struct turnstile *ts;
412 
413 	switch (waitq_type(wq)) {
414 	case WQT_TURNSTILE:
415 		return priority_queue_empty(&wq.wq_q->waitq_prio_queue);
416 	case WQT_PORT:
417 		ts = wq.wq_q->waitq_ts;
418 		return ts == TURNSTILE_NULL ||
419 		       priority_queue_empty(&ts->ts_waitq.waitq_prio_queue);
420 	case WQT_QUEUE:
421 	case WQT_SELECT:
422 	case WQT_PORT_SET:
423 	case WQT_SELECT_SET:
424 		return circle_queue_empty(&wq.wq_q->waitq_queue);
425 
426 	default:
427 		return true;
428 	}
429 }
430 
431 #if CONFIG_WAITQ_STATS
432 #define NWAITQ_BTFRAMES 5
433 
434 struct wq_stats {
435 	uint64_t waits;
436 	uint64_t wakeups;
437 	uint64_t clears;
438 	uint64_t failed_wakeups;
439 
440 	uintptr_t last_wait[NWAITQ_BTFRAMES];
441 	uintptr_t last_wakeup[NWAITQ_BTFRAMES];
442 	uintptr_t last_failed_wakeup[NWAITQ_BTFRAMES];
443 };
444 
445 /* this global is for lldb */
446 const uint32_t g_nwaitq_btframes = NWAITQ_BTFRAMES;
447 struct wq_stats g_boot_stats;
448 struct wq_stats *g_waitq_stats = &g_boot_stats;
449 
450 static __inline__ void
waitq_grab_backtrace(uintptr_t bt[NWAITQ_BTFRAMES],unsigned skip)451 waitq_grab_backtrace(uintptr_t bt[NWAITQ_BTFRAMES], unsigned skip)
452 {
453 	uintptr_t buf[NWAITQ_BTFRAMES + skip];
454 
455 	memset(buf, 0, (NWAITQ_BTFRAMES + skip) * sizeof(uintptr_t));
456 	backtrace(buf, g_nwaitq_btframes + skip, NULL, NULL);
457 	memcpy(&bt[0], &buf[skip], NWAITQ_BTFRAMES * sizeof(uintptr_t));
458 }
459 
460 static __inline__ struct wq_stats *
waitq_global_stats(waitq_t waitq)461 waitq_global_stats(waitq_t waitq)
462 {
463 	struct wq_stats *wqs;
464 	uint32_t idx;
465 
466 	if (!waitq_is_global(waitq)) {
467 		return NULL;
468 	}
469 
470 	idx = (uint32_t)(waitq.wq_q - global_waitqs);
471 	assert(idx < g_num_waitqs);
472 	wqs = &g_waitq_stats[idx];
473 	return wqs;
474 }
475 
476 static __inline__ void
waitq_stats_count_wait(waitq_t waitq)477 waitq_stats_count_wait(waitq_t waitq)
478 {
479 	struct wq_stats *wqs = waitq_global_stats(waitq);
480 	if (wqs != NULL) {
481 		wqs->waits++;
482 		waitq_grab_backtrace(wqs->last_wait, 2);
483 	}
484 }
485 
486 static __inline__ void
waitq_stats_count_wakeup(waitq_t waitq,int n)487 waitq_stats_count_wakeup(waitq_t waitq, int n)
488 {
489 	struct wq_stats *wqs = waitq_global_stats(waitq);
490 	if (wqs != NULL) {
491 		if (n > 0) {
492 			wqs->wakeups += n;
493 			waitq_grab_backtrace(wqs->last_wakeup, 2);
494 		} else {
495 			wqs->failed_wakeups++;
496 			waitq_grab_backtrace(wqs->last_failed_wakeup, 2);
497 		}
498 	}
499 }
500 
501 static __inline__ void
waitq_stats_count_clear_wakeup(waitq_t waitq)502 waitq_stats_count_clear_wakeup(waitq_t waitq)
503 {
504 	struct wq_stats *wqs = waitq_global_stats(waitq);
505 	if (wqs != NULL) {
506 		wqs->wakeups++;
507 		wqs->clears++;
508 		waitq_grab_backtrace(wqs->last_wakeup, 2);
509 	}
510 }
511 #else /* !CONFIG_WAITQ_STATS */
512 #define waitq_stats_count_wait(q)         do { } while (0)
513 #define waitq_stats_count_wakeup(q, n)    do { } while (0)
514 #define waitq_stats_count_clear_wakeup(q) do { } while (0)
515 #endif
516 
517 static struct waitq *
waitq_get_safeq(waitq_t waitq)518 waitq_get_safeq(waitq_t waitq)
519 {
520 	if (waitq_type(waitq) == WQT_PORT) {
521 		struct turnstile *ts = waitq.wq_q->waitq_ts;
522 		return ts ? &ts->ts_waitq : NULL;
523 	}
524 
525 	uint32_t hash = os_hash_kernel_pointer(waitq.wq_q);
526 	return &global_waitqs[hash & (g_num_waitqs - 1)];
527 }
528 
529 /*
530  * Since the priority ordered waitq uses basepri as the
531  * ordering key assert that this value fits in a uint8_t.
532  */
533 static_assert(MAXPRI <= UINT8_MAX);
534 
535 static inline void
waitq_thread_insert(struct waitq * safeq,thread_t thread,waitq_t wq,event64_t event)536 waitq_thread_insert(struct waitq *safeq, thread_t thread,
537     waitq_t wq, event64_t event)
538 {
539 	if (waitq_type(safeq) == WQT_TURNSTILE) {
540 		turnstile_stats_update(0, TSU_TURNSTILE_BLOCK_COUNT, NULL);
541 		turnstile_waitq_add_thread_priority_queue(safeq, thread);
542 	} else {
543 		turnstile_stats_update(0, TSU_REGULAR_WAITQ_BLOCK_COUNT, NULL);
544 		/*
545 		 * This is the extent to which we currently take scheduling
546 		 * attributes into account:
547 		 *
548 		 * - If the thread is vm privileged, we stick it at the front
549 		 *   of the queue, later, these queues will honor the policy
550 		 *   value set at waitq_init time.
551 		 *
552 		 * - Realtime threads get priority for wait queue placements.
553 		 *   This allows wait_queue_wakeup_one to prefer a waiting
554 		 *   realtime thread, similar in principle to performing
555 		 *   a wait_queue_wakeup_all and allowing scheduler
556 		 *   prioritization to run the realtime thread, but without
557 		 *   causing the lock contention of that scenario.
558 		 */
559 		if (thread->sched_pri >= BASEPRI_REALTIME ||
560 		    !safeq->waitq_fifo ||
561 		    (thread->options & TH_OPT_VMPRIV)) {
562 			circle_enqueue_head(&safeq->waitq_queue, &thread->wait_links);
563 		} else {
564 			circle_enqueue_tail(&safeq->waitq_queue, &thread->wait_links);
565 		}
566 	}
567 
568 	/* mark the event and real waitq, even if enqueued on a global safeq */
569 	thread->wait_event = event;
570 	thread->waitq = wq;
571 }
572 
573 /**
574  * clear the thread-related waitq state
575  *
576  * Conditions:
577  *	'thread' is locked
578  */
579 static inline void
thread_clear_waitq_state(thread_t thread)580 thread_clear_waitq_state(thread_t thread)
581 {
582 	thread->waitq.wq_q = NULL;
583 	thread->wait_event = NO_EVENT64;
584 	thread->at_safe_point = FALSE;
585 }
586 
587 static inline void
waitq_thread_remove(waitq_t wq,thread_t thread)588 waitq_thread_remove(waitq_t wq, thread_t thread)
589 {
590 	if (waitq_type(wq) == WQT_TURNSTILE) {
591 		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
592 		    (TURNSTILE_CODE(TURNSTILE_HEAP_OPERATIONS,
593 		    (THREAD_REMOVED_FROM_TURNSTILE_WAITQ))) | DBG_FUNC_NONE,
594 		    VM_KERNEL_UNSLIDE_OR_PERM(waitq_to_turnstile(wq.wq_q)),
595 		    thread_tid(thread), 0, 0, 0);
596 		priority_queue_remove(&wq.wq_q->waitq_prio_queue,
597 		    &thread->wait_prioq_links);
598 	} else {
599 		circle_dequeue(&wq.wq_q->waitq_queue, &thread->wait_links);
600 		if (waitq_is_global(wq) && waitq_empty(wq)) {
601 			wq.wq_q->waitq_eventmask = 0;
602 		}
603 	}
604 
605 	thread_clear_waitq_state(thread);
606 }
607 
608 __startup_func
609 static void
waitq_bootstrap(void)610 waitq_bootstrap(void)
611 {
612 	const uint32_t qsz = sizeof(struct waitq);
613 	kern_return_t kret;
614 	vm_offset_t whsize;
615 	int cpu = 0;
616 
617 	/*
618 	 * Determine the amount of memory we're willing to reserve for
619 	 * the waitqueue hash table
620 	 */
621 	if (!PE_parse_boot_argn("wqsize", &whsize, sizeof(whsize))) {
622 		whsize = round_page(thread_max * qsz / 5);
623 	}
624 
625 	/*
626 	 * Determine the number of waitqueues we can fit.
627 	 * The hash algorithm requires that this be a power of 2.
628 	 */
629 	g_num_waitqs = 0x80000000u >> __builtin_clzl(whsize / qsz);
630 	assert(g_num_waitqs > 0);
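
	/*
	 * Illustrative arithmetic (not in the original source): the shift
	 * rounds whsize / qsz down to a power of two, e.g. a budget that
	 * fits 21845 queues yields 16384 hash buckets, so that
	 * `hash & (g_num_waitqs - 1)` is a valid index computation.
	 */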
	whsize = round_page(g_num_waitqs * qsz);

	kret = kernel_memory_allocate(kernel_map, (vm_offset_t *)&global_waitqs,
	    whsize, 0, KMA_KOBJECT | KMA_NOPAGEWAIT | KMA_PERMANENT,
	    VM_KERN_MEMORY_WAITQ);
	if (kret != KERN_SUCCESS || global_waitqs == NULL) {
		panic("kernel_memory_allocate() failed to alloc global_waitqs"
		    ", error: %d, whsize: 0x%x", kret, (uint32_t)whsize);
	}

#if CONFIG_WAITQ_STATS
	whsize = round_page(g_num_waitqs * sizeof(struct wq_stats));
	kret = kernel_memory_allocate(kernel_map, (vm_offset_t *)&g_waitq_stats,
	    whsize, 0, KMA_KOBJECT | KMA_NOPAGEWAIT | KMA_ZERO | KMA_PERMANENT,
	    VM_KERN_MEMORY_WAITQ);
	if (kret != KERN_SUCCESS || global_waitqs == NULL) {
		panic("kernel_memory_allocate() failed to alloc g_waitq_stats"
		    ", error: %d, whsize: 0x%x", kret, (uint32_t)whsize);
	}
#endif

	for (uint32_t i = 0; i < g_num_waitqs; i++) {
		waitq_init(&global_waitqs[i], WQT_QUEUE, SYNC_POLICY_FIFO);
	}

	waitq_init(&select_conflict_queue, WQT_SELECT, SYNC_POLICY_FIFO);

	percpu_foreach(setid, select_setid) {
		/* not cpu_number(): CPUs haven't been numbered yet */
		*setid = cpu++;
	}
}
STARTUP(MACH_IPC, STARTUP_RANK_FIRST, waitq_bootstrap);


#pragma mark locking

/*
 * Double the standard lock timeout, because wait queues tend
 * to iterate over a number of threads - locking each.  If there is
 * a problem with a thread lock, it normally times out at the wait
 * queue level first, hiding the real problem.
 */
/* For x86, the hardware timeout is in TSC units. */
#if defined(__i386__) || defined(__x86_64__)
#define waitq_timeout (2 * LockTimeOutTSC)
#else
#define waitq_timeout (2 * os_atomic_load(&LockTimeOut, relaxed))
#endif

static hw_lock_timeout_status_t
waitq_timeout_handler(void *_lock, uint64_t timeout,
    uint64_t start, uint64_t now, uint64_t interrupt_time)
{
#pragma unused(interrupt_time)

	lck_spinlock_to_info_t lsti;
	hw_lck_ticket_t *lck = _lock;
	hw_lck_ticket_t tmp;
	struct waitq *wq = __container_of(lck, struct waitq, waitq_interlock);

	if (machine_timeout_suspended()) {
		return HW_LOCK_TIMEOUT_CONTINUE;
	}

	lsti = lck_spinlock_timeout_hit(lck, 0);
	tmp.tcurnext = os_atomic_load(&lck->tcurnext, relaxed);

	panic("waitq(%p) lock timeout after %llu ticks; cpu=%d, "
	    "cticket: 0x%x, nticket: 0x%x, waiting for 0x%x, "
#if INTERRUPT_MASKED_DEBUG
	    "interrupt time: %llu, "
#endif /* INTERRUPT_MASKED_DEBUG */
	    "start time: %llu, now: %llu, timeout: %llu",
	    wq, now - start, cpu_number(),
	    tmp.cticket, tmp.nticket, lsti->extra,
#if INTERRUPT_MASKED_DEBUG
	    interrupt_time,
#endif /* INTERRUPT_MASKED_DEBUG */
	    start, now, timeout);
}

void
waitq_invalidate(waitq_t waitq)
{
	hw_lck_ticket_invalidate(&waitq.wq_q->waitq_interlock);
}

bool
waitq_held(waitq_t wq)
{
	return hw_lck_ticket_held(&wq.wq_q->waitq_interlock);
}

void
waitq_lock(waitq_t wq)
{
	(void)hw_lck_ticket_lock_to(&wq.wq_q->waitq_interlock,
	    waitq_timeout, waitq_timeout_handler, &waitq_lck_grp);
#if defined(__x86_64__)
	pltrace(FALSE);
#endif
}

bool
waitq_lock_try(waitq_t wq)
{
	bool rc = hw_lck_ticket_lock_try(&wq.wq_q->waitq_interlock, &waitq_lck_grp);

#if defined(__x86_64__)
	if (rc) {
		pltrace(FALSE);
	}
#endif
	return rc;
}

bool
waitq_lock_reserve(waitq_t wq, uint32_t *ticket)
{
	return hw_lck_ticket_reserve(&wq.wq_q->waitq_interlock, ticket, &waitq_lck_grp);
}

static hw_lock_status_t
waitq_lock_reserve_allow_invalid(waitq_t wq, uint32_t *ticket)
{
	return hw_lck_ticket_reserve_allow_invalid(&wq.wq_q->waitq_interlock,
	           ticket, &waitq_lck_grp);
}

void
waitq_lock_wait(waitq_t wq, uint32_t ticket)
{
	(void)hw_lck_ticket_wait(&wq.wq_q->waitq_interlock, ticket,
	    waitq_timeout, waitq_timeout_handler, &waitq_lck_grp);
#if defined(__x86_64__)
	pltrace(FALSE);
#endif
}

bool
waitq_lock_allow_invalid(waitq_t wq)
{
	hw_lock_status_t rc;

	rc = hw_lck_ticket_lock_allow_invalid(&wq.wq_q->waitq_interlock,
	    waitq_timeout, waitq_timeout_handler, &waitq_lck_grp);

#if defined(__x86_64__)
	if (rc == HW_LOCK_ACQUIRED) {
		pltrace(FALSE);
	}
#endif
	return rc == HW_LOCK_ACQUIRED;
}

void
waitq_unlock(waitq_t wq)
{
	assert(waitq_held(wq));
#if defined(__x86_64__)
	pltrace(TRUE);
#endif
	hw_lck_ticket_unlock(&wq.wq_q->waitq_interlock);
}
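
/*
 * Illustrative sketch (not in the original source; helper names below are
 * hypothetical): the reserve/wait pair above lets code take a waitq lock
 * without spinning while another lock is already held. A reservation pins
 * our place in the ticket queue, so the waitq cannot die before our turn,
 * which is exactly how waitq_pull_thread_locked() uses it below:
 *
 *     uint32_t ticket;
 *
 *     if (!waitq_lock_reserve(wq, &ticket)) {
 *             unlock_the_conflicting_lock();
 *             waitq_lock_wait(wq, ticket);    // block for our turn
 *             relock_and_revalidate_state();
 *     }
 */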


#pragma mark assert_wait / wakeup

typedef thread_t (^waitq_select_cb)(struct waitq *waitq, thread_t thread);

struct waitq_select_args {
	/* input parameters */
	event64_t            event;
	waitq_select_cb      select_cb;
	int                  priority;
	wait_result_t        result;
	waitq_options_t      options;
	uint32_t             max_threads;

	/* output parameters */
	uint32_t             nthreads;
	spl_t                spl;
	circle_queue_head_t  threadq;
};

static inline void
maybe_adjust_thread_pri(thread_t thread, int priority,
    __kdebug_only waitq_t waitq)
{
	/*
	 * If the caller is requesting the waitq subsystem to promote the
	 * priority of the awoken thread, then boost the thread's priority to
	 * the default WAITQ_BOOST_PRIORITY (if it's not already equal or
	 * higher priority).  This boost must be removed via a call to
	 * waitq_clear_promotion_locked before the thread waits again.
	 *
	 * WAITQ_PROMOTE_PRIORITY is -2.
	 * Anything above 0 represents a mutex promotion.
	 * The default 'no action' value is -1.
	 * TODO: define this in a header
	 */
	if (priority == WAITQ_PROMOTE_PRIORITY) {
		uintptr_t trace_waitq = 0;
		if (__improbable(kdebug_enable)) {
			trace_waitq = VM_KERNEL_UNSLIDE_OR_PERM(waitq.wq_q);
		}

		sched_thread_promote_reason(thread, TH_SFLAG_WAITQ_PROMOTED, trace_waitq);
	}
}

static void
waitq_select_queue_flush(waitq_t waitq, struct waitq_select_args *args)
{
	thread_t thread = THREAD_NULL;
	__assert_only kern_return_t kr;

	cqe_foreach_element_safe(thread, &args->threadq, wait_links) {
		circle_dequeue(&args->threadq, &thread->wait_links);
		maybe_adjust_thread_pri(thread, args->priority, waitq);
		kr = thread_go(thread, args->result, args->options);
		assert(kr == KERN_SUCCESS);
		thread_unlock(thread);
	}
}

/**
 * Routine to iterate over the waitq for non-priority ordered waitqs
 *
 * Conditions:
 *	args->waitq (and the posted waitq) is locked
 *
 * Notes:
 *	Uses the optional select callback function to refine the selection
 *	of one or more threads from a waitq. The select callback is invoked
 *	once for every thread that is found to be waiting on the input args->waitq.
 *
 *	If one or more threads are selected, this may disable interrupts.
 *	The previous interrupt state is returned in args->spl and should
 *	be used in a call to splx() if threads are returned to the caller.
 */
static thread_t
waitq_queue_iterate_locked(struct waitq *safeq, struct waitq *waitq,
    struct waitq_select_args *args, waitq_flags_t *remaining_eventmask)
{
	thread_t thread = THREAD_NULL;
	thread_t first_thread = THREAD_NULL;

	cqe_foreach_element_safe(thread, &safeq->waitq_queue, wait_links) {
		thread_t t = THREAD_NULL;
		assert_thread_magic(thread);

		/*
		 * For non-priority ordered waitqs, we allow multiple events to be
		 * mux'ed into the same waitq. Also safeqs may contain threads from
		 * multiple waitqs. Only pick threads that match the
		 * requested wait event.
		 */
		if (waitq_same(thread->waitq, waitq) && thread->wait_event == args->event) {
			t = thread;
			if (first_thread == THREAD_NULL) {
				first_thread = thread;
			}

			/* allow the caller to further refine the selection */
			if (args->select_cb) {
				t = args->select_cb(waitq, thread);
			}
			if (t != THREAD_NULL) {
				args->nthreads += 1;
				if (args->nthreads == 1 && safeq == waitq) {
					args->spl = splsched();
				}
				thread_lock(t);
				thread_clear_waitq_state(t);
				circle_dequeue(&safeq->waitq_queue, &thread->wait_links);
				circle_enqueue_tail(&args->threadq, &t->wait_links);
				/* only enqueue up to 'max' threads */
				if (args->nthreads >= args->max_threads) {
					break;
				}
			}
		}

		/* thread wasn't selected so track its event */
		if (t == THREAD_NULL) {
			*remaining_eventmask |= waitq_same(thread->waitq, safeq)
			    ? _CAST_TO_EVENT_MASK(thread->wait_event)
			    : _CAST_TO_EVENT_MASK(thread->waitq.wq_q);
		}
	}

	return first_thread;
}

/**
 * Routine to iterate over and remove threads from priority ordered waitqs
 *
 * Conditions:
 *	args->waitq (and the posted waitq) is locked
 *
 * Notes:
 *	The priority ordered waitqs only support maximum priority element removal.
 *
 *	Also, the implementation makes sure that all threads in a priority ordered
 *	waitq are waiting on the same wait event. This is not necessarily true for
 *	non-priority ordered waitqs. If one or more threads are selected, this may
 *	disable interrupts. The previous interrupt state is returned in args->spl
 *	and should be used in a call to splx() if threads are returned to the caller.
 *
 *	In the future, we could support priority ordered waitqs with multiple wait
 *	events in the same queue. The way to implement that would be to keep removing
 *	elements from the waitq and if the event does not match the requested one,
 *	add it to a local list. This local list of elements needs to be re-inserted
 *	into the priority queue at the end and the select_cb return value &
 *	remaining_eventmask would need to be handled appropriately. The implementation
 *	is not very efficient but would work functionally.
 */
static thread_t
waitq_prioq_iterate_locked(struct waitq *safeq, struct waitq *waitq,
    struct waitq_select_args *args, waitq_flags_t *remaining_eventmask)
{
	thread_t first_thread = THREAD_NULL;
	thread_t thread = THREAD_NULL;

	/*
	 * The only possible values for remaining_eventmask for the priority queue
	 * waitq are either 0 (for the remove all threads case) or the original
	 * safeq->waitq_eventmask (for the lookup/remove one thread cases).
	 */
	*remaining_eventmask = safeq->waitq_eventmask;

	while (args->nthreads < args->max_threads) {
		if (priority_queue_empty(&(safeq->waitq_prio_queue))) {
			*remaining_eventmask = 0;
			break;
		}

		thread = priority_queue_remove_max(&safeq->waitq_prio_queue,
		    struct thread, wait_prioq_links);

		/*
		 * Ensure the wait event matches since priority ordered waitqs do not
		 * support multiple events in the same waitq.
		 */
		assert(waitq_same(thread->waitq, waitq) && (thread->wait_event == args->event));

		if (args->select_cb) {
			/*
			 * Call the select_cb passed into the waitq_select args. The callback
			 * updates the select_ctx with information about the highest priority
			 * thread which is eventually used by the caller.
			 */
			thread_t __assert_only ret_thread = args->select_cb(waitq, thread);
			assert(ret_thread == thread);
		}

		if (first_thread == THREAD_NULL) {
			first_thread = thread;
			/*
			 * turnstile_kernel_update_inheritor_on_wake_locked will lock
			 * first_thread, so call it before locking it.
			 */
			if (args->priority == WAITQ_PROMOTE_ON_WAKE &&
			    first_thread != THREAD_NULL &&
			    waitq_type(safeq) == WQT_TURNSTILE) {
				turnstile_kernel_update_inheritor_on_wake_locked(waitq_to_turnstile(safeq),
				    (turnstile_inheritor_t)first_thread, TURNSTILE_INHERITOR_THREAD);
			}
		}

		/* Add the thread to the result thread list */
		args->nthreads += 1;
		if (args->nthreads == 1 && safeq == waitq) {
			args->spl = splsched();
		}
		thread_lock(thread);
		thread_clear_waitq_state(thread);
		circle_enqueue_tail(&args->threadq, &thread->wait_links);
	}

	return first_thread;
}

/**
 * @function do_waitq_select_n_locked_queue
 *
 * @brief
 * Selects threads waiting on a wait queue.
 *
 * @discussion
 * @c waitq is locked.
 * If @c waitq is a set, then the wait queue posting to it is locked too.
 *
 * Uses the optional select callback function to refine the selection
 * of one or more threads from a waitq.
 *
 * The select callback is invoked once for every thread that
 * is found to be waiting on the input args->waitq.
 *
 * If one or more threads are selected, this may disable interrupts.
 * The previous interrupt state is returned in args->spl and should
 * be used in a call to splx() if threads are returned to the caller.
 */
static void
do_waitq_select_n_locked_queue(waitq_t waitq, struct waitq_select_args *args)
{
	thread_t first_thread = THREAD_NULL;
	struct waitq *safeq;
	waitq_flags_t remaining_eventmask = 0;
	waitq_flags_t eventmask;

	if (waitq_irq_safe(waitq)) {
		eventmask = _CAST_TO_EVENT_MASK(args->event);
		safeq = waitq.wq_q;
	} else {
		/* JMM - add flag to waitq to avoid global lookup if no waiters */
		eventmask = _CAST_TO_EVENT_MASK(waitq.wq_q);
		safeq = waitq_get_safeq(waitq);
		if (safeq == NULL) {
			return;
		}

		if (args->nthreads == 0) {
			args->spl = splsched();
		}
		waitq_lock(safeq);
	}

	/*
	 * If the safeq doesn't have an eventmask (not global) or the event
	 * we're looking for IS set in its eventmask, then scan the threads
	 * in that queue for ones that match the original <waitq,event> pair.
	 */
	if (waitq_type(safeq) == WQT_TURNSTILE) {
		first_thread = waitq_prioq_iterate_locked(safeq, waitq.wq_q,
		    args, &remaining_eventmask);
	} else if (!waitq_is_global(safeq) ||
	    (safeq->waitq_eventmask & eventmask) == eventmask) {
		first_thread = waitq_queue_iterate_locked(safeq, waitq.wq_q,
		    args, &remaining_eventmask);

		/*
		 * Update the eventmask of global queues we just scanned:
		 * - If we selected all the threads in the queue,
		 *   we can clear its eventmask.
		 *
		 * - If we didn't find enough threads to fill our needs,
		 *   then we can assume we looked at every thread in the queue
		 *   and the mask we computed is complete - so reset it.
		 */
		if (waitq_is_global(safeq)) {
			if (waitq_empty(safeq)) {
				safeq->waitq_eventmask = 0;
			} else if (args->nthreads < args->max_threads) {
				safeq->waitq_eventmask = remaining_eventmask;
			}
		}
	}

	/*
	 * Grab the first thread in the queue if no other thread was selected.
	 * We can guarantee that no one has manipulated this thread because
	 * it's waiting on the given waitq, and we have that waitq locked.
	 */
	if (args->nthreads == 0 && first_thread != THREAD_NULL) {
		/* we know this is the first (and only) thread */
		args->nthreads += 1;
		if (safeq == waitq.wq_q) {
			args->spl = splsched();
		}

		thread_lock(first_thread);
		waitq_thread_remove(safeq, first_thread);
		circle_enqueue_tail(&args->threadq, &first_thread->wait_links);
	}

	/* unlock the safe queue if we locked one above */
	if (!waitq_same(waitq, safeq)) {
		waitq_unlock(safeq);
		if (args->nthreads == 0) {
			splx(args->spl);
			args->spl = 0;
		}
	}
}

/**
 * @function do_waitq_select_n_locked_sets()
 *
 * @brief
 * Selects threads waiting on any set a wait queue belongs to,
 * or preposts the wait queue onto them.
 *
 * @discussion
 * @c waitq is locked.
 */
__attribute__((noinline))
static void
do_waitq_select_n_locked_sets(waitq_t waitq, struct waitq_select_args *args)
{
	waitq_type_t wq_type = waitq_type(waitq);
	waitq_link_t link;
	hw_lock_status_t st;
	uint32_t ticket;

	assert(args->event == NO_EVENT64);
	assert(waitq_preposts(waitq));

	waitq_link_foreach(link, waitq) {
		waitq_t wqset = wql_wqs(link);

		if (wql_wqs_preposted(link)) {
			/*
			 * The wql_wqs_preposted() bit is cleared
			 * under both the wq and wqset locks.
			 *
			 * If the wqset is still preposted,
			 * we really won't find threads there.
			 *
			 * Just mark the waitq as preposted and move on.
			 */
			if (wq_type == WQT_PORT) {
				waitq.wq_q->waitq_preposted = true;
			}
			continue;
		}

		if (wq_type == WQT_SELECT) {
			/*
			 * If PGZ picked this select set,
			 * translate it to the real address.
			 *
			 * If it is still a select set
			 * (the slot could have been reused),
			 * then keep using it for the rest of the logic.
			 *
			 * Even in the extremely unlikely case where
			 * the slot was reused for another select_set,
			 * the `wql_sellink_valid` check below will
			 * take care of debouncing it. But we must
			 * forget the original pointer we read
			 * so that we unlock the proper object.
			 */
			wqset.wqs_sel = pgz_decode_allow_invalid(wqset.wqs_sel,
			    ZONE_ID_SELECT_SET);
			if (!wqset.wqs_sel) {
				continue;
			}
			st = waitq_lock_reserve_allow_invalid(wqset, &ticket);
			if (st == HW_LOCK_INVALID) {
				continue;
			}
		} else {
			static_assert(HW_LOCK_CONTENDED == 0);
			st = waitq_lock_reserve(wqset, &ticket);
		}
		if (st == HW_LOCK_CONTENDED) {
			if (!circle_queue_empty(&args->threadq)) {
				/*
				 * We are holding several thread locks.
				 *
				 * If we fail to acquire this waitq set lock,
				 * it is possible that another core is holding
				 * that (non IRQ-safe) waitq set lock,
				 * while an interrupt is trying to grab the
				 * thread lock of one of those threads.
				 *
				 * In order to avoid deadlocks, flush out
				 * the queue of threads.
				 *
				 * Note: this code will never run for `identify`
				 *       variants (when `max_threads` is 1).
				 */
				assert(args->max_threads > 1);
				waitq_select_queue_flush(waitq, args);
			}
			waitq_lock_wait(wqset, ticket);
		}

		if (wq_type == WQT_SELECT) {
			if (!wql_sellink_valid(wqset.wqs_sel, link.wqls)) {
				goto out_unlock;
			}
		} else if (!waitq_valid(wqset)) {
			goto out_unlock;
		}

		/*
		 * Find any threads waiting on this wait queue set as a queue.
		 */
		do_waitq_select_n_locked_queue(wqset, args);

		if (args->nthreads == 0) {
			/* No thread selected: prepost 'waitq' to 'wqset' */
			wql_wqs_mark_preposted(link);
			if (wq_type == WQT_SELECT) {
				wqset.wqs_sel->selset_preposted = true;
			} else {
				waitq.wq_q->waitq_preposted = true;
				circle_dequeue(&wqset.wqs_set->wqset_links,
				    &link.wqll->wql_slink);
				circle_enqueue_tail(&wqset.wqs_set->wqset_preposts,
				    &link.wqll->wql_slink);
				ipc_pset_prepost(wqset.wqs_set, waitq.wq_q);
			}
		}

out_unlock:
		waitq_unlock(wqset);

		if (args->nthreads >= args->max_threads) {
			break;
		}
	}
}

/**
 * @function do_waitq_select_n_locked
 *
 * @brief
 * Selects threads waiting on a wait queue, or preposts it.
 *
 * @discussion
 * @c waitq is locked.
 *
 * Recurses into all sets this wait queue belongs to.
 */
static void
do_waitq_select_n_locked(waitq_t waitq, struct waitq_select_args *args)
{
	do_waitq_select_n_locked_queue(waitq, args);

	if (args->nthreads >= args->max_threads) {
		/* already enough threads found */
		return;
	}

	if (args->event != NO_EVENT64 || !waitq_preposts(waitq)) {
		/* this wakeup should not recurse into sets */
		return;
	}

	do_waitq_select_n_locked_sets(waitq, args);
}

static inline bool
waitq_is_preposted_set(waitq_t waitq)
{
	switch (waitq_type(waitq)) {
	case WQT_PORT_SET:
		return waitq_set_first_prepost(waitq.wqs_set, WQS_PREPOST_PEEK) != NULL;

	case WQT_SELECT_SET:
		return waitq.wqs_sel->selset_preposted;

	default:
		return false;
	}
}

wait_result_t
waitq_assert_wait64_locked(waitq_t waitq,
    event64_t wait_event,
    wait_interrupt_t interruptible,
    wait_timeout_urgency_t urgency,
    uint64_t deadline,
    uint64_t leeway,
    thread_t thread)
{
	wait_result_t wait_result;
	struct waitq *safeq;
	uintptr_t eventmask;
	spl_t s;

	switch (waitq_type(waitq)) {
	case WQT_PORT:
	case WQT_SELECT:
	case WQT_PORT_SET:
	case WQT_SELECT_SET:
		assert(wait_event == NO_EVENT64);
		break;
	default:
		assert(wait_event != NO_EVENT64);
		break;
	}

	/*
	 * Warning: Do _not_ place debugging print statements here.
	 *          The waitq is locked!
	 */
	assert(!thread->started || thread == current_thread());

	if (!waitq_wait_possible(thread)) {
		panic("thread already waiting on %p", thread->waitq.wq_q);
	}

	s = splsched();

	/*
	 * early-out if the thread is waiting on a wait queue set
	 * that has already been pre-posted.
	 *
	 * Note: waitq_is_preposted_set() may unlock the waitq-set
	 */
	if (waitq_is_preposted_set(waitq)) {
		thread_lock(thread);
		thread->wait_result = THREAD_AWAKENED;
		thread_unlock(thread);
		splx(s);
		return THREAD_AWAKENED;
	}

	/*
	 * If already dealing with an irq safe wait queue, we are all set.
	 * Otherwise, determine a global queue to use and lock it.
	 */
	if (waitq_irq_safe(waitq)) {
		safeq = waitq.wq_q;
		eventmask = _CAST_TO_EVENT_MASK(wait_event);
	} else {
		safeq = waitq_get_safeq(waitq);
		if (__improbable(safeq == NULL)) {
			panic("Trying to assert_wait on a turnstile proxy "
			    "that hasn't been donated one (waitq: %p)", waitq.wq_q);
		}
		eventmask = _CAST_TO_EVENT_MASK(waitq.wq_q);
		waitq_lock(safeq);
	}

	/* lock the thread now that we have the irq-safe waitq locked */
	thread_lock(thread);

	wait_result = thread_mark_wait_locked(thread, interruptible);
	/* thread->wait_result has been set */
	if (wait_result == THREAD_WAITING) {
		waitq_thread_insert(safeq, thread, waitq, wait_event);

		if (deadline != 0) {
			boolean_t act;

			act = timer_call_enter_with_leeway(thread->wait_timer,
			    NULL,
			    deadline, leeway,
			    urgency, FALSE);
			if (!act) {
				thread->wait_timer_active++;
			}
			thread->wait_timer_is_set = TRUE;
		}

		if (waitq_is_global(safeq)) {
			safeq->waitq_eventmask |= (waitq_flags_t)eventmask;
		}

		waitq_stats_count_wait(waitq);
	}

	/* unlock the thread */
	thread_unlock(thread);

	/* update the inheritor's thread priority if the waitq is embedded in turnstile */
	if (waitq_type(safeq) == WQT_TURNSTILE && wait_result == THREAD_WAITING) {
		turnstile_recompute_priority_locked(waitq_to_turnstile(safeq));
		turnstile_update_inheritor_locked(waitq_to_turnstile(safeq));
	}

	/* unlock the safeq if we locked it here */
	if (!waitq_same(waitq, safeq)) {
		waitq_unlock(safeq);
	}

	splx(s);

	return wait_result;
}

bool
waitq_pull_thread_locked(waitq_t waitq, thread_t thread)
{
	struct waitq *safeq;
	uint32_t ticket;

	assert_thread_magic(thread);

	/* Find the interrupts disabled queue thread is waiting on */
	if (waitq_irq_safe(waitq)) {
		safeq = waitq.wq_q;
	} else {
		safeq = waitq_get_safeq(waitq);
		if (__improbable(safeq == NULL)) {
			panic("Trying to clear_wait on a turnstile proxy "
			    "that hasn't been donated one (waitq: %p)", waitq.wq_q);
		}
	}

	/*
	 * thread is already locked so have to try for the waitq lock.
	 *
	 * We can't wait for the waitq lock under the thread lock,
	 * however we can reserve our slot in the lock queue,
	 * and if that reservation requires waiting, we are guaranteed
	 * that this waitq can't die until we get our turn!
	 */
	if (!waitq_lock_reserve(safeq, &ticket)) {
		thread_unlock(thread);
		waitq_lock_wait(safeq, ticket);
		thread_lock(thread);

		if (!waitq_same(waitq, thread->waitq)) {
			/*
			 * While we were waiting for our reservation the thread
			 * stopped waiting on this waitq, bail out.
			 */
			waitq_unlock(safeq);
			return false;
		}
	}

	waitq_thread_remove(safeq, thread);
	waitq_stats_count_clear_wakeup(waitq);
	waitq_unlock(safeq);
	return true;
}


void
waitq_clear_promotion_locked(waitq_t waitq, thread_t thread)
{
	spl_t s = 0;

	assert(waitq_held(waitq));
	assert(thread != THREAD_NULL);
	assert(thread == current_thread());

	/* This flag is only cleared by the thread itself, so safe to check outside lock */
	if ((thread->sched_flags & TH_SFLAG_WAITQ_PROMOTED) != TH_SFLAG_WAITQ_PROMOTED) {
		return;
	}

	if (!waitq_irq_safe(waitq)) {
		s = splsched();
	}
	thread_lock(thread);

	sched_thread_unpromote_reason(thread, TH_SFLAG_WAITQ_PROMOTED, 0);

	thread_unlock(thread);
	if (!waitq_irq_safe(waitq)) {
		splx(s);
	}
}

kern_return_t
waitq_wakeup64_all_locked(waitq_t waitq,
    event64_t wake_event,
    wait_result_t result,
    int priority,
    waitq_lock_state_t lock_state)
{
	struct waitq_select_args args = {
		.event = wake_event,
		.priority = priority,
		.max_threads = UINT32_MAX,
		.result = result,
		.options = WQ_OPTION_NONE,
	};

	assert(waitq_held(waitq));

	do_waitq_select_n_locked(waitq, &args);
	waitq_stats_count_wakeup(waitq, args.nthreads);

	if (lock_state == WAITQ_UNLOCK) {
		waitq_unlock(waitq);
	}

	waitq_select_queue_flush(waitq, &args);

	if (args.nthreads > 0) {
		splx(args.spl);
		return KERN_SUCCESS;
	}

	return KERN_NOT_WAITING;
}
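
/*
 * Illustrative call pattern (sketch, not in the original source; assumes
 * the WAITQ_ALL_PRIORITIES constant from waitq.h): waking every waiter
 * for an event while letting the subsystem handle unlock and splx:
 *
 *     waitq_lock(waitq);
 *     kr = waitq_wakeup64_all_locked(waitq, event, THREAD_AWAKENED,
 *         WAITQ_ALL_PRIORITIES, WAITQ_UNLOCK);
 *     // waitq is unlocked on return; KERN_NOT_WAITING means nobody waited
 */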

kern_return_t
waitq_wakeup64_one_locked(waitq_t waitq,
    event64_t wake_event,
    wait_result_t result,
    int priority,
    waitq_lock_state_t lock_state,
    waitq_options_t option)
{
	struct waitq_select_args args = {
		.event = wake_event,
		.priority = priority,
		.max_threads = 1,
		.result = result,
		.options = option,
	};

	assert(waitq_held(waitq));

	do_waitq_select_n_locked(waitq, &args);
	waitq_stats_count_wakeup(waitq, args.nthreads);

	if (lock_state == WAITQ_UNLOCK) {
		waitq_unlock(waitq);
	}

	waitq_select_queue_flush(waitq, &args);

	if (args.nthreads > 0) {
		splx(args.spl);
		return KERN_SUCCESS;
	}

	return KERN_NOT_WAITING;
}

thread_t
waitq_wakeup64_identify_locked(waitq_t waitq,
    event64_t        wake_event,
    wait_result_t    result,
    spl_t            *spl,
    int              priority,
    waitq_lock_state_t lock_state)
{
	struct waitq_select_args args = {
		.event = wake_event,
		.priority = priority,
		.max_threads = 1,
	};
	thread_t thread = THREAD_NULL;

	assert(waitq_held(waitq));

	do_waitq_select_n_locked(waitq, &args);
	waitq_stats_count_wakeup(waitq, args.nthreads);

	if (lock_state == WAITQ_UNLOCK) {
		waitq_unlock(waitq);
	}

	if (args.nthreads > 0) {
		kern_return_t __assert_only ret;

		thread = cqe_dequeue_head(&args.threadq, struct thread, wait_links);
		assert(args.nthreads == 1 && circle_queue_empty(&args.threadq));

		maybe_adjust_thread_pri(thread, priority, waitq);
		ret = thread_go(thread, result, WQ_OPTION_NONE);
		assert(ret == KERN_SUCCESS);
		*spl = args.spl;
	}

	return thread; /* locked if not NULL (caller responsible for spl) */
}

kern_return_t
waitq_wakeup64_thread_and_unlock(struct waitq *waitq, event64_t event,
    thread_t thread, wait_result_t result)
{
	kern_return_t ret = KERN_NOT_WAITING;

	assert(waitq_irq_safe(waitq));
	assert(waitq_held(waitq));
	assert_thread_magic(thread);

	/*
	 * See if the thread was still waiting there.  If so, it got
	 * dequeued and returned locked.
	 */
	thread_lock(thread);

	if (waitq_same(thread->waitq, waitq) && thread->wait_event == event) {
		waitq_thread_remove(waitq, thread);
		ret = KERN_SUCCESS;
	}
	waitq_stats_count_wakeup(waitq, ret == KERN_SUCCESS ? 1 : 0);

	waitq_unlock(waitq);

	if (ret == KERN_SUCCESS) {
		ret = thread_go(thread, result, WQ_OPTION_NONE);
		assert(ret == KERN_SUCCESS);
	}

	thread_unlock(thread);

	return ret;
}


#pragma mark waitq

__attribute__((always_inline))
void
waitq_init(waitq_t waitq, waitq_type_t type, int policy)
{
	assert((policy & SYNC_POLICY_FIXED_PRIORITY) == 0);

	*waitq.wq_q = (struct waitq){
		.waitq_type  = type,
		.waitq_fifo  = ((policy & SYNC_POLICY_REVERSED) == 0),
	};

	switch (type) {
	case WQT_INVALID:
		__builtin_trap();

	case WQT_TURNSTILE:
		/* For turnstile, initialize it as a priority queue */
		priority_queue_init(&waitq.wq_q->waitq_prio_queue);
		assert(waitq.wq_q->waitq_fifo == 0);
		break;

	case WQT_PORT:
		waitq.wq_q->waitq_ts = TURNSTILE_NULL;
		break;

	case WQT_PORT_SET:
		circle_queue_init(&waitq.wqs_set->wqset_preposts);
		OS_FALLTHROUGH;
	case WQT_SELECT_SET:
	case WQT_QUEUE:
	case WQT_SELECT:
		circle_queue_init(&waitq.wq_q->waitq_queue);
		break;
	}

	if (policy & SYNC_POLICY_INIT_LOCKED) {
		hw_lck_ticket_init_locked(&waitq.wq_q->waitq_interlock, &waitq_lck_grp);
	} else {
		hw_lck_ticket_init(&waitq.wq_q->waitq_interlock, &waitq_lck_grp);
	}
}
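
/*
 * Illustrative use (taken from waitq_bootstrap() above): a plain FIFO
 * queue is initialized as:
 *
 *     waitq_init(&select_conflict_queue, WQT_SELECT, SYNC_POLICY_FIFO);
 *
 * SYNC_POLICY_REVERSED clears waitq_fifo (LIFO service), and
 * SYNC_POLICY_INIT_LOCKED returns with the interlock already held.
 */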

void
waitq_deinit(waitq_t waitq)
{
	waitq_type_t type = waitq_type(waitq);

	switch (type) {
	case WQT_QUEUE:
		assert(circle_queue_empty(&waitq.wq_q->waitq_queue));
		waitq_invalidate(waitq);
		break;

	case WQT_TURNSTILE:
		assert(priority_queue_empty(&waitq.wq_q->waitq_prio_queue));
		assert(waitq.wq_q->waitq_inheritor == TURNSTILE_INHERITOR_NULL);
		waitq_invalidate(waitq);
		break;

	case WQT_PORT:
		assert(waitq.wq_q->waitq_ts == TURNSTILE_NULL);
		assert(circle_queue_empty(&waitq.wq_q->waitq_links));
		break;

	case WQT_SELECT:
		assert(waitq.wq_q->waitq_sellinks.next == NULL);
		assert(circle_queue_empty(&waitq.wqs_set->wqset_queue));
		break;

	case WQT_PORT_SET:
		assert(circle_queue_empty(&waitq.wqs_set->wqset_queue));
		assert(circle_queue_empty(&waitq.wqs_set->wqset_links));
		assert(circle_queue_empty(&waitq.wqs_set->wqset_preposts));
		break;

	default:
		panic("invalid wait type: %p/%d", waitq.wq_q, type);
	}

	/*
	 * The waitq must have been invalidated, or hw_lck_ticket_destroy()
	 * below won't wait for reservations from waitq_lock_reserve(),
	 * waitq_lock_reserve_allow_invalid() or waitq_lock_allow_invalid().
	 */
	assert(!waitq_valid(waitq.wqs_set));
	hw_lck_ticket_destroy(&waitq.wq_q->waitq_interlock, &waitq_lck_grp);
}


#pragma mark port-set sets

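/*
 * Note (added commentary): waitq_set_unlink_all_locked() below is
 * entered with the set locked and unhooks links in batches of
 * waitq_set_unlink_batch, dropping and retaking the set lock between
 * batches so that concurrent senders are not starved.  When a member
 * queue's lock cannot be reserved immediately, the set lock is dropped
 * while waiting on the ticket, and the queue head is re-checked
 * ("stable") once both locks are held again.
 */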
void
waitq_set_unlink_all_locked(struct waitq_set *wqset, waitq_link_list_t *free_l)
{
	uint32_t batch = waitq_set_unlink_batch;

	waitq_invalidate(wqset);

	for (;;) {
		struct waitq_link *link;
		queue_entry_t elt;
		circle_queue_t q;
		struct waitq *wq;
		uint32_t ticket;
		bool stable = true;

		if (!circle_queue_empty(&wqset->wqset_links)) {
			q = &wqset->wqset_links;
		} else if (!circle_queue_empty(&wqset->wqset_preposts)) {
			q = &wqset->wqset_preposts;
		} else {
			break;
		}

		if (batch-- == 0) {
			waitq_unlock(wqset);
			waitq_lock(wqset);
			batch = waitq_set_unlink_batch;
			continue;
		}

		elt  = circle_queue_first(q);
		link = cqe_element(elt, struct waitq_link, wql_slink);
		wq   = link->wql_wq;

		if (__improbable(!waitq_lock_reserve(wq, &ticket))) {
			waitq_unlock(wqset);
			waitq_lock_wait(wq, ticket);
			waitq_lock(wqset);
			stable = (elt == circle_queue_first(q) && link->wql_wq == wq);
		}

		if (stable) {
			circle_dequeue(q, &link->wql_slink);
			circle_dequeue(&wq->waitq_links, &link->wql_qlink);
			wql_list_push(free_l, link);
		}

		waitq_unlock(wq);
	}
}

void
waitq_clear_prepost_locked(struct waitq *waitq)
{
	assert(waitq_type(waitq) == WQT_PORT);
	waitq->waitq_preposted = false;
}

void
waitq_set_foreach_member_locked(struct waitq_set *wqs, void (^cb)(struct waitq *))
{
	struct waitq_link *link;

	cqe_foreach_element(link, &wqs->wqset_links, wql_slink) {
		cb(link->wql_wq);
	}

	cqe_foreach_element(link, &wqs->wqset_preposts, wql_slink) {
		cb(link->wql_wq);
	}
}
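
/*
 * Illustrative use of waitq_set_foreach_member_locked() (added
 * commentary, hypothetical caller): the block runs under the set lock
 * for both linked and preposted members, e.g. to count them:
 *
 *	__block uint32_t members = 0;
 *
 *	waitq_lock(wqs);
 *	waitq_set_foreach_member_locked(wqs, ^(struct waitq *wq) {
 *		printf("member: %p\n", wq);
 *		members++;
 *	});
 *	waitq_unlock(wqs);
 */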

__abortlike
static void
__waitq_link_arguments_panic(struct waitq *waitq, struct waitq_set *wqset)
{
	if (!waitq_valid(waitq)) {
		panic("Invalid waitq: %p", waitq);
	}
	if (waitq_type(waitq) != WQT_PORT) {
		panic("Invalid waitq type: %p:%d", waitq, waitq->waitq_type);
	}
	panic("Invalid waitq-set: %p", wqset);
}

static inline void
__waitq_link_arguments_validate(struct waitq *waitq, struct waitq_set *wqset)
{
	if (!waitq_valid(waitq) ||
	    waitq_type(waitq) != WQT_PORT ||
	    waitq_type(wqset) != WQT_PORT_SET) {
		__waitq_link_arguments_panic(waitq, wqset);
	}
}

__abortlike
static void
__waitq_invalid_panic(waitq_t waitq)
{
	panic("Invalid waitq: %p", waitq.wq_q);
}

static void
__waitq_validate(waitq_t waitq)
{
	if (!waitq_valid(waitq)) {
		__waitq_invalid_panic(waitq);
	}
}

kern_return_t
waitq_link_locked(struct waitq *waitq, struct waitq_set *wqset,
    waitq_link_t *linkp)
{
	assert(linkp->wqlh);

	__waitq_link_arguments_validate(waitq, wqset);

	if (wql_find(waitq, wqset)) {
		return KERN_ALREADY_IN_SET;
	}

	linkp->wqll->wql_wq = waitq;
	linkp->wqll->wql_wqs = (uintptr_t)wqset;

	if (waitq_valid(wqset)) {
		circle_enqueue_tail(&wqset->wqset_links, &linkp->wqll->wql_slink);
		circle_enqueue_tail(&waitq->waitq_links, &linkp->wqll->wql_qlink);
		*linkp = WQL_NULL;
	}

	return KERN_SUCCESS;
}

kern_return_t
waitq_link_prepost_locked(struct waitq *waitq, struct waitq_set *wqset)
{
	struct waitq_link *link;

	__waitq_link_arguments_validate(waitq, wqset);

	link = wql_find(waitq, wqset);
	if (link == NULL) {
		return KERN_NOT_IN_SET;
	}

	if (!wql_wqs_preposted(link)) {
		wql_wqs_mark_preposted(link);
		waitq->waitq_preposted = true;
		circle_dequeue(&wqset->wqset_links, &link->wql_slink);
		circle_enqueue_tail(&wqset->wqset_preposts, &link->wql_slink);
		ipc_pset_prepost(wqset, waitq);
	}

	return KERN_SUCCESS;
}
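
/*
 * Note (added commentary): a link lives on exactly one of the set's two
 * lists.  waitq_link_locked() enqueues it on wqset_links; the first
 * prepost marks it (wql_wqs_mark_preposted) and moves it to
 * wqset_preposts, so subsequent preposts are cheap no-ops until the
 * prepost is cleared and the link migrates back.
 */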

waitq_link_t
waitq_unlink_locked(struct waitq *waitq, struct waitq_set *wqset)
{
	struct waitq_link *link;

	__waitq_link_arguments_validate(waitq, wqset);

	link = wql_find(waitq, wqset);
	if (link) {
		circle_dequeue(wql_wqs_queue(wqset, link), &link->wql_slink);
		circle_dequeue(&waitq->waitq_links, &link->wql_qlink);
	}

	return (waitq_link_t){ .wqll = link };
}

void
waitq_unlink_all_locked(struct waitq *waitq, struct waitq_set *except_wqset,
    waitq_link_list_t *free_l)
{
	struct waitq_link *kept_link = NULL;
	struct waitq_link *link;

	assert(waitq_type(waitq) == WQT_PORT);

	cqe_foreach_element_safe(link, &waitq->waitq_links, wql_qlink) {
		waitq_t wqs = wql_wqs(link);

		if (wqs.wqs_set == except_wqset) {
			kept_link = link;
			continue;
		}

		waitq_lock(wqs);
		circle_dequeue(wql_wqs_queue(wqs.wqs_set, link),
		    &link->wql_slink);
		wql_list_push(free_l, link);
		waitq_unlock(wqs);
	}

	circle_queue_init(&waitq->waitq_links);
	if (kept_link) {
		circle_enqueue_tail(&waitq->waitq_links, &kept_link->wql_qlink);
	}
}

struct waitq *
waitq_set_first_prepost(struct waitq_set *wqset, wqs_prepost_flags_t flags)
{
	circle_queue_t q = &wqset->wqset_preposts;
	queue_entry_t elt;
	struct waitq_link *link;
	struct waitq *wq;
	uint32_t ticket;

	if (__improbable(!waitq_valid(wqset))) {
		return NULL;
	}

	while (!circle_queue_empty(q)) {
		elt  = circle_queue_first(q);
		link = cqe_element(elt, struct waitq_link, wql_slink);
		wq   = link->wql_wq;

		if (__improbable(!waitq_lock_reserve(wq, &ticket))) {
			waitq_unlock(wqset);
			waitq_lock_wait(wq, ticket);
			waitq_lock(wqset);
			if (!waitq_valid(wqset)) {
				waitq_unlock(wq);
				return NULL;
			}

			if (elt != circle_queue_first(q) || link->wql_wq != wq) {
				waitq_unlock(wq);
				continue;
			}
		}

		if (wq->waitq_preposted) {
			if ((flags & WQS_PREPOST_PEEK) == 0) {
				circle_queue_rotate_head_forward(q);
			}
			if ((flags & WQS_PREPOST_LOCK) == 0) {
				waitq_unlock(wq);
			}
			return wq;
		}
		/*
		 * We found a link that is no longer preposted:
		 * someone must have called waitq_clear_prepost_locked()
		 * and this set just hadn't noticed yet.
		 */
		wql_wqs_clear_preposted(link);
		waitq_unlock(wq);

		circle_dequeue(q, &link->wql_slink);
		circle_enqueue_tail(&wqset->wqset_links, &link->wql_slink);
	}

	return NULL;
}


#pragma mark select sets

/**
 * @function select_set_nextid()
 *
 * @brief
 * Generate a unique ID for a select set "generation".
 *
 * @discussion
 * This mixes the CPU number with a monotonic per-CPU counter
 * (in order to avoid contention on a global atomic).
 *
 * In order for select sets to be invalidated very quickly,
 * they do not have backward linkages to their member queues.
 *
 * Instead, each time a new @c select() "pass" is initiated,
 * a new ID is generated, which is copied onto the @c waitq_sellink
 * links at the time of link.
 *
 * The zone for select sets is sequestered, which allows select
 * wait queues to speculatively lock their set during prepost
 * and use this ID to debounce wakeups.  This is only an
 * "optimization": select recovers from spurious wakeups,
 * we just want them to be very rare.
 */
__attribute__((always_inline))
static inline uint64_t
select_set_nextid(bool preemption_enabled)
{
	/* waitq_bootstrap() set the low byte to a unique value per CPU */
	static_assert(MAX_CPUS <= 256);
	const uint64_t inc = 256;
	uint64_t id;

#ifdef __x86_64__
	/* uncontended atomics are slower than disabling preemption on Intel */
	if (preemption_enabled) {
		disable_preemption();
	}
	id = (*PERCPU_GET(select_setid) += inc);
	if (preemption_enabled) {
		enable_preemption();
	}
#else
	/*
	 * If preemption is enabled, this might update another CPU's
	 * setid; that is rare and acceptable, since it still
	 * produces a unique select ID.
	 *
	 * We chose this because the uncontended atomics on !intel
	 * are faster than disabling/reenabling preemption.
	 */
	(void)preemption_enabled;
	id = os_atomic_add(PERCPU_GET(select_setid), inc, relaxed);
#endif

	return id;
}
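
/*
 * Worked example (added commentary, not part of the original source):
 * waitq_bootstrap() seeds each CPU's select_setid so that the low byte
 * identifies the CPU, and every call above adds 256.  On CPU 3 the
 * successive IDs are thus 0x103, 0x203, 0x303, ...; conceptually
 * id == (per_cpu_count << 8) | cpu_number, which stays unique across
 * CPUs without touching a shared atomic.
 */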

struct select_set *
select_set_alloc(void)
{
	struct select_set *selset;
	selset = zalloc_id(ZONE_ID_SELECT_SET, Z_ZERO | Z_WAITOK | Z_NOFAIL);

	waitq_init(selset, WQT_SELECT_SET, SYNC_POLICY_FIFO);
	selset->selset_id = select_set_nextid(true);

	return selset;
}

__abortlike
static void
__select_set_link_arguments_panic(struct waitq *waitq, struct select_set *set)
{
	if (!waitq_valid(waitq)) {
		panic("Invalid waitq: %p", waitq);
	}
	if (waitq_type(waitq) != WQT_SELECT) {
		panic("Invalid waitq type: %p:%d", waitq, waitq->waitq_type);
	}
	panic("Invalid waitq-set: %p", set);
}

static inline void
__select_set_link_arguments_validate(struct waitq *waitq, struct select_set *set)
{
	if (!waitq_valid(waitq) ||
	    waitq_type(waitq) != WQT_SELECT ||
	    waitq_type(set) != WQT_SELECT_SET) {
		__select_set_link_arguments_panic(waitq, set);
	}
}

void
select_set_link(struct waitq *waitq, struct select_set *set,
    waitq_link_t *linkp)
{
	struct waitq_sellink *link;

	__select_set_link_arguments_validate(waitq, set);

	waitq_lock(waitq);

	if (waitq == &select_conflict_queue) {
		waitq_lock(set);
		set->selset_conflict = true;
		waitq_unlock(set);
	}

	wql_list_foreach(link, &waitq->waitq_sellinks) {
		if (waitq_same(wql_wqs(link), set)) {
			goto found;
		}
	}

	link = linkp->wqls;
	*linkp = WQL_NULL;
	wql_list_push(&waitq->waitq_sellinks, link);

found:
	link->wql_wqs = (uintptr_t)set;
	link->wql_setid = set->selset_id;
	waitq_unlock(waitq);
}

static void
select_set_unlink_conflict_queue(struct select_set *set)
{
	struct waitq_link_list_entry **prev;
	struct waitq_sellink *link;

	waitq_lock(&select_conflict_queue);

	/*
	 * We know the conflict queue is hooked,
	 * so find the linkage and free it.
	 */
	prev = &select_conflict_queue.waitq_sellinks.next;
	for (;;) {
		assert(*prev);
		link = wql_list_elem(*prev);
		if (waitq_same(wql_wqs(link), set)) {
			*prev = link->wql_next.next;
			break;
		}
		prev = &link->wql_next.next;
	}

	waitq_unlock(&select_conflict_queue);

	waitq_link_free(WQT_SELECT_SET, link);
}

static void
__select_set_reset(struct select_set *set, bool invalidate)
{
	if (set->selset_conflict) {
		select_set_unlink_conflict_queue(set);
	}

	waitq_lock(set);
	if (invalidate) {
		waitq_invalidate(set);
	}
	set->selset_id = select_set_nextid(false);
	set->selset_preposted = 0;
	set->selset_conflict = 0;
	waitq_unlock(set);
}

void
select_set_reset(struct select_set *set)
{
	__select_set_reset(set, false);
}

void
select_set_free(struct select_set *set)
{
	__select_set_reset(set, true);
	hw_lck_ticket_destroy(&set->selset_interlock, &waitq_lck_grp);
	zfree_id(ZONE_ID_SELECT_SET, set);
}

void
select_waitq_wakeup_and_deinit(
	struct waitq           *waitq,
	event64_t               wake_event,
	wait_result_t           result,
	int                     priority)
{
	waitq_link_list_t free_l = { };

	if (waitq_is_valid(waitq)) {
		assert(waitq_type(waitq) == WQT_SELECT);

		waitq_lock(waitq);

		waitq_wakeup64_all_locked(waitq, wake_event, result,
		    priority, WAITQ_KEEP_LOCKED);

		waitq_invalidate(waitq);
		free_l = waitq->waitq_sellinks;
		waitq->waitq_sellinks.next = NULL;

		waitq_unlock(waitq);

		waitq_link_free_list(WQT_SELECT, &free_l);

		waitq_deinit(waitq);
	}
}

#pragma mark assert_wait / wakeup (high level)

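/*
 * Note (added commentary): the wrappers below all share one shape:
 *
 *	__waitq_validate(waitq);
 *	if (waitq_irq_safe(waitq)) {
 *		s = splsched();
 *	}
 *	waitq_lock(waitq);
 *	... call the _locked variant ...
 *	if (waitq_irq_safe(waitq)) {
 *		splx(s);
 *	}
 *
 * IRQ-safe queues must be manipulated at splsched() so that an
 * interrupt taken while the waitq lock is held cannot deadlock
 * against it.
 */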
wait_result_t
waitq_assert_wait64(struct waitq *waitq,
    event64_t wait_event,
    wait_interrupt_t interruptible,
    uint64_t deadline)
{
	thread_t thread = current_thread();
	wait_result_t ret;
	spl_t s = 0;

	__waitq_validate(waitq);

	if (waitq_irq_safe(waitq)) {
		s = splsched();
	}
	waitq_lock(waitq);

	ret = waitq_assert_wait64_locked(waitq, wait_event, interruptible,
	    TIMEOUT_URGENCY_SYS_NORMAL, deadline, TIMEOUT_NO_LEEWAY, thread);

	waitq_unlock(waitq);
	if (waitq_irq_safe(waitq)) {
		splx(s);
	}

	return ret;
}

wait_result_t
waitq_assert_wait64_leeway(struct waitq *waitq,
    event64_t wait_event,
    wait_interrupt_t interruptible,
    wait_timeout_urgency_t urgency,
    uint64_t deadline,
    uint64_t leeway)
{
	wait_result_t ret;
	thread_t thread = current_thread();
	spl_t s = 0;

	__waitq_validate(waitq);

	if (waitq_irq_safe(waitq)) {
		s = splsched();
	}
	waitq_lock(waitq);

	ret = waitq_assert_wait64_locked(waitq, wait_event, interruptible,
	    urgency, deadline, leeway, thread);

	waitq_unlock(waitq);
	if (waitq_irq_safe(waitq)) {
		splx(s);
	}

	return ret;
}

kern_return_t
waitq_wakeup64_one(struct waitq *waitq, event64_t wake_event,
    wait_result_t result, int priority)
{
	kern_return_t kr;
	spl_t spl = 0;

	__waitq_validate(waitq);

	if (waitq_irq_safe(waitq)) {
		spl = splsched();
	}
	waitq_lock(waitq);

	/* waitq is unlocked upon return (WAITQ_UNLOCK) */
	kr = waitq_wakeup64_one_locked(waitq, wake_event, result,
	    priority, WAITQ_UNLOCK, WQ_OPTION_NONE);

	if (waitq_irq_safe(waitq)) {
		splx(spl);
	}

	return kr;
}

kern_return_t
waitq_wakeup64_all(waitq_t waitq, event64_t wake_event,
    wait_result_t result, int priority)
{
	kern_return_t ret;
	spl_t spl = 0;

	__waitq_validate(waitq);

	if (waitq_irq_safe(waitq)) {
		spl = splsched();
	}
	waitq_lock(waitq);

	ret = waitq_wakeup64_all_locked(waitq, wake_event, result,
	    priority, WAITQ_UNLOCK);

	if (waitq_irq_safe(waitq)) {
		splx(spl);
	}

	return ret;
}

kern_return_t
waitq_wakeup64_thread(struct waitq *waitq, event64_t event,
    thread_t thread, wait_result_t result)
{
	spl_t s = splsched();
	kern_return_t ret;

	__waitq_validate(waitq);
	assert(waitq_irq_safe(waitq));
	waitq_lock(waitq);

	ret = waitq_wakeup64_thread_and_unlock(waitq, event, thread, result);

	splx(s);

	return ret;
}

thread_t
waitq_wakeup64_identify(waitq_t waitq, event64_t wake_event,
    wait_result_t result, int priority)
{
	spl_t thread_spl = 0;
	thread_t thread;
	spl_t spl = 0;

	__waitq_validate(waitq);

	if (waitq_irq_safe(waitq)) {
		spl = splsched();
	}
	waitq_lock(waitq);

	thread = waitq_wakeup64_identify_locked(waitq, wake_event, result,
	    &thread_spl, priority, WAITQ_UNLOCK);
	/* waitq is unlocked, thread is locked */

	if (thread != THREAD_NULL) {
		thread_reference(thread);
		thread_unlock(thread);
		splx(thread_spl);
	}

	if (waitq_irq_safe(waitq)) {
		splx(spl);
	}

	/* returns +1 ref to running thread or THREAD_NULL */
	return thread;
}


#pragma mark tests
#if DEBUG || DEVELOPMENT

#include <ipc/ipc_pset.h>
#include <sys/errno.h>

#define MAX_GLOBAL_TEST_QUEUES 64
static struct waitq wqt_waitq_array[MAX_GLOBAL_TEST_QUEUES];
static bool wqt_running;
static bool wqt_init;

static bool
wqt_start(const char *test, int64_t *out)
{
	if (os_atomic_xchg(&wqt_running, true, acquire)) {
		*out = 0;
		return false;
	}

	if (!wqt_init) {
		wqt_init = true;
		for (int i = 0; i < MAX_GLOBAL_TEST_QUEUES; i++) {
			waitq_init(&wqt_waitq_array[i], WQT_PORT, SYNC_POLICY_FIFO);
		}
	}

	printf("[WQ] starting %s\n", test);
	return true;
}

static int
wqt_end(const char *test, int64_t *out)
{
	os_atomic_store(&wqt_running, false, release);
	printf("[WQ] done %s\n", test);
	*out = 1;
	return 0;
}

static struct waitq *
wqt_wq(uint32_t index)
{
	return &wqt_waitq_array[index];
}

static uint32_t
wqt_idx(struct waitq *waitq)
{
	assert(waitq >= wqt_waitq_array &&
	    waitq < wqt_waitq_array + MAX_GLOBAL_TEST_QUEUES);
	return (uint32_t)(waitq - wqt_waitq_array);
}

__attribute__((overloadable))
static uint64_t
wqt_bit(uint32_t index)
{
	return 1ull << index;
}

__attribute__((overloadable))
static uint64_t
wqt_bit(struct waitq *waitq)
{
	return wqt_bit(wqt_idx(waitq));
}

static struct waitq_set *
wqt_wqset_create(void)
{
	struct waitq_set *wqset;

	wqset = &ipc_pset_alloc_special(ipc_space_kernel)->ips_wqset;
	printf("[WQ]: created waitq set %p\n", wqset);
	return wqset;
}

static void
wqt_wqset_free(struct waitq_set *wqset)
{
	printf("[WQ]: destroying waitq set %p\n", wqset);
	waitq_lock(wqset);
	ipc_pset_destroy(ipc_space_kernel,
	    __container_of(wqset, struct ipc_pset, ips_wqset));
}

static void
wqt_link(uint32_t index, struct waitq_set *wqset, kern_return_t want)
{
	struct waitq *waitq = wqt_wq(index);
	waitq_link_t link = waitq_link_alloc(WQT_PORT_SET);
	kern_return_t kr;

	printf("[WQ]: linking waitq [%d] to global wqset (%p)\n", index, wqset);

	waitq_lock(waitq);
	waitq_lock(wqset);
	kr = waitq_link_locked(waitq, wqset, &link);
	waitq_unlock(wqset);
	waitq_unlock(waitq);

	if (link.wqlh) {
		waitq_link_free(WQT_PORT_SET, link);
	}

	printf("[WQ]:\tkr=%d\texpected=%d\n", kr, want);
	assert(kr == want);
}

static void
wqt_unlink(uint32_t index, struct waitq_set *wqset, kern_return_t want)
{
	struct waitq *waitq = wqt_wq(index);
	waitq_link_t link;
	kern_return_t kr;

	printf("[WQ]: unlinking waitq [%d] from global wqset (%p)\n",
	    index, wqset);

	waitq_lock(waitq);
	waitq_lock(wqset);
	link = waitq_unlink_locked(waitq, wqset);
	waitq_unlock(wqset);
	waitq_unlock(waitq);

	if (link.wqlh) {
		waitq_link_free(WQT_PORT_SET, link);
		kr = KERN_SUCCESS;
	} else {
		kr = KERN_NOT_IN_SET;
	}

	printf("[WQ]: \tkr=%d\n", kr);
	assert(kr == want);
}

static void
wqt_wakeup_one(uint32_t index, event64_t event64, kern_return_t want)
{
	kern_return_t kr;

	printf("[WQ]: Waking one thread on waitq [%d] event:0x%llx\n",
	    index, event64);
	kr = waitq_wakeup64_one(wqt_wq(index), event64,
	    THREAD_AWAKENED, WAITQ_ALL_PRIORITIES);
	printf("[WQ]: \tkr=%d\n", kr);
	assert(kr == want);
}

static void
wqt_clear_preposts(uint32_t idx)
{
	waitq_lock(wqt_wq(idx));
	(void)waitq_clear_prepost_locked(wqt_wq(idx));
	waitq_unlock(wqt_wq(idx));
}

static void
wqt_preposts_gc_locked(struct waitq_set *wqset)
{
	circle_queue_t q = &wqset->wqset_preposts;
	struct waitq_link *link;
	uint32_t ticket;

again:
	cqe_foreach_element_safe(link, q, wql_slink) {
		struct waitq *wq = link->wql_wq;

		if (!waitq_lock_reserve(wq, &ticket)) {
			waitq_unlock(wqset);
			waitq_lock_wait(wq, ticket);
			waitq_lock(wqset);
			waitq_unlock(wq);
			/* the list was possibly mutated, restart */
			goto again;
		}

		if (!wq->waitq_preposted) {
			wql_wqs_clear_preposted(link);
			circle_dequeue(q, &link->wql_slink);
			circle_enqueue_tail(&wqset->wqset_links, &link->wql_slink);
		}

		waitq_unlock(wq);
	}
}

static void
wqt_expect_preposts(struct waitq_set *wqset, uint64_t preposts)
{
	struct waitq_link *link;
	uint64_t found = 0;

	waitq_lock(wqset);

	wqt_preposts_gc_locked(wqset);

	cqe_foreach_element(link, &wqset->wqset_preposts, wql_slink) {
		struct waitq *waitq = link->wql_wq;

		printf("[WQ]: found prepost %d\n", wqt_idx(waitq));
		assertf((found & wqt_bit(waitq)) == 0,
		    "found waitq %d twice", wqt_idx(waitq));
		found |= wqt_bit(waitq);
	}

	waitq_unlock(wqset);

	assertf(found == preposts, "preposts expected 0x%llx, but got 0x%llx",
	    preposts, found);
}

static int
waitq_basic_test(__unused int64_t in, int64_t *out)
{
	struct waitq_set *wqset;

	if (!wqt_start(__func__, out)) {
		return EBUSY;
	}

	wqset = wqt_wqset_create();
	wqt_link(10, wqset, KERN_SUCCESS);
	wqt_link(10, wqset, KERN_ALREADY_IN_SET);
	wqt_link(11, wqset, KERN_SUCCESS);
	wqt_link(11, wqset, KERN_ALREADY_IN_SET);
	wqt_link(12, wqset, KERN_SUCCESS);
	wqt_link(12, wqset, KERN_ALREADY_IN_SET);

	wqt_wakeup_one(10, NO_EVENT64, KERN_NOT_WAITING);
	wqt_wakeup_one(12, NO_EVENT64, KERN_NOT_WAITING);

	wqt_expect_preposts(wqset, wqt_bit(10) | wqt_bit(12));
	wqt_clear_preposts(10);

	wqt_expect_preposts(wqset, wqt_bit(12));
	wqt_clear_preposts(12);

	wqt_expect_preposts(wqset, 0);

	wqt_unlink(12, wqset, KERN_SUCCESS);
	wqt_unlink(12, wqset, KERN_NOT_IN_SET);
	wqt_unlink(11, wqset, KERN_SUCCESS);
	wqt_unlink(10, wqset, KERN_SUCCESS);
	wqt_wqset_free(wqset);

	return wqt_end(__func__, out);
}
SYSCTL_TEST_REGISTER(waitq_basic, waitq_basic_test);
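
/*
 * Note (added commentary): SYSCTL_TEST_REGISTER() exposes the test on
 * DEBUG/DEVELOPMENT kernels; triggering the generated debug sysctl
 * (debug.test.waitq_basic, assuming the usual naming convention) runs
 * waitq_basic_test(), with the asserts above as the pass/fail criteria.
 */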
#endif /* DEBUG || DEVELOPMENT */