/*
 * Copyright (c) 2015-2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * @OSF_FREE_COPYRIGHT@
 */
/*
 * Mach Operating System
 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
 * All Rights Reserved.
 *
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  [email protected]
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie Mellon
 * the rights to redistribute these changes.
 */

#include <kern/ast.h>
#include <kern/backtrace.h>
#include <kern/kern_types.h>
#include <kern/mach_param.h>
#include <kern/percpu.h>
#include <kern/queue.h>
#include <kern/sched_prim.h>
#include <kern/simple_lock.h>
#include <kern/spl.h>
#include <kern/waitq.h>
#include <kern/zalloc.h>
#include <kern/policy_internal.h>
#include <kern/turnstile.h>

#include <os/hash.h>
#include <libkern/section_keywords.h>
#include <mach/sync_policy.h>
#include <vm/vm_kern_xnu.h>

#include <sys/kdebug.h>

/*!
 * @const waitq_set_unlink_batch
 *
 * @brief
 * How many links are unhooked under a single set lock hold.
 *
 * @discussion
 * Holding a waitq set lock for too long can cause
 * extreme contention (when a set is being torn down concurrently
 * with messages being sent to ports that used to belong to that set).
 *
 * In order to fight this, large wait queue sets will drop
 * and reacquire their lock for each unlinking batch.
 */
static TUNABLE(uint32_t, waitq_set_unlink_batch, "waitq_set_unlink_batch", 64);
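
/*
 * Illustrative sketch (not part of the original source): the general
 * drop-and-reacquire batching pattern that waitq_set_unlink_batch tunes.
 * The names here (set_t, set_lock/set_unlock, set_has_links, pop_one_link)
 * are hypothetical stand-ins, not real waitq APIs; compare with
 * waitq_set_unlink_all_locked() at the bottom of this file.
 */
#if 0
static void
batched_unlink(set_t *set, uint32_t batch_size)
{
	uint32_t budget = batch_size;

	set_lock(set);
	while (set_has_links(set)) {
		if (budget-- == 0) {
			/* let contending lockers in, then start a new batch */
			set_unlock(set);
			set_lock(set);
			budget = batch_size;
			continue;
		}
		pop_one_link(set);
	}
	set_unlock(set);
}
#endif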

/*!
 * @const WQL_PREPOST_MARKER
 *
 * @brief
 * Marker set in the @c wql_wqs field of wait queue linkages to denote that
 * this linkage has preposted to its wait queue set already.
 *
 * @discussion
 * This bit is manipulated under both the wait queue and the wait queue set
 * locks, and is used for two purposes:
 *
 * - for port set queues, it denotes which circle queue the linkage
 *   is queued on (@c waitq_set::wqset_links or @c waitq_set::wqset_preposts)
 *
 * - as an optimization during pre-post, to avoid walking sets this link
 *   has already preposted to.
 */
#define WQL_PREPOST_MARKER 1ul
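
/*
 * Illustrative sketch (not part of the original source): how a one-bit
 * marker can be folded into an aligned pointer, the way WQL_PREPOST_MARKER
 * is folded into wql_wqs.  Because wait queue sets are at least
 * pointer-aligned, bit 0 of their address is always zero and can carry
 * the "already preposted" flag.  This is a standalone demonstration of
 * the technique, not waitq code.
 */
#if 0
#include <stdint.h>
#include <assert.h>

#define MARKER 1ul

static inline uintptr_t
tag_ptr(void *ptr)
{
	assert(((uintptr_t)ptr & MARKER) == 0);  /* requires alignment */
	return (uintptr_t)ptr | MARKER;
}

static inline void *
untag_ptr(uintptr_t word)
{
	return (void *)(word & ~MARKER);         /* recover the pointer */
}

static inline int
is_tagged(uintptr_t word)
{
	return (int)(word & MARKER);             /* test the flag */
}
#endif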

#if __LP64__
/*!
 * @struct waitq_link_hdr
 *
 * @brief
 * Common "header" between all linkages, in order to find the waitq_set
 * of this linkage.
 *
 * @discussion
 * Due to unfortunate alignment constraints on @c queue_chain_t,
 * this is wildly different for LP64 and ILP32.
 *
 * Do note that `wql_wqs` must live at the same offset in every linkage
 * type (this is enforced by the static_asserts below).
 */
struct waitq_link_hdr {
	uintptr_t       wql_wqs;
};

/*!
 * @struct waitq_sellink
 *
 * @brief
 * Linkages linking select wait queues to select wait queue sets.
 *
 * @discussion
 * Select linkages are one way (queue to set) for two reasons:
 *
 * 1. select doesn't use the wait queue subsystem to discover which file
 *    descriptor woke up the set (it will instead scan all fds again),
 *
 * 2. all linkages are unhooked on each syscall return, so we minimize
 *    work to be done to be as quick as possible, using a fast invalidation
 *    scheme based on unique identifiers and sequestering
 *    (see @c select_set_nextid()).
 */
struct waitq_sellink {
	uintptr_t       wql_wqs;
	struct waitq_link_list_entry wql_next;
	uint64_t        wql_setid;
};

/*!
 * @struct waitq_link
 *
 * @brief
 * Linkages used for port wait queues and port-set wait queue sets.
 *
 * @discussion
 * These linkages go both ways so that receiving messages through a port-set
 * can quickly find ports that preposted to the set.
 *
 * It also means that unhooking linkages cannot be lazy.
 */
struct waitq_link {
	uintptr_t       wql_wqs;       /**< wait queue set for this link      */
	queue_chain_t   wql_qlink;     /**< linkage through the waitq list    */
	queue_chain_t   wql_slink;     /**< linkage through the wqset list    */
	struct waitq   *wql_wq;        /**< wait queue for this link          */
};
#else
struct waitq_link_hdr {
	uint64_t        __wql_padding;
	uintptr_t       wql_wqs;
};

struct waitq_sellink {
	struct waitq_link_list_entry wql_next;
	uintptr_t       __wql_padding;
	uintptr_t       wql_wqs;
	uint64_t        wql_setid;
};

struct waitq_link {
	queue_chain_t   wql_qlink;
	uintptr_t       wql_wqs;
	struct waitq   *wql_wq;
	queue_chain_t   wql_slink;
};
#endif

static_assert(offsetof(struct waitq_link_hdr, wql_wqs) ==
    offsetof(struct waitq_sellink, wql_wqs));
static_assert(offsetof(struct waitq_link_hdr, wql_wqs) ==
    offsetof(struct waitq_link, wql_wqs));
static_assert(sizeof(struct waitq) <= WQ_OPAQUE_SIZE, "waitq structure size mismatch");
static_assert(__alignof(struct waitq) == WQ_OPAQUE_ALIGN, "waitq structure alignment mismatch");

static KALLOC_TYPE_DEFINE(waitq_sellink_zone, struct waitq_sellink, KT_PRIV_ACCT);
static KALLOC_TYPE_DEFINE(waitq_link_zone, struct waitq_link, KT_PRIV_ACCT);
ZONE_DEFINE_ID(ZONE_ID_SELECT_SET, "select_set", struct select_set,
    ZC_SEQUESTER | ZC_NOPGZ | ZC_ZFREE_CLEARMEM);

static LCK_GRP_DECLARE(waitq_lck_grp, "waitq");

static uint64_t PERCPU_DATA(select_setid);
struct waitq select_conflict_queue;

#pragma mark waitq links

static inline bool
waitq_is_sellink(waitq_type_t type)
{
	return type == WQT_SELECT || type == WQT_SELECT_SET;
}

static inline bool
wql_sellink_valid(struct select_set *selset, struct waitq_sellink *link)
{
	return waitq_valid(selset) && selset->selset_id == link->wql_setid;
}

static waitq_t
wql_wqs(waitq_link_t link)
{
	return (waitq_t){ (void *)(link.wqlh->wql_wqs & ~WQL_PREPOST_MARKER) };
}

static bool
wql_wqs_preposted(waitq_link_t link)
{
	return link.wqlh->wql_wqs & WQL_PREPOST_MARKER;
}

static void
wql_wqs_mark_preposted(waitq_link_t link)
{
	assert(!wql_wqs_preposted(link));
	link.wqlh->wql_wqs |= WQL_PREPOST_MARKER;
}

static void
wql_wqs_clear_preposted(waitq_link_t link)
{
	assert(wql_wqs_preposted(link));
	link.wqlh->wql_wqs &= ~WQL_PREPOST_MARKER;
}

static circle_queue_t
wql_wqs_queue(struct waitq_set *wqs, struct waitq_link *link)
{
	return wql_wqs_preposted(link) ? &wqs->wqset_preposts : &wqs->wqset_links;
}

static void
wql_list_push(waitq_link_list_t *list, waitq_link_t link)
{
	link.wqls->wql_next.next = list->next;
	list->next = &link.wqls->wql_next;
}

static inline struct waitq_sellink *
wql_list_elem(struct waitq_link_list_entry *e)
{
	return e ? __container_of(e, struct waitq_sellink, wql_next) : NULL;
}

/*!
 * @function wql_list_next()
 *
 * @brief
 * Helper function to implement wait queue link list enumeration.
 *
 * @param e             in: pointer to the current element,
 *                      out: pointer to the next element or NULL
 * @param end           which element to stop enumeration at (NULL for lists,
 *                      or the first element enumerated for circle queues).
 * @returns true        (makes writing for(;;) based enumerators easier).
 */
static inline bool
wql_list_next(struct waitq_link_list_entry **e, struct waitq_link_list_entry *end)
{
	if (*e == NULL || (*e)->next == end) {
		*e = NULL;
	} else {
		*e = (*e)->next;
	}
	return true;
}

#define __wql_list_foreach(it, head, end) \
	for (struct waitq_link_list_entry *__it = (head)->next, *__end = end; \
	    ((it) = wql_list_elem(__it)); wql_list_next(&__it, __end))

#define wql_list_foreach(it, head) \
	__wql_list_foreach(it, head, NULL)

#define wql_list_foreach_safe(it, head) \
	for (struct waitq_link_list_entry *__it = (head)->next;                \
	    ((it) = wql_list_elem(__it)) && wql_list_next(&__it, NULL); )

/*
 * Gross hack: passing `__it` to `__wql_list_foreach` makes it stop when
 * we either circle back to the first element or reach NULL (whichever
 * comes first).
 *
 * This allows a single enumeration function to work regardless of
 * whether it walks a circle queue or a sellink list.
 */
#define waitq_link_foreach(link, waitq) \
	__wql_list_foreach((link).wqls, &(waitq).wq_q->waitq_sellinks, __it)

static_assert(offsetof(struct waitq, waitq_sellinks) ==
    offsetof(struct waitq, waitq_links));
static_assert(offsetof(struct waitq_sellink, wql_next) ==
    offsetof(struct waitq_link, wql_qlink.next));
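
/*
 * Illustrative sketch (not part of the original source): why a single
 * "stop at `end`" cursor works for both list shapes.  A NULL-terminated
 * singly-linked list ends at NULL; a circular queue ends when the cursor
 * comes back to the first element.  Passing the first element as `end`
 * (as waitq_link_foreach does with __it) unifies the two terminations.
 * `struct node` and `visit_all` are hypothetical.
 */
#if 0
struct node {
	struct node *next;
};

static void
visit_all(struct node *head)
{
	struct node *end = NULL; /* or `head` when walking a circular queue */

	for (struct node *it = head; it != NULL; /* advance below */) {
		/* ... visit `it` ... */
		it = (it->next == end) ? NULL : it->next;
	}
}
#endif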

static struct waitq_link *
wql_find(struct waitq *waitq, waitq_t wqset)
{
	struct waitq_link *link;

	cqe_foreach_element(link, &waitq->waitq_links, wql_qlink) {
		if (waitq_same(wql_wqs(link), wqset)) {
			return link;
		}
	}

	return NULL;
}

waitq_link_t
waitq_link_alloc(waitq_type_t type)
{
	waitq_link_t link;

	if (waitq_is_sellink(type)) {
		link.wqls = zalloc_flags(waitq_sellink_zone, Z_WAITOK | Z_ZERO);
	} else {
		link.wqll = zalloc_flags(waitq_link_zone, Z_WAITOK | Z_ZERO);
	}
	return link;
}

void
waitq_link_free(waitq_type_t type, waitq_link_t link)
{
	if (waitq_is_sellink(type)) {
		return zfree(waitq_sellink_zone, link.wqls);
	} else {
		return zfree(waitq_link_zone, link.wqll);
	}
}

void
waitq_link_free_list(waitq_type_t type, waitq_link_list_t *free_l)
{
	waitq_link_t link;

	wql_list_foreach_safe(link.wqls, free_l) {
		waitq_link_free(type, link);
	}

	free_l->next = NULL;
}


#pragma mark global wait queues

static __startup_data struct waitq g_boot_waitq;
static SECURITY_READ_ONLY_LATE(struct waitq *) global_waitqs = &g_boot_waitq;
static SECURITY_READ_ONLY_LATE(uint32_t) g_num_waitqs = 1;

/*
 * Zero out the used MSBs of the event.
 */
#define _CAST_TO_EVENT_MASK(event) \
	((waitq_flags_t)(uintptr_t)(event) & ((1ul << _EVENT_MASK_BITS) - 1ul))

static inline uint32_t
waitq_hash(char *key, size_t length)
{
	return os_hash_jenkins(key, length) & (g_num_waitqs - 1);
}

/* return a global waitq pointer corresponding to the given event */
struct waitq *
_global_eventq(char *event, size_t event_length)
{
	return &global_waitqs[waitq_hash(event, event_length)];
}
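
/*
 * Illustrative sketch (not part of the original source): how an event
 * address folds into an index of the global wait queue table, as
 * waitq_hash()/_global_eventq() do above.  The table size is a power of
 * two, so masking with (g_num_waitqs - 1) is equivalent to a modulo,
 * and os_hash_jenkins spreads nearby addresses across the table.
 * `pick_global_waitq` is a hypothetical name.
 */
#if 0
static struct waitq *
pick_global_waitq(void *event)
{
	/* hash the pointer bytes, then mask down to a table index */
	uint32_t h = os_hash_jenkins(&event, sizeof(event));

	return &global_waitqs[h & (g_num_waitqs - 1)];
}
#endif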

bool
waitq_is_valid(waitq_t waitq)
{
	return waitq_valid(waitq);
}

static inline bool
waitq_is_global(waitq_t waitq)
{
	if (waitq_type(waitq) != WQT_QUEUE) {
		return false;
	}
	return waitq.wq_q >= global_waitqs && waitq.wq_q < global_waitqs + g_num_waitqs;
}

static inline bool
waitq_empty(waitq_t wq)
{
	struct turnstile *ts;

	switch (waitq_type(wq)) {
	case WQT_TURNSTILE:
		return priority_queue_empty(&wq.wq_q->waitq_prio_queue);
	case WQT_PORT:
		ts = wq.wq_q->waitq_ts;
		return ts == TURNSTILE_NULL ||
		       priority_queue_empty(&ts->ts_waitq.waitq_prio_queue);
	case WQT_QUEUE:
	case WQT_SELECT:
	case WQT_PORT_SET:
	case WQT_SELECT_SET:
		return circle_queue_empty(&wq.wq_q->waitq_queue);

	default:
		return true;
	}
}

#if CONFIG_WAITQ_STATS
#define NWAITQ_BTFRAMES 5

struct wq_stats {
	uint64_t waits;
	uint64_t wakeups;
	uint64_t clears;
	uint64_t failed_wakeups;

	uintptr_t last_wait[NWAITQ_BTFRAMES];
	uintptr_t last_wakeup[NWAITQ_BTFRAMES];
	uintptr_t last_failed_wakeup[NWAITQ_BTFRAMES];
};

/* this global is for lldb */
const uint32_t g_nwaitq_btframes = NWAITQ_BTFRAMES;
struct wq_stats g_boot_stats;
struct wq_stats *g_waitq_stats = &g_boot_stats;

static __inline__ void
waitq_grab_backtrace(uintptr_t bt[NWAITQ_BTFRAMES], unsigned skip)
{
	uintptr_t buf[NWAITQ_BTFRAMES + skip];

	memset(buf, 0, (NWAITQ_BTFRAMES + skip) * sizeof(uintptr_t));
	backtrace(buf, g_nwaitq_btframes + skip, NULL, NULL);
	memcpy(&bt[0], &buf[skip], NWAITQ_BTFRAMES * sizeof(uintptr_t));
}

static __inline__ struct wq_stats *
waitq_global_stats(waitq_t waitq)
{
	struct wq_stats *wqs;
	uint32_t idx;

	if (!waitq_is_global(waitq)) {
		return NULL;
	}

	idx = (uint32_t)(waitq.wq_q - global_waitqs);
	assert(idx < g_num_waitqs);
	wqs = &g_waitq_stats[idx];
	return wqs;
}

static __inline__ void
waitq_stats_count_wait(waitq_t waitq)
{
	struct wq_stats *wqs = waitq_global_stats(waitq);
	if (wqs != NULL) {
		wqs->waits++;
		waitq_grab_backtrace(wqs->last_wait, 2);
	}
}

static __inline__ void
waitq_stats_count_wakeup(waitq_t waitq, int n)
{
	struct wq_stats *wqs = waitq_global_stats(waitq);
	if (wqs != NULL) {
		if (n > 0) {
			wqs->wakeups += n;
			waitq_grab_backtrace(wqs->last_wakeup, 2);
		} else {
			wqs->failed_wakeups++;
			waitq_grab_backtrace(wqs->last_failed_wakeup, 2);
		}
	}
}

static __inline__ void
waitq_stats_count_clear_wakeup(waitq_t waitq)
{
	struct wq_stats *wqs = waitq_global_stats(waitq);
	if (wqs != NULL) {
		wqs->wakeups++;
		wqs->clears++;
		waitq_grab_backtrace(wqs->last_wakeup, 2);
	}
}
#else /* !CONFIG_WAITQ_STATS */
#define waitq_stats_count_wait(q)         do { } while (0)
#define waitq_stats_count_wakeup(q, n)    do { } while (0)
#define waitq_stats_count_clear_wakeup(q) do { } while (0)
#endif

static struct waitq *
waitq_get_safeq(waitq_t waitq)
{
	if (waitq_type(waitq) == WQT_PORT) {
		struct turnstile *ts = waitq.wq_q->waitq_ts;
		return ts ? &ts->ts_waitq : NULL;
	}

	uint32_t hash = os_hash_kernel_pointer(waitq.wq_q);
	return &global_waitqs[hash & (g_num_waitqs - 1)];
}

/*
 * Since the priority ordered waitq uses basepri as the
 * ordering key, assert that this value fits in a uint8_t.
 */
static_assert(MAXPRI <= UINT8_MAX);

static inline void
waitq_thread_insert(struct waitq *safeq, thread_t thread,
    waitq_t wq, event64_t event)
{
	if (waitq_type(safeq) == WQT_TURNSTILE) {
		turnstile_stats_update(0, TSU_TURNSTILE_BLOCK_COUNT, NULL);
		turnstile_waitq_add_thread_priority_queue(safeq, thread);
	} else {
		turnstile_stats_update(0, TSU_REGULAR_WAITQ_BLOCK_COUNT, NULL);
		/*
		 * This is the extent to which we currently take scheduling
		 * attributes into account:
		 *
		 * - If the thread is vm privileged, we stick it at the front
		 *   of the queue; later, these queues will honor the policy
		 *   value set at waitq_init time.
		 *
		 * - Realtime threads get priority for wait queue placements.
		 *   This allows wait_queue_wakeup_one to prefer a waiting
		 *   realtime thread, similar in principle to performing
		 *   a wait_queue_wakeup_all and allowing scheduler
		 *   prioritization to run the realtime thread, but without
		 *   causing the lock contention of that scenario.
		 */
		if (thread->sched_pri >= BASEPRI_REALTIME ||
		    !safeq->waitq_fifo ||
		    (thread->options & TH_OPT_VMPRIV)) {
			circle_enqueue_head(&safeq->waitq_queue, &thread->wait_links);
		} else {
			circle_enqueue_tail(&safeq->waitq_queue, &thread->wait_links);
		}
	}

	/* mark the event and real waitq, even if enqueued on a global safeq */
	thread->wait_event = event;
	thread->waitq = wq;
}

/**
 * Clear the thread-related waitq state, moving the thread from
 * TH_WAIT to TH_WAIT | TH_WAKING, where it is no longer on a waitq and
 * can expect thread_go() to be called on it in the near future.
 *
 * Clearing the waitq prevents further propagation of a turnstile boost
 * on the thread and stops a clear_wait from succeeding.
 *
 * Conditions:
 *	'thread' is locked, thread is waiting
 */
static inline void
thread_clear_waitq_state(thread_t thread)
{
	assert(thread->state & TH_WAIT);

	thread->waitq.wq_q = NULL;
	thread->wait_event = NO_EVENT64;
	thread->at_safe_point = FALSE;
	thread->block_hint = kThreadWaitNone;
	thread->state |= TH_WAKING;
}

static inline void
waitq_thread_remove(waitq_t wq, thread_t thread)
{
	if (waitq_type(wq) == WQT_TURNSTILE) {
		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
		    (TURNSTILE_CODE(TURNSTILE_HEAP_OPERATIONS,
		    (THREAD_REMOVED_FROM_TURNSTILE_WAITQ))) | DBG_FUNC_NONE,
		    VM_KERNEL_UNSLIDE_OR_PERM(waitq_to_turnstile(wq.wq_q)),
		    thread_tid(thread), 0, 0, 0);
		priority_queue_remove(&wq.wq_q->waitq_prio_queue,
		    &thread->wait_prioq_links);
	} else {
		circle_dequeue(&wq.wq_q->waitq_queue, &thread->wait_links);
		if (waitq_is_global(wq) && waitq_empty(wq)) {
			wq.wq_q->waitq_eventmask = 0;
		}
	}

	thread_clear_waitq_state(thread);
}

bool
waitq_wait_possible(thread_t thread)
{
	return waitq_is_null(thread->waitq) &&
	       ((thread->state & TH_WAKING) == 0);
}

__startup_func
static void
waitq_bootstrap(void)
{
	const uint32_t qsz = sizeof(struct waitq);
	vm_offset_t whsize;
	int cpu = 0;

	/*
	 * Determine the amount of memory we're willing to reserve for
	 * the waitqueue hash table
	 */
	if (!PE_parse_boot_argn("wqsize", &whsize, sizeof(whsize))) {
		whsize = round_page(thread_max * qsz / 5);
	}

	/*
	 * Determine the number of waitqueues we can fit.
	 * The hash algorithm requires that this be a power of 2.
	 */
	g_num_waitqs = 0x80000000u >> __builtin_clzl(whsize / qsz);
	assert(g_num_waitqs > 0);
	whsize = round_page(g_num_waitqs * qsz);

	kmem_alloc(kernel_map, (vm_offset_t *)&global_waitqs, whsize,
	    KMA_NOFAIL | KMA_KOBJECT | KMA_NOPAGEWAIT | KMA_PERMANENT,
	    VM_KERN_MEMORY_WAITQ);

#if CONFIG_WAITQ_STATS
	whsize = round_page(g_num_waitqs * sizeof(struct wq_stats));
	kmem_alloc(kernel_map, (vm_offset_t *)&g_waitq_stats, whsize,
	    KMA_NOFAIL | KMA_KOBJECT | KMA_NOPAGEWAIT | KMA_ZERO | KMA_PERMANENT,
	    VM_KERN_MEMORY_WAITQ);
#endif

	for (uint32_t i = 0; i < g_num_waitqs; i++) {
		waitq_init(&global_waitqs[i], WQT_QUEUE, SYNC_POLICY_FIFO);
	}

	waitq_init(&select_conflict_queue, WQT_SELECT, SYNC_POLICY_FIFO);

	percpu_foreach(setid, select_setid) {
		/* this is not cpu_number(), because CPUs haven't been numbered yet */
		*setid = cpu++;
	}
}
STARTUP(MACH_IPC, STARTUP_RANK_FIRST, waitq_bootstrap);
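
/*
 * Illustrative sketch (not part of the original source): the
 * round-down-to-power-of-2 trick used above to size g_num_waitqs.
 * Shifting 0x80000000u right by the count of leading zeros keeps only
 * the highest set bit of n, which is the largest power of two <= n
 * (the hash requires a power-of-2 table size).  `round_down_pow2` is a
 * hypothetical name; this variant is written for nonzero 32-bit inputs.
 */
#if 0
#include <stdint.h>

static uint32_t
round_down_pow2(uint32_t n)
{
	/* e.g. n = 5000 (0x1388) -> 4096 (0x1000) */
	return 0x80000000u >> __builtin_clz(n);
}
#endif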


#pragma mark locking

static hw_spin_timeout_status_t
waitq_timeout_handler(void *_lock, hw_spin_timeout_t to, hw_spin_state_t st)
{
	lck_spinlock_to_info_t lsti;
	hw_lck_ticket_t tmp;
	struct waitq *wq = _lock;

	if (machine_timeout_suspended()) {
		return HW_LOCK_TIMEOUT_CONTINUE;
	}

	lsti = lck_spinlock_timeout_hit(&wq->waitq_interlock, 0);
	tmp.tcurnext = os_atomic_load(&wq->waitq_interlock.tcurnext, relaxed);

	panic("waitq(%p) lock " HW_SPIN_TIMEOUT_FMT "; cpu=%d, "
	    "cticket: 0x%x, nticket: 0x%x, waiting for 0x%x, "
	    HW_SPIN_TIMEOUT_DETAILS_FMT,
	    wq, HW_SPIN_TIMEOUT_ARG(to, st), cpu_number(),
	    tmp.cticket, tmp.nticket, lsti->extra,
	    HW_SPIN_TIMEOUT_DETAILS_ARG(to, st));
}

static const struct hw_spin_policy waitq_spin_policy = {
	.hwsp_name              = "waitq",
#if defined(__i386__) || defined(__x86_64__)
	.hwsp_timeout           = &LockTimeOutTSC,
#else
	.hwsp_timeout_atomic    = &LockTimeOut,
#endif
	/*
	 * Double the standard lock timeout, because wait queues tend
	 * to iterate over a number of threads - locking each.  If there is
	 * a problem with a thread lock, it normally times out at the wait
	 * queue level first, hiding the real problem.
	 */
	.hwsp_timeout_shift     = 1,
	.hwsp_lock_offset       = offsetof(struct waitq, waitq_interlock),
	.hwsp_op_timeout        = waitq_timeout_handler,
};

void
waitq_invalidate(waitq_t waitq)
{
	hw_lck_ticket_invalidate(&waitq.wq_q->waitq_interlock);
}

bool
waitq_held(waitq_t wq)
{
	return hw_lck_ticket_held(&wq.wq_q->waitq_interlock);
}

void
waitq_lock(waitq_t wq)
{
	(void)hw_lck_ticket_lock_to(&wq.wq_q->waitq_interlock,
	    &waitq_spin_policy, &waitq_lck_grp);
#if defined(__x86_64__)
	pltrace(FALSE);
#endif
}

bool
waitq_lock_try(waitq_t wq)
{
	bool rc = hw_lck_ticket_lock_try(&wq.wq_q->waitq_interlock, &waitq_lck_grp);

#if defined(__x86_64__)
	if (rc) {
		pltrace(FALSE);
	}
#endif
	return rc;
}

bool
waitq_lock_reserve(waitq_t wq, uint32_t *ticket)
{
	return hw_lck_ticket_reserve(&wq.wq_q->waitq_interlock, ticket, &waitq_lck_grp);
}

void
waitq_lock_wait(waitq_t wq, uint32_t ticket)
{
	(void)hw_lck_ticket_wait(&wq.wq_q->waitq_interlock, ticket,
	    &waitq_spin_policy, &waitq_lck_grp);
#if defined(__x86_64__)
	pltrace(FALSE);
#endif
}

bool
waitq_lock_allow_invalid(waitq_t wq)
{
	hw_lock_status_t rc;

	rc = hw_lck_ticket_lock_allow_invalid(&wq.wq_q->waitq_interlock,
	    &waitq_spin_policy, &waitq_lck_grp);

#if defined(__x86_64__)
	if (rc == HW_LOCK_ACQUIRED) {
		pltrace(FALSE);
	}
#endif
	return rc == HW_LOCK_ACQUIRED;
}

void
waitq_unlock(waitq_t wq)
{
	assert(waitq_held(wq));
#if defined(__x86_64__)
	pltrace(TRUE);
#endif
	hw_lck_ticket_unlock(&wq.wq_q->waitq_interlock);
}


#pragma mark assert_wait / wakeup

struct waitq_select_args {
	/* input parameters */
	event64_t               event;
	wait_result_t           result;
	waitq_wakeup_flags_t    flags;
	uint32_t                max_threads;
	bool                    is_identified;

	/* output parameters */
	/* counts all woken threads; may exceed the number left on threadq */
	uint32_t                nthreads;
	/* preemption is disabled while threadq is non-empty */
	circle_queue_head_t     threadq;
};

static inline void
maybe_adjust_thread_pri(
	thread_t                thread,
	waitq_wakeup_flags_t    flags,
	__kdebug_only waitq_t   waitq)
{
	/*
	 * If the caller is requesting the waitq subsystem to promote the
	 * priority of the awoken thread, then boost the thread's priority to
	 * the default WAITQ_BOOST_PRIORITY (if it's not already equal or
	 * higher priority).  This boost must be removed via a call to
	 * waitq_clear_promotion_locked before the thread waits again.
	 */
	if (flags & WAITQ_PROMOTE_PRIORITY) {
		uintptr_t trace_waitq = 0;
		if (__improbable(kdebug_enable)) {
			trace_waitq = VM_KERNEL_UNSLIDE_OR_PERM(waitq.wq_q);
		}

		sched_thread_promote_reason(thread, TH_SFLAG_WAITQ_PROMOTED, trace_waitq);
	}
}
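
/*
 * Illustrative sketch (not part of the original source): the contract
 * described above.  A waker passes WAITQ_PROMOTE_PRIORITY; the woken
 * thread must drop the boost via waitq_clear_promotion_locked() before
 * it blocks again.  MY_EVENT and the helper names are hypothetical.
 */
#if 0
static void
waker_side(waitq_t wq)
{
	waitq_lock(wq);
	/* wake one waiter and boost it to WAITQ_BOOST_PRIORITY */
	waitq_wakeup64_one_locked(wq, MY_EVENT, THREAD_AWAKENED,
	    WAITQ_PROMOTE_PRIORITY | WAITQ_UNLOCK);
}

static void
woken_side(waitq_t wq)
{
	/* must run before this thread waits again */
	waitq_lock(wq);
	waitq_clear_promotion_locked(wq, current_thread());
	waitq_unlock(wq);
}
#endif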

static void
waitq_select_queue_add(waitq_t waitq, thread_t thread, struct waitq_select_args *args)
{
	spl_t s = splsched();

	thread_lock(thread);
	thread_clear_waitq_state(thread);

	if (!args->is_identified && thread->state & TH_RUN) {
		/*
		 * A thread that is currently on core may try to clear its own
		 * wait with clear wait or by waking its own event instead of
		 * calling thread_block as is normally expected.  After doing
		 * this, it expects to be able to immediately wait again.
		 *
		 * If we are currently on a different CPU and waking that
		 * thread, as soon as we unlock the waitq and thread, that
		 * operation could complete, but we would still be holding the
		 * thread on our flush queue, leaving it in the waking state
		 * where it can't yet assert another wait.
		 *
		 * Since we know that we won't actually need to enqueue the
		 * thread on the runq due to it being on core, we can just
		 * immediately unblock it here so that the thread will be in a
		 * waitable state after we release its thread lock from this
		 * lock hold.
		 *
		 * Wakeups using *_identify can't be allowed to pass
		 * thread block until they're resumed, so they can't use
		 * this path.  That means they are not allowed to skip calling
		 * thread_block.
		 */
		maybe_adjust_thread_pri(thread, args->flags, waitq);
		thread_go(thread, args->result, false);
	} else {
		if (circle_queue_empty(&args->threadq)) {
			/*
			 * preemption is disabled while threads are
			 * on threadq - balanced in:
			 * waitq_resume_identified_thread
			 * waitq_select_queue_flush
			 */
			disable_preemption();
		}

		circle_enqueue_tail(&args->threadq, &thread->wait_links);
	}

	thread_unlock(thread);

	splx(s);
}
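
/*
 * Illustrative sketch (not part of the original source): the
 * "disable preemption while a local queue is non-empty" pattern used by
 * waitq_select_queue_add/waitq_select_queue_flush.  The first enqueue
 * disables preemption; draining the queue re-enables it, so selected
 * threads cannot preempt us while they still sit on our stack-local
 * list.  `local_queue_add`/`local_queue_drain` are hypothetical names.
 */
#if 0
static void
local_queue_add(circle_queue_head_t *q, thread_t th)
{
	if (circle_queue_empty(q)) {
		disable_preemption();            /* first element: pin this CPU */
	}
	circle_enqueue_tail(q, &th->wait_links);
}

static void
local_queue_drain(circle_queue_head_t *q)
{
	thread_t th;

	cqe_foreach_element_safe(th, q, wait_links) {
		circle_dequeue(q, &th->wait_links);
		/* ... thread_go(th, ...) ... */
	}
	enable_preemption();                     /* balance the add side */
}
#endif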


#if SCHED_HYGIENE_DEBUG

TUNABLE_DEV_WRITEABLE(uint32_t, waitq_flush_excess_threads, "waitq_flush_excess_threads", 20);
TUNABLE_DEV_WRITEABLE(uint32_t, waitq_flush_excess_time_mt, "waitq_flush_excess_time_mt", 7200); /* 300us */

#endif /* SCHED_HYGIENE_DEBUG */


static void
waitq_select_queue_flush(waitq_t waitq, struct waitq_select_args *args)
{
	thread_t thread = THREAD_NULL;

	assert(!circle_queue_empty(&args->threadq));

	int flushed_threads = 0;

#if SCHED_HYGIENE_DEBUG
	uint64_t start_time = ml_get_sched_hygiene_timebase();
	disable_preemption();
#endif /* SCHED_HYGIENE_DEBUG */

	cqe_foreach_element_safe(thread, &args->threadq, wait_links) {
		circle_dequeue(&args->threadq, &thread->wait_links);
		assert_thread_magic(thread);

		spl_t s = splsched();

		thread_lock(thread);
		maybe_adjust_thread_pri(thread, args->flags, waitq);
		thread_go(thread, args->result, args->flags & WAITQ_HANDOFF);
		thread_unlock(thread);

		splx(s);

		flushed_threads++;
	}

#if SCHED_HYGIENE_DEBUG
	uint64_t end_time = ml_get_sched_hygiene_timebase();

	/*
	 * Check for a combination of excess threads and long time,
	 * so that a single thread wakeup that gets stuck is still caught
	 */
	if (waitq_flush_excess_threads && waitq_flush_excess_time_mt &&
	    flushed_threads > waitq_flush_excess_threads &&
	    (end_time - start_time) > waitq_flush_excess_time_mt) {
		/*
		 * Hack alert:
		 *
		 * If a wakeup-all is done with interrupts disabled, or if
		 * there are enough threads / lock contention to pass the
		 * preemption disable threshold, it can take Too Long to get
		 * through waking up all the threads, leading to
		 * the watchdog going off.
		 *
		 * While we are working on a change to break up this
		 * giant glob of work into smaller chunks, remove this
		 * time region from the watchdog's memory to avoid
		 * unit tests that wake up hundreds of threads on
		 * one semaphore from causing this to blow up.
		 *
		 * We only trigger this when seeing a combination of
		 * excess threads and long time, so that a single
		 * thread wakeup that gets stuck is still caught.
		 *
		 * This was improved with
		 * rdar://90325140
		 * to enable interrupts during most wakeup-all's
		 * and will be removed with
		 * rdar://101110793
		 */
		if (ml_get_interrupts_enabled() == false) {
			ml_spin_debug_reset(current_thread());
			ml_irq_debug_abandon();
		}
		abandon_preemption_disable_measurement();

		KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_INT_MASKED_RESET), flushed_threads, end_time - start_time);
	}

	enable_preemption();

#endif /* SCHED_HYGIENE_DEBUG */

	/*
	 * match the disable when making threadq nonempty from
	 * waitq_select_queue_add
	 */
	enable_preemption();
}

/**
 * Routine to iterate over the waitq for non-priority ordered waitqs
 *
 * Conditions:
 *	args->waitq (and the posted waitq) is locked
 *
 * Notes:
 *	If one or more threads are selected, this may disable preemption,
 *	which is balanced when the threadq is flushed in
 *	waitq_resume_identified_thread or waitq_select_queue_flush.
 */
static waitq_flags_t
waitq_queue_iterate_locked(struct waitq *safeq, struct waitq *waitq,
    struct waitq_select_args *args)
{
	thread_t thread = THREAD_NULL;
	waitq_flags_t eventmask = 0;

	cqe_foreach_element_safe(thread, &safeq->waitq_queue, wait_links) {
		assert_thread_magic(thread);

		/*
		 * For non-priority ordered waitqs, we allow multiple events to be
		 * mux'ed into the same waitq. Also safeqs may contain threads from
		 * multiple waitqs. Only pick threads that match the
		 * requested wait event.
		 */
		if (waitq_same(thread->waitq, waitq) && thread->wait_event == args->event) {
			/* We found a matching thread! Pull it from the queue. */

			circle_dequeue(&safeq->waitq_queue, &thread->wait_links);

			waitq_select_queue_add(waitq, thread, args);

			if (++args->nthreads >= args->max_threads) {
				break;
			}
		} else {
			/* thread wasn't selected so track its event */
			eventmask |= waitq_same(thread->waitq, safeq)
			    ? _CAST_TO_EVENT_MASK(thread->wait_event)
			    : _CAST_TO_EVENT_MASK(thread->waitq.wq_q);
		}
	}

	return eventmask;
}

/**
 * Routine to iterate and remove threads from priority ordered waitqs
 *
 * Conditions:
 *	args->waitq (and the posted waitq) is locked
 *
 * Notes:
 *	The priority ordered waitqs only support maximum priority element removal.
 *
 *	Also, the implementation makes sure that all threads in a priority ordered
 *	waitq are waiting on the same wait event. This is not necessarily true for
 *	non-priority ordered waitqs. If one or more threads are selected, this may
 *	disable preemption.
 */
static void
waitq_prioq_iterate_locked(
	struct waitq           *ts_wq,
	struct waitq           *waitq,
	struct waitq_select_args *args)
{
	struct turnstile *ts = waitq_to_turnstile(ts_wq);
	bool update_inheritor = (args->flags & WAITQ_UPDATE_INHERITOR);

	if (update_inheritor && args->max_threads == UINT32_MAX) {
		/*
		 * If we are going to wake up all threads,
		 * go ahead and set the inheritor to NULL.
		 */
		turnstile_kernel_update_inheritor_on_wake_locked(ts,
		    TURNSTILE_INHERITOR_NULL, TURNSTILE_INHERITOR_THREAD);
		update_inheritor = false;
	}

	while (!priority_queue_empty(&ts_wq->waitq_prio_queue)) {
		thread_t thread;

		thread = priority_queue_remove_max(&ts_wq->waitq_prio_queue,
		    struct thread, wait_prioq_links);

		assert_thread_magic(thread);

		/*
		 * Ensure the wait event matches since priority ordered waitqs do not
		 * support multiple events in the same waitq.
		 */
		assert(waitq_same(thread->waitq, waitq) && (thread->wait_event == args->event));

		if (update_inheritor) {
			turnstile_inheritor_t inheritor = thread;

			if (priority_queue_empty(&ts_wq->waitq_prio_queue)) {
				inheritor = TURNSTILE_INHERITOR_NULL;
			}
			turnstile_kernel_update_inheritor_on_wake_locked(ts,
			    inheritor, TURNSTILE_INHERITOR_THREAD);
			update_inheritor = false;
		}

		waitq_select_queue_add(waitq, thread, args);

		if (++args->nthreads >= args->max_threads) {
			break;
		}
	}
}

/**
 * @function do_waitq_select_n_locked_queue
 *
 * @brief
 * Selects threads waiting on a wait queue.
 *
 * @discussion
 * @c waitq is locked.
 * If @c waitq is a set, then the wait queue posting to it is locked too.
 *
 * If one or more threads are selected, this may disable preemption.
 */
static void
do_waitq_select_n_locked_queue(waitq_t waitq, struct waitq_select_args *args)
{
	spl_t s = 0;

	struct waitq *safeq;
	waitq_flags_t eventmask, remaining_eventmask;

	if (waitq_irq_safe(waitq)) {
		eventmask = _CAST_TO_EVENT_MASK(args->event);
		safeq = waitq.wq_q;
	} else {
		/* JMM - add flag to waitq to avoid global lookup if no waiters */
		eventmask = _CAST_TO_EVENT_MASK(waitq.wq_q);
		safeq = waitq_get_safeq(waitq);
		if (safeq == NULL) {
			return;
		}

		s = splsched();
		waitq_lock(safeq);
	}

	/*
	 * If the safeq doesn't have an eventmask (not global) or the event
	 * we're looking for IS set in its eventmask, then scan the threads
	 * in that queue for ones that match the original <waitq,event> pair.
	 */
	if (waitq_type(safeq) == WQT_TURNSTILE) {
		waitq_prioq_iterate_locked(safeq, waitq.wq_q, args);
	} else if (!waitq_is_global(safeq)) {
		waitq_queue_iterate_locked(safeq, waitq.wq_q, args);
	} else if ((safeq->waitq_eventmask & eventmask) == eventmask) {
		remaining_eventmask = waitq_queue_iterate_locked(safeq,
		    waitq.wq_q, args);

		/*
		 * Update the eventmask of global queues we just scanned:
		 * - If we selected all the threads in the queue,
		 *   we can clear its eventmask.
		 *
		 * - If we didn't find enough threads to fill our needs,
		 *   then we can assume we looked at every thread in the queue
		 *   and the mask we computed is complete - so reset it.
		 */
		if (waitq_empty(safeq)) {
			safeq->waitq_eventmask = 0;
		} else if (args->nthreads < args->max_threads) {
			safeq->waitq_eventmask = remaining_eventmask;
		}
	}

	/* unlock the safe queue if we locked one above */
	if (!waitq_same(waitq, safeq)) {
		waitq_unlock(safeq);
		splx(s);
	}
}
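
/*
 * Illustrative sketch (not part of the original source): the eventmask
 * filter consulted above.  Each waiter ORs a few hashed bits of its
 * event into the global safeq's mask; a wakeup can then skip scanning
 * the queue entirely when its event's bits aren't all present.  False
 * positives only cost a scan, never a missed wakeup.  The helper names
 * are hypothetical.
 */
#if 0
static inline void
eventmask_note_waiter(uint32_t *mask, uint32_t event_bits)
{
	*mask |= event_bits;                  /* bits may alias other events */
}

static inline bool
eventmask_may_have_waiter(uint32_t mask, uint32_t event_bits)
{
	return (mask & event_bits) == event_bits;
}
#endif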

/**
 * @function do_waitq_select_n_locked_sets()
 *
 * @brief
 * Selects threads waiting on any set a wait queue belongs to,
 * or preposts the wait queue onto them.
 *
 * @discussion
 * @c waitq is locked.
 */
__attribute__((noinline))
static void
do_waitq_select_n_locked_sets(waitq_t waitq, struct waitq_select_args *args)
{
	waitq_type_t wq_type = waitq_type(waitq);
	waitq_link_t link;

	assert(args->event == NO_EVENT64);
	assert(waitq_preposts(waitq));

	waitq_link_foreach(link, waitq) {
		waitq_t wqset = wql_wqs(link);

		if (wql_wqs_preposted(link)) {
			/*
			 * The wql_wqs_preposted() bit is only cleared while
			 * holding both the wq and wqset locks.
			 *
			 * If the wqset is still preposted,
			 * we really won't find threads there.
			 *
			 * Just mark the waitq as preposted and move on.
			 */
			if (wq_type == WQT_PORT) {
				waitq.wq_q->waitq_preposted = true;
			}
			continue;
		}

		if (wq_type == WQT_SELECT) {
			/*
			 * If PGZ picked this select set,
			 * translate it to the real address
			 *
			 * If it is still a select set
			 * (the slot could have been reused),
			 * then keep using it for the rest of the logic.
			 *
			 * Even in the extremely unlikely case where
			 * the slot was reused for another select_set,
			 * the `wql_sellink_valid` check below will
			 * take care of debouncing it. But we must
			 * forget the original pointer we read
			 * so that we unlock the proper object.
			 */
			wqset.wqs_sel = pgz_decode_allow_invalid(wqset.wqs_sel,
			    ZONE_ID_SELECT_SET);
			if (!wqset.wqs_sel) {
				continue;
			}
			if (!waitq_lock_allow_invalid(wqset)) {
				continue;
			}
			if (!wql_sellink_valid(wqset.wqs_sel, link.wqls)) {
				goto out_unlock;
			}
		} else {
			waitq_lock(wqset);
			if (!waitq_valid(wqset)) {
				goto out_unlock;
			}
		}

		/*
		 * Find any threads waiting on this wait queue set as a queue.
		 */
		do_waitq_select_n_locked_queue(wqset, args);

		if (args->nthreads == 0) {
			/* No thread selected: prepost 'waitq' to 'wqset' */
			wql_wqs_mark_preposted(link);
			if (wq_type == WQT_SELECT) {
				wqset.wqs_sel->selset_preposted = true;
			} else {
				waitq.wq_q->waitq_preposted = true;
				circle_dequeue(&wqset.wqs_set->wqset_links,
				    &link.wqll->wql_slink);
				circle_enqueue_tail(&wqset.wqs_set->wqset_preposts,
				    &link.wqll->wql_slink);
				ipc_pset_prepost(wqset.wqs_set, waitq.wq_q);
			}
		}

out_unlock:
		waitq_unlock(wqset);

		if (args->nthreads >= args->max_threads) {
			break;
		}
	}
}

/**
 * @function do_waitq_select_n_locked
 *
 * @brief
 * Selects threads waiting on a wait queue, or preposts it.
 *
 * @discussion
 * @c waitq is locked.
 *
 * Recurses into all sets this wait queue belongs to.
 */
static void
do_waitq_select_n_locked(waitq_t waitq, struct waitq_select_args *args)
{
	do_waitq_select_n_locked_queue(waitq, args);

	if (args->nthreads >= args->max_threads) {
		/* already enough threads found */
		return;
	}

	if (args->event != NO_EVENT64 || !waitq_preposts(waitq)) {
		/* this wakeup should not recurse into sets */
		return;
	}

	do_waitq_select_n_locked_sets(waitq, args);
}

static inline bool
waitq_is_preposted_set(waitq_t waitq)
{
	switch (waitq_type(waitq)) {
	case WQT_PORT_SET:
		return waitq_set_first_prepost(waitq.wqs_set, WQS_PREPOST_PEEK) != NULL;

	case WQT_SELECT_SET:
		return waitq.wqs_sel->selset_preposted;

	default:
		return false;
	}
}

wait_result_t
waitq_assert_wait64_locked(waitq_t waitq,
    event64_t wait_event,
    wait_interrupt_t interruptible,
    wait_timeout_urgency_t urgency,
    uint64_t deadline,
    uint64_t leeway,
    thread_t thread)
{
	wait_result_t wait_result;
	struct waitq *safeq;
	uintptr_t eventmask;
	spl_t s;

	switch (waitq_type(waitq)) {
	case WQT_PORT:
	case WQT_SELECT:
	case WQT_PORT_SET:
	case WQT_SELECT_SET:
		assert(wait_event == NO_EVENT64);
		break;
	default:
		assert(wait_event != NO_EVENT64);
		break;
	}

	/*
	 * Warning: Do _not_ place debugging print statements here.
	 *          The waitq is locked!
	 */
	assert(!thread->started || thread == current_thread());

	if (!waitq_wait_possible(thread)) {
		panic("thread already waiting on %p", thread->waitq.wq_q);
	}

	s = splsched();

	/*
	 * early-out if the thread is waiting on a wait queue set
	 * that has already been pre-posted.
	 *
	 * Note: waitq_is_preposted_set() may unlock the waitq-set
	 */
	if (waitq_is_preposted_set(waitq)) {
		thread_lock(thread);
		thread->wait_result = THREAD_AWAKENED;
		thread_unlock(thread);
		splx(s);
		return THREAD_AWAKENED;
	}

	/*
	 * If already dealing with an irq safe wait queue, we are all set.
	 * Otherwise, determine a global queue to use and lock it.
	 */
	if (waitq_irq_safe(waitq)) {
		safeq = waitq.wq_q;
		eventmask = _CAST_TO_EVENT_MASK(wait_event);
	} else {
		safeq = waitq_get_safeq(waitq);
		if (__improbable(safeq == NULL)) {
			panic("Trying to assert_wait on a turnstile proxy "
			    "that hasn't been donated one (waitq: %p)", waitq.wq_q);
		}
		eventmask = _CAST_TO_EVENT_MASK(waitq.wq_q);
		waitq_lock(safeq);
	}

	/* lock the thread now that we have the irq-safe waitq locked */
	thread_lock(thread);

	wait_result = thread_mark_wait_locked(thread, interruptible);
	/* thread->wait_result has been set */
	if (wait_result == THREAD_WAITING) {
		waitq_thread_insert(safeq, thread, waitq, wait_event);

		if (deadline != 0) {
			bool was_active;

			was_active = timer_call_enter_with_leeway(thread->wait_timer,
			    NULL,
			    deadline, leeway,
			    urgency, FALSE);
			if (!was_active) {
				thread->wait_timer_active++;
			}
			thread->wait_timer_armed = true;
		}

		if (waitq_is_global(safeq)) {
			safeq->waitq_eventmask |= (waitq_flags_t)eventmask;
		}

		waitq_stats_count_wait(waitq);
	}

	/* unlock the thread */
	thread_unlock(thread);

	/* update the inheritor's thread priority if the waitq is embedded in turnstile */
	if (waitq_type(safeq) == WQT_TURNSTILE && wait_result == THREAD_WAITING) {
		turnstile_recompute_priority_locked(waitq_to_turnstile(safeq));
		turnstile_update_inheritor_locked(waitq_to_turnstile(safeq));
	}

	/* unlock the safeq if we locked it here */
	if (!waitq_same(waitq, safeq)) {
		waitq_unlock(safeq);
	}

	splx(s);

	return wait_result;
}

bool
waitq_pull_thread_locked(waitq_t waitq, thread_t thread)
{
	struct waitq *safeq;
	uint32_t ticket;

	assert_thread_magic(thread);

	/* Find the interrupts disabled queue thread is waiting on */
	if (waitq_irq_safe(waitq)) {
		safeq = waitq.wq_q;
	} else {
		safeq = waitq_get_safeq(waitq);
		if (__improbable(safeq == NULL)) {
			panic("Trying to clear_wait on a turnstile proxy "
			    "that hasn't been donated one (waitq: %p)", waitq.wq_q);
		}
	}

	/*
	 * thread is already locked so have to try for the waitq lock.
	 *
	 * We can't wait for the waitq lock under the thread lock,
	 * however we can reserve our slot in the lock queue,
	 * and if that reservation requires waiting, we are guaranteed
	 * that this waitq can't die until we got our turn!
	 */
	if (!waitq_lock_reserve(safeq, &ticket)) {
		thread_unlock(thread);
		waitq_lock_wait(safeq, ticket);
		thread_lock(thread);

		if (!waitq_same(waitq, thread->waitq)) {
			/*
			 * While we were waiting for our reservation the thread
			 * stopped waiting on this waitq, bail out.
			 */
			waitq_unlock(safeq);
			return false;
		}
	}

	waitq_thread_remove(safeq, thread);
	waitq_stats_count_clear_wakeup(waitq);
	waitq_unlock(safeq);
	return true;
}
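
/*
 * Illustrative sketch (not part of the original source): the generic
 * shape of the reserve/wait dance used above, for when locks must be
 * taken in the "wrong" order (thread lock held, waitq lock wanted).
 * Reserving a ticket pins the second lock's liveness, so the first
 * lock can be dropped, the turn awaited, and state revalidated after
 * reacquiring.  All names here (lock_a_t, lock_b_t, lock_b_reserve,
 * lock_b_wait_turn) are hypothetical.
 */
#if 0
static bool
lock_b_with_a_held(lock_a_t *a, lock_b_t *b)
{
	uint32_t ticket;

	if (lock_b_reserve(b, &ticket)) {
		return true;                 /* got B immediately, no inversion */
	}
	lock_a_unlock(a);                    /* release A to avoid deadlock   */
	lock_b_wait_turn(b, ticket);         /* B is held once this returns   */
	lock_a_lock(a);                      /* re-take A in the safe order   */
	return false;                        /* caller must revalidate state  */
}
#endif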


void
waitq_clear_promotion_locked(waitq_t waitq, thread_t thread)
{
	spl_t s = 0;

	assert(waitq_held(waitq));
	assert(thread != THREAD_NULL);
	assert(thread == current_thread());

	/* This flag is only cleared by the thread itself, so safe to check outside lock */
	if ((thread->sched_flags & TH_SFLAG_WAITQ_PROMOTED) != TH_SFLAG_WAITQ_PROMOTED) {
		return;
	}

	if (!waitq_irq_safe(waitq)) {
		s = splsched();
	}
	thread_lock(thread);

	sched_thread_unpromote_reason(thread, TH_SFLAG_WAITQ_PROMOTED, 0);

	thread_unlock(thread);
	if (!waitq_irq_safe(waitq)) {
		splx(s);
	}
}

static inline bool
waitq_should_unlock(waitq_wakeup_flags_t flags)
{
	return (flags & (WAITQ_UNLOCK | WAITQ_KEEP_LOCKED)) == WAITQ_UNLOCK;
}

static inline bool
waitq_should_enable_interrupts(waitq_wakeup_flags_t flags)
{
	return (flags & (WAITQ_UNLOCK | WAITQ_KEEP_LOCKED | WAITQ_ENABLE_INTERRUPTS)) == (WAITQ_UNLOCK | WAITQ_ENABLE_INTERRUPTS);
}

kern_return_t
waitq_wakeup64_all_locked(
	waitq_t                 waitq,
	event64_t               wake_event,
	wait_result_t           result,
	waitq_wakeup_flags_t    flags)
{
	struct waitq_select_args args = {
		.event = wake_event,
		.result = result,
		.flags = flags & ~WAITQ_HANDOFF,
		.max_threads = UINT32_MAX,
	};

	assert(waitq_held(waitq));

	if (flags & WAITQ_ENABLE_INTERRUPTS) {
		assert(waitq_should_unlock(flags));
		assert(ml_get_interrupts_enabled() == false);
	}

	do_waitq_select_n_locked(waitq, &args);
	waitq_stats_count_wakeup(waitq, args.nthreads);

	if (waitq_should_unlock(flags)) {
		waitq_unlock(waitq);
	}

	if (waitq_should_enable_interrupts(flags)) {
		ml_set_interrupts_enabled(true);
	}

	if (!circle_queue_empty(&args.threadq)) {
		waitq_select_queue_flush(waitq, &args);
	}

	if (args.nthreads > 0) {
		return KERN_SUCCESS;
	}

	return KERN_NOT_WAITING;
}

kern_return_t
waitq_wakeup64_one_locked(
	waitq_t                 waitq,
	event64_t               wake_event,
	wait_result_t           result,
	waitq_wakeup_flags_t    flags)
{
	struct waitq_select_args args = {
		.event = wake_event,
		.result = result,
		.flags = flags,
		.max_threads = 1,
	};

	assert(waitq_held(waitq));

	if (flags & WAITQ_ENABLE_INTERRUPTS) {
		assert(waitq_should_unlock(flags));
		assert(ml_get_interrupts_enabled() == false);
	}

	do_waitq_select_n_locked(waitq, &args);
	waitq_stats_count_wakeup(waitq, args.nthreads);

	if (waitq_should_unlock(flags)) {
		waitq_unlock(waitq);
	}

	if (waitq_should_enable_interrupts(flags)) {
		ml_set_interrupts_enabled(true);
	}

	if (!circle_queue_empty(&args.threadq)) {
		waitq_select_queue_flush(waitq, &args);
	}

	if (args.nthreads > 0) {
		return KERN_SUCCESS;
	}

	return KERN_NOT_WAITING;
}

thread_t
waitq_wakeup64_identify_locked(
	waitq_t                 waitq,
	event64_t               wake_event,
	wait_result_t           result,
	waitq_wakeup_flags_t    flags)
{
	struct waitq_select_args args = {
		.event = wake_event,
		.result = result,
		.flags = flags,
		.max_threads = 1,
		.is_identified = true,
	};

	assert(waitq_held(waitq));

	do_waitq_select_n_locked(waitq, &args);
	waitq_stats_count_wakeup(waitq, args.nthreads);

	if (waitq_should_unlock(flags)) {
		waitq_unlock(waitq);
	}

	if (waitq_should_enable_interrupts(flags)) {
		ml_set_interrupts_enabled(true);
	}

	if (args.nthreads > 0) {
		thread_t thread = cqe_dequeue_head(&args.threadq, struct thread, wait_links);

		assert(args.nthreads == 1 && circle_queue_empty(&args.threadq));

		/* Thread is off waitq, not unblocked yet */

		return thread;
	}

	return THREAD_NULL;
}

void
waitq_resume_identified_thread(
	waitq_t                 waitq,
	thread_t                thread,
	wait_result_t           result,
	waitq_wakeup_flags_t    flags)
{
	spl_t spl = splsched();

	thread_lock(thread);

	assert((thread->state & (TH_WAIT | TH_WAKING)) == (TH_WAIT | TH_WAKING));

	maybe_adjust_thread_pri(thread, flags, waitq);
	thread_go(thread, result, (flags & WAITQ_HANDOFF));

	thread_unlock(thread);
	splx(spl);

	enable_preemption(); // balance disable upon pulling thread
}
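
/*
 * Illustrative sketch (not part of the original source): the two-phase
 * identify/resume protocol.  Phase one pulls a thread off the waitq but
 * leaves it blocked (TH_WAIT | TH_WAKING), so the caller may inspect or
 * configure it; phase two actually makes it runnable and balances the
 * preemption disable taken when the thread was queued.  MY_EVENT,
 * `wake_one_two_phase`, and the WAITQ_WAKEUP_DEFAULT flag usage are
 * assumptions for the example.
 */
#if 0
static void
wake_one_two_phase(waitq_t wq)
{
	thread_t th;

	waitq_lock(wq);
	th = waitq_wakeup64_identify_locked(wq, MY_EVENT, THREAD_AWAKENED,
	    WAITQ_UNLOCK);
	if (th != THREAD_NULL) {
		/* th is off the waitq but still blocked: safe to examine */
		waitq_resume_identified_thread(wq, th, THREAD_AWAKENED,
		    WAITQ_WAKEUP_DEFAULT);
	}
}
#endif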
1663 
1664 void
waitq_resume_and_bind_identified_thread(waitq_t waitq,thread_t thread,processor_t processor,wait_result_t result,waitq_wakeup_flags_t flags)1665 waitq_resume_and_bind_identified_thread(
1666 	waitq_t                 waitq,
1667 	thread_t                thread,
1668 	processor_t             processor,
1669 	wait_result_t           result,
1670 	waitq_wakeup_flags_t    flags)
1671 {
1672 	spl_t spl = splsched();
1673 
1674 	thread_lock(thread);
1675 
1676 	assert((thread->state & (TH_WAIT | TH_WAKING)) == (TH_WAIT | TH_WAKING));
1677 
1678 	maybe_adjust_thread_pri(thread, flags, waitq);
1679 	thread_bind_during_wakeup(thread, processor);
1680 	thread_go(thread, result, (flags & WAITQ_HANDOFF));
1681 
1682 	thread_unlock(thread);
1683 	splx(spl);
1684 
1685 	enable_preemption(); // balance disable upon pulling thread
1686 }
1687 
1688 kern_return_t
waitq_wakeup64_thread_and_unlock(struct waitq * waitq,event64_t event,thread_t thread,wait_result_t result)1689 waitq_wakeup64_thread_and_unlock(
1690 	struct waitq           *waitq,
1691 	event64_t              event,
1692 	thread_t               thread,
1693 	wait_result_t          result)
1694 {
1695 	kern_return_t ret = KERN_NOT_WAITING;
1696 
1697 	assert(waitq_irq_safe(waitq));
1698 	assert(waitq_held(waitq));
1699 	assert_thread_magic(thread);
1700 
1701 	/*
1702 	 * See if the thread was still waiting there.  If so, it got
1703 	 * dequeued and returned locked.
1704 	 *
1705 	 * By holding the thread locked across the go, a thread on another CPU
1706 	 * can't see itself in 'waking' state, even if it uses clear_wait.
1707 	 */
1708 	thread_lock(thread);
1709 
1710 	if (waitq_same(thread->waitq, waitq) && thread->wait_event == event) {
1711 		waitq_thread_remove(waitq, thread);
1712 		ret = KERN_SUCCESS;
1713 	}
1714 	waitq_stats_count_wakeup(waitq, ret == KERN_SUCCESS ? 1 : 0);
1715 
1716 	waitq_unlock(waitq);
1717 
1718 	if (ret == KERN_SUCCESS) {
1719 		thread_go(thread, result, /* handoff */ false);
1720 	}
1721 
1722 	thread_unlock(thread);
1723 
1724 	return ret;
1725 }
1726 
1727 
1728 #pragma mark waitq
1729 
1730 __attribute__((always_inline))
1731 void
waitq_init(waitq_t waitq,waitq_type_t type,int policy)1732 waitq_init(waitq_t waitq, waitq_type_t type, int policy)
1733 {
1734 	assert((policy & SYNC_POLICY_FIXED_PRIORITY) == 0);
1735 
1736 	*waitq.wq_q = (struct waitq){
1737 		.waitq_type  = type,
1738 		.waitq_fifo  = ((policy & SYNC_POLICY_REVERSED) == 0),
1739 	};
1740 
1741 	switch (type) {
1742 	case WQT_INVALID:
1743 		__builtin_trap();
1744 
1745 	case WQT_TURNSTILE:
1746 		/* For turnstile, initialize it as a priority queue */
1747 		priority_queue_init(&waitq.wq_q->waitq_prio_queue);
1748 		assert(waitq.wq_q->waitq_fifo == 0);
1749 		break;
1750 
1751 	case WQT_PORT:
1752 		waitq.wq_q->waitq_ts = TURNSTILE_NULL;
1753 		break;
1754 
1755 	case WQT_PORT_SET:
1756 		circle_queue_init(&waitq.wqs_set->wqset_preposts);
1757 		OS_FALLTHROUGH;
1758 	case WQT_SELECT_SET:
1759 	case WQT_QUEUE:
1760 	case WQT_SELECT:
1761 		circle_queue_init(&waitq.wq_q->waitq_queue);
1762 		break;
1763 	}
1764 
1765 	if (policy & SYNC_POLICY_INIT_LOCKED) {
1766 		hw_lck_ticket_init_locked(&waitq.wq_q->waitq_interlock, &waitq_lck_grp);
1767 	} else {
1768 		hw_lck_ticket_init(&waitq.wq_q->waitq_interlock, &waitq_lck_grp);
1769 	}
1770 }
1771 
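/*
 * Illustrative use (a sketch; `my_waitq` is a hypothetical caller-owned
 * queue, not part of this file):
 *
 *	static struct waitq my_waitq;
 *
 *	waitq_init(&my_waitq, WQT_QUEUE, SYNC_POLICY_FIFO);
 *	...
 *	waitq_deinit(&my_waitq);
 */
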
1772 void
1773 waitq_deinit(waitq_t waitq)
1774 {
1775 	waitq_type_t type = waitq_type(waitq);
1776 
1777 	switch (type) {
1778 	case WQT_QUEUE:
1779 		assert(circle_queue_empty(&waitq.wq_q->waitq_queue));
1780 		waitq_invalidate(waitq);
1781 		break;
1782 
1783 	case WQT_TURNSTILE:
1784 		assert(priority_queue_empty(&waitq.wq_q->waitq_prio_queue));
1785 		assert(waitq.wq_q->waitq_inheritor == TURNSTILE_INHERITOR_NULL);
1786 		waitq_invalidate(waitq);
1787 		break;
1788 
1789 	case WQT_PORT:
1790 		assert(waitq.wq_q->waitq_ts == TURNSTILE_NULL);
1791 		assert(circle_queue_empty(&waitq.wq_q->waitq_links));
1792 		break;
1793 
1794 	case WQT_SELECT:
1795 		assert(waitq.wq_q->waitq_sellinks.next == NULL);
1796 		assert(circle_queue_empty(&waitq.wqs_set->wqset_queue));
1797 		break;
1798 
1799 	case WQT_PORT_SET:
1800 		assert(circle_queue_empty(&waitq.wqs_set->wqset_queue));
1801 		assert(circle_queue_empty(&waitq.wqs_set->wqset_links));
1802 		assert(circle_queue_empty(&waitq.wqs_set->wqset_preposts));
1803 		break;
1804 
1805 	default:
1806 		panic("invalid wait type: %p/%d", waitq.wq_q, type);
1807 	}
1808 
1809 	/*
1810 	 * The waitq must have been invalidated, or hw_lck_ticket_destroy()
1811 	 * below won't wait for reservations taken via waitq_lock_reserve()
1812 	 * or waitq_lock_allow_invalid().
1813 	 */
1814 	assert(!waitq_valid(waitq.wqs_set));
1815 	hw_lck_ticket_destroy(&waitq.wq_q->waitq_interlock, &waitq_lck_grp);
1816 }
1817 
1818 
1819 #pragma mark port-set sets
1820 
1821 void
1822 waitq_set_unlink_all_locked(struct waitq_set *wqset, waitq_link_list_t *free_l)
1823 {
1824 	uint32_t batch = waitq_set_unlink_batch;
1825 
1826 	waitq_invalidate(wqset);
1827 
1828 	for (;;) {
1829 		struct waitq_link *link;
1830 		queue_entry_t elt;
1831 		circle_queue_t q;
1832 		struct waitq *wq;
1833 		uint32_t ticket;
1834 		bool stable = true;
1835 
1836 		if (!circle_queue_empty(&wqset->wqset_links)) {
1837 			q = &wqset->wqset_links;
1838 		} else if (!circle_queue_empty(&wqset->wqset_preposts)) {
1839 			q = &wqset->wqset_preposts;
1840 		} else {
1841 			break;
1842 		}
1843 
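		/*
		 * The batch budget is exhausted: briefly drop and retake
		 * the set lock so that concurrent lockers of this set can
		 * make progress while a large set is being torn down.
		 */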
1844 		if (batch-- == 0) {
1845 			waitq_unlock(wqset);
1846 			waitq_lock(wqset);
1847 			batch = waitq_set_unlink_batch;
1848 			continue;
1849 		}
1850 
1851 		elt  = circle_queue_first(q);
1852 		link = cqe_element(elt, struct waitq_link, wql_slink);
1853 		wq   = link->wql_wq;
1854 
1855 		if (__improbable(!waitq_lock_reserve(wq, &ticket))) {
1856 			waitq_unlock(wqset);
1857 			waitq_lock_wait(wq, ticket);
1858 			waitq_lock(wqset);
1859 			stable = (elt == circle_queue_first(q) && link->wql_wq == wq);
1860 		}
1861 
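		/*
		 * If the set lock was dropped above, the queue may have
		 * been mutated; only detach the link if the queue head
		 * and its waitq are unchanged.
		 */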
1862 		if (stable) {
1863 			circle_dequeue(q, &link->wql_slink);
1864 			circle_dequeue(&wq->waitq_links, &link->wql_qlink);
1865 			wql_list_push(free_l, link);
1866 		}
1867 
1868 		waitq_unlock(wq);
1869 	}
1870 }
1871 
1872 void
1873 waitq_clear_prepost_locked(struct waitq *waitq)
1874 {
1875 	assert(waitq_type(waitq) == WQT_PORT);
1876 	waitq->waitq_preposted = false;
1877 }
1878 
1879 void
1880 waitq_set_foreach_member_locked(struct waitq_set *wqs, void (^cb)(struct waitq *))
1881 {
1882 	struct waitq_link *link;
1883 
1884 	cqe_foreach_element(link, &wqs->wqset_links, wql_slink) {
1885 		cb(link->wql_wq);
1886 	}
1887 
1888 	cqe_foreach_element(link, &wqs->wqset_preposts, wql_slink) {
1889 		cb(link->wql_wq);
1890 	}
1891 }
1892 
1893 __abortlike
1894 static void
1895 __waitq_link_arguments_panic(struct waitq *waitq, struct waitq_set *wqset)
1896 {
1897 	if (!waitq_valid(waitq)) {
1898 		panic("Invalid waitq: %p", waitq);
1899 	}
1900 	if (waitq_type(waitq) != WQT_PORT) {
1901 		panic("Invalid waitq type: %p:%d", waitq, waitq->waitq_type);
1902 	}
1903 	panic("Invalid waitq-set: %p", wqset);
1904 }
1905 
1906 static inline void
1907 __waitq_link_arguments_validate(struct waitq *waitq, struct waitq_set *wqset)
1908 {
1909 	if (!waitq_valid(waitq) ||
1910 	    waitq_type(waitq) != WQT_PORT ||
1911 	    waitq_type(wqset) != WQT_PORT_SET) {
1912 		__waitq_link_arguments_panic(waitq, wqset);
1913 	}
1914 }
1915 
1916 __abortlike
1917 static void
1918 __waitq_invalid_panic(waitq_t waitq)
1919 {
1920 	panic("Invalid waitq: %p", waitq.wq_q);
1921 }
1922 
1923 static void
1924 __waitq_validate(waitq_t waitq)
1925 {
1926 	if (!waitq_valid(waitq)) {
1927 		__waitq_invalid_panic(waitq);
1928 	}
1929 }
1930 
1931 kern_return_t
1932 waitq_link_locked(struct waitq *waitq, struct waitq_set *wqset,
1933     waitq_link_t *linkp)
1934 {
1935 	assert(linkp->wqlh);
1936 
1937 	__waitq_link_arguments_validate(waitq, wqset);
1938 
1939 	if (wql_find(waitq, wqset)) {
1940 		return KERN_ALREADY_IN_SET;
1941 	}
1942 
1943 	linkp->wqll->wql_wq = waitq;
1944 	linkp->wqll->wql_wqs = (uintptr_t)wqset;
1945 
1946 	if (waitq_valid(wqset)) {
1947 		circle_enqueue_tail(&wqset->wqset_links, &linkp->wqll->wql_slink);
1948 		circle_enqueue_tail(&waitq->waitq_links, &linkp->wqll->wql_qlink);
1949 		*linkp = WQL_NULL;
1950 	}
1951 
1952 	return KERN_SUCCESS;
1953 }
1954 
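/*
 * Illustrative call sequence (a sketch; wqt_link() in the test code at
 * the bottom of this file is the in-tree equivalent):
 *
 *	waitq_link_t link = waitq_link_alloc(WQT_PORT_SET);
 *
 *	waitq_lock(waitq);
 *	waitq_lock(wqset);
 *	kr = waitq_link_locked(waitq, wqset, &link);
 *	waitq_unlock(wqset);
 *	waitq_unlock(waitq);
 *	if (link.wqlh) {
 *		waitq_link_free(WQT_PORT_SET, link);  // link not consumed
 *	}
 */
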
1955 kern_return_t
1956 waitq_link_prepost_locked(struct waitq *waitq, struct waitq_set *wqset)
1957 {
1958 	struct waitq_link *link;
1959 
1960 	__waitq_link_arguments_validate(waitq, wqset);
1961 
1962 	link = wql_find(waitq, wqset);
1963 	if (link == NULL) {
1964 		return KERN_NOT_IN_SET;
1965 	}
1966 
1967 	if (!wql_wqs_preposted(link)) {
1968 		wql_wqs_mark_preposted(link);
1969 		waitq->waitq_preposted = true;
1970 		circle_dequeue(&wqset->wqset_links, &link->wql_slink);
1971 		circle_enqueue_tail(&wqset->wqset_preposts, &link->wql_slink);
1972 		ipc_pset_prepost(wqset, waitq);
1973 	}
1974 
1975 	return KERN_SUCCESS;
1976 }
1977 
1978 waitq_link_t
1979 waitq_unlink_locked(struct waitq *waitq, struct waitq_set *wqset)
1980 {
1981 	struct waitq_link *link;
1982 
1983 	__waitq_link_arguments_validate(waitq, wqset);
1984 
1985 	link = wql_find(waitq, wqset);
1986 	if (link) {
1987 		circle_dequeue(wql_wqs_queue(wqset, link), &link->wql_slink);
1988 		circle_dequeue(&waitq->waitq_links, &link->wql_qlink);
1989 	}
1990 
1991 	return (waitq_link_t){ .wqll = link };
1992 }
1993 
1994 void
1995 waitq_unlink_all_locked(struct waitq *waitq, struct waitq_set *except_wqset,
1996     waitq_link_list_t *free_l)
1997 {
1998 	struct waitq_link *kept_link = NULL;
1999 	struct waitq_link *link;
2000 
2001 	assert(waitq_type(waitq) == WQT_PORT);
2002 
2003 	cqe_foreach_element_safe(link, &waitq->waitq_links, wql_qlink) {
2004 		waitq_t wqs = wql_wqs(link);
2005 
2006 		if (wqs.wqs_set == except_wqset) {
2007 			kept_link = link;
2008 			continue;
2009 		}
2010 
2011 		waitq_lock(wqs);
2012 		circle_dequeue(wql_wqs_queue(wqs.wqs_set, link),
2013 		    &link->wql_slink);
2014 		wql_list_push(free_l, link);
2015 		waitq_unlock(wqs);
2016 	}
2017 
2018 	circle_queue_init(&waitq->waitq_links);
2019 	if (kept_link) {
2020 		circle_enqueue_tail(&waitq->waitq_links, &kept_link->wql_qlink);
2021 	}
2022 }
2023 
2024 struct waitq *
2025 waitq_set_first_prepost(struct waitq_set *wqset, wqs_prepost_flags_t flags)
2026 {
2027 	circle_queue_t q = &wqset->wqset_preposts;
2028 	queue_entry_t elt;
2029 	struct waitq_link *link;
2030 	struct waitq *wq;
2031 	uint32_t ticket;
2032 
2033 	if (__improbable(!waitq_valid(wqset))) {
2034 		return NULL;
2035 	}
2036 
2037 	while (!circle_queue_empty(q)) {
2038 		elt  = circle_queue_first(q);
2039 		link = cqe_element(elt, struct waitq_link, wql_slink);
2040 		wq   = link->wql_wq;
2041 
2042 		if (__improbable(!waitq_lock_reserve(wq, &ticket))) {
2043 			waitq_unlock(wqset);
2044 			waitq_lock_wait(wq, ticket);
2045 			waitq_lock(wqset);
2046 			if (!waitq_valid(wqset)) {
2047 				waitq_unlock(wq);
2048 				return NULL;
2049 			}
2050 
2051 			if (elt != circle_queue_first(q) || link->wql_wq != wq) {
2052 				waitq_unlock(wq);
2053 				continue;
2054 			}
2055 		}
2056 
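		/*
		 * A preposted member was found: unless the caller only
		 * wants to peek, rotate it to the tail of the prepost
		 * queue so that members are serviced round-robin.
		 */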
2057 		if (wq->waitq_preposted) {
2058 			if ((flags & WQS_PREPOST_PEEK) == 0) {
2059 				circle_queue_rotate_head_forward(q);
2060 			}
2061 			if ((flags & WQS_PREPOST_LOCK) == 0) {
2062 				waitq_unlock(wq);
2063 			}
2064 			return wq;
2065 		}
2066 
2067 		/*
2068 		 * We found a link that is no longer preposted:
2069 		 * someone must have called waitq_clear_prepost_locked(),
2070 		 * and this set has only just noticed.
2071 		 */
2072 		wql_wqs_clear_preposted(link);
2073 		waitq_unlock(wq);
2074 
2075 		circle_dequeue(q, &link->wql_slink);
2076 		circle_enqueue_tail(&wqset->wqset_links, &link->wql_slink);
2077 	}
2078 
2079 	return NULL;
2080 }
2081 
2082 
2083 #pragma mark select sets
2084 
2085 /**
2086  * @function select_set_nextid()
2087  *
2088  * @brief
2089  * Generate a unique ID for a select set "generation"
2090  *
2091  * @discussion
2092  * This mixes the CPU number with a per-CPU monotonic counter
2093  * (in order to avoid contention on a global atomic).
2094  *
2095  * In order for select sets to be invalidated very quickly,
2096  * they do not have backward linkages to their member queues.
2097  *
2098  * Instead, each time a new @c select() "pass" is initiated,
2099  * a new ID is generated, which is copied onto the @c waitq_sellink
2100  * links at the time of link.
2101  *
2102  * The zone for select sets is sequestered, which allows select
2103  * wait queues to speculatively lock their set during prepost
2104  * and use this ID to debounce wakeups and avoid spurious ones
2105  * (merely an optimization: select recovers from spurious wakeups,
2106  * we just want those to be very rare).
2107  */
2108 __attribute__((always_inline))
2109 static inline uint64_t
2110 select_set_nextid(bool preemption_enabled)
2111 {
2112 	/* waitq_bootstrap() set the low byte to a unique value per CPU */
2113 	static_assert(MAX_CPUS <= 256);
2114 	const uint64_t inc = 256;
2115 	uint64_t id;
2116 
2117 #ifdef __x86_64__
2118 	/* uncontended atomics are slower than disabling preemption on Intel */
2119 	if (preemption_enabled) {
2120 		disable_preemption();
2121 	}
2122 	id = (*PERCPU_GET(select_setid) += inc);
2123 	if (preemption_enabled) {
2124 		enable_preemption();
2125 	}
2126 #else
2127 	/*
2128 	 * If preemption is enabled, this might update another CPU's
2129 	 * setid; that is rare but acceptable, since it still
2130 	 * produces a unique select ID.
2131 	 *
2132 	 * We chose this because the uncontended atomics on !intel
2133 	 * are faster than disabling/reenabling preemption.
2134 	 */
2135 	(void)preemption_enabled;
2136 	id = os_atomic_add(PERCPU_GET(select_setid), inc, relaxed);
2137 #endif
2138 
2139 	return id;
2140 }
2141 
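/*
 * Worked example (illustrative; assumes the per-CPU seed byte equals the
 * CPU number): CPU 3 then yields IDs 0x103, 0x203, 0x303, ... while
 * CPU 4 yields 0x104, 0x204, ...  Streams from distinct CPUs never
 * collide in the low byte, and IDs within one CPU strictly increase.
 */
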
2142 struct select_set *
2143 select_set_alloc(void)
2144 {
2145 	struct select_set *selset;
2146 	selset = zalloc_id(ZONE_ID_SELECT_SET, Z_ZERO | Z_WAITOK | Z_NOFAIL);
2147 
2148 	waitq_init(selset, WQT_SELECT_SET, SYNC_POLICY_FIFO);
2149 	selset->selset_id = select_set_nextid(true);
2150 
2151 	return selset;
2152 }
2153 
2154 __abortlike
2155 static void
2156 __select_set_link_arguments_panic(struct waitq *waitq, struct select_set *set)
2157 {
2158 	if (!waitq_valid(waitq)) {
2159 		panic("Invalid waitq: %p", waitq);
2160 	}
2161 	if (waitq_type(waitq) != WQT_SELECT) {
2162 		panic("Invalid waitq type: %p:%d", waitq, waitq->waitq_type);
2163 	}
2164 	panic("Invalid waitq-set: %p", set);
2165 }
2166 
2167 static inline void
2168 __select_set_link_arguments_validate(struct waitq *waitq, struct select_set *set)
2169 {
2170 	if (!waitq_valid(waitq) ||
2171 	    waitq_type(waitq) != WQT_SELECT ||
2172 	    waitq_type(set) != WQT_SELECT_SET) {
2173 		__select_set_link_arguments_panic(waitq, set);
2174 	}
2175 }
2176 
2177 void
2178 select_set_link(struct waitq *waitq, struct select_set *set,
2179     waitq_link_t *linkp)
2180 {
2181 	struct waitq_sellink *link;
2182 
2183 	__select_set_link_arguments_validate(waitq, set);
2184 
2185 	waitq_lock(waitq);
2186 
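	/*
	 * Record on the set that the global conflict queue is hooked, so
	 * that select_set_unlink_conflict_queue() knows to detach it when
	 * the set is reset or freed.
	 */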
2187 	if (waitq == &select_conflict_queue) {
2188 		waitq_lock(set);
2189 		set->selset_conflict = true;
2190 		waitq_unlock(set);
2191 	}
2192 
2193 	wql_list_foreach(link, &waitq->waitq_sellinks) {
2194 		if (waitq_same(wql_wqs(link), set)) {
2195 			goto found;
2196 		}
2197 	}
2198 
2199 	link = linkp->wqls;
2200 	*linkp = WQL_NULL;
2201 	wql_list_push(&waitq->waitq_sellinks, link);
2202 
2203 found:
2204 	link->wql_wqs = (uintptr_t)set;
2205 	link->wql_setid = set->selset_id;
2206 	waitq_unlock(waitq);
2207 }
2208 
2209 static void
2210 select_set_unlink_conflict_queue(struct select_set *set)
2211 {
2212 	struct waitq_link_list_entry **prev;
2213 	struct waitq_sellink *link;
2214 
2215 	waitq_lock(&select_conflict_queue);
2216 
2217 	/*
2218 	 * We know the conflict queue is hooked,
2219 	 * so find the linkage and free it.
2220 	 */
2221 	prev = &select_conflict_queue.waitq_sellinks.next;
2222 	for (;;) {
2223 		assert(*prev);
2224 		link = wql_list_elem(*prev);
2225 		if (waitq_same(wql_wqs(link), set)) {
2226 			*prev = link->wql_next.next;
2227 			break;
2228 		}
2229 		prev = &link->wql_next.next;
2230 	}
2231 
2232 	waitq_unlock(&select_conflict_queue);
2233 
2234 	waitq_link_free(WQT_SELECT_SET, link);
2235 }
2236 
2237 static void
2238 __select_set_reset(struct select_set *set, bool invalidate)
2239 {
2240 	if (set->selset_conflict) {
2241 		select_set_unlink_conflict_queue(set);
2242 	}
2243 
2244 	waitq_lock(set);
2245 	if (invalidate) {
2246 		waitq_invalidate(set);
2247 	}
2248 	set->selset_id = select_set_nextid(false);
2249 	set->selset_preposted = 0;
2250 	set->selset_conflict = 0;
2251 	waitq_unlock(set);
2252 }
2253 
2254 void
2255 select_set_reset(struct select_set *set)
2256 {
2257 	__select_set_reset(set, false);
2258 }
2259 
2260 void
2261 select_set_free(struct select_set *set)
2262 {
2263 	__select_set_reset(set, true);
2264 	hw_lck_ticket_destroy(&set->selset_interlock, &waitq_lck_grp);
2265 	zfree_id(ZONE_ID_SELECT_SET, set);
2266 }
2267 
2268 void
2269 select_waitq_wakeup_and_deinit(
2270 	struct waitq           *waitq,
2271 	event64_t               wake_event,
2272 	wait_result_t           result)
2273 {
2274 	waitq_link_list_t free_l = { };
2275 
2276 	if (waitq_is_valid(waitq)) {
2277 		assert(waitq_type(waitq) == WQT_SELECT);
2278 
2279 		waitq_lock(waitq);
2280 
2281 		waitq_wakeup64_all_locked(waitq, wake_event, result,
2282 		    WAITQ_KEEP_LOCKED);
2283 
2284 		waitq_invalidate(waitq);
2285 		free_l = waitq->waitq_sellinks;
2286 		waitq->waitq_sellinks.next = NULL;
2287 
2288 		waitq_unlock(waitq);
2289 
2290 		waitq_link_free_list(WQT_SELECT, &free_l);
2291 
2292 		waitq_deinit(waitq);
2293 	}
2294 }
2295 
2296 #pragma mark assert_wait / wakeup (high level)
2297 
2298 wait_result_t
2299 waitq_assert_wait64(struct waitq *waitq,
2300     event64_t wait_event,
2301     wait_interrupt_t interruptible,
2302     uint64_t deadline)
2303 {
2304 	thread_t thread = current_thread();
2305 	wait_result_t ret;
2306 	spl_t s = 0;
2307 
2308 	__waitq_validate(waitq);
2309 
2310 	if (waitq_irq_safe(waitq)) {
2311 		s = splsched();
2312 	}
2313 	waitq_lock(waitq);
2314 
2315 	ret = waitq_assert_wait64_locked(waitq, wait_event, interruptible,
2316 	    TIMEOUT_URGENCY_SYS_NORMAL, deadline, TIMEOUT_NO_LEEWAY, thread);
2317 
2318 	waitq_unlock(waitq);
2319 	if (waitq_irq_safe(waitq)) {
2320 		splx(s);
2321 	}
2322 
2323 	return ret;
2324 }
2325 
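/*
 * Typical wait pattern (a sketch; `my_waitq` and `my_event` are
 * hypothetical):
 *
 *	wait_result_t wr;
 *
 *	wr = waitq_assert_wait64(&my_waitq, my_event, THREAD_UNINT, 0);
 *	if (wr == THREAD_WAITING) {
 *		wr = thread_block(THREAD_CONTINUE_NULL);
 *	}
 */
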
2326 wait_result_t
2327 waitq_assert_wait64_leeway(struct waitq *waitq,
2328     event64_t wait_event,
2329     wait_interrupt_t interruptible,
2330     wait_timeout_urgency_t urgency,
2331     uint64_t deadline,
2332     uint64_t leeway)
2333 {
2334 	wait_result_t ret;
2335 	thread_t thread = current_thread();
2336 	spl_t s = 0;
2337 
2338 	__waitq_validate(waitq);
2339 
2340 	if (waitq_irq_safe(waitq)) {
2341 		s = splsched();
2342 	}
2343 	waitq_lock(waitq);
2344 
2345 	ret = waitq_assert_wait64_locked(waitq, wait_event, interruptible,
2346 	    urgency, deadline, leeway, thread);
2347 
2348 	waitq_unlock(waitq);
2349 	if (waitq_irq_safe(waitq)) {
2350 		splx(s);
2351 	}
2352 
2353 	return ret;
2354 }
2355 
2356 kern_return_t
2357 waitq_wakeup64_one(
2358 	waitq_t                 waitq,
2359 	event64_t               wake_event,
2360 	wait_result_t           result,
2361 	waitq_wakeup_flags_t    flags)
2362 {
2363 	__waitq_validate(waitq);
2364 
2365 	spl_t spl = 0;
2366 
2367 	if (waitq_irq_safe(waitq)) {
2368 		spl = splsched();
2369 	}
2370 
2371 	waitq_lock(waitq);
2372 
2373 	/* waitq is unlocked upon return, splx is handled */
2374 	return waitq_wakeup64_one_locked(waitq, wake_event, result,
2375 	           flags | waitq_flags_splx(spl) | WAITQ_UNLOCK);
2376 }
2377 
2378 kern_return_t
2379 waitq_wakeup64_all(
2380 	waitq_t                 waitq,
2381 	event64_t               wake_event,
2382 	wait_result_t           result,
2383 	waitq_wakeup_flags_t    flags)
2384 {
2385 	__waitq_validate(waitq);
2386 
2387 	spl_t spl = 0;
2388 
2389 	if (waitq_irq_safe(waitq)) {
2390 		spl = splsched();
2391 	}
2392 
2393 	waitq_lock(waitq);
2394 
2395 	/* waitq is unlocked upon return, splx is handled */
2396 	return waitq_wakeup64_all_locked(waitq, wake_event, result,
2397 	           flags | waitq_flags_splx(spl) | WAITQ_UNLOCK);
2398 }
2399 
2400 kern_return_t
2401 waitq_wakeup64_thread(
2402 	struct waitq           *waitq,
2403 	event64_t               event,
2404 	thread_t                thread,
2405 	wait_result_t           result)
2406 {
2407 	spl_t s = splsched();
2408 	kern_return_t ret;
2409 
2410 	__waitq_validate(waitq);
2411 	assert(waitq_irq_safe(waitq));
2412 	waitq_lock(waitq);
2413 
2414 	ret = waitq_wakeup64_thread_and_unlock(waitq, event, thread, result);
2415 
2416 	splx(s);
2417 
2418 	return ret;
2419 }
2420 
2421 thread_t
2422 waitq_wakeup64_identify(
2423 	waitq_t                 waitq,
2424 	event64_t               wake_event,
2425 	wait_result_t           result,
2426 	waitq_wakeup_flags_t    flags)
2427 {
2428 	__waitq_validate(waitq);
2429 
2430 	spl_t spl = 0;
2431 
2432 	if (waitq_irq_safe(waitq)) {
2433 		spl = splsched();
2434 	}
2435 
2436 	waitq_lock(waitq);
2437 
2438 	thread_t thread = waitq_wakeup64_identify_locked(waitq, wake_event,
2439 	    result, flags | waitq_flags_splx(spl) | WAITQ_UNLOCK);
2440 	/* waitq is unlocked, thread is not go-ed yet */
2441 	/* preemption disabled if thread non-null */
2442 	/* splx is handled */
2443 
2444 	if (thread != THREAD_NULL) {
2445 		thread_reference(thread);
2446 		waitq_resume_identified_thread(waitq, thread, result, flags);
2447 		/* preemption enabled, thread go-ed */
2448 		/* returns +1 ref to running thread */
2449 		return thread;
2450 	}
2451 
2452 	return THREAD_NULL;
2453 }
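
/*
 * The thread returned by waitq_wakeup64_identify(), if any, carries a
 * +1 reference that the caller must eventually drop with
 * thread_deallocate().
 */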
2454 
2455 
2456 #pragma mark tests
2457 #if DEBUG || DEVELOPMENT
2458 
2459 #include <ipc/ipc_pset.h>
2460 #include <sys/errno.h>
2461 
2462 #define MAX_GLOBAL_TEST_QUEUES 64
2463 static struct waitq wqt_waitq_array[MAX_GLOBAL_TEST_QUEUES];
2464 static bool wqt_running;
2465 static bool wqt_init;
2466 
2467 static bool
2468 wqt_start(const char *test, int64_t *out)
2469 {
2470 	if (os_atomic_xchg(&wqt_running, true, acquire)) {
2471 		*out = 0;
2472 		return false;
2473 	}
2474 
2475 	if (!wqt_init) {
2476 		wqt_init = true;
2477 		for (int i = 0; i < MAX_GLOBAL_TEST_QUEUES; i++) {
2478 			waitq_init(&wqt_waitq_array[i], WQT_PORT, SYNC_POLICY_FIFO);
2479 		}
2480 	}
2481 
2482 	printf("[WQ] starting %s\n", test);
2483 	return true;
2484 }
2485 
2486 static int
2487 wqt_end(const char *test, int64_t *out)
2488 {
2489 	os_atomic_store(&wqt_running, false, release);
2490 	printf("[WQ] done %s\n", test);
2491 	*out = 1;
2492 	return 0;
2493 }
2494 
2495 static struct waitq *
2496 wqt_wq(uint32_t index)
2497 {
2498 	return &wqt_waitq_array[index];
2499 }
2500 
2501 static uint32_t
2502 wqt_idx(struct waitq *waitq)
2503 {
2504 	assert(waitq >= wqt_waitq_array &&
2505 	    waitq < wqt_waitq_array + MAX_GLOBAL_TEST_QUEUES);
2506 	return (uint32_t)(waitq - wqt_waitq_array);
2507 }
2508 
2509 __attribute__((overloadable))
2510 static uint64_t
2511 wqt_bit(uint32_t index)
2512 {
2513 	return 1ull << index;
2514 }
2515 
2516 __attribute__((overloadable))
2517 static uint64_t
2518 wqt_bit(struct waitq *waitq)
2519 {
2520 	return wqt_bit(wqt_idx(waitq));
2521 }
2522 
2523 static struct waitq_set *
2524 wqt_wqset_create(void)
2525 {
2526 	struct waitq_set *wqset;
2527 
2528 	wqset = &ipc_pset_alloc_special(ipc_space_kernel)->ips_wqset;
2529 	printf("[WQ]: created waitq set %p\n", wqset);
2530 	return wqset;
2531 }
2532 
2533 static void
2534 wqt_wqset_free(struct waitq_set *wqset)
2535 {
2536 	printf("[WQ]: destroying waitq set %p\n", wqset);
2537 	waitq_lock(wqset);
2538 	ipc_pset_destroy(ipc_space_kernel,
2539 	    __container_of(wqset, struct ipc_pset, ips_wqset));
2540 }
2541 
2542 static void
2543 wqt_link(uint32_t index, struct waitq_set *wqset, kern_return_t want)
2544 {
2545 	struct waitq *waitq = wqt_wq(index);
2546 	waitq_link_t link = waitq_link_alloc(WQT_PORT_SET);
2547 	kern_return_t kr;
2548 
2549 	printf("[WQ]: linking waitq [%d] to global wqset (%p)\n", index, wqset);
2550 
2551 	waitq_lock(waitq);
2552 	waitq_lock(wqset);
2553 	kr = waitq_link_locked(waitq, wqset, &link);
2554 	waitq_unlock(wqset);
2555 	waitq_unlock(waitq);
2556 
2557 	if (link.wqlh) {
2558 		waitq_link_free(WQT_PORT_SET, link);
2559 	}
2560 
2561 	printf("[WQ]:\tkr=%d\texpected=%d\n", kr, want);
2562 	assert(kr == want);
2563 }
2564 
2565 static void
2566 wqt_unlink(uint32_t index, struct waitq_set *wqset, __assert_only kern_return_t want)
2567 {
2568 	struct waitq *waitq = wqt_wq(index);
2569 	waitq_link_t link;
2570 	kern_return_t kr;
2571 
2572 	printf("[WQ]: unlinking waitq [%d] from global wqset (%p)\n",
2573 	    index, wqset);
2574 
2575 	waitq_lock(waitq);
2576 	waitq_lock(wqset);
2577 	link = waitq_unlink_locked(waitq, wqset);
2578 	waitq_unlock(wqset);
2579 	waitq_unlock(waitq);
2580 
2581 	if (link.wqlh) {
2582 		waitq_link_free(WQT_PORT_SET, link);
2583 		kr = KERN_SUCCESS;
2584 	} else {
2585 		kr = KERN_NOT_IN_SET;
2586 	}
2587 
2588 	printf("[WQ]: \tkr=%d\n", kr);
2589 	assert(kr == want);
2590 }
2591 
2592 static void
2593 wqt_wakeup_one(uint32_t index, event64_t event64, __assert_only kern_return_t want)
2594 {
2595 	kern_return_t kr;
2596 
2597 	printf("[WQ]: Waking one thread on waitq [%d] event:0x%llx\n",
2598 	    index, event64);
2599 	kr = waitq_wakeup64_one(wqt_wq(index), event64,
2600 	    THREAD_AWAKENED, WAITQ_WAKEUP_DEFAULT);
2601 	printf("[WQ]: \tkr=%d\n", kr);
2602 	assert(kr == want);
2603 }
2604 
2605 static void
2606 wqt_clear_preposts(uint32_t idx)
2607 {
2608 	waitq_lock(wqt_wq(idx));
2609 	(void)waitq_clear_prepost_locked(wqt_wq(idx));
2610 	waitq_unlock(wqt_wq(idx));
2611 }
2612 
2613 static void
2614 wqt_preposts_gc_locked(struct waitq_set *wqset)
2615 {
2616 	circle_queue_t q = &wqset->wqset_preposts;
2617 	struct waitq_link *link;
2618 	uint32_t ticket;
2619 
2620 again:
2621 	cqe_foreach_element_safe(link, q, wql_slink) {
2622 		struct waitq *wq = link->wql_wq;
2623 
2624 		if (!waitq_lock_reserve(wq, &ticket)) {
2625 			waitq_unlock(wqset);
2626 			waitq_lock_wait(wq, ticket);
2627 			waitq_lock(wqset);
2628 			waitq_unlock(wq);
2629 			/* the list was possibly mutated, restart */
2630 			goto again;
2631 		}
2632 
2633 		if (!wq->waitq_preposted) {
2634 			wql_wqs_clear_preposted(link);
2635 			circle_dequeue(q, &link->wql_slink);
2636 			circle_enqueue_tail(&wqset->wqset_links, &link->wql_slink);
2637 		}
2638 
2639 		waitq_unlock(wq);
2640 	}
2641 }
2642 
2643 static void
2644 wqt_expect_preposts(struct waitq_set *wqset, __assert_only uint64_t preposts)
2645 {
2646 	struct waitq_link *link;
2647 	uint64_t found = 0;
2648 
2649 	waitq_lock(wqset);
2650 
2651 	wqt_preposts_gc_locked(wqset);
2652 
2653 	cqe_foreach_element(link, &wqset->wqset_preposts, wql_slink) {
2654 		struct waitq *waitq = link->wql_wq;
2655 
2656 		printf("[WQ]: found prepost %d\n", wqt_idx(waitq));
2657 		assertf((found & wqt_bit(waitq)) == 0,
2658 		    "found waitq %d twice", wqt_idx(waitq));
2659 		found |= wqt_bit(waitq);
2660 	}
2661 
2662 	waitq_unlock(wqset);
2663 
2664 	assertf(found == preposts, "preposts expected 0x%llx, but got 0x%llx",
2665 	    preposts, found);
2666 }
2667 
2668 static int
2669 waitq_basic_test(__unused int64_t in, int64_t *out)
2670 {
2671 	struct waitq_set *wqset;
2672 
2673 	if (!wqt_start(__func__, out)) {
2674 		return EBUSY;
2675 	}
2676 
2677 	wqset = wqt_wqset_create();
2678 	wqt_link(10, wqset, KERN_SUCCESS);
2679 	wqt_link(10, wqset, KERN_ALREADY_IN_SET);
2680 	wqt_link(11, wqset, KERN_SUCCESS);
2681 	wqt_link(11, wqset, KERN_ALREADY_IN_SET);
2682 	wqt_link(12, wqset, KERN_SUCCESS);
2683 	wqt_link(12, wqset, KERN_ALREADY_IN_SET);
2684 
2685 	wqt_wakeup_one(10, NO_EVENT64, KERN_NOT_WAITING);
2686 	wqt_wakeup_one(12, NO_EVENT64, KERN_NOT_WAITING);
2687 
2688 	wqt_expect_preposts(wqset, wqt_bit(10) | wqt_bit(12));
2689 	wqt_clear_preposts(10);
2690 
2691 	wqt_expect_preposts(wqset, wqt_bit(12));
2692 	wqt_clear_preposts(12);
2693 
2694 	wqt_expect_preposts(wqset, 0);
2695 
2696 	wqt_unlink(12, wqset, KERN_SUCCESS);
2697 	wqt_unlink(12, wqset, KERN_NOT_IN_SET);
2698 	wqt_unlink(11, wqset, KERN_SUCCESS);
2699 	wqt_unlink(10, wqset, KERN_SUCCESS);
2700 	wqt_wqset_free(wqset);
2701 
2702 	return wqt_end(__func__, out);
2703 }
2704 SYSCTL_TEST_REGISTER(waitq_basic, waitq_basic_test);
2705 #endif /* DEBUG || DEVELOPMENT */
2706