1 /*
2 * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995-2018 Apple, Inc. All Rights Reserved */
29
30 #include <sys/cdefs.h>
31
32 #include <kern/assert.h>
33 #include <kern/ast.h>
34 #include <kern/clock.h>
35 #include <kern/cpu_data.h>
36 #include <kern/kern_types.h>
37 #include <kern/policy_internal.h>
38 #include <kern/processor.h>
39 #include <kern/sched_prim.h> /* for thread_exception_return */
40 #include <kern/task.h>
41 #include <kern/thread.h>
42 #include <kern/thread_group.h>
43 #include <kern/zalloc.h>
44 #include <mach/kern_return.h>
45 #include <mach/mach_param.h>
46 #include <mach/mach_port.h>
47 #include <mach/mach_types.h>
48 #include <mach/mach_vm.h>
49 #include <mach/sync_policy.h>
50 #include <mach/task.h>
51 #include <mach/thread_act.h> /* for thread_resume */
52 #include <mach/thread_policy.h>
53 #include <mach/thread_status.h>
54 #include <mach/vm_prot.h>
55 #include <mach/vm_statistics.h>
56 #include <machine/atomic.h>
57 #include <machine/machine_routines.h>
58 #include <machine/smp.h>
59 #include <vm/vm_map.h>
60 #include <vm/vm_protos.h>
61
62 #include <sys/eventvar.h>
63 #include <sys/kdebug.h>
64 #include <sys/kernel.h>
65 #include <sys/lock.h>
66 #include <sys/param.h>
67 #include <sys/proc_info.h> /* for fill_procworkqueue */
68 #include <sys/proc_internal.h>
69 #include <sys/pthread_shims.h>
70 #include <sys/resourcevar.h>
71 #include <sys/signalvar.h>
72 #include <sys/sysctl.h>
73 #include <sys/sysproto.h>
74 #include <sys/systm.h>
75 #include <sys/ulock.h> /* for ulock_owner_value_to_port_name */
76
77 #include <pthread/bsdthread_private.h>
78 #include <pthread/workqueue_syscalls.h>
79 #include <pthread/workqueue_internal.h>
80 #include <pthread/workqueue_trace.h>
81
82 #include <os/log.h>
83
84 static void workq_unpark_continue(void *uth, wait_result_t wr) __dead2;
85 static void workq_schedule_creator(proc_t p, struct workqueue *wq,
86 workq_kern_threadreq_flags_t flags);
87
88 static bool workq_threadreq_admissible(struct workqueue *wq, struct uthread *uth,
89 workq_threadreq_t req);
90
91 static uint32_t workq_constrained_allowance(struct workqueue *wq,
92 thread_qos_t at_qos, struct uthread *uth, bool may_start_timer);
93
94 static bool _wq_cooperative_queue_refresh_best_req_qos(struct workqueue *wq);
95
96 static bool workq_thread_is_busy(uint64_t cur_ts,
97 _Atomic uint64_t *lastblocked_tsp);
98
99 static int workq_sysctl_handle_usecs SYSCTL_HANDLER_ARGS;
100
101 static bool
102 workq_schedule_delayed_thread_creation(struct workqueue *wq, int flags);
103
104 static inline void
105 workq_lock_spin(struct workqueue *wq);
106
107 static inline void
108 workq_unlock(struct workqueue *wq);
109
110 #pragma mark globals
111
112 struct workq_usec_var {
113 uint32_t usecs;
114 uint64_t abstime;
115 };
116
117 #define WORKQ_SYSCTL_USECS(var, init) \
118 static struct workq_usec_var var = { .usecs = init }; \
119 SYSCTL_OID(_kern, OID_AUTO, var##_usecs, \
120 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &var, 0, \
121 workq_sysctl_handle_usecs, "I", "")
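/*
 * For illustration: WORKQ_SYSCTL_USECS(wq_stalled_window, WQ_STALLED_WINDOW_USECS)
 * below expands to a read/write "kern.wq_stalled_window_usecs" sysctl whose
 * handler (workq_sysctl_handle_usecs) keeps the cached abstime value in sync
 * with whatever usecs value userspace writes.
 */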
122
123 static LCK_GRP_DECLARE(workq_lck_grp, "workq");
124 os_refgrp_decl(static, workq_refgrp, "workq", NULL);
125
126 static ZONE_DEFINE(workq_zone_workqueue, "workq.wq",
127 sizeof(struct workqueue), ZC_NONE);
128 static ZONE_DEFINE(workq_zone_threadreq, "workq.threadreq",
129 sizeof(struct workq_threadreq_s), ZC_CACHING);
130
131 static struct mpsc_daemon_queue workq_deallocate_queue;
132
133 WORKQ_SYSCTL_USECS(wq_stalled_window, WQ_STALLED_WINDOW_USECS);
134 WORKQ_SYSCTL_USECS(wq_reduce_pool_window, WQ_REDUCE_POOL_WINDOW_USECS);
135 WORKQ_SYSCTL_USECS(wq_max_timer_interval, WQ_MAX_TIMER_INTERVAL_USECS);
136 static uint32_t wq_max_threads = WORKQUEUE_MAXTHREADS;
137 static uint32_t wq_max_constrained_threads = WORKQUEUE_MAXTHREADS / 8;
138 static uint32_t wq_init_constrained_limit = 1;
139 static uint16_t wq_death_max_load;
140 static uint32_t wq_max_parallelism[WORKQ_NUM_QOS_BUCKETS];
141
142 /*
143  * This is not a hard limit but the maximum size we aim for across the
144  * entire cooperative pool. We can oversubscribe the pool due to non-cooperative
145  * workers; the most we will oversubscribe it by is a total of
146  * wq_max_cooperative_threads * WORKQ_NUM_QOS_BUCKETS.
147 */
148 static uint32_t wq_max_cooperative_threads;
149
150 static inline uint32_t
151 wq_cooperative_queue_max_size(struct workqueue *wq)
152 {
153 return wq->wq_cooperative_queue_has_limited_max_size ? 1 : wq_max_cooperative_threads;
154 }
155
156 #pragma mark sysctls
157
158 static int
159 workq_sysctl_handle_usecs SYSCTL_HANDLER_ARGS
160 {
161 #pragma unused(arg2)
162 struct workq_usec_var *v = arg1;
163 int error = sysctl_handle_int(oidp, &v->usecs, 0, req);
164 if (error || !req->newptr) {
165 return error;
166 }
167 clock_interval_to_absolutetime_interval(v->usecs, NSEC_PER_USEC,
168 &v->abstime);
169 return 0;
170 }
171
172 SYSCTL_INT(_kern, OID_AUTO, wq_max_threads, CTLFLAG_RW | CTLFLAG_LOCKED,
173 &wq_max_threads, 0, "");
174
175 SYSCTL_INT(_kern, OID_AUTO, wq_max_constrained_threads, CTLFLAG_RW | CTLFLAG_LOCKED,
176 &wq_max_constrained_threads, 0, "");
177
178 static int
179 wq_limit_cooperative_threads_for_proc SYSCTL_HANDLER_ARGS
180 {
181 #pragma unused(arg1, arg2, oidp)
182 int input_pool_size = 0;
183 int changed;
184 int error = 0;
185
186 error = sysctl_io_number(req, 0, sizeof(int), &input_pool_size, &changed);
187 if (error || !changed) {
188 return error;
189 }
190
191 #define WQ_COOPERATIVE_POOL_SIZE_DEFAULT 0
192 #define WQ_COOPERATIVE_POOL_SIZE_STRICT_PER_QOS -1
193 /* Not available currently, but the sysctl interface is designed to allow these
194  * extra parameters:
195  * WQ_COOPERATIVE_POOL_SIZE_STRICT : -2 (across all buckets)
196 * WQ_COOPERATIVE_POOL_SIZE_CUSTOM : [1, 512]
197 */
198
199 if (input_pool_size != WQ_COOPERATIVE_POOL_SIZE_DEFAULT
200 && input_pool_size != WQ_COOPERATIVE_POOL_SIZE_STRICT_PER_QOS) {
201 error = EINVAL;
202 goto out;
203 }
204
205 proc_t p = req->p;
206 struct workqueue *wq = proc_get_wqptr(p);
207
208 if (wq != NULL) {
209 workq_lock_spin(wq);
210 if (wq->wq_reqcount > 0 || wq->wq_nthreads > 0) {
211 // Hackily enforce that the workqueue is still new (no requests or
212 // threads)
213 error = ENOTSUP;
214 } else {
215 wq->wq_cooperative_queue_has_limited_max_size = (input_pool_size == WQ_COOPERATIVE_POOL_SIZE_STRICT_PER_QOS);
216 }
217 workq_unlock(wq);
218 } else {
219 		/* This process has no workqueue; calling this sysctl makes no sense */
220 return ENOTSUP;
221 }
222
223 out:
224 return error;
225 }
226
227 SYSCTL_PROC(_kern, OID_AUTO, wq_limit_cooperative_threads,
228 CTLFLAG_ANYBODY | CTLFLAG_MASKED | CTLFLAG_WR | CTLFLAG_LOCKED | CTLTYPE_INT, 0, 0,
229 wq_limit_cooperative_threads_for_proc,
230 "I", "Modify the max pool size of the cooperative pool");
231
232 #pragma mark p_wqptr
233
234 #define WQPTR_IS_INITING_VALUE ((struct workqueue *)~(uintptr_t)0)
235
236 static struct workqueue *
237 proc_get_wqptr_fast(struct proc *p)
238 {
239 return os_atomic_load(&p->p_wqptr, relaxed);
240 }
241
242 struct workqueue *
243 proc_get_wqptr(struct proc *p)
244 {
245 struct workqueue *wq = proc_get_wqptr_fast(p);
246 return wq == WQPTR_IS_INITING_VALUE ? NULL : wq;
247 }
248
249 static void
250 proc_set_wqptr(struct proc *p, struct workqueue *wq)
251 {
252 wq = os_atomic_xchg(&p->p_wqptr, wq, release);
253 if (wq == WQPTR_IS_INITING_VALUE) {
254 proc_lock(p);
255 thread_wakeup(&p->p_wqptr);
256 proc_unlock(p);
257 }
258 }
259
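/*
 * Returns true when the caller has won the race to initialize p_wqptr (which
 * is left at WQPTR_IS_INITING_VALUE); returns false when another thread has
 * already initialized it or is doing so, blocking until that in-flight
 * initialization is published in the latter case.
 */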
260 static bool
261 proc_init_wqptr_or_wait(struct proc *p)
262 {
263 struct workqueue *wq;
264
265 proc_lock(p);
266 wq = os_atomic_load(&p->p_wqptr, relaxed);
267
268 if (wq == NULL) {
269 os_atomic_store(&p->p_wqptr, WQPTR_IS_INITING_VALUE, relaxed);
270 proc_unlock(p);
271 return true;
272 }
273
274 if (wq == WQPTR_IS_INITING_VALUE) {
275 assert_wait(&p->p_wqptr, THREAD_UNINT);
276 proc_unlock(p);
277 thread_block(THREAD_CONTINUE_NULL);
278 } else {
279 proc_unlock(p);
280 }
281 return false;
282 }
283
284 static inline event_t
285 workq_parked_wait_event(struct uthread *uth)
286 {
287 return (event_t)&uth->uu_workq_stackaddr;
288 }
289
290 static inline void
291 workq_thread_wakeup(struct uthread *uth)
292 {
293 thread_wakeup_thread(workq_parked_wait_event(uth), get_machthread(uth));
294 }
295
296 #pragma mark wq_thactive
297
298 #if defined(__LP64__)
299 // Layout is:
300 // 127 - 115 : 13 bits of zeroes
301 // 114 - 112 : best QoS among all pending constrained requests
302 // 111 - 0 : MGR, AUI, UI, IN, DF, UT, BG+MT buckets every 16 bits
303 #define WQ_THACTIVE_BUCKET_WIDTH 16
304 #define WQ_THACTIVE_QOS_SHIFT (7 * WQ_THACTIVE_BUCKET_WIDTH)
305 #else
306 // Layout is:
307 // 63 - 61 : best QoS among all pending constrained requests
308 // 60 : Manager bucket (0 or 1)
309 // 59 - 0 : AUI, UI, IN, DF, UT, BG+MT buckets every 10 bits
310 #define WQ_THACTIVE_BUCKET_WIDTH 10
311 #define WQ_THACTIVE_QOS_SHIFT (6 * WQ_THACTIVE_BUCKET_WIDTH + 1)
312 #endif
313 #define WQ_THACTIVE_BUCKET_MASK ((1U << WQ_THACTIVE_BUCKET_WIDTH) - 1)
314 #define WQ_THACTIVE_BUCKET_HALF (1U << (WQ_THACTIVE_BUCKET_WIDTH - 1))
315
316 static_assert(sizeof(wq_thactive_t) * CHAR_BIT - WQ_THACTIVE_QOS_SHIFT >= 3,
317 "Make sure we have space to encode a QoS");
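/*
 * Each bucket is a WQ_THACTIVE_BUCKET_WIDTH-bit counter of active threads for
 * that QoS; the helpers below add or subtract
 * (wq_thactive_t)1 << (bucket * WQ_THACTIVE_BUCKET_WIDTH) to update a single
 * bucket atomically.
 */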
318
319 static inline wq_thactive_t
320 _wq_thactive(struct workqueue *wq)
321 {
322 return os_atomic_load_wide(&wq->wq_thactive, relaxed);
323 }
324
325 static inline uint8_t
326 _wq_bucket(thread_qos_t qos)
327 {
328 // Map both BG and MT to the same bucket by over-shifting down and
329 // clamping MT and BG together.
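// For example, MT maps to bucket 0 explicitly while BG reaches bucket 0
// through the qos - 2 arithmetic; higher QoS classes land in consecutive
// buckets.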
330 switch (qos) {
331 case THREAD_QOS_MAINTENANCE:
332 return 0;
333 default:
334 return qos - 2;
335 }
336 }
337
338 #define WQ_THACTIVE_BEST_CONSTRAINED_REQ_QOS(tha) \
339 ((thread_qos_t)((tha) >> WQ_THACTIVE_QOS_SHIFT))
340
341 static inline thread_qos_t
342 _wq_thactive_best_constrained_req_qos(struct workqueue *wq)
343 {
344 // Avoid expensive atomic operations: the three bits we're loading are in
345 // a single byte, and always updated under the workqueue lock
346 wq_thactive_t v = *(wq_thactive_t *)&wq->wq_thactive;
347 return WQ_THACTIVE_BEST_CONSTRAINED_REQ_QOS(v);
348 }
349
350 static void
351 _wq_thactive_refresh_best_constrained_req_qos(struct workqueue *wq)
352 {
353 thread_qos_t old_qos, new_qos;
354 workq_threadreq_t req;
355
356 req = priority_queue_max(&wq->wq_constrained_queue,
357 struct workq_threadreq_s, tr_entry);
358 new_qos = req ? req->tr_qos : THREAD_QOS_UNSPECIFIED;
359 old_qos = _wq_thactive_best_constrained_req_qos(wq);
360 if (old_qos != new_qos) {
361 long delta = (long)new_qos - (long)old_qos;
362 wq_thactive_t v = (wq_thactive_t)delta << WQ_THACTIVE_QOS_SHIFT;
363 /*
364 * We can do an atomic add relative to the initial load because updates
365 * to this qos are always serialized under the workqueue lock.
366 */
367 v = os_atomic_add(&wq->wq_thactive, v, relaxed);
368 #ifdef __LP64__
369 WQ_TRACE_WQ(TRACE_wq_thactive_update, wq, (uint64_t)v,
370 (uint64_t)(v >> 64), 0);
371 #else
372 WQ_TRACE_WQ(TRACE_wq_thactive_update, wq, v, 0, 0);
373 #endif
374 }
375 }
376
377 static inline wq_thactive_t
378 _wq_thactive_offset_for_qos(thread_qos_t qos)
379 {
380 uint8_t bucket = _wq_bucket(qos);
381 __builtin_assume(bucket < WORKQ_NUM_BUCKETS);
382 return (wq_thactive_t)1 << (bucket * WQ_THACTIVE_BUCKET_WIDTH);
383 }
384
385 static inline wq_thactive_t
386 _wq_thactive_inc(struct workqueue *wq, thread_qos_t qos)
387 {
388 wq_thactive_t v = _wq_thactive_offset_for_qos(qos);
389 return os_atomic_add_orig(&wq->wq_thactive, v, relaxed);
390 }
391
392 static inline wq_thactive_t
393 _wq_thactive_dec(struct workqueue *wq, thread_qos_t qos)
394 {
395 wq_thactive_t v = _wq_thactive_offset_for_qos(qos);
396 return os_atomic_sub_orig(&wq->wq_thactive, v, relaxed);
397 }
398
399 static inline void
400 _wq_thactive_move(struct workqueue *wq,
401 thread_qos_t old_qos, thread_qos_t new_qos)
402 {
403 wq_thactive_t v = _wq_thactive_offset_for_qos(new_qos) -
404 _wq_thactive_offset_for_qos(old_qos);
405 os_atomic_add(&wq->wq_thactive, v, relaxed);
406 wq->wq_thscheduled_count[_wq_bucket(old_qos)]--;
407 wq->wq_thscheduled_count[_wq_bucket(new_qos)]++;
408 }
409
410 static inline uint32_t
411 _wq_thactive_aggregate_downto_qos(struct workqueue *wq, wq_thactive_t v,
412 thread_qos_t qos, uint32_t *busycount, uint32_t *max_busycount)
413 {
414 uint32_t count = 0, active;
415 uint64_t curtime;
416
417 assert(WORKQ_THREAD_QOS_MIN <= qos && qos <= WORKQ_THREAD_QOS_MAX);
418
419 if (busycount) {
420 curtime = mach_absolute_time();
421 *busycount = 0;
422 }
423 if (max_busycount) {
424 *max_busycount = THREAD_QOS_LAST - qos;
425 }
426
427 uint8_t i = _wq_bucket(qos);
428 v >>= i * WQ_THACTIVE_BUCKET_WIDTH;
429 for (; i < WORKQ_NUM_QOS_BUCKETS; i++, v >>= WQ_THACTIVE_BUCKET_WIDTH) {
430 active = v & WQ_THACTIVE_BUCKET_MASK;
431 count += active;
432
433 if (busycount && wq->wq_thscheduled_count[i] > active) {
434 if (workq_thread_is_busy(curtime, &wq->wq_lastblocked_ts[i])) {
435 /*
436 * We only consider the last blocked thread for a given bucket
437 * as busy because we don't want to take the list lock in each
438 * sched callback. However this is an approximation that could
439 * contribute to thread creation storms.
440 */
441 (*busycount)++;
442 }
443 }
444 }
445
446 return count;
447 }
448
449 static inline void
450 _wq_cooperative_queue_scheduled_count_dec(struct workqueue *wq, thread_qos_t qos)
451 {
452 __assert_only uint8_t old_scheduled_count = wq->wq_cooperative_queue_scheduled_count[_wq_bucket(qos)]--;
453 assert(old_scheduled_count > 0);
454 }
455
456 static inline void
457 _wq_cooperative_queue_scheduled_count_inc(struct workqueue *wq, thread_qos_t qos)
458 {
459 __assert_only uint8_t old_scheduled_count = wq->wq_cooperative_queue_scheduled_count[_wq_bucket(qos)]++;
460 assert(old_scheduled_count < UINT8_MAX);
461 }
462
463 #pragma mark wq_flags
464
465 static inline uint32_t
466 _wq_flags(struct workqueue *wq)
467 {
468 return os_atomic_load(&wq->wq_flags, relaxed);
469 }
470
471 static inline bool
472 _wq_exiting(struct workqueue *wq)
473 {
474 return _wq_flags(wq) & WQ_EXITING;
475 }
476
477 bool
478 workq_is_exiting(struct proc *p)
479 {
480 struct workqueue *wq = proc_get_wqptr(p);
481 return !wq || _wq_exiting(wq);
482 }
483
484
485 #pragma mark workqueue lock
486
487 static bool
488 workq_lock_is_acquired_kdp(struct workqueue *wq)
489 {
490 return kdp_lck_ticket_is_acquired(&wq->wq_lock);
491 }
492
493 static inline void
494 workq_lock_spin(struct workqueue *wq)
495 {
496 lck_ticket_lock(&wq->wq_lock, &workq_lck_grp);
497 }
498
499 static inline void
500 workq_lock_held(struct workqueue *wq)
501 {
502 LCK_TICKET_ASSERT_OWNED(&wq->wq_lock);
503 }
504
505 static inline bool
506 workq_lock_try(struct workqueue *wq)
507 {
508 return lck_ticket_lock_try(&wq->wq_lock, &workq_lck_grp);
509 }
510
511 static inline void
512 workq_unlock(struct workqueue *wq)
513 {
514 lck_ticket_unlock(&wq->wq_lock);
515 }
516
517 #pragma mark idle thread lists
518
519 #define WORKQ_POLICY_INIT(qos) \
520 (struct uu_workq_policy){ .qos_req = qos, .qos_bucket = qos }
521
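/*
 * The effective bucket for a workqueue thread is the max of its requested,
 * max and override QoS; workq_pri_override() additionally never reports less
 * than the bucket the thread is currently accounted against.
 */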
522 static inline thread_qos_t
523 workq_pri_bucket(struct uu_workq_policy req)
524 {
525 return MAX(MAX(req.qos_req, req.qos_max), req.qos_override);
526 }
527
528 static inline thread_qos_t
529 workq_pri_override(struct uu_workq_policy req)
530 {
531 return MAX(workq_pri_bucket(req), req.qos_bucket);
532 }
533
534 static inline bool
535 workq_thread_needs_params_change(workq_threadreq_t req, struct uthread *uth)
536 {
537 workq_threadreq_param_t cur_trp, req_trp = { };
538
539 cur_trp.trp_value = uth->uu_save.uus_workq_park_data.workloop_params;
540 if (req->tr_flags & WORKQ_TR_FLAG_WL_PARAMS) {
541 req_trp = kqueue_threadreq_workloop_param(req);
542 }
543
544 /*
545 	 * CPU percent flags are handled separately from policy changes, so ignore
546 * them for all of these checks.
547 */
548 uint16_t cur_flags = (cur_trp.trp_flags & ~TRP_CPUPERCENT);
549 uint16_t req_flags = (req_trp.trp_flags & ~TRP_CPUPERCENT);
550
551 if (!req_flags && !cur_flags) {
552 return false;
553 }
554
555 if (req_flags != cur_flags) {
556 return true;
557 }
558
559 if ((req_flags & TRP_PRIORITY) && req_trp.trp_pri != cur_trp.trp_pri) {
560 return true;
561 }
562
563 if ((req_flags & TRP_POLICY) && req_trp.trp_pol != cur_trp.trp_pol) {
564 return true;
565 }
566
567 return false;
568 }
569
570 static inline bool
571 workq_thread_needs_priority_change(workq_threadreq_t req, struct uthread *uth)
572 {
573 if (workq_thread_needs_params_change(req, uth)) {
574 return true;
575 }
576
577 if (req->tr_qos != workq_pri_override(uth->uu_workq_pri)) {
578 return true;
579 }
580
581 #if CONFIG_PREADOPT_TG
582 thread_group_qos_t tg = kqr_preadopt_thread_group(req);
583 if (KQWL_HAS_VALID_PREADOPTED_TG(tg)) {
584 /*
585 * Ideally, we'd add check here to see if thread's preadopt TG is same
586 * as the thread requests's thread group and short circuit if that is
587 * the case. But in the interest of keeping the code clean and not
588 * taking the thread lock here, we're going to skip this. We will
589 * eventually shortcircuit once we try to set the preadoption thread
590 * group on the thread.
591 */
592 return true;
593 }
594 #endif
595
596 return false;
597 }
598
599 static void
600 workq_thread_update_bucket(proc_t p, struct workqueue *wq, struct uthread *uth,
601 struct uu_workq_policy old_pri, struct uu_workq_policy new_pri,
602 bool force_run)
603 {
604 thread_qos_t old_bucket = old_pri.qos_bucket;
605 thread_qos_t new_bucket = workq_pri_bucket(new_pri);
606
607 if (old_bucket != new_bucket) {
608 _wq_thactive_move(wq, old_bucket, new_bucket);
609 }
610
611 new_pri.qos_bucket = new_bucket;
612 uth->uu_workq_pri = new_pri;
613
614 if (workq_pri_override(old_pri) != new_bucket) {
615 thread_set_workq_override(get_machthread(uth), new_bucket);
616 }
617
618 if (wq->wq_reqcount && (old_bucket > new_bucket || force_run)) {
619 int flags = WORKQ_THREADREQ_CAN_CREATE_THREADS;
620 if (old_bucket > new_bucket) {
621 /*
622 * When lowering our bucket, we may unblock a thread request,
623 * but we can't drop our priority before we have evaluated
624 * whether this is the case, and if we ever drop the workqueue lock
625 * that would cause a priority inversion.
626 *
627 * We hence have to disallow thread creation in that case.
628 */
629 flags = 0;
630 }
631 workq_schedule_creator(p, wq, flags);
632 }
633 }
634
635 /*
636 * Sets/resets the cpu percent limits on the current thread. We can't set
637 * these limits from outside of the current thread, so this function needs
638  * to be called when we're executing on the intended thread.
639 */
640 static void
641 workq_thread_reset_cpupercent(workq_threadreq_t req, struct uthread *uth)
642 {
643 assert(uth == current_uthread());
644 workq_threadreq_param_t trp = { };
645
646 if (req && (req->tr_flags & WORKQ_TR_FLAG_WL_PARAMS)) {
647 trp = kqueue_threadreq_workloop_param(req);
648 }
649
650 if (uth->uu_workq_flags & UT_WORKQ_CPUPERCENT) {
651 /*
652 * Going through disable when we have an existing CPU percent limit
653 * set will force the ledger to refill the token bucket of the current
654 * thread. Removing any penalty applied by previous thread use.
655 */
656 thread_set_cpulimit(THREAD_CPULIMIT_DISABLE, 0, 0);
657 uth->uu_workq_flags &= ~UT_WORKQ_CPUPERCENT;
658 }
659
660 if (trp.trp_flags & TRP_CPUPERCENT) {
661 thread_set_cpulimit(THREAD_CPULIMIT_BLOCK, trp.trp_cpupercent,
662 (uint64_t)trp.trp_refillms * NSEC_PER_SEC);
663 uth->uu_workq_flags |= UT_WORKQ_CPUPERCENT;
664 }
665 }
666
667 /* Called with the workq lock held */
668 static void
669 workq_thread_reset_pri(struct workqueue *wq, struct uthread *uth,
670 workq_threadreq_t req, bool unpark)
671 {
672 thread_t th = get_machthread(uth);
673 thread_qos_t qos = req ? req->tr_qos : WORKQ_THREAD_QOS_CLEANUP;
674 workq_threadreq_param_t trp = { };
675 int priority = 31;
676 int policy = POLICY_TIMESHARE;
677
678 if (req && (req->tr_flags & WORKQ_TR_FLAG_WL_PARAMS)) {
679 trp = kqueue_threadreq_workloop_param(req);
680 }
681
682 uth->uu_workq_pri = WORKQ_POLICY_INIT(qos);
683 uth->uu_workq_flags &= ~UT_WORKQ_OUTSIDE_QOS;
684
685 if (unpark) {
686 uth->uu_save.uus_workq_park_data.workloop_params = trp.trp_value;
687 // qos sent out to userspace (may differ from uu_workq_pri on param threads)
688 uth->uu_save.uus_workq_park_data.qos = qos;
689 }
690
691 if (qos == WORKQ_THREAD_QOS_MANAGER) {
692 uint32_t mgr_pri = wq->wq_event_manager_priority;
693 assert(trp.trp_value == 0); // manager qos and thread policy don't mix
694
695 if (mgr_pri & _PTHREAD_PRIORITY_SCHED_PRI_FLAG) {
696 mgr_pri &= _PTHREAD_PRIORITY_SCHED_PRI_MASK;
697 thread_set_workq_pri(th, THREAD_QOS_UNSPECIFIED, mgr_pri,
698 POLICY_TIMESHARE);
699 return;
700 }
701
702 qos = _pthread_priority_thread_qos(mgr_pri);
703 } else {
704 if (trp.trp_flags & TRP_PRIORITY) {
705 qos = THREAD_QOS_UNSPECIFIED;
706 priority = trp.trp_pri;
707 uth->uu_workq_flags |= UT_WORKQ_OUTSIDE_QOS;
708 }
709
710 if (trp.trp_flags & TRP_POLICY) {
711 policy = trp.trp_pol;
712 }
713 }
714
715 #if CONFIG_PREADOPT_TG
716 if (req && (req->tr_flags & WORKQ_TR_FLAG_WORKLOOP)) {
717 /*
718 * We cannot safely read and borrow the reference from the kqwl since it
719 * can disappear from under us at any time due to the max-ing logic in
720 * kqueue_set_preadopted_thread_group.
721 *
722 * As such, we do the following dance:
723 *
724 * 1) cmpxchng and steal the kqwl's preadopt thread group and leave
725 * behind with (NULL + QoS). At this point, we have the reference
726 * to the thread group from the kqwl.
727 * 2) Have the thread set the preadoption thread group on itself.
728 * 3) cmpxchng from (NULL + QoS) which we set earlier in (1), back to
729 * thread_group + QoS. ie we try to give the reference back to the kqwl.
730 * If we fail, that's because a higher QoS thread group was set on the
731 * kqwl in kqueue_set_preadopted_thread_group in which case, we need to
732 * go back to (1).
733 */
734
735 _Atomic(struct thread_group *) * tg_loc = kqr_preadopt_thread_group_addr(req);
736
737 thread_group_qos_t old_tg, new_tg;
738 int ret = 0;
739 again:
740 ret = os_atomic_rmw_loop(tg_loc, old_tg, new_tg, relaxed, {
741 if (!KQWL_HAS_VALID_PREADOPTED_TG(old_tg)) {
742 os_atomic_rmw_loop_give_up(break);
743 }
744
745 /*
746 * Leave the QoS behind - kqueue_set_preadopted_thread_group will
747 * only modify it if there is a higher QoS thread group to attach
748 */
749 new_tg = (thread_group_qos_t) ((uintptr_t) old_tg & KQWL_PREADOPT_TG_QOS_MASK);
750 });
751
752 if (ret) {
753 /*
754 * We successfully took the ref from the kqwl so set it on the
755 * thread now
756 */
757 thread_set_preadopt_thread_group(th, KQWL_GET_PREADOPTED_TG(old_tg));
758
759 thread_group_qos_t thread_group_to_expect = new_tg;
760 thread_group_qos_t thread_group_to_set = old_tg;
761
762 os_atomic_rmw_loop(tg_loc, old_tg, new_tg, relaxed, {
763 if (old_tg != thread_group_to_expect) {
764 /*
765 * There was an intervening write to the kqwl_preadopt_tg,
766 * and it has a higher QoS than what we are working with
767 * here. Abandon our current adopted thread group and redo
768 * the full dance
769 */
770 thread_group_deallocate_safe(KQWL_GET_PREADOPTED_TG(thread_group_to_set));
771 os_atomic_rmw_loop_give_up(goto again);
772 }
773
774 new_tg = thread_group_to_set;
775 });
776 } else {
777 /* Nothing valid on the kqwl, just clear what's on the thread */
778 thread_set_preadopt_thread_group(th, NULL);
779 }
780 } else {
781 /* Not even a kqwl, clear what's on the thread */
782 thread_set_preadopt_thread_group(th, NULL);
783 }
784 #endif
785 thread_set_workq_pri(th, qos, priority, policy);
786 }
787
788 /*
789 * Called by kevent with the NOTE_WL_THREAD_REQUEST knote lock held,
790 * every time a servicer is being told about a new max QoS.
791 */
792 void
793 workq_thread_set_max_qos(struct proc *p, workq_threadreq_t kqr)
794 {
795 struct uu_workq_policy old_pri, new_pri;
796 struct uthread *uth = current_uthread();
797 struct workqueue *wq = proc_get_wqptr_fast(p);
798 thread_qos_t qos = kqr->tr_kq_qos_index;
799
800 if (uth->uu_workq_pri.qos_max == qos) {
801 return;
802 }
803
804 workq_lock_spin(wq);
805 old_pri = new_pri = uth->uu_workq_pri;
806 new_pri.qos_max = qos;
807 workq_thread_update_bucket(p, wq, uth, old_pri, new_pri, false);
808 workq_unlock(wq);
809 }
810
811 #pragma mark idle threads accounting and handling
812
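/*
 * The oldest killable idle thread is the one at the tail of the idle list;
 * if that entry has already given up its stack, fall back to its predecessor,
 * which is expected to still have one.
 */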
813 static inline struct uthread *
814 workq_oldest_killable_idle_thread(struct workqueue *wq)
815 {
816 struct uthread *uth = TAILQ_LAST(&wq->wq_thidlelist, workq_uthread_head);
817
818 if (uth && !uth->uu_save.uus_workq_park_data.has_stack) {
819 uth = TAILQ_PREV(uth, workq_uthread_head, uu_workq_entry);
820 if (uth) {
821 assert(uth->uu_save.uus_workq_park_data.has_stack);
822 }
823 }
824 return uth;
825 }
826
827 static inline uint64_t
828 workq_kill_delay_for_idle_thread(struct workqueue *wq)
829 {
830 uint64_t delay = wq_reduce_pool_window.abstime;
831 uint16_t idle = wq->wq_thidlecount;
832
833 /*
834 	 * If we have fewer than wq_death_max_load threads, use a 5s timer.
835 	 *
836 	 * For the next wq_max_constrained_threads ones, decay linearly from
837 	 * 5s to 50ms.
838 */
839 if (idle <= wq_death_max_load) {
840 return delay;
841 }
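	/*
	 * Linear decay: each extra idle thread above wq_death_max_load shaves
	 * delay / wq_max_constrained_threads off the timeout, bottoming out at
	 * delay / wq_max_constrained_threads once the excess reaches
	 * wq_max_constrained_threads.
	 */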
842
843 if (wq_max_constrained_threads > idle - wq_death_max_load) {
844 delay *= (wq_max_constrained_threads - (idle - wq_death_max_load));
845 }
846 return delay / wq_max_constrained_threads;
847 }
848
849 static inline bool
850 workq_should_kill_idle_thread(struct workqueue *wq, struct uthread *uth,
851 uint64_t now)
852 {
853 uint64_t delay = workq_kill_delay_for_idle_thread(wq);
854 return now - uth->uu_save.uus_workq_park_data.idle_stamp > delay;
855 }
856
857 static void
858 workq_death_call_schedule(struct workqueue *wq, uint64_t deadline)
859 {
860 uint32_t wq_flags = os_atomic_load(&wq->wq_flags, relaxed);
861
862 if (wq_flags & (WQ_EXITING | WQ_DEATH_CALL_SCHEDULED)) {
863 return;
864 }
865 os_atomic_or(&wq->wq_flags, WQ_DEATH_CALL_SCHEDULED, relaxed);
866
867 WQ_TRACE_WQ(TRACE_wq_death_call | DBG_FUNC_NONE, wq, 1, 0, 0);
868
869 /*
870 * <rdar://problem/13139182> Due to how long term timers work, the leeway
871 * can't be too short, so use 500ms which is long enough that we will not
872 * wake up the CPU for killing threads, but short enough that it doesn't
873 * fall into long-term timer list shenanigans.
874 */
875 thread_call_enter_delayed_with_leeway(wq->wq_death_call, NULL, deadline,
876 wq_reduce_pool_window.abstime / 10,
877 THREAD_CALL_DELAY_LEEWAY | THREAD_CALL_DELAY_USER_BACKGROUND);
878 }
879
880 /*
881 * `decrement` is set to the number of threads that are no longer dying:
882 * - because they have been resuscitated just in time (workq_pop_idle_thread)
883 * - or have been killed (workq_thread_terminate).
884 */
885 static void
886 workq_death_policy_evaluate(struct workqueue *wq, uint16_t decrement)
887 {
888 struct uthread *uth;
889
890 assert(wq->wq_thdying_count >= decrement);
891 if ((wq->wq_thdying_count -= decrement) > 0) {
892 return;
893 }
894
895 if (wq->wq_thidlecount <= 1) {
896 return;
897 }
898
899 if ((uth = workq_oldest_killable_idle_thread(wq)) == NULL) {
900 return;
901 }
902
903 uint64_t now = mach_absolute_time();
904 uint64_t delay = workq_kill_delay_for_idle_thread(wq);
905
906 if (now - uth->uu_save.uus_workq_park_data.idle_stamp > delay) {
907 WQ_TRACE_WQ(TRACE_wq_thread_terminate | DBG_FUNC_START,
908 wq, wq->wq_thidlecount, 0, 0);
909 wq->wq_thdying_count++;
910 uth->uu_workq_flags |= UT_WORKQ_DYING;
911 if ((uth->uu_workq_flags & UT_WORKQ_IDLE_CLEANUP) == 0) {
912 workq_thread_wakeup(uth);
913 }
914 return;
915 }
916
917 workq_death_call_schedule(wq,
918 uth->uu_save.uus_workq_park_data.idle_stamp + delay);
919 }
920
921 void
922 workq_thread_terminate(struct proc *p, struct uthread *uth)
923 {
924 struct workqueue *wq = proc_get_wqptr_fast(p);
925
926 workq_lock_spin(wq);
927 TAILQ_REMOVE(&wq->wq_thrunlist, uth, uu_workq_entry);
928 if (uth->uu_workq_flags & UT_WORKQ_DYING) {
929 WQ_TRACE_WQ(TRACE_wq_thread_terminate | DBG_FUNC_END,
930 wq, wq->wq_thidlecount, 0, 0);
931 workq_death_policy_evaluate(wq, 1);
932 }
933 if (wq->wq_nthreads-- == wq_max_threads) {
934 /*
935 * We got under the thread limit again, which may have prevented
936 		 * thread creation from happening; redrive if there are pending requests.
937 */
938 if (wq->wq_reqcount) {
939 workq_schedule_creator(p, wq, WORKQ_THREADREQ_CAN_CREATE_THREADS);
940 }
941 }
942 workq_unlock(wq);
943
944 thread_deallocate(get_machthread(uth));
945 }
946
947 static void
948 workq_kill_old_threads_call(void *param0, void *param1 __unused)
949 {
950 struct workqueue *wq = param0;
951
952 workq_lock_spin(wq);
953 WQ_TRACE_WQ(TRACE_wq_death_call | DBG_FUNC_START, wq, 0, 0, 0);
954 os_atomic_andnot(&wq->wq_flags, WQ_DEATH_CALL_SCHEDULED, relaxed);
955 workq_death_policy_evaluate(wq, 0);
956 WQ_TRACE_WQ(TRACE_wq_death_call | DBG_FUNC_END, wq, 0, 0, 0);
957 workq_unlock(wq);
958 }
959
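/*
 * Pops an idle thread, preferring the idle list over the new-thread list,
 * marks it running with the supplied flags, and reports through *needs_wakeup
 * whether the caller must wake it (not needed for threads that were dying or
 * still owe an idle cleanup).
 */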
960 static struct uthread *
961 workq_pop_idle_thread(struct workqueue *wq, uint16_t uu_flags,
962 bool *needs_wakeup)
963 {
964 struct uthread *uth;
965
966 if ((uth = TAILQ_FIRST(&wq->wq_thidlelist))) {
967 TAILQ_REMOVE(&wq->wq_thidlelist, uth, uu_workq_entry);
968 } else {
969 uth = TAILQ_FIRST(&wq->wq_thnewlist);
970 TAILQ_REMOVE(&wq->wq_thnewlist, uth, uu_workq_entry);
971 }
972 TAILQ_INSERT_TAIL(&wq->wq_thrunlist, uth, uu_workq_entry);
973
974 assert((uth->uu_workq_flags & UT_WORKQ_RUNNING) == 0);
975 uth->uu_workq_flags |= UT_WORKQ_RUNNING | uu_flags;
976
977 /* A thread is never woken up as part of the cooperative pool */
978 assert((uu_flags & UT_WORKQ_COOPERATIVE) == 0);
979
980 if ((uu_flags & UT_WORKQ_OVERCOMMIT) == 0) {
981 wq->wq_constrained_threads_scheduled++;
982 }
983 wq->wq_threads_scheduled++;
984 wq->wq_thidlecount--;
985
986 if (__improbable(uth->uu_workq_flags & UT_WORKQ_DYING)) {
987 uth->uu_workq_flags ^= UT_WORKQ_DYING;
988 workq_death_policy_evaluate(wq, 1);
989 *needs_wakeup = false;
990 } else if (uth->uu_workq_flags & UT_WORKQ_IDLE_CLEANUP) {
991 *needs_wakeup = false;
992 } else {
993 *needs_wakeup = true;
994 }
995 return uth;
996 }
997
998 /*
999 * Called by thread_create_workq_waiting() during thread initialization, before
1000 * assert_wait, before the thread has been started.
1001 */
1002 event_t
1003 workq_thread_init_and_wq_lock(task_t task, thread_t th)
1004 {
1005 struct uthread *uth = get_bsdthread_info(th);
1006
1007 uth->uu_workq_flags = UT_WORKQ_NEW;
1008 uth->uu_workq_pri = WORKQ_POLICY_INIT(THREAD_QOS_LEGACY);
1009 uth->uu_workq_thport = MACH_PORT_NULL;
1010 uth->uu_workq_stackaddr = 0;
1011 uth->uu_workq_pthread_kill_allowed = 0;
1012
1013 thread_set_tag(th, THREAD_TAG_PTHREAD | THREAD_TAG_WORKQUEUE);
1014 thread_reset_workq_qos(th, THREAD_QOS_LEGACY);
1015
1016 workq_lock_spin(proc_get_wqptr_fast(get_bsdtask_info(task)));
1017 return workq_parked_wait_event(uth);
1018 }
1019
1020 /**
1021 * Try to add a new workqueue thread.
1022 *
1023 * - called with workq lock held
1024 * - dropped and retaken around thread creation
1025 * - return with workq lock held
1026 */
1027 static bool
1028 workq_add_new_idle_thread(proc_t p, struct workqueue *wq)
1029 {
1030 mach_vm_offset_t th_stackaddr;
1031 kern_return_t kret;
1032 thread_t th;
1033
1034 wq->wq_nthreads++;
1035
1036 workq_unlock(wq);
1037
1038 vm_map_t vmap = get_task_map(p->task);
1039
1040 kret = pthread_functions->workq_create_threadstack(p, vmap, &th_stackaddr);
1041 if (kret != KERN_SUCCESS) {
1042 WQ_TRACE_WQ(TRACE_wq_thread_create_failed | DBG_FUNC_NONE, wq,
1043 kret, 1, 0);
1044 goto out;
1045 }
1046
1047 kret = thread_create_workq_waiting(p->task, workq_unpark_continue, &th);
1048 if (kret != KERN_SUCCESS) {
1049 WQ_TRACE_WQ(TRACE_wq_thread_create_failed | DBG_FUNC_NONE, wq,
1050 kret, 0, 0);
1051 pthread_functions->workq_destroy_threadstack(p, vmap, th_stackaddr);
1052 goto out;
1053 }
1054
1055 // thread_create_workq_waiting() will return with the wq lock held
1056 // on success, because it calls workq_thread_init_and_wq_lock() above
1057
1058 struct uthread *uth = get_bsdthread_info(th);
1059
1060 wq->wq_creations++;
1061 wq->wq_thidlecount++;
1062 uth->uu_workq_stackaddr = (user_addr_t)th_stackaddr;
1063 TAILQ_INSERT_TAIL(&wq->wq_thnewlist, uth, uu_workq_entry);
1064
1065 WQ_TRACE_WQ(TRACE_wq_thread_create | DBG_FUNC_NONE, wq, 0, 0, 0);
1066 return true;
1067
1068 out:
1069 workq_lock_spin(wq);
1070 /*
1071 	 * Do not redrive here if we went under wq_max_threads again;
1072 * it is the responsibility of the callers of this function
1073 * to do so when it fails.
1074 */
1075 wq->wq_nthreads--;
1076 return false;
1077 }
1078
1079 static inline bool
1080 workq_thread_is_overcommit(struct uthread *uth)
1081 {
1082 return (uth->uu_workq_flags & UT_WORKQ_OVERCOMMIT) != 0;
1083 }
1084
1085 static inline bool
1086 workq_thread_is_nonovercommit(struct uthread *uth)
1087 {
1088 return (uth->uu_workq_flags & (UT_WORKQ_OVERCOMMIT | UT_WORKQ_COOPERATIVE)) == 0;
1089 }
1090
1091 static inline bool
1092 workq_thread_is_cooperative(struct uthread *uth)
1093 {
1094 return (uth->uu_workq_flags & UT_WORKQ_COOPERATIVE) != 0;
1095 }
1096
1097 static inline void
1098 workq_thread_set_type(struct uthread *uth, uint16_t flags)
1099 {
1100 uth->uu_workq_flags &= ~(UT_WORKQ_OVERCOMMIT | UT_WORKQ_COOPERATIVE);
1101 uth->uu_workq_flags |= flags;
1102 }
1103
1104
1105 #define WORKQ_UNPARK_FOR_DEATH_WAS_IDLE 0x1
1106
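/*
 * Final teardown path for a dying thread: clamp its priority to the cleanup
 * QoS, pull it off whichever idle list it was on when
 * WORKQ_UNPARK_FOR_DEATH_WAS_IDLE is set, and hand it back to userspace with
 * WQ_SETUP_EXIT_THREAD so it unwinds and exits. Never returns.
 */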
1107 __attribute__((noreturn, noinline))
1108 static void
1109 workq_unpark_for_death_and_unlock(proc_t p, struct workqueue *wq,
1110 struct uthread *uth, uint32_t death_flags, uint32_t setup_flags)
1111 {
1112 thread_qos_t qos = workq_pri_override(uth->uu_workq_pri);
1113 bool first_use = uth->uu_workq_flags & UT_WORKQ_NEW;
1114
1115 if (qos > WORKQ_THREAD_QOS_CLEANUP) {
1116 workq_thread_reset_pri(wq, uth, NULL, /*unpark*/ true);
1117 qos = WORKQ_THREAD_QOS_CLEANUP;
1118 }
1119
1120 workq_thread_reset_cpupercent(NULL, uth);
1121
1122 if (death_flags & WORKQ_UNPARK_FOR_DEATH_WAS_IDLE) {
1123 wq->wq_thidlecount--;
1124 if (first_use) {
1125 TAILQ_REMOVE(&wq->wq_thnewlist, uth, uu_workq_entry);
1126 } else {
1127 TAILQ_REMOVE(&wq->wq_thidlelist, uth, uu_workq_entry);
1128 }
1129 }
1130 TAILQ_INSERT_TAIL(&wq->wq_thrunlist, uth, uu_workq_entry);
1131
1132 workq_unlock(wq);
1133
1134 if (setup_flags & WQ_SETUP_CLEAR_VOUCHER) {
1135 __assert_only kern_return_t kr;
1136 kr = thread_set_voucher_name(MACH_PORT_NULL);
1137 assert(kr == KERN_SUCCESS);
1138 }
1139
1140 uint32_t flags = WQ_FLAG_THREAD_NEWSPI | qos | WQ_FLAG_THREAD_PRIO_QOS;
1141 thread_t th = get_machthread(uth);
1142 vm_map_t vmap = get_task_map(p->task);
1143
1144 if (!first_use) {
1145 flags |= WQ_FLAG_THREAD_REUSE;
1146 }
1147
1148 pthread_functions->workq_setup_thread(p, th, vmap, uth->uu_workq_stackaddr,
1149 uth->uu_workq_thport, 0, WQ_SETUP_EXIT_THREAD, flags);
1150 __builtin_unreachable();
1151 }
1152
1153 bool
1154 workq_is_current_thread_updating_turnstile(struct workqueue *wq)
1155 {
1156 return wq->wq_turnstile_updater == current_thread();
1157 }
1158
1159 __attribute__((always_inline))
1160 static inline void
1161 workq_perform_turnstile_operation_locked(struct workqueue *wq,
1162 void (^operation)(void))
1163 {
1164 workq_lock_held(wq);
1165 wq->wq_turnstile_updater = current_thread();
1166 operation();
1167 wq->wq_turnstile_updater = THREAD_NULL;
1168 }
1169
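/*
 * Updates the workqueue turnstile's inheritor if it changed, performing the
 * turnstile operation under the workq lock so that
 * workq_is_current_thread_updating_turnstile() can detect the recursion.
 */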
1170 static void
1171 workq_turnstile_update_inheritor(struct workqueue *wq,
1172 turnstile_inheritor_t inheritor,
1173 turnstile_update_flags_t flags)
1174 {
1175 if (wq->wq_inheritor == inheritor) {
1176 return;
1177 }
1178 wq->wq_inheritor = inheritor;
1179 workq_perform_turnstile_operation_locked(wq, ^{
1180 turnstile_update_inheritor(wq->wq_turnstile, inheritor,
1181 flags | TURNSTILE_IMMEDIATE_UPDATE);
1182 turnstile_update_inheritor_complete(wq->wq_turnstile,
1183 TURNSTILE_INTERLOCK_HELD);
1184 });
1185 }
1186
1187 static void
1188 workq_push_idle_thread(proc_t p, struct workqueue *wq, struct uthread *uth,
1189 uint32_t setup_flags)
1190 {
1191 uint64_t now = mach_absolute_time();
1192 bool is_creator = (uth == wq->wq_creator);
1193
1194 if (workq_thread_is_cooperative(uth)) {
1195 assert(!is_creator);
1196
1197 thread_qos_t thread_qos = uth->uu_workq_pri.qos_bucket;
1198 _wq_cooperative_queue_scheduled_count_dec(wq, thread_qos);
1199
1200 /* Before we get here, we always go through
1201 * workq_select_threadreq_or_park_and_unlock. If we got here, it means
1202 * that we went through the logic in workq_threadreq_select which
1203 * did the refresh for the next best cooperative qos while
1204 * excluding the current thread - we shouldn't need to do it again.
1205 */
1206 assert(_wq_cooperative_queue_refresh_best_req_qos(wq) == false);
1207 } else if (workq_thread_is_nonovercommit(uth)) {
1208 assert(!is_creator);
1209
1210 wq->wq_constrained_threads_scheduled--;
1211 }
1212
1213 uth->uu_workq_flags &= ~(UT_WORKQ_RUNNING | UT_WORKQ_OVERCOMMIT | UT_WORKQ_COOPERATIVE);
1214 TAILQ_REMOVE(&wq->wq_thrunlist, uth, uu_workq_entry);
1215 wq->wq_threads_scheduled--;
1216
1217 if (is_creator) {
1218 wq->wq_creator = NULL;
1219 WQ_TRACE_WQ(TRACE_wq_creator_select, wq, 3, 0,
1220 uth->uu_save.uus_workq_park_data.yields);
1221 }
1222
1223 if (wq->wq_inheritor == get_machthread(uth)) {
1224 assert(wq->wq_creator == NULL);
1225 if (wq->wq_reqcount) {
1226 workq_turnstile_update_inheritor(wq, wq, TURNSTILE_INHERITOR_WORKQ);
1227 } else {
1228 workq_turnstile_update_inheritor(wq, TURNSTILE_INHERITOR_NULL, 0);
1229 }
1230 }
1231
1232 if (uth->uu_workq_flags & UT_WORKQ_NEW) {
1233 assert(is_creator || (_wq_flags(wq) & WQ_EXITING));
1234 TAILQ_INSERT_TAIL(&wq->wq_thnewlist, uth, uu_workq_entry);
1235 wq->wq_thidlecount++;
1236 return;
1237 }
1238
1239 if (!is_creator) {
1240 _wq_thactive_dec(wq, uth->uu_workq_pri.qos_bucket);
1241 wq->wq_thscheduled_count[_wq_bucket(uth->uu_workq_pri.qos_bucket)]--;
1242 uth->uu_workq_flags |= UT_WORKQ_IDLE_CLEANUP;
1243 }
1244
1245 uth->uu_save.uus_workq_park_data.idle_stamp = now;
1246
1247 struct uthread *oldest = workq_oldest_killable_idle_thread(wq);
1248 uint16_t cur_idle = wq->wq_thidlecount;
1249
1250 if (cur_idle >= wq_max_constrained_threads ||
1251 (wq->wq_thdying_count == 0 && oldest &&
1252 workq_should_kill_idle_thread(wq, oldest, now))) {
1253 /*
1254 		 * Immediately kill threads if we have too many of them.
1255 		 *
1256 		 * And swap "place" with the oldest one we'd have woken up.
1257 		 * This is a relatively desperate situation where we really
1258 		 * need to kill threads quickly, and it's better to kill
1259 		 * the one that's currently on core than to context switch.
1260 */
1261 if (oldest) {
1262 oldest->uu_save.uus_workq_park_data.idle_stamp = now;
1263 TAILQ_REMOVE(&wq->wq_thidlelist, oldest, uu_workq_entry);
1264 TAILQ_INSERT_HEAD(&wq->wq_thidlelist, oldest, uu_workq_entry);
1265 }
1266
1267 WQ_TRACE_WQ(TRACE_wq_thread_terminate | DBG_FUNC_START,
1268 wq, cur_idle, 0, 0);
1269 wq->wq_thdying_count++;
1270 uth->uu_workq_flags |= UT_WORKQ_DYING;
1271 uth->uu_workq_flags &= ~UT_WORKQ_IDLE_CLEANUP;
1272 workq_unpark_for_death_and_unlock(p, wq, uth, 0, setup_flags);
1273 __builtin_unreachable();
1274 }
1275
1276 struct uthread *tail = TAILQ_LAST(&wq->wq_thidlelist, workq_uthread_head);
1277
1278 cur_idle += 1;
1279 wq->wq_thidlecount = cur_idle;
1280
1281 if (cur_idle >= wq_death_max_load && tail &&
1282 tail->uu_save.uus_workq_park_data.has_stack) {
1283 uth->uu_save.uus_workq_park_data.has_stack = false;
1284 TAILQ_INSERT_TAIL(&wq->wq_thidlelist, uth, uu_workq_entry);
1285 } else {
1286 uth->uu_save.uus_workq_park_data.has_stack = true;
1287 TAILQ_INSERT_HEAD(&wq->wq_thidlelist, uth, uu_workq_entry);
1288 }
1289
1290 if (!tail) {
1291 uint64_t delay = workq_kill_delay_for_idle_thread(wq);
1292 workq_death_call_schedule(wq, now + delay);
1293 }
1294 }
1295
1296 #pragma mark thread requests
1297
1298 static inline bool
1299 workq_tr_is_overcommit(workq_tr_flags_t tr_flags)
1300 {
1301 return (tr_flags & WORKQ_TR_FLAG_OVERCOMMIT) != 0;
1302 }
1303
1304 static inline bool
1305 workq_tr_is_nonovercommit(workq_tr_flags_t tr_flags)
1306 {
1307 return (tr_flags & (WORKQ_TR_FLAG_OVERCOMMIT | WORKQ_TR_FLAG_COOPERATIVE)) == 0;
1308 }
1309
1310 static inline bool
1311 workq_tr_is_cooperative(workq_tr_flags_t tr_flags)
1312 {
1313 return (tr_flags & WORKQ_TR_FLAG_COOPERATIVE) != 0;
1314 }
1315
1316 #define workq_threadreq_is_overcommit(req) workq_tr_is_overcommit((req)->tr_flags)
1317 #define workq_threadreq_is_nonovercommit(req) workq_tr_is_nonovercommit((req)->tr_flags)
1318 #define workq_threadreq_is_cooperative(req) workq_tr_is_cooperative((req)->tr_flags)
1319
1320 static inline int
1321 workq_priority_for_req(workq_threadreq_t req)
1322 {
1323 thread_qos_t qos = req->tr_qos;
1324
1325 if (req->tr_flags & WORKQ_TR_FLAG_WL_OUTSIDE_QOS) {
1326 workq_threadreq_param_t trp = kqueue_threadreq_workloop_param(req);
1327 assert(trp.trp_flags & TRP_PRIORITY);
1328 return trp.trp_pri;
1329 }
1330 return thread_workq_pri_for_qos(qos);
1331 }
1332
1333 static inline struct priority_queue_sched_max *
1334 workq_priority_queue_for_req(struct workqueue *wq, workq_threadreq_t req)
1335 {
1336 assert(!workq_tr_is_cooperative(req->tr_flags));
1337
1338 if (req->tr_flags & WORKQ_TR_FLAG_WL_OUTSIDE_QOS) {
1339 return &wq->wq_special_queue;
1340 } else if (workq_tr_is_overcommit(req->tr_flags)) {
1341 return &wq->wq_overcommit_queue;
1342 } else {
1343 return &wq->wq_constrained_queue;
1344 }
1345 }
1346
1347
1348 /* Calculates the number of threads scheduled >= the input QoS */
1349 static uint64_t
1350 workq_num_cooperative_threads_scheduled_to_qos(struct workqueue *wq, thread_qos_t qos)
1351 {
1352 workq_lock_held(wq);
1353
1354 uint64_t num_cooperative_threads = 0;
1355
1356 for (thread_qos_t cur_qos = WORKQ_THREAD_QOS_MAX; cur_qos >= qos; cur_qos--) {
1357 uint8_t bucket = _wq_bucket(cur_qos);
1358 num_cooperative_threads += wq->wq_cooperative_queue_scheduled_count[bucket];
1359 }
1360
1361 return num_cooperative_threads;
1362 }
1363
1364 static uint64_t
1365 workq_num_cooperative_threads_scheduled_total(struct workqueue *wq)
1366 {
1367 return workq_num_cooperative_threads_scheduled_to_qos(wq, WORKQ_THREAD_QOS_MIN);
1368 }
1369
1370 #if DEBUG || DEVELOPMENT
1371 static bool
1372 workq_has_cooperative_thread_requests(struct workqueue *wq)
1373 {
1374 for (thread_qos_t qos = WORKQ_THREAD_QOS_MAX; qos >= WORKQ_THREAD_QOS_MIN; qos--) {
1375 uint8_t bucket = _wq_bucket(qos);
1376 if (!STAILQ_EMPTY(&wq->wq_cooperative_queue[bucket])) {
1377 return true;
1378 }
1379 }
1380
1381 return false;
1382 }
1383 #endif
1384
1385 /*
1386  * Determines the next QoS bucket we should service in the cooperative
1387  * pool. This function will always return a QoS for the cooperative pool as
1388  * long as there are requests to be serviced.
1389 *
1390 * Unlike the other thread pools, for the cooperative thread pool the schedule
1391 * counts for the various buckets in the pool affect the next best request for
1392 * it.
1393 *
1394 * This function is called in the following contexts:
1395 *
1396 * a) When determining the best thread QoS for cooperative bucket for the
1397 * creator/thread reuse
1398 *
1399 * b) Once (a) has happened and thread has bound to a thread request, figuring
1400 * out whether the next best request for this pool has changed so that creator
1401 * can be scheduled.
1402 *
1403 * Returns true if the cooperative queue's best qos changed from previous
1404 * value.
1405 */
1406 static bool
1407 _wq_cooperative_queue_refresh_best_req_qos(struct workqueue *wq)
1408 {
1409 workq_lock_held(wq);
1410
1411 thread_qos_t old_best_req_qos = wq->wq_cooperative_queue_best_req_qos;
1412
1413 /* We determine the next best cooperative thread request based on the
1414 * following:
1415 *
1416 * 1. Take the MAX of the following:
1417 * a) Highest qos with pending TRs such that number of scheduled
1418 * threads so far with >= qos is < wq_max_cooperative_threads
1419 * b) Highest qos bucket with pending TRs but no scheduled threads for that bucket
1420 *
1421 * 2. If the result of (1) is UN, then we pick the highest priority amongst
1422 * pending thread requests in the pool.
1423 *
1424 */
1425 thread_qos_t highest_qos_with_no_scheduled = THREAD_QOS_UNSPECIFIED;
1426 thread_qos_t highest_qos_req_with_width = THREAD_QOS_UNSPECIFIED;
1427
1428 thread_qos_t highest_qos_req = THREAD_QOS_UNSPECIFIED;
1429
1430 int scheduled_count_till_qos = 0;
1431
1432 for (thread_qos_t qos = WORKQ_THREAD_QOS_MAX; qos >= WORKQ_THREAD_QOS_MIN; qos--) {
1433 uint8_t bucket = _wq_bucket(qos);
1434 uint8_t scheduled_count_for_bucket = wq->wq_cooperative_queue_scheduled_count[bucket];
1435 scheduled_count_till_qos += scheduled_count_for_bucket;
1436
1437 if (!STAILQ_EMPTY(&wq->wq_cooperative_queue[bucket])) {
1438 if (qos > highest_qos_req) {
1439 highest_qos_req = qos;
1440 }
1441 /*
1442 * The pool isn't saturated for threads at and above this QoS, and
1443 * this qos bucket has pending requests
1444 */
1445 if (scheduled_count_till_qos < wq_cooperative_queue_max_size(wq)) {
1446 if (qos > highest_qos_req_with_width) {
1447 highest_qos_req_with_width = qos;
1448 }
1449 }
1450
1451 /*
1452 * There are no threads scheduled for this bucket but there
1453 * is work pending, give it at least 1 thread
1454 */
1455 if (scheduled_count_for_bucket == 0) {
1456 if (qos > highest_qos_with_no_scheduled) {
1457 highest_qos_with_no_scheduled = qos;
1458 }
1459 }
1460 }
1461 }
1462
1463 wq->wq_cooperative_queue_best_req_qos = MAX(highest_qos_with_no_scheduled, highest_qos_req_with_width);
1464 if (wq->wq_cooperative_queue_best_req_qos == THREAD_QOS_UNSPECIFIED) {
1465 wq->wq_cooperative_queue_best_req_qos = highest_qos_req;
1466 }
1467
1468 #if DEBUG || DEVELOPMENT
1469 	/* Assert that if we are reporting the next best req as UN, then there
1470 * actually is no thread request in the cooperative pool buckets */
1471 if (wq->wq_cooperative_queue_best_req_qos == THREAD_QOS_UNSPECIFIED) {
1472 assert(!workq_has_cooperative_thread_requests(wq));
1473 }
1474 #endif
1475
1476 return old_best_req_qos != wq->wq_cooperative_queue_best_req_qos;
1477 }
1478
1479 /*
1480 * Returns whether or not the input thread (or creator thread if uth is NULL)
1481 * should be allowed to work as part of the cooperative pool for the <input qos>
1482 * bucket.
1483 *
1484 * This function is called in a bunch of places:
1485 * a) Quantum expires for a thread and it is part of the cooperative pool
1486 * b) When trying to pick a thread request for the creator thread to
1487 * represent.
1488 * c) When a thread is trying to pick a thread request to actually bind to
1489 * and service.
1490 *
1491 * Called with workq lock held.
1492 */
1493
1494 #define WQ_COOPERATIVE_POOL_UNSATURATED 1
1495 #define WQ_COOPERATIVE_BUCKET_UNSERVICED 2
1496 #define WQ_COOPERATIVE_POOL_SATURATED_UP_TO_QOS 3
1497
1498 static bool
1499 workq_cooperative_allowance(struct workqueue *wq, thread_qos_t qos, struct uthread *uth,
1500 bool may_start_timer)
1501 {
1502 workq_lock_held(wq);
1503
1504 bool exclude_thread_as_scheduled = false;
1505 bool passed_admissions = false;
1506 uint8_t bucket = _wq_bucket(qos);
1507
1508 if (uth && workq_thread_is_cooperative(uth)) {
1509 exclude_thread_as_scheduled = true;
1510 _wq_cooperative_queue_scheduled_count_dec(wq, uth->uu_workq_pri.qos_bucket);
1511 }
1512
1513 /*
1514 * We have not saturated the pool yet, let this thread continue
1515 */
1516 uint64_t total_cooperative_threads;
1517 total_cooperative_threads = workq_num_cooperative_threads_scheduled_total(wq);
1518 if (total_cooperative_threads < wq_cooperative_queue_max_size(wq)) {
1519 passed_admissions = true;
1520 WQ_TRACE(TRACE_wq_cooperative_admission | DBG_FUNC_NONE,
1521 total_cooperative_threads, qos, passed_admissions,
1522 WQ_COOPERATIVE_POOL_UNSATURATED);
1523 goto out;
1524 }
1525
1526 /*
1527 * Without this thread, nothing is servicing the bucket which has pending
1528 * work
1529 */
1530 uint64_t bucket_scheduled = wq->wq_cooperative_queue_scheduled_count[bucket];
1531 if (bucket_scheduled == 0 &&
1532 !STAILQ_EMPTY(&wq->wq_cooperative_queue[bucket])) {
1533 passed_admissions = true;
1534 WQ_TRACE(TRACE_wq_cooperative_admission | DBG_FUNC_NONE,
1535 total_cooperative_threads, qos, passed_admissions,
1536 WQ_COOPERATIVE_BUCKET_UNSERVICED);
1537 goto out;
1538 }
1539
1540 /*
1541 * If number of threads at the QoS bucket >= input QoS exceeds the max we want
1542 * for the pool, deny this thread
1543 */
1544 uint64_t aggregate_down_to_qos = workq_num_cooperative_threads_scheduled_to_qos(wq, qos);
1545 passed_admissions = (aggregate_down_to_qos < wq_cooperative_queue_max_size(wq));
1546 WQ_TRACE(TRACE_wq_cooperative_admission | DBG_FUNC_NONE, aggregate_down_to_qos,
1547 qos, passed_admissions, WQ_COOPERATIVE_POOL_SATURATED_UP_TO_QOS);
1548
1549 if (!passed_admissions && may_start_timer) {
1550 workq_schedule_delayed_thread_creation(wq, 0);
1551 }
1552
1553 out:
1554 if (exclude_thread_as_scheduled) {
1555 _wq_cooperative_queue_scheduled_count_inc(wq, uth->uu_workq_pri.qos_bucket);
1556 }
1557 return passed_admissions;
1558 }
1559
1560 /*
1561 * returns true if the best request for the pool changed as a result of
1562 * enqueuing this thread request.
1563 */
1564 static bool
1565 workq_threadreq_enqueue(struct workqueue *wq, workq_threadreq_t req)
1566 {
1567 assert(req->tr_state == WORKQ_TR_STATE_NEW);
1568
1569 req->tr_state = WORKQ_TR_STATE_QUEUED;
1570 wq->wq_reqcount += req->tr_count;
1571
1572 if (req->tr_qos == WORKQ_THREAD_QOS_MANAGER) {
1573 assert(wq->wq_event_manager_threadreq == NULL);
1574 assert(req->tr_flags & WORKQ_TR_FLAG_KEVENT);
1575 assert(req->tr_count == 1);
1576 wq->wq_event_manager_threadreq = req;
1577 return true;
1578 }
1579
1580 if (workq_threadreq_is_cooperative(req)) {
1581 assert(req->tr_qos != WORKQ_THREAD_QOS_MANAGER);
1582 assert(req->tr_qos != WORKQ_THREAD_QOS_ABOVEUI);
1583
1584 struct workq_threadreq_tailq *bucket = &wq->wq_cooperative_queue[_wq_bucket(req->tr_qos)];
1585 STAILQ_INSERT_TAIL(bucket, req, tr_link);
1586
1587 return _wq_cooperative_queue_refresh_best_req_qos(wq);
1588 }
1589
1590 struct priority_queue_sched_max *q = workq_priority_queue_for_req(wq, req);
1591
1592 priority_queue_entry_set_sched_pri(q, &req->tr_entry,
1593 workq_priority_for_req(req), false);
1594
1595 if (priority_queue_insert(q, &req->tr_entry)) {
1596 if (workq_threadreq_is_nonovercommit(req)) {
1597 _wq_thactive_refresh_best_constrained_req_qos(wq);
1598 }
1599 return true;
1600 }
1601 return false;
1602 }
1603
1604 /*
1605 * returns true if one of the following is true (so as to update creator if
1606 * needed):
1607 *
1608 * (a) the next highest request of the pool we dequeued the request from changed
1609  * (b) the next highest request of the pool the current thread used to be a
1610  *     part of changed
1611  *
1612  * For the overcommit, special and constrained pools, the next highest QoS for
1613  * each pool is just a MAX of pending requests, so tracking (a) is sufficient.
1614  *
1615  * But for the cooperative thread pool, the next highest QoS for the pool depends
1616  * on schedule counts in the pool as well. So if the current thread used to be
1617  * cooperative in its previous logical run, i.e. (b), then that can also affect
1618  * the cooperative pool's next best QoS requests.
1619 */
1620 static bool
1621 workq_threadreq_dequeue(struct workqueue *wq, workq_threadreq_t req,
1622 bool cooperative_sched_count_changed)
1623 {
1624 wq->wq_reqcount--;
1625
1626 bool next_highest_request_changed = false;
1627
1628 if (--req->tr_count == 0) {
1629 if (req->tr_qos == WORKQ_THREAD_QOS_MANAGER) {
1630 assert(wq->wq_event_manager_threadreq == req);
1631 assert(req->tr_count == 0);
1632 wq->wq_event_manager_threadreq = NULL;
1633
1634 /* If a cooperative thread was the one which picked up the manager
1635 * thread request, we need to reevaluate the cooperative pool
1636 * anyway.
1637 */
1638 if (cooperative_sched_count_changed) {
1639 _wq_cooperative_queue_refresh_best_req_qos(wq);
1640 }
1641 return true;
1642 }
1643
1644 if (workq_threadreq_is_cooperative(req)) {
1645 assert(req->tr_qos != WORKQ_THREAD_QOS_MANAGER);
1646 assert(req->tr_qos != WORKQ_THREAD_QOS_ABOVEUI);
1647 /* Account for the fact that BG and MT are coalesced when
1648 * calculating best request for cooperative pool
1649 */
1650 assert(_wq_bucket(req->tr_qos) == _wq_bucket(wq->wq_cooperative_queue_best_req_qos));
1651
1652 struct workq_threadreq_tailq *bucket = &wq->wq_cooperative_queue[_wq_bucket(req->tr_qos)];
1653 __assert_only workq_threadreq_t head = STAILQ_FIRST(bucket);
1654
1655 assert(head == req);
1656 STAILQ_REMOVE_HEAD(bucket, tr_link);
1657
1658 /*
1659 * If the request we're dequeueing is cooperative, then the sched
1660 * counts definitely changed.
1661 */
1662 assert(cooperative_sched_count_changed);
1663 }
1664
1665 /*
1666 * We want to do the cooperative pool refresh after dequeueing a
1667 * cooperative thread request if any (to combine both effects into 1
1668 * refresh operation)
1669 */
1670 if (cooperative_sched_count_changed) {
1671 next_highest_request_changed = _wq_cooperative_queue_refresh_best_req_qos(wq);
1672 }
1673
1674 if (!workq_threadreq_is_cooperative(req)) {
1675 /*
1676 * All other types of requests are enqueued in priority queues
1677 */
1678
1679 if (priority_queue_remove(workq_priority_queue_for_req(wq, req),
1680 &req->tr_entry)) {
1681 next_highest_request_changed |= true;
1682 if (workq_threadreq_is_nonovercommit(req)) {
1683 _wq_thactive_refresh_best_constrained_req_qos(wq);
1684 }
1685 }
1686 }
1687 }
1688
1689 return next_highest_request_changed;
1690 }
1691
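/*
 * Tear down a thread request that will never be fulfilled. Kevent and
 * workloop requests are owned by their kqueue, so cancellation is delegated
 * to kqueue_threadreq_cancel(); plain workqueue requests are simply returned
 * to the threadreq zone.
 */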
1692 static void
1693 workq_threadreq_destroy(proc_t p, workq_threadreq_t req)
1694 {
1695 req->tr_state = WORKQ_TR_STATE_CANCELED;
1696 if (req->tr_flags & (WORKQ_TR_FLAG_WORKLOOP | WORKQ_TR_FLAG_KEVENT)) {
1697 kqueue_threadreq_cancel(p, req);
1698 } else {
1699 zfree(workq_zone_threadreq, req);
1700 }
1701 }
1702
1703 #pragma mark workqueue thread creation thread calls
1704
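/*
 * Atomically decide whether a thread-creation thread call may be armed.
 * Fails if the workqueue is exiting, if the call is already scheduled or
 * pending, or if any flag in fail_mask is set. If the process is currently
 * suspended, only the "pend" flag is recorded so the call can be replayed
 * from workq_proc_resumed(); otherwise the "sched" flag is set and the
 * caller is expected to arm the corresponding thread call.
 */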
1705 static inline bool
1706 workq_thread_call_prepost(struct workqueue *wq, uint32_t sched, uint32_t pend,
1707 uint32_t fail_mask)
1708 {
1709 uint32_t old_flags, new_flags;
1710
1711 os_atomic_rmw_loop(&wq->wq_flags, old_flags, new_flags, acquire, {
1712 if (__improbable(old_flags & (WQ_EXITING | sched | pend | fail_mask))) {
1713 os_atomic_rmw_loop_give_up(return false);
1714 }
1715 if (__improbable(old_flags & WQ_PROC_SUSPENDED)) {
1716 new_flags = old_flags | pend;
1717 } else {
1718 new_flags = old_flags | sched;
1719 }
1720 });
1721
1722 return (old_flags & WQ_PROC_SUSPENDED) == 0;
1723 }
1724
1725 #define WORKQ_SCHEDULE_DELAYED_THREAD_CREATION_RESTART 0x1
1726
1727 static bool
1728 workq_schedule_delayed_thread_creation(struct workqueue *wq, int flags)
1729 {
1730 assert(!preemption_enabled());
1731
1732 if (!workq_thread_call_prepost(wq, WQ_DELAYED_CALL_SCHEDULED,
1733 WQ_DELAYED_CALL_PENDED, WQ_IMMEDIATE_CALL_PENDED |
1734 WQ_IMMEDIATE_CALL_SCHEDULED)) {
1735 return false;
1736 }
1737
1738 uint64_t now = mach_absolute_time();
1739
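/*
 * Adapt the creation window: if the previous thread call ran within the
 * current interval, back off by doubling it (capped at wq_max_timer_interval);
 * if it has been idle for more than twice the interval, shrink it back down
 * (floored at wq_stalled_window).
 */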
1740 if (flags & WORKQ_SCHEDULE_DELAYED_THREAD_CREATION_RESTART) {
1741 /* do not change the window */
1742 } else if (now - wq->wq_thread_call_last_run <= wq->wq_timer_interval) {
1743 wq->wq_timer_interval *= 2;
1744 if (wq->wq_timer_interval > wq_max_timer_interval.abstime) {
1745 wq->wq_timer_interval = (uint32_t)wq_max_timer_interval.abstime;
1746 }
1747 } else if (now - wq->wq_thread_call_last_run > 2 * wq->wq_timer_interval) {
1748 wq->wq_timer_interval /= 2;
1749 if (wq->wq_timer_interval < wq_stalled_window.abstime) {
1750 wq->wq_timer_interval = (uint32_t)wq_stalled_window.abstime;
1751 }
1752 }
1753
1754 WQ_TRACE_WQ(TRACE_wq_start_add_timer, wq, wq->wq_reqcount,
1755 _wq_flags(wq), wq->wq_timer_interval);
1756
1757 thread_call_t call = wq->wq_delayed_call;
1758 uintptr_t arg = WQ_DELAYED_CALL_SCHEDULED;
1759 uint64_t deadline = now + wq->wq_timer_interval;
1760 if (thread_call_enter1_delayed(call, (void *)arg, deadline)) {
1761 panic("delayed_call was already enqueued");
1762 }
1763 return true;
1764 }
1765
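/*
 * Arm the immediate (non-delayed) thread-creation call, unless the process is
 * suspended, in which case the request is pended and replayed by
 * workq_proc_resumed().
 */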
1766 static void
1767 workq_schedule_immediate_thread_creation(struct workqueue *wq)
1768 {
1769 assert(!preemption_enabled());
1770
1771 if (workq_thread_call_prepost(wq, WQ_IMMEDIATE_CALL_SCHEDULED,
1772 WQ_IMMEDIATE_CALL_PENDED, 0)) {
1773 WQ_TRACE_WQ(TRACE_wq_start_add_timer, wq, wq->wq_reqcount,
1774 _wq_flags(wq), 0);
1775
1776 uintptr_t arg = WQ_IMMEDIATE_CALL_SCHEDULED;
1777 if (thread_call_enter1(wq->wq_immediate_call, (void *)arg)) {
1778 panic("immediate_call was already enqueued");
1779 }
1780 }
1781 }
1782
1783 void
1784 workq_proc_suspended(struct proc *p)
1785 {
1786 struct workqueue *wq = proc_get_wqptr(p);
1787
1788 if (wq) {
1789 os_atomic_or(&wq->wq_flags, WQ_PROC_SUSPENDED, relaxed);
1790 }
1791 }
1792
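/*
 * Called when the process is resumed: clear the suspended bit and replay any
 * thread-creation call that was pended while the process was suspended,
 * unless the workqueue is already exiting.
 */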
1793 void
1794 workq_proc_resumed(struct proc *p)
1795 {
1796 struct workqueue *wq = proc_get_wqptr(p);
1797 uint32_t wq_flags;
1798
1799 if (!wq) {
1800 return;
1801 }
1802
1803 wq_flags = os_atomic_andnot_orig(&wq->wq_flags, WQ_PROC_SUSPENDED |
1804 WQ_DELAYED_CALL_PENDED | WQ_IMMEDIATE_CALL_PENDED, relaxed);
1805 if ((wq_flags & WQ_EXITING) == 0) {
1806 disable_preemption();
1807 if (wq_flags & WQ_IMMEDIATE_CALL_PENDED) {
1808 workq_schedule_immediate_thread_creation(wq);
1809 } else if (wq_flags & WQ_DELAYED_CALL_PENDED) {
1810 workq_schedule_delayed_thread_creation(wq,
1811 WORKQ_SCHEDULE_DELAYED_THREAD_CREATION_RESTART);
1812 }
1813 enable_preemption();
1814 }
1815 }
1816
1817 /**
1818 * returns whether lastblocked_tsp is within wq_stalled_window usecs of now
1819 */
1820 static bool
1821 workq_thread_is_busy(uint64_t now, _Atomic uint64_t *lastblocked_tsp)
1822 {
1823 uint64_t lastblocked_ts = os_atomic_load_wide(lastblocked_tsp, relaxed);
1824 if (now <= lastblocked_ts) {
1825 /*
1826 * Because the update of the timestamp when a thread blocks
1827 * isn't serialized against us looking at it (i.e. we don't hold
1828 * the workq lock), it's possible to have a timestamp that matches
1829 * the current time or that even looks to be in the future relative
1830 * to when we grabbed the current time...
1831 *
1832 * Just treat this as a busy thread since it must have just blocked.
1833 */
1834 return true;
1835 }
1836 return (now - lastblocked_ts) < wq_stalled_window.abstime;
1837 }
1838
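/*
 * Thread-call handler shared by the delayed and immediate thread-creation
 * calls: record the run timestamp, clear our scheduling flag, and redrive the
 * creator so new threads can be brought up if still needed.
 */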
1839 static void
1840 workq_add_new_threads_call(void *_p, void *flags)
1841 {
1842 proc_t p = _p;
1843 struct workqueue *wq = proc_get_wqptr(p);
1844 uint32_t my_flag = (uint32_t)(uintptr_t)flags;
1845
1846 /*
1847 * workq_exit() will set the workqueue to NULL before
1848 * it cancels thread calls.
1849 */
1850 if (!wq) {
1851 return;
1852 }
1853
1854 assert((my_flag == WQ_DELAYED_CALL_SCHEDULED) ||
1855 (my_flag == WQ_IMMEDIATE_CALL_SCHEDULED));
1856
1857 WQ_TRACE_WQ(TRACE_wq_add_timer | DBG_FUNC_START, wq, _wq_flags(wq),
1858 wq->wq_nthreads, wq->wq_thidlecount);
1859
1860 workq_lock_spin(wq);
1861
1862 wq->wq_thread_call_last_run = mach_absolute_time();
1863 os_atomic_andnot(&wq->wq_flags, my_flag, release);
1864
1865 /* This can drop the workqueue lock, and take it again */
1866 workq_schedule_creator(p, wq, WORKQ_THREADREQ_CAN_CREATE_THREADS);
1867
1868 workq_unlock(wq);
1869
1870 WQ_TRACE_WQ(TRACE_wq_add_timer | DBG_FUNC_END, wq, 0,
1871 wq->wq_nthreads, wq->wq_thidlecount);
1872 }
1873
1874 #pragma mark thread state tracking
1875
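/*
 * Scheduler callout invoked when a workqueue thread blocks or unblocks.
 * Maintains the per-bucket active counts and, on block, may start the delayed
 * thread-creation timer if a pending constrained request could now be
 * admitted.
 */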
1876 static void
1877 workq_sched_callback(int type, thread_t thread)
1878 {
1879 thread_ro_t tro = get_thread_ro(thread);
1880 struct uthread *uth = get_bsdthread_info(thread);
1881 struct workqueue *wq = proc_get_wqptr(tro->tro_proc);
1882 thread_qos_t req_qos, qos = uth->uu_workq_pri.qos_bucket;
1883 wq_thactive_t old_thactive;
1884 bool start_timer = false;
1885
1886 if (qos == WORKQ_THREAD_QOS_MANAGER) {
1887 return;
1888 }
1889
1890 switch (type) {
1891 case SCHED_CALL_BLOCK:
1892 old_thactive = _wq_thactive_dec(wq, qos);
1893 req_qos = WQ_THACTIVE_BEST_CONSTRAINED_REQ_QOS(old_thactive);
1894
1895 /*
1896 * Remember the timestamp of the last thread that blocked in this
1897 * bucket; it is used by admission checks to ignore one thread
1898 * being inactive if this timestamp is recent enough.
1899 *
1900 * If we collide with another thread trying to update the
1901 * last_blocked (really unlikely since another thread would have to
1902 * get scheduled and then block after we start down this path), it's
1903 * not a problem. Either timestamp is adequate, so no need to retry
1904 */
1905 os_atomic_store_wide(&wq->wq_lastblocked_ts[_wq_bucket(qos)],
1906 thread_last_run_time(thread), relaxed);
1907
1908 if (req_qos == THREAD_QOS_UNSPECIFIED) {
1909 /*
1910 * No pending request at the moment we could unblock, move on.
1911 */
1912 } else if (qos < req_qos) {
1913 /*
1914 * The blocking thread is at a lower QoS than the highest currently
1915 * pending constrained request, nothing has to be redriven
1916 */
1917 } else {
1918 uint32_t max_busycount, old_req_count;
1919 old_req_count = _wq_thactive_aggregate_downto_qos(wq, old_thactive,
1920 req_qos, NULL, &max_busycount);
1921 /*
1922 * If it is possible that may_start_constrained_thread had refused
1923 * admission due to being over the max concurrency, we may need to
1924 * spin up a new thread.
1925 *
1926 * We take into account the maximum number of busy threads
1927 * that can affect may_start_constrained_thread as looking at the
1928 * actual number may_start_constrained_thread will see is racy.
1929 *
1930 * IOW at NCPU = 4, for IN (req_qos = 1), if the old req count is
1931 * between NCPU (4) and NCPU - 2 (2) we need to redrive.
1932 */
1933 uint32_t conc = wq_max_parallelism[_wq_bucket(qos)];
1934 if (old_req_count <= conc && conc <= old_req_count + max_busycount) {
1935 start_timer = workq_schedule_delayed_thread_creation(wq, 0);
1936 }
1937 }
1938 if (__improbable(kdebug_enable)) {
1939 __unused uint32_t old = _wq_thactive_aggregate_downto_qos(wq,
1940 old_thactive, qos, NULL, NULL);
1941 WQ_TRACE_WQ(TRACE_wq_thread_block | DBG_FUNC_START, wq,
1942 old - 1, qos | (req_qos << 8),
1943 wq->wq_reqcount << 1 | start_timer);
1944 }
1945 break;
1946
1947 case SCHED_CALL_UNBLOCK:
1948 /*
1949 * we cannot take the workqueue_lock here...
1950 * an UNBLOCK can occur from a timer event which
1951 * is run from an interrupt context... if the workqueue_lock
1952 * is already held by this processor, we'll deadlock...
1953 * the thread lock for the thread being UNBLOCKED
1954 * is also held
1955 */
1956 old_thactive = _wq_thactive_inc(wq, qos);
1957 if (__improbable(kdebug_enable)) {
1958 __unused uint32_t old = _wq_thactive_aggregate_downto_qos(wq,
1959 old_thactive, qos, NULL, NULL);
1960 req_qos = WQ_THACTIVE_BEST_CONSTRAINED_REQ_QOS(old_thactive);
1961 WQ_TRACE_WQ(TRACE_wq_thread_block | DBG_FUNC_END, wq,
1962 old + 1, qos | (req_qos << 8),
1963 wq->wq_threads_scheduled);
1964 }
1965 break;
1966 }
1967 }
1968
1969 #pragma mark workq lifecycle
1970
1971 void
1972 workq_reference(struct workqueue *wq)
1973 {
1974 os_ref_retain(&wq->wq_refcnt);
1975 }
1976
1977 static void
1978 workq_deallocate_queue_invoke(mpsc_queue_chain_t e,
1979 __assert_only mpsc_daemon_queue_t dq)
1980 {
1981 struct workqueue *wq;
1982 struct turnstile *ts;
1983
1984 wq = mpsc_queue_element(e, struct workqueue, wq_destroy_link);
1985 assert(dq == &workq_deallocate_queue);
1986
1987 turnstile_complete((uintptr_t)wq, &wq->wq_turnstile, &ts, TURNSTILE_WORKQS);
1988 assert(ts);
1989 turnstile_cleanup();
1990 turnstile_deallocate(ts);
1991
1992 lck_ticket_destroy(&wq->wq_lock, &workq_lck_grp);
1993 zfree(workq_zone_workqueue, wq);
1994 }
1995
1996 static void
1997 workq_deallocate(struct workqueue *wq)
1998 {
1999 if (os_ref_release_relaxed(&wq->wq_refcnt) == 0) {
2000 workq_deallocate_queue_invoke(&wq->wq_destroy_link,
2001 &workq_deallocate_queue);
2002 }
2003 }
2004
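/*
 * Like workq_deallocate(), but the final release defers the teardown to the
 * deallocation daemon queue rather than performing it inline (for callers
 * running in contexts where that teardown would be unsafe).
 */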
2005 void
2006 workq_deallocate_safe(struct workqueue *wq)
2007 {
2008 if (__improbable(os_ref_release_relaxed(&wq->wq_refcnt) == 0)) {
2009 mpsc_daemon_enqueue(&workq_deallocate_queue, &wq->wq_destroy_link,
2010 MPSC_QUEUE_DISABLE_PREEMPTION);
2011 }
2012 }
2013
2014 /**
2015 * Set up per-process state for the workqueue.
2016 */
2017 int
2018 workq_open(struct proc *p, __unused struct workq_open_args *uap,
2019 __unused int32_t *retval)
2020 {
2021 struct workqueue *wq;
2022 int error = 0;
2023
2024 if ((p->p_lflag & P_LREGISTER) == 0) {
2025 return EINVAL;
2026 }
2027
2028 if (wq_init_constrained_limit) {
2029 uint32_t limit, num_cpus = ml_wait_max_cpus();
2030
2031 /*
2032 * set up the limit for the constrained pool
2033 * this is a virtual pool in that we don't
2034 * maintain it on a separate idle and run list
2035 */
2036 limit = num_cpus * WORKQUEUE_CONSTRAINED_FACTOR;
2037
2038 if (limit > wq_max_constrained_threads) {
2039 wq_max_constrained_threads = limit;
2040 }
2041
2042 if (wq_max_threads > WQ_THACTIVE_BUCKET_HALF) {
2043 wq_max_threads = WQ_THACTIVE_BUCKET_HALF;
2044 }
2045 if (wq_max_threads > CONFIG_THREAD_MAX - 20) {
2046 wq_max_threads = CONFIG_THREAD_MAX - 20;
2047 }
2048
2049 wq_death_max_load = (uint16_t)fls(num_cpus) + 1;
2050
2051 for (thread_qos_t qos = WORKQ_THREAD_QOS_MIN; qos <= WORKQ_THREAD_QOS_MAX; qos++) {
2052 wq_max_parallelism[_wq_bucket(qos)] =
2053 qos_max_parallelism(qos, QOS_PARALLELISM_COUNT_LOGICAL);
2054 }
2055
2056 wq_max_cooperative_threads = num_cpus;
2057
2058 wq_init_constrained_limit = 0;
2059 }
2060
2061 if (proc_get_wqptr(p) == NULL) {
2062 if (proc_init_wqptr_or_wait(p) == FALSE) {
2063 assert(proc_get_wqptr(p) != NULL);
2064 goto out;
2065 }
2066
2067 wq = zalloc_flags(workq_zone_workqueue, Z_WAITOK | Z_ZERO);
2068
2069 os_ref_init_count(&wq->wq_refcnt, &workq_refgrp, 1);
2070
2071 // Start the event manager at the priority hinted at by the policy engine
2072 thread_qos_t mgr_priority_hint = task_get_default_manager_qos(current_task());
2073 pthread_priority_t pp = _pthread_priority_make_from_thread_qos(mgr_priority_hint, 0, 0);
2074 wq->wq_event_manager_priority = (uint32_t)pp;
2075 wq->wq_timer_interval = (uint32_t)wq_stalled_window.abstime;
2076 wq->wq_proc = p;
2077 turnstile_prepare((uintptr_t)wq, &wq->wq_turnstile, turnstile_alloc(),
2078 TURNSTILE_WORKQS);
2079
2080 TAILQ_INIT(&wq->wq_thrunlist);
2081 TAILQ_INIT(&wq->wq_thnewlist);
2082 TAILQ_INIT(&wq->wq_thidlelist);
2083 priority_queue_init(&wq->wq_overcommit_queue);
2084 priority_queue_init(&wq->wq_constrained_queue);
2085 priority_queue_init(&wq->wq_special_queue);
2086 for (int bucket = 0; bucket < WORKQ_NUM_QOS_BUCKETS; bucket++) {
2087 STAILQ_INIT(&wq->wq_cooperative_queue[bucket]);
2088 }
2089
2090 /* We are only using the delayed thread call for the constrained pool
2091 * which can't have work at >= UI QoS and so we can be fine with a
2092 * UI QoS thread call.
2093 */
2094 wq->wq_delayed_call = thread_call_allocate_with_qos(
2095 workq_add_new_threads_call, p, THREAD_QOS_USER_INTERACTIVE,
2096 THREAD_CALL_OPTIONS_ONCE);
2097 wq->wq_immediate_call = thread_call_allocate_with_options(
2098 workq_add_new_threads_call, p, THREAD_CALL_PRIORITY_KERNEL,
2099 THREAD_CALL_OPTIONS_ONCE);
2100 wq->wq_death_call = thread_call_allocate_with_options(
2101 workq_kill_old_threads_call, wq,
2102 THREAD_CALL_PRIORITY_USER, THREAD_CALL_OPTIONS_ONCE);
2103
2104 lck_ticket_init(&wq->wq_lock, &workq_lck_grp);
2105
2106 WQ_TRACE_WQ(TRACE_wq_create | DBG_FUNC_NONE, wq,
2107 VM_KERNEL_ADDRHIDE(wq), 0, 0);
2108 proc_set_wqptr(p, wq);
2109 }
2110 out:
2111
2112 return error;
2113 }
2114
2115 /*
2116 * Routine: workq_mark_exiting
2117 *
2118 * Function: Mark the work queue such that new threads will not be added to the
2119 * work queue after we return.
2120 *
2121 * Conditions: Called against the current process.
2122 */
2123 void
2124 workq_mark_exiting(struct proc *p)
2125 {
2126 struct workqueue *wq = proc_get_wqptr(p);
2127 uint32_t wq_flags;
2128 workq_threadreq_t mgr_req;
2129
2130 if (!wq) {
2131 return;
2132 }
2133
2134 WQ_TRACE_WQ(TRACE_wq_pthread_exit | DBG_FUNC_START, wq, 0, 0, 0);
2135
2136 workq_lock_spin(wq);
2137
2138 wq_flags = os_atomic_or_orig(&wq->wq_flags, WQ_EXITING, relaxed);
2139 if (__improbable(wq_flags & WQ_EXITING)) {
2140 panic("workq_mark_exiting called twice");
2141 }
2142
2143 /*
2144 * Opportunistically try to cancel thread calls that are likely in flight.
2145 * workq_exit() will do the proper cleanup.
2146 */
2147 if (wq_flags & WQ_IMMEDIATE_CALL_SCHEDULED) {
2148 thread_call_cancel(wq->wq_immediate_call);
2149 }
2150 if (wq_flags & WQ_DELAYED_CALL_SCHEDULED) {
2151 thread_call_cancel(wq->wq_delayed_call);
2152 }
2153 if (wq_flags & WQ_DEATH_CALL_SCHEDULED) {
2154 thread_call_cancel(wq->wq_death_call);
2155 }
2156
2157 mgr_req = wq->wq_event_manager_threadreq;
2158 wq->wq_event_manager_threadreq = NULL;
2159 wq->wq_reqcount = 0; /* workq_schedule_creator must not look at queues */
2160 wq->wq_creator = NULL;
2161 workq_turnstile_update_inheritor(wq, TURNSTILE_INHERITOR_NULL, 0);
2162
2163 workq_unlock(wq);
2164
2165 if (mgr_req) {
2166 kqueue_threadreq_cancel(p, mgr_req);
2167 }
2168 /*
2169 * No one touches the priority queues once WQ_EXITING is set.
2170 * It is hence safe to do the tear down without holding any lock.
2171 */
2172 priority_queue_destroy(&wq->wq_overcommit_queue,
2173 struct workq_threadreq_s, tr_entry, ^(workq_threadreq_t e){
2174 workq_threadreq_destroy(p, e);
2175 });
2176 priority_queue_destroy(&wq->wq_constrained_queue,
2177 struct workq_threadreq_s, tr_entry, ^(workq_threadreq_t e){
2178 workq_threadreq_destroy(p, e);
2179 });
2180 priority_queue_destroy(&wq->wq_special_queue,
2181 struct workq_threadreq_s, tr_entry, ^(workq_threadreq_t e){
2182 workq_threadreq_destroy(p, e);
2183 });
2184
2185 WQ_TRACE(TRACE_wq_pthread_exit | DBG_FUNC_END, 0, 0, 0, 0);
2186 }
2187
2188 /*
2189 * Routine: workq_exit
2190 *
2191 * Function: clean up the work queue structure(s) now that there are no threads
2192 * left running inside the work queue (except possibly current_thread).
2193 *
2194 * Conditions: Called by the last thread in the process.
2195 * Called against current process.
2196 */
2197 void
2198 workq_exit(struct proc *p)
2199 {
2200 struct workqueue *wq;
2201 struct uthread *uth, *tmp;
2202
2203 wq = os_atomic_xchg(&p->p_wqptr, NULL, relaxed);
2204 if (wq != NULL) {
2205 thread_t th = current_thread();
2206
2207 WQ_TRACE_WQ(TRACE_wq_workqueue_exit | DBG_FUNC_START, wq, 0, 0, 0);
2208
2209 if (thread_get_tag(th) & THREAD_TAG_WORKQUEUE) {
2210 /*
2211 * <rdar://problem/40111515> Make sure we will no longer call the
2212 * sched call, if we ever block this thread, which the cancel_wait
2213 * below can do.
2214 */
2215 thread_sched_call(th, NULL);
2216 }
2217
2218 /*
2219 * Thread calls are always scheduled by the proc itself or under the
2220 * workqueue spinlock if WQ_EXITING is not yet set.
2221 *
2222 * Either way, when this runs, the proc has no threads left beside
2223 * the one running this very code, so we know no thread call can be
2224 * dispatched anymore.
2225 */
2226 thread_call_cancel_wait(wq->wq_delayed_call);
2227 thread_call_cancel_wait(wq->wq_immediate_call);
2228 thread_call_cancel_wait(wq->wq_death_call);
2229 thread_call_free(wq->wq_delayed_call);
2230 thread_call_free(wq->wq_immediate_call);
2231 thread_call_free(wq->wq_death_call);
2232
2233 /*
2234 * Clean up workqueue data structures for threads that exited and
2235 * didn't get a chance to clean up after themselves.
2236 *
2237 * idle/new threads should have been interrupted and died on their own
2238 */
2239 TAILQ_FOREACH_SAFE(uth, &wq->wq_thrunlist, uu_workq_entry, tmp) {
2240 thread_t mth = get_machthread(uth);
2241 thread_sched_call(mth, NULL);
2242 thread_deallocate(mth);
2243 }
2244 assert(TAILQ_EMPTY(&wq->wq_thnewlist));
2245 assert(TAILQ_EMPTY(&wq->wq_thidlelist));
2246
2247 WQ_TRACE_WQ(TRACE_wq_destroy | DBG_FUNC_END, wq,
2248 VM_KERNEL_ADDRHIDE(wq), 0, 0);
2249
2250 workq_deallocate(wq);
2251
2252 WQ_TRACE(TRACE_wq_workqueue_exit | DBG_FUNC_END, 0, 0, 0, 0);
2253 }
2254 }
2255
2256
2257 #pragma mark bsd thread control
2258
2259 bool
2260 bsdthread_part_of_cooperative_workqueue(struct uthread *uth)
2261 {
2262 return (workq_thread_is_cooperative(uth) || workq_thread_is_nonovercommit(uth)) &&
2263 (uth->uu_workq_pri.qos_bucket != WORKQ_THREAD_QOS_MANAGER);
2264 }
2265
2266 static bool
2267 _pthread_priority_to_policy(pthread_priority_t priority,
2268 thread_qos_policy_data_t *data)
2269 {
2270 data->qos_tier = _pthread_priority_thread_qos(priority);
2271 data->tier_importance = _pthread_priority_relpri(priority);
2272 if (data->qos_tier == THREAD_QOS_UNSPECIFIED || data->tier_importance > 0 ||
2273 data->tier_importance < THREAD_QOS_MIN_TIER_IMPORTANCE) {
2274 return false;
2275 }
2276 return true;
2277 }
2278
2279 static int
2280 bsdthread_set_self(proc_t p, thread_t th, pthread_priority_t priority,
2281 mach_port_name_t voucher, enum workq_set_self_flags flags)
2282 {
2283 struct uthread *uth = get_bsdthread_info(th);
2284 struct workqueue *wq = proc_get_wqptr(p);
2285
2286 kern_return_t kr;
2287 int unbind_rv = 0, qos_rv = 0, voucher_rv = 0, fixedpri_rv = 0;
2288 bool is_wq_thread = (thread_get_tag(th) & THREAD_TAG_WORKQUEUE);
2289
2290 if (flags & WORKQ_SET_SELF_WQ_KEVENT_UNBIND) {
2291 if (!is_wq_thread) {
2292 unbind_rv = EINVAL;
2293 goto qos;
2294 }
2295
2296 if (uth->uu_workq_pri.qos_bucket == WORKQ_THREAD_QOS_MANAGER) {
2297 unbind_rv = EINVAL;
2298 goto qos;
2299 }
2300
2301 workq_threadreq_t kqr = uth->uu_kqr_bound;
2302 if (kqr == NULL) {
2303 unbind_rv = EALREADY;
2304 goto qos;
2305 }
2306
2307 if (kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
2308 unbind_rv = EINVAL;
2309 goto qos;
2310 }
2311
2312 kqueue_threadreq_unbind(p, kqr);
2313 }
2314
2315 qos:
2316 if (flags & WORKQ_SET_SELF_QOS_FLAG) {
2317 thread_qos_policy_data_t new_policy;
2318
2319 if (!_pthread_priority_to_policy(priority, &new_policy)) {
2320 qos_rv = EINVAL;
2321 goto voucher;
2322 }
2323
2324 if (!is_wq_thread) {
2325 /*
2326 * Threads opted out of QoS can't change QoS
2327 */
2328 if (!thread_has_qos_policy(th)) {
2329 qos_rv = EPERM;
2330 goto voucher;
2331 }
2332 } else if (uth->uu_workq_pri.qos_bucket == WORKQ_THREAD_QOS_MANAGER ||
2333 uth->uu_workq_pri.qos_bucket == WORKQ_THREAD_QOS_ABOVEUI) {
2334 /*
2335 * Workqueue manager threads or threads above UI can't change QoS
2336 */
2337 qos_rv = EINVAL;
2338 goto voucher;
2339 } else {
2340 /*
2341 * For workqueue threads, possibly adjust buckets and redrive thread
2342 * requests.
2343 *
2344 * Transitions allowed:
2345 *
2346 * overcommit --> non-overcommit
2347 * overcommit --> overcommit
2348 * non-overcommit --> non-overcommit
2349 * non-overcommit --> overcommit (to be deprecated later)
2350 * cooperative --> cooperative
2351 *
2352 * All other transitions aren't allowed so reject them.
2353 */
2354 if (workq_thread_is_overcommit(uth) && _pthread_priority_is_cooperative(priority)) {
2355 qos_rv = EINVAL;
2356 goto voucher;
2357 } else if (workq_thread_is_cooperative(uth) && !_pthread_priority_is_cooperative(priority)) {
2358 qos_rv = EINVAL;
2359 goto voucher;
2360 } else if (workq_thread_is_nonovercommit(uth) && _pthread_priority_is_cooperative(priority)) {
2361 qos_rv = EINVAL;
2362 goto voucher;
2363 }
2364
2365 struct uu_workq_policy old_pri, new_pri;
2366 bool force_run = false;
2367
2368 workq_lock_spin(wq);
2369
2370 old_pri = new_pri = uth->uu_workq_pri;
2371 new_pri.qos_req = (thread_qos_t)new_policy.qos_tier;
2372
2373 /* Adjust schedule counts for various types of transitions */
2374
2375 /* overcommit -> non-overcommit */
2376 if (workq_thread_is_overcommit(uth) && _pthread_priority_is_nonovercommit(priority)) {
2377 workq_thread_set_type(uth, 0);
2378 wq->wq_constrained_threads_scheduled++;
2379
2380 /* non-overcommit -> overcommit */
2381 } else if (workq_thread_is_nonovercommit(uth) && _pthread_priority_is_overcommit(priority)) {
2382 workq_thread_set_type(uth, UT_WORKQ_OVERCOMMIT);
2383 force_run = (wq->wq_constrained_threads_scheduled-- == wq_max_constrained_threads);
2384
2385 /* cooperative -> cooperative */
2386 } else if (workq_thread_is_cooperative(uth)) {
2387 _wq_cooperative_queue_scheduled_count_dec(wq, old_pri.qos_bucket);
2388 _wq_cooperative_queue_scheduled_count_inc(wq, workq_pri_bucket(new_pri));
2389
2390 /* We're changing schedule counts within the cooperative pool, so we
2391 * need to refresh the best cooperative QoS logic again */
2392 force_run = _wq_cooperative_queue_refresh_best_req_qos(wq);
2393 }
2394
2395 /* This will also call schedule_creator if needed */
2396 workq_thread_update_bucket(p, wq, uth, old_pri, new_pri, force_run);
2397 workq_unlock(wq);
2398
2399 if (workq_thread_is_overcommit(uth)) {
2400 thread_disarm_workqueue_quantum(th);
2401 } else {
2402 /* If the thread changed QoS buckets, the quantum duration
2403 * may have changed too */
2404 thread_arm_workqueue_quantum(th);
2405 }
2406 }
2407
2408 kr = thread_policy_set_internal(th, THREAD_QOS_POLICY,
2409 (thread_policy_t)&new_policy, THREAD_QOS_POLICY_COUNT);
2410 if (kr != KERN_SUCCESS) {
2411 qos_rv = EINVAL;
2412 }
2413 }
2414
2415 voucher:
2416 if (flags & WORKQ_SET_SELF_VOUCHER_FLAG) {
2417 kr = thread_set_voucher_name(voucher);
2418 if (kr != KERN_SUCCESS) {
2419 voucher_rv = ENOENT;
2420 goto fixedpri;
2421 }
2422 }
2423
2424 fixedpri:
2425 if (qos_rv) {
2426 goto done;
2427 }
2428 if (flags & WORKQ_SET_SELF_FIXEDPRIORITY_FLAG) {
2429 thread_extended_policy_data_t extpol = {.timeshare = 0};
2430
2431 if (is_wq_thread) {
2432 /* Not allowed on workqueue threads */
2433 fixedpri_rv = ENOTSUP;
2434 goto done;
2435 }
2436
2437 kr = thread_policy_set_internal(th, THREAD_EXTENDED_POLICY,
2438 (thread_policy_t)&extpol, THREAD_EXTENDED_POLICY_COUNT);
2439 if (kr != KERN_SUCCESS) {
2440 fixedpri_rv = EINVAL;
2441 goto done;
2442 }
2443 } else if (flags & WORKQ_SET_SELF_TIMESHARE_FLAG) {
2444 thread_extended_policy_data_t extpol = {.timeshare = 1};
2445
2446 if (is_wq_thread) {
2447 /* Not allowed on workqueue threads */
2448 fixedpri_rv = ENOTSUP;
2449 goto done;
2450 }
2451
2452 kr = thread_policy_set_internal(th, THREAD_EXTENDED_POLICY,
2453 (thread_policy_t)&extpol, THREAD_EXTENDED_POLICY_COUNT);
2454 if (kr != KERN_SUCCESS) {
2455 fixedpri_rv = EINVAL;
2456 goto done;
2457 }
2458 }
2459
2460 done:
2461 if (qos_rv && voucher_rv) {
2462 /* Both failed, give that a unique error. */
2463 return EBADMSG;
2464 }
2465
2466 if (unbind_rv) {
2467 return unbind_rv;
2468 }
2469
2470 if (qos_rv) {
2471 return qos_rv;
2472 }
2473
2474 if (voucher_rv) {
2475 return voucher_rv;
2476 }
2477
2478 if (fixedpri_rv) {
2479 return fixedpri_rv;
2480 }
2481
2482
2483 return 0;
2484 }
2485
2486 static int
2487 bsdthread_add_explicit_override(proc_t p, mach_port_name_t kport,
2488 pthread_priority_t pp, user_addr_t resource)
2489 {
2490 thread_qos_t qos = _pthread_priority_thread_qos(pp);
2491 if (qos == THREAD_QOS_UNSPECIFIED) {
2492 return EINVAL;
2493 }
2494
2495 thread_t th = port_name_to_thread(kport,
2496 PORT_INTRANS_THREAD_IN_CURRENT_TASK);
2497 if (th == THREAD_NULL) {
2498 return ESRCH;
2499 }
2500
2501 int rv = proc_thread_qos_add_override(p->task, th, 0, qos, TRUE,
2502 resource, THREAD_QOS_OVERRIDE_TYPE_PTHREAD_EXPLICIT_OVERRIDE);
2503
2504 thread_deallocate(th);
2505 return rv;
2506 }
2507
2508 static int
2509 bsdthread_remove_explicit_override(proc_t p, mach_port_name_t kport,
2510 user_addr_t resource)
2511 {
2512 thread_t th = port_name_to_thread(kport,
2513 PORT_INTRANS_THREAD_IN_CURRENT_TASK);
2514 if (th == THREAD_NULL) {
2515 return ESRCH;
2516 }
2517
2518 int rv = proc_thread_qos_remove_override(p->task, th, 0, resource,
2519 THREAD_QOS_OVERRIDE_TYPE_PTHREAD_EXPLICIT_OVERRIDE);
2520
2521 thread_deallocate(th);
2522 return rv;
2523 }
2524
2525 static int
2526 workq_thread_add_dispatch_override(proc_t p, mach_port_name_t kport,
2527 pthread_priority_t pp, user_addr_t ulock_addr)
2528 {
2529 struct uu_workq_policy old_pri, new_pri;
2530 struct workqueue *wq = proc_get_wqptr(p);
2531
2532 thread_qos_t qos_override = _pthread_priority_thread_qos(pp);
2533 if (qos_override == THREAD_QOS_UNSPECIFIED) {
2534 return EINVAL;
2535 }
2536
2537 thread_t thread = port_name_to_thread(kport,
2538 PORT_INTRANS_THREAD_IN_CURRENT_TASK);
2539 if (thread == THREAD_NULL) {
2540 return ESRCH;
2541 }
2542
2543 struct uthread *uth = get_bsdthread_info(thread);
2544 if ((thread_get_tag(thread) & THREAD_TAG_WORKQUEUE) == 0) {
2545 thread_deallocate(thread);
2546 return EPERM;
2547 }
2548
2549 WQ_TRACE_WQ(TRACE_wq_override_dispatch | DBG_FUNC_NONE,
2550 wq, thread_tid(thread), 1, pp);
2551
2552 thread_mtx_lock(thread);
2553
2554 if (ulock_addr) {
2555 uint32_t val;
2556 int rc;
2557 /*
2558 * Workaround lack of explicit support for 'no-fault copyin'
2559 * <rdar://problem/24999882>, as disabling preemption prevents paging in
2560 */
2561 disable_preemption();
2562 rc = copyin_atomic32(ulock_addr, &val);
2563 enable_preemption();
2564 if (rc == 0 && ulock_owner_value_to_port_name(val) != kport) {
2565 goto out;
2566 }
2567 }
2568
2569 workq_lock_spin(wq);
2570
2571 old_pri = uth->uu_workq_pri;
2572 if (old_pri.qos_override >= qos_override) {
2573 /* Nothing to do */
2574 } else if (thread == current_thread()) {
2575 new_pri = old_pri;
2576 new_pri.qos_override = qos_override;
2577 workq_thread_update_bucket(p, wq, uth, old_pri, new_pri, false);
2578 } else {
2579 uth->uu_workq_pri.qos_override = qos_override;
2580 if (qos_override > workq_pri_override(old_pri)) {
2581 thread_set_workq_override(thread, qos_override);
2582 }
2583 }
2584
2585 workq_unlock(wq);
2586
2587 out:
2588 thread_mtx_unlock(thread);
2589 thread_deallocate(thread);
2590 return 0;
2591 }
2592
2593 static int
2594 workq_thread_reset_dispatch_override(proc_t p, thread_t thread)
2595 {
2596 struct uu_workq_policy old_pri, new_pri;
2597 struct workqueue *wq = proc_get_wqptr(p);
2598 struct uthread *uth = get_bsdthread_info(thread);
2599
2600 if ((thread_get_tag(thread) & THREAD_TAG_WORKQUEUE) == 0) {
2601 return EPERM;
2602 }
2603
2604 WQ_TRACE_WQ(TRACE_wq_override_reset | DBG_FUNC_NONE, wq, 0, 0, 0);
2605
2606 workq_lock_spin(wq);
2607 old_pri = new_pri = uth->uu_workq_pri;
2608 new_pri.qos_override = THREAD_QOS_UNSPECIFIED;
2609 workq_thread_update_bucket(p, wq, uth, old_pri, new_pri, false);
2610 workq_unlock(wq);
2611 return 0;
2612 }
2613
2614 static int
2615 workq_thread_allow_kill(__unused proc_t p, thread_t thread, bool enable)
2616 {
2617 if (!(thread_get_tag(thread) & THREAD_TAG_WORKQUEUE)) {
2618 // If the thread isn't a workqueue thread, don't set the
2619 // kill_allowed bit; however, we still need to return 0
2620 // instead of an error code since this code is executed
2621 // on the abort path which needs to not depend on the
2622 // pthread_t (returning an error depends on pthread_t via
2623 // cerror_nocancel)
2624 return 0;
2625 }
2626 struct uthread *uth = get_bsdthread_info(thread);
2627 uth->uu_workq_pthread_kill_allowed = enable;
2628 return 0;
2629 }
2630
2631 static int
2632 bsdthread_get_max_parallelism(thread_qos_t qos, unsigned long flags,
2633 int *retval)
2634 {
2635 static_assert(QOS_PARALLELISM_COUNT_LOGICAL ==
2636 _PTHREAD_QOS_PARALLELISM_COUNT_LOGICAL, "logical");
2637 static_assert(QOS_PARALLELISM_REALTIME ==
2638 _PTHREAD_QOS_PARALLELISM_REALTIME, "realtime");
2639 static_assert(QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE ==
2640 _PTHREAD_QOS_PARALLELISM_CLUSTER_SHARED_RSRC, "cluster shared resource");
2641
2642 if (flags & ~(QOS_PARALLELISM_REALTIME | QOS_PARALLELISM_COUNT_LOGICAL | QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE)) {
2643 return EINVAL;
2644 }
2645
2646 /* No units are present */
2647 if (flags & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) {
2648 return ENOTSUP;
2649 }
2650
2651 if (flags & QOS_PARALLELISM_REALTIME) {
2652 if (qos) {
2653 return EINVAL;
2654 }
2655 } else if (qos == THREAD_QOS_UNSPECIFIED || qos >= THREAD_QOS_LAST) {
2656 return EINVAL;
2657 }
2658
2659 *retval = qos_max_parallelism(qos, flags);
2660 return 0;
2661 }
2662
2663 static int
2664 bsdthread_dispatch_apply_attr(__unused struct proc *p, thread_t thread,
2665 unsigned long flags, uint64_t value1, __unused uint64_t value2)
2666 {
2667 uint32_t apply_worker_index;
2668 kern_return_t kr;
2669
2670 switch (flags) {
2671 case _PTHREAD_DISPATCH_APPLY_ATTR_CLUSTER_SHARED_RSRC_SET:
2672 apply_worker_index = (uint32_t)value1;
2673 kr = thread_shared_rsrc_policy_set(thread, apply_worker_index, CLUSTER_SHARED_RSRC_TYPE_RR, SHARED_RSRC_POLICY_AGENT_DISPATCH);
2674 /*
2675 * KERN_INVALID_POLICY indicates that the thread was trying to bind to a
2676 * cluster which it was not eligible to execute on.
2677 */
2678 return (kr == KERN_SUCCESS) ? 0 : ((kr == KERN_INVALID_POLICY) ? ENOTSUP : EINVAL);
2679 case _PTHREAD_DISPATCH_APPLY_ATTR_CLUSTER_SHARED_RSRC_CLEAR:
2680 kr = thread_shared_rsrc_policy_clear(thread, CLUSTER_SHARED_RSRC_TYPE_RR, SHARED_RSRC_POLICY_AGENT_DISPATCH);
2681 return (kr == KERN_SUCCESS) ? 0 : EINVAL;
2682 default:
2683 return EINVAL;
2684 }
2685 }
2686
2687 #define ENSURE_UNUSED(arg) \
2688 ({ if ((arg) != 0) { return EINVAL; } })
2689
2690 int
2691 bsdthread_ctl(struct proc *p, struct bsdthread_ctl_args *uap, int *retval)
2692 {
2693 switch (uap->cmd) {
2694 case BSDTHREAD_CTL_QOS_OVERRIDE_START:
2695 return bsdthread_add_explicit_override(p, (mach_port_name_t)uap->arg1,
2696 (pthread_priority_t)uap->arg2, uap->arg3);
2697 case BSDTHREAD_CTL_QOS_OVERRIDE_END:
2698 ENSURE_UNUSED(uap->arg3);
2699 return bsdthread_remove_explicit_override(p, (mach_port_name_t)uap->arg1,
2700 (user_addr_t)uap->arg2);
2701
2702 case BSDTHREAD_CTL_QOS_OVERRIDE_DISPATCH:
2703 return workq_thread_add_dispatch_override(p, (mach_port_name_t)uap->arg1,
2704 (pthread_priority_t)uap->arg2, uap->arg3);
2705 case BSDTHREAD_CTL_QOS_OVERRIDE_RESET:
2706 return workq_thread_reset_dispatch_override(p, current_thread());
2707
2708 case BSDTHREAD_CTL_SET_SELF:
2709 return bsdthread_set_self(p, current_thread(),
2710 (pthread_priority_t)uap->arg1, (mach_port_name_t)uap->arg2,
2711 (enum workq_set_self_flags)uap->arg3);
2712
2713 case BSDTHREAD_CTL_QOS_MAX_PARALLELISM:
2714 ENSURE_UNUSED(uap->arg3);
2715 return bsdthread_get_max_parallelism((thread_qos_t)uap->arg1,
2716 (unsigned long)uap->arg2, retval);
2717 case BSDTHREAD_CTL_WORKQ_ALLOW_KILL:
2718 ENSURE_UNUSED(uap->arg2);
2719 ENSURE_UNUSED(uap->arg3);
2720 return workq_thread_allow_kill(p, current_thread(), (bool)uap->arg1);
2721 case BSDTHREAD_CTL_DISPATCH_APPLY_ATTR:
2722 return bsdthread_dispatch_apply_attr(p, current_thread(),
2723 (unsigned long)uap->arg1, (uint64_t)uap->arg2,
2724 (uint64_t)uap->arg3);
2725 case BSDTHREAD_CTL_SET_QOS:
2726 case BSDTHREAD_CTL_QOS_DISPATCH_ASYNCHRONOUS_OVERRIDE_ADD:
2727 case BSDTHREAD_CTL_QOS_DISPATCH_ASYNCHRONOUS_OVERRIDE_RESET:
2728 /* no longer supported */
2729 return ENOTSUP;
2730
2731 default:
2732 return EINVAL;
2733 }
2734 }
2735
2736 #pragma mark workqueue thread manipulation
2737
2738 static void __dead2
2739 workq_unpark_select_threadreq_or_park_and_unlock(proc_t p, struct workqueue *wq,
2740 struct uthread *uth, uint32_t setup_flags);
2741
2742 static void __dead2
2743 workq_select_threadreq_or_park_and_unlock(proc_t p, struct workqueue *wq,
2744 struct uthread *uth, uint32_t setup_flags);
2745
2746 static void workq_setup_and_run(proc_t p, struct uthread *uth, int flags) __dead2;
2747
2748 #if KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD
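/*
 * Identifier used in tracepoints for a thread request: the workloop dynamic
 * ID for workloop requests, otherwise the (address-hidden) request pointer.
 */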
2749 static inline uint64_t
2750 workq_trace_req_id(workq_threadreq_t req)
2751 {
2752 struct kqworkloop *kqwl;
2753 if (req->tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
2754 kqwl = __container_of(req, struct kqworkloop, kqwl_request);
2755 return kqwl->kqwl_dynamicid;
2756 }
2757
2758 return VM_KERNEL_ADDRHIDE(req);
2759 }
2760 #endif
2761
2762 /**
2763 * Entry point for libdispatch to ask for threads
2764 */
2765 static int
2766 workq_reqthreads(struct proc *p, uint32_t reqcount, pthread_priority_t pp, bool cooperative)
2767 {
2768 thread_qos_t qos = _pthread_priority_thread_qos(pp);
2769 struct workqueue *wq = proc_get_wqptr(p);
2770 uint32_t unpaced, upcall_flags = WQ_FLAG_THREAD_NEWSPI;
2771 int ret = 0;
2772
2773 if (wq == NULL || reqcount <= 0 || reqcount > UINT16_MAX ||
2774 qos == THREAD_QOS_UNSPECIFIED) {
2775 ret = EINVAL;
2776 goto exit;
2777 }
2778
2779 WQ_TRACE_WQ(TRACE_wq_wqops_reqthreads | DBG_FUNC_NONE,
2780 wq, reqcount, pp, cooperative);
2781
2782 workq_threadreq_t req = zalloc(workq_zone_threadreq);
2783 priority_queue_entry_init(&req->tr_entry);
2784 req->tr_state = WORKQ_TR_STATE_NEW;
2785 req->tr_qos = qos;
2786 workq_tr_flags_t tr_flags = 0;
2787
2788 if (pp & _PTHREAD_PRIORITY_OVERCOMMIT_FLAG) {
2789 tr_flags |= WORKQ_TR_FLAG_OVERCOMMIT;
2790 upcall_flags |= WQ_FLAG_THREAD_OVERCOMMIT;
2791 }
2792
2793 if (cooperative) {
2794 tr_flags |= WORKQ_TR_FLAG_COOPERATIVE;
2795 upcall_flags |= WQ_FLAG_THREAD_COOPERATIVE;
2796
2797 if (reqcount > 1) {
2798 ret = ENOTSUP;
2799 goto free_and_exit;
2800 }
2801 }
2802
2803 /* A thread request cannot be both overcommit and cooperative */
2804 if (workq_tr_is_cooperative(tr_flags) &&
2805 workq_tr_is_overcommit(tr_flags)) {
2806 ret = EINVAL;
2807 goto free_and_exit;
2808 }
2809 req->tr_flags = tr_flags;
2810
2811 WQ_TRACE_WQ(TRACE_wq_thread_request_initiate | DBG_FUNC_NONE,
2812 wq, workq_trace_req_id(req), req->tr_qos, reqcount);
2813
2814 workq_lock_spin(wq);
2815 do {
2816 if (_wq_exiting(wq)) {
2817 goto unlock_and_exit;
2818 }
2819
2820 /*
2821 * When userspace is asking for parallelism, wake up to (reqcount - 1)
2822 * threads without pacing, to inform the scheduler of that workload.
2823 *
2824 * The remaining requests, or the ones that failed the admission checks, are
2825 * enqueued and go through the regular creator codepath.
2826 *
2827 * If there aren't enough threads, add one, but re-evaluate everything
2828 * as conditions may now have changed.
2829 */
2830 unpaced = reqcount - 1;
2831
2832 if (reqcount > 1) {
2833 /* We don't handle asking for parallelism on the cooperative
2834 * workqueue just yet */
2835 assert(!workq_threadreq_is_cooperative(req));
2836
2837 if (workq_threadreq_is_nonovercommit(req)) {
2838 unpaced = workq_constrained_allowance(wq, qos, NULL, false);
2839 if (unpaced >= reqcount - 1) {
2840 unpaced = reqcount - 1;
2841 }
2842 }
2843 }
2844
2845 /*
2846 * This path does not currently handle custom workloop parameters
2847 * when creating threads for parallelism.
2848 */
2849 assert(!(req->tr_flags & WORKQ_TR_FLAG_WL_PARAMS));
2850
2851 /*
2852 * This is a trimmed down version of workq_threadreq_bind_and_unlock()
2853 */
2854 while (unpaced > 0 && wq->wq_thidlecount) {
2855 struct uthread *uth;
2856 bool needs_wakeup;
2857 uint8_t uu_flags = UT_WORKQ_EARLY_BOUND;
2858
2859 if (workq_tr_is_overcommit(req->tr_flags)) {
2860 uu_flags |= UT_WORKQ_OVERCOMMIT;
2861 }
2862
2863 uth = workq_pop_idle_thread(wq, uu_flags, &needs_wakeup);
2864
2865 _wq_thactive_inc(wq, qos);
2866 wq->wq_thscheduled_count[_wq_bucket(qos)]++;
2867 workq_thread_reset_pri(wq, uth, req, /*unpark*/ true);
2868 wq->wq_fulfilled++;
2869
2870 uth->uu_save.uus_workq_park_data.upcall_flags = upcall_flags;
2871 uth->uu_save.uus_workq_park_data.thread_request = req;
2872 if (needs_wakeup) {
2873 workq_thread_wakeup(uth);
2874 }
2875 unpaced--;
2876 reqcount--;
2877 }
2878 } while (unpaced && wq->wq_nthreads < wq_max_threads &&
2879 workq_add_new_idle_thread(p, wq));
2880
2881 if (_wq_exiting(wq)) {
2882 goto unlock_and_exit;
2883 }
2884
2885 req->tr_count = (uint16_t)reqcount;
2886 if (workq_threadreq_enqueue(wq, req)) {
2887 /* This can drop the workqueue lock, and take it again */
2888 workq_schedule_creator(p, wq, WORKQ_THREADREQ_CAN_CREATE_THREADS);
2889 }
2890 workq_unlock(wq);
2891 return 0;
2892
2893 unlock_and_exit:
2894 workq_unlock(wq);
2895 free_and_exit:
2896 zfree(workq_zone_threadreq, req);
2897 exit:
2898 return ret;
2899 }
2900
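/*
 * Kernel-internal entry point (kevent / workloop) to post a thread request.
 * With WORKQ_THREADREQ_ATTEMPT_REBIND, the calling workqueue thread may be
 * rebound to the request synchronously instead of parking; otherwise the
 * request is enqueued and the creator is redriven. Returns false if the
 * workqueue is exiting.
 */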
2901 bool
2902 workq_kern_threadreq_initiate(struct proc *p, workq_threadreq_t req,
2903 struct turnstile *workloop_ts, thread_qos_t qos,
2904 workq_kern_threadreq_flags_t flags)
2905 {
2906 struct workqueue *wq = proc_get_wqptr_fast(p);
2907 struct uthread *uth = NULL;
2908
2909 assert(req->tr_flags & (WORKQ_TR_FLAG_WORKLOOP | WORKQ_TR_FLAG_KEVENT));
2910
2911 if (req->tr_flags & WORKQ_TR_FLAG_WL_OUTSIDE_QOS) {
2912 workq_threadreq_param_t trp = kqueue_threadreq_workloop_param(req);
2913 qos = thread_workq_qos_for_pri(trp.trp_pri);
2914 if (qos == THREAD_QOS_UNSPECIFIED) {
2915 qos = WORKQ_THREAD_QOS_ABOVEUI;
2916 }
2917 }
2918
2919 assert(req->tr_state == WORKQ_TR_STATE_IDLE);
2920 priority_queue_entry_init(&req->tr_entry);
2921 req->tr_count = 1;
2922 req->tr_state = WORKQ_TR_STATE_NEW;
2923 req->tr_qos = qos;
2924
2925 WQ_TRACE_WQ(TRACE_wq_thread_request_initiate | DBG_FUNC_NONE, wq,
2926 workq_trace_req_id(req), qos, 1);
2927
2928 if (flags & WORKQ_THREADREQ_ATTEMPT_REBIND) {
2929 /*
2930 * We're called back synchronously from the context of
2931 * kqueue_threadreq_unbind from within workq_thread_return(),
2932 * so we can try to match up this thread with this request!
2933 */
2934 uth = current_uthread();
2935 assert(uth->uu_kqr_bound == NULL);
2936 }
2937
2938 workq_lock_spin(wq);
2939 if (_wq_exiting(wq)) {
2940 req->tr_state = WORKQ_TR_STATE_IDLE;
2941 workq_unlock(wq);
2942 return false;
2943 }
2944
2945 if (uth && workq_threadreq_admissible(wq, uth, req)) {
2946 /* This is the case of the rebind - we were about to park and unbind
2947 * when more events came, so keep the binding.
2948 */
2949 assert(uth != wq->wq_creator);
2950
2951 if (uth->uu_workq_pri.qos_bucket != req->tr_qos) {
2952 _wq_thactive_move(wq, uth->uu_workq_pri.qos_bucket, req->tr_qos);
2953 workq_thread_reset_pri(wq, uth, req, /*unpark*/ false);
2954 }
2955 /*
2956 * We're called from workq_kern_threadreq_initiate()
2957 * due to an unbind, with the kq req held.
2958 */
2959 WQ_TRACE_WQ(TRACE_wq_thread_logical_run | DBG_FUNC_START, wq,
2960 workq_trace_req_id(req), req->tr_flags, 0);
2961 wq->wq_fulfilled++;
2962
2963 kqueue_threadreq_bind(p, req, get_machthread(uth), 0);
2964 } else {
2965 if (workloop_ts) {
2966 workq_perform_turnstile_operation_locked(wq, ^{
2967 turnstile_update_inheritor(workloop_ts, wq->wq_turnstile,
2968 TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_TURNSTILE);
2969 turnstile_update_inheritor_complete(workloop_ts,
2970 TURNSTILE_INTERLOCK_HELD);
2971 });
2972 }
2973
2974 bool reevaluate_creator_thread_group = false;
2975 #if CONFIG_PREADOPT_TG
2976 reevaluate_creator_thread_group = (flags & WORKQ_THREADREQ_REEVALUATE_PREADOPT_TG);
2977 #endif
2978 /* We enqueued the highest priority item or we may need to reevaluate if
2979 * the creator needs a thread group pre-adoption */
2980 if (workq_threadreq_enqueue(wq, req) || reevaluate_creator_thread_group) {
2981 workq_schedule_creator(p, wq, flags);
2982 }
2983 }
2984
2985 workq_unlock(wq);
2986
2987 return true;
2988 }
2989
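/*
 * Change the QoS (and possibly flip to overcommit) of an already-posted
 * kevent/workloop thread request: the request is pulled out of its priority
 * queue, updated, and either re-enqueued behind a higher request or treated
 * as new again so the creator can be reevaluated.
 */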
2990 void
2991 workq_kern_threadreq_modify(struct proc *p, workq_threadreq_t req,
2992 thread_qos_t qos, workq_kern_threadreq_flags_t flags)
2993 {
2994 struct workqueue *wq = proc_get_wqptr_fast(p);
2995 bool make_overcommit = false;
2996
2997 if (req->tr_flags & WORKQ_TR_FLAG_WL_OUTSIDE_QOS) {
2998 /* Requests outside-of-QoS shouldn't accept modify operations */
2999 return;
3000 }
3001
3002 workq_lock_spin(wq);
3003
3004 assert(req->tr_qos != WORKQ_THREAD_QOS_MANAGER);
3005 assert(req->tr_flags & (WORKQ_TR_FLAG_KEVENT | WORKQ_TR_FLAG_WORKLOOP));
3006
3007 if (req->tr_state == WORKQ_TR_STATE_BINDING) {
3008 kqueue_threadreq_bind(p, req, req->tr_thread, 0);
3009 workq_unlock(wq);
3010 return;
3011 }
3012
3013 if (flags & WORKQ_THREADREQ_MAKE_OVERCOMMIT) {
3014 /* TODO (rokhinip): We come into this code path for kqwl thread
3015 * requests. kqwl requests cannot be cooperative.
3016 */
3017 assert(!workq_threadreq_is_cooperative(req));
3018
3019 make_overcommit = workq_threadreq_is_nonovercommit(req);
3020 }
3021
3022 if (_wq_exiting(wq) || (req->tr_qos == qos && !make_overcommit)) {
3023 workq_unlock(wq);
3024 return;
3025 }
3026
3027 assert(req->tr_count == 1);
3028 if (req->tr_state != WORKQ_TR_STATE_QUEUED) {
3029 panic("Invalid thread request (%p) state %d", req, req->tr_state);
3030 }
3031
3032 WQ_TRACE_WQ(TRACE_wq_thread_request_modify | DBG_FUNC_NONE, wq,
3033 workq_trace_req_id(req), qos, 0);
3034
3035 struct priority_queue_sched_max *pq = workq_priority_queue_for_req(wq, req);
3036 workq_threadreq_t req_max;
3037
3038 /*
3039 * Stage 1: Dequeue the request from its priority queue.
3040 *
3041 * If we dequeue the root item of the constrained priority queue,
3042 * maintain the best constrained request qos invariant.
3043 */
3044 if (priority_queue_remove(pq, &req->tr_entry)) {
3045 if (workq_threadreq_is_nonovercommit(req)) {
3046 _wq_thactive_refresh_best_constrained_req_qos(wq);
3047 }
3048 }
3049
3050 /*
3051 * Stage 2: Apply changes to the thread request
3052 *
3053 * If the item will not become the root of the priority queue it belongs to,
3054 * then we need to wait in line, just enqueue and return quickly.
3055 */
3056 if (__improbable(make_overcommit)) {
3057 req->tr_flags ^= WORKQ_TR_FLAG_OVERCOMMIT;
3058 pq = workq_priority_queue_for_req(wq, req);
3059 }
3060 req->tr_qos = qos;
3061
3062 req_max = priority_queue_max(pq, struct workq_threadreq_s, tr_entry);
3063 if (req_max && req_max->tr_qos >= qos) {
3064 priority_queue_entry_set_sched_pri(pq, &req->tr_entry,
3065 workq_priority_for_req(req), false);
3066 priority_queue_insert(pq, &req->tr_entry);
3067 workq_unlock(wq);
3068 return;
3069 }
3070
3071 /*
3072 * Stage 3: Reevaluate whether we should run the thread request.
3073 *
3074 * Pretend the thread request is new again:
3075 * - adjust wq_reqcount to not count it anymore.
3076 * - make its state WORKQ_TR_STATE_NEW (so that workq_threadreq_bind_and_unlock
3077 * properly attempts a synchronous bind)
3078 */
3079 wq->wq_reqcount--;
3080 req->tr_state = WORKQ_TR_STATE_NEW;
3081
3082 /* We enqueued the highest priority item or we may need to reevaluate if
3083 * the creator needs a thread group pre-adoption if the request got a new TG */
3084 bool reevaluate_creator_tg = false;
3085
3086 #if CONFIG_PREADOPT_TG
3087 reevaluate_creator_tg = (flags & WORKQ_THREADREQ_REEVALUATE_PREADOPT_TG);
3088 #endif
3089
3090 if (workq_threadreq_enqueue(wq, req) || reevaluate_creator_tg) {
3091 workq_schedule_creator(p, wq, flags);
3092 }
3093 workq_unlock(wq);
3094 }
3095
3096 void
3097 workq_kern_threadreq_lock(struct proc *p)
3098 {
3099 workq_lock_spin(proc_get_wqptr_fast(p));
3100 }
3101
3102 void
3103 workq_kern_threadreq_unlock(struct proc *p)
3104 {
3105 workq_unlock(proc_get_wqptr_fast(p));
3106 }
3107
3108 void
3109 workq_kern_threadreq_update_inheritor(struct proc *p, workq_threadreq_t req,
3110 thread_t owner, struct turnstile *wl_ts,
3111 turnstile_update_flags_t flags)
3112 {
3113 struct workqueue *wq = proc_get_wqptr_fast(p);
3114 turnstile_inheritor_t inheritor;
3115
3116 assert(req->tr_qos != WORKQ_THREAD_QOS_MANAGER);
3117 assert(req->tr_flags & WORKQ_TR_FLAG_WORKLOOP);
3118 workq_lock_held(wq);
3119
3120 if (req->tr_state == WORKQ_TR_STATE_BINDING) {
3121 kqueue_threadreq_bind(p, req, req->tr_thread,
3122 KQUEUE_THREADERQ_BIND_NO_INHERITOR_UPDATE);
3123 return;
3124 }
3125
3126 if (_wq_exiting(wq)) {
3127 inheritor = TURNSTILE_INHERITOR_NULL;
3128 } else {
3129 if (req->tr_state != WORKQ_TR_STATE_QUEUED) {
3130 panic("Invalid thread request (%p) state %d", req, req->tr_state);
3131 }
3132
3133 if (owner) {
3134 inheritor = owner;
3135 flags |= TURNSTILE_INHERITOR_THREAD;
3136 } else {
3137 inheritor = wq->wq_turnstile;
3138 flags |= TURNSTILE_INHERITOR_TURNSTILE;
3139 }
3140 }
3141
3142 workq_perform_turnstile_operation_locked(wq, ^{
3143 turnstile_update_inheritor(wl_ts, inheritor, flags);
3144 });
3145 }
3146
3147 void
3148 workq_kern_threadreq_redrive(struct proc *p, workq_kern_threadreq_flags_t flags)
3149 {
3150 struct workqueue *wq = proc_get_wqptr_fast(p);
3151
3152 workq_lock_spin(wq);
3153 workq_schedule_creator(p, wq, flags);
3154 workq_unlock(wq);
3155 }
3156
3157 /*
3158 * Always called at AST by the thread on itself
3159 *
3160 * Upon quantum expiry, the workqueue subsystem evaluates its state and decides
3161 * on what the thread should do next. The TSD value is always set by the thread
3162 * on itself in the kernel and cleared either by userspace when it acks the TSD
3163 * value and takes action, or by the thread in the kernel when the quantum
3164 * expires again.
3165 */
3166 void
3167 workq_kern_quantum_expiry_reevaluate(proc_t proc, thread_t thread)
3168 {
3169 struct uthread *uth = get_bsdthread_info(thread);
3170
3171 if (uth->uu_workq_flags & UT_WORKQ_DYING) {
3172 return;
3173 }
3174
3175 if (!thread_supports_cooperative_workqueue(thread)) {
3176 panic("Quantum expired for thread that doesn't support cooperative workqueue");
3177 }
3178
3179 thread_qos_t qos = uth->uu_workq_pri.qos_bucket;
3180 if (qos == THREAD_QOS_UNSPECIFIED) {
3181 panic("Thread should not have workq bucket of QoS UN");
3182 }
3183
3184 assert(thread_has_expired_workqueue_quantum(thread, false));
3185
3186 struct workqueue *wq = proc_get_wqptr(proc);
3187 assert(wq != NULL);
3188
3189 /*
3190 * For starters, we're just going to evaluate and see if we need to narrow
3191 * the pool and tell this thread to park if needed. In the future, we'll
3192 * evaluate and convey other workqueue state information like needing to
3193 * pump kevents, etc.
3194 */
3195 uint64_t flags = 0;
3196
3197 workq_lock_spin(wq);
3198
3199 if (workq_thread_is_cooperative(uth)) {
3200 if (!workq_cooperative_allowance(wq, qos, uth, false)) {
3201 flags |= PTHREAD_WQ_QUANTUM_EXPIRY_NARROW;
3202 } else {
3203 /* In the future, when we have kevent hookups for the cooperative
3204 * pool, we need fancier logic for what userspace should do. But
3205 * right now, only userspace thread requests exist - so we'll just
3206 * tell userspace to shuffle work items */
3207 flags |= PTHREAD_WQ_QUANTUM_EXPIRY_SHUFFLE;
3208 }
3209 } else if (workq_thread_is_nonovercommit(uth)) {
3210 if (!workq_constrained_allowance(wq, qos, uth, false)) {
3211 flags |= PTHREAD_WQ_QUANTUM_EXPIRY_NARROW;
3212 }
3213 }
3214 workq_unlock(wq);
3215
3216 WQ_TRACE(TRACE_wq_quantum_expiry_reevaluate, flags, 0, 0, 0);
3217
3218 kevent_set_workq_quantum_expiry_user_tsd(proc, thread, flags);
3219
3220 /* We have conveyed to userspace about what it needs to do upon quantum
3221 * expiry, now rearm the workqueue quantum again */
3222 thread_arm_workqueue_quantum(get_machthread(uth));
3223 }
3224
3225 void
3226 workq_schedule_creator_turnstile_redrive(struct workqueue *wq, bool locked)
3227 {
3228 if (locked) {
3229 workq_schedule_creator(NULL, wq, WORKQ_THREADREQ_NONE);
3230 } else {
3231 workq_schedule_immediate_thread_creation(wq);
3232 }
3233 }
3234
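/*
 * Handles the WQOPS_THREAD_*_RETURN operations: resets the thread's signal
 * mask, flushes pending kevents to userspace if the thread is bound to a
 * kqueue request, then picks a new thread request or parks the thread. Does
 * not return on success.
 */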
3235 static int
3236 workq_thread_return(struct proc *p, struct workq_kernreturn_args *uap,
3237 struct workqueue *wq)
3238 {
3239 thread_t th = current_thread();
3240 struct uthread *uth = get_bsdthread_info(th);
3241 workq_threadreq_t kqr = uth->uu_kqr_bound;
3242 workq_threadreq_param_t trp = { };
3243 int nevents = uap->affinity, error;
3244 user_addr_t eventlist = uap->item;
3245
3246 if (((thread_get_tag(th) & THREAD_TAG_WORKQUEUE) == 0) ||
3247 (uth->uu_workq_flags & UT_WORKQ_DYING)) {
3248 return EINVAL;
3249 }
3250
3251 if (eventlist && nevents && kqr == NULL) {
3252 return EINVAL;
3253 }
3254
3255 /* reset signal mask on the workqueue thread to default state */
3256 if (uth->uu_sigmask != (sigset_t)(~workq_threadmask)) {
3257 proc_lock(p);
3258 uth->uu_sigmask = ~workq_threadmask;
3259 proc_unlock(p);
3260 }
3261
3262 if (kqr && kqr->tr_flags & WORKQ_TR_FLAG_WL_PARAMS) {
3263 /*
3264 * Ensure we store the threadreq param before unbinding
3265 * the kqr from this thread.
3266 */
3267 trp = kqueue_threadreq_workloop_param(kqr);
3268 }
3269
3270 /*
3271 * Freeze the base pri while we decide the fate of this thread.
3272 *
3273 * Either:
3274 * - we return to user and kevent_cleanup will have unfrozen the base pri,
3275 * - or we proceed to workq_select_threadreq_or_park_and_unlock() who will.
3276 */
3277 thread_freeze_base_pri(th);
3278
3279 if (kqr) {
3280 uint32_t upcall_flags = WQ_FLAG_THREAD_NEWSPI | WQ_FLAG_THREAD_REUSE;
3281 if (kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
3282 upcall_flags |= WQ_FLAG_THREAD_WORKLOOP | WQ_FLAG_THREAD_KEVENT;
3283 } else {
3284 upcall_flags |= WQ_FLAG_THREAD_KEVENT;
3285 }
3286 if (uth->uu_workq_pri.qos_bucket == WORKQ_THREAD_QOS_MANAGER) {
3287 upcall_flags |= WQ_FLAG_THREAD_EVENT_MANAGER;
3288 } else {
3289 if (workq_thread_is_overcommit(uth)) {
3290 upcall_flags |= WQ_FLAG_THREAD_OVERCOMMIT;
3291 }
3292 if (uth->uu_workq_flags & UT_WORKQ_OUTSIDE_QOS) {
3293 upcall_flags |= WQ_FLAG_THREAD_OUTSIDEQOS;
3294 } else {
3295 upcall_flags |= uth->uu_workq_pri.qos_req |
3296 WQ_FLAG_THREAD_PRIO_QOS;
3297 }
3298 }
3299 error = pthread_functions->workq_handle_stack_events(p, th,
3300 get_task_map(p->task), uth->uu_workq_stackaddr,
3301 uth->uu_workq_thport, eventlist, nevents, upcall_flags);
3302 if (error) {
3303 assert(uth->uu_kqr_bound == kqr);
3304 return error;
3305 }
3306
3307 // pthread is supposed to pass KEVENT_FLAG_PARKING here
3308 // which should cause the above call to either:
3309 // - not return
3310 // - return an error
3311 // - return 0 and have unbound properly
3312 assert(uth->uu_kqr_bound == NULL);
3313 }
3314
3315 WQ_TRACE_WQ(TRACE_wq_runthread | DBG_FUNC_END, wq, uap->options, 0, 0);
3316
3317 thread_sched_call(th, NULL);
3318 thread_will_park_or_terminate(th);
3319 #if CONFIG_WORKLOOP_DEBUG
3320 UU_KEVENT_HISTORY_WRITE_ENTRY(uth, { .uu_error = -1, });
3321 #endif
3322
3323 workq_lock_spin(wq);
3324 WQ_TRACE_WQ(TRACE_wq_thread_logical_run | DBG_FUNC_END, wq, 0, 0, 0);
3325 uth->uu_save.uus_workq_park_data.workloop_params = trp.trp_value;
3326 workq_select_threadreq_or_park_and_unlock(p, wq, uth,
3327 WQ_SETUP_CLEAR_VOUCHER);
3328 __builtin_unreachable();
3329 }
3330
3331 /**
3332 * Multiplexed call to interact with the workqueue mechanism
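*
* Illustrative only (hypothetical userspace sketch, not part of this file):
* libpthread reaches the WQOPS_* cases below through its workq_kernreturn
* syscall stub, along the lines of
*
*     __workq_kernreturn(WQOPS_QUEUE_REQTHREADS, NULL, nthreads, pthread_priority);
*
* where `nthreads` arrives here as uap->affinity (arg2) and
* `pthread_priority` as uap->prio (arg3).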
3333 */
3334 int
3335 workq_kernreturn(struct proc *p, struct workq_kernreturn_args *uap, int32_t *retval)
3336 {
3337 int options = uap->options;
3338 int arg2 = uap->affinity;
3339 int arg3 = uap->prio;
3340 struct workqueue *wq = proc_get_wqptr(p);
3341 int error = 0;
3342
3343 if ((p->p_lflag & P_LREGISTER) == 0) {
3344 return EINVAL;
3345 }
3346
3347 switch (options) {
3348 case WQOPS_QUEUE_NEWSPISUPP: {
3349 /*
3350 * arg2 = offset of serialno into dispatch queue
3351 * arg3 = kevent support
3352 */
3353 int offset = arg2;
3354 if (arg3 & 0x01) {
3355 // If we get here, then userspace has indicated support for kevent delivery.
3356 }
3357
3358 p->p_dispatchqueue_serialno_offset = (uint64_t)offset;
3359 break;
3360 }
3361 case WQOPS_QUEUE_REQTHREADS: {
3362 /*
3363 * arg2 = number of threads to start
3364 * arg3 = priority
3365 */
3366 error = workq_reqthreads(p, arg2, arg3, false);
3367 break;
3368 }
3369 /* For requesting threads for the cooperative pool */
3370 case WQOPS_QUEUE_REQTHREADS2: {
3371 /*
3372 * arg2 = number of threads to start
3373 * arg3 = priority
3374 */
3375 error = workq_reqthreads(p, arg2, arg3, true);
3376 break;
3377 }
3378 case WQOPS_SET_EVENT_MANAGER_PRIORITY: {
3379 /*
3380 * arg2 = priority for the manager thread
3381 *
3382 * if _PTHREAD_PRIORITY_SCHED_PRI_FLAG is set,
3383 * the low bits of the value contains a scheduling priority
3384 * instead of a QOS value
3385 */
3386 pthread_priority_t pri = arg2;
3387
3388 if (wq == NULL) {
3389 error = EINVAL;
3390 break;
3391 }
3392
3393 /*
3394 * Normalize the incoming priority so that it is ordered numerically.
3395 */
3396 if (pri & _PTHREAD_PRIORITY_SCHED_PRI_FLAG) {
3397 pri &= (_PTHREAD_PRIORITY_SCHED_PRI_MASK |
3398 _PTHREAD_PRIORITY_SCHED_PRI_FLAG);
3399 } else {
3400 thread_qos_t qos = _pthread_priority_thread_qos(pri);
3401 int relpri = _pthread_priority_relpri(pri);
3402 if (relpri > 0 || relpri < THREAD_QOS_MIN_TIER_IMPORTANCE ||
3403 qos == THREAD_QOS_UNSPECIFIED) {
3404 error = EINVAL;
3405 break;
3406 }
3407 pri &= ~_PTHREAD_PRIORITY_FLAGS_MASK;
3408 }
3409
3410 /*
3411 * If userspace passes a scheduling priority, that wins over any QoS.
3412 * Userspace should take care not to lower the priority this way.
3413 */
3414 workq_lock_spin(wq);
3415 if (wq->wq_event_manager_priority < (uint32_t)pri) {
3416 wq->wq_event_manager_priority = (uint32_t)pri;
3417 }
3418 workq_unlock(wq);
3419 break;
3420 }
3421 case WQOPS_THREAD_KEVENT_RETURN:
3422 case WQOPS_THREAD_WORKLOOP_RETURN:
3423 case WQOPS_THREAD_RETURN: {
3424 error = workq_thread_return(p, uap, wq);
3425 break;
3426 }
3427
3428 case WQOPS_SHOULD_NARROW: {
3429 /*
3430 * arg2 = priority to test
3431 * arg3 = unused
3432 */
3433 thread_t th = current_thread();
3434 struct uthread *uth = get_bsdthread_info(th);
3435 if (((thread_get_tag(th) & THREAD_TAG_WORKQUEUE) == 0) ||
3436 (uth->uu_workq_flags & (UT_WORKQ_DYING | UT_WORKQ_OVERCOMMIT))) {
3437 error = EINVAL;
3438 break;
3439 }
3440
3441 thread_qos_t qos = _pthread_priority_thread_qos(arg2);
3442 if (qos == THREAD_QOS_UNSPECIFIED) {
3443 error = EINVAL;
3444 break;
3445 }
3446 workq_lock_spin(wq);
3447 bool should_narrow = !workq_constrained_allowance(wq, qos, uth, false);
3448 workq_unlock(wq);
3449
3450 *retval = should_narrow;
3451 break;
3452 }
3453 case WQOPS_SETUP_DISPATCH: {
3454 /*
3455 * item = pointer to workq_dispatch_config structure
3456 * arg2 = sizeof(item)
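*
* Illustrative only (hypothetical userspace sketch; the field names are the
* ones consumed by the copyin logic below, the version value is an
* assumption for the example):
*
*     struct workq_dispatch_config cfg = {
*         .wdc_version = 2,   /* version 2 adds the queue label offset */
*         .wdc_flags = 0,
*         .wdc_queue_serialno_offs = serialno_offs,
*         .wdc_queue_label_offs = label_offs,
*     };
*     __workq_kernreturn(WQOPS_SETUP_DISPATCH, &cfg, sizeof(cfg), 0);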
3457 */
3458 struct workq_dispatch_config cfg;
3459 bzero(&cfg, sizeof(cfg));
3460
3461 error = copyin(uap->item, &cfg, MIN(sizeof(cfg), (unsigned long) arg2));
3462 if (error) {
3463 break;
3464 }
3465
3466 if (cfg.wdc_flags & ~WORKQ_DISPATCH_SUPPORTED_FLAGS ||
3467 cfg.wdc_version < WORKQ_DISPATCH_MIN_SUPPORTED_VERSION) {
3468 error = ENOTSUP;
3469 break;
3470 }
3471
3472 /* Load fields from version 1 */
3473 p->p_dispatchqueue_serialno_offset = cfg.wdc_queue_serialno_offs;
3474
3475 /* Load fields from version 2 */
3476 if (cfg.wdc_version >= 2) {
3477 p->p_dispatchqueue_label_offset = cfg.wdc_queue_label_offs;
3478 }
3479
3480 break;
3481 }
3482 default:
3483 error = EINVAL;
3484 break;
3485 }
3486
3487 return error;
3488 }
3489
3490 /*
3491 * We have no work to do, park ourselves on the idle list.
3492 *
3493 * Consumes the workqueue lock and does not return.
3494 */
3495 __attribute__((noreturn, noinline))
3496 static void
3497 workq_park_and_unlock(proc_t p, struct workqueue *wq, struct uthread *uth,
3498 uint32_t setup_flags)
3499 {
3500 assert(uth == current_uthread());
3501 assert(uth->uu_kqr_bound == NULL);
3502 workq_push_idle_thread(p, wq, uth, setup_flags); // may not return
3503
3504 workq_thread_reset_cpupercent(NULL, uth);
3505
3506 #if CONFIG_PREADOPT_TG
3507 /* Clear the preadoption thread group on the thread.
3508 *
3509 * Case 1:
3510 * Creator thread which never picked up a thread request. We set a
3511 * preadoption thread group on creator threads but if it never picked
3512 * up a thread request and didn't go to userspace, then the thread will
3513 * park with a preadoption thread group but no explicitly adopted
3514 * voucher or work interval.
3515 *
3516 * We drop the preadoption thread group here before proceeding to park.
3517 * Note - we may get preempted when we drop the workq lock below.
3518 *
3519 * Case 2:
3520 * Thread picked up a thread request and bound to it and returned back
3521 * from userspace and is parking. At this point, preadoption thread
3522 * group should be NULL since the thread has unbound from the thread
3523 * request. So this operation should be a no-op.
3524 */
3525 thread_set_preadopt_thread_group(get_machthread(uth), NULL);
3526 #endif
3527
3528 if ((uth->uu_workq_flags & UT_WORKQ_IDLE_CLEANUP) &&
3529 !(uth->uu_workq_flags & UT_WORKQ_DYING)) {
3530 workq_unlock(wq);
3531
3532 /*
3533 * workq_push_idle_thread() will unset `has_stack`
3534 * if it wants us to free the stack before parking.
3535 */
3536 if (!uth->uu_save.uus_workq_park_data.has_stack) {
3537 pthread_functions->workq_markfree_threadstack(p,
3538 get_machthread(uth), get_task_map(p->task),
3539 uth->uu_workq_stackaddr);
3540 }
3541
3542 /*
3543 * When we remove the voucher from the thread, we may lose our importance
3544 * causing us to get preempted, so we do this after putting the thread on
3545 * the idle list. Then, when we get our importance back we'll be able to
3546 * use this thread from e.g. the kevent call out to deliver a boosting
3547 * message.
3548 *
3549 * Note that setting the voucher to NULL will not clear the preadoption
3550 * thread group, since this thread could have become the creator again and
3551 * perhaps acquired a preadoption thread group.
3552 */
3553 __assert_only kern_return_t kr;
3554 kr = thread_set_voucher_name(MACH_PORT_NULL);
3555 assert(kr == KERN_SUCCESS);
3556
3557 workq_lock_spin(wq);
3558 uth->uu_workq_flags &= ~UT_WORKQ_IDLE_CLEANUP;
3559 setup_flags &= ~WQ_SETUP_CLEAR_VOUCHER;
3560 }
3561
3562 WQ_TRACE_WQ(TRACE_wq_thread_logical_run | DBG_FUNC_END, wq, 0, 0, 0);
3563
3564 if (uth->uu_workq_flags & UT_WORKQ_RUNNING) {
3565 /*
3566 * While we'd dropped the lock to unset our voucher, someone came
3567 * around and made us runnable. But because we weren't waiting on the
3568 * event, their thread_wakeup() was ineffectual. To correct for that,
3569 * we just run the continuation ourselves.
3570 */
3571 workq_unpark_select_threadreq_or_park_and_unlock(p, wq, uth, setup_flags);
3572 __builtin_unreachable();
3573 }
3574
3575 if (uth->uu_workq_flags & UT_WORKQ_DYING) {
3576 workq_unpark_for_death_and_unlock(p, wq, uth,
3577 WORKQ_UNPARK_FOR_DEATH_WAS_IDLE, setup_flags);
3578 __builtin_unreachable();
3579 }
3580
3581 /* Disarm the workqueue quantum since the thread is now idle */
3582 thread_disarm_workqueue_quantum(get_machthread(uth));
3583
3584 thread_set_pending_block_hint(get_machthread(uth), kThreadWaitParkedWorkQueue);
3585 assert_wait(workq_parked_wait_event(uth), THREAD_INTERRUPTIBLE);
3586 workq_unlock(wq);
3587 thread_block(workq_unpark_continue);
3588 __builtin_unreachable();
3589 }
3590
3591 static inline bool
3592 workq_may_start_event_mgr_thread(struct workqueue *wq, struct uthread *uth)
3593 {
3594 /*
3595 * There's an event manager request and either:
3596 * - no event manager currently running
3597 * - we are re-using the event manager
3598 */
3599 return wq->wq_thscheduled_count[_wq_bucket(WORKQ_THREAD_QOS_MANAGER)] == 0 ||
3600 (uth && uth->uu_workq_pri.qos_bucket == WORKQ_THREAD_QOS_MANAGER);
3601 }
3602
3603 static uint32_t
3604 workq_constrained_allowance(struct workqueue *wq, thread_qos_t at_qos,
3605 struct uthread *uth, bool may_start_timer)
3606 {
3607 assert(at_qos != WORKQ_THREAD_QOS_MANAGER);
3608 uint32_t count = 0;
3609
3610 uint32_t max_count = wq->wq_constrained_threads_scheduled;
3611 if (uth && workq_thread_is_nonovercommit(uth)) {
3612 /*
3613 * don't count the current thread as scheduled
3614 */
3615 assert(max_count > 0);
3616 max_count--;
3617 }
3618 if (max_count >= wq_max_constrained_threads) {
3619 WQ_TRACE_WQ(TRACE_wq_constrained_admission | DBG_FUNC_NONE, wq, 1,
3620 wq->wq_constrained_threads_scheduled,
3621 wq_max_constrained_threads);
3622 /*
3623 * we need 1 or more constrained threads to return to the kernel before
3624 * we can dispatch additional work
3625 */
3626 return 0;
3627 }
3628 max_count = wq_max_constrained_threads - max_count;
3629
3630 /*
3631 * Compute a metric for how many threads are active. We find the
3632 * highest priority request outstanding and then add up the number of active
3633 * threads in that and all higher-priority buckets. We'll also add any
3634 * "busy" threads which are not currently active but blocked recently enough
3635 * that we can't be sure that they won't be unblocked soon and start
3636 * being active again.
3637 *
3638 * We'll then compare this metric to our max concurrency to decide whether
3639 * to add a new thread.
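*
* Worked example (hypothetical numbers, for illustration only): with
* wq_max_parallelism[_wq_bucket(at_qos)] == 8, 5 threads active at or above
* at_qos and 2 recently-blocked "busy" threads, the metric is 5 + 2 = 7, so
* at most 8 - 7 = 1 additional constrained thread is admitted, further
* capped by the constrained-thread headroom (max_count) computed above.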
3640 */
3641
3642 uint32_t busycount, thactive_count;
3643
3644 thactive_count = _wq_thactive_aggregate_downto_qos(wq, _wq_thactive(wq),
3645 at_qos, &busycount, NULL);
3646
3647 if (uth && uth->uu_workq_pri.qos_bucket != WORKQ_THREAD_QOS_MANAGER &&
3648 at_qos <= uth->uu_workq_pri.qos_bucket) {
3649 /*
3650 * Don't count this thread as currently active, but only if it's not
3651 * a manager thread, as _wq_thactive_aggregate_downto_qos ignores active
3652 * managers.
3653 */
3654 assert(thactive_count > 0);
3655 thactive_count--;
3656 }
3657
3658 count = wq_max_parallelism[_wq_bucket(at_qos)];
3659 if (count > thactive_count + busycount) {
3660 count -= thactive_count + busycount;
3661 WQ_TRACE_WQ(TRACE_wq_constrained_admission | DBG_FUNC_NONE, wq, 2,
3662 thactive_count, busycount);
3663 return MIN(count, max_count);
3664 } else {
3665 WQ_TRACE_WQ(TRACE_wq_constrained_admission | DBG_FUNC_NONE, wq, 3,
3666 thactive_count, busycount);
3667 }
3668
3669 if (may_start_timer) {
3670 /*
3671 * If this is called from the add timer, we won't have another timer
3672 * fire when the thread exits the "busy" state, so rearm the timer.
3673 */
3674 workq_schedule_delayed_thread_creation(wq, 0);
3675 }
3676
3677 return 0;
3678 }
3679
3680 static bool
3681 workq_threadreq_admissible(struct workqueue *wq, struct uthread *uth,
3682 workq_threadreq_t req)
3683 {
3684 if (req->tr_qos == WORKQ_THREAD_QOS_MANAGER) {
3685 return workq_may_start_event_mgr_thread(wq, uth);
3686 }
3687 if (workq_threadreq_is_cooperative(req)) {
3688 return workq_cooperative_allowance(wq, req->tr_qos, uth, true);
3689 }
3690 if (workq_threadreq_is_nonovercommit(req)) {
3691 return workq_constrained_allowance(wq, req->tr_qos, uth, true);
3692 }
3693
3694 return true;
3695 }
3696
3697 /*
3698 * Called from the context of selecting thread requests for threads returning
3699 * from userspace or creator thread
3700 */
3701 static workq_threadreq_t
3702 workq_cooperative_queue_best_req(struct workqueue *wq, struct uthread *uth)
3703 {
3704 workq_lock_held(wq);
3705
3706 /*
3707 * If the current thread is cooperative, we need to exclude it from the
3708 * cooperative schedule count since this thread is looking for a new
3709 * request. A change in the cooperative pool's schedule count therefore
3710 * requires us to re-evaluate the next best request for it.
3711 */
3712 if (uth && workq_thread_is_cooperative(uth)) {
3713 _wq_cooperative_queue_scheduled_count_dec(wq, uth->uu_workq_pri.qos_bucket);
3714
3715 (void) _wq_cooperative_queue_refresh_best_req_qos(wq);
3716
3717 _wq_cooperative_queue_scheduled_count_inc(wq, uth->uu_workq_pri.qos_bucket);
3718 } else {
3719 /*
3720 * The old value that was already precomputed should be safe to use -
3721 * assert that the best req QoS doesn't change in
3722 * this case
3723 */
3724 assert(_wq_cooperative_queue_refresh_best_req_qos(wq) == false);
3725 }
3726
3727 thread_qos_t qos = wq->wq_cooperative_queue_best_req_qos;
3728
3729 /* There are no eligible requests in the cooperative pool */
3730 if (qos == THREAD_QOS_UNSPECIFIED) {
3731 return NULL;
3732 }
3733 assert(qos != WORKQ_THREAD_QOS_ABOVEUI);
3734 assert(qos != WORKQ_THREAD_QOS_MANAGER);
3735
3736 uint8_t bucket = _wq_bucket(qos);
3737 assert(!STAILQ_EMPTY(&wq->wq_cooperative_queue[bucket]));
3738
3739 return STAILQ_FIRST(&wq->wq_cooperative_queue[bucket]);
3740 }
3741
3742 static workq_threadreq_t
3743 workq_threadreq_select_for_creator(struct workqueue *wq)
3744 {
3745 workq_threadreq_t req_qos, req_pri, req_tmp, req_mgr;
3746 thread_qos_t qos = THREAD_QOS_UNSPECIFIED;
3747 uint8_t pri = 0;
3748
3749 /*
3750 * Compute the best priority request, and ignore the turnstile for now
3751 */
3752
3753 req_pri = priority_queue_max(&wq->wq_special_queue,
3754 struct workq_threadreq_s, tr_entry);
3755 if (req_pri) {
3756 pri = (uint8_t)priority_queue_entry_sched_pri(&wq->wq_special_queue,
3757 &req_pri->tr_entry);
3758 }
3759
3760 /*
3761 * Handle the manager thread request. The special queue might yield
3762 * a higher priority, but the manager always beats the QoS world.
3763 */
3764
3765 req_mgr = wq->wq_event_manager_threadreq;
3766 if (req_mgr && workq_may_start_event_mgr_thread(wq, NULL)) {
3767 uint32_t mgr_pri = wq->wq_event_manager_priority;
3768
3769 if (mgr_pri & _PTHREAD_PRIORITY_SCHED_PRI_FLAG) {
3770 mgr_pri &= _PTHREAD_PRIORITY_SCHED_PRI_MASK;
3771 } else {
3772 mgr_pri = thread_workq_pri_for_qos(
3773 _pthread_priority_thread_qos(mgr_pri));
3774 }
3775
3776 return mgr_pri >= pri ? req_mgr : req_pri;
3777 }
3778
3779 /*
3780 * Compute the best QoS Request, and check whether it beats the "pri" one
3781 *
3782 * Start by comparing the overcommit and the cooperative pool
3783 */
3784 req_qos = priority_queue_max(&wq->wq_overcommit_queue,
3785 struct workq_threadreq_s, tr_entry);
3786 if (req_qos) {
3787 qos = req_qos->tr_qos;
3788 }
3789
3790 req_tmp = workq_cooperative_queue_best_req(wq, NULL);
3791 if (req_tmp && qos <= req_tmp->tr_qos) {
3792 /*
3793 * The cooperative TR is the better of overcommit and cooperative. Note
3794 * that if the QoS is the same for both, we choose
3795 * cooperative.
3796 *
3797 * Pick cooperative pool if it passes the admissions check
3798 */
3799 if (workq_cooperative_allowance(wq, req_tmp->tr_qos, NULL, true)) {
3800 req_qos = req_tmp;
3801 qos = req_qos->tr_qos;
3802 }
3803 }
3804
3805 /*
3806 * Compare the best QoS so far - either from overcommit or from cooperative
3807 * pool - and compare it with the constrained pool
3808 */
3809 req_tmp = priority_queue_max(&wq->wq_constrained_queue,
3810 struct workq_threadreq_s, tr_entry);
3811
3812 if (req_tmp && qos < req_tmp->tr_qos) {
3813 /*
3814 * Constrained pool is best in QoS between overcommit, cooperative
3815 * and constrained. Now check how it fares against the priority case.
3816 */
3817 if (pri && pri >= thread_workq_pri_for_qos(req_tmp->tr_qos)) {
3818 return req_pri;
3819 }
3820
3821 if (workq_constrained_allowance(wq, req_tmp->tr_qos, NULL, true)) {
3822 /*
3823 * If the constrained thread request is the best one and passes
3824 * the admission check, pick it.
3825 */
3826 return req_tmp;
3827 }
3828 }
3829
3830 /*
3831 * Compare the best of the QoS world with the priority
3832 */
3833 if (pri && (!qos || pri >= thread_workq_pri_for_qos(qos))) {
3834 return req_pri;
3835 }
3836
3837 if (req_qos) {
3838 return req_qos;
3839 }
3840
3841 /*
3842 * If we had no eligible request but we have a turnstile push,
3843 * it must be a non overcommit thread request that failed
3844 * the admission check.
3845 *
3846 * Just fake a BG thread request so that if the push stops, the creator
3847 * priority just drops to 4.
3848 */
3849 if (turnstile_workq_proprietor_of_max_turnstile(wq->wq_turnstile, NULL)) {
3850 static struct workq_threadreq_s workq_sync_push_fake_req = {
3851 .tr_qos = THREAD_QOS_BACKGROUND,
3852 };
3853
3854 return &workq_sync_push_fake_req;
3855 }
3856
3857 return NULL;
3858 }
3859
3860 /*
3861 * Returns true if this caused a change in the schedule counts of the
3862 * cooperative pool
3863 */
3864 static bool
3865 workq_adjust_cooperative_constrained_schedule_counts(struct workqueue *wq,
3866 struct uthread *uth, thread_qos_t old_thread_qos, workq_tr_flags_t tr_flags)
3867 {
3868 workq_lock_held(wq);
3869
3870 /*
3871 * Row: thread type
3872 * Column: Request type
3873 *
3874 * overcommit non-overcommit cooperative
3875 * overcommit X case 1 case 2
3876 * cooperative case 3 case 4 case 5
3877 * non-overcommit case 6 X case 7
3878 *
3879 * Move the thread to the right bucket depending on the state it currently
3880 * has and the state the thread req it picks is going to have.
3881 *
3882 * Note that the creator thread is an overcommit thread.
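*
* For example, an overcommit thread (or the creator) picking up a
* cooperative request is case 2: the cooperative scheduled count for the
* request's QoS bucket is incremented and the cooperative pool's
* best-request QoS has to be refreshed by the caller.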
3883 */
3884 thread_qos_t new_thread_qos = uth->uu_workq_pri.qos_bucket;
3885
3886 /*
3887 * Anytime a cooperative bucket's schedule count changes, we need to
3888 * potentially refresh the next best QoS for that pool when we determine
3889 * the next request for the creator
3890 */
3891 bool cooperative_pool_sched_count_changed = false;
3892
3893 if (workq_thread_is_overcommit(uth)) {
3894 if (workq_tr_is_nonovercommit(tr_flags)) {
3895 // Case 1: thread is overcommit, req is non-overcommit
3896 wq->wq_constrained_threads_scheduled++;
3897 } else if (workq_tr_is_cooperative(tr_flags)) {
3898 // Case 2: thread is overcommit, req is cooperative
3899 _wq_cooperative_queue_scheduled_count_inc(wq, new_thread_qos);
3900 cooperative_pool_sched_count_changed = true;
3901 }
3902 } else if (workq_thread_is_cooperative(uth)) {
3903 if (workq_tr_is_overcommit(tr_flags)) {
3904 // Case 3: thread is cooperative, req is overcommit
3905 _wq_cooperative_queue_scheduled_count_dec(wq, old_thread_qos);
3906 } else if (workq_tr_is_nonovercommit(tr_flags)) {
3907 // Case 4: thread is cooperative, req is non-overcommit
3908 _wq_cooperative_queue_scheduled_count_dec(wq, old_thread_qos);
3909 wq->wq_constrained_threads_scheduled++;
3910 } else {
3911 // Case 5: thread is cooperative, req is also cooperative
3912 assert(workq_tr_is_cooperative(tr_flags));
3913 _wq_cooperative_queue_scheduled_count_dec(wq, old_thread_qos);
3914 _wq_cooperative_queue_scheduled_count_inc(wq, new_thread_qos);
3915 }
3916 cooperative_pool_sched_count_changed = true;
3917 } else {
3918 if (workq_tr_is_overcommit(tr_flags)) {
3919 // Case 6: Thread is non-overcommit, req is overcommit
3920 wq->wq_constrained_threads_scheduled--;
3921 } else if (workq_tr_is_cooperative(tr_flags)) {
3922 // Case 7: Thread is non-overcommit, req is cooperative
3923 wq->wq_constrained_threads_scheduled--;
3924 _wq_cooperative_queue_scheduled_count_inc(wq, new_thread_qos);
3925 cooperative_pool_sched_count_changed = true;
3926 }
3927 }
3928
3929 return cooperative_pool_sched_count_changed;
3930 }
3931
3932 static workq_threadreq_t
3933 workq_threadreq_select(struct workqueue *wq, struct uthread *uth)
3934 {
3935 workq_threadreq_t req_qos, req_pri, req_tmp, req_mgr;
3936 uintptr_t proprietor;
3937 thread_qos_t qos = THREAD_QOS_UNSPECIFIED;
3938 uint8_t pri = 0;
3939
3940 if (uth == wq->wq_creator) {
3941 uth = NULL;
3942 }
3943
3944 /*
3945 * Compute the best priority request (special or turnstile)
3946 */
3947
3948 pri = (uint8_t)turnstile_workq_proprietor_of_max_turnstile(wq->wq_turnstile,
3949 &proprietor);
3950 if (pri) {
3951 struct kqworkloop *kqwl = (struct kqworkloop *)proprietor;
3952 req_pri = &kqwl->kqwl_request;
3953 if (req_pri->tr_state != WORKQ_TR_STATE_QUEUED) {
3954 panic("Invalid thread request (%p) state %d",
3955 req_pri, req_pri->tr_state);
3956 }
3957 } else {
3958 req_pri = NULL;
3959 }
3960
3961 req_tmp = priority_queue_max(&wq->wq_special_queue,
3962 struct workq_threadreq_s, tr_entry);
3963 if (req_tmp && pri < priority_queue_entry_sched_pri(&wq->wq_special_queue,
3964 &req_tmp->tr_entry)) {
3965 req_pri = req_tmp;
3966 pri = (uint8_t)priority_queue_entry_sched_pri(&wq->wq_special_queue,
3967 &req_tmp->tr_entry);
3968 }
3969
3970 /*
3971 * Handle the manager thread request. The special queue might yield
3972 * a higher priority, but the manager always beats the QoS world.
3973 */
3974
3975 req_mgr = wq->wq_event_manager_threadreq;
3976 if (req_mgr && workq_may_start_event_mgr_thread(wq, uth)) {
3977 uint32_t mgr_pri = wq->wq_event_manager_priority;
3978
3979 if (mgr_pri & _PTHREAD_PRIORITY_SCHED_PRI_FLAG) {
3980 mgr_pri &= _PTHREAD_PRIORITY_SCHED_PRI_MASK;
3981 } else {
3982 mgr_pri = thread_workq_pri_for_qos(
3983 _pthread_priority_thread_qos(mgr_pri));
3984 }
3985
3986 return mgr_pri >= pri ? req_mgr : req_pri;
3987 }
3988
3989 /*
3990 * Compute the best QoS Request, and check whether it beats the "pri" one
3991 */
3992
3993 req_qos = priority_queue_max(&wq->wq_overcommit_queue,
3994 struct workq_threadreq_s, tr_entry);
3995 if (req_qos) {
3996 qos = req_qos->tr_qos;
3997 }
3998
3999 req_tmp = workq_cooperative_queue_best_req(wq, uth);
4000 if (req_tmp && qos <= req_tmp->tr_qos) {
4001 /*
4002 * The cooperative TR is the better of overcommit and cooperative. Note
4003 * that if the QoS is the same for both, we choose
4004 * cooperative.
4005 *
4006 * Pick cooperative pool if it passes the admissions check
4007 */
4008 if (workq_cooperative_allowance(wq, req_tmp->tr_qos, uth, true)) {
4009 req_qos = req_tmp;
4010 qos = req_qos->tr_qos;
4011 }
4012 }
4013
4014 /*
4015 * Compare the best QoS so far - either from overcommit or from cooperative
4016 * pool - and compare it with the constrained pool
4017 */
4018 req_tmp = priority_queue_max(&wq->wq_constrained_queue,
4019 struct workq_threadreq_s, tr_entry);
4020
4021 if (req_tmp && qos < req_tmp->tr_qos) {
4022 /*
4023 * Constrained pool is best in QoS between overcommit, cooperative
4024 * and constrained. Now check how it fares against the priority case.
4025 */
4026 if (pri && pri >= thread_workq_pri_for_qos(req_tmp->tr_qos)) {
4027 return req_pri;
4028 }
4029
4030 if (workq_constrained_allowance(wq, req_tmp->tr_qos, uth, true)) {
4031 /*
4032 * If the constrained thread request is the best one and passes
4033 * the admission check, pick it.
4034 */
4035 return req_tmp;
4036 }
4037 }
4038
4039 if (req_pri && (!qos || pri >= thread_workq_pri_for_qos(qos))) {
4040 return req_pri;
4041 }
4042
4043 return req_qos;
4044 }
4045
4046 /*
4047 * The creator is an anonymous thread, used to make other threads, that is
4048 * counted as scheduled but otherwise has no scheduler callback set and is
4049 * not tracked as active.
4050 *
4051 * When more requests are added or an existing one is hurried along,
4052 * a creator is elected and set up, or the existing one is overridden accordingly.
4053 *
4054 * While this creator is in flight, because no request has been dequeued,
4055 * already running threads have a chance at stealing thread requests, avoiding
4056 * useless context switches, and the creator, once scheduled, may not find any
4057 * work to do and will then just park again.
4058 *
4059 * The creator serves the dual purpose of informing the scheduler of work that
4060 * hasn't been materialized as threads yet, and of acting as a natural pacing
4061 * mechanism for thread creation.
4062 *
4063 * By being anonymous (and not bound to anything), thread requests
4064 * can be stolen from this creator by threads already on core, yielding more
4065 * efficient scheduling and fewer context switches.
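*
* Rough life cycle of the creator, as an informal sketch over the functions
* in this file (orientation only, not an exhaustive state machine):
*
*     request queued -> workq_schedule_creator() elects or overrides a creator
*         -> the creator wakes up in workq_unpark_continue()
*         -> workq_threadreq_select() picks the best admissible request
*         -> the creator either binds to that request (possibly electing a
*            new creator) or parks again in workq_park_and_unlock()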
4066 */
4067 static void
4068 workq_schedule_creator(proc_t p, struct workqueue *wq,
4069 workq_kern_threadreq_flags_t flags)
4070 {
4071 workq_threadreq_t req;
4072 struct uthread *uth;
4073 bool needs_wakeup;
4074
4075 workq_lock_held(wq);
4076 assert(p || (flags & WORKQ_THREADREQ_CAN_CREATE_THREADS) == 0);
4077
4078 again:
4079 uth = wq->wq_creator;
4080
4081 if (!wq->wq_reqcount) {
4082 /*
4083 * There is no thread request left.
4084 *
4085 * If there is a creator, leave everything in place, so that it cleans
4086 * up itself in workq_push_idle_thread().
4087 *
4088 * Else, make sure the turnstile state is reset to no inheritor.
4089 */
4090 if (uth == NULL) {
4091 workq_turnstile_update_inheritor(wq, TURNSTILE_INHERITOR_NULL, 0);
4092 }
4093 return;
4094 }
4095
4096 req = workq_threadreq_select_for_creator(wq);
4097 if (req == NULL) {
4098 /*
4099 * There isn't a thread request that passes the admission check.
4100 *
4101 * If there is a creator, do not touch anything, the creator will sort
4102 * it out when it runs.
4103 *
4104 * Else, set the inheritor to "WORKQ" so that the turnstile propagation
4105 * code calls us if anything changes.
4106 */
4107 if (uth == NULL) {
4108 workq_turnstile_update_inheritor(wq, wq, TURNSTILE_INHERITOR_WORKQ);
4109 }
4110 return;
4111 }
4112
4113
4114 if (uth) {
4115 /*
4116 * We need to maybe override the creator we already have
4117 */
4118 if (workq_thread_needs_priority_change(req, uth)) {
4119 WQ_TRACE_WQ(TRACE_wq_creator_select | DBG_FUNC_NONE,
4120 wq, 1, uthread_tid(uth), req->tr_qos);
4121 workq_thread_reset_pri(wq, uth, req, /*unpark*/ true);
4122 }
4123 assert(wq->wq_inheritor == get_machthread(uth));
4124 } else if (wq->wq_thidlecount) {
4125 /*
4126 * We need to unpark a creator thread
4127 */
4128 wq->wq_creator = uth = workq_pop_idle_thread(wq, UT_WORKQ_OVERCOMMIT,
4129 &needs_wakeup);
4130 /* Always reset the priorities on the newly chosen creator */
4131 workq_thread_reset_pri(wq, uth, req, /*unpark*/ true);
4132 workq_turnstile_update_inheritor(wq, get_machthread(uth),
4133 TURNSTILE_INHERITOR_THREAD);
4134 WQ_TRACE_WQ(TRACE_wq_creator_select | DBG_FUNC_NONE,
4135 wq, 2, uthread_tid(uth), req->tr_qos);
4136 uth->uu_save.uus_workq_park_data.fulfilled_snapshot = wq->wq_fulfilled;
4137 uth->uu_save.uus_workq_park_data.yields = 0;
4138 if (needs_wakeup) {
4139 workq_thread_wakeup(uth);
4140 }
4141 } else {
4142 /*
4143 * We need to allocate a thread...
4144 */
4145 if (__improbable(wq->wq_nthreads >= wq_max_threads)) {
4146 /* out of threads, just go away */
4147 flags = WORKQ_THREADREQ_NONE;
4148 } else if (flags & WORKQ_THREADREQ_SET_AST_ON_FAILURE) {
4149 act_set_astkevent(current_thread(), AST_KEVENT_REDRIVE_THREADREQ);
4150 } else if (!(flags & WORKQ_THREADREQ_CAN_CREATE_THREADS)) {
4151 /* This can drop the workqueue lock, and take it again */
4152 workq_schedule_immediate_thread_creation(wq);
4153 } else if (workq_add_new_idle_thread(p, wq)) {
4154 goto again;
4155 } else {
4156 workq_schedule_delayed_thread_creation(wq, 0);
4157 }
4158
4159 /*
4160 * If the current thread is the inheritor:
4161 *
4162 * If we set the AST, then the thread will stay the inheritor until
4163 * either the AST calls workq_kern_threadreq_redrive(), or it parks
4164 * and calls workq_push_idle_thread().
4165 *
4166 * Else, the responsibility for thread creation lies with a thread-call,
4167 * and we need to clear the inheritor.
4168 */
4169 if ((flags & WORKQ_THREADREQ_SET_AST_ON_FAILURE) == 0 &&
4170 wq->wq_inheritor == current_thread()) {
4171 workq_turnstile_update_inheritor(wq, TURNSTILE_INHERITOR_NULL, 0);
4172 }
4173 }
4174 }
4175
4176 /**
4177 * Same as workq_unpark_select_threadreq_or_park_and_unlock,
4178 * but do not allow early binds.
4179 *
4180 * Called with the base pri frozen, will unfreeze it.
4181 */
4182 __attribute__((noreturn, noinline))
4183 static void
4184 workq_select_threadreq_or_park_and_unlock(proc_t p, struct workqueue *wq,
4185 struct uthread *uth, uint32_t setup_flags)
4186 {
4187 workq_threadreq_t req = NULL;
4188 bool is_creator = (wq->wq_creator == uth);
4189 bool schedule_creator = false;
4190
4191 if (__improbable(_wq_exiting(wq))) {
4192 WQ_TRACE_WQ(TRACE_wq_select_threadreq | DBG_FUNC_NONE, wq, 0, 0, 0);
4193 goto park;
4194 }
4195
4196 if (wq->wq_reqcount == 0) {
4197 WQ_TRACE_WQ(TRACE_wq_select_threadreq | DBG_FUNC_NONE, wq, 1, 0, 0);
4198 goto park;
4199 }
4200
4201 req = workq_threadreq_select(wq, uth);
4202 if (__improbable(req == NULL)) {
4203 WQ_TRACE_WQ(TRACE_wq_select_threadreq | DBG_FUNC_NONE, wq, 2, 0, 0);
4204 goto park;
4205 }
4206
4207 thread_qos_t old_thread_bucket = uth->uu_workq_pri.qos_bucket;
4208 uint8_t tr_flags = req->tr_flags;
4209 struct turnstile *req_ts = kqueue_threadreq_get_turnstile(req);
4210
4211 /*
4212 * Attempt to set ourselves up as the new thing to run, moving all priority
4213 * pushes to ourselves.
4214 *
4215 * If the current thread is the creator, then the fact that we are presently
4216 * running is proof that we'll do something useful, so keep going.
4217 *
4218 * For other cases, peek at the AST to know whether the scheduler wants
4219 * to preempt us; if so, park instead, and move the thread request
4220 * turnstile back to the workqueue.
4221 */
4222 if (req_ts) {
4223 workq_perform_turnstile_operation_locked(wq, ^{
4224 turnstile_update_inheritor(req_ts, get_machthread(uth),
4225 TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_THREAD);
4226 turnstile_update_inheritor_complete(req_ts,
4227 TURNSTILE_INTERLOCK_HELD);
4228 });
4229 }
4230
4231 /* Accounting changes of the aggregate thscheduled_count and thactive, which
4232 * have to be paired with the workq_thread_reset_pri below so that
4233 * uth->uu_workq_pri matches thactive.
4234 *
4235 * This is undone when the thread parks. */
4236 if (is_creator) {
4237 WQ_TRACE_WQ(TRACE_wq_creator_select, wq, 4, 0,
4238 uth->uu_save.uus_workq_park_data.yields);
4239 wq->wq_creator = NULL;
4240 _wq_thactive_inc(wq, req->tr_qos);
4241 wq->wq_thscheduled_count[_wq_bucket(req->tr_qos)]++;
4242 } else if (old_thread_bucket != req->tr_qos) {
4243 _wq_thactive_move(wq, old_thread_bucket, req->tr_qos);
4244 }
4245 workq_thread_reset_pri(wq, uth, req, /*unpark*/ true);
4246
4247 /*
4248 * Make relevant accounting changes for pool specific counts.
4249 *
4250 * The schedule counts changing can affect what the next best request
4251 * for cooperative thread pool is if this request is dequeued.
4252 */
4253 bool cooperative_sched_count_changed =
4254 workq_adjust_cooperative_constrained_schedule_counts(wq, uth,
4255 old_thread_bucket, tr_flags);
4256
4257 if (workq_tr_is_overcommit(tr_flags)) {
4258 workq_thread_set_type(uth, UT_WORKQ_OVERCOMMIT);
4259 } else if (workq_tr_is_cooperative(tr_flags)) {
4260 workq_thread_set_type(uth, UT_WORKQ_COOPERATIVE);
4261 } else {
4262 workq_thread_set_type(uth, 0);
4263 }
4264
4265 if (__improbable(thread_unfreeze_base_pri(get_machthread(uth)) && !is_creator)) {
4266 if (req_ts) {
4267 workq_perform_turnstile_operation_locked(wq, ^{
4268 turnstile_update_inheritor(req_ts, wq->wq_turnstile,
4269 TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_TURNSTILE);
4270 turnstile_update_inheritor_complete(req_ts,
4271 TURNSTILE_INTERLOCK_HELD);
4272 });
4273 }
4274 WQ_TRACE_WQ(TRACE_wq_select_threadreq | DBG_FUNC_NONE, wq, 3, 0, 0);
4275 goto park_thawed;
4276 }
4277
4278 /*
4279 * We passed all checks, dequeue the request, bind to it, and set it up
4280 * to return to user.
4281 */
4282 WQ_TRACE_WQ(TRACE_wq_thread_logical_run | DBG_FUNC_START, wq,
4283 workq_trace_req_id(req), tr_flags, 0);
4284 wq->wq_fulfilled++;
4285 schedule_creator = workq_threadreq_dequeue(wq, req,
4286 cooperative_sched_count_changed);
4287
4288 workq_thread_reset_cpupercent(req, uth);
4289
4290 if (tr_flags & (WORKQ_TR_FLAG_KEVENT | WORKQ_TR_FLAG_WORKLOOP)) {
4291 kqueue_threadreq_bind_prepost(p, req, uth);
4292 req = NULL;
4293 } else if (req->tr_count > 0) {
4294 req = NULL;
4295 }
4296
4297 if (uth->uu_workq_flags & UT_WORKQ_NEW) {
4298 uth->uu_workq_flags ^= UT_WORKQ_NEW;
4299 setup_flags |= WQ_SETUP_FIRST_USE;
4300 }
4301
4302 /* If one of the following is true, call workq_schedule_creator (which also
4303 * adjusts the priority of an existing creator):
4304 *
4305 * - We are currently the creator, so the wq may need a new creator
4306 * - The request we're binding to is the highest priority one; the existing
4307 * creator's priority might need to be adjusted to reflect the next
4308 * highest TR
4309 */
4310 if (is_creator || schedule_creator) {
4311 /* This can drop the workqueue lock, and take it again */
4312 workq_schedule_creator(p, wq, WORKQ_THREADREQ_CAN_CREATE_THREADS);
4313 }
4314
4315 workq_unlock(wq);
4316
4317 if (req) {
4318 zfree(workq_zone_threadreq, req);
4319 }
4320
4321 /*
4322 * Run Thread, Run!
4323 */
4324 uint32_t upcall_flags = WQ_FLAG_THREAD_NEWSPI;
4325 if (uth->uu_workq_pri.qos_bucket == WORKQ_THREAD_QOS_MANAGER) {
4326 upcall_flags |= WQ_FLAG_THREAD_EVENT_MANAGER;
4327 } else if (workq_tr_is_overcommit(tr_flags)) {
4328 upcall_flags |= WQ_FLAG_THREAD_OVERCOMMIT;
4329 } else if (workq_tr_is_cooperative(tr_flags)) {
4330 upcall_flags |= WQ_FLAG_THREAD_COOPERATIVE;
4331 }
4332 if (tr_flags & WORKQ_TR_FLAG_KEVENT) {
4333 upcall_flags |= WQ_FLAG_THREAD_KEVENT;
4334 assert((upcall_flags & WQ_FLAG_THREAD_COOPERATIVE) == 0);
4335 }
4336
4337 if (tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
4338 upcall_flags |= WQ_FLAG_THREAD_WORKLOOP | WQ_FLAG_THREAD_KEVENT;
4339 }
4340 uth->uu_save.uus_workq_park_data.upcall_flags = upcall_flags;
4341
4342 if (tr_flags & (WORKQ_TR_FLAG_KEVENT | WORKQ_TR_FLAG_WORKLOOP)) {
4343 kqueue_threadreq_bind_commit(p, get_machthread(uth));
4344 } else {
4345 #if CONFIG_PREADOPT_TG
4346 /*
4347 * The thread may have a preadopt thread group on it already because it
4348 * got tagged with it as a creator thread. So we need to make sure to
4349 * clear that since we don't have preadoption for anonymous thread
4350 * requests
4351 */
4352 thread_set_preadopt_thread_group(get_machthread(uth), NULL);
4353 #endif
4354 }
4355
4356 workq_setup_and_run(p, uth, setup_flags);
4357 __builtin_unreachable();
4358
4359 park:
4360 thread_unfreeze_base_pri(get_machthread(uth));
4361 park_thawed:
4362 workq_park_and_unlock(p, wq, uth, setup_flags);
4363 }
4364
4365 /**
4366 * Runs a thread request on a thread
4367 *
4368 * - if thread is THREAD_NULL, will find a thread and run the request there.
4369 * Otherwise, the thread must be the current thread.
4370 *
4371 * - if req is NULL, will find the highest priority request and run that. If
4372 * it is not NULL, it must be a threadreq object in state NEW. If it can not
4373 * be run immediately, it will be enqueued and moved to state QUEUED.
4374 *
4375 * Either way, the thread request object serviced will be moved to state
4376 * BINDING and attached to the uthread.
4377 *
4378 * Should be called with the workqueue lock held. Will drop it.
4379 * Should be called with the base pri not frozen.
4380 */
4381 __attribute__((noreturn, noinline))
4382 static void
4383 workq_unpark_select_threadreq_or_park_and_unlock(proc_t p, struct workqueue *wq,
4384 struct uthread *uth, uint32_t setup_flags)
4385 {
4386 if (uth->uu_workq_flags & UT_WORKQ_EARLY_BOUND) {
4387 if (uth->uu_workq_flags & UT_WORKQ_NEW) {
4388 setup_flags |= WQ_SETUP_FIRST_USE;
4389 }
4390 uth->uu_workq_flags &= ~(UT_WORKQ_NEW | UT_WORKQ_EARLY_BOUND);
4391 /*
4392 * This pointer is possibly freed and only used for tracing purposes.
4393 */
4394 workq_threadreq_t req = uth->uu_save.uus_workq_park_data.thread_request;
4395 workq_unlock(wq);
4396 WQ_TRACE_WQ(TRACE_wq_thread_logical_run | DBG_FUNC_START, wq,
4397 VM_KERNEL_ADDRHIDE(req), 0, 0);
4398 (void)req;
4399
4400 workq_setup_and_run(p, uth, setup_flags);
4401 __builtin_unreachable();
4402 }
4403
4404 thread_freeze_base_pri(get_machthread(uth));
4405 workq_select_threadreq_or_park_and_unlock(p, wq, uth, setup_flags);
4406 }
4407
4408 static bool
4409 workq_creator_should_yield(struct workqueue *wq, struct uthread *uth)
4410 {
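/*
 * Heuristic: the creator yields (see the checks below) when work is getting
 * fulfilled without its help - i.e. requests were fulfilled since it was
 * dispatched and either that count exceeds the bucket's max parallelism, or
 * at least that many threads are already scheduled at or above its QoS.
 */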
4411 thread_qos_t qos = workq_pri_override(uth->uu_workq_pri);
4412
4413 if (qos >= THREAD_QOS_USER_INTERACTIVE) {
4414 return false;
4415 }
4416
4417 uint32_t snapshot = uth->uu_save.uus_workq_park_data.fulfilled_snapshot;
4418 if (wq->wq_fulfilled == snapshot) {
4419 return false;
4420 }
4421
4422 uint32_t cnt = 0, conc = wq_max_parallelism[_wq_bucket(qos)];
4423 if (wq->wq_fulfilled - snapshot > conc) {
4424 /* we fulfilled more than NCPU requests since being dispatched */
4425 WQ_TRACE_WQ(TRACE_wq_creator_yield, wq, 1,
4426 wq->wq_fulfilled, snapshot);
4427 return true;
4428 }
4429
4430 for (uint8_t i = _wq_bucket(qos); i < WORKQ_NUM_QOS_BUCKETS; i++) {
4431 cnt += wq->wq_thscheduled_count[i];
4432 }
4433 if (conc <= cnt) {
4434 /* We fulfilled requests and have more than NCPU scheduled threads */
4435 WQ_TRACE_WQ(TRACE_wq_creator_yield, wq, 2,
4436 wq->wq_fulfilled, snapshot);
4437 return true;
4438 }
4439
4440 return false;
4441 }
4442
4443 /**
4444 * parked thread wakes up
4445 */
4446 __attribute__((noreturn, noinline))
4447 static void
4448 workq_unpark_continue(void *parameter __unused, wait_result_t wr __unused)
4449 {
4450 thread_t th = current_thread();
4451 struct uthread *uth = get_bsdthread_info(th);
4452 proc_t p = current_proc();
4453 struct workqueue *wq = proc_get_wqptr_fast(p);
4454
4455 workq_lock_spin(wq);
4456
4457 if (wq->wq_creator == uth && workq_creator_should_yield(wq, uth)) {
4458 /*
4459 * If the number of threads we have out is able to keep up with the
4460 * demand, then we should avoid sending this creator thread to
4461 * userspace.
4462 */
4463 uth->uu_save.uus_workq_park_data.fulfilled_snapshot = wq->wq_fulfilled;
4464 uth->uu_save.uus_workq_park_data.yields++;
4465 workq_unlock(wq);
4466 thread_yield_with_continuation(workq_unpark_continue, NULL);
4467 __builtin_unreachable();
4468 }
4469
4470 if (__probable(uth->uu_workq_flags & UT_WORKQ_RUNNING)) {
4471 workq_unpark_select_threadreq_or_park_and_unlock(p, wq, uth, WQ_SETUP_NONE);
4472 __builtin_unreachable();
4473 }
4474
4475 if (__probable(wr == THREAD_AWAKENED)) {
4476 /*
4477 * We were set running, but for the purposes of dying.
4478 */
4479 assert(uth->uu_workq_flags & UT_WORKQ_DYING);
4480 assert((uth->uu_workq_flags & UT_WORKQ_NEW) == 0);
4481 } else {
4482 /*
4483 * workaround for <rdar://problem/38647347>,
4484 * in case we do hit userspace, make sure calling
4485 * workq_thread_terminate() does the right thing here,
4486 * and if we never call it, that workq_exit() will too because it sees
4487 * this thread on the runlist.
4488 */
4489 assert(wr == THREAD_INTERRUPTED);
4490 wq->wq_thdying_count++;
4491 uth->uu_workq_flags |= UT_WORKQ_DYING;
4492 }
4493
4494 workq_unpark_for_death_and_unlock(p, wq, uth,
4495 WORKQ_UNPARK_FOR_DEATH_WAS_IDLE, WQ_SETUP_NONE);
4496 __builtin_unreachable();
4497 }
4498
4499 __attribute__((noreturn, noinline))
4500 static void
4501 workq_setup_and_run(proc_t p, struct uthread *uth, int setup_flags)
4502 {
4503 thread_t th = get_machthread(uth);
4504 vm_map_t vmap = get_task_map(p->task);
4505
4506 if (setup_flags & WQ_SETUP_CLEAR_VOUCHER) {
4507 /*
4508 * For preemption reasons, we want to reset the voucher as late as
4509 * possible, so we do it in two places:
4510 * - Just before parking (i.e. in workq_park_and_unlock())
4511 * - Prior to doing the setup for the next workitem (i.e. here)
4512 *
4513 * Those two places are sufficient to ensure we always reset it before
4514 * it goes back out to user space, but be careful to not break that
4515 * guarantee.
4516 *
4517 * Note that setting the voucher to NULL will not clear the preadoption
4518 * thread group on this thread
4519 */
4520 __assert_only kern_return_t kr;
4521 kr = thread_set_voucher_name(MACH_PORT_NULL);
4522 assert(kr == KERN_SUCCESS);
4523 }
4524
4525 uint32_t upcall_flags = uth->uu_save.uus_workq_park_data.upcall_flags;
4526 if (!(setup_flags & WQ_SETUP_FIRST_USE)) {
4527 upcall_flags |= WQ_FLAG_THREAD_REUSE;
4528 }
4529
4530 if (uth->uu_workq_flags & UT_WORKQ_OUTSIDE_QOS) {
4531 /*
4532 * For threads that have an outside-of-QoS thread priority, indicate
4533 * to userspace that setting QoS should only affect the TSD and not
4534 * change QOS in the kernel.
4535 */
4536 upcall_flags |= WQ_FLAG_THREAD_OUTSIDEQOS;
4537 } else {
4538 /*
4539 * Put the QoS class value into the lower bits of the reuse_thread
4540 * register; this is where the thread priority used to be stored
4541 * anyway.
4542 */
4543 upcall_flags |= uth->uu_save.uus_workq_park_data.qos |
4544 WQ_FLAG_THREAD_PRIO_QOS;
4545 }
4546
4547 if (uth->uu_workq_thport == MACH_PORT_NULL) {
4548 /* convert_thread_to_port_pinned() consumes a reference */
4549 thread_reference(th);
4550 /* Convert to immovable/pinned thread port, but port is not pinned yet */
4551 ipc_port_t port = convert_thread_to_port_pinned(th);
4552 /* Atomically, pin and copy out the port */
4553 uth->uu_workq_thport = ipc_port_copyout_send_pinned(port, get_task_ipcspace(p->task));
4554 }
4555
4556 /* Thread has been set up to run; arm its next workqueue quantum, or disarm
4557 * it if the thread no longer supports that */
4558 if (thread_supports_cooperative_workqueue(th)) {
4559 thread_arm_workqueue_quantum(th);
4560 } else {
4561 thread_disarm_workqueue_quantum(th);
4562 }
4563
4564 /*
4565 * Call out to pthread, this sets up the thread, pulls in kevent structs
4566 * onto the stack, sets up the thread state and then returns to userspace.
4567 */
4568 WQ_TRACE_WQ(TRACE_wq_runthread | DBG_FUNC_START,
4569 proc_get_wqptr_fast(p), 0, 0, 0);
4570
4571 if (workq_thread_is_cooperative(uth)) {
4572 thread_sched_call(th, NULL);
4573 } else {
4574 thread_sched_call(th, workq_sched_callback);
4575 }
4576
4577 pthread_functions->workq_setup_thread(p, th, vmap, uth->uu_workq_stackaddr,
4578 uth->uu_workq_thport, 0, setup_flags, upcall_flags);
4579
4580 __builtin_unreachable();
4581 }
4582
4583 #pragma mark misc
4584
4585 int
4586 fill_procworkqueue(proc_t p, struct proc_workqueueinfo * pwqinfo)
4587 {
4588 struct workqueue *wq = proc_get_wqptr(p);
4589 int error = 0;
4590 int activecount;
4591
4592 if (wq == NULL) {
4593 return EINVAL;
4594 }
4595
4596 /*
4597 * This is sometimes called from interrupt context by the kperf sampler.
4598 * In that case, it's not safe to spin trying to take the lock since we
4599 * might already hold it. So, we just try-lock it and error out if it's
4600 * already held. Since this is just a debugging aid, and all our callers
4601 * are able to handle an error, that's fine.
4602 */
4603 bool locked = workq_lock_try(wq);
4604 if (!locked) {
4605 return EBUSY;
4606 }
4607
4608 wq_thactive_t act = _wq_thactive(wq);
4609 activecount = _wq_thactive_aggregate_downto_qos(wq, act,
4610 WORKQ_THREAD_QOS_MIN, NULL, NULL);
4611 if (act & _wq_thactive_offset_for_qos(WORKQ_THREAD_QOS_MANAGER)) {
4612 activecount++;
4613 }
4614 pwqinfo->pwq_nthreads = wq->wq_nthreads;
4615 pwqinfo->pwq_runthreads = activecount;
4616 pwqinfo->pwq_blockedthreads = wq->wq_threads_scheduled - activecount;
4617 pwqinfo->pwq_state = 0;
4618
4619 if (wq->wq_constrained_threads_scheduled >= wq_max_constrained_threads) {
4620 pwqinfo->pwq_state |= WQ_EXCEEDED_CONSTRAINED_THREAD_LIMIT;
4621 }
4622
4623 if (wq->wq_nthreads >= wq_max_threads) {
4624 pwqinfo->pwq_state |= WQ_EXCEEDED_TOTAL_THREAD_LIMIT;
4625 }
4626
4627 workq_unlock(wq);
4628 return error;
4629 }
4630
4631 boolean_t
4632 workqueue_get_pwq_exceeded(void *v, boolean_t *exceeded_total,
4633 boolean_t *exceeded_constrained)
4634 {
4635 proc_t p = v;
4636 struct proc_workqueueinfo pwqinfo;
4637 int err;
4638
4639 assert(p != NULL);
4640 assert(exceeded_total != NULL);
4641 assert(exceeded_constrained != NULL);
4642
4643 err = fill_procworkqueue(p, &pwqinfo);
4644 if (err) {
4645 return FALSE;
4646 }
4647 if (!(pwqinfo.pwq_state & WQ_FLAGS_AVAILABLE)) {
4648 return FALSE;
4649 }
4650
4651 *exceeded_total = (pwqinfo.pwq_state & WQ_EXCEEDED_TOTAL_THREAD_LIMIT);
4652 *exceeded_constrained = (pwqinfo.pwq_state & WQ_EXCEEDED_CONSTRAINED_THREAD_LIMIT);
4653
4654 return TRUE;
4655 }
4656
4657 uint32_t
4658 workqueue_get_pwq_state_kdp(void * v)
4659 {
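/*
 * The kTaskWq* snapshot flags are the WQ_* state bits shifted left by 17;
 * the static_asserts below keep the two sets of definitions in sync.
 */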
4660 static_assert((WQ_EXCEEDED_CONSTRAINED_THREAD_LIMIT << 17) ==
4661 kTaskWqExceededConstrainedThreadLimit);
4662 static_assert((WQ_EXCEEDED_TOTAL_THREAD_LIMIT << 17) ==
4663 kTaskWqExceededTotalThreadLimit);
4664 static_assert((WQ_FLAGS_AVAILABLE << 17) == kTaskWqFlagsAvailable);
4665 static_assert((WQ_FLAGS_AVAILABLE | WQ_EXCEEDED_TOTAL_THREAD_LIMIT |
4666 WQ_EXCEEDED_CONSTRAINED_THREAD_LIMIT) == 0x7);
4667
4668 if (v == NULL) {
4669 return 0;
4670 }
4671
4672 proc_t p = v;
4673 struct workqueue *wq = proc_get_wqptr(p);
4674
4675 if (wq == NULL || workq_lock_is_acquired_kdp(wq)) {
4676 return 0;
4677 }
4678
4679 uint32_t pwq_state = WQ_FLAGS_AVAILABLE;
4680
4681 if (wq->wq_constrained_threads_scheduled >= wq_max_constrained_threads) {
4682 pwq_state |= WQ_EXCEEDED_CONSTRAINED_THREAD_LIMIT;
4683 }
4684
4685 if (wq->wq_nthreads >= wq_max_threads) {
4686 pwq_state |= WQ_EXCEEDED_TOTAL_THREAD_LIMIT;
4687 }
4688
4689 return pwq_state;
4690 }
4691
4692 void
4693 workq_init(void)
4694 {
4695 clock_interval_to_absolutetime_interval(wq_stalled_window.usecs,
4696 NSEC_PER_USEC, &wq_stalled_window.abstime);
4697 clock_interval_to_absolutetime_interval(wq_reduce_pool_window.usecs,
4698 NSEC_PER_USEC, &wq_reduce_pool_window.abstime);
4699 clock_interval_to_absolutetime_interval(wq_max_timer_interval.usecs,
4700 NSEC_PER_USEC, &wq_max_timer_interval.abstime);
4701
4702 thread_deallocate_daemon_register_queue(&workq_deallocate_queue,
4703 workq_deallocate_queue_invoke);
4704 }
4705