1 /*
2 * Copyright (c) 2017 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29
30 #include <sys/work_interval.h>
31
32 #include <kern/work_interval.h>
33
34 #include <kern/thread.h>
35 #include <kern/sched_prim.h>
36 #include <kern/machine.h>
37 #include <kern/thread_group.h>
38 #include <kern/ipc_kobject.h>
39 #include <kern/task.h>
40 #include <kern/coalition.h>
41 #include <kern/policy_internal.h>
42 #include <kern/mpsc_queue.h>
43 #include <kern/workload_config.h>
44 #include <kern/assert.h>
45
46 #include <mach/kern_return.h>
47 #include <mach/notify.h>
48 #include <os/refcnt.h>
49
50 #include <stdatomic.h>
51
52 /*
53 * With the introduction of auto-join work intervals, it is possible
54 * to change the work interval (and related thread group) of a thread in a
55 * variety of contexts (thread termination, context switch, thread mode
56 * change etc.). In order to clearly specify the policy expectation and
57 * the locking behavior, all calls to thread_set_work_interval() pass
58 * in a set of flags.
59 */
60
/*
 * Option flags passed to thread_set_work_interval(). Callers pick exactly
 * one join policy (explicit vs. auto-join) and state whether they already
 * hold the thread lock; see the policy comment above.
 */
__options_decl(thread_work_interval_options_t, uint32_t, {
	/* Change the work interval using the explicit join rules */
	THREAD_WI_EXPLICIT_JOIN_POLICY = 0x1,
	/* Change the work interval using the auto-join rules */
	THREAD_WI_AUTO_JOIN_POLICY = 0x2,
	/* Caller already holds the thread lock */
	THREAD_WI_THREAD_LOCK_HELD = 0x4,
	/* Caller does not hold the thread lock */
	THREAD_WI_THREAD_LOCK_NEEDED = 0x8,
	/* Change the work interval from the context switch path (thread may not be running or on a runq) */
	THREAD_WI_THREAD_CTX_SWITCH = 0x10,
});
73
/* Core join/unjoin routine; consumes a +1 work interval ref on success */
static kern_return_t thread_set_work_interval(thread_t, struct work_interval *, thread_work_interval_options_t);
/* No-senders notification handler for work interval kobject ports */
static void work_interval_port_no_senders(ipc_port_t, mach_port_mscount_t);

IPC_KOBJECT_DEFINE(IKOT_WORK_INTERVAL,
    .iko_op_stable = true,
    .iko_op_no_senders = work_interval_port_no_senders);
80
#if CONFIG_SCHED_AUTO_JOIN
/*
 * MPSC queue used to defer deallocation of work intervals, since the last
 * release can happen from scheduler context with the thread lock held
 * (see work_interval_release()).
 */
static struct mpsc_daemon_queue work_interval_deallocate_queue;

static void work_interval_deferred_release(struct work_interval *);
/*
 * Work Interval Auto-Join Status
 *
 * work_interval_auto_join_status_t represents the state of auto-join for a given work interval.
 * It packs the following information into a single 32-bit atomic word:
 * - Bit 31: set if a "finish" is deferred on the work interval
 * - Bits 0-30: count of threads auto-joined to the work interval
 *
 * Note: the shift operand must be unsigned; left-shifting a signed int
 * into the sign bit (1 << 31) is undefined behavior in C.
 */
#define WORK_INTERVAL_STATUS_DEFERRED_FINISH_MASK ((uint32_t)(1u << 31))
#define WORK_INTERVAL_STATUS_AUTO_JOIN_COUNT_MASK ((uint32_t)(WORK_INTERVAL_STATUS_DEFERRED_FINISH_MASK - 1))
#define WORK_INTERVAL_STATUS_AUTO_JOIN_COUNT_MAX  WORK_INTERVAL_STATUS_AUTO_JOIN_COUNT_MASK
typedef uint32_t work_interval_auto_join_status_t;
99
100 static inline bool __unused
work_interval_status_deferred_finish(work_interval_auto_join_status_t status)101 work_interval_status_deferred_finish(work_interval_auto_join_status_t status)
102 {
103 return (status & WORK_INTERVAL_STATUS_DEFERRED_FINISH_MASK) ? true : false;
104 }
105
106 static inline uint32_t __unused
work_interval_status_auto_join_count(work_interval_auto_join_status_t status)107 work_interval_status_auto_join_count(work_interval_auto_join_status_t status)
108 {
109 return (uint32_t)(status & WORK_INTERVAL_STATUS_AUTO_JOIN_COUNT_MASK);
110 }
111
/*
 * struct work_interval_deferred_finish_state
 *
 * Snapshot of the parameters of a "finish" operation which is being
 * deferred. Copied out under the status rmw-loop so a racing
 * start-finish cycle can safely overwrite the live copy.
 */
struct work_interval_deferred_finish_state {
	uint64_t instance_id;
	uint64_t start;
	uint64_t deadline;
	uint64_t complexity;
};

/* Auto-join bookkeeping embedded in each work interval */
struct work_interval_auto_join_info {
	/* Parameters of a pending deferred finish; meaningful only while the
	 * deferred-finish bit is set in status */
	struct work_interval_deferred_finish_state deferred_finish_state;
	/* Packed deferred-finish bit + auto-join thread count */
	work_interval_auto_join_status_t _Atomic status;
};
128 #endif /* CONFIG_SCHED_AUTO_JOIN */
129
#if CONFIG_THREAD_GROUPS
/* Flags atomically set in wi_group_flags */
#define WORK_INTERVAL_GROUP_FLAGS_THREAD_JOINED 0x1
#endif
134
135 /*
136 * Work Interval structs
137 *
138 * This struct represents a thread group and/or work interval context
139 * in a mechanism that is represented with a kobject.
140 *
141 * Every thread that has joined a WI has a +1 ref, and the port
142 * has a +1 ref as well.
143 *
144 * TODO: groups need to have a 'is for WI' flag
145 * and they need a flag to create that says 'for WI'
146 * This would allow CLPC to avoid allocating WI support
147 * data unless it is needed
148 *
149 * TODO: Enforce not having more than one non-group joinable work
150 * interval per thread group.
151 * CLPC only wants to see one WI-notify callout per group.
152 */
153
struct work_interval {
	/* Unique identifier for this work interval */
	uint64_t wi_id;
	/* Refcount: +1 per joined thread, +1 held by the port */
	struct os_refcnt wi_ref_count;
	/* WORK_INTERVAL_TYPE_* and flag bits supplied at creation */
	uint32_t wi_create_flags;

	/* for debugging purposes only, does not hold a ref on port */
	ipc_port_t wi_port;

	/*
	 * holds uniqueid and version of creating process,
	 * used to permission-gate notify
	 * TODO: you'd think there would be a better way to do this
	 */
	uint64_t wi_creator_uniqueid;
	uint32_t wi_creator_pid;
	int wi_creator_pidversion;

	/* flags set by work_interval_set_workload_id and reflected onto
	 * thread->th_work_interval_flags upon join */
	uint32_t wi_wlid_flags;

#if CONFIG_THREAD_GROUPS
	/* WORK_INTERVAL_GROUP_FLAGS_* set atomically once a thread has joined */
	uint32_t wi_group_flags;
	struct thread_group *wi_group; /* holds +1 ref on group */
#endif /* CONFIG_THREAD_GROUPS */

#if CONFIG_SCHED_AUTO_JOIN
	/* Information related to auto-join and deferred finish for work interval */
	struct work_interval_auto_join_info wi_auto_join_info;

	/*
	 * Since the deallocation of auto-join work intervals
	 * can happen in the scheduler when the last thread in
	 * the WI blocks and the thread lock is held, the deallocation
	 * might have to be done on a separate thread.
	 */
	struct mpsc_queue_chain wi_deallocate_link;
#endif /* CONFIG_SCHED_AUTO_JOIN */

	/*
	 * Work interval class info - determines thread priority for threads
	 * with a work interval driven policy.
	 */
	wi_class_t wi_class;
	uint8_t wi_class_offset;
};
200
#if CONFIG_SCHED_AUTO_JOIN

/*
 * work_interval_perform_deferred_finish()
 *
 * Perform a deferred finish for a work interval. The routine accepts the deferred_finish_state as an
 * argument rather than looking at the work_interval since the deferred finish can race with another
 * start-finish cycle. To address that, the caller ensures that it gets a consistent snapshot of the
 * deferred state before calling this routine. This allows the racing start-finish cycle to overwrite
 * the deferred state without issues.
 *
 * Currently only emits a tracepoint; the deferred_finish_state snapshot is
 * not otherwise consumed here (hence the __unused annotations).
 */
static inline void
work_interval_perform_deferred_finish(__unused struct work_interval_deferred_finish_state *deferred_finish_state,
    __unused struct work_interval *work_interval, __unused thread_t thread)
{

	KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_WI_DEFERRED_FINISH),
	    thread_tid(thread), thread_group_get_id(work_interval->wi_group));
}
220
221 /*
222 * work_interval_auto_join_increment()
223 *
224 * Routine to increment auto-join counter when a new thread is auto-joined to
225 * the work interval.
226 */
227 static void
work_interval_auto_join_increment(struct work_interval * work_interval)228 work_interval_auto_join_increment(struct work_interval *work_interval)
229 {
230 struct work_interval_auto_join_info *join_info = &work_interval->wi_auto_join_info;
231 __assert_only work_interval_auto_join_status_t old_status = os_atomic_add_orig(&join_info->status, 1, relaxed);
232 assert(work_interval_status_auto_join_count(old_status) < WORK_INTERVAL_STATUS_AUTO_JOIN_COUNT_MAX);
233 }
234
235 /*
236 * work_interval_auto_join_decrement()
237 *
238 * Routine to decrement the auto-join counter when a thread unjoins the work interval (due to
239 * blocking or termination). If this was the last auto-joined thread in the work interval and
240 * there was a deferred finish, performs the finish operation for the work interval.
241 */
static void
work_interval_auto_join_decrement(struct work_interval *work_interval, thread_t thread)
{
	struct work_interval_auto_join_info *join_info = &work_interval->wi_auto_join_info;
	work_interval_auto_join_status_t old_status, new_status;
	struct work_interval_deferred_finish_state deferred_finish_state;
	bool perform_finish;

	/* Update the auto-join count for the work interval atomically */
	os_atomic_rmw_loop(&join_info->status, old_status, new_status, acquire, {
		perform_finish = false;
		new_status = old_status;
		assert(work_interval_status_auto_join_count(old_status) > 0);
		new_status -= 1;
		/* Count reached zero with the deferred-finish bit still set */
		if (new_status == WORK_INTERVAL_STATUS_DEFERRED_FINISH_MASK) {
			/* No auto-joined threads remaining and finish is deferred */
			new_status = 0;
			perform_finish = true;
			/*
			 * It's important to copy the deferred finish state here so that this works
			 * when racing with another start-finish cycle.
			 */
			deferred_finish_state = join_info->deferred_finish_state;
		}
	});

	if (perform_finish == true) {
		/*
		 * Since work_interval_perform_deferred_finish() calls down to
		 * the machine layer callout for finish which gets the thread
		 * group from the thread passed in here, it is important to
		 * make sure that the thread still has the work interval thread
		 * group here.
		 */
		assert(thread->thread_group == work_interval->wi_group);
		work_interval_perform_deferred_finish(&deferred_finish_state, work_interval, thread);
	}
}
280
281 /*
282 * work_interval_auto_join_enabled()
283 *
284 * Helper routine to check if work interval has auto-join enabled.
285 */
286 static inline bool
work_interval_auto_join_enabled(struct work_interval * work_interval)287 work_interval_auto_join_enabled(struct work_interval *work_interval)
288 {
289 return (work_interval->wi_create_flags & WORK_INTERVAL_FLAG_ENABLE_AUTO_JOIN) != 0;
290 }
291
292 /*
293 * work_interval_deferred_finish_enabled()
294 *
295 * Helper routine to check if work interval has deferred finish enabled.
296 */
297 static inline bool __unused
work_interval_deferred_finish_enabled(struct work_interval * work_interval)298 work_interval_deferred_finish_enabled(struct work_interval *work_interval)
299 {
300 return (work_interval->wi_create_flags & WORK_INTERVAL_FLAG_ENABLE_DEFERRED_FINISH) != 0;
301 }
302
303 #endif /* CONFIG_SCHED_AUTO_JOIN */
304
/* Take a +1 ref on the work interval. */
static inline void
work_interval_retain(struct work_interval *work_interval)
{
	/*
	 * Even though wi_retain is called under a port lock, we have
	 * to use os_ref_retain instead of os_ref_retain_locked
	 * because wi_release is not synchronized. wi_release calls
	 * os_ref_release which is unsafe to pair with os_ref_retain_locked.
	 */
	os_ref_retain(&work_interval->wi_ref_count);
}
316
/*
 * Free a work interval whose refcount has dropped to zero.
 * Releases the +1 ref held on the thread group (if any) and the
 * allocation itself. Must not be called with the thread lock held;
 * see work_interval_deferred_release() for that path.
 */
static inline void
work_interval_deallocate(struct work_interval *work_interval)
{
	KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_WORKGROUP, WORKGROUP_INTERVAL_DESTROY),
	    work_interval->wi_id);
#if CONFIG_THREAD_GROUPS
	if (work_interval->wi_group) {
		thread_group_release(work_interval->wi_group);
		work_interval->wi_group = NULL;
	}
#endif /* CONFIG_THREAD_GROUPS */
	kfree_type(struct work_interval, work_interval);
}
330
331 /*
332 * work_interval_release()
333 *
334 * Routine to release a ref count on the work interval. If the refcount goes down
335 * to zero, the work interval needs to be de-allocated.
336 *
337 * For non auto-join work intervals, they are de-allocated in this context.
338 *
339 * For auto-join work intervals, the de-allocation cannot be done from this context
340 * since that might need the kernel memory allocator lock. In that case, the
341 * deallocation is done via a thread-call based mpsc queue.
342 */
static void
work_interval_release(struct work_interval *work_interval, __unused thread_work_interval_options_t options)
{
	if (os_ref_release(&work_interval->wi_ref_count) == 0) {
#if CONFIG_SCHED_AUTO_JOIN
		/*
		 * With the thread lock held we cannot call into the allocator;
		 * punt the deallocation to the mpsc daemon queue instead.
		 */
		if (options & THREAD_WI_THREAD_LOCK_HELD) {
			work_interval_deferred_release(work_interval);
		} else {
			work_interval_deallocate(work_interval);
		}
#else /* CONFIG_SCHED_AUTO_JOIN */
		work_interval_deallocate(work_interval);
#endif /* CONFIG_SCHED_AUTO_JOIN */
	}
}
358
359 #if CONFIG_SCHED_AUTO_JOIN
360
361 /*
362 * work_interval_deferred_release()
363 *
364 * Routine to enqueue the work interval on the deallocation mpsc queue.
365 */
static void
work_interval_deferred_release(struct work_interval *work_interval)
{
	/* Safe from thread-lock-held context; the daemon queue does the free. */
	mpsc_daemon_enqueue(&work_interval_deallocate_queue,
	    &work_interval->wi_deallocate_link, MPSC_QUEUE_NONE);
}
372
373 /*
374 * work_interval_should_propagate()
375 *
376 * Main policy routine to decide if a thread should be auto-joined to
377 * another thread's work interval. The conditions are arranged such that
378 * the most common bailout condition are checked the earliest. This routine
379 * is called from the scheduler context; so it needs to be efficient and
380 * be careful when taking locks or performing wakeups.
381 */
382 inline bool
work_interval_should_propagate(thread_t cthread,thread_t thread)383 work_interval_should_propagate(thread_t cthread, thread_t thread)
384 {
385 /* Only allow propagation if the current thread has a work interval and the woken up thread does not */
386 if ((cthread->th_work_interval == NULL) || (thread->th_work_interval != NULL)) {
387 return false;
388 }
389
390 /* Only propagate work intervals which have auto-join enabled */
391 if (work_interval_auto_join_enabled(cthread->th_work_interval) == false) {
392 return false;
393 }
394
395 /* Work interval propagation is enabled for realtime threads only */
396 if ((cthread->sched_mode != TH_MODE_REALTIME) || (thread->sched_mode != TH_MODE_REALTIME)) {
397 return false;
398 }
399
400
401 /* Work interval propagation only works for threads with the same home thread group */
402 struct thread_group *thread_home_tg = thread_group_get_home_group(thread);
403 if (thread_group_get_home_group(cthread) != thread_home_tg) {
404 return false;
405 }
406
407 /* If woken up thread has adopted vouchers and other thread groups, it does not get propagation */
408 if (thread->thread_group != thread_home_tg) {
409 return false;
410 }
411
412 /* If either thread is inactive (in the termination path), do not propagate auto-join */
413 if ((!cthread->active) || (!thread->active)) {
414 return false;
415 }
416
417 return true;
418 }
419
420 /*
421 * work_interval_auto_join_propagate()
422 *
423 * Routine to auto-join a thread into another thread's work interval
424 *
425 * Should only be invoked if work_interval_should_propagate() returns
426 * true. Also expects "from" thread to be current thread and "to" thread
427 * to be locked.
428 */
void
work_interval_auto_join_propagate(thread_t from, thread_t to)
{
	assert(from == current_thread());
	/* +1 ref and +1 auto-join count transfer to "to" via thread_set_work_interval() */
	work_interval_retain(from->th_work_interval);
	work_interval_auto_join_increment(from->th_work_interval);
	__assert_only kern_return_t kr = thread_set_work_interval(to, from->th_work_interval,
	    THREAD_WI_AUTO_JOIN_POLICY | THREAD_WI_THREAD_LOCK_HELD | THREAD_WI_THREAD_CTX_SWITCH);
	assert(kr == KERN_SUCCESS);
}
439
440 /*
441 * work_interval_auto_join_unwind()
442 *
443 * Routine to un-join an auto-joined work interval for a thread that is blocking.
444 *
445 * Expects thread to be locked.
446 */
void
work_interval_auto_join_unwind(thread_t thread)
{
	/* Passing a NULL work interval un-joins; called from the block/context-switch path */
	__assert_only kern_return_t kr = thread_set_work_interval(thread, NULL,
	    THREAD_WI_AUTO_JOIN_POLICY | THREAD_WI_THREAD_LOCK_HELD | THREAD_WI_THREAD_CTX_SWITCH);
	assert(kr == KERN_SUCCESS);
}
454
455 /*
456 * work_interval_auto_join_demote()
457 *
458 * Routine to un-join an auto-joined work interval when a thread is changing from
459 * realtime to non-realtime scheduling mode. This could happen due to multiple
460 * reasons such as RT failsafe, thread backgrounding or thread termination. Also,
461 * the thread being demoted may not be the current thread.
462 *
463 * Expects thread to be locked.
464 */
void
work_interval_auto_join_demote(thread_t thread)
{
	/* Unlike unwind, no CTX_SWITCH flag: the demoted thread may be running remotely */
	__assert_only kern_return_t kr = thread_set_work_interval(thread, NULL,
	    THREAD_WI_AUTO_JOIN_POLICY | THREAD_WI_THREAD_LOCK_HELD);
	assert(kr == KERN_SUCCESS);
}
472
473 static void
work_interval_deallocate_queue_invoke(mpsc_queue_chain_t e,__assert_only mpsc_daemon_queue_t dq)474 work_interval_deallocate_queue_invoke(mpsc_queue_chain_t e,
475 __assert_only mpsc_daemon_queue_t dq)
476 {
477 struct work_interval *work_interval = NULL;
478 work_interval = mpsc_queue_element(e, struct work_interval, wi_deallocate_link);
479 assert(dq == &work_interval_deallocate_queue);
480 assert(os_ref_get_count(&work_interval->wi_ref_count) == 0);
481 work_interval_deallocate(work_interval);
482 }
483
484 #endif /* CONFIG_SCHED_AUTO_JOIN */
485
#if CONFIG_SCHED_AUTO_JOIN
/* One-time startup initialization of the deferred-deallocation queue. */
__startup_func
static void
work_interval_subsystem_init(void)
{
	/*
	 * The work interval deallocation queue must be a thread call based queue
	 * because it is woken up from contexts where the thread lock is held. The
	 * only way to perform wakeups safely in those contexts is to wakeup a
	 * thread call which is guaranteed to be on a different waitq and would
	 * not hash onto the same global waitq which might be currently locked.
	 */
	mpsc_daemon_queue_init_with_thread_call(&work_interval_deallocate_queue,
	    work_interval_deallocate_queue_invoke, THREAD_CALL_PRIORITY_KERNEL,
	    MPSC_DAEMON_INIT_NONE);
}
STARTUP(THREAD_CALL, STARTUP_RANK_MIDDLE, work_interval_subsystem_init);
#endif /* CONFIG_SCHED_AUTO_JOIN */
504
505 /*
506 * work_interval_port_convert
507 *
508 * Called with port locked, returns reference to work interval
509 * if indeed the port is a work interval kobject port
510 */
511 static struct work_interval *
work_interval_port_convert_locked(ipc_port_t port)512 work_interval_port_convert_locked(ipc_port_t port)
513 {
514 struct work_interval *work_interval = NULL;
515
516 if (IP_VALID(port)) {
517 work_interval = ipc_kobject_get_stable(port, IKOT_WORK_INTERVAL);
518 if (work_interval) {
519 work_interval_retain(work_interval);
520 }
521 }
522
523 return work_interval;
524 }
525
526 /*
527 * port_name_to_work_interval
528 *
529 * Description: Obtain a reference to the work_interval associated with a given port.
530 *
531 * Parameters: name A Mach port name to translate.
532 *
533 * Returns: NULL The given Mach port did not reference a work_interval.
534 * !NULL The work_interval that is associated with the Mach port.
535 */
536 static kern_return_t
port_name_to_work_interval(mach_port_name_t name,struct work_interval ** work_interval)537 port_name_to_work_interval(mach_port_name_t name,
538 struct work_interval **work_interval)
539 {
540 if (!MACH_PORT_VALID(name)) {
541 return KERN_INVALID_NAME;
542 }
543
544 ipc_port_t port = IPC_PORT_NULL;
545 kern_return_t kr = KERN_SUCCESS;
546
547 kr = ipc_port_translate_send(current_space(), name, &port);
548 if (kr != KERN_SUCCESS) {
549 return kr;
550 }
551 /* port is locked */
552
553 assert(IP_VALID(port));
554
555 struct work_interval *converted_work_interval;
556
557 converted_work_interval = work_interval_port_convert_locked(port);
558
559 /* the port is valid, but doesn't denote a work_interval */
560 if (converted_work_interval == NULL) {
561 kr = KERN_INVALID_CAPABILITY;
562 }
563
564 ip_mq_unlock(port);
565
566 if (kr == KERN_SUCCESS) {
567 *work_interval = converted_work_interval;
568 }
569
570 return kr;
571 }
572
573
574 /*
575 * work_interval_port_no_senders
576 *
577 * Description: Handle a no-senders notification for a work interval port.
578 * Destroys the port and releases its reference on the work interval.
579 *
580 * Parameters: msg A Mach no-senders notification message.
581 *
582 * Note: This assumes that there is only one create-right-from-work-interval point,
583 * if the ability to extract another send right after creation is added,
584 * this will have to change to handle make-send counts correctly.
585 */
static void
work_interval_port_no_senders(ipc_port_t port, mach_port_mscount_t mscount)
{
	struct work_interval *work_interval = NULL;

	/* Destroy the kobject port and recover the work interval it carried */
	work_interval = ipc_kobject_dealloc_port(port, mscount,
	    IKOT_WORK_INTERVAL);

	/* wi_port is a debugging back-pointer that holds no ref; clear it */
	work_interval->wi_port = MACH_PORT_NULL;

	/* Drop the port's +1 ref; this may deallocate the work interval */
	work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
}
598
599 /*
600 * work_interval_port_type()
601 *
602 * Converts a port name into the work interval object and returns its type.
603 *
604 * For invalid ports, it returns WORK_INTERVAL_TYPE_LAST (which is not a
605 * valid type for work intervals).
606 */
607 static uint32_t
work_interval_port_type(mach_port_name_t port_name)608 work_interval_port_type(mach_port_name_t port_name)
609 {
610 struct work_interval *work_interval = NULL;
611 kern_return_t kr;
612 uint32_t work_interval_type;
613
614 if (port_name == MACH_PORT_NULL) {
615 return WORK_INTERVAL_TYPE_LAST;
616 }
617
618 kr = port_name_to_work_interval(port_name, &work_interval);
619 if (kr != KERN_SUCCESS) {
620 return WORK_INTERVAL_TYPE_LAST;
621 }
622 /* work_interval has a +1 ref */
623
624 assert(work_interval != NULL);
625 work_interval_type = work_interval->wi_create_flags & WORK_INTERVAL_TYPE_MASK;
626 work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
627 return work_interval_type;
628 }
629
630 /*
631 * Sparse - not all work interval classes imply a scheduling policy change.
632 * Realtime threads are managed elsewhere.
633 */
/* Per-class priority/mode table, indexed by wi_class. Sparse: classes not
 * listed here have priority 0 / TH_MODE_NONE and drive no policy change. */
static const struct {
	int priority;           /* base priority before wi_class_offset is added */
	sched_mode_t sched_mode; /* scheduling mode applied on explicit join */
} work_interval_class_data[WI_CLASS_COUNT] = {
	[WI_CLASS_BEST_EFFORT] = {
		BASEPRI_DEFAULT,        // 31
		TH_MODE_TIMESHARE,
	},

	[WI_CLASS_SYSTEM_CRITICAL] = {
		MAXPRI_USER + 1,        // 64
		TH_MODE_FIXED,
	},
};
648
649 /*
650 * Called when a thread gets its scheduling priority from its associated work
651 * interval.
652 */
653 int
work_interval_get_priority(thread_t thread)654 work_interval_get_priority(thread_t thread)
655 {
656 const struct work_interval *work_interval = thread->th_work_interval;
657 assert(work_interval != NULL);
658
659 assert3u(work_interval->wi_class, <, WI_CLASS_COUNT);
660 int priority = work_interval_class_data[work_interval->wi_class].priority;
661 assert(priority != 0);
662
663 priority += work_interval->wi_class_offset;
664 assert3u(priority, <=, MAXPRI);
665
666 return priority;
667 }
668
669 /*
670 * Switch to a policy driven by the work interval (if applicable).
671 */
672 static void
work_interval_set_policy(thread_t thread)673 work_interval_set_policy(thread_t thread)
674 {
675 /*
676 * Ignore policy changes if the workload context shouldn't affect the
677 * scheduling policy.
678 */
679 workload_config_flags_t flags = WLC_F_NONE;
680
681 /* There may be no config at all. That's ok. */
682 if (workload_config_get_flags(&flags) != KERN_SUCCESS ||
683 (flags & WLC_F_THREAD_POLICY) == 0) {
684 return;
685 }
686
687 const struct work_interval *work_interval = thread->th_work_interval;
688 assert(work_interval != NULL);
689
690 assert3u(work_interval->wi_class, <, WI_CLASS_COUNT);
691 const sched_mode_t mode = work_interval_class_data[work_interval->wi_class].sched_mode;
692
693 if (mode == TH_MODE_NONE) {
694 return;
695 }
696
697 proc_set_thread_policy(thread, TASK_POLICY_ATTRIBUTE,
698 TASK_POLICY_WI_DRIVEN, mode);
699
700 assert(thread->requested_policy.thrp_wi_driven);
701
702 return;
703 }
704
705 /*
706 * Clear a work interval driven policy.
707 */
708 static void
work_interval_clear_policy(thread_t thread)709 work_interval_clear_policy(thread_t thread)
710 {
711 if (!thread->requested_policy.thrp_wi_driven) {
712 return;
713 }
714
715 proc_set_thread_policy(thread, TASK_POLICY_ATTRIBUTE,
716 TASK_POLICY_WI_DRIVEN, TH_MODE_NONE);
717 }
718
719 /*
720 * thread_set_work_interval()
721 *
722 * Change thread's bound work interval to the passed-in work interval
723 * Consumes +1 ref on work_interval upon success.
724 *
725 * May also pass NULL to un-set work_interval on the thread
726 * Will deallocate any old work interval on the thread
727 * Return error if thread does not satisfy requirements to join work interval
728 *
729 * For non auto-join work intervals, deallocate any old work interval on the thread
730 * For auto-join work intervals, the routine may wakeup the work interval deferred
731 * deallocation queue since thread locks might be currently held.
732 */
733 static kern_return_t
thread_set_work_interval(thread_t thread,struct work_interval * work_interval,thread_work_interval_options_t options)734 thread_set_work_interval(thread_t thread,
735 struct work_interval *work_interval, thread_work_interval_options_t options)
736 {
737 /* All explicit work interval operations should always be from the current thread */
738 if (options & THREAD_WI_EXPLICIT_JOIN_POLICY) {
739 assert(thread == current_thread());
740 }
741
742 /* All cases of needing the thread lock should be from explicit join scenarios */
743 if (options & THREAD_WI_THREAD_LOCK_NEEDED) {
744 assert((options & THREAD_WI_EXPLICIT_JOIN_POLICY) != 0);
745 }
746
747 /* For all cases of auto join must come in with the thread lock held */
748 if (options & THREAD_WI_AUTO_JOIN_POLICY) {
749 assert((options & THREAD_WI_THREAD_LOCK_HELD) != 0);
750 }
751
752 #if CONFIG_THREAD_GROUPS
753 if (work_interval && !work_interval->wi_group) {
754 /* Reject join on work intervals with deferred thread group creation */
755 return KERN_INVALID_ARGUMENT;
756 }
757 #endif /* CONFIG_THREAD_GROUPS */
758
759 if (work_interval) {
760 uint32_t work_interval_type = work_interval->wi_create_flags & WORK_INTERVAL_TYPE_MASK;
761
762 if (options & THREAD_WI_EXPLICIT_JOIN_POLICY) {
763 /* Ensure no kern_work_interval_set_workload_id can happen after this point */
764 uint32_t wlid_flags;
765 (void)os_atomic_cmpxchgv(&work_interval->wi_wlid_flags, 0,
766 WORK_INTERVAL_WORKLOAD_ID_ALREADY_JOINED, &wlid_flags, relaxed);
767 if (wlid_flags & WORK_INTERVAL_WORKLOAD_ID_RT_ALLOWED) {
768 /* For workload IDs with rt-allowed, neuter the check below to
769 * enable joining before the thread has become realtime for all
770 * work interval types */
771 work_interval_type = WORK_INTERVAL_TYPE_DEFAULT;
772 }
773 }
774
775 if ((work_interval_type == WORK_INTERVAL_TYPE_COREAUDIO) &&
776 (thread->sched_mode != TH_MODE_REALTIME) && (thread->saved_mode != TH_MODE_REALTIME)) {
777 return KERN_INVALID_ARGUMENT;
778 }
779 }
780
781 /*
782 * Ensure a work interval scheduling policy is not used if the thread is
783 * leaving the work interval.
784 */
785 if (work_interval == NULL &&
786 (options & THREAD_WI_EXPLICIT_JOIN_POLICY) != 0) {
787 work_interval_clear_policy(thread);
788 }
789
790 struct work_interval *old_th_wi = thread->th_work_interval;
791 #if CONFIG_SCHED_AUTO_JOIN
792 spl_t s;
793 /* Take the thread lock if needed */
794 if (options & THREAD_WI_THREAD_LOCK_NEEDED) {
795 s = splsched();
796 thread_lock(thread);
797 }
798
799 /*
800 * Work interval auto-join leak to non-RT threads.
801 *
802 * If thread might be running on a remote core and it's not in the context switch path (where
803 * thread is neither running, blocked or in the runq), its not possible to update the
804 * work interval & thread group remotely since its not possible to update CLPC for a remote
805 * core. This situation might happen when a thread is transitioning from realtime to
806 * non-realtime due to backgrounding etc., which would mean that non-RT threads would now
807 * be part of the work interval.
808 *
809 * Since there is no immediate mitigation to this issue, the policy is to set a new
810 * flag on the thread which indicates that such a "leak" has happened. This flag will
811 * be cleared when the remote thread eventually blocks and unjoins from the work interval.
812 */
813 bool thread_on_remote_core = ((thread != current_thread()) && (thread->state & TH_RUN) && (thread->runq == PROCESSOR_NULL));
814
815 if (thread_on_remote_core && ((options & THREAD_WI_THREAD_CTX_SWITCH) == 0)) {
816 assert((options & THREAD_WI_THREAD_LOCK_NEEDED) == 0);
817 os_atomic_or(&thread->th_work_interval_flags, TH_WORK_INTERVAL_FLAGS_AUTO_JOIN_LEAK, relaxed);
818 return KERN_SUCCESS;
819 }
820
821 const bool old_wi_auto_joined = ((thread->sched_flags & TH_SFLAG_THREAD_GROUP_AUTO_JOIN) != 0);
822
823 if ((options & THREAD_WI_AUTO_JOIN_POLICY) || old_wi_auto_joined) {
824 __kdebug_only uint64_t old_tg_id = (old_th_wi && old_th_wi->wi_group) ? thread_group_get_id(old_th_wi->wi_group) : ~0;
825 __kdebug_only uint64_t new_tg_id = (work_interval && work_interval->wi_group) ? thread_group_get_id(work_interval->wi_group) : ~0;
826 KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_WI_AUTO_JOIN),
827 thread_tid(thread), old_tg_id, new_tg_id, options);
828 }
829
830 if (old_wi_auto_joined) {
831 /*
832 * If thread was auto-joined to a work interval and is not realtime, make sure it
833 * happened due to the "leak" described above.
834 */
835 if (thread->sched_mode != TH_MODE_REALTIME) {
836 assert((thread->th_work_interval_flags & TH_WORK_INTERVAL_FLAGS_AUTO_JOIN_LEAK) != 0);
837 }
838
839 os_atomic_andnot(&thread->th_work_interval_flags, TH_WORK_INTERVAL_FLAGS_AUTO_JOIN_LEAK, relaxed);
840 work_interval_auto_join_decrement(old_th_wi, thread);
841 thread->sched_flags &= ~TH_SFLAG_THREAD_GROUP_AUTO_JOIN;
842 }
843
844 #endif /* CONFIG_SCHED_AUTO_JOIN */
845
846 KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_WORKGROUP, WORKGROUP_INTERVAL_CHANGE),
847 thread_tid(thread), (old_th_wi ? old_th_wi->wi_id : 0), (work_interval ? work_interval->wi_id : 0), !!(options & THREAD_WI_AUTO_JOIN_POLICY));
848
849 /* transfer +1 ref to thread */
850 thread->th_work_interval = work_interval;
851
852 #if CONFIG_SCHED_AUTO_JOIN
853
854 if ((options & THREAD_WI_AUTO_JOIN_POLICY) && work_interval) {
855 assert(work_interval_auto_join_enabled(work_interval) == true);
856 thread->sched_flags |= TH_SFLAG_THREAD_GROUP_AUTO_JOIN;
857 }
858
859 if (options & THREAD_WI_THREAD_LOCK_NEEDED) {
860 thread_unlock(thread);
861 splx(s);
862 }
863 #endif /* CONFIG_SCHED_AUTO_JOIN */
864
865 /*
866 * The thread got a new work interval. It may come with a work interval
867 * scheduling policy that needs to be applied.
868 */
869 if (work_interval != NULL &&
870 (options & THREAD_WI_EXPLICIT_JOIN_POLICY) != 0) {
871 work_interval_set_policy(thread);
872 }
873
874 #if CONFIG_THREAD_GROUPS
875 if (work_interval) {
876 /* Prevent thread_group_set_name after CLPC may have already heard
877 * about the thread group */
878 (void)os_atomic_cmpxchg(&work_interval->wi_group_flags, 0,
879 WORK_INTERVAL_GROUP_FLAGS_THREAD_JOINED, relaxed);
880 }
881 struct thread_group *new_tg = (work_interval) ? (work_interval->wi_group) : NULL;
882
883 if (options & THREAD_WI_AUTO_JOIN_POLICY) {
884 #if CONFIG_SCHED_AUTO_JOIN
885 thread_set_autojoin_thread_group_locked(thread, new_tg);
886 #endif
887 } else {
888 thread_set_work_interval_thread_group(thread, new_tg);
889 }
890 #endif /* CONFIG_THREAD_GROUPS */
891
892 if (options & THREAD_WI_EXPLICIT_JOIN_POLICY) {
893 /* Construct mask to XOR with th_work_interval_flags to clear the
894 * currently present flags and set the new flags in wlid_flags. */
895 uint32_t wlid_flags = 0;
896 if (work_interval) {
897 wlid_flags = os_atomic_load(&work_interval->wi_wlid_flags, relaxed);
898 }
899 thread_work_interval_flags_t th_wi_xor_mask = os_atomic_load(
900 &thread->th_work_interval_flags, relaxed);
901 th_wi_xor_mask &= (TH_WORK_INTERVAL_FLAGS_HAS_WORKLOAD_ID |
902 TH_WORK_INTERVAL_FLAGS_RT_ALLOWED |
903 TH_WORK_INTERVAL_FLAGS_RT_CRITICAL);
904 if (wlid_flags & WORK_INTERVAL_WORKLOAD_ID_HAS_ID) {
905 th_wi_xor_mask ^= TH_WORK_INTERVAL_FLAGS_HAS_WORKLOAD_ID;
906 if (wlid_flags & WORK_INTERVAL_WORKLOAD_ID_RT_ALLOWED) {
907 th_wi_xor_mask ^= TH_WORK_INTERVAL_FLAGS_RT_ALLOWED;
908 }
909 if (wlid_flags & WORK_INTERVAL_WORKLOAD_ID_RT_CRITICAL) {
910 th_wi_xor_mask ^= TH_WORK_INTERVAL_FLAGS_RT_CRITICAL;
911 }
912 }
913 if (th_wi_xor_mask) {
914 os_atomic_xor(&thread->th_work_interval_flags, th_wi_xor_mask, relaxed);
915 }
916
917 /*
918 * Now that the interval flags have been set, re-evaluate
919 * whether the thread needs to be undemoted - the new work
920 * interval may have the RT_ALLOWED flag. and the thread may
921 * have have a realtime policy but be demoted.
922 */
923 thread_rt_evaluate(thread);
924 }
925
926 if (old_th_wi != NULL) {
927 work_interval_release(old_th_wi, options);
928 }
929
930 return KERN_SUCCESS;
931 }
932
933 static kern_return_t
thread_set_work_interval_explicit_join(thread_t thread,struct work_interval * work_interval)934 thread_set_work_interval_explicit_join(thread_t thread, struct work_interval *work_interval)
935 {
936 assert(thread == current_thread());
937 return thread_set_work_interval(thread, work_interval, THREAD_WI_EXPLICIT_JOIN_POLICY | THREAD_WI_THREAD_LOCK_NEEDED);
938 }
939
940 kern_return_t
work_interval_thread_terminate(thread_t thread)941 work_interval_thread_terminate(thread_t thread)
942 {
943 assert(thread == current_thread());
944 if (thread->th_work_interval != NULL) {
945 return thread_set_work_interval(thread, NULL, THREAD_WI_EXPLICIT_JOIN_POLICY | THREAD_WI_THREAD_LOCK_NEEDED);
946 }
947 return KERN_SUCCESS;
948 }
949
950 kern_return_t
kern_work_interval_notify(thread_t thread,struct kern_work_interval_args * kwi_args)951 kern_work_interval_notify(thread_t thread, struct kern_work_interval_args* kwi_args)
952 {
953 assert(thread == current_thread());
954 assert(kwi_args->work_interval_id != 0);
955
956 struct work_interval *work_interval = thread->th_work_interval;
957
958 if (work_interval == NULL ||
959 work_interval->wi_id != kwi_args->work_interval_id) {
960 /* This thread must have adopted the work interval to be able to notify */
961 return KERN_INVALID_ARGUMENT;
962 }
963
964 task_t notifying_task = current_task();
965
966 if (work_interval->wi_creator_uniqueid != get_task_uniqueid(notifying_task) ||
967 work_interval->wi_creator_pidversion != get_task_version(notifying_task)) {
968 /* Only the creating task can do a notify */
969 return KERN_INVALID_ARGUMENT;
970 }
971
972 spl_t s = splsched();
973
974 #if CONFIG_THREAD_GROUPS
975 assert(work_interval->wi_group == thread->thread_group);
976 #endif /* CONFIG_THREAD_GROUPS */
977
978 uint64_t urgency_param1, urgency_param2;
979 kwi_args->urgency = (uint16_t)thread_get_urgency(thread, &urgency_param1, &urgency_param2);
980
981 splx(s);
982
983 /* called without interrupts disabled */
984 machine_work_interval_notify(thread, kwi_args);
985
986 return KERN_SUCCESS;
987 }
988
/*
 * Monotonic source of work interval IDs, advanced with a relaxed
 * atomic increment in kern_work_interval_create().
 * Start at 1, 0 is not a valid work interval ID.
 */
static _Atomic uint64_t unique_work_interval_id = 1;
991
/*
 * kern_work_interval_create()
 *
 * Allocate a new work interval for the calling thread.
 *
 * Two usage modes, selected by WORK_INTERVAL_FLAG_JOINABLE:
 *  - joinable: the interval is represented by a Mach send right returned
 *    in create_params->wica_port; other threads may join via that port.
 *  - legacy combined create+join: the calling thread adopts the interval
 *    immediately and no port is created (wica_port is MACH_PORT_NULL).
 *
 * On success, create_params->wica_id holds the newly assigned interval ID.
 * Returns KERN_FAILURE / KERN_NOT_SUPPORTED / KERN_RESOURCE_SHORTAGE on
 * the various validation and allocation failures described inline.
 */
kern_return_t
kern_work_interval_create(thread_t thread,
    struct kern_work_interval_create_args *create_params)
{
	assert(thread == current_thread());

	uint32_t create_flags = create_params->wica_create_flags;

	if (((create_flags & WORK_INTERVAL_FLAG_JOINABLE) == 0) &&
	    thread->th_work_interval != NULL) {
		/*
		 * If the thread is doing a legacy combined create and join,
		 * it shouldn't already be part of a work interval.
		 *
		 * (Creating a joinable WI is allowed anytime.)
		 */
		return KERN_FAILURE;
	}

	/*
	 * Check the validity of the create flags before allocating the work
	 * interval.
	 */
	task_t creating_task = current_task();
	if ((create_flags & WORK_INTERVAL_TYPE_MASK) == WORK_INTERVAL_TYPE_CA_CLIENT) {
		/*
		 * CA_CLIENT work intervals do not create new thread groups.
		 * There can only be one CA_CLIENT work interval (created by UIKit or AppKit)
		 * per each application task
		 */
		if (create_flags & WORK_INTERVAL_FLAG_GROUP) {
			return KERN_FAILURE;
		}
		if (!task_is_app(creating_task)) {
#if XNU_TARGET_OS_OSX
			/*
			 * Soft-fail the case of a non-app pretending to be an
			 * app, by allowing it to press the buttons, but they're
			 * not actually connected to anything.
			 */
			create_flags |= WORK_INTERVAL_FLAG_IGNORED;
#else
			/*
			 * On iOS, it's a hard failure to get your apptype
			 * wrong and then try to render something.
			 */
			return KERN_NOT_SUPPORTED;
#endif /* XNU_TARGET_OS_OSX */
		}
		if (task_set_ca_client_wi(creating_task, true) == false) {
			/* The task already has its one CA_CLIENT interval. */
			return KERN_FAILURE;
		}
	}

#if CONFIG_SCHED_AUTO_JOIN
	/* Auto-join is only supported for COREAUDIO intervals with their own group. */
	if (create_flags & WORK_INTERVAL_FLAG_ENABLE_AUTO_JOIN) {
		uint32_t type = (create_flags & WORK_INTERVAL_TYPE_MASK);
		if (type != WORK_INTERVAL_TYPE_COREAUDIO) {
			return KERN_NOT_SUPPORTED;
		}
		if ((create_flags & WORK_INTERVAL_FLAG_GROUP) == 0) {
			return KERN_NOT_SUPPORTED;
		}
	}

	/* Deferred finish is only meaningful when auto-join is enabled. */
	if (create_flags & WORK_INTERVAL_FLAG_ENABLE_DEFERRED_FINISH) {
		if ((create_flags & WORK_INTERVAL_FLAG_ENABLE_AUTO_JOIN) == 0) {
			return KERN_NOT_SUPPORTED;
		}
	}
#endif /* CONFIG_SCHED_AUTO_JOIN */

	struct work_interval *work_interval = kalloc_type(struct work_interval,
	    Z_WAITOK | Z_ZERO | Z_NOFAIL);

	uint64_t work_interval_id = os_atomic_inc(&unique_work_interval_id, relaxed);

	*work_interval = (struct work_interval) {
		.wi_id = work_interval_id,
		.wi_ref_count = {},
		.wi_create_flags = create_flags,
		.wi_creator_pid = pid_from_task(creating_task),
		.wi_creator_uniqueid = get_task_uniqueid(creating_task),
		.wi_creator_pidversion = get_task_version(creating_task),
	};
	/* Interval starts with a single +1 reference, owned below. */
	os_ref_init(&work_interval->wi_ref_count, NULL);

	__kdebug_only uint64_t tg_id = 0;
#if CONFIG_THREAD_GROUPS
	struct thread_group *tg;
	if ((create_flags &
	    (WORK_INTERVAL_FLAG_GROUP | WORK_INTERVAL_FLAG_HAS_WORKLOAD_ID)) ==
	    (WORK_INTERVAL_FLAG_GROUP | WORK_INTERVAL_FLAG_HAS_WORKLOAD_ID)) {
		/* defer creation of the thread group until the
		 * kern_work_interval_set_workload_id() call */
		work_interval->wi_group = NULL;
	} else if (create_flags & WORK_INTERVAL_FLAG_GROUP) {
		/* create a new group for the interval to represent */
		char name[THREAD_GROUP_MAXNAME] = "";

		snprintf(name, sizeof(name), "WI%lld (pid %d)", work_interval_id,
		    work_interval->wi_creator_pid);

		tg = thread_group_create_and_retain(THREAD_GROUP_FLAGS_DEFAULT);

		thread_group_set_name(tg, name);

		work_interval->wi_group = tg;
	} else {
		/* the interval represents the thread's home group */
		tg = thread_group_get_home_group(thread);

		thread_group_retain(tg);

		work_interval->wi_group = tg;
	}

	/* Capture the tg_id for tracing purposes */
	tg_id = work_interval->wi_group ? thread_group_get_id(work_interval->wi_group) : ~0;

#endif /* CONFIG_THREAD_GROUPS */

	if (create_flags & WORK_INTERVAL_FLAG_JOINABLE) {
		mach_port_name_t name = MACH_PORT_NULL;

		/* work_interval has a +1 ref, moves to the port */
		work_interval->wi_port = ipc_kobject_alloc_port(
			(ipc_kobject_t)work_interval, IKOT_WORK_INTERVAL,
			IPC_KOBJECT_ALLOC_MAKE_SEND | IPC_KOBJECT_ALLOC_NSREQUEST);

		name = ipc_port_copyout_send(work_interval->wi_port, current_space());

		if (!MACH_PORT_VALID(name)) {
			/*
			 * copyout failed (port is already deallocated)
			 * Because of the port-destroyed magic,
			 * the work interval is already deallocated too.
			 */
			return KERN_RESOURCE_SHORTAGE;
		}

		create_params->wica_port = name;
	} else {
		/* work_interval has a +1 ref, moves to the thread */
		kern_return_t kr = thread_set_work_interval_explicit_join(thread, work_interval);
		if (kr != KERN_SUCCESS) {
			/* No other thread can join this work interval since it isn't
			 * JOINABLE so release the reference on work interval */
			work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
			return kr;
		}
		create_params->wica_port = MACH_PORT_NULL;
	}

	create_params->wica_id = work_interval_id;

	/* tg_id stays ~0 when group creation was deferred; skip the tracepoint then. */
	if (tg_id != ~0) {
		KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_WORKGROUP, WORKGROUP_INTERVAL_CREATE),
		    work_interval_id, create_flags, pid_from_task(creating_task), tg_id);
	}
	return KERN_SUCCESS;
}
1154
1155 kern_return_t
kern_work_interval_get_flags_from_port(mach_port_name_t port_name,uint32_t * flags)1156 kern_work_interval_get_flags_from_port(mach_port_name_t port_name, uint32_t *flags)
1157 {
1158 assert(flags != NULL);
1159
1160 kern_return_t kr;
1161 struct work_interval *work_interval;
1162
1163 kr = port_name_to_work_interval(port_name, &work_interval);
1164 if (kr != KERN_SUCCESS) {
1165 return kr;
1166 }
1167
1168 assert(work_interval != NULL);
1169 *flags = work_interval->wi_create_flags;
1170
1171 work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
1172
1173 return KERN_SUCCESS;
1174 }
1175
1176 #if CONFIG_THREAD_GROUPS
1177 _Static_assert(WORK_INTERVAL_NAME_MAX == THREAD_GROUP_MAXNAME,
1178 "WORK_INTERVAL_NAME_MAX does not match THREAD_GROUP_MAXNAME");
1179 #endif /* CONFIG_THREAD_GROUPS */
1180
/*
 * kern_work_interval_set_name()
 *
 * Rename the thread group backing the work interval named by port_name.
 * Only permitted before any thread has joined the interval (CLPC may
 * already have observed the group afterwards), and only for intervals
 * that own their group (WORK_INTERVAL_FLAG_GROUP).
 *
 * `name` is marked __unused because it is only referenced when
 * CONFIG_THREAD_GROUPS is configured; the attribute silences the
 * unused-parameter warning in other configurations.
 *
 * Returns KERN_INVALID_ARGUMENT on oversize names, bad ports, already
 * joined intervals, or intervals without a group.
 */
kern_return_t
kern_work_interval_set_name(mach_port_name_t port_name, __unused char *name,
    size_t len)
{
	kern_return_t kr;
	struct work_interval *work_interval;

	if (len > WORK_INTERVAL_NAME_MAX) {
		return KERN_INVALID_ARGUMENT;
	}
	kr = port_name_to_work_interval(port_name, &work_interval);
	if (kr != KERN_SUCCESS) {
		return kr;
	}

	/* Holding a +1 ref from the port translation; released at return. */
	assert(work_interval != NULL);

#if CONFIG_THREAD_GROUPS
	/* Renaming after a thread joined would race with CLPC's view of the group. */
	uint32_t wi_group_flags = os_atomic_load(
		&work_interval->wi_group_flags, relaxed);
	if (wi_group_flags & WORK_INTERVAL_GROUP_FLAGS_THREAD_JOINED) {
		kr = KERN_INVALID_ARGUMENT;
		goto out;
	}
	if (!work_interval->wi_group) {
		kr = KERN_INVALID_ARGUMENT;
		goto out;
	}

	/* Empty names are silently accepted as a no-op rename. */
	if (name[0] && (work_interval->wi_create_flags & WORK_INTERVAL_FLAG_GROUP)) {
		char tgname[THREAD_GROUP_MAXNAME];
		snprintf(tgname, sizeof(tgname), "WI%lld %s", work_interval->wi_id,
		    name);
		thread_group_set_name(work_interval->wi_group, tgname);
	}

out:
#endif /* CONFIG_THREAD_GROUPS */
	work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);

	return kr;
}
1223
/*
 * kern_work_interval_set_workload_id()
 *
 * Attach a workload ID (by name) to the joinable work interval named by
 * port_name. The ID's configuration is looked up in the workload config
 * table when present; otherwise the caller-supplied create flags are
 * validated against the interval's. The first successful call also
 * performs the thread group creation that kern_work_interval_create()
 * deferred for WORK_INTERVAL_FLAG_GROUP + WORK_INTERVAL_FLAG_HAS_WORKLOAD_ID
 * intervals. A second call (before any thread joins) is treated as a
 * query and returns the previously set flags in workload_id_args.
 *
 * Returns KERN_INVALID_ARGUMENT for empty names, non-joinable intervals,
 * intervals created without WORK_INTERVAL_FLAG_HAS_WORKLOAD_ID, type
 * mismatches, or when a thread has already joined.
 */
kern_return_t
kern_work_interval_set_workload_id(mach_port_name_t port_name,
    struct kern_work_interval_workload_id_args *workload_id_args)
{
	kern_return_t kr;
	struct work_interval *work_interval;
	uint32_t wlida_flags = 0;
	uint32_t wlid_flags = 0;
#if CONFIG_THREAD_GROUPS
	uint32_t tg_flags = 0;
#endif
	bool from_workload_config = false;

	/* Ensure workload ID name is non-empty. */
	if (!workload_id_args->wlida_name[0]) {
		return KERN_INVALID_ARGUMENT;
	}

	kr = port_name_to_work_interval(port_name, &work_interval);
	if (kr != KERN_SUCCESS) {
		return kr;
	}

	/* Holding a +1 ref from the port translation; released at `out`. */
	assert(work_interval != NULL);
	if (!(work_interval->wi_create_flags & WORK_INTERVAL_FLAG_JOINABLE)) {
		kr = KERN_INVALID_ARGUMENT;
		goto out;
	}

	if (!(work_interval->wi_create_flags & WORK_INTERVAL_FLAG_HAS_WORKLOAD_ID)) {
		/* Reject work intervals that didn't indicate they will have a workload ID
		 * at creation. In particular if the work interval has its own thread group,
		 * its creation must have been deferred in kern_work_interval_create */
		kr = KERN_INVALID_ARGUMENT;
		goto out;
	}

	workload_config_t wl_config = {};
	kr = workload_config_lookup_default(workload_id_args->wlida_name, &wl_config);
	if (kr == KERN_SUCCESS) {
		/* The ID name is known: take flags/class from the config table. */
		if ((wl_config.wc_create_flags & WORK_INTERVAL_TYPE_MASK) !=
		    (work_interval->wi_create_flags & WORK_INTERVAL_TYPE_MASK)) {
			if ((wl_config.wc_create_flags & WORK_INTERVAL_TYPE_MASK) == WORK_INTERVAL_TYPE_CA_RENDER_SERVER &&
			    (work_interval->wi_create_flags & WORK_INTERVAL_TYPE_MASK) == WORK_INTERVAL_TYPE_FRAME_COMPOSITOR) {
				/* WORK_INTERVAL_TYPE_FRAME_COMPOSITOR is a valid related type of WORK_INTERVAL_TYPE_CA_RENDER_SERVER */
			} else {
				kr = KERN_INVALID_ARGUMENT;
				goto out;
			}
		}

		wlida_flags = wl_config.wc_flags;

		/* RT_CRITICAL is stripped here; presumably granted elsewhere — TODO confirm. */
		wlida_flags &= ~WORK_INTERVAL_WORKLOAD_ID_RT_CRITICAL;

#if CONFIG_THREAD_GROUPS
		tg_flags = wl_config.wc_thread_group_flags;
		if (tg_flags != THREAD_GROUP_FLAGS_ABSENT &&
		    (work_interval->wi_create_flags & WORK_INTERVAL_FLAG_GROUP) == 0) {
			/* Config demands group flags but the interval has no own group. */
			kr = KERN_INVALID_ARGUMENT;
			goto out;
		}
#endif /* CONFIG_THREAD_GROUPS */
		work_interval->wi_class = wl_config.wc_class;
		work_interval->wi_class_offset = wl_config.wc_class_offset;

		from_workload_config = true;
	} else {
		/* If the workload is not present in the table, perform basic validation
		 * that the create flags passed in match the ones used at work interval
		 * create time */
		if ((workload_id_args->wlida_wicreate_flags & WORK_INTERVAL_TYPE_MASK) !=
		    (work_interval->wi_create_flags & WORK_INTERVAL_TYPE_MASK)) {
			kr = KERN_INVALID_ARGUMENT;
			goto out;
		}

		/* Without any loaded config, unknown IDs default to RT-allowed. */
		const bool wc_avail = workload_config_available();
		if (!wc_avail) {
			wlida_flags = WORK_INTERVAL_WORKLOAD_ID_RT_ALLOWED;
		}

		/*
		 * If the workload config wasn't even loaded then fallback to
		 * older behaviour where the new thread group gets the default
		 * thread group flags (when WORK_INTERVAL_FLAG_GROUP is set).
		 */
#if CONFIG_THREAD_GROUPS
		if (!wc_avail) {
			tg_flags = THREAD_GROUP_FLAGS_DEFAULT;
		} else {
			/* Config loaded but ID unknown: inherit the caller's home group flags. */
			struct thread_group *home_group =
			    thread_group_get_home_group(current_thread());
			if (home_group != NULL) {
				tg_flags = thread_group_get_flags(home_group);
			}
		}
#endif /* CONFIG_THREAD_GROUPS */
	}

	/* Report back the flags the interval was actually created with. */
	workload_id_args->wlida_wicreate_flags = work_interval->wi_create_flags;

	/* cmpxchg a non-zero workload ID flags value (indicating that workload ID
	 * has been set). */
	wlida_flags |= WORK_INTERVAL_WORKLOAD_ID_HAS_ID;
	if (os_atomic_cmpxchgv(&work_interval->wi_wlid_flags, 0, wlida_flags,
	    &wlid_flags, relaxed)) {
		/* We won the race: this call owns the one-time ID setup. */
#if CONFIG_THREAD_GROUPS
		if (work_interval->wi_create_flags & WORK_INTERVAL_FLAG_GROUP) {
			/* Perform deferred thread group creation, now that tgflags are known */
			struct thread_group *tg;
			tg = thread_group_create_and_retain(tg_flags == THREAD_GROUP_FLAGS_ABSENT ?
			    THREAD_GROUP_FLAGS_DEFAULT : tg_flags);

			char tgname[THREAD_GROUP_MAXNAME] = "";
			snprintf(tgname, sizeof(tgname), "WI%lld %s", work_interval->wi_id,
			    workload_id_args->wlida_name);
			thread_group_set_name(tg, tgname);

			assert(work_interval->wi_group == NULL);
			work_interval->wi_group = tg;

			KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_WORKGROUP, WORKGROUP_INTERVAL_CREATE),
			    work_interval->wi_id, work_interval->wi_create_flags,
			    work_interval->wi_creator_pid, thread_group_get_id(tg));
		}
#endif /* CONFIG_THREAD_GROUPS */
	} else {
		/* Workload ID has previously been set (or a thread has already joined). */
		if (wlid_flags & WORK_INTERVAL_WORKLOAD_ID_ALREADY_JOINED) {
			kr = KERN_INVALID_ARGUMENT;
			goto out;
		}
		/* Treat this request as a query for the out parameters of the ID */
		workload_id_args->wlida_flags = wlid_flags;
	}

	/*
	 * Emit tracepoints for successfully setting the workload ID.
	 *
	 * After rdar://89342390 has been fixed and a new work interval ktrace
	 * provider has been added, it will be possible to associate a numeric
	 * ID with an ID name. Thus, for those cases where the ID name has been
	 * looked up successfully (`from_workload_config` is true) it will no
	 * longer be necessary to emit a tracepoint with the full ID name.
	 */
	KDBG(MACHDBG_CODE(DBG_MACH_WORKGROUP, WORKGROUP_INTERVAL_SET_WORKLOAD_ID),
	    work_interval->wi_id, from_workload_config);
	kernel_debug_string_simple(
		MACHDBG_CODE(DBG_MACH_WORKGROUP, WORKGROUP_INTERVAL_SET_WORKLOAD_ID_NAME),
		workload_id_args->wlida_name);

	kr = KERN_SUCCESS;

out:
	work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);

	return kr;
}
1383
1384
1385 kern_return_t
kern_work_interval_destroy(thread_t thread,uint64_t work_interval_id)1386 kern_work_interval_destroy(thread_t thread, uint64_t work_interval_id)
1387 {
1388 if (work_interval_id == 0) {
1389 return KERN_INVALID_ARGUMENT;
1390 }
1391
1392 if (thread->th_work_interval == NULL ||
1393 thread->th_work_interval->wi_id != work_interval_id) {
1394 /* work ID isn't valid or doesn't match joined work interval ID */
1395 return KERN_INVALID_ARGUMENT;
1396 }
1397
1398 return thread_set_work_interval_explicit_join(thread, NULL);
1399 }
1400
1401 kern_return_t
kern_work_interval_join(thread_t thread,mach_port_name_t port_name)1402 kern_work_interval_join(thread_t thread,
1403 mach_port_name_t port_name)
1404 {
1405 struct work_interval *work_interval = NULL;
1406 kern_return_t kr;
1407
1408 if (port_name == MACH_PORT_NULL) {
1409 /* 'Un-join' the current work interval */
1410 return thread_set_work_interval_explicit_join(thread, NULL);
1411 }
1412
1413 kr = port_name_to_work_interval(port_name, &work_interval);
1414 if (kr != KERN_SUCCESS) {
1415 return kr;
1416 }
1417 /* work_interval has a +1 ref */
1418
1419 assert(work_interval != NULL);
1420
1421 kr = thread_set_work_interval_explicit_join(thread, work_interval);
1422 /* ref was consumed by passing it to the thread in the successful case */
1423 if (kr != KERN_SUCCESS) {
1424 work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
1425 }
1426 return kr;
1427 }
1428
1429 /*
1430 * work_interval_port_type_render_server()
1431 *
1432 * Helper routine to determine if the port points to a
1433 * WORK_INTERVAL_TYPE_CA_RENDER_SERVER work interval.
1434 */
1435 bool
work_interval_port_type_render_server(mach_port_name_t port_name)1436 work_interval_port_type_render_server(mach_port_name_t port_name)
1437 {
1438 return work_interval_port_type(port_name) == WORK_INTERVAL_TYPE_CA_RENDER_SERVER;
1439 }
1440