1 /*
2 * Copyright (c) 2017 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29
30 #include <sys/work_interval.h>
31
32 #include <kern/work_interval.h>
33
34 #include <kern/thread.h>
35 #include <kern/sched_prim.h>
36 #include <kern/machine.h>
37 #include <kern/thread_group.h>
38 #include <kern/ipc_kobject.h>
39 #include <kern/task.h>
40 #include <kern/coalition.h>
41 #include <kern/policy_internal.h>
42 #include <kern/mpsc_queue.h>
43 #include <kern/workload_config.h>
44 #include <kern/assert.h>
45
46 #include <mach/kern_return.h>
47 #include <mach/notify.h>
48 #include <os/refcnt.h>
49
50 #include <stdatomic.h>
51
52 /*
53 * With the introduction of auto-join work intervals, it is possible
54 * to change the work interval (and related thread group) of a thread in a
55 * variety of contexts (thread termination, context switch, thread mode
56 * change etc.). In order to clearly specify the policy expectation and
57 * the locking behavior, all calls to thread_set_work_interval() pass
58 * in a set of flags.
59 */
60
/*
 * Option flags passed to every thread_set_work_interval() call to make the
 * caller's policy intent and locking state explicit (see block comment above).
 */
__options_decl(thread_work_interval_options_t, uint32_t, {
	/* Change the work interval using the explicit join rules */
	THREAD_WI_EXPLICIT_JOIN_POLICY = 0x1,
	/* Change the work interval using the auto-join rules */
	THREAD_WI_AUTO_JOIN_POLICY = 0x2,
	/* Caller already holds the thread lock */
	THREAD_WI_THREAD_LOCK_HELD = 0x4,
	/* Caller does not hold the thread lock */
	THREAD_WI_THREAD_LOCK_NEEDED = 0x8,
	/* Change the work interval from the context switch path (thread may not be running or on a runq) */
	THREAD_WI_THREAD_CTX_SWITCH = 0x10,
});
73
/* Forward declarations for routines defined later in this file */
static kern_return_t thread_set_work_interval(thread_t, struct work_interval *, thread_work_interval_options_t);
static void work_interval_port_no_senders(ipc_port_t, mach_port_mscount_t);

/*
 * Register the work-interval kobject type with the IPC layer.
 * The kobject pointer is stable for the port's lifetime and the
 * no-senders notification tears down the port (see
 * work_interval_port_no_senders below).
 */
IPC_KOBJECT_DEFINE(IKOT_WORK_INTERVAL,
    .iko_op_stable     = true,
    .iko_op_no_senders = work_interval_port_no_senders);
80
81 #if CONFIG_SCHED_AUTO_JOIN
82 /* MPSC queue used to defer deallocate work intervals */
83 static struct mpsc_daemon_queue work_interval_deallocate_queue;
84
85 static void work_interval_deferred_release(struct work_interval *);
86
87 /*
88 * Work Interval Auto-Join Status
89 *
90 * work_interval_auto_join_status_t represents the state of auto-join for a given work interval.
91 * It packs the following information:
92 * - A bit representing if a "finish" is deferred on the work interval
93 * - Count of number of threads auto-joined to the work interval
94 */
95 #define WORK_INTERVAL_STATUS_DEFERRED_FINISH_MASK ((uint32_t)(1 << 31))
96 #define WORK_INTERVAL_STATUS_AUTO_JOIN_COUNT_MASK ((uint32_t)(WORK_INTERVAL_STATUS_DEFERRED_FINISH_MASK - 1))
97 #define WORK_INTERVAL_STATUS_AUTO_JOIN_COUNT_MAX WORK_INTERVAL_STATUS_AUTO_JOIN_COUNT_MASK
98 typedef uint32_t work_interval_auto_join_status_t;
99
100 static inline bool __unused
work_interval_status_deferred_finish(work_interval_auto_join_status_t status)101 work_interval_status_deferred_finish(work_interval_auto_join_status_t status)
102 {
103 return (status & WORK_INTERVAL_STATUS_DEFERRED_FINISH_MASK) ? true : false;
104 }
105
106 static inline uint32_t __unused
work_interval_status_auto_join_count(work_interval_auto_join_status_t status)107 work_interval_status_auto_join_count(work_interval_auto_join_status_t status)
108 {
109 return (uint32_t)(status & WORK_INTERVAL_STATUS_AUTO_JOIN_COUNT_MASK);
110 }
111
112 /*
113 * struct work_interval_deferred_finish_state
114 *
115 * Contains the parameters of the finish operation which is being deferred.
116 */
117 struct work_interval_deferred_finish_state {
118 uint64_t instance_id;
119 uint64_t start;
120 uint64_t deadline;
121 uint64_t complexity;
122 };
123
124 struct work_interval_auto_join_info {
125 struct work_interval_deferred_finish_state deferred_finish_state;
126 work_interval_auto_join_status_t _Atomic status;
127 };
128 #endif /* CONFIG_SCHED_AUTO_JOIN */
129
#if CONFIG_THREAD_GROUPS
/* Flags atomically set in wi_group_flags */
#define WORK_INTERVAL_GROUP_FLAGS_THREAD_JOINED 0x1
#endif

/*
 * Work Interval struct
 *
 * This struct represents a thread group and/or work interval context
 * in a mechanism that is represented with a kobject.
 *
 * Every thread that has joined a WI has a +1 ref, and the port
 * has a +1 ref as well.
 *
 * TODO: groups need to have a 'is for WI' flag
 *       and they need a flag to create that says 'for WI'
 *       This would allow CLPC to avoid allocating WI support
 *       data unless it is needed
 *
 * TODO: Enforce not having more than one non-group joinable work
 *       interval per thread group.
 *       CLPC only wants to see one WI-notify callout per group.
 */
struct work_interval {
	uint64_t wi_id;                 /* unique identifier for this work interval */
	struct os_refcnt wi_ref_count;  /* see work_interval_retain/release */
	uint32_t wi_create_flags;       /* WORK_INTERVAL_FLAG_* | WORK_INTERVAL_TYPE_* from creation */

	/* for debugging purposes only, does not hold a ref on port */
	ipc_port_t wi_port;

	/*
	 * holds uniqueid and version of creating process,
	 * used to permission-gate notify
	 * TODO: you'd think there would be a better way to do this
	 */
	uint64_t wi_creator_uniqueid;
	uint32_t wi_creator_pid;
	int wi_creator_pidversion;

	/* flags set by work_interval_set_workload_id and reflected onto
	 * thread->th_work_interval_flags upon join */
	uint32_t wi_wlid_flags;

#if CONFIG_THREAD_GROUPS
	uint32_t wi_group_flags;        /* WORK_INTERVAL_GROUP_FLAGS_*, set atomically */
	struct thread_group *wi_group;  /* holds +1 ref on group */
#endif /* CONFIG_THREAD_GROUPS */

#if CONFIG_SCHED_AUTO_JOIN
	/* Information related to auto-join and deferred finish for work interval */
	struct work_interval_auto_join_info wi_auto_join_info;

	/*
	 * Since the deallocation of auto-join work intervals
	 * can happen in the scheduler when the last thread in
	 * the WI blocks and the thread lock is held, the deallocation
	 * might have to be done on a separate thread.
	 */
	struct mpsc_queue_chain wi_deallocate_link;
#endif /* CONFIG_SCHED_AUTO_JOIN */

	/*
	 * Work interval class info - determines thread priority for threads
	 * with a work interval driven policy.
	 */
	wi_class_t wi_class;
	uint8_t wi_class_offset;        /* added to the class base priority */

	/* telemetry tracks; only initialized when telemetry is enabled */
	struct recount_work_interval wi_recount;
};
201
202 /*
203 * work_interval_telemetry_data_enabled()
204 *
205 * Helper routine to check if work interval has the collection of telemetry data enabled.
206 */
207 static inline bool
work_interval_telemetry_data_enabled(struct work_interval * work_interval)208 work_interval_telemetry_data_enabled(struct work_interval *work_interval)
209 {
210 return (work_interval->wi_create_flags & WORK_INTERVAL_FLAG_ENABLE_TELEMETRY_DATA) != 0;
211 }
212
213 /*
214 * work_interval_should_collect_telemetry_from_thread()
215 *
216 * Helper routine to determine whether any work interval telemetry should be collected
217 * for a thread.
218 */
219 static inline bool
work_interval_should_collect_telemetry_from_thread(thread_t thread)220 work_interval_should_collect_telemetry_from_thread(thread_t thread)
221 {
222 if (thread->th_work_interval == NULL) {
223 return false;
224 }
225 return work_interval_telemetry_data_enabled(thread->th_work_interval);
226 }
227
228 /*
229 * work_interval_get_recount_tracks()
230 *
231 * Returns the recount tracks associated with a work interval, or NULL
232 * if the work interval is NULL or has telemetry disabled.
233 */
234 inline struct recount_track *
work_interval_get_recount_tracks(struct work_interval * work_interval)235 work_interval_get_recount_tracks(struct work_interval *work_interval)
236 {
237 if (work_interval != NULL && work_interval_telemetry_data_enabled(work_interval)) {
238 return work_interval->wi_recount.rwi_current_instance;
239 }
240 return NULL;
241 }
242
243 #if CONFIG_SCHED_AUTO_JOIN
244
245 /*
246 * work_interval_perform_deferred_finish()
247 *
248 * Perform a deferred finish for a work interval. The routine accepts the deferred_finish_state as an
249 * argument rather than looking at the work_interval since the deferred finish can race with another
250 * start-finish cycle. To address that, the caller ensures that it gets a consistent snapshot of the
251 * deferred state before calling this routine. This allows the racing start-finish cycle to overwrite
252 * the deferred state without issues.
253 */
254 static inline void
work_interval_perform_deferred_finish(__unused struct work_interval_deferred_finish_state * deferred_finish_state,__unused struct work_interval * work_interval,__unused thread_t thread)255 work_interval_perform_deferred_finish(__unused struct work_interval_deferred_finish_state *deferred_finish_state,
256 __unused struct work_interval *work_interval, __unused thread_t thread)
257 {
258
259 KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_WI_DEFERRED_FINISH),
260 thread_tid(thread), thread_group_get_id(work_interval->wi_group));
261 }
262
263 /*
264 * work_interval_auto_join_increment()
265 *
266 * Routine to increment auto-join counter when a new thread is auto-joined to
267 * the work interval.
268 */
269 static void
work_interval_auto_join_increment(struct work_interval * work_interval)270 work_interval_auto_join_increment(struct work_interval *work_interval)
271 {
272 struct work_interval_auto_join_info *join_info = &work_interval->wi_auto_join_info;
273 __assert_only work_interval_auto_join_status_t old_status = os_atomic_add_orig(&join_info->status, 1, relaxed);
274 assert(work_interval_status_auto_join_count(old_status) < WORK_INTERVAL_STATUS_AUTO_JOIN_COUNT_MAX);
275 }
276
277 /*
278 * work_interval_auto_join_decrement()
279 *
280 * Routine to decrement the auto-join counter when a thread unjoins the work interval (due to
281 * blocking or termination). If this was the last auto-joined thread in the work interval and
282 * there was a deferred finish, performs the finish operation for the work interval.
283 */
284 static void
work_interval_auto_join_decrement(struct work_interval * work_interval,thread_t thread)285 work_interval_auto_join_decrement(struct work_interval *work_interval, thread_t thread)
286 {
287 struct work_interval_auto_join_info *join_info = &work_interval->wi_auto_join_info;
288 work_interval_auto_join_status_t old_status, new_status;
289 struct work_interval_deferred_finish_state deferred_finish_state;
290 bool perform_finish;
291
292 /* Update the auto-join count for the work interval atomically */
293 os_atomic_rmw_loop(&join_info->status, old_status, new_status, acquire, {
294 perform_finish = false;
295 new_status = old_status;
296 assert(work_interval_status_auto_join_count(old_status) > 0);
297 new_status -= 1;
298 if (new_status == WORK_INTERVAL_STATUS_DEFERRED_FINISH_MASK) {
299 /* No auto-joined threads remaining and finish is deferred */
300 new_status = 0;
301 perform_finish = true;
302 /*
303 * Its important to copy the deferred finish state here so that this works
304 * when racing with another start-finish cycle.
305 */
306 deferred_finish_state = join_info->deferred_finish_state;
307 }
308 });
309
310 if (perform_finish == true) {
311 /*
312 * Since work_interval_perform_deferred_finish() calls down to
313 * the machine layer callout for finish which gets the thread
314 * group from the thread passed in here, it is important to
315 * make sure that the thread still has the work interval thread
316 * group here.
317 */
318 assert(thread->thread_group == work_interval->wi_group);
319 work_interval_perform_deferred_finish(&deferred_finish_state, work_interval, thread);
320 }
321 }
322
323 /*
324 * work_interval_auto_join_enabled()
325 *
326 * Helper routine to check if work interval has auto-join enabled.
327 */
328 static inline bool
work_interval_auto_join_enabled(struct work_interval * work_interval)329 work_interval_auto_join_enabled(struct work_interval *work_interval)
330 {
331 return (work_interval->wi_create_flags & WORK_INTERVAL_FLAG_ENABLE_AUTO_JOIN) != 0;
332 }
333
334 /*
335 * work_interval_deferred_finish_enabled()
336 *
337 * Helper routine to check if work interval has deferred finish enabled.
338 */
339 static inline bool __unused
work_interval_deferred_finish_enabled(struct work_interval * work_interval)340 work_interval_deferred_finish_enabled(struct work_interval *work_interval)
341 {
342 return (work_interval->wi_create_flags & WORK_INTERVAL_FLAG_ENABLE_DEFERRED_FINISH) != 0;
343 }
344
345 #endif /* CONFIG_SCHED_AUTO_JOIN */
346
/*
 * work_interval_retain()
 *
 * Take a +1 reference on the work interval. Pairs with
 * work_interval_release().
 */
static inline void
work_interval_retain(struct work_interval *work_interval)
{
	/*
	 * Even though wi_retain is called under a port lock, we have
	 * to use os_ref_retain instead of os_ref_retain_locked
	 * because wi_release is not synchronized. wi_release calls
	 * os_ref_release which is unsafe to pair with os_ref_retain_locked.
	 */
	os_ref_retain(&work_interval->wi_ref_count);
}
358
/*
 * work_interval_deallocate()
 *
 * Free the work interval's memory, tearing down telemetry state first if
 * it was enabled. Must only be called once the refcount has reached zero
 * (see work_interval_release / the deferred deallocation queue).
 */
static inline void
work_interval_deallocate(struct work_interval *work_interval)
{
	KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_WORKGROUP, WORKGROUP_INTERVAL_DESTROY),
	    work_interval->wi_id);
	if (work_interval_telemetry_data_enabled(work_interval)) {
		recount_work_interval_deinit(&work_interval->wi_recount);
	}
	kfree_type(struct work_interval, work_interval);
}
369
370 /*
371 * work_interval_release()
372 *
373 * Routine to release a ref count on the work interval. If the refcount goes down
374 * to zero, the work interval needs to be de-allocated.
375 *
376 * For non auto-join work intervals, they are de-allocated in this context.
377 *
378 * For auto-join work intervals, the de-allocation cannot be done from this context
379 * since that might need the kernel memory allocator lock. In that case, the
380 * deallocation is done via a thread-call based mpsc queue.
381 */
382 static void
work_interval_release(struct work_interval * work_interval,__unused thread_work_interval_options_t options)383 work_interval_release(struct work_interval *work_interval, __unused thread_work_interval_options_t options)
384 {
385 if (os_ref_release(&work_interval->wi_ref_count) == 0) {
386 #if CONFIG_SCHED_AUTO_JOIN
387 if (options & THREAD_WI_THREAD_LOCK_HELD) {
388 work_interval_deferred_release(work_interval);
389 } else {
390 work_interval_deallocate(work_interval);
391 }
392 #else /* CONFIG_SCHED_AUTO_JOIN */
393 work_interval_deallocate(work_interval);
394 #endif /* CONFIG_SCHED_AUTO_JOIN */
395 }
396 }
397
398 #if CONFIG_SCHED_AUTO_JOIN
399
400 /*
401 * work_interval_deferred_release()
402 *
403 * Routine to enqueue the work interval on the deallocation mpsc queue.
404 */
405 static void
work_interval_deferred_release(struct work_interval * work_interval)406 work_interval_deferred_release(struct work_interval *work_interval)
407 {
408 mpsc_daemon_enqueue(&work_interval_deallocate_queue,
409 &work_interval->wi_deallocate_link, MPSC_QUEUE_NONE);
410 }
411
412 /*
413 * work_interval_should_propagate()
414 *
415 * Main policy routine to decide if a thread should be auto-joined to
416 * another thread's work interval. The conditions are arranged such that
417 * the most common bailout condition are checked the earliest. This routine
418 * is called from the scheduler context; so it needs to be efficient and
419 * be careful when taking locks or performing wakeups.
420 */
421 inline bool
work_interval_should_propagate(thread_t cthread,thread_t thread)422 work_interval_should_propagate(thread_t cthread, thread_t thread)
423 {
424 /* Only allow propagation if the current thread has a work interval and the woken up thread does not */
425 if ((cthread->th_work_interval == NULL) || (thread->th_work_interval != NULL)) {
426 return false;
427 }
428
429 /* Only propagate work intervals which have auto-join enabled */
430 if (work_interval_auto_join_enabled(cthread->th_work_interval) == false) {
431 return false;
432 }
433
434 /* Work interval propagation is enabled for realtime threads only */
435 if ((cthread->sched_mode != TH_MODE_REALTIME) || (thread->sched_mode != TH_MODE_REALTIME)) {
436 return false;
437 }
438
439
440 /* Work interval propagation only works for threads with the same home thread group */
441 struct thread_group *thread_home_tg = thread_group_get_home_group(thread);
442 if (thread_group_get_home_group(cthread) != thread_home_tg) {
443 return false;
444 }
445
446 /* If woken up thread has adopted vouchers and other thread groups, it does not get propagation */
447 if (thread->thread_group != thread_home_tg) {
448 return false;
449 }
450
451 /* If either thread is inactive (in the termination path), do not propagate auto-join */
452 if ((!cthread->active) || (!thread->active)) {
453 return false;
454 }
455
456 return true;
457 }
458
459 /*
460 * work_interval_auto_join_propagate()
461 *
462 * Routine to auto-join a thread into another thread's work interval
463 *
464 * Should only be invoked if work_interval_should_propagate() returns
465 * true. Also expects "from" thread to be current thread and "to" thread
466 * to be locked.
467 */
468 void
work_interval_auto_join_propagate(thread_t from,thread_t to)469 work_interval_auto_join_propagate(thread_t from, thread_t to)
470 {
471 assert(from == current_thread());
472 work_interval_retain(from->th_work_interval);
473 work_interval_auto_join_increment(from->th_work_interval);
474 __assert_only kern_return_t kr = thread_set_work_interval(to, from->th_work_interval,
475 THREAD_WI_AUTO_JOIN_POLICY | THREAD_WI_THREAD_LOCK_HELD | THREAD_WI_THREAD_CTX_SWITCH);
476 assert(kr == KERN_SUCCESS);
477 }
478
479 /*
480 * work_interval_auto_join_unwind()
481 *
482 * Routine to un-join an auto-joined work interval for a thread that is blocking.
483 *
484 * Expects thread to be locked.
485 */
486 void
work_interval_auto_join_unwind(thread_t thread)487 work_interval_auto_join_unwind(thread_t thread)
488 {
489 __assert_only kern_return_t kr = thread_set_work_interval(thread, NULL,
490 THREAD_WI_AUTO_JOIN_POLICY | THREAD_WI_THREAD_LOCK_HELD | THREAD_WI_THREAD_CTX_SWITCH);
491 assert(kr == KERN_SUCCESS);
492 }
493
494 /*
495 * work_interval_auto_join_demote()
496 *
497 * Routine to un-join an auto-joined work interval when a thread is changing from
498 * realtime to non-realtime scheduling mode. This could happen due to multiple
499 * reasons such as RT failsafe, thread backgrounding or thread termination. Also,
500 * the thread being demoted may not be the current thread.
501 *
502 * Expects thread to be locked.
503 */
504 void
work_interval_auto_join_demote(thread_t thread)505 work_interval_auto_join_demote(thread_t thread)
506 {
507 __assert_only kern_return_t kr = thread_set_work_interval(thread, NULL,
508 THREAD_WI_AUTO_JOIN_POLICY | THREAD_WI_THREAD_LOCK_HELD);
509 assert(kr == KERN_SUCCESS);
510 }
511
512 static void
work_interval_deallocate_queue_invoke(mpsc_queue_chain_t e,__assert_only mpsc_daemon_queue_t dq)513 work_interval_deallocate_queue_invoke(mpsc_queue_chain_t e,
514 __assert_only mpsc_daemon_queue_t dq)
515 {
516 struct work_interval *work_interval = NULL;
517 work_interval = mpsc_queue_element(e, struct work_interval, wi_deallocate_link);
518 assert(dq == &work_interval_deallocate_queue);
519 assert(os_ref_get_count(&work_interval->wi_ref_count) == 0);
520 work_interval_deallocate(work_interval);
521 }
522
523 #endif /* CONFIG_SCHED_AUTO_JOIN */
524
#if CONFIG_SCHED_AUTO_JOIN
/*
 * work_interval_subsystem_init()
 *
 * Startup-time initialization of the deferred deallocation queue used by
 * work_interval_deferred_release().
 */
__startup_func
static void
work_interval_subsystem_init(void)
{
	/*
	 * The work interval deallocation queue must be a thread call based queue
	 * because it is woken up from contexts where the thread lock is held. The
	 * only way to perform wakeups safely in those contexts is to wakeup a
	 * thread call which is guaranteed to be on a different waitq and would
	 * not hash onto the same global waitq which might be currently locked.
	 */
	mpsc_daemon_queue_init_with_thread_call(&work_interval_deallocate_queue,
	    work_interval_deallocate_queue_invoke, THREAD_CALL_PRIORITY_KERNEL,
	    MPSC_DAEMON_INIT_NONE);
}
STARTUP(THREAD_CALL, STARTUP_RANK_MIDDLE, work_interval_subsystem_init);
#endif /* CONFIG_SCHED_AUTO_JOIN */
543
544 /*
545 * work_interval_port_convert
546 *
547 * Called with port locked, returns reference to work interval
548 * if indeed the port is a work interval kobject port
549 */
550 static struct work_interval *
work_interval_port_convert_locked(ipc_port_t port)551 work_interval_port_convert_locked(ipc_port_t port)
552 {
553 struct work_interval *work_interval = NULL;
554
555 if (IP_VALID(port)) {
556 work_interval = ipc_kobject_get_stable(port, IKOT_WORK_INTERVAL);
557 if (work_interval) {
558 work_interval_retain(work_interval);
559 }
560 }
561
562 return work_interval;
563 }
564
565 /*
566 * port_name_to_work_interval
567 *
568 * Description: Obtain a reference to the work_interval associated with a given port.
569 *
570 * Parameters: name A Mach port name to translate.
571 *
572 * Returns: NULL The given Mach port did not reference a work_interval.
573 * !NULL The work_interval that is associated with the Mach port.
574 */
575 static kern_return_t
port_name_to_work_interval(mach_port_name_t name,struct work_interval ** work_interval)576 port_name_to_work_interval(mach_port_name_t name,
577 struct work_interval **work_interval)
578 {
579 if (!MACH_PORT_VALID(name)) {
580 return KERN_INVALID_NAME;
581 }
582
583 ipc_port_t port = IP_NULL;
584 kern_return_t kr = KERN_SUCCESS;
585
586 kr = ipc_port_translate_send(current_space(), name, &port);
587 if (kr != KERN_SUCCESS) {
588 return kr;
589 }
590 /* port is locked */
591
592 assert(IP_VALID(port));
593
594 struct work_interval *converted_work_interval;
595
596 converted_work_interval = work_interval_port_convert_locked(port);
597
598 /* the port is valid, but doesn't denote a work_interval */
599 if (converted_work_interval == NULL) {
600 kr = KERN_INVALID_CAPABILITY;
601 }
602
603 ip_mq_unlock(port);
604
605 if (kr == KERN_SUCCESS) {
606 *work_interval = converted_work_interval;
607 }
608
609 return kr;
610 }
611
612
613 /*
614 * work_interval_port_no_senders
615 *
616 * Description: Handle a no-senders notification for a work interval port.
617 * Destroys the port and releases its reference on the work interval.
618 *
619 * Parameters: msg A Mach no-senders notification message.
620 *
621 * Note: This assumes that there is only one create-right-from-work-interval point,
622 * if the ability to extract another send right after creation is added,
623 * this will have to change to handle make-send counts correctly.
624 */
625 static void
work_interval_port_no_senders(ipc_port_t port,mach_port_mscount_t mscount)626 work_interval_port_no_senders(ipc_port_t port, mach_port_mscount_t mscount)
627 {
628 struct work_interval *work_interval = NULL;
629
630 work_interval = ipc_kobject_dealloc_port(port, mscount,
631 IKOT_WORK_INTERVAL);
632
633 work_interval->wi_port = MACH_PORT_NULL;
634
635 work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
636 }
637
638 /*
639 * work_interval_port_type()
640 *
641 * Converts a port name into the work interval object and returns its type.
642 *
643 * For invalid ports, it returns WORK_INTERVAL_TYPE_LAST (which is not a
644 * valid type for work intervals).
645 */
646 static uint32_t
work_interval_port_type(mach_port_name_t port_name)647 work_interval_port_type(mach_port_name_t port_name)
648 {
649 struct work_interval *work_interval = NULL;
650 kern_return_t kr;
651 uint32_t work_interval_type;
652
653 if (port_name == MACH_PORT_NULL) {
654 return WORK_INTERVAL_TYPE_LAST;
655 }
656
657 kr = port_name_to_work_interval(port_name, &work_interval);
658 if (kr != KERN_SUCCESS) {
659 return WORK_INTERVAL_TYPE_LAST;
660 }
661 /* work_interval has a +1 ref */
662
663 assert(work_interval != NULL);
664 work_interval_type = work_interval->wi_create_flags & WORK_INTERVAL_TYPE_MASK;
665 work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
666 return work_interval_type;
667 }
668
669 /*
670 * Sparse - not all work interval classes imply a scheduling policy change.
671 * The REALTIME_CRITICAL class *also* requires the thread to have explicitly
672 * adopted the REALTIME sched mode to take effect.
673 */
674 static const struct {
675 int priority;
676 sched_mode_t sched_mode;
677 } work_interval_class_data[WI_CLASS_COUNT] = {
678 [WI_CLASS_BEST_EFFORT] = {
679 BASEPRI_DEFAULT, // 31
680 TH_MODE_TIMESHARE,
681 },
682
683 [WI_CLASS_APP_SUPPORT] = {
684 BASEPRI_DEFAULT, // 31
685 TH_MODE_TIMESHARE,
686 },
687
688 [WI_CLASS_SYSTEM] = {
689 BASEPRI_FOREGROUND + 1, // 48
690 TH_MODE_FIXED,
691 },
692
693 [WI_CLASS_SYSTEM_CRITICAL] = {
694 MAXPRI_USER + 1, // 64
695 TH_MODE_FIXED,
696 },
697
698 [WI_CLASS_REALTIME_CRITICAL] = {
699 BASEPRI_RTQUEUES + 1, // 98
700 TH_MODE_REALTIME,
701 },
702 };
703
704 /*
705 * Called when a thread gets its scheduling priority from its associated work
706 * interval.
707 */
708 int
work_interval_get_priority(thread_t thread)709 work_interval_get_priority(thread_t thread)
710 {
711 const struct work_interval *work_interval = thread->th_work_interval;
712 assert(work_interval != NULL);
713
714 assert3u(work_interval->wi_class, !=, WI_CLASS_NONE);
715 assert3u(work_interval->wi_class, <, WI_CLASS_COUNT);
716 int priority = work_interval_class_data[work_interval->wi_class].priority;
717 assert(priority != 0);
718
719 priority += work_interval->wi_class_offset;
720 assert3u(priority, <=, MAXPRI);
721
722 return priority;
723 }
724
725 /*
726 * Switch to a policy driven by the work interval (if applicable).
727 */
728 static void
work_interval_set_policy(thread_t thread)729 work_interval_set_policy(thread_t thread)
730 {
731 assert3p(thread, ==, current_thread());
732
733 /*
734 * Ignore policy changes if the workload context shouldn't affect the
735 * scheduling policy.
736 */
737 workload_config_flags_t flags = WLC_F_NONE;
738
739 /* There may be no config at all. That's ok. */
740 if (workload_config_get_flags(&flags) != KERN_SUCCESS ||
741 (flags & WLC_F_THREAD_POLICY) == 0) {
742 return;
743 }
744
745 const struct work_interval *work_interval = thread->th_work_interval;
746 assert(work_interval != NULL);
747
748 assert3u(work_interval->wi_class, <, WI_CLASS_COUNT);
749 const sched_mode_t mode = work_interval_class_data[work_interval->wi_class].sched_mode;
750
751 /*
752 * A mode of TH_MODE_NONE implies that this work interval has no
753 * associated scheduler effects.
754 */
755 if (mode == TH_MODE_NONE) {
756 return;
757 }
758
759 proc_set_thread_policy_ext(thread, TASK_POLICY_ATTRIBUTE,
760 TASK_POLICY_WI_DRIVEN, true, mode);
761 assert(thread->requested_policy.thrp_wi_driven);
762
763 return;
764 }
765
766 /*
767 * Clear a work interval driven policy.
768 */
769 static void
work_interval_clear_policy(thread_t thread)770 work_interval_clear_policy(thread_t thread)
771 {
772 assert3p(thread, ==, current_thread());
773
774 if (!thread->requested_policy.thrp_wi_driven) {
775 return;
776 }
777
778 const sched_mode_t mode = sched_get_thread_mode_user(thread);
779
780 proc_set_thread_policy_ext(thread, TASK_POLICY_ATTRIBUTE,
781 TASK_POLICY_WI_DRIVEN, false,
782 mode == TH_MODE_REALTIME ? mode : TH_MODE_TIMESHARE);
783
784 assert(!thread->requested_policy.thrp_wi_driven);
785
786 return;
787 }
788
789 /*
790 * thread_set_work_interval()
791 *
792 * Change thread's bound work interval to the passed-in work interval
793 * Consumes +1 ref on work_interval upon success.
794 *
795 * May also pass NULL to un-set work_interval on the thread
796 * Will deallocate any old work interval on the thread
797 * Return error if thread does not satisfy requirements to join work interval
798 *
799 * For non auto-join work intervals, deallocate any old work interval on the thread
800 * For auto-join work intervals, the routine may wakeup the work interval deferred
801 * deallocation queue since thread locks might be currently held.
802 */
803 static kern_return_t
thread_set_work_interval(thread_t thread,struct work_interval * work_interval,thread_work_interval_options_t options)804 thread_set_work_interval(thread_t thread,
805 struct work_interval *work_interval, thread_work_interval_options_t options)
806 {
807 /* All explicit work interval operations should always be from the current thread */
808 if (options & THREAD_WI_EXPLICIT_JOIN_POLICY) {
809 assert(thread == current_thread());
810 }
811
812 /* All cases of needing the thread lock should be from explicit join scenarios */
813 if (options & THREAD_WI_THREAD_LOCK_NEEDED) {
814 assert((options & THREAD_WI_EXPLICIT_JOIN_POLICY) != 0);
815 }
816
817 /* For all cases of auto join must come in with the thread lock held */
818 if (options & THREAD_WI_AUTO_JOIN_POLICY) {
819 assert((options & THREAD_WI_THREAD_LOCK_HELD) != 0);
820 }
821
822 #if CONFIG_THREAD_GROUPS
823 if (work_interval && !work_interval->wi_group) {
824 /* Reject join on work intervals with deferred thread group creation */
825 return KERN_INVALID_ARGUMENT;
826 }
827 #endif /* CONFIG_THREAD_GROUPS */
828
829 if (work_interval) {
830 uint32_t work_interval_type = work_interval->wi_create_flags & WORK_INTERVAL_TYPE_MASK;
831
832 if (options & THREAD_WI_EXPLICIT_JOIN_POLICY) {
833 /* Ensure no kern_work_interval_set_workload_id can happen after this point */
834 uint32_t wlid_flags;
835 (void)os_atomic_cmpxchgv(&work_interval->wi_wlid_flags, 0,
836 WORK_INTERVAL_WORKLOAD_ID_ALREADY_JOINED, &wlid_flags, relaxed);
837 if (wlid_flags & WORK_INTERVAL_WORKLOAD_ID_RT_ALLOWED) {
838 /* For workload IDs with rt-allowed, neuter the check below to
839 * enable joining before the thread has become realtime for all
840 * work interval types */
841 work_interval_type = WORK_INTERVAL_TYPE_DEFAULT;
842 }
843 }
844
845 if ((work_interval_type == WORK_INTERVAL_TYPE_COREAUDIO) &&
846 (thread->sched_mode != TH_MODE_REALTIME) && (thread->saved_mode != TH_MODE_REALTIME)) {
847 return KERN_INVALID_ARGUMENT;
848 }
849 }
850
851 /*
852 * Ensure a work interval scheduling policy is not used if the thread is
853 * leaving the work interval.
854 */
855 if (work_interval == NULL &&
856 (options & THREAD_WI_EXPLICIT_JOIN_POLICY) != 0) {
857 work_interval_clear_policy(thread);
858 }
859
860 struct work_interval *old_th_wi = thread->th_work_interval;
861 #if CONFIG_SCHED_AUTO_JOIN
862 spl_t s;
863 /* Take the thread lock if needed */
864 if (options & THREAD_WI_THREAD_LOCK_NEEDED) {
865 s = splsched();
866 thread_lock(thread);
867 }
868
869 /*
870 * Work interval auto-join leak to non-RT threads.
871 *
872 * If thread might be running on a remote core and it's not in the context switch path (where
873 * thread is neither running, blocked or in the runq), its not possible to update the
874 * work interval & thread group remotely since its not possible to update CLPC for a remote
875 * core. This situation might happen when a thread is transitioning from realtime to
876 * non-realtime due to backgrounding etc., which would mean that non-RT threads would now
877 * be part of the work interval.
878 *
879 * Since there is no immediate mitigation to this issue, the policy is to set a new
880 * flag on the thread which indicates that such a "leak" has happened. This flag will
881 * be cleared when the remote thread eventually blocks and unjoins from the work interval.
882 */
883 bool thread_on_remote_core = ((thread != current_thread()) && (thread->state & TH_RUN) && (thread->runq == PROCESSOR_NULL));
884
885 if (thread_on_remote_core && ((options & THREAD_WI_THREAD_CTX_SWITCH) == 0)) {
886 assert((options & THREAD_WI_THREAD_LOCK_NEEDED) == 0);
887 os_atomic_or(&thread->th_work_interval_flags, TH_WORK_INTERVAL_FLAGS_AUTO_JOIN_LEAK, relaxed);
888 return KERN_SUCCESS;
889 }
890
891 const bool old_wi_auto_joined = ((thread->sched_flags & TH_SFLAG_THREAD_GROUP_AUTO_JOIN) != 0);
892
893 if ((options & THREAD_WI_AUTO_JOIN_POLICY) || old_wi_auto_joined) {
894 __kdebug_only uint64_t old_tg_id = (old_th_wi && old_th_wi->wi_group) ? thread_group_get_id(old_th_wi->wi_group) : ~0;
895 __kdebug_only uint64_t new_tg_id = (work_interval && work_interval->wi_group) ? thread_group_get_id(work_interval->wi_group) : ~0;
896 KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_WI_AUTO_JOIN),
897 thread_tid(thread), old_tg_id, new_tg_id, options);
898 }
899
900 if (old_wi_auto_joined) {
901 /*
902 * If thread was auto-joined to a work interval and is not realtime, make sure it
903 * happened due to the "leak" described above.
904 */
905 if (thread->sched_mode != TH_MODE_REALTIME) {
906 assert((thread->th_work_interval_flags & TH_WORK_INTERVAL_FLAGS_AUTO_JOIN_LEAK) != 0);
907 }
908
909 os_atomic_andnot(&thread->th_work_interval_flags, TH_WORK_INTERVAL_FLAGS_AUTO_JOIN_LEAK, relaxed);
910 work_interval_auto_join_decrement(old_th_wi, thread);
911 thread->sched_flags &= ~TH_SFLAG_THREAD_GROUP_AUTO_JOIN;
912 }
913
914 #endif /* CONFIG_SCHED_AUTO_JOIN */
915
916 KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_WORKGROUP, WORKGROUP_INTERVAL_CHANGE),
917 thread_tid(thread), (old_th_wi ? old_th_wi->wi_id : 0), (work_interval ? work_interval->wi_id : 0), !!(options & THREAD_WI_AUTO_JOIN_POLICY));
918
919 /* transfer +1 ref to thread */
920 thread->th_work_interval = work_interval;
921
922 #if CONFIG_SCHED_AUTO_JOIN
923
924 if ((options & THREAD_WI_AUTO_JOIN_POLICY) && work_interval) {
925 assert(work_interval_auto_join_enabled(work_interval) == true);
926 thread->sched_flags |= TH_SFLAG_THREAD_GROUP_AUTO_JOIN;
927 }
928
929 if (options & THREAD_WI_THREAD_LOCK_NEEDED) {
930 thread_unlock(thread);
931 splx(s);
932 }
933 #endif /* CONFIG_SCHED_AUTO_JOIN */
934
935 /*
936 * The thread got a new work interval. It may come with a work interval
937 * scheduling policy that needs to be applied.
938 */
939 if (work_interval != NULL &&
940 (options & THREAD_WI_EXPLICIT_JOIN_POLICY) != 0) {
941 work_interval_set_policy(thread);
942 }
943
944 #if CONFIG_THREAD_GROUPS
945 if (work_interval) {
946 /* Prevent thread_group_set_name after CLPC may have already heard
947 * about the thread group */
948 (void)os_atomic_cmpxchg(&work_interval->wi_group_flags, 0,
949 WORK_INTERVAL_GROUP_FLAGS_THREAD_JOINED, relaxed);
950 }
951 struct thread_group *new_tg = (work_interval) ? (work_interval->wi_group) : NULL;
952
953 if (options & THREAD_WI_AUTO_JOIN_POLICY) {
954 #if CONFIG_SCHED_AUTO_JOIN
955 thread_set_autojoin_thread_group_locked(thread, new_tg);
956 #endif
957 } else {
958 thread_set_work_interval_thread_group(thread, new_tg);
959 }
960 #endif /* CONFIG_THREAD_GROUPS */
961
962 if (options & THREAD_WI_EXPLICIT_JOIN_POLICY) {
963 /* Construct mask to XOR with th_work_interval_flags to clear the
964 * currently present flags and set the new flags in wlid_flags. */
965 uint32_t wlid_flags = 0;
966 if (work_interval) {
967 wlid_flags = os_atomic_load(&work_interval->wi_wlid_flags, relaxed);
968 }
969 thread_work_interval_flags_t th_wi_xor_mask = os_atomic_load(
970 &thread->th_work_interval_flags, relaxed);
971 th_wi_xor_mask &= (TH_WORK_INTERVAL_FLAGS_HAS_WORKLOAD_ID |
972 TH_WORK_INTERVAL_FLAGS_RT_ALLOWED);
973 if (wlid_flags & WORK_INTERVAL_WORKLOAD_ID_HAS_ID) {
974 th_wi_xor_mask ^= TH_WORK_INTERVAL_FLAGS_HAS_WORKLOAD_ID;
975 if (wlid_flags & WORK_INTERVAL_WORKLOAD_ID_RT_ALLOWED) {
976 th_wi_xor_mask ^= TH_WORK_INTERVAL_FLAGS_RT_ALLOWED;
977 }
978 }
979 if (th_wi_xor_mask) {
980 os_atomic_xor(&thread->th_work_interval_flags, th_wi_xor_mask, relaxed);
981 }
982
983 /*
984 * Now that the interval flags have been set, re-evaluate
985 * whether the thread needs to be undemoted - the new work
986 * interval may have the RT_ALLOWED flag. and the thread may
987 * have have a realtime policy but be demoted.
988 */
989 thread_rt_evaluate(thread);
990 }
991
992 if (old_th_wi != NULL) {
993 work_interval_release(old_th_wi, options);
994 }
995
996 return KERN_SUCCESS;
997 }
998
999 static kern_return_t
thread_set_work_interval_explicit_join(thread_t thread,struct work_interval * work_interval)1000 thread_set_work_interval_explicit_join(thread_t thread, struct work_interval *work_interval)
1001 {
1002 assert(thread == current_thread());
1003 return thread_set_work_interval(thread, work_interval, THREAD_WI_EXPLICIT_JOIN_POLICY | THREAD_WI_THREAD_LOCK_NEEDED);
1004 }
1005
1006 kern_return_t
work_interval_thread_terminate(thread_t thread)1007 work_interval_thread_terminate(thread_t thread)
1008 {
1009 assert(thread == current_thread());
1010 if (thread->th_work_interval != NULL) {
1011 return thread_set_work_interval(thread, NULL, THREAD_WI_EXPLICIT_JOIN_POLICY | THREAD_WI_THREAD_LOCK_NEEDED);
1012 }
1013 return KERN_SUCCESS;
1014 }
1015
/*
 * kern_work_interval_notify()
 *
 * Deliver a work interval notification from the current thread against the
 * work interval it has adopted.  Validates that the caller has joined the
 * interval identified by kwi_args->work_interval_id and belongs to the task
 * that created it, samples the thread's scheduling urgency into kwi_args,
 * and hands the arguments to the platform layer.
 *
 * Returns KERN_INVALID_ARGUMENT if the thread has not adopted the named
 * interval or the calling task is not the interval's creator.
 */
kern_return_t
kern_work_interval_notify(thread_t thread, struct kern_work_interval_args* kwi_args)
{
	assert(thread == current_thread());
	assert(kwi_args->work_interval_id != 0);

	struct work_interval *work_interval = thread->th_work_interval;

	if (work_interval == NULL ||
	    work_interval->wi_id != kwi_args->work_interval_id) {
		/* This thread must have adopted the work interval to be able to notify */
		return KERN_INVALID_ARGUMENT;
	}

	task_t notifying_task = current_task();

	if (work_interval->wi_creator_uniqueid != get_task_uniqueid(notifying_task) ||
	    work_interval->wi_creator_pidversion != get_task_version(notifying_task)) {
		/* Only the creating task can do a notify */
		return KERN_INVALID_ARGUMENT;
	}

	/* Interrupts disabled while sampling urgency so the thread state is stable */
	spl_t s = splsched();

#if CONFIG_THREAD_GROUPS
	/* A joined thread should be running in the interval's thread group */
	assert(work_interval->wi_group == thread->thread_group);
#endif /* CONFIG_THREAD_GROUPS */

	uint64_t urgency_param1, urgency_param2;
	kwi_args->urgency = (uint16_t)thread_get_urgency(thread, &urgency_param1, &urgency_param2);

	splx(s);

	/* called without interrupts disabled */
	machine_work_interval_notify(thread, kwi_args);

	return KERN_SUCCESS;
}
1054
/*
 * Monotonic generator for work interval IDs, bumped atomically on each
 * kern_work_interval_create().  Start at 1; 0 is not a valid work interval ID.
 */
static _Atomic uint64_t unique_work_interval_id = 1;
1057
1058 kern_return_t
kern_work_interval_create(thread_t thread,struct kern_work_interval_create_args * create_params)1059 kern_work_interval_create(thread_t thread,
1060 struct kern_work_interval_create_args *create_params)
1061 {
1062 assert(thread == current_thread());
1063
1064 uint32_t create_flags = create_params->wica_create_flags;
1065
1066 if (((create_flags & WORK_INTERVAL_FLAG_JOINABLE) == 0) &&
1067 thread->th_work_interval != NULL) {
1068 /*
1069 * If the thread is doing a legacy combined create and join,
1070 * it shouldn't already be part of a work interval.
1071 *
1072 * (Creating a joinable WI is allowed anytime.)
1073 */
1074 return KERN_FAILURE;
1075 }
1076
1077 /*
1078 * Check the validity of the create flags before allocating the work
1079 * interval.
1080 */
1081 task_t creating_task = current_task();
1082 if ((create_flags & WORK_INTERVAL_TYPE_MASK) == WORK_INTERVAL_TYPE_CA_CLIENT) {
1083 /*
1084 * CA_CLIENT work intervals do not create new thread groups.
1085 * There can only be one CA_CLIENT work interval (created by UIKit or AppKit)
1086 * per each application task
1087 */
1088 if (create_flags & WORK_INTERVAL_FLAG_GROUP) {
1089 return KERN_FAILURE;
1090 }
1091 if (!task_is_app(creating_task)) {
1092 #if XNU_TARGET_OS_OSX
1093 /*
1094 * Soft-fail the case of a non-app pretending to be an
1095 * app, by allowing it to press the buttons, but they're
1096 * not actually connected to anything.
1097 */
1098 create_flags |= WORK_INTERVAL_FLAG_IGNORED;
1099 #else
1100 /*
1101 * On iOS, it's a hard failure to get your apptype
1102 * wrong and then try to render something.
1103 */
1104 return KERN_NOT_SUPPORTED;
1105 #endif /* XNU_TARGET_OS_OSX */
1106 }
1107 if (task_set_ca_client_wi(creating_task, true) == false) {
1108 return KERN_FAILURE;
1109 }
1110 }
1111
1112 #if CONFIG_SCHED_AUTO_JOIN
1113 if (create_flags & WORK_INTERVAL_FLAG_ENABLE_AUTO_JOIN) {
1114 uint32_t type = (create_flags & WORK_INTERVAL_TYPE_MASK);
1115 if (type != WORK_INTERVAL_TYPE_COREAUDIO) {
1116 return KERN_NOT_SUPPORTED;
1117 }
1118 if ((create_flags & WORK_INTERVAL_FLAG_GROUP) == 0) {
1119 return KERN_NOT_SUPPORTED;
1120 }
1121 }
1122
1123 if (create_flags & WORK_INTERVAL_FLAG_ENABLE_DEFERRED_FINISH) {
1124 if ((create_flags & WORK_INTERVAL_FLAG_ENABLE_AUTO_JOIN) == 0) {
1125 return KERN_NOT_SUPPORTED;
1126 }
1127 }
1128 #endif /* CONFIG_SCHED_AUTO_JOIN */
1129
1130 struct work_interval *work_interval = kalloc_type(struct work_interval,
1131 Z_WAITOK | Z_ZERO | Z_NOFAIL);
1132
1133 uint64_t work_interval_id = os_atomic_inc(&unique_work_interval_id, relaxed);
1134
1135 *work_interval = (struct work_interval) {
1136 .wi_id = work_interval_id,
1137 .wi_ref_count = {},
1138 .wi_create_flags = create_flags,
1139 .wi_creator_pid = pid_from_task(creating_task),
1140 .wi_creator_uniqueid = get_task_uniqueid(creating_task),
1141 .wi_creator_pidversion = get_task_version(creating_task),
1142 };
1143 os_ref_init(&work_interval->wi_ref_count, NULL);
1144
1145 if (work_interval_telemetry_data_enabled(work_interval)) {
1146 recount_work_interval_init(&work_interval->wi_recount);
1147 }
1148
1149 __kdebug_only uint64_t tg_id = 0;
1150 #if CONFIG_THREAD_GROUPS
1151 struct thread_group *tg;
1152 if ((create_flags &
1153 (WORK_INTERVAL_FLAG_GROUP | WORK_INTERVAL_FLAG_HAS_WORKLOAD_ID)) ==
1154 (WORK_INTERVAL_FLAG_GROUP | WORK_INTERVAL_FLAG_HAS_WORKLOAD_ID)) {
1155 /* defer creation of the thread group until the
1156 * kern_work_interval_set_workload_id() call */
1157 work_interval->wi_group = NULL;
1158 } else if (create_flags & WORK_INTERVAL_FLAG_GROUP) {
1159 /* create a new group for the interval to represent */
1160 char name[THREAD_GROUP_MAXNAME] = "";
1161
1162 snprintf(name, sizeof(name), "WI%lld (pid %d)", work_interval_id,
1163 work_interval->wi_creator_pid);
1164
1165 tg = thread_group_create_and_retain(THREAD_GROUP_FLAGS_DEFAULT);
1166
1167 thread_group_set_name(tg, name);
1168
1169 work_interval->wi_group = tg;
1170 } else {
1171 /* the interval represents the thread's home group */
1172 tg = thread_group_get_home_group(thread);
1173
1174 thread_group_retain(tg);
1175
1176 work_interval->wi_group = tg;
1177 }
1178
1179 /* Capture the tg_id for tracing purposes */
1180 tg_id = work_interval->wi_group ? thread_group_get_id(work_interval->wi_group) : ~0;
1181
1182 #endif /* CONFIG_THREAD_GROUPS */
1183
1184 if (create_flags & WORK_INTERVAL_FLAG_JOINABLE) {
1185 mach_port_name_t name = MACH_PORT_NULL;
1186
1187 /* work_interval has a +1 ref, moves to the port */
1188 work_interval->wi_port = ipc_kobject_alloc_port(
1189 (ipc_kobject_t)work_interval, IKOT_WORK_INTERVAL,
1190 IPC_KOBJECT_ALLOC_MAKE_SEND | IPC_KOBJECT_ALLOC_NSREQUEST);
1191
1192
1193 name = ipc_port_copyout_send(work_interval->wi_port, current_space());
1194
1195 if (!MACH_PORT_VALID(name)) {
1196 /*
1197 * copyout failed (port is already deallocated)
1198 * Because of the port-destroyed magic,
1199 * the work interval is already deallocated too.
1200 */
1201 return KERN_RESOURCE_SHORTAGE;
1202 }
1203
1204 create_params->wica_port = name;
1205 } else {
1206 /* work_interval has a +1 ref, moves to the thread */
1207 kern_return_t kr = thread_set_work_interval_explicit_join(thread, work_interval);
1208 if (kr != KERN_SUCCESS) {
1209 /* No other thread can join this work interval since it isn't
1210 * JOINABLE so release the reference on work interval */
1211 work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
1212 return kr;
1213 }
1214
1215 create_params->wica_port = MACH_PORT_NULL;
1216 }
1217
1218 create_params->wica_id = work_interval_id;
1219
1220 if (tg_id != ~0) {
1221 KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_WORKGROUP, WORKGROUP_INTERVAL_CREATE),
1222 work_interval_id, create_flags, pid_from_task(creating_task), tg_id);
1223 }
1224 return KERN_SUCCESS;
1225 }
1226
1227 kern_return_t
kern_work_interval_get_flags_from_port(mach_port_name_t port_name,uint32_t * flags)1228 kern_work_interval_get_flags_from_port(mach_port_name_t port_name, uint32_t *flags)
1229 {
1230 assert(flags != NULL);
1231
1232 kern_return_t kr;
1233 struct work_interval *work_interval;
1234
1235 kr = port_name_to_work_interval(port_name, &work_interval);
1236 if (kr != KERN_SUCCESS) {
1237 return kr;
1238 }
1239
1240 assert(work_interval != NULL);
1241 *flags = work_interval->wi_create_flags;
1242
1243 work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
1244
1245 return KERN_SUCCESS;
1246 }
1247
#if CONFIG_THREAD_GROUPS
/*
 * Work interval names are formatted into thread group names (see
 * kern_work_interval_set_name), so the two size limits must agree.
 */
_Static_assert(WORK_INTERVAL_NAME_MAX == THREAD_GROUP_MAXNAME,
    "WORK_INTERVAL_NAME_MAX does not match THREAD_GROUP_MAXNAME");
#endif /* CONFIG_THREAD_GROUPS */
1252
/*
 * kern_work_interval_set_name()
 *
 * Rename the thread group backing the work interval referenced by port_name.
 * Only allowed before any thread has joined the interval, and only for
 * intervals that own their group (WORK_INTERVAL_FLAG_GROUP).  `name` is
 * unused when CONFIG_THREAD_GROUPS is not configured, hence __unused.
 *
 * Returns KERN_INVALID_ARGUMENT if the name is too long, a thread has
 * already joined, or the interval has no thread group yet.
 */
kern_return_t
kern_work_interval_set_name(mach_port_name_t port_name, __unused char *name,
    size_t len)
{
	kern_return_t kr;
	struct work_interval *work_interval;

	if (len > WORK_INTERVAL_NAME_MAX) {
		return KERN_INVALID_ARGUMENT;
	}
	kr = port_name_to_work_interval(port_name, &work_interval);
	if (kr != KERN_SUCCESS) {
		return kr;
	}

	assert(work_interval != NULL);

#if CONFIG_THREAD_GROUPS
	uint32_t wi_group_flags = os_atomic_load(
		&work_interval->wi_group_flags, relaxed);
	if (wi_group_flags & WORK_INTERVAL_GROUP_FLAGS_THREAD_JOINED) {
		/* Too late: CLPC may already have heard about this group's name */
		kr = KERN_INVALID_ARGUMENT;
		goto out;
	}
	if (!work_interval->wi_group) {
		/* Thread group creation was deferred (workload-id interval) */
		kr = KERN_INVALID_ARGUMENT;
		goto out;
	}

	if (name[0] && (work_interval->wi_create_flags & WORK_INTERVAL_FLAG_GROUP)) {
		char tgname[THREAD_GROUP_MAXNAME];
		snprintf(tgname, sizeof(tgname), "WI%lld %s", work_interval->wi_id,
		    name);
		thread_group_set_name(work_interval->wi_group, tgname);
	}

out:
#endif /* CONFIG_THREAD_GROUPS */
	/* Drop the lookup reference taken by port_name_to_work_interval() */
	work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);

	return kr;
}
1295
/*
 * kern_work_interval_set_workload_id()
 *
 * Associate a workload ID name with a joinable work interval that was created
 * with WORK_INTERVAL_FLAG_HAS_WORKLOAD_ID.  If the name is found in the
 * workload config table, the flags, scheduling class, and thread group flags
 * come from the config; otherwise the caller's create flags are validated
 * against the interval's.  The first successful call publishes the flags via
 * a cmpxchg on wi_wlid_flags and performs the deferred thread group creation
 * from kern_work_interval_create().  A later call behaves as a query for the
 * previously-set flags (and fails if a thread has already joined).
 *
 * Returns KERN_INVALID_ARGUMENT for an empty name, a non-joinable or
 * non-workload-id interval, mismatched types/flags, or a too-late call.
 */
kern_return_t
kern_work_interval_set_workload_id(mach_port_name_t port_name,
    struct kern_work_interval_workload_id_args *workload_id_args)
{
	kern_return_t kr;
	struct work_interval *work_interval;
	uint32_t wlida_flags = 0;
	uint32_t wlid_flags = 0;
#if CONFIG_THREAD_GROUPS
	uint32_t tg_flags = 0;
#endif
	bool from_workload_config = false;

	/* Ensure workload ID name is non-empty. */
	if (!workload_id_args->wlida_name[0]) {
		return KERN_INVALID_ARGUMENT;
	}

	kr = port_name_to_work_interval(port_name, &work_interval);
	if (kr != KERN_SUCCESS) {
		return kr;
	}

	assert(work_interval != NULL);
	if (!(work_interval->wi_create_flags & WORK_INTERVAL_FLAG_JOINABLE)) {
		/* Workload IDs only make sense on joinable (ported) intervals */
		kr = KERN_INVALID_ARGUMENT;
		goto out;
	}

	if (!(work_interval->wi_create_flags & WORK_INTERVAL_FLAG_HAS_WORKLOAD_ID)) {
		/* Reject work intervals that didn't indicate they will have a workload ID
		 * at creation. In particular if the work interval has its own thread group,
		 * its creation must have been deferred in kern_work_interval_create */
		kr = KERN_INVALID_ARGUMENT;
		goto out;
	}

	workload_config_t wl_config = {};
	kr = workload_config_lookup_default(workload_id_args->wlida_name, &wl_config);
	if (kr == KERN_SUCCESS) {
		/* Named workload found: its config must agree with the interval type */
		if ((wl_config.wc_create_flags & WORK_INTERVAL_TYPE_MASK) !=
		    (work_interval->wi_create_flags & WORK_INTERVAL_TYPE_MASK)) {
			if ((wl_config.wc_create_flags & WORK_INTERVAL_TYPE_MASK) == WORK_INTERVAL_TYPE_CA_RENDER_SERVER &&
			    (work_interval->wi_create_flags & WORK_INTERVAL_TYPE_MASK) == WORK_INTERVAL_TYPE_FRAME_COMPOSITOR) {
				/* WORK_INTERVAL_TYPE_FRAME_COMPOSITOR is a valid related type of WORK_INTERVAL_TYPE_CA_RENDER_SERVER */
			} else {
				kr = KERN_INVALID_ARGUMENT;
				goto out;
			}
		}

		wlida_flags = wl_config.wc_flags;

		/* RT_CRITICAL is always stripped from config-derived flags here */
		wlida_flags &= ~WORK_INTERVAL_WORKLOAD_ID_RT_CRITICAL;

#if CONFIG_THREAD_GROUPS
		tg_flags = wl_config.wc_thread_group_flags;
		if (tg_flags != THREAD_GROUP_FLAGS_ABSENT &&
		    (work_interval->wi_create_flags & WORK_INTERVAL_FLAG_GROUP) == 0) {
			/* Config asks for TG flags but the interval has no own group */
			kr = KERN_INVALID_ARGUMENT;
			goto out;
		}
#endif /* CONFIG_THREAD_GROUPS */

		from_workload_config = true;
	} else {
		/* If the workload is not present in the table, perform basic validation
		 * that the create flags passed in match the ones used at work interval
		 * create time */
		if ((workload_id_args->wlida_wicreate_flags & WORK_INTERVAL_TYPE_MASK) !=
		    (work_interval->wi_create_flags & WORK_INTERVAL_TYPE_MASK)) {
			kr = KERN_INVALID_ARGUMENT;
			goto out;
		}

		const bool wc_avail = workload_config_available();
		if (!wc_avail) {
			/* No config loaded at all: legacy behavior, allow realtime */
			wlida_flags = WORK_INTERVAL_WORKLOAD_ID_RT_ALLOWED;
		}

		/*
		 * If the workload config wasn't even loaded then fallback to
		 * older behaviour where the new thread group gets the default
		 * thread group flags (when WORK_INTERVAL_FLAG_GROUP is set).
		 */
#if CONFIG_THREAD_GROUPS
		if (!wc_avail) {
			tg_flags = THREAD_GROUP_FLAGS_DEFAULT;
		} else {
			/* Config loaded but no entry: inherit the home group's flags */
			struct thread_group *home_group =
			    thread_group_get_home_group(current_thread());
			if (home_group != NULL) {
				tg_flags = thread_group_get_flags(home_group);
			}
		}
#endif /* CONFIG_THREAD_GROUPS */
	}

	/* Report back the interval's actual create flags */
	workload_id_args->wlida_wicreate_flags = work_interval->wi_create_flags;

	/* cmpxchg a non-zero workload ID flags value (indicating that workload ID
	 * has been set). */
	wlida_flags |= WORK_INTERVAL_WORKLOAD_ID_HAS_ID;
	if (os_atomic_cmpxchgv(&work_interval->wi_wlid_flags, 0, wlida_flags,
	    &wlid_flags, relaxed)) {
		/* We won the race: this call publishes the workload ID */
		if (from_workload_config) {
			work_interval->wi_class = wl_config.wc_class;
			work_interval->wi_class_offset = wl_config.wc_class_offset;
		}
#if CONFIG_THREAD_GROUPS
		if (work_interval->wi_create_flags & WORK_INTERVAL_FLAG_GROUP) {
			/* Perform deferred thread group creation, now that tgflags are known */
			struct thread_group *tg;
			tg = thread_group_create_and_retain(tg_flags == THREAD_GROUP_FLAGS_ABSENT ?
			    THREAD_GROUP_FLAGS_DEFAULT : tg_flags);

			char tgname[THREAD_GROUP_MAXNAME] = "";
			snprintf(tgname, sizeof(tgname), "WI%lld %s", work_interval->wi_id,
			    workload_id_args->wlida_name);
			thread_group_set_name(tg, tgname);

			assert(work_interval->wi_group == NULL);
			work_interval->wi_group = tg;
			KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_WORKGROUP, WORKGROUP_INTERVAL_CREATE),
			    work_interval->wi_id, work_interval->wi_create_flags,
			    work_interval->wi_creator_pid, thread_group_get_id(tg));
		}
#endif /* CONFIG_THREAD_GROUPS */
	} else {
		/* Workload ID has previously been set (or a thread has already joined). */
		if (wlid_flags & WORK_INTERVAL_WORKLOAD_ID_ALREADY_JOINED) {
			kr = KERN_INVALID_ARGUMENT;
			goto out;
		}
		/* Treat this request as a query for the out parameters of the ID */
		workload_id_args->wlida_flags = wlid_flags;
	}

	/*
	 * Emit tracepoints for successfully setting the workload ID.
	 *
	 * After rdar://89342390 has been fixed and a new work interval ktrace
	 * provider has been added, it will be possible to associate a numeric
	 * ID with an ID name. Thus, for those cases where the ID name has been
	 * looked up successfully (`from_workload_config` is true) it will no
	 * longer be necessary to emit a tracepoint with the full ID name.
	 */
	KDBG(MACHDBG_CODE(DBG_MACH_WORKGROUP, WORKGROUP_INTERVAL_SET_WORKLOAD_ID),
	    work_interval->wi_id, from_workload_config);
	kernel_debug_string_simple(
		MACHDBG_CODE(DBG_MACH_WORKGROUP, WORKGROUP_INTERVAL_SET_WORKLOAD_ID_NAME),
		workload_id_args->wlida_name);

	kr = KERN_SUCCESS;

out:
	/* Drop the lookup reference taken by port_name_to_work_interval() */
	work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);

	return kr;
}
1456
1457
1458 kern_return_t
kern_work_interval_destroy(thread_t thread,uint64_t work_interval_id)1459 kern_work_interval_destroy(thread_t thread, uint64_t work_interval_id)
1460 {
1461 if (work_interval_id == 0) {
1462 return KERN_INVALID_ARGUMENT;
1463 }
1464
1465 if (thread->th_work_interval == NULL ||
1466 thread->th_work_interval->wi_id != work_interval_id) {
1467 /* work ID isn't valid or doesn't match joined work interval ID */
1468 return KERN_INVALID_ARGUMENT;
1469 }
1470
1471 return thread_set_work_interval_explicit_join(thread, NULL);
1472 }
1473
1474 kern_return_t
kern_work_interval_join(thread_t thread,mach_port_name_t port_name)1475 kern_work_interval_join(thread_t thread,
1476 mach_port_name_t port_name)
1477 {
1478 struct work_interval *work_interval = NULL;
1479 kern_return_t kr;
1480
1481 if (port_name == MACH_PORT_NULL) {
1482 /* 'Un-join' the current work interval */
1483 return thread_set_work_interval_explicit_join(thread, NULL);
1484 }
1485
1486 kr = port_name_to_work_interval(port_name, &work_interval);
1487 if (kr != KERN_SUCCESS) {
1488 return kr;
1489 }
1490 /* work_interval has a +1 ref */
1491
1492 assert(work_interval != NULL);
1493
1494 kr = thread_set_work_interval_explicit_join(thread, work_interval);
1495 /* ref was consumed by passing it to the thread in the successful case */
1496 if (kr != KERN_SUCCESS) {
1497 work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
1498 }
1499 return kr;
1500 }
1501
1502 /*
1503 * work_interval_port_type_render_server()
1504 *
1505 * Helper routine to determine if the port points to a
1506 * WORK_INTERVAL_TYPE_CA_RENDER_SERVER work interval.
1507 */
1508 bool
work_interval_port_type_render_server(mach_port_name_t port_name)1509 work_interval_port_type_render_server(mach_port_name_t port_name)
1510 {
1511 return work_interval_port_type(port_name) == WORK_INTERVAL_TYPE_CA_RENDER_SERVER;
1512 }
1513