1 /*
2 * Copyright (c) 2017 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29
30 #include <sys/work_interval.h>
31
32 #include <kern/work_interval.h>
33
34 #include <kern/thread.h>
35 #include <kern/sched_prim.h>
36 #include <kern/machine.h>
37 #include <kern/thread_group.h>
38 #include <kern/ipc_kobject.h>
39 #include <kern/task.h>
40 #include <kern/coalition.h>
41 #include <kern/policy_internal.h>
42 #include <kern/mpsc_queue.h>
43 #include <kern/workload_config.h>
44 #include <kern/assert.h>
45
46 #include <mach/kern_return.h>
47 #include <mach/notify.h>
48 #include <os/refcnt.h>
49
50 /*
51 * With the introduction of auto-join work intervals, it is possible
52 * to change the work interval (and related thread group) of a thread in a
53 * variety of contexts (thread termination, context switch, thread mode
54 * change etc.). In order to clearly specify the policy expectation and
55 * the locking behavior, all calls to thread_set_work_interval() pass
56 * in a set of flags.
57 */
58
59 __options_decl(thread_work_interval_options_t, uint32_t, {
60 /* Change the work interval using the explicit join rules */
61 THREAD_WI_EXPLICIT_JOIN_POLICY = 0x1,
62 /* Change the work interval using the auto-join rules */
63 THREAD_WI_AUTO_JOIN_POLICY = 0x2,
64 /* Caller already holds the thread lock */
65 THREAD_WI_THREAD_LOCK_HELD = 0x4,
66 /* Caller does not hold the thread lock */
67 THREAD_WI_THREAD_LOCK_NEEDED = 0x8,
68 /* Change the work interval from the context switch path (thread may not be running or on a runq) */
69 THREAD_WI_THREAD_CTX_SWITCH = 0x10,
70 });
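/*
 * Illustrative examples (not compiled) of the flag combinations used by
 * callers later in this file:
 *
 *     // Explicit join/unjoin on behalf of the current thread; the thread
 *     // lock is taken inside thread_set_work_interval():
 *     thread_set_work_interval(thread, work_interval,
 *         THREAD_WI_EXPLICIT_JOIN_POLICY | THREAD_WI_THREAD_LOCK_NEEDED);
 *
 *     // Auto-join propagation from the context switch path; the caller
 *     // already holds the thread lock:
 *     thread_set_work_interval(thread, work_interval,
 *         THREAD_WI_AUTO_JOIN_POLICY | THREAD_WI_THREAD_LOCK_HELD |
 *         THREAD_WI_THREAD_CTX_SWITCH);
 */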
71
72 static kern_return_t thread_set_work_interval(thread_t, struct work_interval *, thread_work_interval_options_t);
73 static void work_interval_port_no_senders(ipc_port_t, mach_port_mscount_t);
74
75 IPC_KOBJECT_DEFINE(IKOT_WORK_INTERVAL,
76 .iko_op_movable_send = true,
77 .iko_op_stable = true,
78 .iko_op_no_senders = work_interval_port_no_senders);
79
80 #if CONFIG_SCHED_AUTO_JOIN
81 /* MPSC queue used to defer deallocate work intervals */
82 static struct mpsc_daemon_queue work_interval_deallocate_queue;
83
84 static void work_interval_deferred_release(struct work_interval *);
85
86 /*
87 * Work Interval Auto-Join Status
88 *
89 * work_interval_auto_join_status_t represents the state of auto-join for a given work interval.
90 * It packs the following information:
91 * - A bit representing if a "finish" is deferred on the work interval
92 * - Count of number of threads auto-joined to the work interval
93 */
94 #define WORK_INTERVAL_STATUS_DEFERRED_FINISH_MASK ((uint32_t)(1 << 31))
95 #define WORK_INTERVAL_STATUS_AUTO_JOIN_COUNT_MASK ((uint32_t)(WORK_INTERVAL_STATUS_DEFERRED_FINISH_MASK - 1))
96 #define WORK_INTERVAL_STATUS_AUTO_JOIN_COUNT_MAX WORK_INTERVAL_STATUS_AUTO_JOIN_COUNT_MASK
97 typedef uint32_t work_interval_auto_join_status_t;
98
99 static inline bool __unused
work_interval_status_deferred_finish(work_interval_auto_join_status_t status)
101 {
102 return (status & WORK_INTERVAL_STATUS_DEFERRED_FINISH_MASK) ? true : false;
103 }
104
105 static inline uint32_t __unused
work_interval_status_auto_join_count(work_interval_auto_join_status_t status)
107 {
108 return (uint32_t)(status & WORK_INTERVAL_STATUS_AUTO_JOIN_COUNT_MASK);
109 }
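/*
 * Worked example (illustrative): a status value of
 * (WORK_INTERVAL_STATUS_DEFERRED_FINISH_MASK | 3), i.e. 0x80000003, decodes as
 * work_interval_status_deferred_finish(status) == true and
 * work_interval_status_auto_join_count(status) == 3, meaning three threads are
 * currently auto-joined and a "finish" stays deferred until the last of them
 * unjoins.
 */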
110
111 /*
112 * struct work_interval_deferred_finish_state
113 *
114 * Contains the parameters of the finish operation which is being deferred.
115 */
116 struct work_interval_deferred_finish_state {
117 uint64_t instance_id;
118 uint64_t start;
119 uint64_t deadline;
120 uint64_t complexity;
121 };
122
123 struct work_interval_auto_join_info {
124 struct work_interval_deferred_finish_state deferred_finish_state;
125 work_interval_auto_join_status_t _Atomic status;
126 };
127 #endif /* CONFIG_SCHED_AUTO_JOIN */
128
129 #if CONFIG_THREAD_GROUPS
/* Flags atomically set in wi_group_flags */
131 #define WORK_INTERVAL_GROUP_FLAGS_THREAD_JOINED 0x1
132 #endif
133
134 /*
135 * Work Interval struct
136 *
137 * This struct represents a thread group and/or work interval context
138 * in a mechanism that is represented with a kobject.
139 *
140 * Every thread that has joined a WI has a +1 ref, and the port
141 * has a +1 ref as well.
142 *
143 * TODO: groups need to have a 'is for WI' flag
144 * and they need a flag to create that says 'for WI'
145 * This would allow CLPC to avoid allocating WI support
146 * data unless it is needed
147 *
148 * TODO: Enforce not having more than one non-group joinable work
149 * interval per thread group.
150 * CLPC only wants to see one WI-notify callout per group.
151 */
152 struct work_interval {
153 uint64_t wi_id;
154 struct os_refcnt wi_ref_count;
155 uint32_t wi_create_flags;
156
157 /* for debugging purposes only, does not hold a ref on port */
158 ipc_port_t wi_port;
159
160 /*
161 * holds uniqueid and version of creating process,
162 * used to permission-gate notify
163 * TODO: you'd think there would be a better way to do this
164 */
165 uint64_t wi_creator_uniqueid;
166 uint32_t wi_creator_pid;
167 int wi_creator_pidversion;
168
169 /* flags set by work_interval_set_workload_id and reflected onto
170 * thread->th_work_interval_flags upon join */
171 uint32_t wi_wlid_flags;
172
173 #if CONFIG_THREAD_GROUPS
174 uint32_t wi_group_flags;
175 struct thread_group *wi_group; /* holds +1 ref on group */
176 #endif /* CONFIG_THREAD_GROUPS */
177
178 #if CONFIG_SCHED_AUTO_JOIN
179 /* Information related to auto-join and deferred finish for work interval */
180 struct work_interval_auto_join_info wi_auto_join_info;
181
182 /*
183 * Since the deallocation of auto-join work intervals
184 * can happen in the scheduler when the last thread in
185 * the WI blocks and the thread lock is held, the deallocation
186 * might have to be done on a separate thread.
187 */
188 struct mpsc_queue_chain wi_deallocate_link;
189 #endif /* CONFIG_SCHED_AUTO_JOIN */
190
191 /*
192 * Work interval class info - determines thread priority for threads
193 * with a work interval driven policy.
194 */
195 wi_class_t wi_class;
196 uint8_t wi_class_offset;
197
198 struct recount_work_interval wi_recount;
199 };
200
201 /*
202 * work_interval_telemetry_data_enabled()
203 *
* Helper routine to check if the work interval has telemetry data collection enabled.
205 */
206 static inline bool
work_interval_telemetry_data_enabled(struct work_interval *work_interval)
208 {
209 return (work_interval->wi_create_flags & WORK_INTERVAL_FLAG_ENABLE_TELEMETRY_DATA) != 0;
210 }
211
212
213 /*
214 * work_interval_get_recount_tracks()
215 *
216 * Returns the recount tracks associated with a work interval, or NULL
217 * if the work interval is NULL or has telemetry disabled.
218 */
219 inline struct recount_track *
work_interval_get_recount_tracks(struct work_interval *work_interval)
221 {
222 if (work_interval != NULL && work_interval_telemetry_data_enabled(work_interval)) {
223 return work_interval->wi_recount.rwi_current_instance;
224 }
225 return NULL;
226 }
227
228 #if CONFIG_SCHED_AUTO_JOIN
229
230 /*
231 * work_interval_perform_deferred_finish()
232 *
233 * Perform a deferred finish for a work interval. The routine accepts the deferred_finish_state as an
234 * argument rather than looking at the work_interval since the deferred finish can race with another
235 * start-finish cycle. To address that, the caller ensures that it gets a consistent snapshot of the
236 * deferred state before calling this routine. This allows the racing start-finish cycle to overwrite
237 * the deferred state without issues.
238 */
239 static inline void
work_interval_perform_deferred_finish(__unused struct work_interval_deferred_finish_state *deferred_finish_state,
241 __unused struct work_interval *work_interval, __unused thread_t thread)
242 {
243
244 KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_WI_DEFERRED_FINISH),
245 thread_tid(thread), thread_group_get_id(work_interval->wi_group));
246 }
247
248 /*
249 * work_interval_auto_join_increment()
250 *
251 * Routine to increment auto-join counter when a new thread is auto-joined to
252 * the work interval.
253 */
254 static void
work_interval_auto_join_increment(struct work_interval *work_interval)
256 {
257 struct work_interval_auto_join_info *join_info = &work_interval->wi_auto_join_info;
258 __assert_only work_interval_auto_join_status_t old_status = os_atomic_add_orig(&join_info->status, 1, relaxed);
259 assert(work_interval_status_auto_join_count(old_status) < WORK_INTERVAL_STATUS_AUTO_JOIN_COUNT_MAX);
260 }
261
262 /*
263 * work_interval_auto_join_decrement()
264 *
265 * Routine to decrement the auto-join counter when a thread unjoins the work interval (due to
266 * blocking or termination). If this was the last auto-joined thread in the work interval and
267 * there was a deferred finish, performs the finish operation for the work interval.
268 */
269 static void
work_interval_auto_join_decrement(struct work_interval *work_interval, thread_t thread)
271 {
272 struct work_interval_auto_join_info *join_info = &work_interval->wi_auto_join_info;
273 work_interval_auto_join_status_t old_status, new_status;
274 struct work_interval_deferred_finish_state deferred_finish_state;
275 bool perform_finish;
276
277 /* Update the auto-join count for the work interval atomically */
278 os_atomic_rmw_loop(&join_info->status, old_status, new_status, acquire, {
279 perform_finish = false;
280 new_status = old_status;
281 assert(work_interval_status_auto_join_count(old_status) > 0);
282 new_status -= 1;
283 if (new_status == WORK_INTERVAL_STATUS_DEFERRED_FINISH_MASK) {
284 /* No auto-joined threads remaining and finish is deferred */
285 new_status = 0;
286 perform_finish = true;
287 /*
* It's important to copy the deferred finish state here so that this works
289 * when racing with another start-finish cycle.
290 */
291 deferred_finish_state = join_info->deferred_finish_state;
292 }
293 });
294
295 if (perform_finish == true) {
296 /*
297 * Since work_interval_perform_deferred_finish() calls down to
298 * the machine layer callout for finish which gets the thread
299 * group from the thread passed in here, it is important to
300 * make sure that the thread still has the work interval thread
301 * group here.
302 */
303 assert(thread->thread_group == work_interval->wi_group);
304 work_interval_perform_deferred_finish(&deferred_finish_state, work_interval, thread);
305 }
306 }
307
308 /*
309 * work_interval_auto_join_enabled()
310 *
311 * Helper routine to check if work interval has auto-join enabled.
312 */
313 static inline bool
work_interval_auto_join_enabled(struct work_interval *work_interval)
315 {
316 return (work_interval->wi_create_flags & WORK_INTERVAL_FLAG_ENABLE_AUTO_JOIN) != 0;
317 }
318
319 /*
320 * work_interval_deferred_finish_enabled()
321 *
322 * Helper routine to check if work interval has deferred finish enabled.
323 */
324 static inline bool __unused
work_interval_deferred_finish_enabled(struct work_interval *work_interval)
326 {
327 return (work_interval->wi_create_flags & WORK_INTERVAL_FLAG_ENABLE_DEFERRED_FINISH) != 0;
328 }
329
330 #endif /* CONFIG_SCHED_AUTO_JOIN */
331
332 static inline void
work_interval_retain(struct work_interval *work_interval)
334 {
335 /*
336 * Even though wi_retain is called under a port lock, we have
337 * to use os_ref_retain instead of os_ref_retain_locked
338 * because wi_release is not synchronized. wi_release calls
339 * os_ref_release which is unsafe to pair with os_ref_retain_locked.
340 */
341 os_ref_retain(&work_interval->wi_ref_count);
342 }
343
344 static inline void
work_interval_deallocate(struct work_interval *work_interval)
346 {
347 KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_WORKGROUP, WORKGROUP_INTERVAL_DESTROY),
348 work_interval->wi_id);
349 if (work_interval_telemetry_data_enabled(work_interval)) {
350 recount_work_interval_deinit(&work_interval->wi_recount);
351 }
352 kfree_type(struct work_interval, work_interval);
353 }
354
355 /*
356 * work_interval_release()
357 *
358 * Routine to release a ref count on the work interval. If the refcount goes down
359 * to zero, the work interval needs to be de-allocated.
360 *
361 * For non auto-join work intervals, they are de-allocated in this context.
362 *
363 * For auto-join work intervals, the de-allocation cannot be done from this context
364 * since that might need the kernel memory allocator lock. In that case, the
365 * deallocation is done via a thread-call based mpsc queue.
366 */
367 static void
work_interval_release(struct work_interval *work_interval, __unused thread_work_interval_options_t options)
369 {
370 if (os_ref_release(&work_interval->wi_ref_count) == 0) {
371 #if CONFIG_SCHED_AUTO_JOIN
372 if (options & THREAD_WI_THREAD_LOCK_HELD) {
373 work_interval_deferred_release(work_interval);
374 } else {
375 work_interval_deallocate(work_interval);
376 }
377 #else /* CONFIG_SCHED_AUTO_JOIN */
378 work_interval_deallocate(work_interval);
379 #endif /* CONFIG_SCHED_AUTO_JOIN */
380 }
381 }
382
383 void
kern_work_interval_release(struct work_interval *work_interval)
385 {
386 work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
387 }
388
389 #if CONFIG_SCHED_AUTO_JOIN
390
391 /*
392 * work_interval_deferred_release()
393 *
394 * Routine to enqueue the work interval on the deallocation mpsc queue.
395 */
396 static void
work_interval_deferred_release(struct work_interval *work_interval)
398 {
399 mpsc_daemon_enqueue(&work_interval_deallocate_queue,
400 &work_interval->wi_deallocate_link, MPSC_QUEUE_NONE);
401 }
402
403 /*
404 * work_interval_should_propagate()
405 *
406 * Main policy routine to decide if a thread should be auto-joined to
407 * another thread's work interval. The conditions are arranged such that
* the most common bailout conditions are checked earliest. This routine
* is called from the scheduler context, so it needs to be efficient and
* careful when taking locks or performing wakeups.
411 */
412 inline bool
work_interval_should_propagate(thread_t cthread, thread_t thread)
414 {
415 /* Only allow propagation if the current thread has a work interval and the woken up thread does not */
416 if ((cthread->th_work_interval == NULL) || (thread->th_work_interval != NULL)) {
417 return false;
418 }
419
420 /* Only propagate work intervals which have auto-join enabled */
421 if (work_interval_auto_join_enabled(cthread->th_work_interval) == false) {
422 return false;
423 }
424
425 /* Work interval propagation is enabled for realtime threads only */
426 if ((cthread->sched_mode != TH_MODE_REALTIME) || (thread->sched_mode != TH_MODE_REALTIME)) {
427 return false;
428 }
429
430
431 /* Work interval propagation only works for threads with the same home thread group */
432 struct thread_group *thread_home_tg = thread_group_get_home_group(thread);
433 if (thread_group_get_home_group(cthread) != thread_home_tg) {
434 return false;
435 }
436
/* If the woken-up thread has adopted a different thread group (e.g. via vouchers), it does not get propagation */
438 if (thread->thread_group != thread_home_tg) {
439 return false;
440 }
441
442 /* If either thread is inactive (in the termination path), do not propagate auto-join */
443 if ((!cthread->active) || (!thread->active)) {
444 return false;
445 }
446
447 return true;
448 }
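/*
 * Typical pairing on the scheduler wakeup path (sketch only; the caller must
 * be the current thread and must hold the lock on the woken thread):
 *
 *     if (work_interval_should_propagate(current_thread(), thread)) {
 *         work_interval_auto_join_propagate(current_thread(), thread);
 *     }
 */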
449
450 /*
451 * work_interval_auto_join_propagate()
452 *
453 * Routine to auto-join a thread into another thread's work interval
454 *
455 * Should only be invoked if work_interval_should_propagate() returns
456 * true. Also expects "from" thread to be current thread and "to" thread
457 * to be locked.
458 */
459 void
work_interval_auto_join_propagate(thread_t from, thread_t to)
461 {
462 assert(from == current_thread());
463 work_interval_retain(from->th_work_interval);
464 work_interval_auto_join_increment(from->th_work_interval);
465 __assert_only kern_return_t kr = thread_set_work_interval(to, from->th_work_interval,
466 THREAD_WI_AUTO_JOIN_POLICY | THREAD_WI_THREAD_LOCK_HELD | THREAD_WI_THREAD_CTX_SWITCH);
467 assert(kr == KERN_SUCCESS);
468 }
469
470 /*
471 * work_interval_auto_join_unwind()
472 *
473 * Routine to un-join an auto-joined work interval for a thread that is blocking.
474 *
475 * Expects thread to be locked.
476 */
477 void
work_interval_auto_join_unwind(thread_t thread)
479 {
480 __assert_only kern_return_t kr = thread_set_work_interval(thread, NULL,
481 THREAD_WI_AUTO_JOIN_POLICY | THREAD_WI_THREAD_LOCK_HELD | THREAD_WI_THREAD_CTX_SWITCH);
482 assert(kr == KERN_SUCCESS);
483 }
484
485 /*
486 * work_interval_auto_join_demote()
487 *
488 * Routine to un-join an auto-joined work interval when a thread is changing from
489 * realtime to non-realtime scheduling mode. This could happen due to multiple
490 * reasons such as RT failsafe, thread backgrounding or thread termination. Also,
491 * the thread being demoted may not be the current thread.
492 *
493 * Expects thread to be locked.
494 */
495 void
work_interval_auto_join_demote(thread_t thread)
497 {
498 __assert_only kern_return_t kr = thread_set_work_interval(thread, NULL,
499 THREAD_WI_AUTO_JOIN_POLICY | THREAD_WI_THREAD_LOCK_HELD);
500 assert(kr == KERN_SUCCESS);
501 }
502
503 static void
work_interval_deallocate_queue_invoke(mpsc_queue_chain_t e,
505 __assert_only mpsc_daemon_queue_t dq)
506 {
507 struct work_interval *work_interval = NULL;
508 work_interval = mpsc_queue_element(e, struct work_interval, wi_deallocate_link);
509 assert(dq == &work_interval_deallocate_queue);
510 assert(os_ref_get_count(&work_interval->wi_ref_count) == 0);
511 work_interval_deallocate(work_interval);
512 }
513
514 #endif /* CONFIG_SCHED_AUTO_JOIN */
515
516 #if CONFIG_SCHED_AUTO_JOIN
517 __startup_func
518 static void
work_interval_subsystem_init(void)
520 {
521 /*
522 * The work interval deallocation queue must be a thread call based queue
523 * because it is woken up from contexts where the thread lock is held. The
524 * only way to perform wakeups safely in those contexts is to wakeup a
525 * thread call which is guaranteed to be on a different waitq and would
526 * not hash onto the same global waitq which might be currently locked.
527 */
528 mpsc_daemon_queue_init_with_thread_call(&work_interval_deallocate_queue,
529 work_interval_deallocate_queue_invoke, THREAD_CALL_PRIORITY_KERNEL,
530 MPSC_DAEMON_INIT_NONE);
531 }
532 STARTUP(THREAD_CALL, STARTUP_RANK_MIDDLE, work_interval_subsystem_init);
533 #endif /* CONFIG_SCHED_AUTO_JOIN */
534
535 /*
536 * work_interval_port_convert
537 *
538 * Called with port locked, returns reference to work interval
539 * if indeed the port is a work interval kobject port
540 */
541 static struct work_interval *
work_interval_port_convert_locked(ipc_port_t port)
543 {
544 struct work_interval *work_interval = NULL;
545
546 if (IP_VALID(port)) {
547 work_interval = ipc_kobject_get_stable(port, IKOT_WORK_INTERVAL);
548 if (work_interval) {
549 work_interval_retain(work_interval);
550 }
551 }
552
553 return work_interval;
554 }
555
556 /*
557 * port_name_to_work_interval
558 *
559 * Description: Obtain a reference to the work_interval associated with a given port.
560 *
561 * Parameters: name A Mach port name to translate.
562 *
* Returns: KERN_SUCCESS *work_interval holds a +1 reference on the work_interval.
* Otherwise The name was invalid or the port does not denote a work_interval.
565 */
566 static kern_return_t
port_name_to_work_interval(mach_port_name_t name,
568 struct work_interval **work_interval)
569 {
570 if (!MACH_PORT_VALID(name)) {
571 return KERN_INVALID_NAME;
572 }
573
574 ipc_port_t port = IP_NULL;
575 kern_return_t kr = KERN_SUCCESS;
576
577 kr = ipc_port_translate_send(current_space(), name, &port);
578 if (kr != KERN_SUCCESS) {
579 return kr;
580 }
581 /* port is locked */
582
583 assert(IP_VALID(port));
584
585 struct work_interval *converted_work_interval;
586
587 converted_work_interval = work_interval_port_convert_locked(port);
588
589 /* the port is valid, but doesn't denote a work_interval */
590 if (converted_work_interval == NULL) {
591 kr = KERN_INVALID_CAPABILITY;
592 }
593
594 ip_mq_unlock(port);
595
596 if (kr == KERN_SUCCESS) {
597 *work_interval = converted_work_interval;
598 }
599
600 return kr;
601 }
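/*
 * Typical caller pattern (sketch), as used by the routines below: translate
 * the port name, inspect the work interval, then drop the +1 ref returned by
 * this routine.
 *
 *     struct work_interval *wi;
 *     kern_return_t kr = port_name_to_work_interval(name, &wi);
 *     if (kr == KERN_SUCCESS) {
 *         // ... use wi ...
 *         work_interval_release(wi, THREAD_WI_THREAD_LOCK_NEEDED);
 *     }
 */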
602
603 kern_return_t
kern_port_name_to_work_interval(mach_port_name_t name,
605 struct work_interval **work_interval)
606 {
607 return port_name_to_work_interval(name, work_interval);
608 }
609
610 /*
611 * work_interval_port_no_senders
612 *
613 * Description: Handle a no-senders notification for a work interval port.
614 * Destroys the port and releases its reference on the work interval.
615 *
* Parameters: port The work interval kobject port that lost all senders.
* mscount The make-send count from the no-senders notification.
617 *
618 * Note: This assumes that there is only one create-right-from-work-interval point,
619 * if the ability to extract another send right after creation is added,
620 * this will have to change to handle make-send counts correctly.
621 */
622 static void
work_interval_port_no_senders(ipc_port_t port, mach_port_mscount_t mscount)
624 {
625 struct work_interval *work_interval = NULL;
626
627 work_interval = ipc_kobject_dealloc_port(port, mscount,
628 IKOT_WORK_INTERVAL);
629
630 work_interval->wi_port = MACH_PORT_NULL;
631
632 work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
633 }
634
635 /*
636 * work_interval_port_type()
637 *
638 * Converts a port name into the work interval object and returns its type.
639 *
640 * For invalid ports, it returns WORK_INTERVAL_TYPE_LAST (which is not a
641 * valid type for work intervals).
642 */
643 static uint32_t
work_interval_port_type(mach_port_name_t port_name)
645 {
646 struct work_interval *work_interval = NULL;
647 kern_return_t kr;
648 uint32_t work_interval_type;
649
650 if (port_name == MACH_PORT_NULL) {
651 return WORK_INTERVAL_TYPE_LAST;
652 }
653
654 kr = port_name_to_work_interval(port_name, &work_interval);
655 if (kr != KERN_SUCCESS) {
656 return WORK_INTERVAL_TYPE_LAST;
657 }
658 /* work_interval has a +1 ref */
659
660 assert(work_interval != NULL);
661 work_interval_type = work_interval->wi_create_flags & WORK_INTERVAL_TYPE_MASK;
662 work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
663 return work_interval_type;
664 }
665
666 /*
667 * Sparse - not all work interval classes imply a scheduling policy change.
668 * The REALTIME_CRITICAL class *also* requires the thread to have explicitly
669 * adopted the REALTIME sched mode to take effect.
670 */
671 static const struct {
672 int priority;
673 sched_mode_t sched_mode;
674 } work_interval_class_data[WI_CLASS_COUNT] = {
675 [WI_CLASS_BEST_EFFORT] = {
676 BASEPRI_DEFAULT, // 31
677 TH_MODE_TIMESHARE,
678 },
679
680 [WI_CLASS_APP_SUPPORT] = {
681 BASEPRI_USER_INITIATED, // 37
682 TH_MODE_TIMESHARE,
683 },
684
685 [WI_CLASS_SYSTEM] = {
686 BASEPRI_FOREGROUND + 1, // 48
687 TH_MODE_FIXED,
688 },
689
690 [WI_CLASS_SYSTEM_CRITICAL] = {
691 MAXPRI_USER + 1, // 64
692 TH_MODE_FIXED,
693 },
694
695 [WI_CLASS_REALTIME_CRITICAL] = {
696 BASEPRI_RTQUEUES + 1, // 98
697 TH_MODE_REALTIME,
698 },
699 };
700
701 /*
702 * Called when a thread gets its scheduling priority from its associated work
703 * interval.
704 */
705 int
work_interval_get_priority(thread_t thread)
707 {
708 const struct work_interval *work_interval = thread->th_work_interval;
709 assert(work_interval != NULL);
710
711 assert3u(work_interval->wi_class, !=, WI_CLASS_NONE);
712 assert3u(work_interval->wi_class, <, WI_CLASS_COUNT);
713 int priority = work_interval_class_data[work_interval->wi_class].priority;
714 assert(priority != 0);
715
716 priority += work_interval->wi_class_offset;
717 assert3u(priority, <=, MAXPRI);
718
719 return priority;
720 }
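/*
 * Worked example (illustrative): a work interval with wi_class ==
 * WI_CLASS_SYSTEM and wi_class_offset == 2 yields BASEPRI_FOREGROUND + 1 + 2,
 * i.e. priority 50 per the class table above, and TH_MODE_FIXED as the
 * associated scheduling mode.
 */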
721
722 kern_return_t
kern_work_interval_get_policy(struct work_interval *work_interval,
724 integer_t *policy,
725 integer_t *priority)
726 {
727 if (!work_interval || !priority || !policy) {
728 return KERN_INVALID_ARGUMENT;
729 }
730
731 assert3u(work_interval->wi_class, <, WI_CLASS_COUNT);
732
733 const sched_mode_t mode = work_interval_class_data[work_interval->wi_class].sched_mode;
734 if ((mode == TH_MODE_TIMESHARE) || (mode == TH_MODE_FIXED)) {
735 *policy = ((mode == TH_MODE_TIMESHARE)? POLICY_TIMESHARE: POLICY_RR);
736 *priority = work_interval_class_data[work_interval->wi_class].priority;
737 assert(*priority != 0);
738 *priority += work_interval->wi_class_offset;
739 assert3u(*priority, <=, MAXPRI);
740 } /* No sched mode change for REALTIME (threads must explicitly opt-in) */
741 return KERN_SUCCESS;
742 }
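/*
 * Worked example (illustrative): for WI_CLASS_APP_SUPPORT this returns
 * POLICY_TIMESHARE at priority 37 (+ wi_class_offset); for
 * WI_CLASS_SYSTEM_CRITICAL it returns POLICY_RR at priority 64
 * (+ wi_class_offset). For WI_CLASS_REALTIME_CRITICAL *policy and *priority
 * are left untouched since threads must opt into the realtime mode explicitly.
 */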
743
744 #if CONFIG_THREAD_GROUPS
745 kern_return_t
kern_work_interval_get_thread_group(struct work_interval *work_interval,
747 struct thread_group **tg)
748 {
749 if (!work_interval || !tg) {
750 return KERN_INVALID_ARGUMENT;
751 }
752 if (work_interval->wi_group) {
753 *tg = thread_group_retain(work_interval->wi_group);
754 return KERN_SUCCESS;
755 } else {
756 return KERN_INVALID_ARGUMENT;
757 }
758 }
759 #endif /* CONFIG_THREAD_GROUPS */
760
761 /*
762 * Switch to a policy driven by the work interval (if applicable).
763 */
764 static void
work_interval_set_policy(thread_t thread)
766 {
767 assert3p(thread, ==, current_thread());
768
769 /*
770 * Ignore policy changes if the workload context shouldn't affect the
771 * scheduling policy.
772 */
773 workload_config_flags_t flags = WLC_F_NONE;
774
775 /* There may be no config at all. That's ok. */
776 if (workload_config_get_flags(&flags) != KERN_SUCCESS ||
777 (flags & WLC_F_THREAD_POLICY) == 0) {
778 return;
779 }
780
781 const struct work_interval *work_interval = thread->th_work_interval;
782 assert(work_interval != NULL);
783
784 assert3u(work_interval->wi_class, <, WI_CLASS_COUNT);
785 const sched_mode_t mode = work_interval_class_data[work_interval->wi_class].sched_mode;
786
787 /*
788 * A mode of TH_MODE_NONE implies that this work interval has no
789 * associated scheduler effects.
790 */
791 if (mode == TH_MODE_NONE) {
792 return;
793 }
794
795 proc_set_thread_policy_ext(thread, TASK_POLICY_ATTRIBUTE,
796 TASK_POLICY_WI_DRIVEN, true, mode);
797 assert(thread->requested_policy.thrp_wi_driven);
798
799 return;
800 }
801
802 /*
803 * Clear a work interval driven policy.
804 */
805 static void
work_interval_clear_policy(thread_t thread)
807 {
808 assert3p(thread, ==, current_thread());
809
810 if (!thread->requested_policy.thrp_wi_driven) {
811 return;
812 }
813
814 const sched_mode_t mode = sched_get_thread_mode_user(thread);
815
816 proc_set_thread_policy_ext(thread, TASK_POLICY_ATTRIBUTE,
817 TASK_POLICY_WI_DRIVEN, false,
818 mode == TH_MODE_REALTIME ? mode : TH_MODE_TIMESHARE);
819
820 assert(!thread->requested_policy.thrp_wi_driven);
821
822 return;
823 }
824
825 /*
826 * thread_set_work_interval()
827 *
828 * Change thread's bound work interval to the passed-in work interval
829 * Consumes +1 ref on work_interval upon success.
830 *
831 * May also pass NULL to un-set work_interval on the thread
832 * Will deallocate any old work interval on the thread
833 * Return error if thread does not satisfy requirements to join work interval
834 *
835 * For non auto-join work intervals, deallocate any old work interval on the thread
836 * For auto-join work intervals, the routine may wakeup the work interval deferred
837 * deallocation queue since thread locks might be currently held.
838 */
839 static kern_return_t
thread_set_work_interval(thread_t thread,
841 struct work_interval *work_interval, thread_work_interval_options_t options)
842 {
843 /* All explicit work interval operations should always be from the current thread */
844 if (options & THREAD_WI_EXPLICIT_JOIN_POLICY) {
845 assert(thread == current_thread());
846 }
847
848 /* All cases of needing the thread lock should be from explicit join scenarios */
849 if (options & THREAD_WI_THREAD_LOCK_NEEDED) {
850 assert((options & THREAD_WI_EXPLICIT_JOIN_POLICY) != 0);
851 }
852
853 /* For all cases of auto join must come in with the thread lock held */
854 if (options & THREAD_WI_AUTO_JOIN_POLICY) {
855 assert((options & THREAD_WI_THREAD_LOCK_HELD) != 0);
856 }
857
858 #if CONFIG_THREAD_GROUPS
859 if (work_interval && !work_interval->wi_group) {
860 /* Reject join on work intervals with deferred thread group creation */
861 return KERN_INVALID_ARGUMENT;
862 }
863 #endif /* CONFIG_THREAD_GROUPS */
864
865 if (work_interval) {
866 uint32_t work_interval_type = work_interval->wi_create_flags & WORK_INTERVAL_TYPE_MASK;
867
868 if (options & THREAD_WI_EXPLICIT_JOIN_POLICY) {
869 /* Ensure no kern_work_interval_set_workload_id can happen after this point */
870 uint32_t wlid_flags;
871 (void)os_atomic_cmpxchgv(&work_interval->wi_wlid_flags, 0,
872 WORK_INTERVAL_WORKLOAD_ID_ALREADY_JOINED, &wlid_flags, relaxed);
873 if (wlid_flags & WORK_INTERVAL_WORKLOAD_ID_RT_ALLOWED) {
874 /* For workload IDs with rt-allowed, neuter the check below to
875 * enable joining before the thread has become realtime for all
876 * work interval types */
877 work_interval_type = WORK_INTERVAL_TYPE_DEFAULT;
878 }
879 }
880
881 if ((work_interval_type == WORK_INTERVAL_TYPE_COREAUDIO) &&
882 (thread->sched_mode != TH_MODE_REALTIME) && (thread->saved_mode != TH_MODE_REALTIME)) {
883 return KERN_INVALID_ARGUMENT;
884 }
885 }
886
887 /*
888 * Ensure a work interval scheduling policy is not used if the thread is
889 * leaving the work interval.
890 */
891 if (work_interval == NULL &&
892 (options & THREAD_WI_EXPLICIT_JOIN_POLICY) != 0) {
893 work_interval_clear_policy(thread);
894 }
895
896 struct work_interval *old_th_wi = thread->th_work_interval;
897 #if CONFIG_SCHED_AUTO_JOIN
898 spl_t s;
899 /* Take the thread lock if needed */
900 if (options & THREAD_WI_THREAD_LOCK_NEEDED) {
901 s = splsched();
902 thread_lock(thread);
903 }
904
905 /*
906 * Work interval auto-join leak to non-RT threads.
907 *
* If the thread might be running on a remote core and this is not the context switch path (where
* the thread is neither running, blocked, nor on a runq), it is not possible to update the
* work interval & thread group remotely since it is not possible to update CLPC for a remote
911 * core. This situation might happen when a thread is transitioning from realtime to
912 * non-realtime due to backgrounding etc., which would mean that non-RT threads would now
913 * be part of the work interval.
914 *
915 * Since there is no immediate mitigation to this issue, the policy is to set a new
916 * flag on the thread which indicates that such a "leak" has happened. This flag will
917 * be cleared when the remote thread eventually blocks and unjoins from the work interval.
918 */
919 bool thread_on_remote_core = ((thread != current_thread()) && (thread->state & TH_RUN) && (thread_get_runq(thread) == PROCESSOR_NULL));
920
921 if (thread_on_remote_core && ((options & THREAD_WI_THREAD_CTX_SWITCH) == 0)) {
922 assert((options & THREAD_WI_THREAD_LOCK_NEEDED) == 0);
923 os_atomic_or(&thread->th_work_interval_flags, TH_WORK_INTERVAL_FLAGS_AUTO_JOIN_LEAK, relaxed);
924 return KERN_SUCCESS;
925 }
926
927 const bool old_wi_auto_joined = ((thread->sched_flags & TH_SFLAG_THREAD_GROUP_AUTO_JOIN) != 0);
928
929 if ((options & THREAD_WI_AUTO_JOIN_POLICY) || old_wi_auto_joined) {
930 __kdebug_only uint64_t old_tg_id = (old_th_wi && old_th_wi->wi_group) ? thread_group_get_id(old_th_wi->wi_group) : ~0;
931 __kdebug_only uint64_t new_tg_id = (work_interval && work_interval->wi_group) ? thread_group_get_id(work_interval->wi_group) : ~0;
932 KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_WI_AUTO_JOIN),
933 thread_tid(thread), old_tg_id, new_tg_id, options);
934 }
935
936 if (old_wi_auto_joined) {
937 /*
938 * If thread was auto-joined to a work interval and is not realtime, make sure it
939 * happened due to the "leak" described above.
940 */
941 if (thread->sched_mode != TH_MODE_REALTIME) {
942 assert((thread->th_work_interval_flags & TH_WORK_INTERVAL_FLAGS_AUTO_JOIN_LEAK) != 0);
943 }
944
945 os_atomic_andnot(&thread->th_work_interval_flags, TH_WORK_INTERVAL_FLAGS_AUTO_JOIN_LEAK, relaxed);
946 work_interval_auto_join_decrement(old_th_wi, thread);
947 thread->sched_flags &= ~TH_SFLAG_THREAD_GROUP_AUTO_JOIN;
948 }
949
950 #endif /* CONFIG_SCHED_AUTO_JOIN */
951
952 KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_WORKGROUP, WORKGROUP_INTERVAL_CHANGE),
953 thread_tid(thread), (old_th_wi ? old_th_wi->wi_id : 0), (work_interval ? work_interval->wi_id : 0), !!(options & THREAD_WI_AUTO_JOIN_POLICY));
954
955 /* transfer +1 ref to thread */
956 thread->th_work_interval = work_interval;
957
958 #if CONFIG_SCHED_AUTO_JOIN
959
960 if ((options & THREAD_WI_AUTO_JOIN_POLICY) && work_interval) {
961 assert(work_interval_auto_join_enabled(work_interval) == true);
962 thread->sched_flags |= TH_SFLAG_THREAD_GROUP_AUTO_JOIN;
963 }
964
965 if (options & THREAD_WI_THREAD_LOCK_NEEDED) {
966 thread_unlock(thread);
967 splx(s);
968 }
969 #endif /* CONFIG_SCHED_AUTO_JOIN */
970
971 /*
972 * The thread got a new work interval. It may come with a work interval
973 * scheduling policy that needs to be applied.
974 */
975 if (work_interval != NULL &&
976 (options & THREAD_WI_EXPLICIT_JOIN_POLICY) != 0) {
977 work_interval_set_policy(thread);
978 }
979
980 #if CONFIG_THREAD_GROUPS
981 if (work_interval) {
/* Prevent thread_group_set_name() from taking effect after CLPC may already have heard
983 * about the thread group */
984 (void)os_atomic_cmpxchg(&work_interval->wi_group_flags, 0,
985 WORK_INTERVAL_GROUP_FLAGS_THREAD_JOINED, relaxed);
986 }
987 struct thread_group *new_tg = (work_interval) ? (work_interval->wi_group) : NULL;
988
989 if (options & THREAD_WI_AUTO_JOIN_POLICY) {
990 #if CONFIG_SCHED_AUTO_JOIN
991 thread_set_autojoin_thread_group_locked(thread, new_tg);
992 #endif
993 } else {
994 thread_set_work_interval_thread_group(thread, new_tg);
995 }
996 #endif /* CONFIG_THREAD_GROUPS */
997
998 if (options & THREAD_WI_EXPLICIT_JOIN_POLICY) {
999 /* Construct mask to XOR with th_work_interval_flags to clear the
1000 * currently present flags and set the new flags in wlid_flags. */
1001 uint32_t wlid_flags = 0;
1002 if (work_interval) {
1003 wlid_flags = os_atomic_load(&work_interval->wi_wlid_flags, relaxed);
1004 }
1005 thread_work_interval_flags_t th_wi_xor_mask = os_atomic_load(
1006 &thread->th_work_interval_flags, relaxed);
1007 th_wi_xor_mask &= (TH_WORK_INTERVAL_FLAGS_HAS_WORKLOAD_ID |
1008 TH_WORK_INTERVAL_FLAGS_RT_ALLOWED);
1009 if (wlid_flags & WORK_INTERVAL_WORKLOAD_ID_HAS_ID) {
1010 th_wi_xor_mask ^= TH_WORK_INTERVAL_FLAGS_HAS_WORKLOAD_ID;
1011 if (wlid_flags & WORK_INTERVAL_WORKLOAD_ID_RT_ALLOWED) {
1012 th_wi_xor_mask ^= TH_WORK_INTERVAL_FLAGS_RT_ALLOWED;
1013 }
1014 }
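/*
 * Example (illustrative): if the thread currently has
 * TH_WORK_INTERVAL_FLAGS_HAS_WORKLOAD_ID set but the new work interval
 * carries no workload ID, the mask equals that flag and the XOR below clears
 * it. Conversely, if the thread has neither flag set and the new workload ID
 * is RT-allowed, the mask sets both HAS_WORKLOAD_ID and RT_ALLOWED in one
 * atomic XOR.
 */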
1015 if (th_wi_xor_mask) {
1016 os_atomic_xor(&thread->th_work_interval_flags, th_wi_xor_mask, relaxed);
1017 }
1018
1019 /*
1020 * Now that the interval flags have been set, re-evaluate
1021 * whether the thread needs to be undemoted - the new work
* interval may have the RT_ALLOWED flag, and the thread may
* have a realtime policy but be demoted.
1024 */
1025 thread_rt_evaluate(thread);
1026 }
1027
1028 if (old_th_wi != NULL) {
1029 work_interval_release(old_th_wi, options);
1030 }
1031
1032 return KERN_SUCCESS;
1033 }
1034
1035 static kern_return_t
thread_set_work_interval_explicit_join(thread_t thread, struct work_interval *work_interval)
1037 {
1038 assert(thread == current_thread());
1039 return thread_set_work_interval(thread, work_interval, THREAD_WI_EXPLICIT_JOIN_POLICY | THREAD_WI_THREAD_LOCK_NEEDED);
1040 }
1041
1042 kern_return_t
work_interval_thread_terminate(thread_t thread)
1044 {
1045 assert(thread == current_thread());
1046 if (thread->th_work_interval != NULL) {
1047 return thread_set_work_interval(thread, NULL, THREAD_WI_EXPLICIT_JOIN_POLICY | THREAD_WI_THREAD_LOCK_NEEDED);
1048 }
1049 return KERN_SUCCESS;
1050 }
1051
1052 kern_return_t
kern_work_interval_notify(thread_t thread, struct kern_work_interval_args* kwi_args)
1054 {
1055 assert(thread == current_thread());
1056 assert(kwi_args->work_interval_id != 0);
1057
1058 struct work_interval *work_interval = thread->th_work_interval;
1059
1060 if (work_interval == NULL ||
1061 work_interval->wi_id != kwi_args->work_interval_id) {
1062 /* This thread must have adopted the work interval to be able to notify */
1063 return KERN_INVALID_ARGUMENT;
1064 }
1065
1066 task_t notifying_task = current_task();
1067
1068 if (work_interval->wi_creator_uniqueid != get_task_uniqueid(notifying_task) ||
1069 work_interval->wi_creator_pidversion != get_task_version(notifying_task)) {
1070 /* Only the creating task can do a notify */
1071 return KERN_INVALID_ARGUMENT;
1072 }
1073
1074 spl_t s = splsched();
1075
1076 #if CONFIG_THREAD_GROUPS
1077 assert(work_interval->wi_group == thread->thread_group);
1078 #endif /* CONFIG_THREAD_GROUPS */
1079
1080 uint64_t urgency_param1, urgency_param2;
1081 kwi_args->urgency = (uint16_t)thread_get_urgency(thread, &urgency_param1, &urgency_param2);
1082
1083 splx(s);
1084
1085 /* called without interrupts disabled */
1086 machine_work_interval_notify(thread, kwi_args);
1087
1088 return KERN_SUCCESS;
1089 }
1090
1091 /* Start at 1, 0 is not a valid work interval ID */
1092 static _Atomic uint64_t unique_work_interval_id = 1;
1093
1094 kern_return_t
kern_work_interval_create(thread_t thread,
1096 struct kern_work_interval_create_args *create_params)
1097 {
1098 assert(thread == current_thread());
1099
1100 uint32_t create_flags = create_params->wica_create_flags;
1101
1102 if (((create_flags & WORK_INTERVAL_FLAG_JOINABLE) == 0) &&
1103 thread->th_work_interval != NULL) {
1104 /*
1105 * If the thread is doing a legacy combined create and join,
1106 * it shouldn't already be part of a work interval.
1107 *
1108 * (Creating a joinable WI is allowed anytime.)
1109 */
1110 return KERN_FAILURE;
1111 }
1112
1113 /*
1114 * Check the validity of the create flags before allocating the work
1115 * interval.
1116 */
1117 task_t creating_task = current_task();
1118 if ((create_flags & WORK_INTERVAL_TYPE_MASK) == WORK_INTERVAL_TYPE_CA_CLIENT) {
1119 /*
1120 * CA_CLIENT work intervals do not create new thread groups.
1121 * There can only be one CA_CLIENT work interval (created by UIKit or AppKit)
* per application task
1123 */
1124 if (create_flags & WORK_INTERVAL_FLAG_GROUP) {
1125 return KERN_FAILURE;
1126 }
1127 if (!task_is_app(creating_task)) {
1128 #if XNU_TARGET_OS_OSX
1129 /*
1130 * Soft-fail the case of a non-app pretending to be an
1131 * app, by allowing it to press the buttons, but they're
1132 * not actually connected to anything.
1133 */
1134 create_flags |= WORK_INTERVAL_FLAG_IGNORED;
1135 #else
1136 /*
1137 * On iOS, it's a hard failure to get your apptype
1138 * wrong and then try to render something.
1139 */
1140 return KERN_NOT_SUPPORTED;
1141 #endif /* XNU_TARGET_OS_OSX */
1142 }
1143 if (task_set_ca_client_wi(creating_task, true) == false) {
1144 return KERN_FAILURE;
1145 }
1146 }
1147
1148 #if CONFIG_SCHED_AUTO_JOIN
1149 if (create_flags & WORK_INTERVAL_FLAG_ENABLE_AUTO_JOIN) {
1150 uint32_t type = (create_flags & WORK_INTERVAL_TYPE_MASK);
1151 if (type != WORK_INTERVAL_TYPE_COREAUDIO) {
1152 return KERN_NOT_SUPPORTED;
1153 }
1154 if ((create_flags & WORK_INTERVAL_FLAG_GROUP) == 0) {
1155 return KERN_NOT_SUPPORTED;
1156 }
1157 }
1158
1159 if (create_flags & WORK_INTERVAL_FLAG_ENABLE_DEFERRED_FINISH) {
1160 if ((create_flags & WORK_INTERVAL_FLAG_ENABLE_AUTO_JOIN) == 0) {
1161 return KERN_NOT_SUPPORTED;
1162 }
1163 }
1164 #endif /* CONFIG_SCHED_AUTO_JOIN */
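/*
 * Example (illustrative) of create_flags that pass the checks above: an
 * auto-join work interval must be of type WORK_INTERVAL_TYPE_COREAUDIO and
 * must create its own thread group, e.g.
 *
 *     WORK_INTERVAL_TYPE_COREAUDIO | WORK_INTERVAL_FLAG_GROUP |
 *     WORK_INTERVAL_FLAG_ENABLE_AUTO_JOIN
 *
 * optionally with WORK_INTERVAL_FLAG_ENABLE_DEFERRED_FINISH, which is only
 * valid on top of ENABLE_AUTO_JOIN.
 */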
1165
1166 struct work_interval *work_interval = kalloc_type(struct work_interval,
1167 Z_WAITOK | Z_ZERO | Z_NOFAIL);
1168
1169 uint64_t work_interval_id = os_atomic_inc(&unique_work_interval_id, relaxed);
1170
1171 *work_interval = (struct work_interval) {
1172 .wi_id = work_interval_id,
1173 .wi_ref_count = {},
1174 .wi_create_flags = create_flags,
1175 .wi_creator_pid = pid_from_task(creating_task),
1176 .wi_creator_uniqueid = get_task_uniqueid(creating_task),
1177 .wi_creator_pidversion = get_task_version(creating_task),
1178 };
1179 os_ref_init(&work_interval->wi_ref_count, NULL);
1180
1181 if (work_interval_telemetry_data_enabled(work_interval)) {
1182 recount_work_interval_init(&work_interval->wi_recount);
1183 }
1184
1185 __kdebug_only uint64_t tg_id = 0;
1186 #if CONFIG_THREAD_GROUPS
1187 struct thread_group *tg;
1188 if ((create_flags &
1189 (WORK_INTERVAL_FLAG_GROUP | WORK_INTERVAL_FLAG_HAS_WORKLOAD_ID)) ==
1190 (WORK_INTERVAL_FLAG_GROUP | WORK_INTERVAL_FLAG_HAS_WORKLOAD_ID)) {
1191 /* defer creation of the thread group until the
1192 * kern_work_interval_set_workload_id() call */
1193 work_interval->wi_group = NULL;
1194 } else if (create_flags & WORK_INTERVAL_FLAG_GROUP) {
1195 /* create a new group for the interval to represent */
1196 char name[THREAD_GROUP_MAXNAME] = "";
1197
1198 snprintf(name, sizeof(name), "WI%lld (pid %d)", work_interval_id,
1199 work_interval->wi_creator_pid);
1200
1201 tg = thread_group_create_and_retain(THREAD_GROUP_FLAGS_DEFAULT);
1202
1203 thread_group_set_name(tg, name);
1204
1205 work_interval->wi_group = tg;
1206 } else {
1207 /* the interval represents the thread's home group */
1208 tg = thread_group_get_home_group(thread);
1209
1210 thread_group_retain(tg);
1211
1212 work_interval->wi_group = tg;
1213 }
1214
1215 /* Capture the tg_id for tracing purposes */
1216 tg_id = work_interval->wi_group ? thread_group_get_id(work_interval->wi_group) : ~0;
1217
1218 #endif /* CONFIG_THREAD_GROUPS */
1219
1220 if (create_flags & WORK_INTERVAL_FLAG_JOINABLE) {
1221 mach_port_name_t name = MACH_PORT_NULL;
1222
1223 /* work_interval has a +1 ref, moves to the port */
1224 work_interval->wi_port = ipc_kobject_alloc_port(work_interval,
1225 IKOT_WORK_INTERVAL, IPC_KOBJECT_ALLOC_MAKE_SEND);
1226
1227
1228 name = ipc_port_copyout_send(work_interval->wi_port, current_space());
1229
1230 if (!MACH_PORT_VALID(name)) {
1231 /*
1232 * copyout failed (port is already deallocated)
1233 * Because of the port-destroyed magic,
1234 * the work interval is already deallocated too.
1235 */
1236 return KERN_RESOURCE_SHORTAGE;
1237 }
1238
1239 create_params->wica_port = name;
1240 } else {
1241 /* work_interval has a +1 ref, moves to the thread */
1242 kern_return_t kr = thread_set_work_interval_explicit_join(thread, work_interval);
1243 if (kr != KERN_SUCCESS) {
1244 /* No other thread can join this work interval since it isn't
1245 * JOINABLE so release the reference on work interval */
1246 work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
1247 return kr;
1248 }
1249
1250 create_params->wica_port = MACH_PORT_NULL;
1251 }
1252
1253 create_params->wica_id = work_interval_id;
1254
1255 if (tg_id != ~0) {
1256 KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_WORKGROUP, WORKGROUP_INTERVAL_CREATE),
1257 work_interval_id, create_flags, pid_from_task(creating_task), tg_id);
1258 }
1259 return KERN_SUCCESS;
1260 }
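/*
 * Sketch of the two creation modes above (illustrative, not compiled):
 *
 *     struct kern_work_interval_create_args args = {
 *         .wica_create_flags = WORK_INTERVAL_TYPE_DEFAULT |
 *                              WORK_INTERVAL_FLAG_JOINABLE,
 *     };
 *     kr = kern_work_interval_create(current_thread(), &args);
 *     // On success, args.wica_port names a send right in the caller's space;
 *     // threads in the same task can later pass it to kern_work_interval_join().
 *
 * Without WORK_INTERVAL_FLAG_JOINABLE, the creating thread is joined
 * immediately and args.wica_port is set to MACH_PORT_NULL.
 */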
1261
1262 kern_return_t
kern_work_interval_get_flags_from_port(mach_port_name_t port_name, uint32_t *flags)
1264 {
1265 assert(flags != NULL);
1266
1267 kern_return_t kr;
1268 struct work_interval *work_interval;
1269
1270 kr = port_name_to_work_interval(port_name, &work_interval);
1271 if (kr != KERN_SUCCESS) {
1272 return kr;
1273 }
1274
1275 assert(work_interval != NULL);
1276 *flags = work_interval->wi_create_flags;
1277
1278 work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
1279
1280 return KERN_SUCCESS;
1281 }
1282
1283 #if CONFIG_THREAD_GROUPS
1284 _Static_assert(WORK_INTERVAL_NAME_MAX == THREAD_GROUP_MAXNAME,
1285 "WORK_INTERVAL_NAME_MAX does not match THREAD_GROUP_MAXNAME");
1286 #endif /* CONFIG_THREAD_GROUPS */
1287
1288 kern_return_t
kern_work_interval_set_name(mach_port_name_t port_name, __unused char *name,
1290 size_t len)
1291 {
1292 kern_return_t kr;
1293 struct work_interval *work_interval;
1294
1295 if (len > WORK_INTERVAL_NAME_MAX) {
1296 return KERN_INVALID_ARGUMENT;
1297 }
1298 kr = port_name_to_work_interval(port_name, &work_interval);
1299 if (kr != KERN_SUCCESS) {
1300 return kr;
1301 }
1302
1303 assert(work_interval != NULL);
1304
1305 #if CONFIG_THREAD_GROUPS
1306 uint32_t wi_group_flags = os_atomic_load(
1307 &work_interval->wi_group_flags, relaxed);
1308 if (wi_group_flags & WORK_INTERVAL_GROUP_FLAGS_THREAD_JOINED) {
1309 kr = KERN_INVALID_ARGUMENT;
1310 goto out;
1311 }
1312 if (!work_interval->wi_group) {
1313 kr = KERN_INVALID_ARGUMENT;
1314 goto out;
1315 }
1316
1317 if (name[0] && (work_interval->wi_create_flags & WORK_INTERVAL_FLAG_GROUP)) {
1318 char tgname[THREAD_GROUP_MAXNAME];
1319 snprintf(tgname, sizeof(tgname), "WI%lld %s", work_interval->wi_id,
1320 name);
1321 thread_group_set_name(work_interval->wi_group, tgname);
1322 }
1323
1324 out:
1325 #endif /* CONFIG_THREAD_GROUPS */
1326 work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
1327
1328 return kr;
1329 }
1330
1331 kern_return_t
kern_work_interval_set_workload_id(mach_port_name_t port_name,
1333 struct kern_work_interval_workload_id_args *workload_id_args)
1334 {
1335 kern_return_t kr;
1336 struct work_interval *work_interval;
1337 uint32_t wlida_flags = 0;
1338 uint32_t wlid_flags = 0;
1339 #if CONFIG_THREAD_GROUPS
1340 uint32_t tg_flags = 0;
1341 #endif
1342 bool from_workload_config = false;
1343
1344 /* Ensure workload ID name is non-empty. */
1345 if (!workload_id_args->wlida_name[0]) {
1346 return KERN_INVALID_ARGUMENT;
1347 }
1348
1349 kr = port_name_to_work_interval(port_name, &work_interval);
1350 if (kr != KERN_SUCCESS) {
1351 return kr;
1352 }
1353
1354 assert(work_interval != NULL);
1355 if (!(work_interval->wi_create_flags & WORK_INTERVAL_FLAG_JOINABLE)) {
1356 kr = KERN_INVALID_ARGUMENT;
1357 goto out;
1358 }
1359
1360 if (!(work_interval->wi_create_flags & WORK_INTERVAL_FLAG_HAS_WORKLOAD_ID)) {
1361 /* Reject work intervals that didn't indicate they will have a workload ID
1362 * at creation. In particular if the work interval has its own thread group,
1363 * its creation must have been deferred in kern_work_interval_create */
1364 kr = KERN_INVALID_ARGUMENT;
1365 goto out;
1366 }
1367
1368 workload_config_t wl_config = {};
1369 kr = workload_config_lookup_default(workload_id_args->wlida_name, &wl_config);
1370 if (kr == KERN_SUCCESS) {
1371 if ((wl_config.wc_create_flags & WORK_INTERVAL_TYPE_MASK) !=
1372 (work_interval->wi_create_flags & WORK_INTERVAL_TYPE_MASK)) {
1373 if ((wl_config.wc_create_flags & WORK_INTERVAL_TYPE_MASK) == WORK_INTERVAL_TYPE_CA_RENDER_SERVER &&
1374 (work_interval->wi_create_flags & WORK_INTERVAL_TYPE_MASK) == WORK_INTERVAL_TYPE_FRAME_COMPOSITOR) {
1375 /* WORK_INTERVAL_TYPE_FRAME_COMPOSITOR is a valid related type of WORK_INTERVAL_TYPE_CA_RENDER_SERVER */
1376 } else {
1377 kr = KERN_INVALID_ARGUMENT;
1378 goto out;
1379 }
1380 }
1381
1382 wlida_flags = wl_config.wc_flags;
1383
1384 #if !defined(XNU_TARGET_OS_XR)
1385 wlida_flags &= ~WORK_INTERVAL_WORKLOAD_ID_RT_CRITICAL;
1386 #endif /* !XNU_TARGET_OS_XR */
1387
1388 #if CONFIG_THREAD_GROUPS
1389 tg_flags = wl_config.wc_thread_group_flags;
1390 if (tg_flags != THREAD_GROUP_FLAGS_ABSENT &&
1391 (work_interval->wi_create_flags & WORK_INTERVAL_FLAG_GROUP) == 0) {
1392 kr = KERN_INVALID_ARGUMENT;
1393 goto out;
1394 }
1395 #endif /* CONFIG_THREAD_GROUPS */
1396
1397 from_workload_config = true;
1398 } else {
1399 /* If the workload is not present in the table, perform basic validation
1400 * that the create flags passed in match the ones used at work interval
1401 * create time */
1402 if ((workload_id_args->wlida_wicreate_flags & WORK_INTERVAL_TYPE_MASK) !=
1403 (work_interval->wi_create_flags & WORK_INTERVAL_TYPE_MASK)) {
1404 kr = KERN_INVALID_ARGUMENT;
1405 goto out;
1406 }
1407
1408 const bool wc_avail = workload_config_available();
1409 if (!wc_avail) {
1410 wlida_flags = WORK_INTERVAL_WORKLOAD_ID_RT_ALLOWED;
1411 }
1412
1413 if (workload_id_args->wlida_flags & WORK_INTERVAL_WORKLOAD_ID_COMPLEXITY_ALLOWED) {
1414 wlida_flags |= WORK_INTERVAL_WORKLOAD_ID_COMPLEXITY_ALLOWED;
1415 }
1416
1417 /*
1418 * If the workload config wasn't even loaded then fallback to
1419 * older behaviour where the new thread group gets the default
1420 * thread group flags (when WORK_INTERVAL_FLAG_GROUP is set).
1421 */
1422 #if CONFIG_THREAD_GROUPS
1423 if (!wc_avail) {
1424 tg_flags = THREAD_GROUP_FLAGS_DEFAULT;
1425 } else {
1426 struct thread_group *home_group =
1427 thread_group_get_home_group(current_thread());
1428 if (home_group != NULL) {
1429 tg_flags = thread_group_get_flags(home_group);
1430 }
1431 }
1432 #endif /* CONFIG_THREAD_GROUPS */
1433 }
1434
1435 workload_id_args->wlida_wicreate_flags = work_interval->wi_create_flags;
1436
1437 /* cmpxchg a non-zero workload ID flags value (indicating that workload ID
1438 * has been set). */
1439 wlida_flags |= WORK_INTERVAL_WORKLOAD_ID_HAS_ID;
1440 if (os_atomic_cmpxchgv(&work_interval->wi_wlid_flags, 0, wlida_flags,
1441 &wlid_flags, relaxed)) {
1442 if (from_workload_config) {
1443 work_interval->wi_class = wl_config.wc_class;
1444 work_interval->wi_class_offset = wl_config.wc_class_offset;
1445 }
1446 #if CONFIG_THREAD_GROUPS
1447 if (work_interval->wi_create_flags & WORK_INTERVAL_FLAG_GROUP) {
1448 /* Perform deferred thread group creation, now that tgflags are known */
1449 struct thread_group *tg;
1450 tg = thread_group_create_and_retain(tg_flags == THREAD_GROUP_FLAGS_ABSENT ?
1451 THREAD_GROUP_FLAGS_DEFAULT : tg_flags);
1452
1453 char tgname[THREAD_GROUP_MAXNAME] = "";
1454 snprintf(tgname, sizeof(tgname), "WI%lld %s", work_interval->wi_id,
1455 workload_id_args->wlida_name);
1456 thread_group_set_name(tg, tgname);
1457
1458 assert(work_interval->wi_group == NULL);
1459 work_interval->wi_group = tg;
1460 KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_WORKGROUP, WORKGROUP_INTERVAL_CREATE),
1461 work_interval->wi_id, work_interval->wi_create_flags,
1462 work_interval->wi_creator_pid, thread_group_get_id(tg));
1463 }
1464 #endif /* CONFIG_THREAD_GROUPS */
1465 } else {
1466 /* Workload ID has previously been set (or a thread has already joined). */
1467 if (wlid_flags & WORK_INTERVAL_WORKLOAD_ID_ALREADY_JOINED) {
1468 kr = KERN_INVALID_ARGUMENT;
1469 goto out;
1470 }
1471 /* Treat this request as a query for the out parameters of the ID */
1472 workload_id_args->wlida_flags = wlid_flags;
1473 }
1474
1475 /*
1476 * Emit tracepoints for successfully setting the workload ID.
1477 *
1478 * After rdar://89342390 has been fixed and a new work interval ktrace
1479 * provider has been added, it will be possible to associate a numeric
1480 * ID with an ID name. Thus, for those cases where the ID name has been
1481 * looked up successfully (`from_workload_config` is true) it will no
1482 * longer be necessary to emit a tracepoint with the full ID name.
1483 */
1484 KDBG(MACHDBG_CODE(DBG_MACH_WORKGROUP, WORKGROUP_INTERVAL_SET_WORKLOAD_ID),
1485 work_interval->wi_id, from_workload_config);
1486 kernel_debug_string_simple(
1487 MACHDBG_CODE(DBG_MACH_WORKGROUP, WORKGROUP_INTERVAL_SET_WORKLOAD_ID_NAME),
1488 workload_id_args->wlida_name);
1489
1490 kr = KERN_SUCCESS;
1491
1492 out:
1493 work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
1494
1495 return kr;
1496 }
1497
1498
1499 kern_return_t
kern_work_interval_destroy(thread_t thread, uint64_t work_interval_id)
1501 {
1502 if (work_interval_id == 0) {
1503 return KERN_INVALID_ARGUMENT;
1504 }
1505
1506 if (thread->th_work_interval == NULL ||
1507 thread->th_work_interval->wi_id != work_interval_id) {
1508 /* work ID isn't valid or doesn't match joined work interval ID */
1509 return KERN_INVALID_ARGUMENT;
1510 }
1511
1512 return thread_set_work_interval_explicit_join(thread, NULL);
1513 }
1514
1515 kern_return_t
kern_work_interval_join(thread_t thread,
1517 mach_port_name_t port_name)
1518 {
1519 struct work_interval *work_interval = NULL;
1520 kern_return_t kr;
1521
1522 if (port_name == MACH_PORT_NULL) {
1523 /* 'Un-join' the current work interval */
1524 return thread_set_work_interval_explicit_join(thread, NULL);
1525 }
1526
1527 kr = port_name_to_work_interval(port_name, &work_interval);
1528 if (kr != KERN_SUCCESS) {
1529 return kr;
1530 }
1531 /* work_interval has a +1 ref */
1532
1533 assert(work_interval != NULL);
1534
1535 kr = thread_set_work_interval_explicit_join(thread, work_interval);
1536 /* ref was consumed by passing it to the thread in the successful case */
1537 if (kr != KERN_SUCCESS) {
1538 work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
1539 }
1540 return kr;
1541 }
1542
1543 kern_return_t
kern_work_interval_explicit_join(thread_t thread,
1545 struct work_interval *work_interval)
1546 {
1547 kern_return_t kr;
1548 assert(thread == current_thread());
1549 assert(work_interval != NULL);
1550
1551 /*
1552 * We take +1 ref on the work interval which is consumed by passing it
1553 * on to the thread below in the successful case.
1554 */
1555 work_interval_retain(work_interval);
1556
1557 kr = thread_set_work_interval_explicit_join(thread, work_interval);
1558 if (kr != KERN_SUCCESS) {
1559 work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
1560 }
1561 return kr;
1562 }
1563
1564 /*
1565 * work_interval_port_type_render_server()
1566 *
1567 * Helper routine to determine if the port points to a
1568 * WORK_INTERVAL_TYPE_CA_RENDER_SERVER work interval.
1569 */
1570 bool
work_interval_port_type_render_server(mach_port_name_t port_name)
1572 {
1573 return work_interval_port_type(port_name) == WORK_INTERVAL_TYPE_CA_RENDER_SERVER;
1574 }
1575