/*
 * Copyright (c) 2017 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */


#include <sys/work_interval.h>

#include <kern/work_interval.h>

#include <kern/thread.h>
#include <kern/sched_prim.h>
#include <kern/machine.h>
#include <kern/thread_group.h>
#include <kern/ipc_kobject.h>
#include <kern/task.h>
#include <kern/coalition.h>
#include <kern/policy_internal.h>
#include <kern/mpsc_queue.h>

#include <mach/kern_return.h>
#include <mach/notify.h>
#include <os/refcnt.h>

#include <stdatomic.h>

/*
 * With the introduction of auto-join work intervals, it is possible
 * to change the work interval (and related thread group) of a thread in a
 * variety of contexts (thread termination, context switch, thread mode
 * change etc.). In order to clearly specify the policy expectation and
 * the locking behavior, all calls to thread_set_work_interval() pass
 * in a set of flags.
 */

__options_decl(thread_work_interval_options_t, uint32_t, {
    /* Change the work interval using the explicit join rules */
    THREAD_WI_EXPLICIT_JOIN_POLICY = 0x1,
    /* Change the work interval using the auto-join rules */
    THREAD_WI_AUTO_JOIN_POLICY = 0x2,
    /* Caller already holds the thread lock */
    THREAD_WI_THREAD_LOCK_HELD = 0x4,
    /* Caller does not hold the thread lock */
    THREAD_WI_THREAD_LOCK_NEEDED = 0x8,
    /* Change the work interval from the context switch path (thread may not be running or on a runq) */
    THREAD_WI_THREAD_CTX_SWITCH = 0x10,
});
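
/*
 * Flag combinations used by the call sites in this file, for reference:
 *
 *   Explicit join/unjoin from the current thread (thread lock not held):
 *       THREAD_WI_EXPLICIT_JOIN_POLICY | THREAD_WI_THREAD_LOCK_NEEDED
 *
 *   Auto-join propagation/unwind from the context-switch path
 *   (thread lock already held):
 *       THREAD_WI_AUTO_JOIN_POLICY | THREAD_WI_THREAD_LOCK_HELD |
 *           THREAD_WI_THREAD_CTX_SWITCH
 *
 *   Auto-join demotion (thread may be running remotely, lock held):
 *       THREAD_WI_AUTO_JOIN_POLICY | THREAD_WI_THREAD_LOCK_HELD
 */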

static kern_return_t thread_set_work_interval(thread_t, struct work_interval *, thread_work_interval_options_t);
static void work_interval_port_no_senders(ipc_port_t, mach_port_mscount_t);

IPC_KOBJECT_DEFINE(IKOT_WORK_INTERVAL,
    .iko_op_stable     = true,
    .iko_op_no_senders = work_interval_port_no_senders);

#if CONFIG_SCHED_AUTO_JOIN
/* MPSC queue used to defer deallocation of work intervals */
static struct mpsc_daemon_queue work_interval_deallocate_queue;

static void work_interval_deferred_release(struct work_interval *);

/*
 * Work Interval Auto-Join Status
 *
 * work_interval_auto_join_status_t represents the state of auto-join for a given work interval.
 * It packs the following information:
 * - A bit indicating whether a "finish" is deferred on the work interval
 * - A count of the threads auto-joined to the work interval
 */
#define WORK_INTERVAL_STATUS_DEFERRED_FINISH_MASK    ((uint32_t)(1 << 31))
#define WORK_INTERVAL_STATUS_AUTO_JOIN_COUNT_MASK    ((uint32_t)(WORK_INTERVAL_STATUS_DEFERRED_FINISH_MASK - 1))
#define WORK_INTERVAL_STATUS_AUTO_JOIN_COUNT_MAX     WORK_INTERVAL_STATUS_AUTO_JOIN_COUNT_MASK
typedef uint32_t work_interval_auto_join_status_t;
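
/*
 * Example encodings of the packing above (a sketch, for illustration only):
 *   0x00000003 - three auto-joined threads, no finish deferred
 *   0x80000001 - one auto-joined thread, a "finish" is deferred
 *   0x80000000 - no auto-joined threads remain but the deferred-finish bit
 *                is still set; this is the state the decrement path looks
 *                for to perform the deferred finish.
 */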

static inline bool __unused
work_interval_status_deferred_finish(work_interval_auto_join_status_t status)
{
    return (status & WORK_INTERVAL_STATUS_DEFERRED_FINISH_MASK) ? true : false;
}

static inline uint32_t __unused
work_interval_status_auto_join_count(work_interval_auto_join_status_t status)
{
    return (uint32_t)(status & WORK_INTERVAL_STATUS_AUTO_JOIN_COUNT_MASK);
}

/*
 * struct work_interval_deferred_finish_state
 *
 * Contains the parameters of the finish operation which is being deferred.
 */
struct work_interval_deferred_finish_state {
    uint64_t instance_id;
    uint64_t start;
    uint64_t deadline;
    uint64_t complexity;
};

struct work_interval_auto_join_info {
    struct work_interval_deferred_finish_state deferred_finish_state;
    work_interval_auto_join_status_t _Atomic status;
};
#endif /* CONFIG_SCHED_AUTO_JOIN */

/*
 * Work Interval structs
 *
 * This struct represents a thread group and/or work interval context;
 * the mechanism is exposed to userspace as a kobject.
 *
 * Every thread that has joined a WI has a +1 ref, and the port
 * has a +1 ref as well.
 *
 * TODO: groups need to have a 'is for WI' flag
 *       and they need a flag to create that says 'for WI'
 *       This would allow CLPC to avoid allocating WI support
 *       data unless it is needed
 *
 * TODO: Enforce not having more than one non-group joinable work
 *       interval per thread group.
 *       CLPC only wants to see one WI-notify callout per group.
 */

struct work_interval {
    uint64_t wi_id;
    struct os_refcnt wi_ref_count;
    uint32_t wi_create_flags;

    /* for debugging purposes only, does not hold a ref on port */
    ipc_port_t wi_port;

    /*
     * holds uniqueid and version of creating process,
     * used to permission-gate notify
     * TODO: you'd think there would be a better way to do this
     */
    uint64_t wi_creator_uniqueid;
    uint32_t wi_creator_pid;
    int wi_creator_pidversion;

#if CONFIG_THREAD_GROUPS
    struct thread_group *wi_group; /* holds +1 ref on group */
#endif /* CONFIG_THREAD_GROUPS */

#if CONFIG_SCHED_AUTO_JOIN
    /* Information related to auto-join and deferred finish for the work interval */
    struct work_interval_auto_join_info wi_auto_join_info;

    /*
     * Since the deallocation of auto-join work intervals
     * can happen in the scheduler when the last thread in
     * the WI blocks and the thread lock is held, the deallocation
     * might have to be done on a separate thread.
     */
    struct mpsc_queue_chain wi_deallocate_link;
#endif /* CONFIG_SCHED_AUTO_JOIN */
};

#if CONFIG_SCHED_AUTO_JOIN

/*
 * work_interval_perform_deferred_finish()
 *
 * Perform a deferred finish for a work interval. The routine accepts the deferred_finish_state as an
 * argument rather than looking at the work_interval since the deferred finish can race with another
 * start-finish cycle. To address that, the caller ensures that it gets a consistent snapshot of the
 * deferred state before calling this routine. This allows the racing start-finish cycle to overwrite
 * the deferred state without issues.
 */
static inline void
work_interval_perform_deferred_finish(__unused struct work_interval_deferred_finish_state *deferred_finish_state,
    __unused struct work_interval *work_interval, __unused thread_t thread)
{
    KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_WI_DEFERRED_FINISH),
        thread_tid(thread), thread_group_get_id(work_interval->wi_group));
}

/*
 * work_interval_auto_join_increment()
 *
 * Routine to increment the auto-join counter when a new thread is auto-joined to
 * the work interval.
 */
static void
work_interval_auto_join_increment(struct work_interval *work_interval)
{
    struct work_interval_auto_join_info *join_info = &work_interval->wi_auto_join_info;
    __assert_only work_interval_auto_join_status_t old_status = os_atomic_add_orig(&join_info->status, 1, relaxed);
    assert(work_interval_status_auto_join_count(old_status) < WORK_INTERVAL_STATUS_AUTO_JOIN_COUNT_MAX);
}

/*
 * work_interval_auto_join_decrement()
 *
 * Routine to decrement the auto-join counter when a thread unjoins the work interval (due to
 * blocking or termination). If this was the last auto-joined thread in the work interval and
 * there was a deferred finish, performs the finish operation for the work interval.
 */
static void
work_interval_auto_join_decrement(struct work_interval *work_interval, thread_t thread)
{
    struct work_interval_auto_join_info *join_info = &work_interval->wi_auto_join_info;
    work_interval_auto_join_status_t old_status, new_status;
    struct work_interval_deferred_finish_state deferred_finish_state;
    bool perform_finish;

    /* Update the auto-join count for the work interval atomically */
    os_atomic_rmw_loop(&join_info->status, old_status, new_status, acquire, {
        perform_finish = false;
        new_status = old_status;
        assert(work_interval_status_auto_join_count(old_status) > 0);
        new_status -= 1;
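        /*
         * After the decrement, a value equal to the deferred-finish mask
         * means the auto-join count just dropped to zero while the
         * deferred-finish bit is still set (e.g. 0x80000001 -> 0x80000000),
         * so this thread is responsible for performing the finish.
         */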
        if (new_status == WORK_INTERVAL_STATUS_DEFERRED_FINISH_MASK) {
            /* No auto-joined threads remaining and finish is deferred */
            new_status = 0;
            perform_finish = true;
            /*
             * It's important to copy the deferred finish state here so that this works
             * when racing with another start-finish cycle.
             */
            deferred_finish_state = join_info->deferred_finish_state;
        }
    });

    if (perform_finish == true) {
        /*
         * Since work_interval_perform_deferred_finish() calls down to
         * the machine layer callout for finish which gets the thread
         * group from the thread passed in here, it is important to
         * make sure that the thread still has the work interval thread
         * group here.
         */
        assert(thread->thread_group == work_interval->wi_group);
        work_interval_perform_deferred_finish(&deferred_finish_state, work_interval, thread);
    }
}

/*
 * work_interval_auto_join_enabled()
 *
 * Helper routine to check if the work interval has auto-join enabled.
 */
static inline bool
work_interval_auto_join_enabled(struct work_interval *work_interval)
{
    return (work_interval->wi_create_flags & WORK_INTERVAL_FLAG_ENABLE_AUTO_JOIN) != 0;
}

/*
 * work_interval_deferred_finish_enabled()
 *
 * Helper routine to check if the work interval has deferred finish enabled.
 */
static inline bool __unused
work_interval_deferred_finish_enabled(struct work_interval *work_interval)
{
    return (work_interval->wi_create_flags & WORK_INTERVAL_FLAG_ENABLE_DEFERRED_FINISH) != 0;
}

#endif /* CONFIG_SCHED_AUTO_JOIN */

static inline void
work_interval_retain(struct work_interval *work_interval)
{
    /*
     * Even though wi_retain is called under a port lock, we have
     * to use os_ref_retain instead of os_ref_retain_locked
     * because wi_release is not synchronized. wi_release calls
     * os_ref_release which is unsafe to pair with os_ref_retain_locked.
     */
    os_ref_retain(&work_interval->wi_ref_count);
}

static inline void
work_interval_deallocate(struct work_interval *work_interval)
{
    KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_WORKGROUP, WORKGROUP_INTERVAL_DESTROY),
        work_interval->wi_id);
#if CONFIG_THREAD_GROUPS
    thread_group_release(work_interval->wi_group);
    work_interval->wi_group = NULL;
#endif /* CONFIG_THREAD_GROUPS */
    kfree_type(struct work_interval, work_interval);
}

/*
 * work_interval_release()
 *
 * Routine to release a ref count on the work interval. If the refcount drops
 * to zero, the work interval needs to be deallocated.
 *
 * Non auto-join work intervals are deallocated in this context.
 *
 * For auto-join work intervals, the deallocation cannot be done from this
 * context since that might need the kernel memory allocator lock. In that
 * case, the deallocation is done via a thread-call based mpsc queue.
 */
static void
work_interval_release(struct work_interval *work_interval, __unused thread_work_interval_options_t options)
{
    if (os_ref_release(&work_interval->wi_ref_count) == 0) {
#if CONFIG_SCHED_AUTO_JOIN
        if (options & THREAD_WI_THREAD_LOCK_HELD) {
            work_interval_deferred_release(work_interval);
        } else {
            work_interval_deallocate(work_interval);
        }
#else /* CONFIG_SCHED_AUTO_JOIN */
        work_interval_deallocate(work_interval);
#endif /* CONFIG_SCHED_AUTO_JOIN */
    }
}

#if CONFIG_SCHED_AUTO_JOIN

/*
 * work_interval_deferred_release()
 *
 * Routine to enqueue the work interval on the deallocation mpsc queue.
 */
static void
work_interval_deferred_release(struct work_interval *work_interval)
{
    mpsc_daemon_enqueue(&work_interval_deallocate_queue,
        &work_interval->wi_deallocate_link, MPSC_QUEUE_NONE);
}

/*
 * work_interval_should_propagate()
 *
 * Main policy routine to decide if a thread should be auto-joined to
 * another thread's work interval. The conditions are arranged such that
 * the most common bailout conditions are checked earliest. This routine
 * is called from the scheduler context, so it needs to be efficient and
 * careful when taking locks or performing wakeups.
 */
inline bool
work_interval_should_propagate(thread_t cthread, thread_t thread)
{
    /* Only allow propagation if the current thread has a work interval and the woken up thread does not */
    if ((cthread->th_work_interval == NULL) || (thread->th_work_interval != NULL)) {
        return false;
    }

    /* Only propagate work intervals which have auto-join enabled */
    if (work_interval_auto_join_enabled(cthread->th_work_interval) == false) {
        return false;
    }

    /* Work interval propagation is enabled for realtime threads only */
    if ((cthread->sched_mode != TH_MODE_REALTIME) || (thread->sched_mode != TH_MODE_REALTIME)) {
        return false;
    }

    /* Work interval propagation only works for threads with the same home thread group */
    struct thread_group *thread_home_tg = thread_group_get_home_group(thread);
    if (thread_group_get_home_group(cthread) != thread_home_tg) {
        return false;
    }

    /* If the woken up thread has adopted vouchers and other thread groups, it does not get propagation */
    if (thread->thread_group != thread_home_tg) {
        return false;
    }

    /* If either thread is inactive (in the termination path), do not propagate auto-join */
    if ((!cthread->active) || (!thread->active)) {
        return false;
    }

    return true;
}

/*
 * work_interval_auto_join_propagate()
 *
 * Routine to auto-join a thread into another thread's work interval.
 *
 * Should only be invoked if work_interval_should_propagate() returns
 * true. Also expects the "from" thread to be the current thread and the
 * "to" thread to be locked.
 */
void
work_interval_auto_join_propagate(thread_t from, thread_t to)
{
    assert(from == current_thread());
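    /*
     * Take a +1 ref and an auto-join count on behalf of the "to" thread;
     * the reference is consumed by thread_set_work_interval() on success,
     * and the count is dropped when the thread later unjoins.
     */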
    work_interval_retain(from->th_work_interval);
    work_interval_auto_join_increment(from->th_work_interval);
    __assert_only kern_return_t kr = thread_set_work_interval(to, from->th_work_interval,
        THREAD_WI_AUTO_JOIN_POLICY | THREAD_WI_THREAD_LOCK_HELD | THREAD_WI_THREAD_CTX_SWITCH);
    assert(kr == KERN_SUCCESS);
}

/*
 * work_interval_auto_join_unwind()
 *
 * Routine to un-join an auto-joined work interval for a thread that is blocking.
 *
 * Expects thread to be locked.
 */
void
work_interval_auto_join_unwind(thread_t thread)
{
    __assert_only kern_return_t kr = thread_set_work_interval(thread, NULL,
        THREAD_WI_AUTO_JOIN_POLICY | THREAD_WI_THREAD_LOCK_HELD | THREAD_WI_THREAD_CTX_SWITCH);
    assert(kr == KERN_SUCCESS);
}

/*
 * work_interval_auto_join_demote()
 *
 * Routine to un-join an auto-joined work interval when a thread is changing from
 * realtime to non-realtime scheduling mode. This could happen due to multiple
 * reasons such as RT failsafe, thread backgrounding or thread termination. Also,
 * the thread being demoted may not be the current thread.
 *
 * Expects thread to be locked.
 */
void
work_interval_auto_join_demote(thread_t thread)
{
    __assert_only kern_return_t kr = thread_set_work_interval(thread, NULL,
        THREAD_WI_AUTO_JOIN_POLICY | THREAD_WI_THREAD_LOCK_HELD);
    assert(kr == KERN_SUCCESS);
}
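
/*
 * Note that, unlike the unwind path, demote does not pass
 * THREAD_WI_THREAD_CTX_SWITCH: the demoted thread may still be running on a
 * remote core, a case thread_set_work_interval() handles by setting the
 * auto-join "leak" flag instead of switching the thread group remotely.
 */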

static void
work_interval_deallocate_queue_invoke(mpsc_queue_chain_t e,
    __assert_only mpsc_daemon_queue_t dq)
{
    struct work_interval *work_interval = NULL;
    work_interval = mpsc_queue_element(e, struct work_interval, wi_deallocate_link);
    assert(dq == &work_interval_deallocate_queue);
    assert(os_ref_get_count(&work_interval->wi_ref_count) == 0);
    work_interval_deallocate(work_interval);
}

#endif /* CONFIG_SCHED_AUTO_JOIN */

#if CONFIG_SCHED_AUTO_JOIN
__startup_func
static void
work_interval_subsystem_init(void)
{
    /*
     * The work interval deallocation queue must be a thread-call based queue
     * because it is woken up from contexts where the thread lock is held. The
     * only way to perform wakeups safely in those contexts is to wake up a
     * thread call, which is guaranteed to be on a different waitq and would
     * not hash onto the same global waitq which might be currently locked.
     */
    mpsc_daemon_queue_init_with_thread_call(&work_interval_deallocate_queue,
        work_interval_deallocate_queue_invoke, THREAD_CALL_PRIORITY_KERNEL);
}
STARTUP(THREAD_CALL, STARTUP_RANK_MIDDLE, work_interval_subsystem_init);
#endif /* CONFIG_SCHED_AUTO_JOIN */

/*
 * work_interval_port_convert
 *
 * Called with port locked; returns a reference to the work interval
 * if the port is indeed a work interval kobject port.
 */
static struct work_interval *
work_interval_port_convert_locked(ipc_port_t port)
{
    struct work_interval *work_interval = NULL;

    if (IP_VALID(port)) {
        work_interval = ipc_kobject_get_stable(port, IKOT_WORK_INTERVAL);
        if (work_interval) {
            work_interval_retain(work_interval);
        }
    }

    return work_interval;
}

/*
 * port_name_to_work_interval
 *
 * Description: Obtain a +1 reference to the work_interval associated with a given port.
 *
 * Parameters:  name            A Mach port name to translate.
 *              work_interval   Out-parameter filled on success.
 *
 * Returns:     KERN_SUCCESS and a referenced work_interval in *work_interval,
 *              or an error if the name or port does not denote a work_interval.
 */
static kern_return_t
port_name_to_work_interval(mach_port_name_t name,
    struct work_interval **work_interval)
{
    if (!MACH_PORT_VALID(name)) {
        return KERN_INVALID_NAME;
    }

    ipc_port_t port = IPC_PORT_NULL;
    kern_return_t kr = KERN_SUCCESS;

    kr = ipc_port_translate_send(current_space(), name, &port);
    if (kr != KERN_SUCCESS) {
        return kr;
    }
    /* port is locked */

    assert(IP_VALID(port));

    struct work_interval *converted_work_interval;

    converted_work_interval = work_interval_port_convert_locked(port);

    /* the port is valid, but doesn't denote a work_interval */
    if (converted_work_interval == NULL) {
        kr = KERN_INVALID_CAPABILITY;
    }

    ip_mq_unlock(port);

    if (kr == KERN_SUCCESS) {
        *work_interval = converted_work_interval;
    }

    return kr;
}


/*
 * work_interval_port_no_senders
 *
 * Description: Handle a no-senders notification for a work interval port.
 *              Destroys the port and releases its reference on the work interval.
 *
 * Parameters:  msg     A Mach no-senders notification message.
 *
 * Note: This assumes that there is only one create-right-from-work-interval
 *       point; if the ability to extract another send right after creation
 *       is added, this will have to change to handle make-send counts
 *       correctly.
 */
static void
work_interval_port_no_senders(ipc_port_t port, mach_port_mscount_t mscount)
{
    struct work_interval *work_interval = NULL;

    work_interval = ipc_kobject_dealloc_port(port, mscount,
        IKOT_WORK_INTERVAL);

    work_interval->wi_port = MACH_PORT_NULL;

    work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
}

/*
 * work_interval_port_type()
 *
 * Converts a port name into the work interval object and returns its type.
 *
 * For invalid ports, it returns WORK_INTERVAL_TYPE_LAST (which is not a
 * valid type for work intervals).
 */
static uint32_t
work_interval_port_type(mach_port_name_t port_name)
{
    struct work_interval *work_interval = NULL;
    kern_return_t kr;
    uint32_t work_interval_type;

    if (port_name == MACH_PORT_NULL) {
        return WORK_INTERVAL_TYPE_LAST;
    }

    kr = port_name_to_work_interval(port_name, &work_interval);
    if (kr != KERN_SUCCESS) {
        return WORK_INTERVAL_TYPE_LAST;
    }
    /* work_interval has a +1 ref */

    assert(work_interval != NULL);
    work_interval_type = work_interval->wi_create_flags & WORK_INTERVAL_TYPE_MASK;
    work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
    return work_interval_type;
}


/*
 * thread_set_work_interval()
 *
 * Change the thread's bound work interval to the passed-in work interval.
 * Consumes the +1 ref on work_interval upon success.
 *
 * May also pass NULL to un-set the work_interval on the thread.
 * Will deallocate any old work interval on the thread.
 * Returns an error if the thread does not satisfy the requirements to join
 * the work interval.
 *
 * For non auto-join work intervals, deallocates any old work interval on the
 * thread directly.
 * For auto-join work intervals, the routine may wake up the work interval
 * deferred deallocation queue, since thread locks might be currently held.
 */
static kern_return_t
thread_set_work_interval(thread_t thread,
    struct work_interval *work_interval, thread_work_interval_options_t options)
{
    /* All explicit work interval operations should always be from the current thread */
    if (options & THREAD_WI_EXPLICIT_JOIN_POLICY) {
        assert(thread == current_thread());
    }

    /* All cases of needing the thread lock should be from explicit join scenarios */
    if (options & THREAD_WI_THREAD_LOCK_NEEDED) {
        assert((options & THREAD_WI_EXPLICIT_JOIN_POLICY) != 0);
    }

    /* All auto-join cases must come in with the thread lock held */
    if (options & THREAD_WI_AUTO_JOIN_POLICY) {
        assert((options & THREAD_WI_THREAD_LOCK_HELD) != 0);
    }

    if (work_interval) {
        uint32_t work_interval_type = work_interval->wi_create_flags & WORK_INTERVAL_TYPE_MASK;

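        /*
         * CoreAudio work intervals may only be joined by realtime
         * threads. Checking saved_mode as well covers threads that are
         * temporarily demoted out of realtime (e.g. by the RT failsafe)
         * but will return to it.
         */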
        if ((work_interval_type == WORK_INTERVAL_TYPE_COREAUDIO) &&
            (thread->sched_mode != TH_MODE_REALTIME) && (thread->saved_mode != TH_MODE_REALTIME)) {
            return KERN_INVALID_ARGUMENT;
        }
    }

    struct work_interval *old_th_wi = thread->th_work_interval;
#if CONFIG_SCHED_AUTO_JOIN
    bool old_wi_auto_joined = ((thread->sched_flags & TH_SFLAG_THREAD_GROUP_AUTO_JOIN) != 0);

    spl_t s;
    /* Take the thread lock if needed */
    if (options & THREAD_WI_THREAD_LOCK_NEEDED) {
        s = splsched();
        thread_lock(thread);
    }

    /*
     * Work interval auto-join leak to non-RT threads.
     *
     * If the thread might be running on a remote core and it is not in the
     * context-switch path (where the thread is neither running, blocked, nor
     * on a runq), it is not possible to update the work interval & thread
     * group remotely, since CLPC cannot be updated for a remote core. This
     * situation can happen when a thread is transitioning from realtime to
     * non-realtime due to backgrounding etc., which would mean that non-RT
     * threads would now be part of the work interval.
     *
     * Since there is no immediate mitigation for this issue, the policy is to
     * set a flag on the thread which indicates that such a "leak" has
     * happened. This flag will be cleared when the remote thread eventually
     * blocks and unjoins from the work interval.
     */
    bool thread_on_remote_core = ((thread != current_thread()) && (thread->state & TH_RUN) && (thread->runq == PROCESSOR_NULL));

    if (thread_on_remote_core && ((options & THREAD_WI_THREAD_CTX_SWITCH) == 0)) {
        assert((options & THREAD_WI_THREAD_LOCK_NEEDED) == 0);
        os_atomic_or(&thread->th_work_interval_flags, TH_WORK_INTERVAL_FLAGS_AUTO_JOIN_LEAK, relaxed);
        return KERN_SUCCESS;
    }

    old_wi_auto_joined = ((thread->sched_flags & TH_SFLAG_THREAD_GROUP_AUTO_JOIN) != 0);

    if ((options & THREAD_WI_AUTO_JOIN_POLICY) || old_wi_auto_joined) {
        __kdebug_only uint64_t old_tg_id = (old_th_wi) ? thread_group_get_id(old_th_wi->wi_group) : ~0;
        __kdebug_only uint64_t new_tg_id = (work_interval) ? thread_group_get_id(work_interval->wi_group) : ~0;
        KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_WI_AUTO_JOIN),
            thread_tid(thread), old_tg_id, new_tg_id, options);
    }

    if (old_wi_auto_joined) {
        /*
         * If the thread was auto-joined to a work interval and is not
         * realtime, make sure it happened due to the "leak" described above.
         */
        if (thread->sched_mode != TH_MODE_REALTIME) {
            assert((thread->th_work_interval_flags & TH_WORK_INTERVAL_FLAGS_AUTO_JOIN_LEAK) != 0);
        }

        os_atomic_andnot(&thread->th_work_interval_flags, TH_WORK_INTERVAL_FLAGS_AUTO_JOIN_LEAK, relaxed);
        work_interval_auto_join_decrement(old_th_wi, thread);
        thread->sched_flags &= ~TH_SFLAG_THREAD_GROUP_AUTO_JOIN;
    }

#endif /* CONFIG_SCHED_AUTO_JOIN */

    KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_WORKGROUP, WORKGROUP_INTERVAL_CHANGE),
        thread_tid(thread), (old_th_wi ? old_th_wi->wi_id : 0),
        (work_interval ? work_interval->wi_id : 0),
        !!(options & THREAD_WI_AUTO_JOIN_POLICY));

    /* transfer +1 ref to thread */
    thread->th_work_interval = work_interval;

#if CONFIG_SCHED_AUTO_JOIN

    if ((options & THREAD_WI_AUTO_JOIN_POLICY) && work_interval) {
        assert(work_interval_auto_join_enabled(work_interval) == true);
        thread->sched_flags |= TH_SFLAG_THREAD_GROUP_AUTO_JOIN;
    }

    if (options & THREAD_WI_THREAD_LOCK_NEEDED) {
        thread_unlock(thread);
        splx(s);
    }
#endif /* CONFIG_SCHED_AUTO_JOIN */

#if CONFIG_THREAD_GROUPS
    struct thread_group *new_tg = (work_interval) ? (work_interval->wi_group) : NULL;

    if (options & THREAD_WI_AUTO_JOIN_POLICY) {
#if CONFIG_SCHED_AUTO_JOIN
        thread_set_autojoin_thread_group_locked(thread, new_tg);
#endif
    } else {
        thread_set_work_interval_thread_group(thread, new_tg);
    }
#endif /* CONFIG_THREAD_GROUPS */

    if (old_th_wi != NULL) {
        work_interval_release(old_th_wi, options);
    }

    return KERN_SUCCESS;
}

static kern_return_t
thread_set_work_interval_explicit_join(thread_t thread, struct work_interval *work_interval)
{
    assert(thread == current_thread());
    return thread_set_work_interval(thread, work_interval,
               THREAD_WI_EXPLICIT_JOIN_POLICY | THREAD_WI_THREAD_LOCK_NEEDED);
}

kern_return_t
work_interval_thread_terminate(thread_t thread)
{
    assert(thread == current_thread());
    if (thread->th_work_interval != NULL) {
        return thread_set_work_interval(thread, NULL,
                   THREAD_WI_EXPLICIT_JOIN_POLICY | THREAD_WI_THREAD_LOCK_NEEDED);
    }
    return KERN_SUCCESS;
}

kern_return_t
kern_work_interval_notify(thread_t thread, struct kern_work_interval_args *kwi_args)
{
    assert(thread == current_thread());
    assert(kwi_args->work_interval_id != 0);

    struct work_interval *work_interval = thread->th_work_interval;

    if (work_interval == NULL ||
        work_interval->wi_id != kwi_args->work_interval_id) {
        /* This thread must have adopted the work interval to be able to notify */
        return KERN_INVALID_ARGUMENT;
    }

    task_t notifying_task = current_task();

    if (work_interval->wi_creator_uniqueid != get_task_uniqueid(notifying_task) ||
        work_interval->wi_creator_pidversion != get_task_version(notifying_task)) {
        /* Only the creating task can do a notify */
        return KERN_INVALID_ARGUMENT;
    }

    spl_t s = splsched();

#if CONFIG_THREAD_GROUPS
    assert(work_interval->wi_group == thread->thread_group);
#endif /* CONFIG_THREAD_GROUPS */

    uint64_t urgency_param1, urgency_param2;
    kwi_args->urgency = (uint16_t)thread_get_urgency(thread, &urgency_param1, &urgency_param2);

    splx(s);

    /* called without interrupts disabled */
    machine_work_interval_notify(thread, kwi_args);

    return KERN_SUCCESS;
}

/* Start at 1, 0 is not a valid work interval ID */
static _Atomic uint64_t unique_work_interval_id = 1;

kern_return_t
kern_work_interval_create(thread_t thread,
    struct kern_work_interval_create_args *create_params)
{
    assert(thread == current_thread());

    uint32_t create_flags = create_params->wica_create_flags;

    if (((create_flags & WORK_INTERVAL_FLAG_JOINABLE) == 0) &&
        thread->th_work_interval != NULL) {
        /*
         * If the thread is doing a legacy combined create and join,
         * it shouldn't already be part of a work interval.
         *
         * (Creating a joinable WI is allowed anytime.)
         */
        return KERN_FAILURE;
    }

    /*
     * Check the validity of the create flags before allocating the work
     * interval.
     */
    task_t creating_task = current_task();
    if ((create_flags & WORK_INTERVAL_TYPE_MASK) == WORK_INTERVAL_TYPE_CA_CLIENT) {
        /*
         * CA_CLIENT work intervals do not create new thread groups.
         * There can only be one CA_CLIENT work interval (created by UIKit
         * or AppKit) per application task.
         */
        if (create_flags & WORK_INTERVAL_FLAG_GROUP) {
            return KERN_FAILURE;
        }
        if (!task_is_app(creating_task)) {
#if XNU_TARGET_OS_OSX
            /*
             * Soft-fail the case of a non-app pretending to be an
             * app, by allowing it to press the buttons, but they're
             * not actually connected to anything.
             */
            create_flags |= WORK_INTERVAL_FLAG_IGNORED;
#else
            /*
             * On iOS, it's a hard failure to get your apptype
             * wrong and then try to render something.
             */
            return KERN_NOT_SUPPORTED;
#endif /* XNU_TARGET_OS_OSX */
        }
        if (task_set_ca_client_wi(creating_task, true) == false) {
            return KERN_FAILURE;
        }
    }

#if CONFIG_SCHED_AUTO_JOIN
    if (create_flags & WORK_INTERVAL_FLAG_ENABLE_AUTO_JOIN) {
        uint32_t type = (create_flags & WORK_INTERVAL_TYPE_MASK);
        if (type != WORK_INTERVAL_TYPE_COREAUDIO) {
            return KERN_NOT_SUPPORTED;
        }
        if ((create_flags & WORK_INTERVAL_FLAG_GROUP) == 0) {
            return KERN_NOT_SUPPORTED;
        }
    }

    if (create_flags & WORK_INTERVAL_FLAG_ENABLE_DEFERRED_FINISH) {
        if ((create_flags & WORK_INTERVAL_FLAG_ENABLE_AUTO_JOIN) == 0) {
            return KERN_NOT_SUPPORTED;
        }
    }
#endif /* CONFIG_SCHED_AUTO_JOIN */

    struct work_interval *work_interval = kalloc_type(struct work_interval,
        Z_WAITOK | Z_ZERO | Z_NOFAIL);

    uint64_t work_interval_id = os_atomic_inc(&unique_work_interval_id, relaxed);

    *work_interval = (struct work_interval) {
        .wi_id = work_interval_id,
        .wi_ref_count = {},
        .wi_create_flags = create_flags,
        .wi_creator_pid = pid_from_task(creating_task),
        .wi_creator_uniqueid = get_task_uniqueid(creating_task),
        .wi_creator_pidversion = get_task_version(creating_task),
    };
    os_ref_init(&work_interval->wi_ref_count, NULL);

    __kdebug_only uint64_t tg_id = 0;
#if CONFIG_THREAD_GROUPS
    struct thread_group *tg;
    if (create_flags & WORK_INTERVAL_FLAG_GROUP) {
        /* create a new group for the interval to represent */
        char name[THREAD_GROUP_MAXNAME] = "";

        snprintf(name, sizeof(name), "WI[%d] #%lld",
            work_interval->wi_creator_pid, work_interval_id);

        tg = thread_group_create_and_retain(FALSE);

        thread_group_set_name(tg, name);

        work_interval->wi_group = tg;
    } else {
        /* the interval represents the thread's home group */
        tg = thread_group_get_home_group(thread);

        thread_group_retain(tg);

        work_interval->wi_group = tg;
    }

    /* Capture the tg_id for tracing purposes */
    tg_id = thread_group_get_id(work_interval->wi_group);

#endif /* CONFIG_THREAD_GROUPS */

    if (create_flags & WORK_INTERVAL_FLAG_JOINABLE) {
        mach_port_name_t name = MACH_PORT_NULL;

        /* work_interval has a +1 ref, moves to the port */
        work_interval->wi_port = ipc_kobject_alloc_port(
            (ipc_kobject_t)work_interval, IKOT_WORK_INTERVAL,
            IPC_KOBJECT_ALLOC_MAKE_SEND | IPC_KOBJECT_ALLOC_NSREQUEST);

        name = ipc_port_copyout_send(work_interval->wi_port, current_space());

        if (!MACH_PORT_VALID(name)) {
            /*
             * copyout failed (port is already deallocated)
             * Because of the port-destroyed magic,
             * the work interval is already deallocated too.
             */
            return KERN_RESOURCE_SHORTAGE;
        }

        create_params->wica_port = name;
    } else {
        /* work_interval has a +1 ref, moves to the thread */
        kern_return_t kr = thread_set_work_interval_explicit_join(thread, work_interval);
        if (kr != KERN_SUCCESS) {
            /*
             * No other thread can join this work interval since it isn't
             * JOINABLE, so release the reference on the work interval.
             */
            work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
            return kr;
        }
        create_params->wica_port = MACH_PORT_NULL;
    }

    create_params->wica_id = work_interval_id;

    KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_WORKGROUP, WORKGROUP_INTERVAL_CREATE),
        work_interval_id, create_flags, pid_from_task(creating_task), tg_id);
    return KERN_SUCCESS;
}
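
/*
 * Typical lifecycle, for reference (a sketch based on the routines in this
 * file, not an additional code path):
 *
 *   1. A thread calls kern_work_interval_create() with
 *      WORK_INTERVAL_FLAG_JOINABLE (and optionally _FLAG_GROUP); the new
 *      work interval is published as a send right in wica_port.
 *   2. Other threads pass that port name to kern_work_interval_join(),
 *      each taking a +1 ref that is transferred to the thread.
 *   3. The creating task reports start/finish/update events through
 *      kern_work_interval_notify().
 *   4. Threads unjoin via kern_work_interval_join(MACH_PORT_NULL) or at
 *      termination via work_interval_thread_terminate(); when the last
 *      send right is deallocated, work_interval_port_no_senders() drops
 *      the port's reference and the work interval is freed.
 */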

kern_return_t
kern_work_interval_get_flags_from_port(mach_port_name_t port_name, uint32_t *flags)
{
    assert(flags != NULL);

    kern_return_t kr;
    struct work_interval *work_interval;

    kr = port_name_to_work_interval(port_name, &work_interval);
    if (kr != KERN_SUCCESS) {
        return kr;
    }

    assert(work_interval != NULL);
    *flags = work_interval->wi_create_flags;

    work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);

    return KERN_SUCCESS;
}


kern_return_t
kern_work_interval_destroy(thread_t thread, uint64_t work_interval_id)
{
    if (work_interval_id == 0) {
        return KERN_INVALID_ARGUMENT;
    }

    if (thread->th_work_interval == NULL ||
        thread->th_work_interval->wi_id != work_interval_id) {
        /* work ID isn't valid or doesn't match joined work interval ID */
        return KERN_INVALID_ARGUMENT;
    }

    return thread_set_work_interval_explicit_join(thread, NULL);
}

kern_return_t
kern_work_interval_join(thread_t thread,
    mach_port_name_t port_name)
{
    struct work_interval *work_interval = NULL;
    kern_return_t kr;

    if (port_name == MACH_PORT_NULL) {
        /* 'Un-join' the current work interval */
        return thread_set_work_interval_explicit_join(thread, NULL);
    }

    kr = port_name_to_work_interval(port_name, &work_interval);
    if (kr != KERN_SUCCESS) {
        return kr;
    }
    /* work_interval has a +1 ref */

    assert(work_interval != NULL);

    kr = thread_set_work_interval_explicit_join(thread, work_interval);
    /* ref was consumed by passing it to the thread in the successful case */
    if (kr != KERN_SUCCESS) {
        work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
    }
    return kr;
}

/*
 * work_interval_port_type_render_server()
 *
 * Helper routine to determine if the port points to a
 * WORK_INTERVAL_TYPE_CA_RENDER_SERVER work interval.
 */
bool
work_interval_port_type_render_server(mach_port_name_t port_name)
{
    return work_interval_port_type(port_name) == WORK_INTERVAL_TYPE_CA_RENDER_SERVER;
}