xref: /xnu-8020.121.3/osfmk/kern/work_interval.c (revision fdd8201d7b966f0c3ea610489d29bd841d358941)
1 /*
2  * Copyright (c) 2017 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 
30 #include <sys/work_interval.h>
31 
32 #include <kern/work_interval.h>
33 
34 #include <kern/thread.h>
35 #include <kern/sched_prim.h>
36 #include <kern/machine.h>
37 #include <kern/thread_group.h>
38 #include <kern/ipc_kobject.h>
39 #include <kern/task.h>
40 #include <kern/coalition.h>
41 #include <kern/policy_internal.h>
42 #include <kern/mpsc_queue.h>
43 
44 #include <mach/kern_return.h>
45 #include <mach/notify.h>
46 #include <os/refcnt.h>
47 
48 #include <stdatomic.h>
49 
50 /*
51  * With the introduction of auto-join work intervals, it is possible
52  * to change the work interval (and related thread group) of a thread in a
53  * variety of contexts (thread termination, context switch, thread mode
54  * change etc.). In order to clearly specify the policy expectation and
55  * the locking behavior, all calls to thread_set_work_interval() pass
56  * in a set of flags.
57  */
58 
/*
 * Option flags passed to thread_set_work_interval() describing which join
 * policy applies and what locking state the caller is in. Exactly one of
 * the EXPLICIT/AUTO policies and one of the LOCK_HELD/LOCK_NEEDED flags is
 * expected per call.
 */
__options_decl(thread_work_interval_options_t, uint32_t, {
	/* Change the work interval using the explicit join rules */
	THREAD_WI_EXPLICIT_JOIN_POLICY = 0x1,
	/* Change the work interval using the auto-join rules */
	THREAD_WI_AUTO_JOIN_POLICY     = 0x2,
	/* Caller already holds the thread lock */
	THREAD_WI_THREAD_LOCK_HELD     = 0x4,
	/* Caller does not hold the thread lock */
	THREAD_WI_THREAD_LOCK_NEEDED   = 0x8,
	/* Change the work interval from the context switch path (thread may not be running or on a runq) */
	THREAD_WI_THREAD_CTX_SWITCH    = 0x10,
});
71 
72 static kern_return_t thread_set_work_interval(thread_t, struct work_interval *, thread_work_interval_options_t);
73 static void work_interval_port_no_senders(ipc_port_t, mach_port_mscount_t);
74 
75 IPC_KOBJECT_DEFINE(IKOT_WORK_INTERVAL,
76     .iko_op_stable     = true,
77     .iko_op_no_senders = work_interval_port_no_senders);
78 
79 #if CONFIG_SCHED_AUTO_JOIN
80 /* MPSC queue used to defer deallocate work intervals */
81 static struct mpsc_daemon_queue work_interval_deallocate_queue;
82 
83 static void work_interval_deferred_release(struct work_interval *);
84 
85 /*
86  * Work Interval Auto-Join Status
87  *
88  * work_interval_auto_join_status_t represents the state of auto-join for a given work interval.
89  * It packs the following information:
90  * - A bit representing if a "finish" is deferred on the work interval
91  * - Count of number of threads auto-joined to the work interval
92  */
93 #define WORK_INTERVAL_STATUS_DEFERRED_FINISH_MASK    ((uint32_t)(1 << 31))
94 #define WORK_INTERVAL_STATUS_AUTO_JOIN_COUNT_MASK    ((uint32_t)(WORK_INTERVAL_STATUS_DEFERRED_FINISH_MASK - 1))
95 #define WORK_INTERVAL_STATUS_AUTO_JOIN_COUNT_MAX     WORK_INTERVAL_STATUS_AUTO_JOIN_COUNT_MASK
96 typedef uint32_t work_interval_auto_join_status_t;
97 
98 static inline bool __unused
work_interval_status_deferred_finish(work_interval_auto_join_status_t status)99 work_interval_status_deferred_finish(work_interval_auto_join_status_t status)
100 {
101 	return (status & WORK_INTERVAL_STATUS_DEFERRED_FINISH_MASK) ? true : false;
102 }
103 
104 static inline uint32_t __unused
work_interval_status_auto_join_count(work_interval_auto_join_status_t status)105 work_interval_status_auto_join_count(work_interval_auto_join_status_t status)
106 {
107 	return (uint32_t)(status & WORK_INTERVAL_STATUS_AUTO_JOIN_COUNT_MASK);
108 }
109 
110 /*
111  * struct work_interval_deferred_finish_state
112  *
113  * Contains the parameters of the finish operation which is being deferred.
114  */
115 struct work_interval_deferred_finish_state {
116 	uint64_t instance_id;
117 	uint64_t start;
118 	uint64_t deadline;
119 	uint64_t complexity;
120 };
121 
/*
 * Per-work-interval auto-join bookkeeping: the saved parameters of a
 * deferred finish, plus the atomically-updated packed status word
 * (deferred-finish bit | auto-joined thread count).
 */
struct work_interval_auto_join_info {
	struct work_interval_deferred_finish_state deferred_finish_state;
	work_interval_auto_join_status_t _Atomic status;
};
126 #endif /* CONFIG_SCHED_AUTO_JOIN */
127 
128 /*
129  * Work Interval structs
130  *
131  * This struct represents a thread group and/or work interval context
132  * in a mechanism that is represented with a kobject.
133  *
134  * Every thread that has joined a WI has a +1 ref, and the port
135  * has a +1 ref as well.
136  *
137  * TODO: groups need to have a 'is for WI' flag
138  *      and they need a flag to create that says 'for WI'
139  *      This would allow CLPC to avoid allocating WI support
140  *      data unless it is needed
141  *
142  * TODO: Enforce not having more than one non-group joinable work
143  *      interval per thread group.
144  *      CLPC only wants to see one WI-notify callout per group.
145  */
146 
struct work_interval {
	uint64_t wi_id;                 /* unique ID, assigned at create time */
	struct os_refcnt wi_ref_count;  /* +1 per joined thread, +1 for the port */
	uint32_t wi_create_flags;       /* WORK_INTERVAL_FLAG_* | WORK_INTERVAL_TYPE_* */

	/* for debugging purposes only, does not hold a ref on port */
	ipc_port_t wi_port;

	/*
	 * holds uniqueid and version of creating process,
	 * used to permission-gate notify
	 * TODO: you'd think there would be a better way to do this
	 */
	uint64_t wi_creator_uniqueid;
	uint32_t wi_creator_pid;
	int wi_creator_pidversion;

#if CONFIG_THREAD_GROUPS
	struct thread_group *wi_group;  /* holds +1 ref on group */
#endif /* CONFIG_THREAD_GROUPS */

#if CONFIG_SCHED_AUTO_JOIN
	/* Information related to auto-join and deferred finish for work interval */
	struct work_interval_auto_join_info wi_auto_join_info;

	/*
	 * Since the deallocation of auto-join work intervals
	 * can happen in the scheduler when the last thread in
	 * the WI blocks and the thread lock is held, the deallocation
	 * might have to be done on a separate thread.
	 */
	struct mpsc_queue_chain   wi_deallocate_link;
#endif /* CONFIG_SCHED_AUTO_JOIN */
};
181 
182 #if CONFIG_SCHED_AUTO_JOIN
183 
184 /*
185  * work_interval_perform_deferred_finish()
186  *
187  * Perform a deferred finish for a work interval. The routine accepts the deferred_finish_state as an
188  * argument rather than looking at the work_interval since the deferred finish can race with another
189  * start-finish cycle. To address that, the caller ensures that it gets a consistent snapshot of the
190  * deferred state before calling this routine. This allows the racing start-finish cycle to overwrite
191  * the deferred state without issues.
192  */
193 static inline void
work_interval_perform_deferred_finish(__unused struct work_interval_deferred_finish_state * deferred_finish_state,__unused struct work_interval * work_interval,__unused thread_t thread)194 work_interval_perform_deferred_finish(__unused struct work_interval_deferred_finish_state *deferred_finish_state,
195     __unused struct work_interval *work_interval, __unused thread_t thread)
196 {
197 
198 	KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_WI_DEFERRED_FINISH),
199 	    thread_tid(thread), thread_group_get_id(work_interval->wi_group));
200 }
201 
202 /*
203  * work_interval_auto_join_increment()
204  *
205  * Routine to increment auto-join counter when a new thread is auto-joined to
206  * the work interval.
207  */
208 static void
work_interval_auto_join_increment(struct work_interval * work_interval)209 work_interval_auto_join_increment(struct work_interval *work_interval)
210 {
211 	struct work_interval_auto_join_info *join_info = &work_interval->wi_auto_join_info;
212 	__assert_only work_interval_auto_join_status_t old_status = os_atomic_add_orig(&join_info->status, 1, relaxed);
213 	assert(work_interval_status_auto_join_count(old_status) < WORK_INTERVAL_STATUS_AUTO_JOIN_COUNT_MAX);
214 }
215 
216 /*
217  * work_interval_auto_join_decrement()
218  *
219  * Routine to decrement the auto-join counter when a thread unjoins the work interval (due to
220  * blocking or termination). If this was the last auto-joined thread in the work interval and
221  * there was a deferred finish, performs the finish operation for the work interval.
222  */
223 static void
work_interval_auto_join_decrement(struct work_interval * work_interval,thread_t thread)224 work_interval_auto_join_decrement(struct work_interval *work_interval, thread_t thread)
225 {
226 	struct work_interval_auto_join_info *join_info = &work_interval->wi_auto_join_info;
227 	work_interval_auto_join_status_t old_status, new_status;
228 	struct work_interval_deferred_finish_state deferred_finish_state;
229 	bool perform_finish;
230 
231 	/* Update the auto-join count for the work interval atomically */
232 	os_atomic_rmw_loop(&join_info->status, old_status, new_status, acquire, {
233 		perform_finish = false;
234 		new_status = old_status;
235 		assert(work_interval_status_auto_join_count(old_status) > 0);
236 		new_status -= 1;
237 		if (new_status == WORK_INTERVAL_STATUS_DEFERRED_FINISH_MASK) {
238 		        /* No auto-joined threads remaining and finish is deferred */
239 		        new_status = 0;
240 		        perform_finish = true;
241 		        /*
242 		         * Its important to copy the deferred finish state here so that this works
243 		         * when racing with another start-finish cycle.
244 		         */
245 		        deferred_finish_state = join_info->deferred_finish_state;
246 		}
247 	});
248 
249 	if (perform_finish == true) {
250 		/*
251 		 * Since work_interval_perform_deferred_finish() calls down to
252 		 * the machine layer callout for finish which gets the thread
253 		 * group from the thread passed in here, it is important to
254 		 * make sure that the thread still has the work interval thread
255 		 * group here.
256 		 */
257 		assert(thread->thread_group == work_interval->wi_group);
258 		work_interval_perform_deferred_finish(&deferred_finish_state, work_interval, thread);
259 	}
260 }
261 
262 /*
263  * work_interval_auto_join_enabled()
264  *
265  * Helper routine to check if work interval has auto-join enabled.
266  */
267 static inline bool
work_interval_auto_join_enabled(struct work_interval * work_interval)268 work_interval_auto_join_enabled(struct work_interval *work_interval)
269 {
270 	return (work_interval->wi_create_flags & WORK_INTERVAL_FLAG_ENABLE_AUTO_JOIN) != 0;
271 }
272 
273 /*
274  * work_interval_deferred_finish_enabled()
275  *
276  * Helper routine to check if work interval has deferred finish enabled.
277  */
278 static inline bool __unused
work_interval_deferred_finish_enabled(struct work_interval * work_interval)279 work_interval_deferred_finish_enabled(struct work_interval *work_interval)
280 {
281 	return (work_interval->wi_create_flags & WORK_INTERVAL_FLAG_ENABLE_DEFERRED_FINISH) != 0;
282 }
283 
284 #endif /* CONFIG_SCHED_AUTO_JOIN */
285 
/*
 * work_interval_retain()
 *
 * Takes a +1 ref on the work interval.
 */
static inline void
work_interval_retain(struct work_interval *work_interval)
{
	/*
	 * Even though wi_retain is called under a port lock, we have
	 * to use os_ref_retain instead of os_ref_retain_locked
	 * because wi_release is not synchronized. wi_release calls
	 * os_ref_release which is unsafe to pair with os_ref_retain_locked.
	 */
	os_ref_retain(&work_interval->wi_ref_count);
}
297 
/*
 * work_interval_deallocate()
 *
 * Frees the work interval (and drops its +1 ref on the thread group).
 * Must only be called once the refcount has reached zero; callers route
 * through work_interval_release() or the deferred-deallocation queue.
 */
static inline void
work_interval_deallocate(struct work_interval *work_interval)
{
	KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_WORKGROUP, WORKGROUP_INTERVAL_DESTROY),
	    work_interval->wi_id);
#if CONFIG_THREAD_GROUPS
	thread_group_release(work_interval->wi_group);
	work_interval->wi_group = NULL;
#endif /* CONFIG_THREAD_GROUPS */
	kfree_type(struct work_interval, work_interval);
}
309 
310 /*
311  * work_interval_release()
312  *
313  * Routine to release a ref count on the work interval. If the refcount goes down
314  * to zero, the work interval needs to be de-allocated.
315  *
316  * For non auto-join work intervals, they are de-allocated in this context.
317  *
318  * For auto-join work intervals, the de-allocation cannot be done from this context
319  * since that might need the kernel memory allocator lock. In that case, the
320  * deallocation is done via a thread-call based mpsc queue.
321  */
322 static void
work_interval_release(struct work_interval * work_interval,__unused thread_work_interval_options_t options)323 work_interval_release(struct work_interval *work_interval, __unused thread_work_interval_options_t options)
324 {
325 	if (os_ref_release(&work_interval->wi_ref_count) == 0) {
326 #if CONFIG_SCHED_AUTO_JOIN
327 		if (options & THREAD_WI_THREAD_LOCK_HELD) {
328 			work_interval_deferred_release(work_interval);
329 		} else {
330 			work_interval_deallocate(work_interval);
331 		}
332 #else /* CONFIG_SCHED_AUTO_JOIN */
333 		work_interval_deallocate(work_interval);
334 #endif /* CONFIG_SCHED_AUTO_JOIN */
335 	}
336 }
337 
338 #if CONFIG_SCHED_AUTO_JOIN
339 
340 /*
341  * work_interval_deferred_release()
342  *
343  * Routine to enqueue the work interval on the deallocation mpsc queue.
344  */
345 static void
work_interval_deferred_release(struct work_interval * work_interval)346 work_interval_deferred_release(struct work_interval *work_interval)
347 {
348 	mpsc_daemon_enqueue(&work_interval_deallocate_queue,
349 	    &work_interval->wi_deallocate_link, MPSC_QUEUE_NONE);
350 }
351 
352 /*
353  * work_interval_should_propagate()
354  *
355  * Main policy routine to decide if a thread should be auto-joined to
356  * another thread's work interval. The conditions are arranged such that
357  * the most common bailout condition are checked the earliest. This routine
358  * is called from the scheduler context; so it needs to be efficient and
359  * be careful when taking locks or performing wakeups.
360  */
361 inline bool
work_interval_should_propagate(thread_t cthread,thread_t thread)362 work_interval_should_propagate(thread_t cthread, thread_t thread)
363 {
364 	/* Only allow propagation if the current thread has a work interval and the woken up thread does not */
365 	if ((cthread->th_work_interval == NULL) || (thread->th_work_interval != NULL)) {
366 		return false;
367 	}
368 
369 	/* Only propagate work intervals which have auto-join enabled */
370 	if (work_interval_auto_join_enabled(cthread->th_work_interval) == false) {
371 		return false;
372 	}
373 
374 	/* Work interval propagation is enabled for realtime threads only */
375 	if ((cthread->sched_mode != TH_MODE_REALTIME) || (thread->sched_mode != TH_MODE_REALTIME)) {
376 		return false;
377 	}
378 
379 
380 	/* Work interval propagation only works for threads with the same home thread group */
381 	struct thread_group *thread_home_tg = thread_group_get_home_group(thread);
382 	if (thread_group_get_home_group(cthread) != thread_home_tg) {
383 		return false;
384 	}
385 
386 	/* If woken up thread has adopted vouchers and other thread groups, it does not get propagation */
387 	if (thread->thread_group != thread_home_tg) {
388 		return false;
389 	}
390 
391 	/* If either thread is inactive (in the termination path), do not propagate auto-join */
392 	if ((!cthread->active) || (!thread->active)) {
393 		return false;
394 	}
395 
396 	return true;
397 }
398 
399 /*
400  * work_interval_auto_join_propagate()
401  *
402  * Routine to auto-join a thread into another thread's work interval
403  *
404  * Should only be invoked if work_interval_should_propagate() returns
405  * true. Also expects "from" thread to be current thread and "to" thread
406  * to be locked.
407  */
408 void
work_interval_auto_join_propagate(thread_t from,thread_t to)409 work_interval_auto_join_propagate(thread_t from, thread_t to)
410 {
411 	assert(from == current_thread());
412 	work_interval_retain(from->th_work_interval);
413 	work_interval_auto_join_increment(from->th_work_interval);
414 	__assert_only kern_return_t kr = thread_set_work_interval(to, from->th_work_interval,
415 	    THREAD_WI_AUTO_JOIN_POLICY | THREAD_WI_THREAD_LOCK_HELD | THREAD_WI_THREAD_CTX_SWITCH);
416 	assert(kr == KERN_SUCCESS);
417 }
418 
419 /*
420  * work_interval_auto_join_unwind()
421  *
422  * Routine to un-join an auto-joined work interval for a thread that is blocking.
423  *
424  * Expects thread to be locked.
425  */
426 void
work_interval_auto_join_unwind(thread_t thread)427 work_interval_auto_join_unwind(thread_t thread)
428 {
429 	__assert_only kern_return_t kr = thread_set_work_interval(thread, NULL,
430 	    THREAD_WI_AUTO_JOIN_POLICY | THREAD_WI_THREAD_LOCK_HELD | THREAD_WI_THREAD_CTX_SWITCH);
431 	assert(kr == KERN_SUCCESS);
432 }
433 
434 /*
435  * work_interval_auto_join_demote()
436  *
437  * Routine to un-join an auto-joined work interval when a thread is changing from
438  * realtime to non-realtime scheduling mode. This could happen due to multiple
439  * reasons such as RT failsafe, thread backgrounding or thread termination. Also,
440  * the thread being demoted may not be the current thread.
441  *
442  * Expects thread to be locked.
443  */
444 void
work_interval_auto_join_demote(thread_t thread)445 work_interval_auto_join_demote(thread_t thread)
446 {
447 	__assert_only kern_return_t kr = thread_set_work_interval(thread, NULL,
448 	    THREAD_WI_AUTO_JOIN_POLICY | THREAD_WI_THREAD_LOCK_HELD);
449 	assert(kr == KERN_SUCCESS);
450 }
451 
452 static void
work_interval_deallocate_queue_invoke(mpsc_queue_chain_t e,__assert_only mpsc_daemon_queue_t dq)453 work_interval_deallocate_queue_invoke(mpsc_queue_chain_t e,
454     __assert_only mpsc_daemon_queue_t dq)
455 {
456 	struct work_interval *work_interval = NULL;
457 	work_interval = mpsc_queue_element(e, struct work_interval, wi_deallocate_link);
458 	assert(dq == &work_interval_deallocate_queue);
459 	assert(os_ref_get_count(&work_interval->wi_ref_count) == 0);
460 	work_interval_deallocate(work_interval);
461 }
462 
463 #endif /* CONFIG_SCHED_AUTO_JOIN */
464 
465 #if CONFIG_SCHED_AUTO_JOIN
__startup_func
static void
work_interval_subsystem_init(void)
{
	/*
	 * The work interval deallocation queue must be a thread call based queue
	 * because it is woken up from contexts where the thread lock is held. The
	 * only way to perform wakeups safely in those contexts is to wakeup a
	 * thread call which is guaranteed to be on a different waitq and would
	 * not hash onto the same global waitq which might be currently locked.
	 */
	mpsc_daemon_queue_init_with_thread_call(&work_interval_deallocate_queue,
	    work_interval_deallocate_queue_invoke, THREAD_CALL_PRIORITY_KERNEL,
	    MPSC_DAEMON_INIT_NONE);
}
/* Run during boot, once thread calls are available */
STARTUP(THREAD_CALL, STARTUP_RANK_MIDDLE, work_interval_subsystem_init);
482 #endif /* CONFIG_SCHED_AUTO_JOIN */
483 
484 /*
485  * work_interval_port_convert
486  *
487  * Called with port locked, returns reference to work interval
488  * if indeed the port is a work interval kobject port
489  */
490 static struct work_interval *
work_interval_port_convert_locked(ipc_port_t port)491 work_interval_port_convert_locked(ipc_port_t port)
492 {
493 	struct work_interval *work_interval = NULL;
494 
495 	if (IP_VALID(port)) {
496 		work_interval = ipc_kobject_get_stable(port, IKOT_WORK_INTERVAL);
497 		if (work_interval) {
498 			work_interval_retain(work_interval);
499 		}
500 	}
501 
502 	return work_interval;
503 }
504 
505 /*
506  * port_name_to_work_interval
507  *
508  * Description: Obtain a reference to the work_interval associated with a given port.
509  *
510  * Parameters:  name    A Mach port name to translate.
511  *
512  * Returns:     NULL    The given Mach port did not reference a work_interval.
513  *              !NULL   The work_interval that is associated with the Mach port.
514  */
515 static kern_return_t
port_name_to_work_interval(mach_port_name_t name,struct work_interval ** work_interval)516 port_name_to_work_interval(mach_port_name_t     name,
517     struct work_interval **work_interval)
518 {
519 	if (!MACH_PORT_VALID(name)) {
520 		return KERN_INVALID_NAME;
521 	}
522 
523 	ipc_port_t port = IPC_PORT_NULL;
524 	kern_return_t kr = KERN_SUCCESS;
525 
526 	kr = ipc_port_translate_send(current_space(), name, &port);
527 	if (kr != KERN_SUCCESS) {
528 		return kr;
529 	}
530 	/* port is locked */
531 
532 	assert(IP_VALID(port));
533 
534 	struct work_interval *converted_work_interval;
535 
536 	converted_work_interval = work_interval_port_convert_locked(port);
537 
538 	/* the port is valid, but doesn't denote a work_interval */
539 	if (converted_work_interval == NULL) {
540 		kr = KERN_INVALID_CAPABILITY;
541 	}
542 
543 	ip_mq_unlock(port);
544 
545 	if (kr == KERN_SUCCESS) {
546 		*work_interval = converted_work_interval;
547 	}
548 
549 	return kr;
550 }
551 
552 
553 /*
554  * work_interval_port_no_senders
555  *
556  * Description: Handle a no-senders notification for a work interval port.
557  *              Destroys the port and releases its reference on the work interval.
558  *
559  * Parameters:  msg     A Mach no-senders notification message.
560  *
561  * Note: This assumes that there is only one create-right-from-work-interval point,
562  *       if the ability to extract another send right after creation is added,
563  *       this will have to change to handle make-send counts correctly.
564  */
565 static void
work_interval_port_no_senders(ipc_port_t port,mach_port_mscount_t mscount)566 work_interval_port_no_senders(ipc_port_t port, mach_port_mscount_t mscount)
567 {
568 	struct work_interval *work_interval = NULL;
569 
570 	work_interval = ipc_kobject_dealloc_port(port, mscount,
571 	    IKOT_WORK_INTERVAL);
572 
573 	work_interval->wi_port = MACH_PORT_NULL;
574 
575 	work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
576 }
577 
578 /*
579  * work_interval_port_type()
580  *
581  * Converts a port name into the work interval object and returns its type.
582  *
583  * For invalid ports, it returns WORK_INTERVAL_TYPE_LAST (which is not a
584  * valid type for work intervals).
585  */
586 static uint32_t
work_interval_port_type(mach_port_name_t port_name)587 work_interval_port_type(mach_port_name_t port_name)
588 {
589 	struct work_interval *work_interval = NULL;
590 	kern_return_t kr;
591 	uint32_t work_interval_type;
592 
593 	if (port_name == MACH_PORT_NULL) {
594 		return WORK_INTERVAL_TYPE_LAST;
595 	}
596 
597 	kr = port_name_to_work_interval(port_name, &work_interval);
598 	if (kr != KERN_SUCCESS) {
599 		return WORK_INTERVAL_TYPE_LAST;
600 	}
601 	/* work_interval has a +1 ref */
602 
603 	assert(work_interval != NULL);
604 	work_interval_type = work_interval->wi_create_flags & WORK_INTERVAL_TYPE_MASK;
605 	work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
606 	return work_interval_type;
607 }
608 
609 
610 /*
611  * thread_set_work_interval()
612  *
613  * Change thread's bound work interval to the passed-in work interval
614  * Consumes +1 ref on work_interval upon success.
615  *
616  * May also pass NULL to un-set work_interval on the thread
617  * Will deallocate any old work interval on the thread
618  * Return error if thread does not satisfy requirements to join work interval
619  *
620  * For non auto-join work intervals, deallocate any old work interval on the thread
621  * For auto-join work intervals, the routine may wakeup the work interval deferred
622  * deallocation queue since thread locks might be currently held.
623  */
624 static kern_return_t
thread_set_work_interval(thread_t thread,struct work_interval * work_interval,thread_work_interval_options_t options)625 thread_set_work_interval(thread_t thread,
626     struct work_interval *work_interval, thread_work_interval_options_t options)
627 {
628 	/* All explicit work interval operations should always be from the current thread */
629 	if (options & THREAD_WI_EXPLICIT_JOIN_POLICY) {
630 		assert(thread == current_thread());
631 	}
632 
633 	/* All cases of needing the thread lock should be from explicit join scenarios */
634 	if (options & THREAD_WI_THREAD_LOCK_NEEDED) {
635 		assert((options & THREAD_WI_EXPLICIT_JOIN_POLICY) != 0);
636 	}
637 
638 	/* For all cases of auto join must come in with the thread lock held */
639 	if (options & THREAD_WI_AUTO_JOIN_POLICY) {
640 		assert((options & THREAD_WI_THREAD_LOCK_HELD) != 0);
641 	}
642 
643 	if (work_interval) {
644 		uint32_t work_interval_type = work_interval->wi_create_flags & WORK_INTERVAL_TYPE_MASK;
645 
646 		if ((work_interval_type == WORK_INTERVAL_TYPE_COREAUDIO) &&
647 		    (thread->sched_mode != TH_MODE_REALTIME) && (thread->saved_mode != TH_MODE_REALTIME)) {
648 			return KERN_INVALID_ARGUMENT;
649 		}
650 	}
651 
652 	struct work_interval *old_th_wi = thread->th_work_interval;
653 #if CONFIG_SCHED_AUTO_JOIN
654 	bool old_wi_auto_joined = ((thread->sched_flags & TH_SFLAG_THREAD_GROUP_AUTO_JOIN) != 0);
655 
656 	spl_t s;
657 	/* Take the thread lock if needed */
658 	if (options & THREAD_WI_THREAD_LOCK_NEEDED) {
659 		s = splsched();
660 		thread_lock(thread);
661 	}
662 
663 	/*
664 	 * Work interval auto-join leak to non-RT threads.
665 	 *
666 	 * If thread might be running on a remote core and it's not in the context switch path (where
667 	 * thread is neither running, blocked or in the runq), its not possible to update the
668 	 * work interval & thread group remotely since its not possible to update CLPC for a remote
669 	 * core. This situation might happen when a thread is transitioning from realtime to
670 	 * non-realtime due to backgrounding etc., which would mean that non-RT threads would now
671 	 * be part of the work interval.
672 	 *
673 	 * Since there is no immediate mitigation to this issue, the policy is to set a new
674 	 * flag on the thread which indicates that such a "leak" has happened. This flag will
675 	 * be cleared when the remote thread eventually blocks and unjoins from the work interval.
676 	 */
677 	bool thread_on_remote_core = ((thread != current_thread()) && (thread->state & TH_RUN) && (thread->runq == PROCESSOR_NULL));
678 
679 	if (thread_on_remote_core && ((options & THREAD_WI_THREAD_CTX_SWITCH) == 0)) {
680 		assert((options & THREAD_WI_THREAD_LOCK_NEEDED) == 0);
681 		os_atomic_or(&thread->th_work_interval_flags, TH_WORK_INTERVAL_FLAGS_AUTO_JOIN_LEAK, relaxed);
682 		return KERN_SUCCESS;
683 	}
684 
685 	old_wi_auto_joined = ((thread->sched_flags & TH_SFLAG_THREAD_GROUP_AUTO_JOIN) != 0);
686 
687 	if ((options & THREAD_WI_AUTO_JOIN_POLICY) || old_wi_auto_joined) {
688 		__kdebug_only uint64_t old_tg_id = (old_th_wi) ? thread_group_get_id(old_th_wi->wi_group) : ~0;
689 		__kdebug_only uint64_t new_tg_id = (work_interval) ? thread_group_get_id(work_interval->wi_group) : ~0;
690 		KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_WI_AUTO_JOIN),
691 		    thread_tid(thread), old_tg_id, new_tg_id, options);
692 	}
693 
694 	if (old_wi_auto_joined) {
695 		/*
696 		 * If thread was auto-joined to a work interval and is not realtime, make sure it
697 		 * happened due to the "leak" described above.
698 		 */
699 		if (thread->sched_mode != TH_MODE_REALTIME) {
700 			assert((thread->th_work_interval_flags & TH_WORK_INTERVAL_FLAGS_AUTO_JOIN_LEAK) != 0);
701 		}
702 
703 		os_atomic_andnot(&thread->th_work_interval_flags, TH_WORK_INTERVAL_FLAGS_AUTO_JOIN_LEAK, relaxed);
704 		work_interval_auto_join_decrement(old_th_wi, thread);
705 		thread->sched_flags &= ~TH_SFLAG_THREAD_GROUP_AUTO_JOIN;
706 	}
707 
708 #endif /* CONFIG_SCHED_AUTO_JOIN */
709 
710 	KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_WORKGROUP, WORKGROUP_INTERVAL_CHANGE),
711 	    thread_tid(thread), (old_th_wi ? old_th_wi->wi_id : 0), (work_interval ? work_interval->wi_id : 0), !!(options & THREAD_WI_AUTO_JOIN_POLICY));
712 
713 	/* transfer +1 ref to thread */
714 	thread->th_work_interval = work_interval;
715 
716 #if CONFIG_SCHED_AUTO_JOIN
717 
718 	if ((options & THREAD_WI_AUTO_JOIN_POLICY) && work_interval) {
719 		assert(work_interval_auto_join_enabled(work_interval) == true);
720 		thread->sched_flags |= TH_SFLAG_THREAD_GROUP_AUTO_JOIN;
721 	}
722 
723 	if (options & THREAD_WI_THREAD_LOCK_NEEDED) {
724 		thread_unlock(thread);
725 		splx(s);
726 	}
727 #endif /* CONFIG_SCHED_AUTO_JOIN */
728 
729 #if CONFIG_THREAD_GROUPS
730 	struct thread_group *new_tg = (work_interval) ? (work_interval->wi_group) : NULL;
731 
732 	if (options & THREAD_WI_AUTO_JOIN_POLICY) {
733 #if CONFIG_SCHED_AUTO_JOIN
734 		thread_set_autojoin_thread_group_locked(thread, new_tg);
735 #endif
736 	} else {
737 		thread_set_work_interval_thread_group(thread, new_tg);
738 	}
739 #endif /* CONFIG_THREAD_GROUPS */
740 
741 	if (old_th_wi != NULL) {
742 		work_interval_release(old_th_wi, options);
743 	}
744 
745 	return KERN_SUCCESS;
746 }
747 
/*
 * thread_set_work_interval_explicit_join()
 *
 * Wrapper for an explicit (caller-initiated) join by the current thread;
 * consumes the +1 ref on work_interval on success.
 */
static kern_return_t
thread_set_work_interval_explicit_join(thread_t thread, struct work_interval *work_interval)
{
	assert(thread == current_thread());
	return thread_set_work_interval(thread, work_interval, THREAD_WI_EXPLICIT_JOIN_POLICY | THREAD_WI_THREAD_LOCK_NEEDED);
}
754 
755 kern_return_t
work_interval_thread_terminate(thread_t thread)756 work_interval_thread_terminate(thread_t thread)
757 {
758 	assert(thread == current_thread());
759 	if (thread->th_work_interval != NULL) {
760 		return thread_set_work_interval(thread, NULL, THREAD_WI_EXPLICIT_JOIN_POLICY | THREAD_WI_THREAD_LOCK_NEEDED);
761 	}
762 	return KERN_SUCCESS;
763 }
764 
/*
 * kern_work_interval_notify()
 *
 * Deliver a work interval notification (start/finish parameters) to the
 * machine layer (CLPC). Only allowed for a thread that has joined the
 * interval, and only from the task that created it.
 */
kern_return_t
kern_work_interval_notify(thread_t thread, struct kern_work_interval_args* kwi_args)
{
	assert(thread == current_thread());
	assert(kwi_args->work_interval_id != 0);

	struct work_interval *work_interval = thread->th_work_interval;

	if (work_interval == NULL ||
	    work_interval->wi_id != kwi_args->work_interval_id) {
		/* This thread must have adopted the work interval to be able to notify */
		return KERN_INVALID_ARGUMENT;
	}

	task_t notifying_task = current_task();

	if (work_interval->wi_creator_uniqueid != get_task_uniqueid(notifying_task) ||
	    work_interval->wi_creator_pidversion != get_task_version(notifying_task)) {
		/* Only the creating task can do a notify */
		return KERN_INVALID_ARGUMENT;
	}

	/* Interrupts disabled while sampling scheduler state */
	spl_t s = splsched();

#if CONFIG_THREAD_GROUPS
	assert(work_interval->wi_group == thread->thread_group);
#endif /* CONFIG_THREAD_GROUPS */

	uint64_t urgency_param1, urgency_param2;
	kwi_args->urgency = (uint16_t)thread_get_urgency(thread, &urgency_param1, &urgency_param2);

	splx(s);

	/* called without interrupts disabled */
	machine_work_interval_notify(thread, kwi_args);

	return KERN_SUCCESS;
}
803 
/*
 * Monotonic generator for work interval IDs, bumped with a relaxed atomic
 * increment on every create.  Start at 1, 0 is not a valid work interval ID.
 */
static _Atomic uint64_t unique_work_interval_id = 1;
806 
807 kern_return_t
kern_work_interval_create(thread_t thread,struct kern_work_interval_create_args * create_params)808 kern_work_interval_create(thread_t thread,
809     struct kern_work_interval_create_args *create_params)
810 {
811 	assert(thread == current_thread());
812 
813 	uint32_t create_flags = create_params->wica_create_flags;
814 
815 	if (((create_flags & WORK_INTERVAL_FLAG_JOINABLE) == 0) &&
816 	    thread->th_work_interval != NULL) {
817 		/*
818 		 * If the thread is doing a legacy combined create and join,
819 		 * it shouldn't already be part of a work interval.
820 		 *
821 		 * (Creating a joinable WI is allowed anytime.)
822 		 */
823 		return KERN_FAILURE;
824 	}
825 
826 	/*
827 	 * Check the validity of the create flags before allocating the work
828 	 * interval.
829 	 */
830 	task_t creating_task = current_task();
831 	if ((create_flags & WORK_INTERVAL_TYPE_MASK) == WORK_INTERVAL_TYPE_CA_CLIENT) {
832 		/*
833 		 * CA_CLIENT work intervals do not create new thread groups.
834 		 * There can only be one CA_CLIENT work interval (created by UIKit or AppKit)
835 		 * per each application task
836 		 */
837 		if (create_flags & WORK_INTERVAL_FLAG_GROUP) {
838 			return KERN_FAILURE;
839 		}
840 		if (!task_is_app(creating_task)) {
841 #if XNU_TARGET_OS_OSX
842 			/*
843 			 * Soft-fail the case of a non-app pretending to be an
844 			 * app, by allowing it to press the buttons, but they're
845 			 * not actually connected to anything.
846 			 */
847 			create_flags |= WORK_INTERVAL_FLAG_IGNORED;
848 #else
849 			/*
850 			 * On iOS, it's a hard failure to get your apptype
851 			 * wrong and then try to render something.
852 			 */
853 			return KERN_NOT_SUPPORTED;
854 #endif /* XNU_TARGET_OS_OSX */
855 		}
856 		if (task_set_ca_client_wi(creating_task, true) == false) {
857 			return KERN_FAILURE;
858 		}
859 	}
860 
861 #if CONFIG_SCHED_AUTO_JOIN
862 	if (create_flags & WORK_INTERVAL_FLAG_ENABLE_AUTO_JOIN) {
863 		uint32_t type = (create_flags & WORK_INTERVAL_TYPE_MASK);
864 		if (type != WORK_INTERVAL_TYPE_COREAUDIO) {
865 			return KERN_NOT_SUPPORTED;
866 		}
867 		if ((create_flags & WORK_INTERVAL_FLAG_GROUP) == 0) {
868 			return KERN_NOT_SUPPORTED;
869 		}
870 	}
871 
872 	if (create_flags & WORK_INTERVAL_FLAG_ENABLE_DEFERRED_FINISH) {
873 		if ((create_flags & WORK_INTERVAL_FLAG_ENABLE_AUTO_JOIN) == 0) {
874 			return KERN_NOT_SUPPORTED;
875 		}
876 	}
877 #endif /* CONFIG_SCHED_AUTO_JOIN */
878 
879 	struct work_interval *work_interval = kalloc_type(struct work_interval,
880 	    Z_WAITOK | Z_ZERO | Z_NOFAIL);
881 
882 	uint64_t work_interval_id = os_atomic_inc(&unique_work_interval_id, relaxed);
883 
884 	*work_interval = (struct work_interval) {
885 		.wi_id                  = work_interval_id,
886 		.wi_ref_count           = {},
887 		.wi_create_flags        = create_flags,
888 		.wi_creator_pid         = pid_from_task(creating_task),
889 		.wi_creator_uniqueid    = get_task_uniqueid(creating_task),
890 		.wi_creator_pidversion  = get_task_version(creating_task),
891 	};
892 	os_ref_init(&work_interval->wi_ref_count, NULL);
893 
894 	__kdebug_only uint64_t tg_id = 0;
895 #if CONFIG_THREAD_GROUPS
896 	struct thread_group *tg;
897 	if (create_flags & WORK_INTERVAL_FLAG_GROUP) {
898 		/* create a new group for the interval to represent */
899 		char name[THREAD_GROUP_MAXNAME] = "";
900 
901 		snprintf(name, sizeof(name), "WI[%d] #%lld",
902 		    work_interval->wi_creator_pid, work_interval_id);
903 
904 		tg = thread_group_create_and_retain(FALSE);
905 
906 		thread_group_set_name(tg, name);
907 
908 		work_interval->wi_group = tg;
909 	} else {
910 		/* the interval represents the thread's home group */
911 		tg = thread_group_get_home_group(thread);
912 
913 		thread_group_retain(tg);
914 
915 		work_interval->wi_group = tg;
916 	}
917 
918 	/* Capture the tg_id for tracing purposes */
919 	tg_id = thread_group_get_id(work_interval->wi_group);
920 
921 #endif /* CONFIG_THREAD_GROUPS */
922 
923 	if (create_flags & WORK_INTERVAL_FLAG_JOINABLE) {
924 		mach_port_name_t name = MACH_PORT_NULL;
925 
926 		/* work_interval has a +1 ref, moves to the port */
927 		work_interval->wi_port = ipc_kobject_alloc_port(
928 			(ipc_kobject_t)work_interval, IKOT_WORK_INTERVAL,
929 			IPC_KOBJECT_ALLOC_MAKE_SEND | IPC_KOBJECT_ALLOC_NSREQUEST);
930 
931 		name = ipc_port_copyout_send(work_interval->wi_port, current_space());
932 
933 		if (!MACH_PORT_VALID(name)) {
934 			/*
935 			 * copyout failed (port is already deallocated)
936 			 * Because of the port-destroyed magic,
937 			 * the work interval is already deallocated too.
938 			 */
939 			return KERN_RESOURCE_SHORTAGE;
940 		}
941 
942 		create_params->wica_port = name;
943 	} else {
944 		/* work_interval has a +1 ref, moves to the thread */
945 		kern_return_t kr = thread_set_work_interval_explicit_join(thread, work_interval);
946 		if (kr != KERN_SUCCESS) {
947 			/* No other thread can join this work interval since it isn't
948 			 * JOINABLE so release the reference on work interval */
949 			work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
950 			return kr;
951 		}
952 		create_params->wica_port = MACH_PORT_NULL;
953 	}
954 
955 	create_params->wica_id = work_interval_id;
956 
957 	KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_WORKGROUP, WORKGROUP_INTERVAL_CREATE),
958 	    work_interval_id, create_flags, pid_from_task(creating_task), tg_id);
959 	return KERN_SUCCESS;
960 }
961 
962 kern_return_t
kern_work_interval_get_flags_from_port(mach_port_name_t port_name,uint32_t * flags)963 kern_work_interval_get_flags_from_port(mach_port_name_t port_name, uint32_t *flags)
964 {
965 	assert(flags != NULL);
966 
967 	kern_return_t kr;
968 	struct work_interval *work_interval;
969 
970 	kr = port_name_to_work_interval(port_name, &work_interval);
971 	if (kr != KERN_SUCCESS) {
972 		return kr;
973 	}
974 
975 	assert(work_interval != NULL);
976 	*flags = work_interval->wi_create_flags;
977 
978 	work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
979 
980 	return KERN_SUCCESS;
981 }
982 
983 
984 kern_return_t
kern_work_interval_destroy(thread_t thread,uint64_t work_interval_id)985 kern_work_interval_destroy(thread_t thread, uint64_t work_interval_id)
986 {
987 	if (work_interval_id == 0) {
988 		return KERN_INVALID_ARGUMENT;
989 	}
990 
991 	if (thread->th_work_interval == NULL ||
992 	    thread->th_work_interval->wi_id != work_interval_id) {
993 		/* work ID isn't valid or doesn't match joined work interval ID */
994 		return KERN_INVALID_ARGUMENT;
995 	}
996 
997 	return thread_set_work_interval_explicit_join(thread, NULL);
998 }
999 
1000 kern_return_t
kern_work_interval_join(thread_t thread,mach_port_name_t port_name)1001 kern_work_interval_join(thread_t            thread,
1002     mach_port_name_t    port_name)
1003 {
1004 	struct work_interval *work_interval = NULL;
1005 	kern_return_t kr;
1006 
1007 	if (port_name == MACH_PORT_NULL) {
1008 		/* 'Un-join' the current work interval */
1009 		return thread_set_work_interval_explicit_join(thread, NULL);
1010 	}
1011 
1012 	kr = port_name_to_work_interval(port_name, &work_interval);
1013 	if (kr != KERN_SUCCESS) {
1014 		return kr;
1015 	}
1016 	/* work_interval has a +1 ref */
1017 
1018 	assert(work_interval != NULL);
1019 
1020 	kr = thread_set_work_interval_explicit_join(thread, work_interval);
1021 	/* ref was consumed by passing it to the thread in the successful case */
1022 	if (kr != KERN_SUCCESS) {
1023 		work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
1024 	}
1025 	return kr;
1026 }
1027 
1028 /*
1029  * work_interval_port_type_render_server()
1030  *
1031  * Helper routine to determine if the port points to a
1032  * WORK_INTERVAL_TYPE_CA_RENDER_SERVER work interval.
1033  */
1034 bool
work_interval_port_type_render_server(mach_port_name_t port_name)1035 work_interval_port_type_render_server(mach_port_name_t port_name)
1036 {
1037 	return work_interval_port_type(port_name) == WORK_INTERVAL_TYPE_CA_RENDER_SERVER;
1038 }
1039