xref: /xnu-8796.121.2/osfmk/kern/work_interval.c (revision c54f35ca767986246321eb901baf8f5ff7923f6a)
1 /*
2  * Copyright (c) 2017 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 
30 #include <sys/work_interval.h>
31 
32 #include <kern/work_interval.h>
33 
34 #include <kern/thread.h>
35 #include <kern/sched_prim.h>
36 #include <kern/machine.h>
37 #include <kern/thread_group.h>
38 #include <kern/ipc_kobject.h>
39 #include <kern/task.h>
40 #include <kern/coalition.h>
41 #include <kern/policy_internal.h>
42 #include <kern/mpsc_queue.h>
43 #include <kern/workload_config.h>
44 #include <kern/assert.h>
45 
46 #include <mach/kern_return.h>
47 #include <mach/notify.h>
48 #include <os/refcnt.h>
49 
50 #include <stdatomic.h>
51 
52 /*
53  * With the introduction of auto-join work intervals, it is possible
54  * to change the work interval (and related thread group) of a thread in a
55  * variety of contexts (thread termination, context switch, thread mode
56  * change etc.). In order to clearly specify the policy expectation and
57  * the locking behavior, all calls to thread_set_work_interval() pass
58  * in a set of flags.
59  */
60 
/*
 * Option bits passed to thread_set_work_interval() selecting the join
 * policy (explicit vs. auto-join) and describing the caller's locking
 * and context-switch context.
 */
__options_decl(thread_work_interval_options_t, uint32_t, {
	/* Change the work interval using the explicit join rules */
	THREAD_WI_EXPLICIT_JOIN_POLICY = 0x1,
	/* Change the work interval using the auto-join rules */
	THREAD_WI_AUTO_JOIN_POLICY     = 0x2,
	/* Caller already holds the thread lock */
	THREAD_WI_THREAD_LOCK_HELD     = 0x4,
	/* Caller does not hold the thread lock */
	THREAD_WI_THREAD_LOCK_NEEDED   = 0x8,
	/* Change the work interval from the context switch path (thread may not be running or on a runq) */
	THREAD_WI_THREAD_CTX_SWITCH    = 0x10,
});
73 
74 static kern_return_t thread_set_work_interval(thread_t, struct work_interval *, thread_work_interval_options_t);
75 static void work_interval_port_no_senders(ipc_port_t, mach_port_mscount_t);
76 
77 IPC_KOBJECT_DEFINE(IKOT_WORK_INTERVAL,
78     .iko_op_stable     = true,
79     .iko_op_no_senders = work_interval_port_no_senders);
80 
81 #if CONFIG_SCHED_AUTO_JOIN
82 /* MPSC queue used to defer deallocate work intervals */
83 static struct mpsc_daemon_queue work_interval_deallocate_queue;
84 
85 static void work_interval_deferred_release(struct work_interval *);
86 
87 /*
88  * Work Interval Auto-Join Status
89  *
90  * work_interval_auto_join_status_t represents the state of auto-join for a given work interval.
91  * It packs the following information:
92  * - A bit representing if a "finish" is deferred on the work interval
93  * - Count of number of threads auto-joined to the work interval
94  */
95 #define WORK_INTERVAL_STATUS_DEFERRED_FINISH_MASK    ((uint32_t)(1 << 31))
96 #define WORK_INTERVAL_STATUS_AUTO_JOIN_COUNT_MASK    ((uint32_t)(WORK_INTERVAL_STATUS_DEFERRED_FINISH_MASK - 1))
97 #define WORK_INTERVAL_STATUS_AUTO_JOIN_COUNT_MAX     WORK_INTERVAL_STATUS_AUTO_JOIN_COUNT_MASK
98 typedef uint32_t work_interval_auto_join_status_t;
99 
100 static inline bool __unused
work_interval_status_deferred_finish(work_interval_auto_join_status_t status)101 work_interval_status_deferred_finish(work_interval_auto_join_status_t status)
102 {
103 	return (status & WORK_INTERVAL_STATUS_DEFERRED_FINISH_MASK) ? true : false;
104 }
105 
106 static inline uint32_t __unused
work_interval_status_auto_join_count(work_interval_auto_join_status_t status)107 work_interval_status_auto_join_count(work_interval_auto_join_status_t status)
108 {
109 	return (uint32_t)(status & WORK_INTERVAL_STATUS_AUTO_JOIN_COUNT_MASK);
110 }
111 
/*
 * struct work_interval_deferred_finish_state
 *
 * Contains the parameters of the finish operation which is being deferred.
 * A consistent snapshot of this struct is copied inside the atomic status
 * update loop in work_interval_auto_join_decrement() so that a racing
 * start-finish cycle may safely overwrite the live copy.
 *
 * NOTE(review): field semantics (instance/start/deadline/complexity of the
 * deferred interval) are inferred from the names — confirm against the
 * producer that fills deferred_finish_state.
 */
struct work_interval_deferred_finish_state {
	uint64_t instance_id;
	uint64_t start;
	uint64_t deadline;
	uint64_t complexity;
};
123 
/*
 * Per-work-interval auto-join bookkeeping: the packed status word
 * (deferred-finish bit plus auto-join thread count, see the
 * WORK_INTERVAL_STATUS_* masks above) and the parameters of any
 * finish operation whose execution has been deferred.
 */
struct work_interval_auto_join_info {
	struct work_interval_deferred_finish_state deferred_finish_state;
	work_interval_auto_join_status_t _Atomic status;
};
128 #endif /* CONFIG_SCHED_AUTO_JOIN */
129 
130 #if CONFIG_THREAD_GROUPS
131 /* Flags atomically set in wi_group_flags wi_group_flags */
132 #define WORK_INTERVAL_GROUP_FLAGS_THREAD_JOINED 0x1
133 #endif
134 
135 /*
136  * Work Interval structs
137  *
138  * This struct represents a thread group and/or work interval context
139  * in a mechanism that is represented with a kobject.
140  *
141  * Every thread that has joined a WI has a +1 ref, and the port
142  * has a +1 ref as well.
143  *
144  * TODO: groups need to have a 'is for WI' flag
145  *      and they need a flag to create that says 'for WI'
146  *      This would allow CLPC to avoid allocating WI support
147  *      data unless it is needed
148  *
149  * TODO: Enforce not having more than one non-group joinable work
150  *      interval per thread group.
151  *      CLPC only wants to see one WI-notify callout per group.
152  */
153 
struct work_interval {
	uint64_t wi_id;                 /* work interval ID (emitted in tracepoints) */
	struct os_refcnt wi_ref_count;  /* +1 per joined thread, +1 for the port */
	uint32_t wi_create_flags;       /* WORK_INTERVAL_TYPE_* and WORK_INTERVAL_FLAG_* from creation */

	/* for debugging purposes only, does not hold a ref on port */
	ipc_port_t wi_port;

	/*
	 * holds uniqueid and version of creating process,
	 * used to permission-gate notify
	 * TODO: you'd think there would be a better way to do this
	 */
	uint64_t wi_creator_uniqueid;
	uint32_t wi_creator_pid;
	int wi_creator_pidversion;

	/* flags set by work_interval_set_workload_id and reflected onto
	 *  thread->th_work_interval_flags upon join */
	uint32_t wi_wlid_flags;

#if CONFIG_THREAD_GROUPS
	uint32_t wi_group_flags;        /* atomically set; WORK_INTERVAL_GROUP_FLAGS_* */
	struct thread_group *wi_group;  /* holds +1 ref on group */
#endif /* CONFIG_THREAD_GROUPS */

#if CONFIG_SCHED_AUTO_JOIN
	/* Information related to auto-join and deferred finish for work interval */
	struct work_interval_auto_join_info wi_auto_join_info;

	/*
	 * Since the deallocation of auto-join work intervals
	 * can happen in the scheduler when the last thread in
	 * the WI blocks and the thread lock is held, the deallocation
	 * might have to be done on a separate thread.
	 */
	struct mpsc_queue_chain   wi_deallocate_link;
#endif /* CONFIG_SCHED_AUTO_JOIN */

	/*
	 * Work interval class info - determines thread priority for threads
	 * with a work interval driven policy.
	 */
	wi_class_t wi_class;
	uint8_t wi_class_offset;        /* added to the class base priority in work_interval_get_priority() */
};
200 
201 #if CONFIG_SCHED_AUTO_JOIN
202 
/*
 * work_interval_perform_deferred_finish()
 *
 * Perform a deferred finish for a work interval. The routine accepts the deferred_finish_state as an
 * argument rather than looking at the work_interval since the deferred finish can race with another
 * start-finish cycle. To address that, the caller ensures that it gets a consistent snapshot of the
 * deferred state before calling this routine. This allows the racing start-finish cycle to overwrite
 * the deferred state without issues.
 *
 * NOTE(review): in this configuration the body only emits a tracepoint; all
 * parameters are marked __unused, presumably because KDBG can compile to
 * nothing on some build configurations — confirm against kdebug config.
 */
static inline void
work_interval_perform_deferred_finish(__unused struct work_interval_deferred_finish_state *deferred_finish_state,
    __unused struct work_interval *work_interval, __unused thread_t thread)
{
	KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_WI_DEFERRED_FINISH),
	    thread_tid(thread), thread_group_get_id(work_interval->wi_group));
}
220 
221 /*
222  * work_interval_auto_join_increment()
223  *
224  * Routine to increment auto-join counter when a new thread is auto-joined to
225  * the work interval.
226  */
227 static void
work_interval_auto_join_increment(struct work_interval * work_interval)228 work_interval_auto_join_increment(struct work_interval *work_interval)
229 {
230 	struct work_interval_auto_join_info *join_info = &work_interval->wi_auto_join_info;
231 	__assert_only work_interval_auto_join_status_t old_status = os_atomic_add_orig(&join_info->status, 1, relaxed);
232 	assert(work_interval_status_auto_join_count(old_status) < WORK_INTERVAL_STATUS_AUTO_JOIN_COUNT_MAX);
233 }
234 
/*
 * work_interval_auto_join_decrement()
 *
 * Routine to decrement the auto-join counter when a thread unjoins the work interval (due to
 * blocking or termination). If this was the last auto-joined thread in the work interval and
 * there was a deferred finish, performs the finish operation for the work interval.
 */
static void
work_interval_auto_join_decrement(struct work_interval *work_interval, thread_t thread)
{
	struct work_interval_auto_join_info *join_info = &work_interval->wi_auto_join_info;
	work_interval_auto_join_status_t old_status, new_status;
	struct work_interval_deferred_finish_state deferred_finish_state;
	bool perform_finish;

	/* Update the auto-join count for the work interval atomically */
	os_atomic_rmw_loop(&join_info->status, old_status, new_status, acquire, {
		perform_finish = false;
		new_status = old_status;
		assert(work_interval_status_auto_join_count(old_status) > 0);
		new_status -= 1;
		/*
		 * A new value equal to exactly the deferred-finish bit means
		 * the auto-join count just dropped to zero while a finish was
		 * pending: clear the whole word and take over the finish.
		 */
		if (new_status == WORK_INTERVAL_STATUS_DEFERRED_FINISH_MASK) {
			/* No auto-joined threads remaining and finish is deferred */
			new_status = 0;
			perform_finish = true;
			/*
			 * Its important to copy the deferred finish state here so that this works
			 * when racing with another start-finish cycle.
			 */
			deferred_finish_state = join_info->deferred_finish_state;
		}
	});

	if (perform_finish == true) {
		/*
		 * Since work_interval_perform_deferred_finish() calls down to
		 * the machine layer callout for finish which gets the thread
		 * group from the thread passed in here, it is important to
		 * make sure that the thread still has the work interval thread
		 * group here.
		 */
		assert(thread->thread_group == work_interval->wi_group);
		work_interval_perform_deferred_finish(&deferred_finish_state, work_interval, thread);
	}
}
280 
281 /*
282  * work_interval_auto_join_enabled()
283  *
284  * Helper routine to check if work interval has auto-join enabled.
285  */
286 static inline bool
work_interval_auto_join_enabled(struct work_interval * work_interval)287 work_interval_auto_join_enabled(struct work_interval *work_interval)
288 {
289 	return (work_interval->wi_create_flags & WORK_INTERVAL_FLAG_ENABLE_AUTO_JOIN) != 0;
290 }
291 
292 /*
293  * work_interval_deferred_finish_enabled()
294  *
295  * Helper routine to check if work interval has deferred finish enabled.
296  */
297 static inline bool __unused
work_interval_deferred_finish_enabled(struct work_interval * work_interval)298 work_interval_deferred_finish_enabled(struct work_interval *work_interval)
299 {
300 	return (work_interval->wi_create_flags & WORK_INTERVAL_FLAG_ENABLE_DEFERRED_FINISH) != 0;
301 }
302 
303 #endif /* CONFIG_SCHED_AUTO_JOIN */
304 
/*
 * work_interval_retain()
 *
 * Takes a +1 reference on the work interval. Paired with
 * work_interval_release().
 */
static inline void
work_interval_retain(struct work_interval *work_interval)
{
	/*
	 * Even though wi_retain is called under a port lock, we have
	 * to use os_ref_retain instead of os_ref_retain_locked
	 * because wi_release is not synchronized. wi_release calls
	 * os_ref_release which is unsafe to pair with os_ref_retain_locked.
	 */
	os_ref_retain(&work_interval->wi_ref_count);
}
316 
/*
 * work_interval_deallocate()
 *
 * Final teardown once the refcount has dropped to zero: emits the
 * destroy tracepoint, drops the +1 ref on the thread group (if any),
 * and frees the work interval. Callers that may hold the thread lock
 * must go through work_interval_release()'s deferred path instead.
 */
static inline void
work_interval_deallocate(struct work_interval *work_interval)
{
	KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_WORKGROUP, WORKGROUP_INTERVAL_DESTROY),
	    work_interval->wi_id);
#if CONFIG_THREAD_GROUPS
	if (work_interval->wi_group) {
		thread_group_release(work_interval->wi_group);
		work_interval->wi_group = NULL;
	}
#endif /* CONFIG_THREAD_GROUPS */
	kfree_type(struct work_interval, work_interval);
}
330 
/*
 * work_interval_release()
 *
 * Routine to release a ref count on the work interval. If the refcount goes down
 * to zero, the work interval needs to be de-allocated.
 *
 * For non auto-join work intervals, they are de-allocated in this context.
 *
 * For auto-join work intervals, the de-allocation cannot be done from this context
 * since that might need the kernel memory allocator lock. In that case, the
 * deallocation is done via a thread-call based mpsc queue.
 */
static void
work_interval_release(struct work_interval *work_interval, __unused thread_work_interval_options_t options)
{
	if (os_ref_release(&work_interval->wi_ref_count) == 0) {
#if CONFIG_SCHED_AUTO_JOIN
		if (options & THREAD_WI_THREAD_LOCK_HELD) {
			/* Thread lock held: punt the free to the mpsc daemon queue */
			work_interval_deferred_release(work_interval);
		} else {
			work_interval_deallocate(work_interval);
		}
#else /* CONFIG_SCHED_AUTO_JOIN */
		work_interval_deallocate(work_interval);
#endif /* CONFIG_SCHED_AUTO_JOIN */
	}
}
358 
359 #if CONFIG_SCHED_AUTO_JOIN
360 
/*
 * work_interval_deferred_release()
 *
 * Routine to enqueue the work interval on the deallocation mpsc queue.
 * Called with the refcount already at zero; the queue's invoke function
 * (work_interval_deallocate_queue_invoke) performs the actual free.
 */
static void
work_interval_deferred_release(struct work_interval *work_interval)
{
	mpsc_daemon_enqueue(&work_interval_deallocate_queue,
	    &work_interval->wi_deallocate_link, MPSC_QUEUE_NONE);
}
372 
373 /*
374  * work_interval_should_propagate()
375  *
376  * Main policy routine to decide if a thread should be auto-joined to
377  * another thread's work interval. The conditions are arranged such that
378  * the most common bailout condition are checked the earliest. This routine
379  * is called from the scheduler context; so it needs to be efficient and
380  * be careful when taking locks or performing wakeups.
381  */
382 inline bool
work_interval_should_propagate(thread_t cthread,thread_t thread)383 work_interval_should_propagate(thread_t cthread, thread_t thread)
384 {
385 	/* Only allow propagation if the current thread has a work interval and the woken up thread does not */
386 	if ((cthread->th_work_interval == NULL) || (thread->th_work_interval != NULL)) {
387 		return false;
388 	}
389 
390 	/* Only propagate work intervals which have auto-join enabled */
391 	if (work_interval_auto_join_enabled(cthread->th_work_interval) == false) {
392 		return false;
393 	}
394 
395 	/* Work interval propagation is enabled for realtime threads only */
396 	if ((cthread->sched_mode != TH_MODE_REALTIME) || (thread->sched_mode != TH_MODE_REALTIME)) {
397 		return false;
398 	}
399 
400 
401 	/* Work interval propagation only works for threads with the same home thread group */
402 	struct thread_group *thread_home_tg = thread_group_get_home_group(thread);
403 	if (thread_group_get_home_group(cthread) != thread_home_tg) {
404 		return false;
405 	}
406 
407 	/* If woken up thread has adopted vouchers and other thread groups, it does not get propagation */
408 	if (thread->thread_group != thread_home_tg) {
409 		return false;
410 	}
411 
412 	/* If either thread is inactive (in the termination path), do not propagate auto-join */
413 	if ((!cthread->active) || (!thread->active)) {
414 		return false;
415 	}
416 
417 	return true;
418 }
419 
/*
 * work_interval_auto_join_propagate()
 *
 * Routine to auto-join a thread into another thread's work interval
 *
 * Should only be invoked if work_interval_should_propagate() returns
 * true. Also expects "from" thread to be current thread and "to" thread
 * to be locked.
 */
void
work_interval_auto_join_propagate(thread_t from, thread_t to)
{
	assert(from == current_thread());
	/* The +1 ref taken here is consumed by thread_set_work_interval() on success */
	work_interval_retain(from->th_work_interval);
	work_interval_auto_join_increment(from->th_work_interval);
	__assert_only kern_return_t kr = thread_set_work_interval(to, from->th_work_interval,
	    THREAD_WI_AUTO_JOIN_POLICY | THREAD_WI_THREAD_LOCK_HELD | THREAD_WI_THREAD_CTX_SWITCH);
	assert(kr == KERN_SUCCESS);
}
439 
/*
 * work_interval_auto_join_unwind()
 *
 * Routine to un-join an auto-joined work interval for a thread that is blocking.
 *
 * Expects thread to be locked.
 */
void
work_interval_auto_join_unwind(thread_t thread)
{
	/* Passing NULL un-sets the work interval under the auto-join policy */
	__assert_only kern_return_t kr = thread_set_work_interval(thread, NULL,
	    THREAD_WI_AUTO_JOIN_POLICY | THREAD_WI_THREAD_LOCK_HELD | THREAD_WI_THREAD_CTX_SWITCH);
	assert(kr == KERN_SUCCESS);
}
454 
/*
 * work_interval_auto_join_demote()
 *
 * Routine to un-join an auto-joined work interval when a thread is changing from
 * realtime to non-realtime scheduling mode. This could happen due to multiple
 * reasons such as RT failsafe, thread backgrounding or thread termination. Also,
 * the thread being demoted may not be the current thread.
 *
 * Expects thread to be locked.
 */
void
work_interval_auto_join_demote(thread_t thread)
{
	/* Unlike the unwind path, this is not on the context-switch path, so
	 * THREAD_WI_THREAD_CTX_SWITCH is deliberately not set here. */
	__assert_only kern_return_t kr = thread_set_work_interval(thread, NULL,
	    THREAD_WI_AUTO_JOIN_POLICY | THREAD_WI_THREAD_LOCK_HELD);
	assert(kr == KERN_SUCCESS);
}
472 
473 static void
work_interval_deallocate_queue_invoke(mpsc_queue_chain_t e,__assert_only mpsc_daemon_queue_t dq)474 work_interval_deallocate_queue_invoke(mpsc_queue_chain_t e,
475     __assert_only mpsc_daemon_queue_t dq)
476 {
477 	struct work_interval *work_interval = NULL;
478 	work_interval = mpsc_queue_element(e, struct work_interval, wi_deallocate_link);
479 	assert(dq == &work_interval_deallocate_queue);
480 	assert(os_ref_get_count(&work_interval->wi_ref_count) == 0);
481 	work_interval_deallocate(work_interval);
482 }
483 
484 #endif /* CONFIG_SCHED_AUTO_JOIN */
485 
486 #if CONFIG_SCHED_AUTO_JOIN
/* Startup-time setup of the deferred work interval deallocation machinery. */
__startup_func
static void
work_interval_subsystem_init(void)
{
	/*
	 * The work interval deallocation queue must be a thread call based queue
	 * because it is woken up from contexts where the thread lock is held. The
	 * only way to perform wakeups safely in those contexts is to wakeup a
	 * thread call which is guaranteed to be on a different waitq and would
	 * not hash onto the same global waitq which might be currently locked.
	 */
	mpsc_daemon_queue_init_with_thread_call(&work_interval_deallocate_queue,
	    work_interval_deallocate_queue_invoke, THREAD_CALL_PRIORITY_KERNEL,
	    MPSC_DAEMON_INIT_NONE);
}
STARTUP(THREAD_CALL, STARTUP_RANK_MIDDLE, work_interval_subsystem_init);
503 #endif /* CONFIG_SCHED_AUTO_JOIN */
504 
505 /*
506  * work_interval_port_convert
507  *
508  * Called with port locked, returns reference to work interval
509  * if indeed the port is a work interval kobject port
510  */
511 static struct work_interval *
work_interval_port_convert_locked(ipc_port_t port)512 work_interval_port_convert_locked(ipc_port_t port)
513 {
514 	struct work_interval *work_interval = NULL;
515 
516 	if (IP_VALID(port)) {
517 		work_interval = ipc_kobject_get_stable(port, IKOT_WORK_INTERVAL);
518 		if (work_interval) {
519 			work_interval_retain(work_interval);
520 		}
521 	}
522 
523 	return work_interval;
524 }
525 
/*
 * port_name_to_work_interval
 *
 * Description: Obtain a reference to the work_interval associated with a given port.
 *
 * Parameters:  name           A Mach port name to translate.
 *              work_interval  Out parameter: on KERN_SUCCESS, receives the
 *                             work_interval with a +1 reference.
 *
 * Returns:     KERN_SUCCESS             *work_interval holds a +1 reference.
 *              KERN_INVALID_NAME        name is not a valid Mach port name.
 *              KERN_INVALID_CAPABILITY  the port does not denote a work_interval.
 *              (other)                  translation errors from ipc_port_translate_send().
 */
static kern_return_t
port_name_to_work_interval(mach_port_name_t     name,
    struct work_interval **work_interval)
{
	if (!MACH_PORT_VALID(name)) {
		return KERN_INVALID_NAME;
	}

	ipc_port_t port = IP_NULL;
	kern_return_t kr = KERN_SUCCESS;

	kr = ipc_port_translate_send(current_space(), name, &port);
	if (kr != KERN_SUCCESS) {
		return kr;
	}
	/* port is locked */

	assert(IP_VALID(port));

	struct work_interval *converted_work_interval;

	/* takes a +1 ref on the work interval if the conversion succeeds */
	converted_work_interval = work_interval_port_convert_locked(port);

	/* the port is valid, but doesn't denote a work_interval */
	if (converted_work_interval == NULL) {
		kr = KERN_INVALID_CAPABILITY;
	}

	ip_mq_unlock(port);

	if (kr == KERN_SUCCESS) {
		*work_interval = converted_work_interval;
	}

	return kr;
}
572 
573 
/*
 * work_interval_port_no_senders
 *
 * Description: Handle a no-senders notification for a work interval port.
 *              Destroys the port and releases its reference on the work interval.
 *
 * Parameters:  port     The work interval kobject port.
 *              mscount  The make-send count from the notification.
 *
 * Note: This assumes that there is only one create-right-from-work-interval point,
 *       if the ability to extract another send right after creation is added,
 *       this will have to change to handle make-send counts correctly.
 */
static void
work_interval_port_no_senders(ipc_port_t port, mach_port_mscount_t mscount)
{
	struct work_interval *work_interval = NULL;

	/* destroys the kobject port and returns the associated work interval */
	work_interval = ipc_kobject_dealloc_port(port, mscount,
	    IKOT_WORK_INTERVAL);

	work_interval->wi_port = MACH_PORT_NULL;

	/* drop the port's +1 ref; may deallocate the work interval */
	work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
}
598 
599 /*
600  * work_interval_port_type()
601  *
602  * Converts a port name into the work interval object and returns its type.
603  *
604  * For invalid ports, it returns WORK_INTERVAL_TYPE_LAST (which is not a
605  * valid type for work intervals).
606  */
607 static uint32_t
work_interval_port_type(mach_port_name_t port_name)608 work_interval_port_type(mach_port_name_t port_name)
609 {
610 	struct work_interval *work_interval = NULL;
611 	kern_return_t kr;
612 	uint32_t work_interval_type;
613 
614 	if (port_name == MACH_PORT_NULL) {
615 		return WORK_INTERVAL_TYPE_LAST;
616 	}
617 
618 	kr = port_name_to_work_interval(port_name, &work_interval);
619 	if (kr != KERN_SUCCESS) {
620 		return WORK_INTERVAL_TYPE_LAST;
621 	}
622 	/* work_interval has a +1 ref */
623 
624 	assert(work_interval != NULL);
625 	work_interval_type = work_interval->wi_create_flags & WORK_INTERVAL_TYPE_MASK;
626 	work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
627 	return work_interval_type;
628 }
629 
630 /*
631  * Sparse - not all work interval classes imply a scheduling policy change.
632  * The REALTIME_CRITICAL class *also* requires the thread to have explicitly
633  * adopted the REALTIME sched mode to take effect.
634  */
635 static const struct {
636 	int          priority;
637 	sched_mode_t sched_mode;
638 } work_interval_class_data[WI_CLASS_COUNT] = {
639 	[WI_CLASS_BEST_EFFORT] = {
640 		BASEPRI_DEFAULT,        // 31
641 		TH_MODE_TIMESHARE,
642 	},
643 
644 	[WI_CLASS_APP_SUPPORT] = {
645 		BASEPRI_DEFAULT,        // 31
646 		TH_MODE_TIMESHARE,
647 	},
648 
649 	[WI_CLASS_SYSTEM] = {
650 		BASEPRI_FOREGROUND + 1, // 48
651 		TH_MODE_FIXED,
652 	},
653 
654 	[WI_CLASS_SYSTEM_CRITICAL] = {
655 		MAXPRI_USER + 1,        // 64
656 		TH_MODE_FIXED,
657 	},
658 
659 	[WI_CLASS_REALTIME_CRITICAL] = {
660 		BASEPRI_RTQUEUES + 1,   // 98
661 		TH_MODE_REALTIME,
662 	},
663 };
664 
/*
 * Called when a thread gets its scheduling priority from its associated work
 * interval.
 *
 * Returns the class base priority plus the per-interval offset
 * (wi_class_offset); asserts the result does not exceed MAXPRI.
 */
int
work_interval_get_priority(thread_t thread)
{
	const struct work_interval *work_interval = thread->th_work_interval;
	assert(work_interval != NULL);

	/* The class must be set and must carry a scheduling effect (non-zero base) */
	assert3u(work_interval->wi_class, !=, WI_CLASS_NONE);
	assert3u(work_interval->wi_class, <, WI_CLASS_COUNT);
	int priority = work_interval_class_data[work_interval->wi_class].priority;
	assert(priority != 0);

	priority += work_interval->wi_class_offset;
	assert3u(priority, <=, MAXPRI);

	return priority;
}
685 
/*
 * Switch to a policy driven by the work interval (if applicable).
 *
 * Only takes effect when the active workload config opts threads into
 * work-interval-driven scheduling (WLC_F_THREAD_POLICY) and the work
 * interval's class maps to a real scheduling mode in
 * work_interval_class_data.
 */
static void
work_interval_set_policy(thread_t thread)
{
	assert3p(thread, ==, current_thread());

	/*
	 * Ignore policy changes if the workload context shouldn't affect the
	 * scheduling policy.
	 */
	workload_config_flags_t flags = WLC_F_NONE;

	/* There may be no config at all. That's ok. */
	if (workload_config_get_flags(&flags) != KERN_SUCCESS ||
	    (flags & WLC_F_THREAD_POLICY) == 0) {
		return;
	}

	const struct work_interval *work_interval = thread->th_work_interval;
	assert(work_interval != NULL);

	assert3u(work_interval->wi_class, <, WI_CLASS_COUNT);
	const sched_mode_t mode = work_interval_class_data[work_interval->wi_class].sched_mode;

	/*
	 * A mode of TH_MODE_NONE implies that this work interval has no
	 * associated scheduler effects.
	 */
	if (mode == TH_MODE_NONE) {
		return;
	}

	/* Mark the thread's policy as work-interval driven with the class's mode */
	proc_set_thread_policy_ext(thread, TASK_POLICY_ATTRIBUTE,
	    TASK_POLICY_WI_DRIVEN, true, mode);
	assert(thread->requested_policy.thrp_wi_driven);

	return;
}
726 
/*
 * Clear a work interval driven policy.
 *
 * No-op unless the thread is currently work-interval driven. When
 * clearing, the thread keeps TH_MODE_REALTIME if that is its user-set
 * mode; otherwise it falls back to timeshare.
 */
static void
work_interval_clear_policy(thread_t thread)
{
	assert3p(thread, ==, current_thread());

	if (!thread->requested_policy.thrp_wi_driven) {
		return;
	}

	const sched_mode_t mode = sched_get_thread_mode_user(thread);

	proc_set_thread_policy_ext(thread, TASK_POLICY_ATTRIBUTE,
	    TASK_POLICY_WI_DRIVEN, false,
	    mode == TH_MODE_REALTIME ? mode : TH_MODE_TIMESHARE);

	assert(!thread->requested_policy.thrp_wi_driven);

	return;
}
749 
750 /*
751  * thread_set_work_interval()
752  *
753  * Change thread's bound work interval to the passed-in work interval
754  * Consumes +1 ref on work_interval upon success.
755  *
756  * May also pass NULL to un-set work_interval on the thread
757  * Will deallocate any old work interval on the thread
758  * Return error if thread does not satisfy requirements to join work interval
759  *
760  * For non auto-join work intervals, deallocate any old work interval on the thread
761  * For auto-join work intervals, the routine may wakeup the work interval deferred
762  * deallocation queue since thread locks might be currently held.
763  */
764 static kern_return_t
thread_set_work_interval(thread_t thread,struct work_interval * work_interval,thread_work_interval_options_t options)765 thread_set_work_interval(thread_t thread,
766     struct work_interval *work_interval, thread_work_interval_options_t options)
767 {
768 	/* All explicit work interval operations should always be from the current thread */
769 	if (options & THREAD_WI_EXPLICIT_JOIN_POLICY) {
770 		assert(thread == current_thread());
771 	}
772 
773 	/* All cases of needing the thread lock should be from explicit join scenarios */
774 	if (options & THREAD_WI_THREAD_LOCK_NEEDED) {
775 		assert((options & THREAD_WI_EXPLICIT_JOIN_POLICY) != 0);
776 	}
777 
778 	/* For all cases of auto join must come in with the thread lock held */
779 	if (options & THREAD_WI_AUTO_JOIN_POLICY) {
780 		assert((options & THREAD_WI_THREAD_LOCK_HELD) != 0);
781 	}
782 
783 #if CONFIG_THREAD_GROUPS
784 	if (work_interval && !work_interval->wi_group) {
785 		/* Reject join on work intervals with deferred thread group creation */
786 		return KERN_INVALID_ARGUMENT;
787 	}
788 #endif /* CONFIG_THREAD_GROUPS */
789 
790 	if (work_interval) {
791 		uint32_t work_interval_type = work_interval->wi_create_flags & WORK_INTERVAL_TYPE_MASK;
792 
793 		if (options & THREAD_WI_EXPLICIT_JOIN_POLICY) {
794 			/* Ensure no kern_work_interval_set_workload_id can happen after this point */
795 			uint32_t wlid_flags;
796 			(void)os_atomic_cmpxchgv(&work_interval->wi_wlid_flags, 0,
797 			    WORK_INTERVAL_WORKLOAD_ID_ALREADY_JOINED, &wlid_flags, relaxed);
798 			if (wlid_flags & WORK_INTERVAL_WORKLOAD_ID_RT_ALLOWED) {
799 				/* For workload IDs with rt-allowed, neuter the check below to
800 				 * enable joining before the thread has become realtime for all
801 				 * work interval types */
802 				work_interval_type = WORK_INTERVAL_TYPE_DEFAULT;
803 			}
804 		}
805 
806 		if ((work_interval_type == WORK_INTERVAL_TYPE_COREAUDIO) &&
807 		    (thread->sched_mode != TH_MODE_REALTIME) && (thread->saved_mode != TH_MODE_REALTIME)) {
808 			return KERN_INVALID_ARGUMENT;
809 		}
810 	}
811 
812 	/*
813 	 * Ensure a work interval scheduling policy is not used if the thread is
814 	 * leaving the work interval.
815 	 */
816 	if (work_interval == NULL &&
817 	    (options & THREAD_WI_EXPLICIT_JOIN_POLICY) != 0) {
818 		work_interval_clear_policy(thread);
819 	}
820 
821 	struct work_interval *old_th_wi = thread->th_work_interval;
822 #if CONFIG_SCHED_AUTO_JOIN
823 	spl_t s;
824 	/* Take the thread lock if needed */
825 	if (options & THREAD_WI_THREAD_LOCK_NEEDED) {
826 		s = splsched();
827 		thread_lock(thread);
828 	}
829 
830 	/*
831 	 * Work interval auto-join leak to non-RT threads.
832 	 *
833 	 * If thread might be running on a remote core and it's not in the context switch path (where
834 	 * thread is neither running, blocked or in the runq), its not possible to update the
835 	 * work interval & thread group remotely since its not possible to update CLPC for a remote
836 	 * core. This situation might happen when a thread is transitioning from realtime to
837 	 * non-realtime due to backgrounding etc., which would mean that non-RT threads would now
838 	 * be part of the work interval.
839 	 *
840 	 * Since there is no immediate mitigation to this issue, the policy is to set a new
841 	 * flag on the thread which indicates that such a "leak" has happened. This flag will
842 	 * be cleared when the remote thread eventually blocks and unjoins from the work interval.
843 	 */
844 	bool thread_on_remote_core = ((thread != current_thread()) && (thread->state & TH_RUN) && (thread->runq == PROCESSOR_NULL));
845 
846 	if (thread_on_remote_core && ((options & THREAD_WI_THREAD_CTX_SWITCH) == 0)) {
847 		assert((options & THREAD_WI_THREAD_LOCK_NEEDED) == 0);
848 		os_atomic_or(&thread->th_work_interval_flags, TH_WORK_INTERVAL_FLAGS_AUTO_JOIN_LEAK, relaxed);
849 		return KERN_SUCCESS;
850 	}
851 
852 	const bool old_wi_auto_joined = ((thread->sched_flags & TH_SFLAG_THREAD_GROUP_AUTO_JOIN) != 0);
853 
854 	if ((options & THREAD_WI_AUTO_JOIN_POLICY) || old_wi_auto_joined) {
855 		__kdebug_only uint64_t old_tg_id = (old_th_wi && old_th_wi->wi_group) ? thread_group_get_id(old_th_wi->wi_group) : ~0;
856 		__kdebug_only uint64_t new_tg_id = (work_interval && work_interval->wi_group) ? thread_group_get_id(work_interval->wi_group) : ~0;
857 		KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_WI_AUTO_JOIN),
858 		    thread_tid(thread), old_tg_id, new_tg_id, options);
859 	}
860 
861 	if (old_wi_auto_joined) {
862 		/*
863 		 * If thread was auto-joined to a work interval and is not realtime, make sure it
864 		 * happened due to the "leak" described above.
865 		 */
866 		if (thread->sched_mode != TH_MODE_REALTIME) {
867 			assert((thread->th_work_interval_flags & TH_WORK_INTERVAL_FLAGS_AUTO_JOIN_LEAK) != 0);
868 		}
869 
870 		os_atomic_andnot(&thread->th_work_interval_flags, TH_WORK_INTERVAL_FLAGS_AUTO_JOIN_LEAK, relaxed);
871 		work_interval_auto_join_decrement(old_th_wi, thread);
872 		thread->sched_flags &= ~TH_SFLAG_THREAD_GROUP_AUTO_JOIN;
873 	}
874 
875 #endif /* CONFIG_SCHED_AUTO_JOIN */
876 
877 	KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_WORKGROUP, WORKGROUP_INTERVAL_CHANGE),
878 	    thread_tid(thread), (old_th_wi ? old_th_wi->wi_id : 0), (work_interval ? work_interval->wi_id : 0), !!(options & THREAD_WI_AUTO_JOIN_POLICY));
879 
880 	/* transfer +1 ref to thread */
881 	thread->th_work_interval = work_interval;
882 
883 #if CONFIG_SCHED_AUTO_JOIN
884 
885 	if ((options & THREAD_WI_AUTO_JOIN_POLICY) && work_interval) {
886 		assert(work_interval_auto_join_enabled(work_interval) == true);
887 		thread->sched_flags |= TH_SFLAG_THREAD_GROUP_AUTO_JOIN;
888 	}
889 
890 	if (options & THREAD_WI_THREAD_LOCK_NEEDED) {
891 		thread_unlock(thread);
892 		splx(s);
893 	}
894 #endif /* CONFIG_SCHED_AUTO_JOIN */
895 
896 	/*
897 	 * The thread got a new work interval. It may come with a work interval
898 	 * scheduling policy that needs to be applied.
899 	 */
900 	if (work_interval != NULL &&
901 	    (options & THREAD_WI_EXPLICIT_JOIN_POLICY) != 0) {
902 		work_interval_set_policy(thread);
903 	}
904 
905 #if CONFIG_THREAD_GROUPS
906 	if (work_interval) {
907 		/* Prevent thread_group_set_name after CLPC may have already heard
908 		 * about the thread group */
909 		(void)os_atomic_cmpxchg(&work_interval->wi_group_flags, 0,
910 		    WORK_INTERVAL_GROUP_FLAGS_THREAD_JOINED, relaxed);
911 	}
912 	struct thread_group *new_tg = (work_interval) ? (work_interval->wi_group) : NULL;
913 
914 	if (options & THREAD_WI_AUTO_JOIN_POLICY) {
915 #if CONFIG_SCHED_AUTO_JOIN
916 		thread_set_autojoin_thread_group_locked(thread, new_tg);
917 #endif
918 	} else {
919 		thread_set_work_interval_thread_group(thread, new_tg);
920 	}
921 #endif /* CONFIG_THREAD_GROUPS */
922 
923 	if (options & THREAD_WI_EXPLICIT_JOIN_POLICY) {
924 		/* Construct mask to XOR with th_work_interval_flags to clear the
925 		* currently present flags and set the new flags in wlid_flags. */
926 		uint32_t wlid_flags = 0;
927 		if (work_interval) {
928 			wlid_flags = os_atomic_load(&work_interval->wi_wlid_flags, relaxed);
929 		}
930 		thread_work_interval_flags_t th_wi_xor_mask = os_atomic_load(
931 			&thread->th_work_interval_flags, relaxed);
932 		th_wi_xor_mask &= (TH_WORK_INTERVAL_FLAGS_HAS_WORKLOAD_ID |
933 		    TH_WORK_INTERVAL_FLAGS_RT_ALLOWED);
934 		if (wlid_flags & WORK_INTERVAL_WORKLOAD_ID_HAS_ID) {
935 			th_wi_xor_mask ^= TH_WORK_INTERVAL_FLAGS_HAS_WORKLOAD_ID;
936 			if (wlid_flags & WORK_INTERVAL_WORKLOAD_ID_RT_ALLOWED) {
937 				th_wi_xor_mask ^= TH_WORK_INTERVAL_FLAGS_RT_ALLOWED;
938 			}
939 		}
940 		if (th_wi_xor_mask) {
941 			os_atomic_xor(&thread->th_work_interval_flags, th_wi_xor_mask, relaxed);
942 		}
943 
944 		/*
945 		 * Now that the interval flags have been set, re-evaluate
946 		 * whether the thread needs to be undemoted - the new work
947 		 * interval may have the RT_ALLOWED flag. and the thread may
948 		 * have have a realtime policy but be demoted.
949 		 */
950 		thread_rt_evaluate(thread);
951 	}
952 
953 	if (old_th_wi != NULL) {
954 		work_interval_release(old_th_wi, options);
955 	}
956 
957 	return KERN_SUCCESS;
958 }
959 
/*
 * Explicitly join (or, when work_interval is NULL, leave) a work interval
 * on behalf of the calling thread.  Thin wrapper that always requests the
 * explicit-join policy and asks thread_set_work_interval() to take the
 * thread lock itself.  A +1 ref on work_interval is consumed on success.
 */
static kern_return_t
thread_set_work_interval_explicit_join(thread_t thread, struct work_interval *work_interval)
{
	/* Explicit join is only legal on the calling thread itself. */
	assert(thread == current_thread());
	return thread_set_work_interval(thread, work_interval, THREAD_WI_EXPLICIT_JOIN_POLICY | THREAD_WI_THREAD_LOCK_NEEDED);
}
966 
967 kern_return_t
work_interval_thread_terminate(thread_t thread)968 work_interval_thread_terminate(thread_t thread)
969 {
970 	assert(thread == current_thread());
971 	if (thread->th_work_interval != NULL) {
972 		return thread_set_work_interval(thread, NULL, THREAD_WI_EXPLICIT_JOIN_POLICY | THREAD_WI_THREAD_LOCK_NEEDED);
973 	}
974 	return KERN_SUCCESS;
975 }
976 
977 kern_return_t
kern_work_interval_notify(thread_t thread,struct kern_work_interval_args * kwi_args)978 kern_work_interval_notify(thread_t thread, struct kern_work_interval_args* kwi_args)
979 {
980 	assert(thread == current_thread());
981 	assert(kwi_args->work_interval_id != 0);
982 
983 	struct work_interval *work_interval = thread->th_work_interval;
984 
985 	if (work_interval == NULL ||
986 	    work_interval->wi_id != kwi_args->work_interval_id) {
987 		/* This thread must have adopted the work interval to be able to notify */
988 		return KERN_INVALID_ARGUMENT;
989 	}
990 
991 	task_t notifying_task = current_task();
992 
993 	if (work_interval->wi_creator_uniqueid != get_task_uniqueid(notifying_task) ||
994 	    work_interval->wi_creator_pidversion != get_task_version(notifying_task)) {
995 		/* Only the creating task can do a notify */
996 		return KERN_INVALID_ARGUMENT;
997 	}
998 
999 	spl_t s = splsched();
1000 
1001 #if CONFIG_THREAD_GROUPS
1002 	assert(work_interval->wi_group == thread->thread_group);
1003 #endif /* CONFIG_THREAD_GROUPS */
1004 
1005 	uint64_t urgency_param1, urgency_param2;
1006 	kwi_args->urgency = (uint16_t)thread_get_urgency(thread, &urgency_param1, &urgency_param2);
1007 
1008 	splx(s);
1009 
1010 	/* called without interrupts disabled */
1011 	machine_work_interval_notify(thread, kwi_args);
1012 
1013 	return KERN_SUCCESS;
1014 }
1015 
/*
 * Monotonic generator for work interval IDs, bumped with a relaxed atomic
 * increment in kern_work_interval_create().  Start at 1, 0 is not a valid
 * work interval ID.
 */
static _Atomic uint64_t unique_work_interval_id = 1;
1018 
/*
 * kern_work_interval_create - allocate a new work interval.
 *
 * Depending on create flags the interval is wrapped in a Mach port
 * (WORK_INTERVAL_FLAG_JOINABLE) and/or given its own thread group
 * (WORK_INTERVAL_FLAG_GROUP).  On success, create_params->wica_id is filled
 * in, plus wica_port for joinable intervals.  A non-joinable interval is
 * immediately joined by the calling thread (legacy combined create+join).
 */
kern_return_t
kern_work_interval_create(thread_t thread,
    struct kern_work_interval_create_args *create_params)
{
	assert(thread == current_thread());

	uint32_t create_flags = create_params->wica_create_flags;

	if (((create_flags & WORK_INTERVAL_FLAG_JOINABLE) == 0) &&
	    thread->th_work_interval != NULL) {
		/*
		 * If the thread is doing a legacy combined create and join,
		 * it shouldn't already be part of a work interval.
		 *
		 * (Creating a joinable WI is allowed anytime.)
		 */
		return KERN_FAILURE;
	}

	/*
	 * Check the validity of the create flags before allocating the work
	 * interval.
	 */
	task_t creating_task = current_task();
	if ((create_flags & WORK_INTERVAL_TYPE_MASK) == WORK_INTERVAL_TYPE_CA_CLIENT) {
		/*
		 * CA_CLIENT work intervals do not create new thread groups.
		 * There can only be one CA_CLIENT work interval (created by UIKit or AppKit)
		 * per each application task
		 */
		if (create_flags & WORK_INTERVAL_FLAG_GROUP) {
			return KERN_FAILURE;
		}
		if (!task_is_app(creating_task)) {
#if XNU_TARGET_OS_OSX
			/*
			 * Soft-fail the case of a non-app pretending to be an
			 * app, by allowing it to press the buttons, but they're
			 * not actually connected to anything.
			 */
			create_flags |= WORK_INTERVAL_FLAG_IGNORED;
#else
			/*
			 * On iOS, it's a hard failure to get your apptype
			 * wrong and then try to render something.
			 */
			return KERN_NOT_SUPPORTED;
#endif /* XNU_TARGET_OS_OSX */
		}
		if (task_set_ca_client_wi(creating_task, true) == false) {
			/* The task already holds its single allowed CA_CLIENT interval. */
			return KERN_FAILURE;
		}
	}

#if CONFIG_SCHED_AUTO_JOIN
	/*
	 * Auto-join is only supported for COREAUDIO intervals that carry
	 * their own thread group.
	 *
	 * NOTE(review): if a CA_CLIENT create also requests ENABLE_AUTO_JOIN,
	 * the early returns below leave the per-task bit set by
	 * task_set_ca_client_wi() above — confirm whether that flag
	 * combination is reachable/intended.
	 */
	if (create_flags & WORK_INTERVAL_FLAG_ENABLE_AUTO_JOIN) {
		uint32_t type = (create_flags & WORK_INTERVAL_TYPE_MASK);
		if (type != WORK_INTERVAL_TYPE_COREAUDIO) {
			return KERN_NOT_SUPPORTED;
		}
		if ((create_flags & WORK_INTERVAL_FLAG_GROUP) == 0) {
			return KERN_NOT_SUPPORTED;
		}
	}

	/* Deferred finish is meaningless without auto-join. */
	if (create_flags & WORK_INTERVAL_FLAG_ENABLE_DEFERRED_FINISH) {
		if ((create_flags & WORK_INTERVAL_FLAG_ENABLE_AUTO_JOIN) == 0) {
			return KERN_NOT_SUPPORTED;
		}
	}
#endif /* CONFIG_SCHED_AUTO_JOIN */

	struct work_interval *work_interval = kalloc_type(struct work_interval,
	    Z_WAITOK | Z_ZERO | Z_NOFAIL);

	uint64_t work_interval_id = os_atomic_inc(&unique_work_interval_id, relaxed);

	*work_interval = (struct work_interval) {
		.wi_id                  = work_interval_id,
		.wi_ref_count           = {},
		.wi_create_flags        = create_flags,
		.wi_creator_pid         = pid_from_task(creating_task),
		.wi_creator_uniqueid    = get_task_uniqueid(creating_task),
		.wi_creator_pidversion  = get_task_version(creating_task),
	};
	os_ref_init(&work_interval->wi_ref_count, NULL);

	__kdebug_only uint64_t tg_id = 0;
#if CONFIG_THREAD_GROUPS
	struct thread_group *tg;
	if ((create_flags &
	    (WORK_INTERVAL_FLAG_GROUP | WORK_INTERVAL_FLAG_HAS_WORKLOAD_ID)) ==
	    (WORK_INTERVAL_FLAG_GROUP | WORK_INTERVAL_FLAG_HAS_WORKLOAD_ID)) {
		/* defer creation of the thread group until the
		 * kern_work_interval_set_workload_id() call */
		work_interval->wi_group = NULL;
	} else if (create_flags & WORK_INTERVAL_FLAG_GROUP) {
		/* create a new group for the interval to represent */
		char name[THREAD_GROUP_MAXNAME] = "";

		snprintf(name, sizeof(name), "WI%lld (pid %d)", work_interval_id,
		    work_interval->wi_creator_pid);

		tg = thread_group_create_and_retain(THREAD_GROUP_FLAGS_DEFAULT);

		thread_group_set_name(tg, name);

		work_interval->wi_group = tg;
	} else {
		/* the interval represents the thread's home group */
		tg = thread_group_get_home_group(thread);

		thread_group_retain(tg);

		work_interval->wi_group = tg;
	}

	/* Capture the tg_id for tracing purposes */
	tg_id = work_interval->wi_group ? thread_group_get_id(work_interval->wi_group) : ~0;

#endif /* CONFIG_THREAD_GROUPS */

	if (create_flags & WORK_INTERVAL_FLAG_JOINABLE) {
		mach_port_name_t name = MACH_PORT_NULL;

		/* work_interval has a +1 ref, moves to the port */
		work_interval->wi_port = ipc_kobject_alloc_port(
			(ipc_kobject_t)work_interval, IKOT_WORK_INTERVAL,
			IPC_KOBJECT_ALLOC_MAKE_SEND | IPC_KOBJECT_ALLOC_NSREQUEST);

		name = ipc_port_copyout_send(work_interval->wi_port, current_space());

		if (!MACH_PORT_VALID(name)) {
			/*
			 * copyout failed (port is already deallocated)
			 * Because of the port-destroyed magic,
			 * the work interval is already deallocated too.
			 */
			return KERN_RESOURCE_SHORTAGE;
		}

		create_params->wica_port = name;
	} else {
		/* work_interval has a +1 ref, moves to the thread */
		kern_return_t kr = thread_set_work_interval_explicit_join(thread, work_interval);
		if (kr != KERN_SUCCESS) {
			/* No other thread can join this work interval since it isn't
			 * JOINABLE so release the reference on work interval */
			work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
			return kr;
		}
		create_params->wica_port = MACH_PORT_NULL;
	}

	create_params->wica_id = work_interval_id;

	/* tg_id == ~0 means thread group creation was deferred; skip the trace. */
	if (tg_id != ~0) {
		KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_WORKGROUP, WORKGROUP_INTERVAL_CREATE),
		    work_interval_id, create_flags, pid_from_task(creating_task), tg_id);
	}
	return KERN_SUCCESS;
}
1181 
1182 kern_return_t
kern_work_interval_get_flags_from_port(mach_port_name_t port_name,uint32_t * flags)1183 kern_work_interval_get_flags_from_port(mach_port_name_t port_name, uint32_t *flags)
1184 {
1185 	assert(flags != NULL);
1186 
1187 	kern_return_t kr;
1188 	struct work_interval *work_interval;
1189 
1190 	kr = port_name_to_work_interval(port_name, &work_interval);
1191 	if (kr != KERN_SUCCESS) {
1192 		return kr;
1193 	}
1194 
1195 	assert(work_interval != NULL);
1196 	*flags = work_interval->wi_create_flags;
1197 
1198 	work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
1199 
1200 	return KERN_SUCCESS;
1201 }
1202 
#if CONFIG_THREAD_GROUPS
/* kern_work_interval_set_name() formats the caller's name into a
 * THREAD_GROUP_MAXNAME buffer, so the two size limits must agree. */
_Static_assert(WORK_INTERVAL_NAME_MAX == THREAD_GROUP_MAXNAME,
    "WORK_INTERVAL_NAME_MAX does not match THREAD_GROUP_MAXNAME");
#endif /* CONFIG_THREAD_GROUPS */
1207 
/*
 * kern_work_interval_set_name - rename the thread group that backs a work
 * interval.
 *
 * Renaming is refused once any thread has joined the interval (CLPC may
 * already have observed the group name) or when the interval has no group
 * of its own.  `name` is tagged __unused because it is only consumed when
 * CONFIG_THREAD_GROUPS is built in.
 */
kern_return_t
kern_work_interval_set_name(mach_port_name_t port_name, __unused char *name,
    size_t len)
{
	kern_return_t kr;
	struct work_interval *work_interval;

	/* Bound the caller-supplied length.  NOTE(review): assumes `name` is
	 * NUL-terminated within len — confirm against the trap/caller. */
	if (len > WORK_INTERVAL_NAME_MAX) {
		return KERN_INVALID_ARGUMENT;
	}
	kr = port_name_to_work_interval(port_name, &work_interval);
	if (kr != KERN_SUCCESS) {
		return kr;
	}

	/* Holding a +1 ref from the lookup; released on every exit path below. */
	assert(work_interval != NULL);

#if CONFIG_THREAD_GROUPS
	uint32_t wi_group_flags = os_atomic_load(
		&work_interval->wi_group_flags, relaxed);
	if (wi_group_flags & WORK_INTERVAL_GROUP_FLAGS_THREAD_JOINED) {
		/* Too late: a thread already joined, name is frozen. */
		kr = KERN_INVALID_ARGUMENT;
		goto out;
	}
	if (!work_interval->wi_group) {
		kr = KERN_INVALID_ARGUMENT;
		goto out;
	}

	if (name[0] && (work_interval->wi_create_flags & WORK_INTERVAL_FLAG_GROUP)) {
		/* Prefix with the interval ID; snprintf truncates to fit. */
		char tgname[THREAD_GROUP_MAXNAME];
		snprintf(tgname, sizeof(tgname), "WI%lld %s", work_interval->wi_id,
		    name);
		thread_group_set_name(work_interval->wi_group, tgname);
	}

out:
#endif /* CONFIG_THREAD_GROUPS */
	work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);

	return kr;
}
1250 
/*
 * kern_work_interval_set_workload_id - bind a workload ID name to a
 * joinable work interval, at most once and only before any thread joins.
 *
 * If the name matches an entry in the workload config table, the ID flags,
 * scheduling class and thread group flags come from that entry; otherwise
 * the caller's create flags are validated against the interval's.  A second
 * successful lookup on an already-set interval is treated as a query and
 * returns the previously set flags.
 */
kern_return_t
kern_work_interval_set_workload_id(mach_port_name_t port_name,
    struct kern_work_interval_workload_id_args *workload_id_args)
{
	kern_return_t kr;
	struct work_interval *work_interval;
	uint32_t wlida_flags = 0;
	uint32_t wlid_flags = 0;
#if CONFIG_THREAD_GROUPS
	uint32_t tg_flags = 0;
#endif
	bool from_workload_config = false;

	/* Ensure workload ID name is non-empty. */
	if (!workload_id_args->wlida_name[0]) {
		return KERN_INVALID_ARGUMENT;
	}

	kr = port_name_to_work_interval(port_name, &work_interval);
	if (kr != KERN_SUCCESS) {
		return kr;
	}

	/* +1 ref from the lookup, dropped at `out`. */
	assert(work_interval != NULL);
	if (!(work_interval->wi_create_flags & WORK_INTERVAL_FLAG_JOINABLE)) {
		kr = KERN_INVALID_ARGUMENT;
		goto out;
	}

	if (!(work_interval->wi_create_flags & WORK_INTERVAL_FLAG_HAS_WORKLOAD_ID)) {
		/* Reject work intervals that didn't indicate they will have a workload ID
		 * at creation. In particular if the work interval has its own thread group,
		 * its creation must have been deferred in kern_work_interval_create */
		kr = KERN_INVALID_ARGUMENT;
		goto out;
	}

	workload_config_t wl_config = {};
	kr = workload_config_lookup_default(workload_id_args->wlida_name, &wl_config);
	if (kr == KERN_SUCCESS) {
		/* Config entry found: its interval type must agree with create time. */
		if ((wl_config.wc_create_flags & WORK_INTERVAL_TYPE_MASK) !=
		    (work_interval->wi_create_flags & WORK_INTERVAL_TYPE_MASK)) {
			if ((wl_config.wc_create_flags & WORK_INTERVAL_TYPE_MASK) == WORK_INTERVAL_TYPE_CA_RENDER_SERVER &&
			    (work_interval->wi_create_flags & WORK_INTERVAL_TYPE_MASK) == WORK_INTERVAL_TYPE_FRAME_COMPOSITOR) {
				/* WORK_INTERVAL_TYPE_FRAME_COMPOSITOR is a valid related type of WORK_INTERVAL_TYPE_CA_RENDER_SERVER */
			} else {
				kr = KERN_INVALID_ARGUMENT;
				goto out;
			}
		}

		wlida_flags = wl_config.wc_flags;

		/* RT-critical is never adopted from the config entry here. */
		wlida_flags &= ~WORK_INTERVAL_WORKLOAD_ID_RT_CRITICAL;

#if CONFIG_THREAD_GROUPS
		tg_flags = wl_config.wc_thread_group_flags;
		if (tg_flags != THREAD_GROUP_FLAGS_ABSENT &&
		    (work_interval->wi_create_flags & WORK_INTERVAL_FLAG_GROUP) == 0) {
			/* Config asks for group flags but the interval has no own group. */
			kr = KERN_INVALID_ARGUMENT;
			goto out;
		}
#endif /* CONFIG_THREAD_GROUPS */

		from_workload_config = true;
	} else {
		/* If the workload is not present in the table, perform basic validation
		 * that the create flags passed in match the ones used at work interval
		 * create time */
		if ((workload_id_args->wlida_wicreate_flags & WORK_INTERVAL_TYPE_MASK) !=
		    (work_interval->wi_create_flags & WORK_INTERVAL_TYPE_MASK)) {
			kr = KERN_INVALID_ARGUMENT;
			goto out;
		}

		const bool wc_avail = workload_config_available();
		if (!wc_avail) {
			/* No config was loaded at all: default to allowing realtime. */
			wlida_flags = WORK_INTERVAL_WORKLOAD_ID_RT_ALLOWED;
		}

		/*
		 * If the workload config wasn't even loaded then fallback to
		 * older behaviour where the new thread group gets the default
		 * thread group flags (when WORK_INTERVAL_FLAG_GROUP is set).
		 */
#if CONFIG_THREAD_GROUPS
		if (!wc_avail) {
			tg_flags = THREAD_GROUP_FLAGS_DEFAULT;
		} else {
			struct thread_group *home_group =
			    thread_group_get_home_group(current_thread());
			if (home_group != NULL) {
				tg_flags = thread_group_get_flags(home_group);
			}
		}
#endif /* CONFIG_THREAD_GROUPS */
	}

	/* Always report the create-time flags back to the caller. */
	workload_id_args->wlida_wicreate_flags = work_interval->wi_create_flags;

	/* cmpxchg a non-zero workload ID flags value (indicating that workload ID
	 * has been set). */
	wlida_flags |= WORK_INTERVAL_WORKLOAD_ID_HAS_ID;
	if (os_atomic_cmpxchgv(&work_interval->wi_wlid_flags, 0, wlida_flags,
	    &wlid_flags, relaxed)) {
		/* We won the race: this call owns first-time ID setup. */
		if (from_workload_config) {
			work_interval->wi_class = wl_config.wc_class;
			work_interval->wi_class_offset = wl_config.wc_class_offset;
		}
#if CONFIG_THREAD_GROUPS
		if (work_interval->wi_create_flags & WORK_INTERVAL_FLAG_GROUP) {
			/* Perform deferred thread group creation, now that tgflags are known */
			struct thread_group *tg;
			tg = thread_group_create_and_retain(tg_flags == THREAD_GROUP_FLAGS_ABSENT ?
			    THREAD_GROUP_FLAGS_DEFAULT : tg_flags);

			char tgname[THREAD_GROUP_MAXNAME] = "";
			snprintf(tgname, sizeof(tgname), "WI%lld %s", work_interval->wi_id,
			    workload_id_args->wlida_name);
			thread_group_set_name(tg, tgname);

			assert(work_interval->wi_group == NULL);
			work_interval->wi_group = tg;

			KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_WORKGROUP, WORKGROUP_INTERVAL_CREATE),
			    work_interval->wi_id, work_interval->wi_create_flags,
			    work_interval->wi_creator_pid, thread_group_get_id(tg));
		}
#endif /* CONFIG_THREAD_GROUPS */
	} else {
		/* Workload ID has previously been set (or a thread has already joined). */
		if (wlid_flags & WORK_INTERVAL_WORKLOAD_ID_ALREADY_JOINED) {
			kr = KERN_INVALID_ARGUMENT;
			goto out;
		}
		/* Treat this request as a query for the out parameters of the ID */
		workload_id_args->wlida_flags = wlid_flags;
		/* NOTE(review): the first-set path above never writes
		 * wlida_flags back to the caller — confirm that asymmetry
		 * is intended. */
	}

	/*
	 * Emit tracepoints for successfully setting the workload ID.
	 *
	 * After rdar://89342390 has been fixed and a new work interval ktrace
	 * provider has been added, it will be possible to associate a numeric
	 * ID with an ID name. Thus, for those cases where the ID name has been
	 * looked up successfully (`from_workload_config` is true) it will no
	 * longer be necessary to emit a tracepoint with the full ID name.
	 */
	KDBG(MACHDBG_CODE(DBG_MACH_WORKGROUP, WORKGROUP_INTERVAL_SET_WORKLOAD_ID),
	    work_interval->wi_id, from_workload_config);
	kernel_debug_string_simple(
		MACHDBG_CODE(DBG_MACH_WORKGROUP, WORKGROUP_INTERVAL_SET_WORKLOAD_ID_NAME),
		workload_id_args->wlida_name);

	kr = KERN_SUCCESS;

out:
	work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);

	return kr;
}
1412 
1413 
1414 kern_return_t
kern_work_interval_destroy(thread_t thread,uint64_t work_interval_id)1415 kern_work_interval_destroy(thread_t thread, uint64_t work_interval_id)
1416 {
1417 	if (work_interval_id == 0) {
1418 		return KERN_INVALID_ARGUMENT;
1419 	}
1420 
1421 	if (thread->th_work_interval == NULL ||
1422 	    thread->th_work_interval->wi_id != work_interval_id) {
1423 		/* work ID isn't valid or doesn't match joined work interval ID */
1424 		return KERN_INVALID_ARGUMENT;
1425 	}
1426 
1427 	return thread_set_work_interval_explicit_join(thread, NULL);
1428 }
1429 
1430 kern_return_t
kern_work_interval_join(thread_t thread,mach_port_name_t port_name)1431 kern_work_interval_join(thread_t            thread,
1432     mach_port_name_t    port_name)
1433 {
1434 	struct work_interval *work_interval = NULL;
1435 	kern_return_t kr;
1436 
1437 	if (port_name == MACH_PORT_NULL) {
1438 		/* 'Un-join' the current work interval */
1439 		return thread_set_work_interval_explicit_join(thread, NULL);
1440 	}
1441 
1442 	kr = port_name_to_work_interval(port_name, &work_interval);
1443 	if (kr != KERN_SUCCESS) {
1444 		return kr;
1445 	}
1446 	/* work_interval has a +1 ref */
1447 
1448 	assert(work_interval != NULL);
1449 
1450 	kr = thread_set_work_interval_explicit_join(thread, work_interval);
1451 	/* ref was consumed by passing it to the thread in the successful case */
1452 	if (kr != KERN_SUCCESS) {
1453 		work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
1454 	}
1455 	return kr;
1456 }
1457 
1458 /*
1459  * work_interval_port_type_render_server()
1460  *
1461  * Helper routine to determine if the port points to a
1462  * WORK_INTERVAL_TYPE_CA_RENDER_SERVER work interval.
1463  */
1464 bool
work_interval_port_type_render_server(mach_port_name_t port_name)1465 work_interval_port_type_render_server(mach_port_name_t port_name)
1466 {
1467 	return work_interval_port_type(port_name) == WORK_INTERVAL_TYPE_CA_RENDER_SERVER;
1468 }
1469