/*
 * Copyright (c) 2017 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */


#include <sys/work_interval.h>

#include <kern/work_interval.h>

#include <kern/thread.h>
#include <kern/sched_prim.h>
#include <kern/machine.h>
#include <kern/thread_group.h>
#include <kern/ipc_kobject.h>
#include <kern/task.h>
#include <kern/coalition.h>
#include <kern/policy_internal.h>
#include <kern/mpsc_queue.h>
#include <kern/workload_config.h>
#include <kern/assert.h>

#include <mach/kern_return.h>
#include <mach/notify.h>
#include <os/refcnt.h>

/*
 * With the introduction of auto-join work intervals, it is possible
 * to change the work interval (and related thread group) of a thread in a
 * variety of contexts (thread termination, context switch, thread mode
 * change etc.). In order to clearly specify the policy expectation and
 * the locking behavior, all calls to thread_set_work_interval() pass
 * in a set of flags.
 */

__options_decl(thread_work_interval_options_t, uint32_t, {
	/* Change the work interval using the explicit join rules */
	THREAD_WI_EXPLICIT_JOIN_POLICY = 0x1,
	/* Change the work interval using the auto-join rules */
	THREAD_WI_AUTO_JOIN_POLICY     = 0x2,
	/* Caller already holds the thread lock */
	THREAD_WI_THREAD_LOCK_HELD     = 0x4,
	/* Caller does not hold the thread lock */
	THREAD_WI_THREAD_LOCK_NEEDED   = 0x8,
	/* Change the work interval from the context switch path (thread may not be running or on a runq) */
	THREAD_WI_THREAD_CTX_SWITCH    = 0x10,
});
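
/*
 * For example (see the call sites below): an explicit join from the current
 * thread passes THREAD_WI_EXPLICIT_JOIN_POLICY | THREAD_WI_THREAD_LOCK_NEEDED,
 * while the context-switch auto-join path passes THREAD_WI_AUTO_JOIN_POLICY |
 * THREAD_WI_THREAD_LOCK_HELD | THREAD_WI_THREAD_CTX_SWITCH.
 */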

static kern_return_t thread_set_work_interval(thread_t, struct work_interval *, thread_work_interval_options_t);
static void work_interval_port_no_senders(ipc_port_t, mach_port_mscount_t);

IPC_KOBJECT_DEFINE(IKOT_WORK_INTERVAL,
    .iko_op_movable_send = true,
    .iko_op_stable     = true,
    .iko_op_no_senders = work_interval_port_no_senders);

#if CONFIG_SCHED_AUTO_JOIN
/* MPSC queue used to defer deallocation of work intervals */
static struct mpsc_daemon_queue work_interval_deallocate_queue;

static void work_interval_deferred_release(struct work_interval *);

/*
 * Work Interval Auto-Join Status
 *
 * work_interval_auto_join_status_t represents the state of auto-join for a given work interval.
 * It packs the following information:
 * - A bit representing if a "finish" is deferred on the work interval
 * - A count of the number of threads auto-joined to the work interval
 */
#define WORK_INTERVAL_STATUS_DEFERRED_FINISH_MASK    ((uint32_t)(1 << 31))
#define WORK_INTERVAL_STATUS_AUTO_JOIN_COUNT_MASK    ((uint32_t)(WORK_INTERVAL_STATUS_DEFERRED_FINISH_MASK - 1))
#define WORK_INTERVAL_STATUS_AUTO_JOIN_COUNT_MAX     WORK_INTERVAL_STATUS_AUTO_JOIN_COUNT_MASK
typedef uint32_t work_interval_auto_join_status_t;

static inline bool __unused
work_interval_status_deferred_finish(work_interval_auto_join_status_t status)
{
	return (status & WORK_INTERVAL_STATUS_DEFERRED_FINISH_MASK) ? true : false;
}

static inline uint32_t __unused
work_interval_status_auto_join_count(work_interval_auto_join_status_t status)
{
	return (uint32_t)(status & WORK_INTERVAL_STATUS_AUTO_JOIN_COUNT_MASK);
}
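
/*
 * Illustrative sketch, not used by any kernel logic: a status value of
 * (WORK_INTERVAL_STATUS_DEFERRED_FINISH_MASK | 3) encodes "finish deferred"
 * with three threads currently auto-joined.
 */
static inline void __unused
work_interval_status_example(void)
{
	work_interval_auto_join_status_t status =
	    WORK_INTERVAL_STATUS_DEFERRED_FINISH_MASK | 3;
	assert(work_interval_status_deferred_finish(status));
	assert(work_interval_status_auto_join_count(status) == 3);
}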

/*
 * struct work_interval_deferred_finish_state
 *
 * Contains the parameters of the finish operation which is being deferred.
 */
struct work_interval_deferred_finish_state {
	uint64_t instance_id;
	uint64_t start;
	uint64_t deadline;
	uint64_t complexity;
};

struct work_interval_auto_join_info {
	struct work_interval_deferred_finish_state deferred_finish_state;
	work_interval_auto_join_status_t _Atomic status;
};
#endif /* CONFIG_SCHED_AUTO_JOIN */

#if CONFIG_THREAD_GROUPS
/* Flags atomically set in wi_group_flags */
#define WORK_INTERVAL_GROUP_FLAGS_THREAD_JOINED 0x1
#endif

/*
 * Work Interval struct
 *
 * This struct represents a thread group and/or work interval context
 * in a mechanism that is represented with a kobject.
 *
 * Every thread that has joined a WI has a +1 ref, and the port
 * has a +1 ref as well.
 *
 * TODO: groups need to have an 'is for WI' flag
 *      and they need a flag to create that says 'for WI'
 *      This would allow CLPC to avoid allocating WI support
 *      data unless it is needed
 *
 * TODO: Enforce not having more than one non-group joinable work
 *      interval per thread group.
 *      CLPC only wants to see one WI-notify callout per group.
 */
struct work_interval {
	uint64_t wi_id;
	struct os_refcnt wi_ref_count;
	uint32_t wi_create_flags;

	/* for debugging purposes only, does not hold a ref on port */
	ipc_port_t wi_port;

	/*
	 * holds uniqueid and version of creating process,
	 * used to permission-gate notify
	 * TODO: you'd think there would be a better way to do this
	 */
	uint64_t wi_creator_uniqueid;
	uint32_t wi_creator_pid;
	int wi_creator_pidversion;

	/* flags set by work_interval_set_workload_id and reflected onto
	 *  thread->th_work_interval_flags upon join */
	uint32_t wi_wlid_flags;

#if CONFIG_THREAD_GROUPS
	uint32_t wi_group_flags;
	struct thread_group *wi_group;  /* holds +1 ref on group */
#endif /* CONFIG_THREAD_GROUPS */

#if CONFIG_SCHED_AUTO_JOIN
	/* Information related to auto-join and deferred finish for work interval */
	struct work_interval_auto_join_info wi_auto_join_info;

	/*
	 * Since the deallocation of auto-join work intervals
	 * can happen in the scheduler when the last thread in
	 * the WI blocks and the thread lock is held, the deallocation
	 * might have to be done on a separate thread.
	 */
	struct mpsc_queue_chain   wi_deallocate_link;
#endif /* CONFIG_SCHED_AUTO_JOIN */

	/*
	 * Work interval class info - determines thread priority for threads
	 * with a work interval driven policy.
	 */
	wi_class_t wi_class;
	uint8_t wi_class_offset;

	struct recount_work_interval wi_recount;
};

/*
 * work_interval_telemetry_data_enabled()
 *
 * Helper routine to check if work interval has the collection of telemetry data enabled.
 */
static inline bool
work_interval_telemetry_data_enabled(struct work_interval *work_interval)
{
	return (work_interval->wi_create_flags & WORK_INTERVAL_FLAG_ENABLE_TELEMETRY_DATA) != 0;
}


/*
 * work_interval_get_recount_tracks()
 *
 * Returns the recount tracks associated with a work interval, or NULL
 * if the work interval is NULL or has telemetry disabled.
 */
inline struct recount_track *
work_interval_get_recount_tracks(struct work_interval *work_interval)
{
	if (work_interval != NULL && work_interval_telemetry_data_enabled(work_interval)) {
		return work_interval->wi_recount.rwi_current_instance;
	}
	return NULL;
}

#if CONFIG_SCHED_AUTO_JOIN

/*
 * work_interval_perform_deferred_finish()
 *
 * Perform a deferred finish for a work interval. The routine accepts the deferred_finish_state as an
 * argument rather than looking at the work_interval since the deferred finish can race with another
 * start-finish cycle. To address that, the caller ensures that it gets a consistent snapshot of the
 * deferred state before calling this routine. This allows the racing start-finish cycle to overwrite
 * the deferred state without issues.
 */
static inline void
work_interval_perform_deferred_finish(__unused struct work_interval_deferred_finish_state *deferred_finish_state,
    __unused struct work_interval *work_interval, __unused thread_t thread)
{
	KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_WI_DEFERRED_FINISH),
	    thread_tid(thread), thread_group_get_id(work_interval->wi_group));
}

/*
 * work_interval_auto_join_increment()
 *
 * Routine to increment the auto-join counter when a new thread is auto-joined to
 * the work interval.
 */
static void
work_interval_auto_join_increment(struct work_interval *work_interval)
{
	struct work_interval_auto_join_info *join_info = &work_interval->wi_auto_join_info;
	__assert_only work_interval_auto_join_status_t old_status = os_atomic_add_orig(&join_info->status, 1, relaxed);
	assert(work_interval_status_auto_join_count(old_status) < WORK_INTERVAL_STATUS_AUTO_JOIN_COUNT_MAX);
}

/*
 * work_interval_auto_join_decrement()
 *
 * Routine to decrement the auto-join counter when a thread unjoins the work interval (due to
 * blocking or termination). If this was the last auto-joined thread in the work interval and
 * there was a deferred finish, performs the finish operation for the work interval.
 */
static void
work_interval_auto_join_decrement(struct work_interval *work_interval, thread_t thread)
{
	struct work_interval_auto_join_info *join_info = &work_interval->wi_auto_join_info;
	work_interval_auto_join_status_t old_status, new_status;
	struct work_interval_deferred_finish_state deferred_finish_state;
	bool perform_finish;

	/* Update the auto-join count for the work interval atomically */
	os_atomic_rmw_loop(&join_info->status, old_status, new_status, acquire, {
		perform_finish = false;
		new_status = old_status;
		assert(work_interval_status_auto_join_count(old_status) > 0);
		new_status -= 1;
		if (new_status == WORK_INTERVAL_STATUS_DEFERRED_FINISH_MASK) {
		        /* No auto-joined threads remaining and finish is deferred */
		        new_status = 0;
		        perform_finish = true;
		        /*
		         * It's important to copy the deferred finish state here so that this works
		         * when racing with another start-finish cycle.
		         */
		        deferred_finish_state = join_info->deferred_finish_state;
		}
	});

	if (perform_finish == true) {
		/*
		 * Since work_interval_perform_deferred_finish() calls down to
		 * the machine layer callout for finish which gets the thread
		 * group from the thread passed in here, it is important to
		 * make sure that the thread still has the work interval thread
		 * group here.
		 */
		assert(thread->thread_group == work_interval->wi_group);
		work_interval_perform_deferred_finish(&deferred_finish_state, work_interval, thread);
	}
}

/*
 * work_interval_auto_join_enabled()
 *
 * Helper routine to check if work interval has auto-join enabled.
 */
static inline bool
work_interval_auto_join_enabled(struct work_interval *work_interval)
{
	return (work_interval->wi_create_flags & WORK_INTERVAL_FLAG_ENABLE_AUTO_JOIN) != 0;
}

/*
 * work_interval_deferred_finish_enabled()
 *
 * Helper routine to check if work interval has deferred finish enabled.
 */
static inline bool __unused
work_interval_deferred_finish_enabled(struct work_interval *work_interval)
{
	return (work_interval->wi_create_flags & WORK_INTERVAL_FLAG_ENABLE_DEFERRED_FINISH) != 0;
}

#endif /* CONFIG_SCHED_AUTO_JOIN */

static inline void
work_interval_retain(struct work_interval *work_interval)
{
	/*
	 * Even though wi_retain is called under a port lock, we have
	 * to use os_ref_retain instead of os_ref_retain_locked
	 * because wi_release is not synchronized. wi_release calls
	 * os_ref_release which is unsafe to pair with os_ref_retain_locked.
	 */
	os_ref_retain(&work_interval->wi_ref_count);
}

static inline void
work_interval_deallocate(struct work_interval *work_interval)
{
	KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_WORKGROUP, WORKGROUP_INTERVAL_DESTROY),
	    work_interval->wi_id);
	if (work_interval_telemetry_data_enabled(work_interval)) {
		recount_work_interval_deinit(&work_interval->wi_recount);
	}
	kfree_type(struct work_interval, work_interval);
}

/*
 * work_interval_release()
 *
 * Routine to release a ref count on the work interval. If the refcount goes down
 * to zero, the work interval needs to be de-allocated.
 *
 * Non auto-join work intervals are de-allocated in this context.
 *
 * For auto-join work intervals, the de-allocation cannot be done from this context
 * since that might need the kernel memory allocator lock. In that case, the
 * deallocation is done via a thread-call based mpsc queue.
 */
static void
work_interval_release(struct work_interval *work_interval, __unused thread_work_interval_options_t options)
{
	if (os_ref_release(&work_interval->wi_ref_count) == 0) {
#if CONFIG_SCHED_AUTO_JOIN
		if (options & THREAD_WI_THREAD_LOCK_HELD) {
			work_interval_deferred_release(work_interval);
		} else {
			work_interval_deallocate(work_interval);
		}
#else /* CONFIG_SCHED_AUTO_JOIN */
		work_interval_deallocate(work_interval);
#endif /* CONFIG_SCHED_AUTO_JOIN */
	}
}
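
/*
 * Note (illustrative): the THREAD_WI_THREAD_LOCK_HELD case is reached when
 * thread_set_work_interval() drops a thread's previous work interval from a
 * scheduler context such as the auto-join unwind on block, where calling the
 * allocator directly would be unsafe; deallocation is then routed through the
 * deferred MPSC queue.
 */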

void
kern_work_interval_release(struct work_interval *work_interval)
{
	work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
}

#if CONFIG_SCHED_AUTO_JOIN

/*
 * work_interval_deferred_release()
 *
 * Routine to enqueue the work interval on the deallocation mpsc queue.
 */
static void
work_interval_deferred_release(struct work_interval *work_interval)
{
	mpsc_daemon_enqueue(&work_interval_deallocate_queue,
	    &work_interval->wi_deallocate_link, MPSC_QUEUE_NONE);
}

/*
 * work_interval_should_propagate()
 *
 * Main policy routine to decide if a thread should be auto-joined to
 * another thread's work interval. The conditions are arranged such that
 * the most common bailout conditions are checked earliest. This routine
 * is called from the scheduler context; so it needs to be efficient and
 * be careful when taking locks or performing wakeups.
 */
inline bool
work_interval_should_propagate(thread_t cthread, thread_t thread)
{
	/* Only allow propagation if the current thread has a work interval and the woken-up thread does not */
	if ((cthread->th_work_interval == NULL) || (thread->th_work_interval != NULL)) {
		return false;
	}

	/* Only propagate work intervals which have auto-join enabled */
	if (work_interval_auto_join_enabled(cthread->th_work_interval) == false) {
		return false;
	}

	/* Work interval propagation is enabled for realtime threads only */
	if ((cthread->sched_mode != TH_MODE_REALTIME) || (thread->sched_mode != TH_MODE_REALTIME)) {
		return false;
	}

	/* Work interval propagation only works for threads with the same home thread group */
	struct thread_group *thread_home_tg = thread_group_get_home_group(thread);
	if (thread_group_get_home_group(cthread) != thread_home_tg) {
		return false;
	}

	/* If the woken-up thread has adopted a voucher with another thread group, it does not get propagation */
	if (thread->thread_group != thread_home_tg) {
		return false;
	}

	/* If either thread is inactive (in the termination path), do not propagate auto-join */
	if ((!cthread->active) || (!thread->active)) {
		return false;
	}

	return true;
}
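
/*
 * Illustrative scenario: since auto-join can only be enabled on
 * WORK_INTERVAL_TYPE_COREAUDIO intervals (see kern_work_interval_create()),
 * the typical propagation case is a realtime audio rendering thread waking a
 * realtime helper thread from the same home thread group; the helper is
 * pulled into the waker's work interval until it blocks and unwinds.
 */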

/*
 * work_interval_auto_join_propagate()
 *
 * Routine to auto-join a thread into another thread's work interval
 *
 * Should only be invoked if work_interval_should_propagate() returns
 * true. Also expects "from" thread to be current thread and "to" thread
 * to be locked.
 */
void
work_interval_auto_join_propagate(thread_t from, thread_t to)
{
	assert(from == current_thread());
	work_interval_retain(from->th_work_interval);
	work_interval_auto_join_increment(from->th_work_interval);
	__assert_only kern_return_t kr = thread_set_work_interval(to, from->th_work_interval,
	    THREAD_WI_AUTO_JOIN_POLICY | THREAD_WI_THREAD_LOCK_HELD | THREAD_WI_THREAD_CTX_SWITCH);
	assert(kr == KERN_SUCCESS);
}

/*
 * work_interval_auto_join_unwind()
 *
 * Routine to un-join an auto-joined work interval for a thread that is blocking.
 *
 * Expects thread to be locked.
 */
void
work_interval_auto_join_unwind(thread_t thread)
{
	__assert_only kern_return_t kr = thread_set_work_interval(thread, NULL,
	    THREAD_WI_AUTO_JOIN_POLICY | THREAD_WI_THREAD_LOCK_HELD | THREAD_WI_THREAD_CTX_SWITCH);
	assert(kr == KERN_SUCCESS);
}

/*
 * work_interval_auto_join_demote()
 *
 * Routine to un-join an auto-joined work interval when a thread is changing from
 * realtime to non-realtime scheduling mode. This could happen due to multiple
 * reasons such as RT failsafe, thread backgrounding or thread termination. Also,
 * the thread being demoted may not be the current thread.
 *
 * Expects thread to be locked.
 */
void
work_interval_auto_join_demote(thread_t thread)
{
	__assert_only kern_return_t kr = thread_set_work_interval(thread, NULL,
	    THREAD_WI_AUTO_JOIN_POLICY | THREAD_WI_THREAD_LOCK_HELD);
	assert(kr == KERN_SUCCESS);
}

static void
work_interval_deallocate_queue_invoke(mpsc_queue_chain_t e,
    __assert_only mpsc_daemon_queue_t dq)
{
	struct work_interval *work_interval = NULL;
	work_interval = mpsc_queue_element(e, struct work_interval, wi_deallocate_link);
	assert(dq == &work_interval_deallocate_queue);
	assert(os_ref_get_count(&work_interval->wi_ref_count) == 0);
	work_interval_deallocate(work_interval);
}

#endif /* CONFIG_SCHED_AUTO_JOIN */

#if CONFIG_SCHED_AUTO_JOIN
__startup_func
static void
work_interval_subsystem_init(void)
{
	/*
	 * The work interval deallocation queue must be a thread call based queue
	 * because it is woken up from contexts where the thread lock is held. The
	 * only way to perform wakeups safely in those contexts is to wakeup a
	 * thread call which is guaranteed to be on a different waitq and would
	 * not hash onto the same global waitq which might be currently locked.
	 */
	mpsc_daemon_queue_init_with_thread_call(&work_interval_deallocate_queue,
	    work_interval_deallocate_queue_invoke, THREAD_CALL_PRIORITY_KERNEL,
	    MPSC_DAEMON_INIT_NONE);
}
STARTUP(THREAD_CALL, STARTUP_RANK_MIDDLE, work_interval_subsystem_init);
#endif /* CONFIG_SCHED_AUTO_JOIN */

/*
 * work_interval_port_convert
 *
 * Called with port locked, returns reference to work interval
 * if indeed the port is a work interval kobject port
 */
static struct work_interval *
work_interval_port_convert_locked(ipc_port_t port)
{
	struct work_interval *work_interval = NULL;

	if (IP_VALID(port)) {
		work_interval = ipc_kobject_get_stable(port, IKOT_WORK_INTERVAL);
		if (work_interval) {
			work_interval_retain(work_interval);
		}
	}

	return work_interval;
}

/*
 * port_name_to_work_interval
 *
 * Description: Obtain a reference to the work_interval associated with a given port.
 *
 * Parameters:  name    A Mach port name to translate.
 *
 * Returns:     KERN_SUCCESS    *work_interval holds a +1 reference on the
 *                              work_interval associated with the port.
 *              Otherwise       The port name was invalid or did not denote
 *                              a work_interval.
 */
static kern_return_t
port_name_to_work_interval(mach_port_name_t     name,
    struct work_interval **work_interval)
{
	if (!MACH_PORT_VALID(name)) {
		return KERN_INVALID_NAME;
	}

	ipc_port_t port = IP_NULL;
	kern_return_t kr = KERN_SUCCESS;

	kr = ipc_port_translate_send(current_space(), name, &port);
	if (kr != KERN_SUCCESS) {
		return kr;
	}
	/* port is locked */

	assert(IP_VALID(port));

	struct work_interval *converted_work_interval;

	converted_work_interval = work_interval_port_convert_locked(port);

	/* the port is valid, but doesn't denote a work_interval */
	if (converted_work_interval == NULL) {
		kr = KERN_INVALID_CAPABILITY;
	}

	ip_mq_unlock(port);

	if (kr == KERN_SUCCESS) {
		*work_interval = converted_work_interval;
	}

	return kr;
}

kern_return_t
kern_port_name_to_work_interval(mach_port_name_t name,
    struct work_interval **work_interval)
{
	return port_name_to_work_interval(name, work_interval);
}

/*
 * work_interval_port_no_senders
 *
 * Description: Handle a no-senders notification for a work interval port.
 *              Destroys the port and releases its reference on the work interval.
 *
 * Parameters:  port    The work interval port receiving the notification.
 *              mscount The make-send count from the notification.
 *
 * Note: This assumes that there is only one create-right-from-work-interval point,
 *       if the ability to extract another send right after creation is added,
 *       this will have to change to handle make-send counts correctly.
 */
static void
work_interval_port_no_senders(ipc_port_t port, mach_port_mscount_t mscount)
{
	struct work_interval *work_interval = NULL;

	work_interval = ipc_kobject_dealloc_port(port, mscount,
	    IKOT_WORK_INTERVAL);

	work_interval->wi_port = MACH_PORT_NULL;

	work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
}

/*
 * work_interval_port_type()
 *
 * Converts a port name into the work interval object and returns its type.
 *
 * For invalid ports, it returns WORK_INTERVAL_TYPE_LAST (which is not a
 * valid type for work intervals).
 */
static uint32_t
work_interval_port_type(mach_port_name_t port_name)
{
	struct work_interval *work_interval = NULL;
	kern_return_t kr;
	uint32_t work_interval_type;

	if (port_name == MACH_PORT_NULL) {
		return WORK_INTERVAL_TYPE_LAST;
	}

	kr = port_name_to_work_interval(port_name, &work_interval);
	if (kr != KERN_SUCCESS) {
		return WORK_INTERVAL_TYPE_LAST;
	}
	/* work_interval has a +1 ref */

	assert(work_interval != NULL);
	work_interval_type = work_interval->wi_create_flags & WORK_INTERVAL_TYPE_MASK;
	work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
	return work_interval_type;
}

/*
 * Sparse - not all work interval classes imply a scheduling policy change.
 * The REALTIME_CRITICAL class *also* requires the thread to have explicitly
 * adopted the REALTIME sched mode to take effect.
 */
static const struct {
	int          priority;
	sched_mode_t sched_mode;
} work_interval_class_data[WI_CLASS_COUNT] = {
	[WI_CLASS_BEST_EFFORT] = {
		BASEPRI_DEFAULT,        // 31
		TH_MODE_TIMESHARE,
	},

	[WI_CLASS_APP_SUPPORT] = {
		BASEPRI_USER_INITIATED, // 37
		TH_MODE_TIMESHARE,
	},

	[WI_CLASS_SYSTEM] = {
		BASEPRI_FOREGROUND + 1, // 48
		TH_MODE_FIXED,
	},

	[WI_CLASS_SYSTEM_CRITICAL] = {
		MAXPRI_USER + 1,        // 64
		TH_MODE_FIXED,
	},

	[WI_CLASS_REALTIME_CRITICAL] = {
		BASEPRI_RTQUEUES + 1,   // 98
		TH_MODE_REALTIME,
	},
};

/*
 * Called when a thread gets its scheduling priority from its associated work
 * interval.
 */
int
work_interval_get_priority(thread_t thread)
{
	const struct work_interval *work_interval = thread->th_work_interval;
	assert(work_interval != NULL);

	assert3u(work_interval->wi_class, !=, WI_CLASS_NONE);
	assert3u(work_interval->wi_class, <, WI_CLASS_COUNT);
	int priority = work_interval_class_data[work_interval->wi_class].priority;
	assert(priority != 0);

	priority += work_interval->wi_class_offset;
	assert3u(priority, <=, MAXPRI);

	return priority;
}
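
/*
 * For example (illustrative): a thread whose work interval is
 * WI_CLASS_SYSTEM (base priority 48) with a wi_class_offset of 2 computes
 * an effective priority of 50; the corresponding sched mode for that class
 * is TH_MODE_FIXED.
 */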

kern_return_t
kern_work_interval_get_policy(struct work_interval *work_interval,
    integer_t *policy,
    integer_t *priority)
{
	if (!work_interval || !priority || !policy) {
		return KERN_INVALID_ARGUMENT;
	}

	assert3u(work_interval->wi_class, <, WI_CLASS_COUNT);

	const sched_mode_t mode = work_interval_class_data[work_interval->wi_class].sched_mode;
	if ((mode == TH_MODE_TIMESHARE) || (mode == TH_MODE_FIXED)) {
		*policy = ((mode == TH_MODE_TIMESHARE)? POLICY_TIMESHARE: POLICY_RR);
		*priority = work_interval_class_data[work_interval->wi_class].priority;
		assert(*priority != 0);
		*priority += work_interval->wi_class_offset;
		assert3u(*priority, <=, MAXPRI);
	} /* No sched mode change for REALTIME (threads must explicitly opt-in) */
	return KERN_SUCCESS;
}

#if CONFIG_THREAD_GROUPS
kern_return_t
kern_work_interval_get_thread_group(struct work_interval *work_interval,
    struct thread_group **tg)
{
	if (!work_interval || !tg) {
		return KERN_INVALID_ARGUMENT;
	}
	if (work_interval->wi_group) {
		*tg = thread_group_retain(work_interval->wi_group);
		return KERN_SUCCESS;
	} else {
		return KERN_INVALID_ARGUMENT;
	}
}
#endif /* CONFIG_THREAD_GROUPS */

/*
 * Switch to a policy driven by the work interval (if applicable).
 */
static void
work_interval_set_policy(thread_t thread)
{
	assert3p(thread, ==, current_thread());

	/*
	 * Ignore policy changes if the workload context shouldn't affect the
	 * scheduling policy.
	 */
	workload_config_flags_t flags = WLC_F_NONE;

	/* There may be no config at all. That's ok. */
	if (workload_config_get_flags(&flags) != KERN_SUCCESS ||
	    (flags & WLC_F_THREAD_POLICY) == 0) {
		return;
	}

	const struct work_interval *work_interval = thread->th_work_interval;
	assert(work_interval != NULL);

	assert3u(work_interval->wi_class, <, WI_CLASS_COUNT);
	const sched_mode_t mode = work_interval_class_data[work_interval->wi_class].sched_mode;

	/*
	 * A mode of TH_MODE_NONE implies that this work interval has no
	 * associated scheduler effects.
	 */
	if (mode == TH_MODE_NONE) {
		return;
	}

	proc_set_thread_policy_ext(thread, TASK_POLICY_ATTRIBUTE,
	    TASK_POLICY_WI_DRIVEN, true, mode);
	assert(thread->requested_policy.thrp_wi_driven);

	return;
}

/*
 * Clear a work interval driven policy.
 */
static void
work_interval_clear_policy(thread_t thread)
{
	assert3p(thread, ==, current_thread());

	if (!thread->requested_policy.thrp_wi_driven) {
		return;
	}

	const sched_mode_t mode = sched_get_thread_mode_user(thread);

	proc_set_thread_policy_ext(thread, TASK_POLICY_ATTRIBUTE,
	    TASK_POLICY_WI_DRIVEN, false,
	    mode == TH_MODE_REALTIME ? mode : TH_MODE_TIMESHARE);

	assert(!thread->requested_policy.thrp_wi_driven);

	return;
}

/*
 * thread_set_work_interval()
 *
 * Change the thread's bound work interval to the passed-in work interval.
 * Consumes a +1 ref on work_interval upon success.
 *
 * May also pass NULL to un-set work_interval on the thread.
 * Will deallocate any old work interval on the thread.
 * Returns an error if the thread does not satisfy the requirements to join the work interval.
 *
 * For non auto-join work intervals, any old work interval on the thread is deallocated here.
 * For auto-join work intervals, the routine may wake up the work interval deferred
 * deallocation queue since thread locks might be currently held.
 */
static kern_return_t
thread_set_work_interval(thread_t thread,
    struct work_interval *work_interval, thread_work_interval_options_t options)
{
	/* All explicit work interval operations should always be from the current thread */
	if (options & THREAD_WI_EXPLICIT_JOIN_POLICY) {
		assert(thread == current_thread());
	}

	/* All cases of needing the thread lock should be from explicit join scenarios */
	if (options & THREAD_WI_THREAD_LOCK_NEEDED) {
		assert((options & THREAD_WI_EXPLICIT_JOIN_POLICY) != 0);
	}

	/* All auto-join cases must come in with the thread lock held */
	if (options & THREAD_WI_AUTO_JOIN_POLICY) {
		assert((options & THREAD_WI_THREAD_LOCK_HELD) != 0);
	}

#if CONFIG_THREAD_GROUPS
	if (work_interval && !work_interval->wi_group) {
		/* Reject join on work intervals with deferred thread group creation */
		return KERN_INVALID_ARGUMENT;
	}
#endif /* CONFIG_THREAD_GROUPS */

	if (work_interval) {
		uint32_t work_interval_type = work_interval->wi_create_flags & WORK_INTERVAL_TYPE_MASK;

		if (options & THREAD_WI_EXPLICIT_JOIN_POLICY) {
			/* Ensure no kern_work_interval_set_workload_id can happen after this point */
			uint32_t wlid_flags;
			(void)os_atomic_cmpxchgv(&work_interval->wi_wlid_flags, 0,
			    WORK_INTERVAL_WORKLOAD_ID_ALREADY_JOINED, &wlid_flags, relaxed);
			if (wlid_flags & WORK_INTERVAL_WORKLOAD_ID_RT_ALLOWED) {
				/* For workload IDs with rt-allowed, neuter the check below to
				 * enable joining before the thread has become realtime for all
				 * work interval types */
				work_interval_type = WORK_INTERVAL_TYPE_DEFAULT;
			}
		}

		if ((work_interval_type == WORK_INTERVAL_TYPE_COREAUDIO) &&
		    (thread->sched_mode != TH_MODE_REALTIME) && (thread->saved_mode != TH_MODE_REALTIME)) {
			return KERN_INVALID_ARGUMENT;
		}
	}

	/*
	 * Ensure a work interval scheduling policy is not used if the thread is
	 * leaving the work interval.
	 */
	if (work_interval == NULL &&
	    (options & THREAD_WI_EXPLICIT_JOIN_POLICY) != 0) {
		work_interval_clear_policy(thread);
	}

	struct work_interval *old_th_wi = thread->th_work_interval;
#if CONFIG_SCHED_AUTO_JOIN
	spl_t s;
	/* Take the thread lock if needed */
	if (options & THREAD_WI_THREAD_LOCK_NEEDED) {
		s = splsched();
		thread_lock(thread);
	}

	/*
	 * Work interval auto-join leak to non-RT threads.
	 *
	 * If the thread might be running on a remote core and it is not in the context switch path
	 * (where the thread is neither running, blocked, nor on a runq), it is not possible to update
	 * the work interval & thread group remotely, since it is not possible to update CLPC for a
	 * remote core. This situation might happen when a thread is transitioning from realtime to
	 * non-realtime due to backgrounding etc., which would mean that non-RT threads would now
	 * be part of the work interval.
	 *
	 * Since there is no immediate mitigation to this issue, the policy is to set a new
	 * flag on the thread which indicates that such a "leak" has happened. This flag will
	 * be cleared when the remote thread eventually blocks and unjoins from the work interval.
	 */
	bool thread_on_remote_core = ((thread != current_thread()) && (thread->state & TH_RUN) && (thread_get_runq(thread) == PROCESSOR_NULL));

	if (thread_on_remote_core && ((options & THREAD_WI_THREAD_CTX_SWITCH) == 0)) {
		assert((options & THREAD_WI_THREAD_LOCK_NEEDED) == 0);
		os_atomic_or(&thread->th_work_interval_flags, TH_WORK_INTERVAL_FLAGS_AUTO_JOIN_LEAK, relaxed);
		return KERN_SUCCESS;
	}

	const bool old_wi_auto_joined = ((thread->sched_flags & TH_SFLAG_THREAD_GROUP_AUTO_JOIN) != 0);

	if ((options & THREAD_WI_AUTO_JOIN_POLICY) || old_wi_auto_joined) {
		__kdebug_only uint64_t old_tg_id = (old_th_wi && old_th_wi->wi_group) ? thread_group_get_id(old_th_wi->wi_group) : ~0;
		__kdebug_only uint64_t new_tg_id = (work_interval && work_interval->wi_group) ? thread_group_get_id(work_interval->wi_group) : ~0;
		KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_WI_AUTO_JOIN),
		    thread_tid(thread), old_tg_id, new_tg_id, options);
	}

	if (old_wi_auto_joined) {
		/*
		 * If the thread was auto-joined to a work interval and is not realtime, make sure it
		 * happened due to the "leak" described above.
		 */
		if (thread->sched_mode != TH_MODE_REALTIME) {
			assert((thread->th_work_interval_flags & TH_WORK_INTERVAL_FLAGS_AUTO_JOIN_LEAK) != 0);
		}

		os_atomic_andnot(&thread->th_work_interval_flags, TH_WORK_INTERVAL_FLAGS_AUTO_JOIN_LEAK, relaxed);
		work_interval_auto_join_decrement(old_th_wi, thread);
		thread->sched_flags &= ~TH_SFLAG_THREAD_GROUP_AUTO_JOIN;
	}

#endif /* CONFIG_SCHED_AUTO_JOIN */

	KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_WORKGROUP, WORKGROUP_INTERVAL_CHANGE),
	    thread_tid(thread), (old_th_wi ? old_th_wi->wi_id : 0), (work_interval ? work_interval->wi_id : 0), !!(options & THREAD_WI_AUTO_JOIN_POLICY));

	/* transfer +1 ref to thread */
	thread->th_work_interval = work_interval;

#if CONFIG_SCHED_AUTO_JOIN

	if ((options & THREAD_WI_AUTO_JOIN_POLICY) && work_interval) {
		assert(work_interval_auto_join_enabled(work_interval) == true);
		thread->sched_flags |= TH_SFLAG_THREAD_GROUP_AUTO_JOIN;
	}

	if (options & THREAD_WI_THREAD_LOCK_NEEDED) {
		thread_unlock(thread);
		splx(s);
	}
#endif /* CONFIG_SCHED_AUTO_JOIN */

	/*
	 * The thread got a new work interval. It may come with a work interval
	 * scheduling policy that needs to be applied.
	 */
	if (work_interval != NULL &&
	    (options & THREAD_WI_EXPLICIT_JOIN_POLICY) != 0) {
		work_interval_set_policy(thread);
	}

#if CONFIG_THREAD_GROUPS
	if (work_interval) {
		/* Prevent thread_group_set_name after CLPC may have already heard
		 * about the thread group */
		(void)os_atomic_cmpxchg(&work_interval->wi_group_flags, 0,
		    WORK_INTERVAL_GROUP_FLAGS_THREAD_JOINED, relaxed);
	}
	struct thread_group *new_tg = (work_interval) ? (work_interval->wi_group) : NULL;

	if (options & THREAD_WI_AUTO_JOIN_POLICY) {
#if CONFIG_SCHED_AUTO_JOIN
		thread_set_autojoin_thread_group_locked(thread, new_tg);
#endif
	} else {
		thread_set_work_interval_thread_group(thread, new_tg);
	}
#endif /* CONFIG_THREAD_GROUPS */

	if (options & THREAD_WI_EXPLICIT_JOIN_POLICY) {
		/* Construct mask to XOR with th_work_interval_flags to clear the
		 * currently present flags and set the new flags in wlid_flags. */
		uint32_t wlid_flags = 0;
		if (work_interval) {
			wlid_flags = os_atomic_load(&work_interval->wi_wlid_flags, relaxed);
		}
		thread_work_interval_flags_t th_wi_xor_mask = os_atomic_load(
			&thread->th_work_interval_flags, relaxed);
		th_wi_xor_mask &= (TH_WORK_INTERVAL_FLAGS_HAS_WORKLOAD_ID |
		    TH_WORK_INTERVAL_FLAGS_RT_ALLOWED);
		if (wlid_flags & WORK_INTERVAL_WORKLOAD_ID_HAS_ID) {
			th_wi_xor_mask ^= TH_WORK_INTERVAL_FLAGS_HAS_WORKLOAD_ID;
			if (wlid_flags & WORK_INTERVAL_WORKLOAD_ID_RT_ALLOWED) {
				th_wi_xor_mask ^= TH_WORK_INTERVAL_FLAGS_RT_ALLOWED;
			}
		}
		if (th_wi_xor_mask) {
			os_atomic_xor(&thread->th_work_interval_flags, th_wi_xor_mask, relaxed);
		}

		/*
		 * Now that the interval flags have been set, re-evaluate
		 * whether the thread needs to be undemoted - the new work
		 * interval may have the RT_ALLOWED flag, and the thread may
		 * have a realtime policy but be demoted.
		 */
		thread_rt_evaluate(thread);
	}

	if (old_th_wi != NULL) {
		work_interval_release(old_th_wi, options);
	}

	return KERN_SUCCESS;
}

static kern_return_t
thread_set_work_interval_explicit_join(thread_t thread, struct work_interval *work_interval)
{
	assert(thread == current_thread());
	return thread_set_work_interval(thread, work_interval, THREAD_WI_EXPLICIT_JOIN_POLICY | THREAD_WI_THREAD_LOCK_NEEDED);
}

kern_return_t
work_interval_thread_terminate(thread_t thread)
{
	assert(thread == current_thread());
	if (thread->th_work_interval != NULL) {
		return thread_set_work_interval(thread, NULL, THREAD_WI_EXPLICIT_JOIN_POLICY | THREAD_WI_THREAD_LOCK_NEEDED);
	}
	return KERN_SUCCESS;
}

kern_return_t
kern_work_interval_notify(thread_t thread, struct kern_work_interval_args* kwi_args)
{
	assert(thread == current_thread());
	assert(kwi_args->work_interval_id != 0);

	struct work_interval *work_interval = thread->th_work_interval;

	if (work_interval == NULL ||
	    work_interval->wi_id != kwi_args->work_interval_id) {
		/* This thread must have adopted the work interval to be able to notify */
		return KERN_INVALID_ARGUMENT;
	}

	task_t notifying_task = current_task();

	if (work_interval->wi_creator_uniqueid != get_task_uniqueid(notifying_task) ||
	    work_interval->wi_creator_pidversion != get_task_version(notifying_task)) {
		/* Only the creating task can do a notify */
		return KERN_INVALID_ARGUMENT;
	}

	spl_t s = splsched();

#if CONFIG_THREAD_GROUPS
	assert(work_interval->wi_group == thread->thread_group);
#endif /* CONFIG_THREAD_GROUPS */

	uint64_t urgency_param1, urgency_param2;
	kwi_args->urgency = (uint16_t)thread_get_urgency(thread, &urgency_param1, &urgency_param2);

	splx(s);

	/* called without interrupts disabled */
	machine_work_interval_notify(thread, kwi_args);

	return KERN_SUCCESS;
}

/* Start at 1, 0 is not a valid work interval ID */
static _Atomic uint64_t unique_work_interval_id = 1;

kern_return_t
kern_work_interval_create(thread_t thread,
    struct kern_work_interval_create_args *create_params)
{
	assert(thread == current_thread());

	uint32_t create_flags = create_params->wica_create_flags;

	if (((create_flags & WORK_INTERVAL_FLAG_JOINABLE) == 0) &&
	    thread->th_work_interval != NULL) {
		/*
		 * If the thread is doing a legacy combined create and join,
		 * it shouldn't already be part of a work interval.
		 *
		 * (Creating a joinable WI is allowed anytime.)
		 */
		return KERN_FAILURE;
	}

	/*
	 * Check the validity of the create flags before allocating the work
	 * interval.
	 */
	task_t creating_task = current_task();
	if ((create_flags & WORK_INTERVAL_TYPE_MASK) == WORK_INTERVAL_TYPE_CA_CLIENT) {
		/*
		 * CA_CLIENT work intervals do not create new thread groups.
		 * There can only be one CA_CLIENT work interval (created by UIKit or AppKit)
		 * per application task
		 */
		if (create_flags & WORK_INTERVAL_FLAG_GROUP) {
			return KERN_FAILURE;
		}
		if (!task_is_app(creating_task)) {
#if XNU_TARGET_OS_OSX
			/*
			 * Soft-fail the case of a non-app pretending to be an
			 * app, by allowing it to press the buttons, but they're
			 * not actually connected to anything.
			 */
			create_flags |= WORK_INTERVAL_FLAG_IGNORED;
#else
			/*
			 * On iOS, it's a hard failure to get your apptype
			 * wrong and then try to render something.
			 */
			return KERN_NOT_SUPPORTED;
#endif /* XNU_TARGET_OS_OSX */
		}
		if (task_set_ca_client_wi(creating_task, true) == false) {
			return KERN_FAILURE;
		}
	}

#if CONFIG_SCHED_AUTO_JOIN
	if (create_flags & WORK_INTERVAL_FLAG_ENABLE_AUTO_JOIN) {
		uint32_t type = (create_flags & WORK_INTERVAL_TYPE_MASK);
		if (type != WORK_INTERVAL_TYPE_COREAUDIO) {
			return KERN_NOT_SUPPORTED;
		}
		if ((create_flags & WORK_INTERVAL_FLAG_GROUP) == 0) {
			return KERN_NOT_SUPPORTED;
		}
	}

	if (create_flags & WORK_INTERVAL_FLAG_ENABLE_DEFERRED_FINISH) {
		if ((create_flags & WORK_INTERVAL_FLAG_ENABLE_AUTO_JOIN) == 0) {
			return KERN_NOT_SUPPORTED;
		}
	}
#endif /* CONFIG_SCHED_AUTO_JOIN */

	struct work_interval *work_interval = kalloc_type(struct work_interval,
	    Z_WAITOK | Z_ZERO | Z_NOFAIL);

	uint64_t work_interval_id = os_atomic_inc(&unique_work_interval_id, relaxed);

	*work_interval = (struct work_interval) {
		.wi_id                  = work_interval_id,
		.wi_ref_count           = {},
		.wi_create_flags        = create_flags,
		.wi_creator_pid         = pid_from_task(creating_task),
		.wi_creator_uniqueid    = get_task_uniqueid(creating_task),
		.wi_creator_pidversion  = get_task_version(creating_task),
	};
	os_ref_init(&work_interval->wi_ref_count, NULL);

	if (work_interval_telemetry_data_enabled(work_interval)) {
		recount_work_interval_init(&work_interval->wi_recount);
	}

	__kdebug_only uint64_t tg_id = 0;
#if CONFIG_THREAD_GROUPS
	struct thread_group *tg;
	if ((create_flags &
	    (WORK_INTERVAL_FLAG_GROUP | WORK_INTERVAL_FLAG_HAS_WORKLOAD_ID)) ==
	    (WORK_INTERVAL_FLAG_GROUP | WORK_INTERVAL_FLAG_HAS_WORKLOAD_ID)) {
		/* defer creation of the thread group until the
		 * kern_work_interval_set_workload_id() call */
		work_interval->wi_group = NULL;
	} else if (create_flags & WORK_INTERVAL_FLAG_GROUP) {
		/* create a new group for the interval to represent */
		char name[THREAD_GROUP_MAXNAME] = "";

		snprintf(name, sizeof(name), "WI%lld (pid %d)", work_interval_id,
		    work_interval->wi_creator_pid);

		tg = thread_group_create_and_retain(THREAD_GROUP_FLAGS_DEFAULT);

		thread_group_set_name(tg, name);

		work_interval->wi_group = tg;
	} else {
		/* the interval represents the thread's home group */
		tg = thread_group_get_home_group(thread);

		thread_group_retain(tg);

		work_interval->wi_group = tg;
	}

	/* Capture the tg_id for tracing purposes */
	tg_id = work_interval->wi_group ? thread_group_get_id(work_interval->wi_group) : ~0;

#endif /* CONFIG_THREAD_GROUPS */

	if (create_flags & WORK_INTERVAL_FLAG_JOINABLE) {
		mach_port_name_t name = MACH_PORT_NULL;

		/* work_interval has a +1 ref, moves to the port */
		work_interval->wi_port = ipc_kobject_alloc_port(work_interval,
		    IKOT_WORK_INTERVAL, IPC_KOBJECT_ALLOC_MAKE_SEND);

		name = ipc_port_copyout_send(work_interval->wi_port, current_space());

		if (!MACH_PORT_VALID(name)) {
			/*
			 * copyout failed (port is already deallocated)
			 * Because of the port-destroyed magic,
			 * the work interval is already deallocated too.
			 */
			return KERN_RESOURCE_SHORTAGE;
		}

		create_params->wica_port = name;
	} else {
		/* work_interval has a +1 ref, moves to the thread */
		kern_return_t kr = thread_set_work_interval_explicit_join(thread, work_interval);
		if (kr != KERN_SUCCESS) {
			/* No other thread can join this work interval since it isn't
			 * JOINABLE so release the reference on work interval */
			work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
			return kr;
		}

		create_params->wica_port = MACH_PORT_NULL;
	}

	create_params->wica_id = work_interval_id;

	if (tg_id != ~0) {
		KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_WORKGROUP, WORKGROUP_INTERVAL_CREATE),
		    work_interval_id, create_flags, pid_from_task(creating_task), tg_id);
	}
	return KERN_SUCCESS;
}
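
/*
 * Illustrative usage sketch (assumed caller, not part of this file): a
 * joinable CoreAudio interval could be created with wica_create_flags =
 * WORK_INTERVAL_TYPE_COREAUDIO | WORK_INTERVAL_FLAG_JOINABLE |
 * WORK_INTERVAL_FLAG_GROUP; the send right returned in wica_port can then
 * be passed to kern_work_interval_join() by other threads in the task.
 */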

kern_return_t
kern_work_interval_get_flags_from_port(mach_port_name_t port_name, uint32_t *flags)
{
	assert(flags != NULL);

	kern_return_t kr;
	struct work_interval *work_interval;

	kr = port_name_to_work_interval(port_name, &work_interval);
	if (kr != KERN_SUCCESS) {
		return kr;
	}

	assert(work_interval != NULL);
	*flags = work_interval->wi_create_flags;

	work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);

	return KERN_SUCCESS;
}

#if CONFIG_THREAD_GROUPS
_Static_assert(WORK_INTERVAL_NAME_MAX == THREAD_GROUP_MAXNAME,
    "WORK_INTERVAL_NAME_MAX does not match THREAD_GROUP_MAXNAME");
#endif /* CONFIG_THREAD_GROUPS */

kern_return_t
kern_work_interval_set_name(mach_port_name_t port_name, __unused char *name,
    size_t len)
{
	kern_return_t kr;
	struct work_interval *work_interval;

	if (len > WORK_INTERVAL_NAME_MAX) {
		return KERN_INVALID_ARGUMENT;
	}
	kr = port_name_to_work_interval(port_name, &work_interval);
	if (kr != KERN_SUCCESS) {
		return kr;
	}

	assert(work_interval != NULL);

#if CONFIG_THREAD_GROUPS
	uint32_t wi_group_flags = os_atomic_load(
		&work_interval->wi_group_flags, relaxed);
	if (wi_group_flags & WORK_INTERVAL_GROUP_FLAGS_THREAD_JOINED) {
		kr = KERN_INVALID_ARGUMENT;
		goto out;
	}
	if (!work_interval->wi_group) {
		kr = KERN_INVALID_ARGUMENT;
		goto out;
	}

	if (name[0] && (work_interval->wi_create_flags & WORK_INTERVAL_FLAG_GROUP)) {
		char tgname[THREAD_GROUP_MAXNAME];
		snprintf(tgname, sizeof(tgname), "WI%lld %s", work_interval->wi_id,
		    name);
		thread_group_set_name(work_interval->wi_group, tgname);
	}

out:
#endif /* CONFIG_THREAD_GROUPS */
	work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);

	return kr;
}

kern_return_t
kern_work_interval_set_workload_id(mach_port_name_t port_name,
    struct kern_work_interval_workload_id_args *workload_id_args)
{
	kern_return_t kr;
	struct work_interval *work_interval;
	uint32_t wlida_flags = 0;
	uint32_t wlid_flags = 0;
#if CONFIG_THREAD_GROUPS
	uint32_t tg_flags = 0;
#endif
	bool from_workload_config = false;

	/* Ensure workload ID name is non-empty. */
	if (!workload_id_args->wlida_name[0]) {
		return KERN_INVALID_ARGUMENT;
	}

	kr = port_name_to_work_interval(port_name, &work_interval);
	if (kr != KERN_SUCCESS) {
		return kr;
	}

	assert(work_interval != NULL);
	if (!(work_interval->wi_create_flags & WORK_INTERVAL_FLAG_JOINABLE)) {
		kr = KERN_INVALID_ARGUMENT;
		goto out;
	}

	if (!(work_interval->wi_create_flags & WORK_INTERVAL_FLAG_HAS_WORKLOAD_ID)) {
		/* Reject work intervals that didn't indicate they will have a workload ID
		 * at creation. In particular if the work interval has its own thread group,
		 * its creation must have been deferred in kern_work_interval_create */
		kr = KERN_INVALID_ARGUMENT;
		goto out;
	}

	workload_config_t wl_config = {};
	kr = workload_config_lookup_default(workload_id_args->wlida_name, &wl_config);
	if (kr == KERN_SUCCESS) {
		if ((wl_config.wc_create_flags & WORK_INTERVAL_TYPE_MASK) !=
		    (work_interval->wi_create_flags & WORK_INTERVAL_TYPE_MASK)) {
			if ((wl_config.wc_create_flags & WORK_INTERVAL_TYPE_MASK) == WORK_INTERVAL_TYPE_CA_RENDER_SERVER &&
			    (work_interval->wi_create_flags & WORK_INTERVAL_TYPE_MASK) == WORK_INTERVAL_TYPE_FRAME_COMPOSITOR) {
				/* WORK_INTERVAL_TYPE_FRAME_COMPOSITOR is a valid related type of WORK_INTERVAL_TYPE_CA_RENDER_SERVER */
			} else {
				kr = KERN_INVALID_ARGUMENT;
				goto out;
			}
		}

		wlida_flags = wl_config.wc_flags;

#if !defined(XNU_TARGET_OS_XR)
		wlida_flags &= ~WORK_INTERVAL_WORKLOAD_ID_RT_CRITICAL;
#endif /* !XNU_TARGET_OS_XR */

#if CONFIG_THREAD_GROUPS
		tg_flags = wl_config.wc_thread_group_flags;
		if (tg_flags != THREAD_GROUP_FLAGS_ABSENT &&
		    (work_interval->wi_create_flags & WORK_INTERVAL_FLAG_GROUP) == 0) {
			kr = KERN_INVALID_ARGUMENT;
			goto out;
		}
#endif /* CONFIG_THREAD_GROUPS */

		from_workload_config = true;
	} else {
		/* If the workload is not present in the table, perform basic validation
		 * that the create flags passed in match the ones used at work interval
		 * create time */
		if ((workload_id_args->wlida_wicreate_flags & WORK_INTERVAL_TYPE_MASK) !=
		    (work_interval->wi_create_flags & WORK_INTERVAL_TYPE_MASK)) {
			kr = KERN_INVALID_ARGUMENT;
			goto out;
		}

		const bool wc_avail = workload_config_available();
		if (!wc_avail) {
			wlida_flags = WORK_INTERVAL_WORKLOAD_ID_RT_ALLOWED;
		}

		if (workload_id_args->wlida_flags & WORK_INTERVAL_WORKLOAD_ID_COMPLEXITY_ALLOWED) {
			wlida_flags |= WORK_INTERVAL_WORKLOAD_ID_COMPLEXITY_ALLOWED;
		}

		/*
		 * If the workload config wasn't even loaded then fallback to
		 * older behaviour where the new thread group gets the default
		 * thread group flags (when WORK_INTERVAL_FLAG_GROUP is set).
		 */
#if CONFIG_THREAD_GROUPS
		if (!wc_avail) {
			tg_flags = THREAD_GROUP_FLAGS_DEFAULT;
		} else {
			struct thread_group *home_group =
			    thread_group_get_home_group(current_thread());
			if (home_group != NULL) {
				tg_flags = thread_group_get_flags(home_group);
			}
		}
#endif /* CONFIG_THREAD_GROUPS */
	}

	workload_id_args->wlida_wicreate_flags = work_interval->wi_create_flags;

	/* cmpxchg a non-zero workload ID flags value (indicating that workload ID
	 * has been set). */
	wlida_flags |= WORK_INTERVAL_WORKLOAD_ID_HAS_ID;
	if (os_atomic_cmpxchgv(&work_interval->wi_wlid_flags, 0, wlida_flags,
	    &wlid_flags, relaxed)) {
		if (from_workload_config) {
			work_interval->wi_class = wl_config.wc_class;
			work_interval->wi_class_offset = wl_config.wc_class_offset;
		}
#if CONFIG_THREAD_GROUPS
		if (work_interval->wi_create_flags & WORK_INTERVAL_FLAG_GROUP) {
			/* Perform deferred thread group creation, now that tgflags are known */
			struct thread_group *tg;
			tg = thread_group_create_and_retain(tg_flags == THREAD_GROUP_FLAGS_ABSENT ?
			    THREAD_GROUP_FLAGS_DEFAULT : tg_flags);

			char tgname[THREAD_GROUP_MAXNAME] = "";
			snprintf(tgname, sizeof(tgname), "WI%lld %s", work_interval->wi_id,
			    workload_id_args->wlida_name);
			thread_group_set_name(tg, tgname);

			assert(work_interval->wi_group == NULL);
			work_interval->wi_group = tg;
			KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_WORKGROUP, WORKGROUP_INTERVAL_CREATE),
			    work_interval->wi_id, work_interval->wi_create_flags,
			    work_interval->wi_creator_pid, thread_group_get_id(tg));
		}
#endif /* CONFIG_THREAD_GROUPS */
	} else {
		/* Workload ID has previously been set (or a thread has already joined). */
		if (wlid_flags & WORK_INTERVAL_WORKLOAD_ID_ALREADY_JOINED) {
			kr = KERN_INVALID_ARGUMENT;
			goto out;
		}
		/* Treat this request as a query for the out parameters of the ID */
		workload_id_args->wlida_flags = wlid_flags;
	}

	/*
	 * Emit tracepoints for successfully setting the workload ID.
	 *
	 * After rdar://89342390 has been fixed and a new work interval ktrace
	 * provider has been added, it will be possible to associate a numeric
	 * ID with an ID name. Thus, for those cases where the ID name has been
	 * looked up successfully (`from_workload_config` is true) it will no
	 * longer be necessary to emit a tracepoint with the full ID name.
	 */
	KDBG(MACHDBG_CODE(DBG_MACH_WORKGROUP, WORKGROUP_INTERVAL_SET_WORKLOAD_ID),
	    work_interval->wi_id, from_workload_config);
	kernel_debug_string_simple(
		MACHDBG_CODE(DBG_MACH_WORKGROUP, WORKGROUP_INTERVAL_SET_WORKLOAD_ID_NAME),
		workload_id_args->wlida_name);

	kr = KERN_SUCCESS;

out:
	work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);

	return kr;
}


kern_return_t
kern_work_interval_destroy(thread_t thread, uint64_t work_interval_id)
{
	if (work_interval_id == 0) {
		return KERN_INVALID_ARGUMENT;
	}

	if (thread->th_work_interval == NULL ||
	    thread->th_work_interval->wi_id != work_interval_id) {
		/* work ID isn't valid or doesn't match joined work interval ID */
		return KERN_INVALID_ARGUMENT;
	}

	return thread_set_work_interval_explicit_join(thread, NULL);
}

kern_return_t
kern_work_interval_join(thread_t            thread,
    mach_port_name_t    port_name)
{
	struct work_interval *work_interval = NULL;
	kern_return_t kr;

	if (port_name == MACH_PORT_NULL) {
		/* 'Un-join' the current work interval */
		return thread_set_work_interval_explicit_join(thread, NULL);
	}

	kr = port_name_to_work_interval(port_name, &work_interval);
	if (kr != KERN_SUCCESS) {
		return kr;
	}
	/* work_interval has a +1 ref */

	assert(work_interval != NULL);

	kr = thread_set_work_interval_explicit_join(thread, work_interval);
	/* ref was consumed by passing it to the thread in the successful case */
	if (kr != KERN_SUCCESS) {
		work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
	}
	return kr;
}

kern_return_t
kern_work_interval_explicit_join(thread_t thread,
    struct work_interval *work_interval)
{
	kern_return_t kr;
	assert(thread == current_thread());
	assert(work_interval != NULL);

	/*
	 * We take a +1 ref on the work interval which is consumed by passing it
	 * on to the thread below in the successful case.
	 */
	work_interval_retain(work_interval);

	kr = thread_set_work_interval_explicit_join(thread, work_interval);
	if (kr != KERN_SUCCESS) {
		work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
	}
	return kr;
}

/*
 * work_interval_port_type_render_server()
 *
 * Helper routine to determine if the port points to a
 * WORK_INTERVAL_TYPE_CA_RENDER_SERVER work interval.
 */
bool
work_interval_port_type_render_server(mach_port_name_t port_name)
{
	return work_interval_port_type(port_name) == WORK_INTERVAL_TYPE_CA_RENDER_SERVER;
}