xref: /xnu-11215.41.3/osfmk/kern/work_interval.c (revision 33de042d024d46de5ff4e89f2471de6608e37fa4)
1 /*
2  * Copyright (c) 2017 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 
30 #include <sys/work_interval.h>
31 
32 #include <kern/work_interval.h>
33 
34 #include <kern/thread.h>
35 #include <kern/sched_prim.h>
36 #include <kern/machine.h>
37 #include <kern/thread_group.h>
38 #include <kern/ipc_kobject.h>
39 #include <kern/task.h>
40 #include <kern/coalition.h>
41 #include <kern/policy_internal.h>
42 #include <kern/mpsc_queue.h>
43 #include <kern/workload_config.h>
44 #include <kern/assert.h>
45 
46 #include <mach/kern_return.h>
47 #include <mach/notify.h>
48 #include <os/refcnt.h>
49 
50 #include <stdatomic.h>
51 
52 /*
53  * With the introduction of auto-join work intervals, it is possible
54  * to change the work interval (and related thread group) of a thread in a
55  * variety of contexts (thread termination, context switch, thread mode
56  * change etc.). In order to clearly specify the policy expectation and
57  * the locking behavior, all calls to thread_set_work_interval() pass
58  * in a set of flags.
59  */
60 
/*
 * Options for thread_set_work_interval(); callers combine one join-policy
 * flag with one thread-lock flag (see the asserts at the top of
 * thread_set_work_interval()).
 */
__options_decl(thread_work_interval_options_t, uint32_t, {
	/* Change the work interval using the explicit join rules */
	THREAD_WI_EXPLICIT_JOIN_POLICY = 0x1,
	/* Change the work interval using the auto-join rules */
	THREAD_WI_AUTO_JOIN_POLICY     = 0x2,
	/* Caller already holds the thread lock */
	THREAD_WI_THREAD_LOCK_HELD     = 0x4,
	/* Caller does not hold the thread lock */
	THREAD_WI_THREAD_LOCK_NEEDED   = 0x8,
	/* Change the work interval from the context switch path (thread may not be running or on a runq) */
	THREAD_WI_THREAD_CTX_SWITCH    = 0x10,
});
73 
/* Forward declarations */
static kern_return_t thread_set_work_interval(thread_t, struct work_interval *, thread_work_interval_options_t);
static void work_interval_port_no_senders(ipc_port_t, mach_port_mscount_t);

/*
 * Register the work interval kobject type: the kobject pointer is stable
 * for the port's lifetime, and no-senders notifications are delivered to
 * work_interval_port_no_senders().
 */
IPC_KOBJECT_DEFINE(IKOT_WORK_INTERVAL,
    .iko_op_stable     = true,
    .iko_op_no_senders = work_interval_port_no_senders);
80 
81 #if CONFIG_SCHED_AUTO_JOIN
82 /* MPSC queue used to defer deallocate work intervals */
83 static struct mpsc_daemon_queue work_interval_deallocate_queue;
84 
85 static void work_interval_deferred_release(struct work_interval *);
86 
87 /*
88  * Work Interval Auto-Join Status
89  *
90  * work_interval_auto_join_status_t represents the state of auto-join for a given work interval.
91  * It packs the following information:
92  * - A bit representing if a "finish" is deferred on the work interval
93  * - Count of number of threads auto-joined to the work interval
94  */
95 #define WORK_INTERVAL_STATUS_DEFERRED_FINISH_MASK    ((uint32_t)(1 << 31))
96 #define WORK_INTERVAL_STATUS_AUTO_JOIN_COUNT_MASK    ((uint32_t)(WORK_INTERVAL_STATUS_DEFERRED_FINISH_MASK - 1))
97 #define WORK_INTERVAL_STATUS_AUTO_JOIN_COUNT_MAX     WORK_INTERVAL_STATUS_AUTO_JOIN_COUNT_MASK
98 typedef uint32_t work_interval_auto_join_status_t;
99 
100 static inline bool __unused
work_interval_status_deferred_finish(work_interval_auto_join_status_t status)101 work_interval_status_deferred_finish(work_interval_auto_join_status_t status)
102 {
103 	return (status & WORK_INTERVAL_STATUS_DEFERRED_FINISH_MASK) ? true : false;
104 }
105 
106 static inline uint32_t __unused
work_interval_status_auto_join_count(work_interval_auto_join_status_t status)107 work_interval_status_auto_join_count(work_interval_auto_join_status_t status)
108 {
109 	return (uint32_t)(status & WORK_INTERVAL_STATUS_AUTO_JOIN_COUNT_MASK);
110 }
111 
112 /*
113  * struct work_interval_deferred_finish_state
114  *
115  * Contains the parameters of the finish operation which is being deferred.
116  */
struct work_interval_deferred_finish_state {
	/*
	 * Snapshot of the finish parameters taken while the deferred-finish
	 * bit is set; copied out under the atomic rmw loop in
	 * work_interval_auto_join_decrement() so a racing start-finish
	 * cycle can safely overwrite the live state.
	 */
	uint64_t instance_id;
	uint64_t start;
	uint64_t deadline;
	uint64_t complexity;
};
123 
struct work_interval_auto_join_info {
	/* Parameters of a finish operation that is being deferred */
	struct work_interval_deferred_finish_state deferred_finish_state;
	/*
	 * Packed state word: deferred-finish bit (bit 31) plus count of
	 * auto-joined threads (bits 0-30); see the masks defined above.
	 */
	work_interval_auto_join_status_t _Atomic status;
};
128 #endif /* CONFIG_SCHED_AUTO_JOIN */
129 
130 #if CONFIG_THREAD_GROUPS
131 /* Flags atomically set in wi_group_flags wi_group_flags */
132 #define WORK_INTERVAL_GROUP_FLAGS_THREAD_JOINED 0x1
133 #endif
134 
135 /*
136  * Work Interval struct
137  *
138  * This struct represents a thread group and/or work interval context
139  * in a mechanism that is represented with a kobject.
140  *
141  * Every thread that has joined a WI has a +1 ref, and the port
142  * has a +1 ref as well.
143  *
144  * TODO: groups need to have a 'is for WI' flag
145  *      and they need a flag to create that says 'for WI'
146  *      This would allow CLPC to avoid allocating WI support
147  *      data unless it is needed
148  *
149  * TODO: Enforce not having more than one non-group joinable work
150  *      interval per thread group.
151  *      CLPC only wants to see one WI-notify callout per group.
152  */
struct work_interval {
	/* Work interval ID; logged in ktrace events (e.g. on destroy) */
	uint64_t wi_id;
	/* Refcount; see work_interval_retain()/work_interval_release() */
	struct os_refcnt wi_ref_count;
	/* Creation flags; includes the WORK_INTERVAL_TYPE_MASK type bits */
	uint32_t wi_create_flags;

	/* for debugging purposes only, does not hold a ref on port */
	ipc_port_t wi_port;

	/*
	 * holds uniqueid and version of creating process,
	 * used to permission-gate notify
	 * TODO: you'd think there would be a better way to do this
	 */
	uint64_t wi_creator_uniqueid;
	uint32_t wi_creator_pid;
	int wi_creator_pidversion;

	/* flags set by work_interval_set_workload_id and reflected onto
	 *  thread->th_work_interval_flags upon join */
	uint32_t wi_wlid_flags;

#if CONFIG_THREAD_GROUPS
	/* WORK_INTERVAL_GROUP_FLAGS_* bits, set atomically */
	uint32_t wi_group_flags;
	struct thread_group *wi_group;  /* holds +1 ref on group */
#endif /* CONFIG_THREAD_GROUPS */

#if CONFIG_SCHED_AUTO_JOIN
	/* Information related to auto-join and deferred finish for work interval */
	struct work_interval_auto_join_info wi_auto_join_info;

	/*
	 * Since the deallocation of auto-join work intervals
	 * can happen in the scheduler when the last thread in
	 * the WI blocks and the thread lock is held, the deallocation
	 * might have to be done on a separate thread.
	 */
	struct mpsc_queue_chain   wi_deallocate_link;
#endif /* CONFIG_SCHED_AUTO_JOIN */

	/*
	 * Work interval class info - determines thread priority for threads
	 * with a work interval driven policy.
	 */
	wi_class_t wi_class;
	uint8_t wi_class_offset;

	/* Telemetry tracks; only initialized/used when telemetry is enabled */
	struct recount_work_interval wi_recount;
};
201 
202 /*
203  * work_interval_telemetry_data_enabled()
204  *
205  * Helper routine to check if work interval has the collection of telemetry data enabled.
206  */
207 static inline bool
work_interval_telemetry_data_enabled(struct work_interval * work_interval)208 work_interval_telemetry_data_enabled(struct work_interval *work_interval)
209 {
210 	return (work_interval->wi_create_flags & WORK_INTERVAL_FLAG_ENABLE_TELEMETRY_DATA) != 0;
211 }
212 
213 
214 /*
215  * work_interval_get_recount_tracks()
216  *
217  * Returns the recount tracks associated with a work interval, or NULL
218  * if the work interval is NULL or has telemetry disabled.
219  */
220 inline struct recount_track *
work_interval_get_recount_tracks(struct work_interval * work_interval)221 work_interval_get_recount_tracks(struct work_interval *work_interval)
222 {
223 	if (work_interval != NULL && work_interval_telemetry_data_enabled(work_interval)) {
224 		return work_interval->wi_recount.rwi_current_instance;
225 	}
226 	return NULL;
227 }
228 
229 #if CONFIG_SCHED_AUTO_JOIN
230 
231 /*
232  * work_interval_perform_deferred_finish()
233  *
234  * Perform a deferred finish for a work interval. The routine accepts the deferred_finish_state as an
235  * argument rather than looking at the work_interval since the deferred finish can race with another
236  * start-finish cycle. To address that, the caller ensures that it gets a consistent snapshot of the
237  * deferred state before calling this routine. This allows the racing start-finish cycle to overwrite
238  * the deferred state without issues.
239  */
static inline void
work_interval_perform_deferred_finish(__unused struct work_interval_deferred_finish_state *deferred_finish_state,
    __unused struct work_interval *work_interval, __unused thread_t thread)
{
	/*
	 * Currently only emits a tracepoint recording the deferred finish for
	 * this thread/thread-group pair; the deferred_finish_state snapshot is
	 * accepted for future use (see the block comment above).
	 */
	KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_WI_DEFERRED_FINISH),
	    thread_tid(thread), thread_group_get_id(work_interval->wi_group));
}
248 
249 /*
250  * work_interval_auto_join_increment()
251  *
252  * Routine to increment auto-join counter when a new thread is auto-joined to
253  * the work interval.
254  */
static void
work_interval_auto_join_increment(struct work_interval *work_interval)
{
	struct work_interval_auto_join_info *join_info = &work_interval->wi_auto_join_info;
	/* Bump the packed count field; the deferred-finish bit is untouched. */
	__assert_only work_interval_auto_join_status_t old_status = os_atomic_add_orig(&join_info->status, 1, relaxed);
	/* Overflowing the count would clobber the deferred-finish bit. */
	assert(work_interval_status_auto_join_count(old_status) < WORK_INTERVAL_STATUS_AUTO_JOIN_COUNT_MAX);
}
262 
263 /*
264  * work_interval_auto_join_decrement()
265  *
266  * Routine to decrement the auto-join counter when a thread unjoins the work interval (due to
267  * blocking or termination). If this was the last auto-joined thread in the work interval and
268  * there was a deferred finish, performs the finish operation for the work interval.
269  */
static void
work_interval_auto_join_decrement(struct work_interval *work_interval, thread_t thread)
{
	struct work_interval_auto_join_info *join_info = &work_interval->wi_auto_join_info;
	work_interval_auto_join_status_t old_status, new_status;
	struct work_interval_deferred_finish_state deferred_finish_state;
	bool perform_finish;

	/* Update the auto-join count for the work interval atomically */
	os_atomic_rmw_loop(&join_info->status, old_status, new_status, acquire, {
		perform_finish = false;
		new_status = old_status;
		assert(work_interval_status_auto_join_count(old_status) > 0);
		new_status -= 1;
		/*
		 * After the decrement, a status equal to exactly the
		 * deferred-finish mask means the count reached zero while a
		 * finish was pending.
		 */
		if (new_status == WORK_INTERVAL_STATUS_DEFERRED_FINISH_MASK) {
		        /* No auto-joined threads remaining and finish is deferred */
		        new_status = 0;
		        perform_finish = true;
		        /*
		         * Its important to copy the deferred finish state here so that this works
		         * when racing with another start-finish cycle.
		         */
		        deferred_finish_state = join_info->deferred_finish_state;
		}
	});

	if (perform_finish == true) {
		/*
		 * Since work_interval_perform_deferred_finish() calls down to
		 * the machine layer callout for finish which gets the thread
		 * group from the thread passed in here, it is important to
		 * make sure that the thread still has the work interval thread
		 * group here.
		 */
		assert(thread->thread_group == work_interval->wi_group);
		work_interval_perform_deferred_finish(&deferred_finish_state, work_interval, thread);
	}
}
308 
309 /*
310  * work_interval_auto_join_enabled()
311  *
312  * Helper routine to check if work interval has auto-join enabled.
313  */
314 static inline bool
work_interval_auto_join_enabled(struct work_interval * work_interval)315 work_interval_auto_join_enabled(struct work_interval *work_interval)
316 {
317 	return (work_interval->wi_create_flags & WORK_INTERVAL_FLAG_ENABLE_AUTO_JOIN) != 0;
318 }
319 
320 /*
321  * work_interval_deferred_finish_enabled()
322  *
323  * Helper routine to check if work interval has deferred finish enabled.
324  */
325 static inline bool __unused
work_interval_deferred_finish_enabled(struct work_interval * work_interval)326 work_interval_deferred_finish_enabled(struct work_interval *work_interval)
327 {
328 	return (work_interval->wi_create_flags & WORK_INTERVAL_FLAG_ENABLE_DEFERRED_FINISH) != 0;
329 }
330 
331 #endif /* CONFIG_SCHED_AUTO_JOIN */
332 
/*
 * work_interval_retain()
 *
 * Takes a +1 reference on the work interval.
 */
static inline void
work_interval_retain(struct work_interval *work_interval)
{
	/*
	 * Even though wi_retain is called under a port lock, we have
	 * to use os_ref_retain instead of os_ref_retain_locked
	 * because wi_release is not synchronized. wi_release calls
	 * os_ref_release which is unsafe to pair with os_ref_retain_locked.
	 */
	os_ref_retain(&work_interval->wi_ref_count);
}
344 
/*
 * work_interval_deallocate()
 *
 * Frees the work interval. Callers must only reach here once the
 * refcount has dropped to zero (see work_interval_release() and
 * work_interval_deallocate_queue_invoke()).
 */
static inline void
work_interval_deallocate(struct work_interval *work_interval)
{
	KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_WORKGROUP, WORKGROUP_INTERVAL_DESTROY),
	    work_interval->wi_id);
	if (work_interval_telemetry_data_enabled(work_interval)) {
		recount_work_interval_deinit(&work_interval->wi_recount);
	}
	kfree_type(struct work_interval, work_interval);
}
355 
356 /*
357  * work_interval_release()
358  *
359  * Routine to release a ref count on the work interval. If the refcount goes down
360  * to zero, the work interval needs to be de-allocated.
361  *
362  * For non auto-join work intervals, they are de-allocated in this context.
363  *
364  * For auto-join work intervals, the de-allocation cannot be done from this context
365  * since that might need the kernel memory allocator lock. In that case, the
366  * deallocation is done via a thread-call based mpsc queue.
367  */
368 static void
work_interval_release(struct work_interval * work_interval,__unused thread_work_interval_options_t options)369 work_interval_release(struct work_interval *work_interval, __unused thread_work_interval_options_t options)
370 {
371 	if (os_ref_release(&work_interval->wi_ref_count) == 0) {
372 #if CONFIG_SCHED_AUTO_JOIN
373 		if (options & THREAD_WI_THREAD_LOCK_HELD) {
374 			work_interval_deferred_release(work_interval);
375 		} else {
376 			work_interval_deallocate(work_interval);
377 		}
378 #else /* CONFIG_SCHED_AUTO_JOIN */
379 		work_interval_deallocate(work_interval);
380 #endif /* CONFIG_SCHED_AUTO_JOIN */
381 	}
382 }
383 
/*
 * kern_work_interval_release()
 *
 * Kernel-interface wrapper: drops a work interval reference from a
 * context where the thread lock is not held.
 */
void
kern_work_interval_release(struct work_interval *work_interval)
{
	work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
}
389 
390 #if CONFIG_SCHED_AUTO_JOIN
391 
392 /*
393  * work_interval_deferred_release()
394  *
395  * Routine to enqueue the work interval on the deallocation mpsc queue.
396  */
/*
 * work_interval_deferred_release()
 *
 * Routine to enqueue the work interval on the deallocation mpsc queue;
 * the actual free happens later in work_interval_deallocate_queue_invoke().
 */
static void
work_interval_deferred_release(struct work_interval *work_interval)
{
	mpsc_daemon_enqueue(&work_interval_deallocate_queue,
	    &work_interval->wi_deallocate_link, MPSC_QUEUE_NONE);
}
403 
404 /*
405  * work_interval_should_propagate()
406  *
407  * Main policy routine to decide if a thread should be auto-joined to
408  * another thread's work interval. The conditions are arranged such that
409  * the most common bailout condition are checked the earliest. This routine
410  * is called from the scheduler context; so it needs to be efficient and
411  * be careful when taking locks or performing wakeups.
412  */
413 inline bool
work_interval_should_propagate(thread_t cthread,thread_t thread)414 work_interval_should_propagate(thread_t cthread, thread_t thread)
415 {
416 	/* Only allow propagation if the current thread has a work interval and the woken up thread does not */
417 	if ((cthread->th_work_interval == NULL) || (thread->th_work_interval != NULL)) {
418 		return false;
419 	}
420 
421 	/* Only propagate work intervals which have auto-join enabled */
422 	if (work_interval_auto_join_enabled(cthread->th_work_interval) == false) {
423 		return false;
424 	}
425 
426 	/* Work interval propagation is enabled for realtime threads only */
427 	if ((cthread->sched_mode != TH_MODE_REALTIME) || (thread->sched_mode != TH_MODE_REALTIME)) {
428 		return false;
429 	}
430 
431 
432 	/* Work interval propagation only works for threads with the same home thread group */
433 	struct thread_group *thread_home_tg = thread_group_get_home_group(thread);
434 	if (thread_group_get_home_group(cthread) != thread_home_tg) {
435 		return false;
436 	}
437 
438 	/* If woken up thread has adopted vouchers and other thread groups, it does not get propagation */
439 	if (thread->thread_group != thread_home_tg) {
440 		return false;
441 	}
442 
443 	/* If either thread is inactive (in the termination path), do not propagate auto-join */
444 	if ((!cthread->active) || (!thread->active)) {
445 		return false;
446 	}
447 
448 	return true;
449 }
450 
451 /*
452  * work_interval_auto_join_propagate()
453  *
454  * Routine to auto-join a thread into another thread's work interval
455  *
456  * Should only be invoked if work_interval_should_propagate() returns
457  * true. Also expects "from" thread to be current thread and "to" thread
458  * to be locked.
459  */
void
work_interval_auto_join_propagate(thread_t from, thread_t to)
{
	assert(from == current_thread());
	/* +1 ref taken on behalf of the thread being auto-joined */
	work_interval_retain(from->th_work_interval);
	work_interval_auto_join_increment(from->th_work_interval);
	__assert_only kern_return_t kr = thread_set_work_interval(to, from->th_work_interval,
	    THREAD_WI_AUTO_JOIN_POLICY | THREAD_WI_THREAD_LOCK_HELD | THREAD_WI_THREAD_CTX_SWITCH);
	/* Auto-join with the thread lock held is expected to always succeed. */
	assert(kr == KERN_SUCCESS);
}
470 
471 /*
472  * work_interval_auto_join_unwind()
473  *
474  * Routine to un-join an auto-joined work interval for a thread that is blocking.
475  *
476  * Expects thread to be locked.
477  */
void
work_interval_auto_join_unwind(thread_t thread)
{
	/* Passing a NULL work interval un-joins under the auto-join rules. */
	__assert_only kern_return_t kr = thread_set_work_interval(thread, NULL,
	    THREAD_WI_AUTO_JOIN_POLICY | THREAD_WI_THREAD_LOCK_HELD | THREAD_WI_THREAD_CTX_SWITCH);
	assert(kr == KERN_SUCCESS);
}
485 
486 /*
487  * work_interval_auto_join_demote()
488  *
489  * Routine to un-join an auto-joined work interval when a thread is changing from
490  * realtime to non-realtime scheduling mode. This could happen due to multiple
491  * reasons such as RT failsafe, thread backgrounding or thread termination. Also,
492  * the thread being demoted may not be the current thread.
493  *
494  * Expects thread to be locked.
495  */
void
work_interval_auto_join_demote(thread_t thread)
{
	/*
	 * Un-join under the auto-join rules; unlike the unwind path this is
	 * not a context-switch, so THREAD_WI_THREAD_CTX_SWITCH is omitted.
	 */
	__assert_only kern_return_t kr = thread_set_work_interval(thread, NULL,
	    THREAD_WI_AUTO_JOIN_POLICY | THREAD_WI_THREAD_LOCK_HELD);
	assert(kr == KERN_SUCCESS);
}
503 
504 static void
work_interval_deallocate_queue_invoke(mpsc_queue_chain_t e,__assert_only mpsc_daemon_queue_t dq)505 work_interval_deallocate_queue_invoke(mpsc_queue_chain_t e,
506     __assert_only mpsc_daemon_queue_t dq)
507 {
508 	struct work_interval *work_interval = NULL;
509 	work_interval = mpsc_queue_element(e, struct work_interval, wi_deallocate_link);
510 	assert(dq == &work_interval_deallocate_queue);
511 	assert(os_ref_get_count(&work_interval->wi_ref_count) == 0);
512 	work_interval_deallocate(work_interval);
513 }
514 
515 #endif /* CONFIG_SCHED_AUTO_JOIN */
516 
517 #if CONFIG_SCHED_AUTO_JOIN
__startup_func
static void
work_interval_subsystem_init(void)
{
	/*
	 * The work interval deallocation queue must be a thread call based queue
	 * because it is woken up from contexts where the thread lock is held. The
	 * only way to perform wakeups safely in those contexts is to wakeup a
	 * thread call which is guaranteed to be on a different waitq and would
	 * not hash onto the same global waitq which might be currently locked.
	 */
	mpsc_daemon_queue_init_with_thread_call(&work_interval_deallocate_queue,
	    work_interval_deallocate_queue_invoke, THREAD_CALL_PRIORITY_KERNEL,
	    MPSC_DAEMON_INIT_NONE);
}
/* Run once thread calls are available during boot. */
STARTUP(THREAD_CALL, STARTUP_RANK_MIDDLE, work_interval_subsystem_init);
534 #endif /* CONFIG_SCHED_AUTO_JOIN */
535 
536 /*
537  * work_interval_port_convert
538  *
539  * Called with port locked, returns reference to work interval
540  * if indeed the port is a work interval kobject port
541  */
542 static struct work_interval *
work_interval_port_convert_locked(ipc_port_t port)543 work_interval_port_convert_locked(ipc_port_t port)
544 {
545 	struct work_interval *work_interval = NULL;
546 
547 	if (IP_VALID(port)) {
548 		work_interval = ipc_kobject_get_stable(port, IKOT_WORK_INTERVAL);
549 		if (work_interval) {
550 			work_interval_retain(work_interval);
551 		}
552 	}
553 
554 	return work_interval;
555 }
556 
557 /*
558  * port_name_to_work_interval
559  *
560  * Description: Obtain a reference to the work_interval associated with a given port.
561  *
562  * Parameters:  name    A Mach port name to translate.
563  *
564  * Returns:     NULL    The given Mach port did not reference a work_interval.
565  *              !NULL   The work_interval that is associated with the Mach port.
566  */
static kern_return_t
port_name_to_work_interval(mach_port_name_t     name,
    struct work_interval **work_interval)
{
	if (!MACH_PORT_VALID(name)) {
		return KERN_INVALID_NAME;
	}

	ipc_port_t port = IP_NULL;
	kern_return_t kr = KERN_SUCCESS;

	kr = ipc_port_translate_send(current_space(), name, &port);
	if (kr != KERN_SUCCESS) {
		return kr;
	}
	/* port is locked */

	assert(IP_VALID(port));

	struct work_interval *converted_work_interval;

	/* takes a +1 ref on the work interval if the conversion succeeds */
	converted_work_interval = work_interval_port_convert_locked(port);

	/* the port is valid, but doesn't denote a work_interval */
	if (converted_work_interval == NULL) {
		kr = KERN_INVALID_CAPABILITY;
	}

	/* the port lock is always dropped before returning */
	ip_mq_unlock(port);

	if (kr == KERN_SUCCESS) {
		*work_interval = converted_work_interval;
	}

	return kr;
}
603 
/*
 * kern_port_name_to_work_interval()
 *
 * Kernel-interface wrapper around port_name_to_work_interval();
 * on success *work_interval holds a +1 ref that the caller must release.
 */
kern_return_t
kern_port_name_to_work_interval(mach_port_name_t name,
    struct work_interval **work_interval)
{
	return port_name_to_work_interval(name, work_interval);
}
610 
611 /*
612  * work_interval_port_no_senders
613  *
614  * Description: Handle a no-senders notification for a work interval port.
615  *              Destroys the port and releases its reference on the work interval.
616  *
617  * Parameters:  msg     A Mach no-senders notification message.
618  *
619  * Note: This assumes that there is only one create-right-from-work-interval point,
620  *       if the ability to extract another send right after creation is added,
621  *       this will have to change to handle make-send counts correctly.
622  */
623 static void
work_interval_port_no_senders(ipc_port_t port,mach_port_mscount_t mscount)624 work_interval_port_no_senders(ipc_port_t port, mach_port_mscount_t mscount)
625 {
626 	struct work_interval *work_interval = NULL;
627 
628 	work_interval = ipc_kobject_dealloc_port(port, mscount,
629 	    IKOT_WORK_INTERVAL);
630 
631 	work_interval->wi_port = MACH_PORT_NULL;
632 
633 	work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
634 }
635 
636 /*
637  * work_interval_port_type()
638  *
639  * Converts a port name into the work interval object and returns its type.
640  *
641  * For invalid ports, it returns WORK_INTERVAL_TYPE_LAST (which is not a
642  * valid type for work intervals).
643  */
644 static uint32_t
work_interval_port_type(mach_port_name_t port_name)645 work_interval_port_type(mach_port_name_t port_name)
646 {
647 	struct work_interval *work_interval = NULL;
648 	kern_return_t kr;
649 	uint32_t work_interval_type;
650 
651 	if (port_name == MACH_PORT_NULL) {
652 		return WORK_INTERVAL_TYPE_LAST;
653 	}
654 
655 	kr = port_name_to_work_interval(port_name, &work_interval);
656 	if (kr != KERN_SUCCESS) {
657 		return WORK_INTERVAL_TYPE_LAST;
658 	}
659 	/* work_interval has a +1 ref */
660 
661 	assert(work_interval != NULL);
662 	work_interval_type = work_interval->wi_create_flags & WORK_INTERVAL_TYPE_MASK;
663 	work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
664 	return work_interval_type;
665 }
666 
667 /*
668  * Sparse - not all work interval classes imply a scheduling policy change.
669  * The REALTIME_CRITICAL class *also* requires the thread to have explicitly
670  * adopted the REALTIME sched mode to take effect.
671  */
static const struct {
	int          priority;
	sched_mode_t sched_mode;
} work_interval_class_data[WI_CLASS_COUNT] = {
	/*
	 * Classes omitted here are zero-initialized: priority 0 and
	 * TH_MODE_NONE, which work_interval_set_policy() treats as
	 * "no scheduler effect".
	 */
	[WI_CLASS_BEST_EFFORT] = {
		BASEPRI_DEFAULT,        // 31
		TH_MODE_TIMESHARE,
	},

	[WI_CLASS_APP_SUPPORT] = {
		BASEPRI_USER_INITIATED, // 37
		TH_MODE_TIMESHARE,
	},

	[WI_CLASS_SYSTEM] = {
		BASEPRI_FOREGROUND + 1, // 48
		TH_MODE_FIXED,
	},

	[WI_CLASS_SYSTEM_CRITICAL] = {
		MAXPRI_USER + 1,        // 64
		TH_MODE_FIXED,
	},

	[WI_CLASS_REALTIME_CRITICAL] = {
		BASEPRI_RTQUEUES + 1,   // 98
		TH_MODE_REALTIME,
	},
};
701 
702 /*
703  * Called when a thread gets its scheduling priority from its associated work
704  * interval.
705  */
706 int
work_interval_get_priority(thread_t thread)707 work_interval_get_priority(thread_t thread)
708 {
709 	const struct work_interval *work_interval = thread->th_work_interval;
710 	assert(work_interval != NULL);
711 
712 	assert3u(work_interval->wi_class, !=, WI_CLASS_NONE);
713 	assert3u(work_interval->wi_class, <, WI_CLASS_COUNT);
714 	int priority = work_interval_class_data[work_interval->wi_class].priority;
715 	assert(priority != 0);
716 
717 	priority += work_interval->wi_class_offset;
718 	assert3u(priority, <=, MAXPRI);
719 
720 	return priority;
721 }
722 
723 kern_return_t
kern_work_interval_get_policy(struct work_interval * work_interval,integer_t * policy,integer_t * priority)724 kern_work_interval_get_policy(struct work_interval *work_interval,
725     integer_t *policy,
726     integer_t *priority)
727 {
728 	if (!work_interval || !priority || !policy) {
729 		return KERN_INVALID_ARGUMENT;
730 	}
731 
732 	assert3u(work_interval->wi_class, <, WI_CLASS_COUNT);
733 
734 	const sched_mode_t mode = work_interval_class_data[work_interval->wi_class].sched_mode;
735 	if ((mode == TH_MODE_TIMESHARE) || (mode == TH_MODE_FIXED)) {
736 		*policy = ((mode == TH_MODE_TIMESHARE)? POLICY_TIMESHARE: POLICY_RR);
737 		*priority = work_interval_class_data[work_interval->wi_class].priority;
738 		assert(*priority != 0);
739 		*priority += work_interval->wi_class_offset;
740 		assert3u(*priority, <=, MAXPRI);
741 	} /* No sched mode change for REALTIME (threads must explicitly opt-in) */
742 	return KERN_SUCCESS;
743 }
744 
745 #if CONFIG_THREAD_GROUPS
746 kern_return_t
kern_work_interval_get_thread_group(struct work_interval * work_interval,struct thread_group ** tg)747 kern_work_interval_get_thread_group(struct work_interval *work_interval,
748     struct thread_group **tg)
749 {
750 	if (!work_interval || !tg) {
751 		return KERN_INVALID_ARGUMENT;
752 	}
753 	if (work_interval->wi_group) {
754 		*tg = thread_group_retain(work_interval->wi_group);
755 		return KERN_SUCCESS;
756 	} else {
757 		return KERN_INVALID_ARGUMENT;
758 	}
759 }
760 #endif /* CONFIG_THREAD_GROUPS */
761 
762 /*
763  * Switch to a policy driven by the work interval (if applicable).
764  */
765 static void
work_interval_set_policy(thread_t thread)766 work_interval_set_policy(thread_t thread)
767 {
768 	assert3p(thread, ==, current_thread());
769 
770 	/*
771 	 * Ignore policy changes if the workload context shouldn't affect the
772 	 * scheduling policy.
773 	 */
774 	workload_config_flags_t flags = WLC_F_NONE;
775 
776 	/* There may be no config at all. That's ok. */
777 	if (workload_config_get_flags(&flags) != KERN_SUCCESS ||
778 	    (flags & WLC_F_THREAD_POLICY) == 0) {
779 		return;
780 	}
781 
782 	const struct work_interval *work_interval = thread->th_work_interval;
783 	assert(work_interval != NULL);
784 
785 	assert3u(work_interval->wi_class, <, WI_CLASS_COUNT);
786 	const sched_mode_t mode = work_interval_class_data[work_interval->wi_class].sched_mode;
787 
788 	/*
789 	 * A mode of TH_MODE_NONE implies that this work interval has no
790 	 * associated scheduler effects.
791 	 */
792 	if (mode == TH_MODE_NONE) {
793 		return;
794 	}
795 
796 	proc_set_thread_policy_ext(thread, TASK_POLICY_ATTRIBUTE,
797 	    TASK_POLICY_WI_DRIVEN, true, mode);
798 	assert(thread->requested_policy.thrp_wi_driven);
799 
800 	return;
801 }
802 
803 /*
804  * Clear a work interval driven policy.
805  */
806 static void
work_interval_clear_policy(thread_t thread)807 work_interval_clear_policy(thread_t thread)
808 {
809 	assert3p(thread, ==, current_thread());
810 
811 	if (!thread->requested_policy.thrp_wi_driven) {
812 		return;
813 	}
814 
815 	const sched_mode_t mode = sched_get_thread_mode_user(thread);
816 
817 	proc_set_thread_policy_ext(thread, TASK_POLICY_ATTRIBUTE,
818 	    TASK_POLICY_WI_DRIVEN, false,
819 	    mode == TH_MODE_REALTIME ? mode : TH_MODE_TIMESHARE);
820 
821 	assert(!thread->requested_policy.thrp_wi_driven);
822 
823 	return;
824 }
825 
826 /*
827  * thread_set_work_interval()
828  *
829  * Change thread's bound work interval to the passed-in work interval
830  * Consumes +1 ref on work_interval upon success.
831  *
832  * May also pass NULL to un-set work_interval on the thread
833  * Will deallocate any old work interval on the thread
834  * Return error if thread does not satisfy requirements to join work interval
835  *
836  * For non auto-join work intervals, deallocate any old work interval on the thread
837  * For auto-join work intervals, the routine may wakeup the work interval deferred
838  * deallocation queue since thread locks might be currently held.
839  */
840 static kern_return_t
thread_set_work_interval(thread_t thread,struct work_interval * work_interval,thread_work_interval_options_t options)841 thread_set_work_interval(thread_t thread,
842     struct work_interval *work_interval, thread_work_interval_options_t options)
843 {
844 	/* All explicit work interval operations should always be from the current thread */
845 	if (options & THREAD_WI_EXPLICIT_JOIN_POLICY) {
846 		assert(thread == current_thread());
847 	}
848 
849 	/* All cases of needing the thread lock should be from explicit join scenarios */
850 	if (options & THREAD_WI_THREAD_LOCK_NEEDED) {
851 		assert((options & THREAD_WI_EXPLICIT_JOIN_POLICY) != 0);
852 	}
853 
854 	/* For all cases of auto join must come in with the thread lock held */
855 	if (options & THREAD_WI_AUTO_JOIN_POLICY) {
856 		assert((options & THREAD_WI_THREAD_LOCK_HELD) != 0);
857 	}
858 
859 #if CONFIG_THREAD_GROUPS
860 	if (work_interval && !work_interval->wi_group) {
861 		/* Reject join on work intervals with deferred thread group creation */
862 		return KERN_INVALID_ARGUMENT;
863 	}
864 #endif /* CONFIG_THREAD_GROUPS */
865 
866 	if (work_interval) {
867 		uint32_t work_interval_type = work_interval->wi_create_flags & WORK_INTERVAL_TYPE_MASK;
868 
869 		if (options & THREAD_WI_EXPLICIT_JOIN_POLICY) {
870 			/* Ensure no kern_work_interval_set_workload_id can happen after this point */
871 			uint32_t wlid_flags;
872 			(void)os_atomic_cmpxchgv(&work_interval->wi_wlid_flags, 0,
873 			    WORK_INTERVAL_WORKLOAD_ID_ALREADY_JOINED, &wlid_flags, relaxed);
874 			if (wlid_flags & WORK_INTERVAL_WORKLOAD_ID_RT_ALLOWED) {
875 				/* For workload IDs with rt-allowed, neuter the check below to
876 				 * enable joining before the thread has become realtime for all
877 				 * work interval types */
878 				work_interval_type = WORK_INTERVAL_TYPE_DEFAULT;
879 			}
880 		}
881 
882 		if ((work_interval_type == WORK_INTERVAL_TYPE_COREAUDIO) &&
883 		    (thread->sched_mode != TH_MODE_REALTIME) && (thread->saved_mode != TH_MODE_REALTIME)) {
884 			return KERN_INVALID_ARGUMENT;
885 		}
886 	}
887 
888 	/*
889 	 * Ensure a work interval scheduling policy is not used if the thread is
890 	 * leaving the work interval.
891 	 */
892 	if (work_interval == NULL &&
893 	    (options & THREAD_WI_EXPLICIT_JOIN_POLICY) != 0) {
894 		work_interval_clear_policy(thread);
895 	}
896 
897 	struct work_interval *old_th_wi = thread->th_work_interval;
898 #if CONFIG_SCHED_AUTO_JOIN
899 	spl_t s;
900 	/* Take the thread lock if needed */
901 	if (options & THREAD_WI_THREAD_LOCK_NEEDED) {
902 		s = splsched();
903 		thread_lock(thread);
904 	}
905 
906 	/*
907 	 * Work interval auto-join leak to non-RT threads.
908 	 *
909 	 * If thread might be running on a remote core and it's not in the context switch path (where
910 	 * thread is neither running, blocked or in the runq), its not possible to update the
911 	 * work interval & thread group remotely since its not possible to update CLPC for a remote
912 	 * core. This situation might happen when a thread is transitioning from realtime to
913 	 * non-realtime due to backgrounding etc., which would mean that non-RT threads would now
914 	 * be part of the work interval.
915 	 *
916 	 * Since there is no immediate mitigation to this issue, the policy is to set a new
917 	 * flag on the thread which indicates that such a "leak" has happened. This flag will
918 	 * be cleared when the remote thread eventually blocks and unjoins from the work interval.
919 	 */
920 	bool thread_on_remote_core = ((thread != current_thread()) && (thread->state & TH_RUN) && (thread_get_runq(thread) == PROCESSOR_NULL));
921 
922 	if (thread_on_remote_core && ((options & THREAD_WI_THREAD_CTX_SWITCH) == 0)) {
923 		assert((options & THREAD_WI_THREAD_LOCK_NEEDED) == 0);
924 		os_atomic_or(&thread->th_work_interval_flags, TH_WORK_INTERVAL_FLAGS_AUTO_JOIN_LEAK, relaxed);
925 		return KERN_SUCCESS;
926 	}
927 
928 	const bool old_wi_auto_joined = ((thread->sched_flags & TH_SFLAG_THREAD_GROUP_AUTO_JOIN) != 0);
929 
930 	if ((options & THREAD_WI_AUTO_JOIN_POLICY) || old_wi_auto_joined) {
931 		__kdebug_only uint64_t old_tg_id = (old_th_wi && old_th_wi->wi_group) ? thread_group_get_id(old_th_wi->wi_group) : ~0;
932 		__kdebug_only uint64_t new_tg_id = (work_interval && work_interval->wi_group) ? thread_group_get_id(work_interval->wi_group) : ~0;
933 		KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_WI_AUTO_JOIN),
934 		    thread_tid(thread), old_tg_id, new_tg_id, options);
935 	}
936 
937 	if (old_wi_auto_joined) {
938 		/*
939 		 * If thread was auto-joined to a work interval and is not realtime, make sure it
940 		 * happened due to the "leak" described above.
941 		 */
942 		if (thread->sched_mode != TH_MODE_REALTIME) {
943 			assert((thread->th_work_interval_flags & TH_WORK_INTERVAL_FLAGS_AUTO_JOIN_LEAK) != 0);
944 		}
945 
946 		os_atomic_andnot(&thread->th_work_interval_flags, TH_WORK_INTERVAL_FLAGS_AUTO_JOIN_LEAK, relaxed);
947 		work_interval_auto_join_decrement(old_th_wi, thread);
948 		thread->sched_flags &= ~TH_SFLAG_THREAD_GROUP_AUTO_JOIN;
949 	}
950 
951 #endif /* CONFIG_SCHED_AUTO_JOIN */
952 
953 	KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_WORKGROUP, WORKGROUP_INTERVAL_CHANGE),
954 	    thread_tid(thread), (old_th_wi ? old_th_wi->wi_id : 0), (work_interval ? work_interval->wi_id : 0), !!(options & THREAD_WI_AUTO_JOIN_POLICY));
955 
956 	/* transfer +1 ref to thread */
957 	thread->th_work_interval = work_interval;
958 
959 #if CONFIG_SCHED_AUTO_JOIN
960 
961 	if ((options & THREAD_WI_AUTO_JOIN_POLICY) && work_interval) {
962 		assert(work_interval_auto_join_enabled(work_interval) == true);
963 		thread->sched_flags |= TH_SFLAG_THREAD_GROUP_AUTO_JOIN;
964 	}
965 
966 	if (options & THREAD_WI_THREAD_LOCK_NEEDED) {
967 		thread_unlock(thread);
968 		splx(s);
969 	}
970 #endif /* CONFIG_SCHED_AUTO_JOIN */
971 
972 	/*
973 	 * The thread got a new work interval. It may come with a work interval
974 	 * scheduling policy that needs to be applied.
975 	 */
976 	if (work_interval != NULL &&
977 	    (options & THREAD_WI_EXPLICIT_JOIN_POLICY) != 0) {
978 		work_interval_set_policy(thread);
979 	}
980 
981 #if CONFIG_THREAD_GROUPS
982 	if (work_interval) {
983 		/* Prevent thread_group_set_name after CLPC may have already heard
984 		 * about the thread group */
985 		(void)os_atomic_cmpxchg(&work_interval->wi_group_flags, 0,
986 		    WORK_INTERVAL_GROUP_FLAGS_THREAD_JOINED, relaxed);
987 	}
988 	struct thread_group *new_tg = (work_interval) ? (work_interval->wi_group) : NULL;
989 
990 	if (options & THREAD_WI_AUTO_JOIN_POLICY) {
991 #if CONFIG_SCHED_AUTO_JOIN
992 		thread_set_autojoin_thread_group_locked(thread, new_tg);
993 #endif
994 	} else {
995 		thread_set_work_interval_thread_group(thread, new_tg);
996 	}
997 #endif /* CONFIG_THREAD_GROUPS */
998 
999 	if (options & THREAD_WI_EXPLICIT_JOIN_POLICY) {
1000 		/* Construct mask to XOR with th_work_interval_flags to clear the
1001 		* currently present flags and set the new flags in wlid_flags. */
1002 		uint32_t wlid_flags = 0;
1003 		if (work_interval) {
1004 			wlid_flags = os_atomic_load(&work_interval->wi_wlid_flags, relaxed);
1005 		}
1006 		thread_work_interval_flags_t th_wi_xor_mask = os_atomic_load(
1007 			&thread->th_work_interval_flags, relaxed);
1008 		th_wi_xor_mask &= (TH_WORK_INTERVAL_FLAGS_HAS_WORKLOAD_ID |
1009 		    TH_WORK_INTERVAL_FLAGS_RT_ALLOWED);
1010 		if (wlid_flags & WORK_INTERVAL_WORKLOAD_ID_HAS_ID) {
1011 			th_wi_xor_mask ^= TH_WORK_INTERVAL_FLAGS_HAS_WORKLOAD_ID;
1012 			if (wlid_flags & WORK_INTERVAL_WORKLOAD_ID_RT_ALLOWED) {
1013 				th_wi_xor_mask ^= TH_WORK_INTERVAL_FLAGS_RT_ALLOWED;
1014 			}
1015 		}
1016 		if (th_wi_xor_mask) {
1017 			os_atomic_xor(&thread->th_work_interval_flags, th_wi_xor_mask, relaxed);
1018 		}
1019 
1020 		/*
1021 		 * Now that the interval flags have been set, re-evaluate
1022 		 * whether the thread needs to be undemoted - the new work
1023 		 * interval may have the RT_ALLOWED flag. and the thread may
1024 		 * have have a realtime policy but be demoted.
1025 		 */
1026 		thread_rt_evaluate(thread);
1027 	}
1028 
1029 	if (old_th_wi != NULL) {
1030 		work_interval_release(old_th_wi, options);
1031 	}
1032 
1033 	return KERN_SUCCESS;
1034 }
1035 
1036 static kern_return_t
thread_set_work_interval_explicit_join(thread_t thread,struct work_interval * work_interval)1037 thread_set_work_interval_explicit_join(thread_t thread, struct work_interval *work_interval)
1038 {
1039 	assert(thread == current_thread());
1040 	return thread_set_work_interval(thread, work_interval, THREAD_WI_EXPLICIT_JOIN_POLICY | THREAD_WI_THREAD_LOCK_NEEDED);
1041 }
1042 
1043 kern_return_t
work_interval_thread_terminate(thread_t thread)1044 work_interval_thread_terminate(thread_t thread)
1045 {
1046 	assert(thread == current_thread());
1047 	if (thread->th_work_interval != NULL) {
1048 		return thread_set_work_interval(thread, NULL, THREAD_WI_EXPLICIT_JOIN_POLICY | THREAD_WI_THREAD_LOCK_NEEDED);
1049 	}
1050 	return KERN_SUCCESS;
1051 }
1052 
/*
 * kern_work_interval_notify()
 *
 * Forward a work interval notification (kwi_args) to the machine layer.
 * The calling thread must currently have adopted the interval identified by
 * kwi_args->work_interval_id, and only the creating task may notify.
 * Fills in kwi_args->urgency with the thread's current scheduling urgency
 * before handing the args to machine_work_interval_notify().
 */
kern_return_t
kern_work_interval_notify(thread_t thread, struct kern_work_interval_args* kwi_args)
{
	assert(thread == current_thread());
	assert(kwi_args->work_interval_id != 0);

	struct work_interval *work_interval = thread->th_work_interval;

	if (work_interval == NULL ||
	    work_interval->wi_id != kwi_args->work_interval_id) {
		/* This thread must have adopted the work interval to be able to notify */
		return KERN_INVALID_ARGUMENT;
	}

	task_t notifying_task = current_task();

	if (work_interval->wi_creator_uniqueid != get_task_uniqueid(notifying_task) ||
	    work_interval->wi_creator_pidversion != get_task_version(notifying_task)) {
		/* Only the creating task can do a notify */
		return KERN_INVALID_ARGUMENT;
	}

	/* Sample urgency at splsched so it is consistent with scheduler state. */
	spl_t s = splsched();

#if CONFIG_THREAD_GROUPS
	assert(work_interval->wi_group == thread->thread_group);
#endif /* CONFIG_THREAD_GROUPS */

	uint64_t urgency_param1, urgency_param2;
	kwi_args->urgency = (uint16_t)thread_get_urgency(thread, &urgency_param1, &urgency_param2);

	splx(s);

	/* called without interrupts disabled */
	machine_work_interval_notify(thread, kwi_args);

	return KERN_SUCCESS;
}
1091 
/* Monotonic ID generator; starts at 1 because 0 is not a valid work interval ID */
static _Atomic uint64_t unique_work_interval_id = 1;
1094 
/*
 * kern_work_interval_create()
 *
 * Allocate a new work interval according to create_params. For
 * WORK_INTERVAL_FLAG_JOINABLE, returns a send right in wica_port that other
 * threads can later join through; otherwise performs the legacy combined
 * create-and-join on the calling thread. On success, fills in wica_id (and
 * wica_port) in create_params.
 */
kern_return_t
kern_work_interval_create(thread_t thread,
    struct kern_work_interval_create_args *create_params)
{
	assert(thread == current_thread());

	uint32_t create_flags = create_params->wica_create_flags;

	if (((create_flags & WORK_INTERVAL_FLAG_JOINABLE) == 0) &&
	    thread->th_work_interval != NULL) {
		/*
		 * If the thread is doing a legacy combined create and join,
		 * it shouldn't already be part of a work interval.
		 *
		 * (Creating a joinable WI is allowed anytime.)
		 */
		return KERN_FAILURE;
	}

	/*
	 * Check the validity of the create flags before allocating the work
	 * interval.
	 */
	task_t creating_task = current_task();
	if ((create_flags & WORK_INTERVAL_TYPE_MASK) == WORK_INTERVAL_TYPE_CA_CLIENT) {
		/*
		 * CA_CLIENT work intervals do not create new thread groups.
		 * There can only be one CA_CLIENT work interval (created by UIKit or AppKit)
		 * per each application task
		 */
		if (create_flags & WORK_INTERVAL_FLAG_GROUP) {
			return KERN_FAILURE;
		}
		if (!task_is_app(creating_task)) {
#if XNU_TARGET_OS_OSX
			/*
			 * Soft-fail the case of a non-app pretending to be an
			 * app, by allowing it to press the buttons, but they're
			 * not actually connected to anything.
			 */
			create_flags |= WORK_INTERVAL_FLAG_IGNORED;
#else
			/*
			 * On iOS, it's a hard failure to get your apptype
			 * wrong and then try to render something.
			 */
			return KERN_NOT_SUPPORTED;
#endif /* XNU_TARGET_OS_OSX */
		}
		if (task_set_ca_client_wi(creating_task, true) == false) {
			return KERN_FAILURE;
		}
	}

#if CONFIG_SCHED_AUTO_JOIN
	/* Auto-join is only supported for COREAUDIO intervals with their own group. */
	if (create_flags & WORK_INTERVAL_FLAG_ENABLE_AUTO_JOIN) {
		uint32_t type = (create_flags & WORK_INTERVAL_TYPE_MASK);
		if (type != WORK_INTERVAL_TYPE_COREAUDIO) {
			return KERN_NOT_SUPPORTED;
		}
		if ((create_flags & WORK_INTERVAL_FLAG_GROUP) == 0) {
			return KERN_NOT_SUPPORTED;
		}
	}

	/* Deferred finish requires auto-join to also be enabled. */
	if (create_flags & WORK_INTERVAL_FLAG_ENABLE_DEFERRED_FINISH) {
		if ((create_flags & WORK_INTERVAL_FLAG_ENABLE_AUTO_JOIN) == 0) {
			return KERN_NOT_SUPPORTED;
		}
	}
#endif /* CONFIG_SCHED_AUTO_JOIN */

	struct work_interval *work_interval = kalloc_type(struct work_interval,
	    Z_WAITOK | Z_ZERO | Z_NOFAIL);

	uint64_t work_interval_id = os_atomic_inc(&unique_work_interval_id, relaxed);

	*work_interval = (struct work_interval) {
		.wi_id                  = work_interval_id,
		.wi_ref_count           = {},
		.wi_create_flags        = create_flags,
		.wi_creator_pid         = pid_from_task(creating_task),
		.wi_creator_uniqueid    = get_task_uniqueid(creating_task),
		.wi_creator_pidversion  = get_task_version(creating_task),
	};
	os_ref_init(&work_interval->wi_ref_count, NULL);

	if (work_interval_telemetry_data_enabled(work_interval)) {
		recount_work_interval_init(&work_interval->wi_recount);
	}

	__kdebug_only uint64_t tg_id = 0;
#if CONFIG_THREAD_GROUPS
	struct thread_group *tg;
	if ((create_flags &
	    (WORK_INTERVAL_FLAG_GROUP | WORK_INTERVAL_FLAG_HAS_WORKLOAD_ID)) ==
	    (WORK_INTERVAL_FLAG_GROUP | WORK_INTERVAL_FLAG_HAS_WORKLOAD_ID)) {
		/* defer creation of the thread group until the
		 * kern_work_interval_set_workload_id() call */
		work_interval->wi_group = NULL;
	} else if (create_flags & WORK_INTERVAL_FLAG_GROUP) {
		/* create a new group for the interval to represent */
		char name[THREAD_GROUP_MAXNAME] = "";

		snprintf(name, sizeof(name), "WI%lld (pid %d)", work_interval_id,
		    work_interval->wi_creator_pid);

		tg = thread_group_create_and_retain(THREAD_GROUP_FLAGS_DEFAULT);

		thread_group_set_name(tg, name);

		work_interval->wi_group = tg;
	} else {
		/* the interval represents the thread's home group */
		tg = thread_group_get_home_group(thread);

		thread_group_retain(tg);

		work_interval->wi_group = tg;
	}

	/* Capture the tg_id for tracing purposes */
	tg_id = work_interval->wi_group ? thread_group_get_id(work_interval->wi_group) : ~0;

#endif /* CONFIG_THREAD_GROUPS */

	if (create_flags & WORK_INTERVAL_FLAG_JOINABLE) {
		mach_port_name_t name = MACH_PORT_NULL;

		/* work_interval has a +1 ref, moves to the port */
		work_interval->wi_port = ipc_kobject_alloc_port(
			(ipc_kobject_t)work_interval, IKOT_WORK_INTERVAL,
			IPC_KOBJECT_ALLOC_MAKE_SEND | IPC_KOBJECT_ALLOC_NSREQUEST);


		name = ipc_port_copyout_send(work_interval->wi_port, current_space());

		if (!MACH_PORT_VALID(name)) {
			/*
			 * copyout failed (port is already deallocated)
			 * Because of the port-destroyed magic,
			 * the work interval is already deallocated too.
			 */
			return KERN_RESOURCE_SHORTAGE;
		}

		create_params->wica_port = name;
	} else {
		/* work_interval has a +1 ref, moves to the thread */
		kern_return_t kr = thread_set_work_interval_explicit_join(thread, work_interval);
		if (kr != KERN_SUCCESS) {
			/* No other thread can join this work interval since it isn't
			 * JOINABLE so release the reference on work interval */
			work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
			return kr;
		}

		create_params->wica_port = MACH_PORT_NULL;
	}

	create_params->wica_id = work_interval_id;

	/* Only trace when a thread group was actually associated above. */
	if (tg_id != ~0) {
		KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_WORKGROUP, WORKGROUP_INTERVAL_CREATE),
		    work_interval_id, create_flags, pid_from_task(creating_task), tg_id);
	}
	return KERN_SUCCESS;
}
1263 
1264 kern_return_t
kern_work_interval_get_flags_from_port(mach_port_name_t port_name,uint32_t * flags)1265 kern_work_interval_get_flags_from_port(mach_port_name_t port_name, uint32_t *flags)
1266 {
1267 	assert(flags != NULL);
1268 
1269 	kern_return_t kr;
1270 	struct work_interval *work_interval;
1271 
1272 	kr = port_name_to_work_interval(port_name, &work_interval);
1273 	if (kr != KERN_SUCCESS) {
1274 		return kr;
1275 	}
1276 
1277 	assert(work_interval != NULL);
1278 	*flags = work_interval->wi_create_flags;
1279 
1280 	work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
1281 
1282 	return KERN_SUCCESS;
1283 }
1284 
#if CONFIG_THREAD_GROUPS
/* Work interval names are copied into thread group names, so the limits must agree. */
_Static_assert(WORK_INTERVAL_NAME_MAX == THREAD_GROUP_MAXNAME,
    "WORK_INTERVAL_NAME_MAX does not match THREAD_GROUP_MAXNAME");
#endif /* CONFIG_THREAD_GROUPS */
1289 
/*
 * kern_work_interval_set_name()
 *
 * Rename the thread group backing the work interval behind port_name by
 * prefixing the supplied name with the interval ID. Fails once any thread
 * has joined the interval or when the interval has no thread group.
 * `name` is only read under CONFIG_THREAD_GROUPS, hence the __unused
 * annotation for configurations without thread groups.
 */
kern_return_t
kern_work_interval_set_name(mach_port_name_t port_name, __unused char *name,
    size_t len)
{
	kern_return_t kr;
	struct work_interval *work_interval;

	if (len > WORK_INTERVAL_NAME_MAX) {
		return KERN_INVALID_ARGUMENT;
	}
	kr = port_name_to_work_interval(port_name, &work_interval);
	if (kr != KERN_SUCCESS) {
		return kr;
	}

	assert(work_interval != NULL);

#if CONFIG_THREAD_GROUPS
	/* Renaming is forbidden after the first join (CLPC may know the group). */
	uint32_t wi_group_flags = os_atomic_load(
		&work_interval->wi_group_flags, relaxed);
	if (wi_group_flags & WORK_INTERVAL_GROUP_FLAGS_THREAD_JOINED) {
		kr = KERN_INVALID_ARGUMENT;
		goto out;
	}
	if (!work_interval->wi_group) {
		kr = KERN_INVALID_ARGUMENT;
		goto out;
	}

	/* Only intervals with their own group (not the home group) get renamed. */
	if (name[0] && (work_interval->wi_create_flags & WORK_INTERVAL_FLAG_GROUP)) {
		char tgname[THREAD_GROUP_MAXNAME];
		snprintf(tgname, sizeof(tgname), "WI%lld %s", work_interval->wi_id,
		    name);
		thread_group_set_name(work_interval->wi_group, tgname);
	}

out:
#endif /* CONFIG_THREAD_GROUPS */
	work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);

	return kr;
}
1332 
/*
 * kern_work_interval_set_workload_id()
 *
 * Attach a workload ID (by name) to the work interval behind port_name.
 * When the name resolves in the workload config table, the interval inherits
 * the configured flags and class, and — if thread group creation was
 * deferred at create time — the group is created here with the configured
 * flags. The ID can only be set once and only before any thread joins; a
 * second call is treated as a query and returns the existing flags in
 * workload_id_args->wlida_flags.
 */
kern_return_t
kern_work_interval_set_workload_id(mach_port_name_t port_name,
    struct kern_work_interval_workload_id_args *workload_id_args)
{
	kern_return_t kr;
	struct work_interval *work_interval;
	uint32_t wlida_flags = 0;
	uint32_t wlid_flags = 0;
#if CONFIG_THREAD_GROUPS
	uint32_t tg_flags = 0;
#endif
	bool from_workload_config = false;

	/* Ensure workload ID name is non-empty. */
	if (!workload_id_args->wlida_name[0]) {
		return KERN_INVALID_ARGUMENT;
	}

	kr = port_name_to_work_interval(port_name, &work_interval);
	if (kr != KERN_SUCCESS) {
		return kr;
	}

	assert(work_interval != NULL);
	if (!(work_interval->wi_create_flags & WORK_INTERVAL_FLAG_JOINABLE)) {
		kr = KERN_INVALID_ARGUMENT;
		goto out;
	}

	if (!(work_interval->wi_create_flags & WORK_INTERVAL_FLAG_HAS_WORKLOAD_ID)) {
		/* Reject work intervals that didn't indicate they will have a workload ID
		 * at creation. In particular if the work interval has its own thread group,
		 * its creation must have been deferred in kern_work_interval_create */
		kr = KERN_INVALID_ARGUMENT;
		goto out;
	}

	workload_config_t wl_config = {};
	kr = workload_config_lookup_default(workload_id_args->wlida_name, &wl_config);
	if (kr == KERN_SUCCESS) {
		/* The configured type must match the interval's create-time type. */
		if ((wl_config.wc_create_flags & WORK_INTERVAL_TYPE_MASK) !=
		    (work_interval->wi_create_flags & WORK_INTERVAL_TYPE_MASK)) {
			if ((wl_config.wc_create_flags & WORK_INTERVAL_TYPE_MASK) == WORK_INTERVAL_TYPE_CA_RENDER_SERVER &&
			    (work_interval->wi_create_flags & WORK_INTERVAL_TYPE_MASK) == WORK_INTERVAL_TYPE_FRAME_COMPOSITOR) {
				/* WORK_INTERVAL_TYPE_FRAME_COMPOSITOR is a valid related type of WORK_INTERVAL_TYPE_CA_RENDER_SERVER */
			} else {
				kr = KERN_INVALID_ARGUMENT;
				goto out;
			}
		}

		wlida_flags = wl_config.wc_flags;

#if !defined(XNU_TARGET_OS_XR)
		/* RT-critical workloads are only honored on XR targets. */
		wlida_flags &= ~WORK_INTERVAL_WORKLOAD_ID_RT_CRITICAL;
#endif /* !XNU_TARGET_OS_XR */

#if CONFIG_THREAD_GROUPS
		/* Configured thread group flags require the interval to own a group. */
		tg_flags = wl_config.wc_thread_group_flags;
		if (tg_flags != THREAD_GROUP_FLAGS_ABSENT &&
		    (work_interval->wi_create_flags & WORK_INTERVAL_FLAG_GROUP) == 0) {
			kr = KERN_INVALID_ARGUMENT;
			goto out;
		}
#endif /* CONFIG_THREAD_GROUPS */

		from_workload_config = true;
	} else {
		/* If the workload is not present in the table, perform basic validation
		 * that the create flags passed in match the ones used at work interval
		 * create time */
		if ((workload_id_args->wlida_wicreate_flags & WORK_INTERVAL_TYPE_MASK) !=
		    (work_interval->wi_create_flags & WORK_INTERVAL_TYPE_MASK)) {
			kr = KERN_INVALID_ARGUMENT;
			goto out;
		}

		const bool wc_avail = workload_config_available();
		if (!wc_avail) {
			wlida_flags = WORK_INTERVAL_WORKLOAD_ID_RT_ALLOWED;
		}

		/*
		 * If the workload config wasn't even loaded then fallback to
		 * older behaviour where the new thread group gets the default
		 * thread group flags (when WORK_INTERVAL_FLAG_GROUP is set).
		 */
#if CONFIG_THREAD_GROUPS
		if (!wc_avail) {
			tg_flags = THREAD_GROUP_FLAGS_DEFAULT;
		} else {
			struct thread_group *home_group =
			    thread_group_get_home_group(current_thread());
			if (home_group != NULL) {
				tg_flags = thread_group_get_flags(home_group);
			}
		}
#endif /* CONFIG_THREAD_GROUPS */
	}

	workload_id_args->wlida_wicreate_flags = work_interval->wi_create_flags;

	/* cmpxchg a non-zero workload ID flags value (indicating that workload ID
	 * has been set). */
	wlida_flags |= WORK_INTERVAL_WORKLOAD_ID_HAS_ID;
	if (os_atomic_cmpxchgv(&work_interval->wi_wlid_flags, 0, wlida_flags,
	    &wlid_flags, relaxed)) {
		/* We won the race to set the ID: publish class and (maybe) the group. */
		if (from_workload_config) {
			work_interval->wi_class = wl_config.wc_class;
			work_interval->wi_class_offset = wl_config.wc_class_offset;
		}
#if CONFIG_THREAD_GROUPS
		if (work_interval->wi_create_flags & WORK_INTERVAL_FLAG_GROUP) {
			/* Perform deferred thread group creation, now that tgflags are known */
			struct thread_group *tg;
			tg = thread_group_create_and_retain(tg_flags == THREAD_GROUP_FLAGS_ABSENT ?
			    THREAD_GROUP_FLAGS_DEFAULT : tg_flags);

			char tgname[THREAD_GROUP_MAXNAME] = "";
			snprintf(tgname, sizeof(tgname), "WI%lld %s", work_interval->wi_id,
			    workload_id_args->wlida_name);
			thread_group_set_name(tg, tgname);

			assert(work_interval->wi_group == NULL);
			work_interval->wi_group = tg;
			KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_WORKGROUP, WORKGROUP_INTERVAL_CREATE),
			    work_interval->wi_id, work_interval->wi_create_flags,
			    work_interval->wi_creator_pid, thread_group_get_id(tg));
		}
#endif /* CONFIG_THREAD_GROUPS */
	} else {
		/* Workload ID has previously been set (or a thread has already joined). */
		if (wlid_flags & WORK_INTERVAL_WORKLOAD_ID_ALREADY_JOINED) {
			kr = KERN_INVALID_ARGUMENT;
			goto out;
		}
		/* Treat this request as a query for the out parameters of the ID */
		workload_id_args->wlida_flags = wlid_flags;
	}

	/*
	 * Emit tracepoints for successfully setting the workload ID.
	 *
	 * After rdar://89342390 has been fixed and a new work interval ktrace
	 * provider has been added, it will be possible to associate a numeric
	 * ID with an ID name. Thus, for those cases where the ID name has been
	 * looked up successfully (`from_workload_config` is true) it will no
	 * longer be necessary to emit a tracepoint with the full ID name.
	 */
	KDBG(MACHDBG_CODE(DBG_MACH_WORKGROUP, WORKGROUP_INTERVAL_SET_WORKLOAD_ID),
	    work_interval->wi_id, from_workload_config);
	kernel_debug_string_simple(
		MACHDBG_CODE(DBG_MACH_WORKGROUP, WORKGROUP_INTERVAL_SET_WORKLOAD_ID_NAME),
		workload_id_args->wlida_name);

	kr = KERN_SUCCESS;

out:
	work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);

	return kr;
}
1495 
1496 
1497 kern_return_t
kern_work_interval_destroy(thread_t thread,uint64_t work_interval_id)1498 kern_work_interval_destroy(thread_t thread, uint64_t work_interval_id)
1499 {
1500 	if (work_interval_id == 0) {
1501 		return KERN_INVALID_ARGUMENT;
1502 	}
1503 
1504 	if (thread->th_work_interval == NULL ||
1505 	    thread->th_work_interval->wi_id != work_interval_id) {
1506 		/* work ID isn't valid or doesn't match joined work interval ID */
1507 		return KERN_INVALID_ARGUMENT;
1508 	}
1509 
1510 	return thread_set_work_interval_explicit_join(thread, NULL);
1511 }
1512 
1513 kern_return_t
kern_work_interval_join(thread_t thread,mach_port_name_t port_name)1514 kern_work_interval_join(thread_t            thread,
1515     mach_port_name_t    port_name)
1516 {
1517 	struct work_interval *work_interval = NULL;
1518 	kern_return_t kr;
1519 
1520 	if (port_name == MACH_PORT_NULL) {
1521 		/* 'Un-join' the current work interval */
1522 		return thread_set_work_interval_explicit_join(thread, NULL);
1523 	}
1524 
1525 	kr = port_name_to_work_interval(port_name, &work_interval);
1526 	if (kr != KERN_SUCCESS) {
1527 		return kr;
1528 	}
1529 	/* work_interval has a +1 ref */
1530 
1531 	assert(work_interval != NULL);
1532 
1533 	kr = thread_set_work_interval_explicit_join(thread, work_interval);
1534 	/* ref was consumed by passing it to the thread in the successful case */
1535 	if (kr != KERN_SUCCESS) {
1536 		work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
1537 	}
1538 	return kr;
1539 }
1540 
1541 kern_return_t
kern_work_interval_explicit_join(thread_t thread,struct work_interval * work_interval)1542 kern_work_interval_explicit_join(thread_t thread,
1543     struct work_interval *work_interval)
1544 {
1545 	kern_return_t kr;
1546 	assert(thread == current_thread());
1547 	assert(work_interval != NULL);
1548 
1549 	/*
1550 	 * We take +1 ref on the work interval which is consumed by passing it
1551 	 * on to the thread below in the successful case.
1552 	 */
1553 	work_interval_retain(work_interval);
1554 
1555 	kr = thread_set_work_interval_explicit_join(thread, work_interval);
1556 	if (kr != KERN_SUCCESS) {
1557 		work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
1558 	}
1559 	return kr;
1560 }
1561 
1562 /*
1563  * work_interval_port_type_render_server()
1564  *
1565  * Helper routine to determine if the port points to a
1566  * WORK_INTERVAL_TYPE_CA_RENDER_SERVER work interval.
1567  */
1568 bool
work_interval_port_type_render_server(mach_port_name_t port_name)1569 work_interval_port_type_render_server(mach_port_name_t port_name)
1570 {
1571 	return work_interval_port_type(port_name) == WORK_INTERVAL_TYPE_CA_RENDER_SERVER;
1572 }
1573