/*
 * Copyright (c) 2017 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */


#include <sys/work_interval.h>

#include <kern/work_interval.h>

#include <kern/thread.h>
#include <kern/sched_prim.h>
#include <kern/machine.h>
#include <kern/thread_group.h>
#include <kern/ipc_kobject.h>
#include <kern/task.h>
#include <kern/coalition.h>
#include <kern/policy_internal.h>
#include <kern/mpsc_queue.h>
#include <kern/workload_config.h>
#include <kern/assert.h>

#include <mach/kern_return.h>
#include <mach/notify.h>
#include <os/refcnt.h>

#include <stdatomic.h>

/*
 * With the introduction of auto-join work intervals, it is possible
 * to change the work interval (and related thread group) of a thread in a
 * variety of contexts (thread termination, context switch, thread mode
 * change etc.). In order to clearly specify the policy expectation and
 * the locking behavior, all calls to thread_set_work_interval() pass
 * in a set of flags.
 */

__options_decl(thread_work_interval_options_t, uint32_t, {
	/* Change the work interval using the explicit join rules */
	THREAD_WI_EXPLICIT_JOIN_POLICY = 0x1,
	/* Change the work interval using the auto-join rules */
	THREAD_WI_AUTO_JOIN_POLICY     = 0x2,
	/* Caller already holds the thread lock */
	THREAD_WI_THREAD_LOCK_HELD     = 0x4,
	/* Caller does not hold the thread lock */
	THREAD_WI_THREAD_LOCK_NEEDED   = 0x8,
	/* Change the work interval from the context switch path (thread may not be running or on a runq) */
	THREAD_WI_THREAD_CTX_SWITCH    = 0x10,
});
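
/*
 * Illustrative sketch (comment only, not part of the build): the two flag
 * combinations used by callers in this file. Explicit joins come from the
 * current thread without the thread lock held; auto-joins originate in the
 * scheduler with the thread lock already held.
 *
 *	// Explicit join, e.g. from thread_set_work_interval_explicit_join():
 *	thread_set_work_interval(thread, work_interval,
 *	    THREAD_WI_EXPLICIT_JOIN_POLICY | THREAD_WI_THREAD_LOCK_NEEDED);
 *
 *	// Auto-join from the context-switch path, e.g. from
 *	// work_interval_auto_join_propagate():
 *	thread_set_work_interval(to, from->th_work_interval,
 *	    THREAD_WI_AUTO_JOIN_POLICY | THREAD_WI_THREAD_LOCK_HELD |
 *	    THREAD_WI_THREAD_CTX_SWITCH);
 */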

static kern_return_t thread_set_work_interval(thread_t, struct work_interval *, thread_work_interval_options_t);
static void work_interval_port_no_senders(ipc_port_t, mach_port_mscount_t);

IPC_KOBJECT_DEFINE(IKOT_WORK_INTERVAL,
    .iko_op_stable     = true,
    .iko_op_no_senders = work_interval_port_no_senders);

#if CONFIG_SCHED_AUTO_JOIN
/* MPSC queue used to defer deallocation of work intervals */
static struct mpsc_daemon_queue work_interval_deallocate_queue;

static void work_interval_deferred_release(struct work_interval *);

/*
 * Work Interval Auto-Join Status
 *
 * work_interval_auto_join_status_t represents the state of auto-join for a given work interval.
 * It packs the following information:
 * - A bit representing if a "finish" is deferred on the work interval
 * - Count of number of threads auto-joined to the work interval
 */
#define WORK_INTERVAL_STATUS_DEFERRED_FINISH_MASK    ((uint32_t)(1 << 31))
#define WORK_INTERVAL_STATUS_AUTO_JOIN_COUNT_MASK    ((uint32_t)(WORK_INTERVAL_STATUS_DEFERRED_FINISH_MASK - 1))
#define WORK_INTERVAL_STATUS_AUTO_JOIN_COUNT_MAX     WORK_INTERVAL_STATUS_AUTO_JOIN_COUNT_MASK
typedef uint32_t work_interval_auto_join_status_t;

static inline bool __unused
work_interval_status_deferred_finish(work_interval_auto_join_status_t status)
{
	return (status & WORK_INTERVAL_STATUS_DEFERRED_FINISH_MASK) ? true : false;
}

static inline uint32_t __unused
work_interval_status_auto_join_count(work_interval_auto_join_status_t status)
{
	return (uint32_t)(status & WORK_INTERVAL_STATUS_AUTO_JOIN_COUNT_MASK);
}
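
/*
 * Worked example of the status packing (comment only, not part of the
 * build): bit 31 carries the deferred-finish flag and bits 0..30 carry
 * the auto-join count, so a status of 0x80000003 decodes as "finish
 * deferred, 3 threads auto-joined".
 *
 *	work_interval_auto_join_status_t status = 0x80000003;
 *	assert(work_interval_status_deferred_finish(status) == true);
 *	assert(work_interval_status_auto_join_count(status) == 3);
 */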

/*
 * struct work_interval_deferred_finish_state
 *
 * Contains the parameters of the finish operation which is being deferred.
 */
struct work_interval_deferred_finish_state {
	uint64_t instance_id;
	uint64_t start;
	uint64_t deadline;
	uint64_t complexity;
};

struct work_interval_auto_join_info {
	struct work_interval_deferred_finish_state deferred_finish_state;
	work_interval_auto_join_status_t _Atomic status;
};
#endif /* CONFIG_SCHED_AUTO_JOIN */

#if CONFIG_THREAD_GROUPS
/* Flags atomically set in wi_group_flags */
#define WORK_INTERVAL_GROUP_FLAGS_THREAD_JOINED 0x1
#endif

/*
 * Work Interval struct
 *
 * This struct represents a thread group and/or work interval context
 * in a mechanism that is represented with a kobject.
 *
 * Every thread that has joined a WI has a +1 ref, and the port
 * has a +1 ref as well.
 *
 * TODO: groups need to have a 'is for WI' flag
 *      and they need a flag to create that says 'for WI'
 *      This would allow CLPC to avoid allocating WI support
 *      data unless it is needed
 *
 * TODO: Enforce not having more than one non-group joinable work
 *      interval per thread group.
 *      CLPC only wants to see one WI-notify callout per group.
 */
struct work_interval {
	uint64_t wi_id;
	struct os_refcnt wi_ref_count;
	uint32_t wi_create_flags;

	/* for debugging purposes only, does not hold a ref on port */
	ipc_port_t wi_port;

	/*
	 * holds uniqueid and version of creating process,
	 * used to permission-gate notify
	 * TODO: you'd think there would be a better way to do this
	 */
	uint64_t wi_creator_uniqueid;
	uint32_t wi_creator_pid;
	int wi_creator_pidversion;

	/* flags set by work_interval_set_workload_id and reflected onto
	 *  thread->th_work_interval_flags upon join */
	uint32_t wi_wlid_flags;

#if CONFIG_THREAD_GROUPS
	uint32_t wi_group_flags;
	struct thread_group *wi_group;  /* holds +1 ref on group */
#endif /* CONFIG_THREAD_GROUPS */

#if CONFIG_SCHED_AUTO_JOIN
	/* Information related to auto-join and deferred finish for work interval */
	struct work_interval_auto_join_info wi_auto_join_info;

	/*
	 * Since the deallocation of auto-join work intervals
	 * can happen in the scheduler when the last thread in
	 * the WI blocks and the thread lock is held, the deallocation
	 * might have to be done on a separate thread.
	 */
	struct mpsc_queue_chain   wi_deallocate_link;
#endif /* CONFIG_SCHED_AUTO_JOIN */

	/*
	 * Work interval class info - determines thread priority for threads
	 * with a work interval driven policy.
	 */
	wi_class_t wi_class;
	uint8_t wi_class_offset;

	struct recount_work_interval wi_recount;
};
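
/*
 * Reference-count lifecycle, summarized from the code below (illustrative
 * sketch, comment only):
 *
 *	// create: os_ref_init() provides +1, which moves either to the
 *	// port (JOINABLE case) or to the creating thread (legacy
 *	// create-and-join case)
 *	// join:   each joining thread takes its own +1
 *	work_interval_retain(work_interval);
 *	// unjoin / no-senders: drop a ref; the final release frees the
 *	// object, possibly via the deferred MPSC queue when the thread
 *	// lock is held
 *	work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
 */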

/*
 * work_interval_telemetry_data_enabled()
 *
 * Helper routine to check if work interval has the collection of telemetry data enabled.
 */
static inline bool
work_interval_telemetry_data_enabled(struct work_interval *work_interval)
{
	return (work_interval->wi_create_flags & WORK_INTERVAL_FLAG_ENABLE_TELEMETRY_DATA) != 0;
}


/*
 * work_interval_get_recount_tracks()
 *
 * Returns the recount tracks associated with a work interval, or NULL
 * if the work interval is NULL or has telemetry disabled.
 */
inline struct recount_track *
work_interval_get_recount_tracks(struct work_interval *work_interval)
{
	if (work_interval != NULL && work_interval_telemetry_data_enabled(work_interval)) {
		return work_interval->wi_recount.rwi_current_instance;
	}
	return NULL;
}

#if CONFIG_SCHED_AUTO_JOIN

/*
 * work_interval_perform_deferred_finish()
 *
 * Perform a deferred finish for a work interval. The routine accepts the deferred_finish_state as an
 * argument rather than looking at the work_interval since the deferred finish can race with another
 * start-finish cycle. To address that, the caller ensures that it gets a consistent snapshot of the
 * deferred state before calling this routine. This allows the racing start-finish cycle to overwrite
 * the deferred state without issues.
 */
static inline void
work_interval_perform_deferred_finish(__unused struct work_interval_deferred_finish_state *deferred_finish_state,
    __unused struct work_interval *work_interval, __unused thread_t thread)
{

	KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_WI_DEFERRED_FINISH),
	    thread_tid(thread), thread_group_get_id(work_interval->wi_group));
}

/*
 * work_interval_auto_join_increment()
 *
 * Routine to increment the auto-join counter when a new thread is auto-joined to
 * the work interval.
 */
static void
work_interval_auto_join_increment(struct work_interval *work_interval)
{
	struct work_interval_auto_join_info *join_info = &work_interval->wi_auto_join_info;
	__assert_only work_interval_auto_join_status_t old_status = os_atomic_add_orig(&join_info->status, 1, relaxed);
	assert(work_interval_status_auto_join_count(old_status) < WORK_INTERVAL_STATUS_AUTO_JOIN_COUNT_MAX);
}

/*
 * work_interval_auto_join_decrement()
 *
 * Routine to decrement the auto-join counter when a thread unjoins the work interval (due to
 * blocking or termination). If this was the last auto-joined thread in the work interval and
 * there was a deferred finish, performs the finish operation for the work interval.
 */
static void
work_interval_auto_join_decrement(struct work_interval *work_interval, thread_t thread)
{
	struct work_interval_auto_join_info *join_info = &work_interval->wi_auto_join_info;
	work_interval_auto_join_status_t old_status, new_status;
	struct work_interval_deferred_finish_state deferred_finish_state;
	bool perform_finish;

	/* Update the auto-join count for the work interval atomically */
	os_atomic_rmw_loop(&join_info->status, old_status, new_status, acquire, {
		perform_finish = false;
		new_status = old_status;
		assert(work_interval_status_auto_join_count(old_status) > 0);
		new_status -= 1;
		if (new_status == WORK_INTERVAL_STATUS_DEFERRED_FINISH_MASK) {
		        /* No auto-joined threads remaining and finish is deferred */
		        new_status = 0;
		        perform_finish = true;
		        /*
		         * It's important to copy the deferred finish state here so that this works
		         * when racing with another start-finish cycle.
		         */
		        deferred_finish_state = join_info->deferred_finish_state;
		}
	});

	if (perform_finish == true) {
		/*
		 * Since work_interval_perform_deferred_finish() calls down to
		 * the machine layer callout for finish which gets the thread
		 * group from the thread passed in here, it is important to
		 * make sure that the thread still has the work interval thread
		 * group here.
		 */
		assert(thread->thread_group == work_interval->wi_group);
		work_interval_perform_deferred_finish(&deferred_finish_state, work_interval, thread);
	}
}
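
/*
 * Worked example for the loop above (comment only): with two threads
 * auto-joined and a finish deferred, status starts as
 * (WORK_INTERVAL_STATUS_DEFERRED_FINISH_MASK | 2). The first decrement
 * yields (MASK | 1) and does nothing else. The second yields exactly
 * WORK_INTERVAL_STATUS_DEFERRED_FINISH_MASK, so the loop rewrites the
 * status to 0, snapshots deferred_finish_state inside the loop (keeping
 * it consistent against a racing start overwriting it), and the deferred
 * finish is performed on the way out.
 */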

/*
 * work_interval_auto_join_enabled()
 *
 * Helper routine to check if work interval has auto-join enabled.
 */
static inline bool
work_interval_auto_join_enabled(struct work_interval *work_interval)
{
	return (work_interval->wi_create_flags & WORK_INTERVAL_FLAG_ENABLE_AUTO_JOIN) != 0;
}

/*
 * work_interval_deferred_finish_enabled()
 *
 * Helper routine to check if work interval has deferred finish enabled.
 */
static inline bool __unused
work_interval_deferred_finish_enabled(struct work_interval *work_interval)
{
	return (work_interval->wi_create_flags & WORK_INTERVAL_FLAG_ENABLE_DEFERRED_FINISH) != 0;
}

#endif /* CONFIG_SCHED_AUTO_JOIN */

static inline void
work_interval_retain(struct work_interval *work_interval)
{
	/*
	 * Even though wi_retain is called under a port lock, we have
	 * to use os_ref_retain instead of os_ref_retain_locked
	 * because wi_release is not synchronized. wi_release calls
	 * os_ref_release which is unsafe to pair with os_ref_retain_locked.
	 */
	os_ref_retain(&work_interval->wi_ref_count);
}

static inline void
work_interval_deallocate(struct work_interval *work_interval)
{
	KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_WORKGROUP, WORKGROUP_INTERVAL_DESTROY),
	    work_interval->wi_id);
	if (work_interval_telemetry_data_enabled(work_interval)) {
		recount_work_interval_deinit(&work_interval->wi_recount);
	}
	kfree_type(struct work_interval, work_interval);
}

/*
 * work_interval_release()
 *
 * Routine to release a ref count on the work interval. If the refcount goes down
 * to zero, the work interval needs to be de-allocated.
 *
 * For non auto-join work intervals, they are de-allocated in this context.
 *
 * For auto-join work intervals, the de-allocation cannot be done from this context
 * since that might need the kernel memory allocator lock. In that case, the
 * deallocation is done via a thread-call based mpsc queue.
 */
static void
work_interval_release(struct work_interval *work_interval, __unused thread_work_interval_options_t options)
{
	if (os_ref_release(&work_interval->wi_ref_count) == 0) {
#if CONFIG_SCHED_AUTO_JOIN
		if (options & THREAD_WI_THREAD_LOCK_HELD) {
			work_interval_deferred_release(work_interval);
		} else {
			work_interval_deallocate(work_interval);
		}
#else /* CONFIG_SCHED_AUTO_JOIN */
		work_interval_deallocate(work_interval);
#endif /* CONFIG_SCHED_AUTO_JOIN */
	}
}

#if CONFIG_SCHED_AUTO_JOIN

/*
 * work_interval_deferred_release()
 *
 * Routine to enqueue the work interval on the deallocation mpsc queue.
 */
static void
work_interval_deferred_release(struct work_interval *work_interval)
{
	mpsc_daemon_enqueue(&work_interval_deallocate_queue,
	    &work_interval->wi_deallocate_link, MPSC_QUEUE_NONE);
}

/*
 * work_interval_should_propagate()
 *
 * Main policy routine to decide if a thread should be auto-joined to
 * another thread's work interval. The conditions are arranged such that
 * the most common bailout conditions are checked earliest. This routine
 * is called from the scheduler context, so it needs to be efficient and
 * be careful when taking locks or performing wakeups.
 */
inline bool
work_interval_should_propagate(thread_t cthread, thread_t thread)
{
	/* Only allow propagation if the current thread has a work interval and the woken up thread does not */
	if ((cthread->th_work_interval == NULL) || (thread->th_work_interval != NULL)) {
		return false;
	}

	/* Only propagate work intervals which have auto-join enabled */
	if (work_interval_auto_join_enabled(cthread->th_work_interval) == false) {
		return false;
	}

	/* Work interval propagation is enabled for realtime threads only */
	if ((cthread->sched_mode != TH_MODE_REALTIME) || (thread->sched_mode != TH_MODE_REALTIME)) {
		return false;
	}


	/* Work interval propagation only works for threads with the same home thread group */
	struct thread_group *thread_home_tg = thread_group_get_home_group(thread);
	if (thread_group_get_home_group(cthread) != thread_home_tg) {
		return false;
	}

	/* If the woken-up thread has adopted vouchers and other thread groups, it does not get propagation */
	if (thread->thread_group != thread_home_tg) {
		return false;
	}

	/* If either thread is inactive (in the termination path), do not propagate auto-join */
	if ((!cthread->active) || (!thread->active)) {
		return false;
	}

	return true;
}

/*
 * work_interval_auto_join_propagate()
 *
 * Routine to auto-join a thread into another thread's work interval
 *
 * Should only be invoked if work_interval_should_propagate() returns
 * true. Also expects "from" thread to be current thread and "to" thread
 * to be locked.
 */
void
work_interval_auto_join_propagate(thread_t from, thread_t to)
{
	assert(from == current_thread());
	work_interval_retain(from->th_work_interval);
	work_interval_auto_join_increment(from->th_work_interval);
	__assert_only kern_return_t kr = thread_set_work_interval(to, from->th_work_interval,
	    THREAD_WI_AUTO_JOIN_POLICY | THREAD_WI_THREAD_LOCK_HELD | THREAD_WI_THREAD_CTX_SWITCH);
	assert(kr == KERN_SUCCESS);
}
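
/*
 * Expected call pattern (illustrative sketch; the actual call site lives
 * in the scheduler's wakeup path, outside this file):
 *
 *	thread_t cthread = current_thread();
 *	// "to" thread is locked at this point
 *	if (work_interval_should_propagate(cthread, to)) {
 *		work_interval_auto_join_propagate(cthread, to);
 *	}
 */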

/*
 * work_interval_auto_join_unwind()
 *
 * Routine to un-join an auto-joined work interval for a thread that is blocking.
 *
 * Expects thread to be locked.
 */
void
work_interval_auto_join_unwind(thread_t thread)
{
	__assert_only kern_return_t kr = thread_set_work_interval(thread, NULL,
	    THREAD_WI_AUTO_JOIN_POLICY | THREAD_WI_THREAD_LOCK_HELD | THREAD_WI_THREAD_CTX_SWITCH);
	assert(kr == KERN_SUCCESS);
}

/*
 * work_interval_auto_join_demote()
 *
 * Routine to un-join an auto-joined work interval when a thread is changing from
 * realtime to non-realtime scheduling mode. This could happen due to multiple
 * reasons such as RT failsafe, thread backgrounding or thread termination. Also,
 * the thread being demoted may not be the current thread.
 *
 * Expects thread to be locked.
 */
void
work_interval_auto_join_demote(thread_t thread)
{
	__assert_only kern_return_t kr = thread_set_work_interval(thread, NULL,
	    THREAD_WI_AUTO_JOIN_POLICY | THREAD_WI_THREAD_LOCK_HELD);
	assert(kr == KERN_SUCCESS);
}

static void
work_interval_deallocate_queue_invoke(mpsc_queue_chain_t e,
    __assert_only mpsc_daemon_queue_t dq)
{
	struct work_interval *work_interval = NULL;
	work_interval = mpsc_queue_element(e, struct work_interval, wi_deallocate_link);
	assert(dq == &work_interval_deallocate_queue);
	assert(os_ref_get_count(&work_interval->wi_ref_count) == 0);
	work_interval_deallocate(work_interval);
}

#endif /* CONFIG_SCHED_AUTO_JOIN */

#if CONFIG_SCHED_AUTO_JOIN
__startup_func
static void
work_interval_subsystem_init(void)
{
	/*
	 * The work interval deallocation queue must be a thread call based queue
	 * because it is woken up from contexts where the thread lock is held. The
	 * only way to perform wakeups safely in those contexts is to wakeup a
	 * thread call which is guaranteed to be on a different waitq and would
	 * not hash onto the same global waitq which might be currently locked.
	 */
	mpsc_daemon_queue_init_with_thread_call(&work_interval_deallocate_queue,
	    work_interval_deallocate_queue_invoke, THREAD_CALL_PRIORITY_KERNEL,
	    MPSC_DAEMON_INIT_NONE);
}
STARTUP(THREAD_CALL, STARTUP_RANK_MIDDLE, work_interval_subsystem_init);
#endif /* CONFIG_SCHED_AUTO_JOIN */

/*
 * work_interval_port_convert
 *
 * Called with port locked, returns reference to work interval
 * if indeed the port is a work interval kobject port
 */
static struct work_interval *
work_interval_port_convert_locked(ipc_port_t port)
{
	struct work_interval *work_interval = NULL;

	if (IP_VALID(port)) {
		work_interval = ipc_kobject_get_stable(port, IKOT_WORK_INTERVAL);
		if (work_interval) {
			work_interval_retain(work_interval);
		}
	}

	return work_interval;
}

/*
 * port_name_to_work_interval
 *
 * Description: Obtain a reference to the work_interval associated with a given port.
 *
 * Parameters:  name    A Mach port name to translate.
 *
 * Returns:     NULL    The given Mach port did not reference a work_interval.
 *              !NULL   The work_interval that is associated with the Mach port.
 */
static kern_return_t
port_name_to_work_interval(mach_port_name_t     name,
    struct work_interval **work_interval)
{
	if (!MACH_PORT_VALID(name)) {
		return KERN_INVALID_NAME;
	}

	ipc_port_t port = IP_NULL;
	kern_return_t kr = KERN_SUCCESS;

	kr = ipc_port_translate_send(current_space(), name, &port);
	if (kr != KERN_SUCCESS) {
		return kr;
	}
	/* port is locked */

	assert(IP_VALID(port));

	struct work_interval *converted_work_interval;

	converted_work_interval = work_interval_port_convert_locked(port);

	/* the port is valid, but doesn't denote a work_interval */
	if (converted_work_interval == NULL) {
		kr = KERN_INVALID_CAPABILITY;
	}

	ip_mq_unlock(port);

	if (kr == KERN_SUCCESS) {
		*work_interval = converted_work_interval;
	}

	return kr;
}


/*
 * work_interval_port_no_senders
 *
 * Description: Handle a no-senders notification for a work interval port.
 *              Destroys the port and releases its reference on the work interval.
 *
 * Parameters:  msg     A Mach no-senders notification message.
 *
 * Note: This assumes that there is only one create-right-from-work-interval point;
 *       if the ability to extract another send right after creation is added,
 *       this will have to change to handle make-send counts correctly.
 */
static void
work_interval_port_no_senders(ipc_port_t port, mach_port_mscount_t mscount)
{
	struct work_interval *work_interval = NULL;

	work_interval = ipc_kobject_dealloc_port(port, mscount,
	    IKOT_WORK_INTERVAL);

	work_interval->wi_port = MACH_PORT_NULL;

	work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
}

/*
 * work_interval_port_type()
 *
 * Converts a port name into the work interval object and returns its type.
 *
 * For invalid ports, it returns WORK_INTERVAL_TYPE_LAST (which is not a
 * valid type for work intervals).
 */
static uint32_t
work_interval_port_type(mach_port_name_t port_name)
{
	struct work_interval *work_interval = NULL;
	kern_return_t kr;
	uint32_t work_interval_type;

	if (port_name == MACH_PORT_NULL) {
		return WORK_INTERVAL_TYPE_LAST;
	}

	kr = port_name_to_work_interval(port_name, &work_interval);
	if (kr != KERN_SUCCESS) {
		return WORK_INTERVAL_TYPE_LAST;
	}
	/* work_interval has a +1 ref */

	assert(work_interval != NULL);
	work_interval_type = work_interval->wi_create_flags & WORK_INTERVAL_TYPE_MASK;
	work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
	return work_interval_type;
}

/*
 * Sparse - not all work interval classes imply a scheduling policy change.
 * The REALTIME_CRITICAL class *also* requires the thread to have explicitly
 * adopted the REALTIME sched mode to take effect.
 */
static const struct {
	int          priority;
	sched_mode_t sched_mode;
} work_interval_class_data[WI_CLASS_COUNT] = {
	[WI_CLASS_BEST_EFFORT] = {
		BASEPRI_DEFAULT,        // 31
		TH_MODE_TIMESHARE,
	},

	[WI_CLASS_APP_SUPPORT] = {
		BASEPRI_DEFAULT,        // 31
		TH_MODE_TIMESHARE,
	},

	[WI_CLASS_SYSTEM] = {
		BASEPRI_FOREGROUND + 1, // 48
		TH_MODE_FIXED,
	},

	[WI_CLASS_SYSTEM_CRITICAL] = {
		MAXPRI_USER + 1,        // 64
		TH_MODE_FIXED,
	},

	[WI_CLASS_REALTIME_CRITICAL] = {
		BASEPRI_RTQUEUES + 1,   // 98
		TH_MODE_REALTIME,
	},
};

/*
 * Called when a thread gets its scheduling priority from its associated work
 * interval.
 */
int
work_interval_get_priority(thread_t thread)
{
	const struct work_interval *work_interval = thread->th_work_interval;
	assert(work_interval != NULL);

	assert3u(work_interval->wi_class, !=, WI_CLASS_NONE);
	assert3u(work_interval->wi_class, <, WI_CLASS_COUNT);
	int priority = work_interval_class_data[work_interval->wi_class].priority;
	assert(priority != 0);

	priority += work_interval->wi_class_offset;
	assert3u(priority, <=, MAXPRI);

	return priority;
}
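
/*
 * Example (comment only): a work interval classed WI_CLASS_SYSTEM with a
 * wi_class_offset of 2 yields BASEPRI_FOREGROUND + 1 + 2 == 50, while a
 * WI_CLASS_REALTIME_CRITICAL interval with offset 0 yields
 * BASEPRI_RTQUEUES + 1 == 98 (and, per the note above, only takes effect
 * once the thread has explicitly adopted the REALTIME sched mode).
 */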

#if CONFIG_THREAD_GROUPS
extern kern_return_t
kern_work_interval_get_policy_from_port(mach_port_name_t port_name,
    integer_t *policy,
    integer_t *priority,
    struct thread_group **tg)
{
	assert((priority != NULL) && (policy != NULL) && (tg != NULL));

	kern_return_t kr;
	struct work_interval *work_interval;

	kr = port_name_to_work_interval(port_name, &work_interval);
	if (kr != KERN_SUCCESS) {
		return kr;
	}

	/* work_interval has a +1 ref */
	assert(work_interval != NULL);
	assert3u(work_interval->wi_class, <, WI_CLASS_COUNT);

	const sched_mode_t mode = work_interval_class_data[work_interval->wi_class].sched_mode;

	if ((mode == TH_MODE_TIMESHARE) || (mode == TH_MODE_FIXED)) {
		*policy = ((mode == TH_MODE_TIMESHARE)? POLICY_TIMESHARE: POLICY_RR);
		*priority = work_interval_class_data[work_interval->wi_class].priority;
		assert(*priority != 0);
		*priority += work_interval->wi_class_offset;
		assert3u(*priority, <=, MAXPRI);
	} /* No sched mode change for REALTIME (threads must explicitly opt-in) */

	if (work_interval->wi_group) {
		*tg = thread_group_retain(work_interval->wi_group);
	}

	work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
	return KERN_SUCCESS;
}
#endif /* CONFIG_THREAD_GROUPS */

/*
 * Switch to a policy driven by the work interval (if applicable).
 */
static void
work_interval_set_policy(thread_t thread)
{
	assert3p(thread, ==, current_thread());

	/*
	 * Ignore policy changes if the workload context shouldn't affect the
	 * scheduling policy.
	 */
	workload_config_flags_t flags = WLC_F_NONE;

	/* There may be no config at all. That's ok. */
	if (workload_config_get_flags(&flags) != KERN_SUCCESS ||
	    (flags & WLC_F_THREAD_POLICY) == 0) {
		return;
	}

	const struct work_interval *work_interval = thread->th_work_interval;
	assert(work_interval != NULL);

	assert3u(work_interval->wi_class, <, WI_CLASS_COUNT);
	const sched_mode_t mode = work_interval_class_data[work_interval->wi_class].sched_mode;

	/*
	 * A mode of TH_MODE_NONE implies that this work interval has no
	 * associated scheduler effects.
	 */
	if (mode == TH_MODE_NONE) {
		return;
	}

	proc_set_thread_policy_ext(thread, TASK_POLICY_ATTRIBUTE,
	    TASK_POLICY_WI_DRIVEN, true, mode);
	assert(thread->requested_policy.thrp_wi_driven);

	return;
}

/*
 * Clear a work interval driven policy.
 */
static void
work_interval_clear_policy(thread_t thread)
{
	assert3p(thread, ==, current_thread());

	if (!thread->requested_policy.thrp_wi_driven) {
		return;
	}

	const sched_mode_t mode = sched_get_thread_mode_user(thread);

	proc_set_thread_policy_ext(thread, TASK_POLICY_ATTRIBUTE,
	    TASK_POLICY_WI_DRIVEN, false,
	    mode == TH_MODE_REALTIME ? mode : TH_MODE_TIMESHARE);

	assert(!thread->requested_policy.thrp_wi_driven);

	return;
}

/*
 * thread_set_work_interval()
 *
 * Change thread's bound work interval to the passed-in work interval
 * Consumes +1 ref on work_interval upon success.
 *
 * May also pass NULL to un-set work_interval on the thread
 * Will deallocate any old work interval on the thread
 * Return error if thread does not satisfy requirements to join work interval
 *
 * For non auto-join work intervals, deallocate any old work interval on the thread
 * For auto-join work intervals, the routine may wakeup the work interval deferred
 * deallocation queue since thread locks might be currently held.
 */
static kern_return_t
thread_set_work_interval(thread_t thread,
    struct work_interval *work_interval, thread_work_interval_options_t options)
{
	/* All explicit work interval operations should always be from the current thread */
	if (options & THREAD_WI_EXPLICIT_JOIN_POLICY) {
		assert(thread == current_thread());
	}

	/* All cases of needing the thread lock should be from explicit join scenarios */
	if (options & THREAD_WI_THREAD_LOCK_NEEDED) {
		assert((options & THREAD_WI_EXPLICIT_JOIN_POLICY) != 0);
	}

	/* All auto-join cases must come in with the thread lock held */
	if (options & THREAD_WI_AUTO_JOIN_POLICY) {
		assert((options & THREAD_WI_THREAD_LOCK_HELD) != 0);
	}

#if CONFIG_THREAD_GROUPS
	if (work_interval && !work_interval->wi_group) {
		/* Reject join on work intervals with deferred thread group creation */
		return KERN_INVALID_ARGUMENT;
	}
#endif /* CONFIG_THREAD_GROUPS */

	if (work_interval) {
		uint32_t work_interval_type = work_interval->wi_create_flags & WORK_INTERVAL_TYPE_MASK;

		if (options & THREAD_WI_EXPLICIT_JOIN_POLICY) {
			/* Ensure no kern_work_interval_set_workload_id can happen after this point */
			uint32_t wlid_flags;
			(void)os_atomic_cmpxchgv(&work_interval->wi_wlid_flags, 0,
			    WORK_INTERVAL_WORKLOAD_ID_ALREADY_JOINED, &wlid_flags, relaxed);
			if (wlid_flags & WORK_INTERVAL_WORKLOAD_ID_RT_ALLOWED) {
				/* For workload IDs with rt-allowed, neuter the check below to
				 * enable joining before the thread has become realtime for all
				 * work interval types */
				work_interval_type = WORK_INTERVAL_TYPE_DEFAULT;
			}
		}

		if ((work_interval_type == WORK_INTERVAL_TYPE_COREAUDIO) &&
		    (thread->sched_mode != TH_MODE_REALTIME) && (thread->saved_mode != TH_MODE_REALTIME)) {
			return KERN_INVALID_ARGUMENT;
		}
	}

	/*
	 * Ensure a work interval scheduling policy is not used if the thread is
	 * leaving the work interval.
	 */
	if (work_interval == NULL &&
	    (options & THREAD_WI_EXPLICIT_JOIN_POLICY) != 0) {
		work_interval_clear_policy(thread);
	}

	struct work_interval *old_th_wi = thread->th_work_interval;
#if CONFIG_SCHED_AUTO_JOIN
	spl_t s;
	/* Take the thread lock if needed */
	if (options & THREAD_WI_THREAD_LOCK_NEEDED) {
		s = splsched();
		thread_lock(thread);
	}

	/*
	 * Work interval auto-join leak to non-RT threads.
	 *
	 * If the thread might be running on a remote core and it's not in the context switch path
	 * (where the thread is neither running nor on a runq), it's not possible to update the
	 * work interval & thread group remotely since it's not possible to update CLPC for a remote
	 * core. This situation might happen when a thread is transitioning from realtime to
	 * non-realtime due to backgrounding etc., which would mean that non-RT threads would now
	 * be part of the work interval.
	 *
	 * Since there is no immediate mitigation to this issue, the policy is to set a new
	 * flag on the thread which indicates that such a "leak" has happened. This flag will
	 * be cleared when the remote thread eventually blocks and unjoins from the work interval.
	 */
	bool thread_on_remote_core = ((thread != current_thread()) && (thread->state & TH_RUN) && (thread_get_runq(thread) == PROCESSOR_NULL));

	if (thread_on_remote_core && ((options & THREAD_WI_THREAD_CTX_SWITCH) == 0)) {
		assert((options & THREAD_WI_THREAD_LOCK_NEEDED) == 0);
		os_atomic_or(&thread->th_work_interval_flags, TH_WORK_INTERVAL_FLAGS_AUTO_JOIN_LEAK, relaxed);
		return KERN_SUCCESS;
	}

	const bool old_wi_auto_joined = ((thread->sched_flags & TH_SFLAG_THREAD_GROUP_AUTO_JOIN) != 0);

	if ((options & THREAD_WI_AUTO_JOIN_POLICY) || old_wi_auto_joined) {
		__kdebug_only uint64_t old_tg_id = (old_th_wi && old_th_wi->wi_group) ? thread_group_get_id(old_th_wi->wi_group) : ~0;
		__kdebug_only uint64_t new_tg_id = (work_interval && work_interval->wi_group) ? thread_group_get_id(work_interval->wi_group) : ~0;
		KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_WI_AUTO_JOIN),
		    thread_tid(thread), old_tg_id, new_tg_id, options);
	}

	if (old_wi_auto_joined) {
		/*
		 * If thread was auto-joined to a work interval and is not realtime, make sure it
		 * happened due to the "leak" described above.
		 */
		if (thread->sched_mode != TH_MODE_REALTIME) {
			assert((thread->th_work_interval_flags & TH_WORK_INTERVAL_FLAGS_AUTO_JOIN_LEAK) != 0);
		}

		os_atomic_andnot(&thread->th_work_interval_flags, TH_WORK_INTERVAL_FLAGS_AUTO_JOIN_LEAK, relaxed);
		work_interval_auto_join_decrement(old_th_wi, thread);
		thread->sched_flags &= ~TH_SFLAG_THREAD_GROUP_AUTO_JOIN;
	}

#endif /* CONFIG_SCHED_AUTO_JOIN */

	KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_WORKGROUP, WORKGROUP_INTERVAL_CHANGE),
	    thread_tid(thread), (old_th_wi ? old_th_wi->wi_id : 0), (work_interval ? work_interval->wi_id : 0), !!(options & THREAD_WI_AUTO_JOIN_POLICY));

	/* transfer +1 ref to thread */
	thread->th_work_interval = work_interval;

#if CONFIG_SCHED_AUTO_JOIN

	if ((options & THREAD_WI_AUTO_JOIN_POLICY) && work_interval) {
		assert(work_interval_auto_join_enabled(work_interval) == true);
		thread->sched_flags |= TH_SFLAG_THREAD_GROUP_AUTO_JOIN;
	}

	if (options & THREAD_WI_THREAD_LOCK_NEEDED) {
		thread_unlock(thread);
		splx(s);
	}
#endif /* CONFIG_SCHED_AUTO_JOIN */

	/*
	 * The thread got a new work interval. It may come with a work interval
	 * scheduling policy that needs to be applied.
	 */
	if (work_interval != NULL &&
	    (options & THREAD_WI_EXPLICIT_JOIN_POLICY) != 0) {
		work_interval_set_policy(thread);
	}

#if CONFIG_THREAD_GROUPS
	if (work_interval) {
		/* Prevent thread_group_set_name after CLPC may have already heard
		 * about the thread group */
		(void)os_atomic_cmpxchg(&work_interval->wi_group_flags, 0,
		    WORK_INTERVAL_GROUP_FLAGS_THREAD_JOINED, relaxed);
	}
	struct thread_group *new_tg = (work_interval) ? (work_interval->wi_group) : NULL;

	if (options & THREAD_WI_AUTO_JOIN_POLICY) {
#if CONFIG_SCHED_AUTO_JOIN
		thread_set_autojoin_thread_group_locked(thread, new_tg);
#endif
	} else {
		thread_set_work_interval_thread_group(thread, new_tg);
	}
#endif /* CONFIG_THREAD_GROUPS */

	if (options & THREAD_WI_EXPLICIT_JOIN_POLICY) {
		/* Construct mask to XOR with th_work_interval_flags to clear the
		 * currently present flags and set the new flags in wlid_flags. */
		uint32_t wlid_flags = 0;
		if (work_interval) {
			wlid_flags = os_atomic_load(&work_interval->wi_wlid_flags, relaxed);
		}
		thread_work_interval_flags_t th_wi_xor_mask = os_atomic_load(
			&thread->th_work_interval_flags, relaxed);
		th_wi_xor_mask &= (TH_WORK_INTERVAL_FLAGS_HAS_WORKLOAD_ID |
		    TH_WORK_INTERVAL_FLAGS_RT_ALLOWED);
		if (wlid_flags & WORK_INTERVAL_WORKLOAD_ID_HAS_ID) {
			th_wi_xor_mask ^= TH_WORK_INTERVAL_FLAGS_HAS_WORKLOAD_ID;
			if (wlid_flags & WORK_INTERVAL_WORKLOAD_ID_RT_ALLOWED) {
				th_wi_xor_mask ^= TH_WORK_INTERVAL_FLAGS_RT_ALLOWED;
			}
		}
		if (th_wi_xor_mask) {
			os_atomic_xor(&thread->th_work_interval_flags, th_wi_xor_mask, relaxed);
		}
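
		/*
		 * Worked example (flag names abbreviated): a thread leaving a
		 * workload-ID interval (HAS_WORKLOAD_ID set, RT_ALLOWED clear)
		 * for one whose wlid_flags carry both HAS_ID and RT_ALLOWED
		 * builds
		 *	th_wi_xor_mask = HAS_WORKLOAD_ID   // current flags
		 *	               ^ HAS_WORKLOAD_ID   // new ID present
		 *	               ^ RT_ALLOWED        // new ID allows RT
		 *	               = RT_ALLOWED
		 * so the XOR leaves HAS_WORKLOAD_ID set and newly sets
		 * RT_ALLOWED.
		 */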

		/*
		 * Now that the interval flags have been set, re-evaluate
		 * whether the thread needs to be undemoted - the new work
		 * interval may have the RT_ALLOWED flag, and the thread may
		 * have a realtime policy but be demoted.
		 */
		thread_rt_evaluate(thread);
	}

	if (old_th_wi != NULL) {
		work_interval_release(old_th_wi, options);
	}

	return KERN_SUCCESS;
}

static kern_return_t
thread_set_work_interval_explicit_join(thread_t thread, struct work_interval *work_interval)
{
	assert(thread == current_thread());
	return thread_set_work_interval(thread, work_interval, THREAD_WI_EXPLICIT_JOIN_POLICY | THREAD_WI_THREAD_LOCK_NEEDED);
}

kern_return_t
work_interval_thread_terminate(thread_t thread)
{
	assert(thread == current_thread());
	if (thread->th_work_interval != NULL) {
		return thread_set_work_interval(thread, NULL, THREAD_WI_EXPLICIT_JOIN_POLICY | THREAD_WI_THREAD_LOCK_NEEDED);
	}
	return KERN_SUCCESS;
}

kern_return_t
kern_work_interval_notify(thread_t thread, struct kern_work_interval_args* kwi_args)
{
	assert(thread == current_thread());
	assert(kwi_args->work_interval_id != 0);

	struct work_interval *work_interval = thread->th_work_interval;

	if (work_interval == NULL ||
	    work_interval->wi_id != kwi_args->work_interval_id) {
		/* This thread must have adopted the work interval to be able to notify */
		return KERN_INVALID_ARGUMENT;
	}

	task_t notifying_task = current_task();

	if (work_interval->wi_creator_uniqueid != get_task_uniqueid(notifying_task) ||
	    work_interval->wi_creator_pidversion != get_task_version(notifying_task)) {
		/* Only the creating task can do a notify */
		return KERN_INVALID_ARGUMENT;
	}

	spl_t s = splsched();

#if CONFIG_THREAD_GROUPS
	assert(work_interval->wi_group == thread->thread_group);
#endif /* CONFIG_THREAD_GROUPS */

	uint64_t urgency_param1, urgency_param2;
	kwi_args->urgency = (uint16_t)thread_get_urgency(thread, &urgency_param1, &urgency_param2);

	splx(s);

	/* called without interrupts disabled */
	machine_work_interval_notify(thread, kwi_args);

	return KERN_SUCCESS;
}

/* Start at 1, 0 is not a valid work interval ID */
static _Atomic uint64_t unique_work_interval_id = 1;

kern_return_t
kern_work_interval_create(thread_t thread,
    struct kern_work_interval_create_args *create_params)
{
	assert(thread == current_thread());

	uint32_t create_flags = create_params->wica_create_flags;

	if (((create_flags & WORK_INTERVAL_FLAG_JOINABLE) == 0) &&
	    thread->th_work_interval != NULL) {
		/*
		 * If the thread is doing a legacy combined create and join,
		 * it shouldn't already be part of a work interval.
		 *
		 * (Creating a joinable WI is allowed anytime.)
		 */
		return KERN_FAILURE;
	}

	/*
	 * Check the validity of the create flags before allocating the work
	 * interval.
	 */
	task_t creating_task = current_task();
	if ((create_flags & WORK_INTERVAL_TYPE_MASK) == WORK_INTERVAL_TYPE_CA_CLIENT) {
		/*
		 * CA_CLIENT work intervals do not create new thread groups.
		 * There can only be one CA_CLIENT work interval (created by UIKit or AppKit)
		 * per application task.
		 */
		if (create_flags & WORK_INTERVAL_FLAG_GROUP) {
			return KERN_FAILURE;
		}
		if (!task_is_app(creating_task)) {
#if XNU_TARGET_OS_OSX
			/*
			 * Soft-fail the case of a non-app pretending to be an
			 * app, by allowing it to press the buttons, but they're
			 * not actually connected to anything.
			 */
			create_flags |= WORK_INTERVAL_FLAG_IGNORED;
#else
			/*
			 * On iOS, it's a hard failure to get your apptype
			 * wrong and then try to render something.
			 */
			return KERN_NOT_SUPPORTED;
#endif /* XNU_TARGET_OS_OSX */
		}
		if (task_set_ca_client_wi(creating_task, true) == false) {
			return KERN_FAILURE;
		}
	}

#if CONFIG_SCHED_AUTO_JOIN
	if (create_flags & WORK_INTERVAL_FLAG_ENABLE_AUTO_JOIN) {
		uint32_t type = (create_flags & WORK_INTERVAL_TYPE_MASK);
		if (type != WORK_INTERVAL_TYPE_COREAUDIO) {
			return KERN_NOT_SUPPORTED;
		}
		if ((create_flags & WORK_INTERVAL_FLAG_GROUP) == 0) {
			return KERN_NOT_SUPPORTED;
		}
	}

	if (create_flags & WORK_INTERVAL_FLAG_ENABLE_DEFERRED_FINISH) {
		if ((create_flags & WORK_INTERVAL_FLAG_ENABLE_AUTO_JOIN) == 0) {
			return KERN_NOT_SUPPORTED;
		}
	}
#endif /* CONFIG_SCHED_AUTO_JOIN */

	struct work_interval *work_interval = kalloc_type(struct work_interval,
	    Z_WAITOK | Z_ZERO | Z_NOFAIL);

	uint64_t work_interval_id = os_atomic_inc(&unique_work_interval_id, relaxed);

	*work_interval = (struct work_interval) {
		.wi_id                  = work_interval_id,
		.wi_ref_count           = {},
		.wi_create_flags        = create_flags,
		.wi_creator_pid         = pid_from_task(creating_task),
		.wi_creator_uniqueid    = get_task_uniqueid(creating_task),
		.wi_creator_pidversion  = get_task_version(creating_task),
	};
	os_ref_init(&work_interval->wi_ref_count, NULL);

	if (work_interval_telemetry_data_enabled(work_interval)) {
		recount_work_interval_init(&work_interval->wi_recount);
	}

	__kdebug_only uint64_t tg_id = 0;
#if CONFIG_THREAD_GROUPS
	struct thread_group *tg;
	if ((create_flags &
	    (WORK_INTERVAL_FLAG_GROUP | WORK_INTERVAL_FLAG_HAS_WORKLOAD_ID)) ==
	    (WORK_INTERVAL_FLAG_GROUP | WORK_INTERVAL_FLAG_HAS_WORKLOAD_ID)) {
		/* defer creation of the thread group until the
		 * kern_work_interval_set_workload_id() call */
		work_interval->wi_group = NULL;
	} else if (create_flags & WORK_INTERVAL_FLAG_GROUP) {
		/* create a new group for the interval to represent */
		char name[THREAD_GROUP_MAXNAME] = "";

		snprintf(name, sizeof(name), "WI%lld (pid %d)", work_interval_id,
		    work_interval->wi_creator_pid);

		tg = thread_group_create_and_retain(THREAD_GROUP_FLAGS_DEFAULT);

		thread_group_set_name(tg, name);

		work_interval->wi_group = tg;
	} else {
		/* the interval represents the thread's home group */
		tg = thread_group_get_home_group(thread);

		thread_group_retain(tg);

		work_interval->wi_group = tg;
	}

	/* Capture the tg_id for tracing purposes */
	tg_id = work_interval->wi_group ? thread_group_get_id(work_interval->wi_group) : ~0;

#endif /* CONFIG_THREAD_GROUPS */

	if (create_flags & WORK_INTERVAL_FLAG_JOINABLE) {
		mach_port_name_t name = MACH_PORT_NULL;

		/* work_interval has a +1 ref, moves to the port */
		work_interval->wi_port = ipc_kobject_alloc_port(
			(ipc_kobject_t)work_interval, IKOT_WORK_INTERVAL,
			IPC_KOBJECT_ALLOC_MAKE_SEND | IPC_KOBJECT_ALLOC_NSREQUEST);


		name = ipc_port_copyout_send(work_interval->wi_port, current_space());

		if (!MACH_PORT_VALID(name)) {
			/*
			 * copyout failed (port is already deallocated)
			 * Because of the port-destroyed magic,
			 * the work interval is already deallocated too.
			 */
			return KERN_RESOURCE_SHORTAGE;
		}

		create_params->wica_port = name;
	} else {
		/* work_interval has a +1 ref, moves to the thread */
		kern_return_t kr = thread_set_work_interval_explicit_join(thread, work_interval);
		if (kr != KERN_SUCCESS) {
			/* No other thread can join this work interval since it isn't
			 * JOINABLE so release the reference on work interval */
			work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
			return kr;
		}

		create_params->wica_port = MACH_PORT_NULL;
	}

	create_params->wica_id = work_interval_id;

	if (tg_id != ~0) {
		KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_WORKGROUP, WORKGROUP_INTERVAL_CREATE),
		    work_interval_id, create_flags, pid_from_task(creating_task), tg_id);
	}
	return KERN_SUCCESS;
}
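
/*
 * Sketch of the create-then-join sequence as driven from the syscall layer
 * (illustrative only; the real caller marshals these arguments from
 * userspace, and the flag combination shown is just one valid choice):
 *
 *	struct kern_work_interval_create_args args = {
 *		.wica_create_flags = WORK_INTERVAL_TYPE_COREAUDIO |
 *		    WORK_INTERVAL_FLAG_JOINABLE | WORK_INTERVAL_FLAG_GROUP,
 *	};
 *	kern_return_t kr = kern_work_interval_create(current_thread(), &args);
 *	// On success, args.wica_port names a send right that any thread in
 *	// the task may later pass to kern_work_interval_join().
 */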

kern_return_t
kern_work_interval_get_flags_from_port(mach_port_name_t port_name, uint32_t *flags)
{
	assert(flags != NULL);

	kern_return_t kr;
	struct work_interval *work_interval;

	kr = port_name_to_work_interval(port_name, &work_interval);
	if (kr != KERN_SUCCESS) {
		return kr;
	}

	assert(work_interval != NULL);
	*flags = work_interval->wi_create_flags;

	work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);

	return KERN_SUCCESS;
}

#if CONFIG_THREAD_GROUPS
_Static_assert(WORK_INTERVAL_NAME_MAX == THREAD_GROUP_MAXNAME,
    "WORK_INTERVAL_NAME_MAX does not match THREAD_GROUP_MAXNAME");
#endif /* CONFIG_THREAD_GROUPS */

kern_return_t
kern_work_interval_set_name(mach_port_name_t port_name, __unused char *name,
    size_t len)
{
	kern_return_t kr;
	struct work_interval *work_interval;

	if (len > WORK_INTERVAL_NAME_MAX) {
		return KERN_INVALID_ARGUMENT;
	}
	kr = port_name_to_work_interval(port_name, &work_interval);
	if (kr != KERN_SUCCESS) {
		return kr;
	}

	assert(work_interval != NULL);

#if CONFIG_THREAD_GROUPS
	uint32_t wi_group_flags = os_atomic_load(
		&work_interval->wi_group_flags, relaxed);
	if (wi_group_flags & WORK_INTERVAL_GROUP_FLAGS_THREAD_JOINED) {
		kr = KERN_INVALID_ARGUMENT;
		goto out;
	}
	if (!work_interval->wi_group) {
		kr = KERN_INVALID_ARGUMENT;
		goto out;
	}

	if (name[0] && (work_interval->wi_create_flags & WORK_INTERVAL_FLAG_GROUP)) {
		char tgname[THREAD_GROUP_MAXNAME];
		snprintf(tgname, sizeof(tgname), "WI%lld %s", work_interval->wi_id,
		    name);
		thread_group_set_name(work_interval->wi_group, tgname);
	}

out:
#endif /* CONFIG_THREAD_GROUPS */
	work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);

	return kr;
}

kern_return_t
kern_work_interval_set_workload_id(mach_port_name_t port_name,
    struct kern_work_interval_workload_id_args *workload_id_args)
{
	kern_return_t kr;
	struct work_interval *work_interval;
	uint32_t wlida_flags = 0;
	uint32_t wlid_flags = 0;
#if CONFIG_THREAD_GROUPS
	uint32_t tg_flags = 0;
#endif
	bool from_workload_config = false;

	/* Ensure workload ID name is non-empty. */
	if (!workload_id_args->wlida_name[0]) {
		return KERN_INVALID_ARGUMENT;
	}

	kr = port_name_to_work_interval(port_name, &work_interval);
	if (kr != KERN_SUCCESS) {
		return kr;
	}

	assert(work_interval != NULL);
	if (!(work_interval->wi_create_flags & WORK_INTERVAL_FLAG_JOINABLE)) {
		kr = KERN_INVALID_ARGUMENT;
		goto out;
	}

	if (!(work_interval->wi_create_flags & WORK_INTERVAL_FLAG_HAS_WORKLOAD_ID)) {
		/* Reject work intervals that didn't indicate they will have a workload ID
		 * at creation. In particular if the work interval has its own thread group,
		 * its creation must have been deferred in kern_work_interval_create */
		kr = KERN_INVALID_ARGUMENT;
		goto out;
	}

	workload_config_t wl_config = {};
	kr = workload_config_lookup_default(workload_id_args->wlida_name, &wl_config);
	if (kr == KERN_SUCCESS) {
		if ((wl_config.wc_create_flags & WORK_INTERVAL_TYPE_MASK) !=
		    (work_interval->wi_create_flags & WORK_INTERVAL_TYPE_MASK)) {
			if ((wl_config.wc_create_flags & WORK_INTERVAL_TYPE_MASK) == WORK_INTERVAL_TYPE_CA_RENDER_SERVER &&
			    (work_interval->wi_create_flags & WORK_INTERVAL_TYPE_MASK) == WORK_INTERVAL_TYPE_FRAME_COMPOSITOR) {
				/* WORK_INTERVAL_TYPE_FRAME_COMPOSITOR is a valid related type of WORK_INTERVAL_TYPE_CA_RENDER_SERVER */
			} else {
				kr = KERN_INVALID_ARGUMENT;
				goto out;
			}
		}

		wlida_flags = wl_config.wc_flags;

		wlida_flags &= ~WORK_INTERVAL_WORKLOAD_ID_RT_CRITICAL;

#if CONFIG_THREAD_GROUPS
		tg_flags = wl_config.wc_thread_group_flags;
		if (tg_flags != THREAD_GROUP_FLAGS_ABSENT &&
		    (work_interval->wi_create_flags & WORK_INTERVAL_FLAG_GROUP) == 0) {
			kr = KERN_INVALID_ARGUMENT;
			goto out;
		}
#endif /* CONFIG_THREAD_GROUPS */

		from_workload_config = true;
	} else {
		/* If the workload is not present in the table, perform basic validation
		 * that the create flags passed in match the ones used at work interval
		 * create time */
		if ((workload_id_args->wlida_wicreate_flags & WORK_INTERVAL_TYPE_MASK) !=
		    (work_interval->wi_create_flags & WORK_INTERVAL_TYPE_MASK)) {
			kr = KERN_INVALID_ARGUMENT;
			goto out;
		}

		const bool wc_avail = workload_config_available();
		if (!wc_avail) {
			wlida_flags = WORK_INTERVAL_WORKLOAD_ID_RT_ALLOWED;
		}

		/*
		 * If the workload config wasn't even loaded then fallback to
		 * older behaviour where the new thread group gets the default
		 * thread group flags (when WORK_INTERVAL_FLAG_GROUP is set).
		 */
#if CONFIG_THREAD_GROUPS
		if (!wc_avail) {
			tg_flags = THREAD_GROUP_FLAGS_DEFAULT;
		} else {
			struct thread_group *home_group =
			    thread_group_get_home_group(current_thread());
			if (home_group != NULL) {
				tg_flags = thread_group_get_flags(home_group);
			}
		}
#endif /* CONFIG_THREAD_GROUPS */
	}

	workload_id_args->wlida_wicreate_flags = work_interval->wi_create_flags;

	/* cmpxchg a non-zero workload ID flags value (indicating that workload ID
	 * has been set). */
	wlida_flags |= WORK_INTERVAL_WORKLOAD_ID_HAS_ID;
	if (os_atomic_cmpxchgv(&work_interval->wi_wlid_flags, 0, wlida_flags,
	    &wlid_flags, relaxed)) {
		if (from_workload_config) {
			work_interval->wi_class = wl_config.wc_class;
			work_interval->wi_class_offset = wl_config.wc_class_offset;
		}
#if CONFIG_THREAD_GROUPS
		if (work_interval->wi_create_flags & WORK_INTERVAL_FLAG_GROUP) {
			/* Perform deferred thread group creation, now that tg_flags are known */
			struct thread_group *tg;
			tg = thread_group_create_and_retain(tg_flags == THREAD_GROUP_FLAGS_ABSENT ?
			    THREAD_GROUP_FLAGS_DEFAULT : tg_flags);

			char tgname[THREAD_GROUP_MAXNAME] = "";
			snprintf(tgname, sizeof(tgname), "WI%lld %s", work_interval->wi_id,
			    workload_id_args->wlida_name);
			thread_group_set_name(tg, tgname);

			assert(work_interval->wi_group == NULL);
			work_interval->wi_group = tg;
			KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_WORKGROUP, WORKGROUP_INTERVAL_CREATE),
			    work_interval->wi_id, work_interval->wi_create_flags,
			    work_interval->wi_creator_pid, thread_group_get_id(tg));
		}
#endif /* CONFIG_THREAD_GROUPS */
	} else {
		/* Workload ID has previously been set (or a thread has already joined). */
		if (wlid_flags & WORK_INTERVAL_WORKLOAD_ID_ALREADY_JOINED) {
			kr = KERN_INVALID_ARGUMENT;
			goto out;
		}
		/* Treat this request as a query for the out parameters of the ID */
		workload_id_args->wlida_flags = wlid_flags;
	}

	/*
	 * Emit tracepoints for successfully setting the workload ID.
	 *
	 * After rdar://89342390 has been fixed and a new work interval ktrace
	 * provider has been added, it will be possible to associate a numeric
	 * ID with an ID name. Thus, for those cases where the ID name has been
	 * looked up successfully (`from_workload_config` is true) it will no
	 * longer be necessary to emit a tracepoint with the full ID name.
	 */
	KDBG(MACHDBG_CODE(DBG_MACH_WORKGROUP, WORKGROUP_INTERVAL_SET_WORKLOAD_ID),
	    work_interval->wi_id, from_workload_config);
	kernel_debug_string_simple(
		MACHDBG_CODE(DBG_MACH_WORKGROUP, WORKGROUP_INTERVAL_SET_WORKLOAD_ID_NAME),
		workload_id_args->wlida_name);

	kr = KERN_SUCCESS;

out:
	work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);

	return kr;
}


kern_return_t
kern_work_interval_destroy(thread_t thread, uint64_t work_interval_id)
{
	if (work_interval_id == 0) {
		return KERN_INVALID_ARGUMENT;
	}

	if (thread->th_work_interval == NULL ||
	    thread->th_work_interval->wi_id != work_interval_id) {
		/* work ID isn't valid or doesn't match joined work interval ID */
		return KERN_INVALID_ARGUMENT;
	}

	return thread_set_work_interval_explicit_join(thread, NULL);
}

kern_return_t
kern_work_interval_join(thread_t            thread,
    mach_port_name_t    port_name)
{
	struct work_interval *work_interval = NULL;
	kern_return_t kr;

	if (port_name == MACH_PORT_NULL) {
		/* 'Un-join' the current work interval */
		return thread_set_work_interval_explicit_join(thread, NULL);
	}

	kr = port_name_to_work_interval(port_name, &work_interval);
	if (kr != KERN_SUCCESS) {
		return kr;
	}
	/* work_interval has a +1 ref */

	assert(work_interval != NULL);

	kr = thread_set_work_interval_explicit_join(thread, work_interval);
	/* ref was consumed by passing it to the thread in the successful case */
	if (kr != KERN_SUCCESS) {
		work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
	}
	return kr;
}

/*
 * work_interval_port_type_render_server()
 *
 * Helper routine to determine if the port points to a
 * WORK_INTERVAL_TYPE_CA_RENDER_SERVER work interval.
 */
bool
work_interval_port_type_render_server(mach_port_name_t port_name)
{
	return work_interval_port_type(port_name) == WORK_INTERVAL_TYPE_CA_RENDER_SERVER;
}
1539