xref: /xnu-10002.81.5/osfmk/kern/work_interval.c (revision 5e3eaea39dcf651e66cb99ba7d70e32cc4a99587)
1 /*
2  * Copyright (c) 2017 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 
30 #include <sys/work_interval.h>
31 
32 #include <kern/work_interval.h>
33 
34 #include <kern/thread.h>
35 #include <kern/sched_prim.h>
36 #include <kern/machine.h>
37 #include <kern/thread_group.h>
38 #include <kern/ipc_kobject.h>
39 #include <kern/task.h>
40 #include <kern/coalition.h>
41 #include <kern/policy_internal.h>
42 #include <kern/mpsc_queue.h>
43 #include <kern/workload_config.h>
44 #include <kern/assert.h>
45 
46 #include <mach/kern_return.h>
47 #include <mach/notify.h>
48 #include <os/refcnt.h>
49 
50 #include <stdatomic.h>
51 
52 /*
53  * With the introduction of auto-join work intervals, it is possible
54  * to change the work interval (and related thread group) of a thread in a
55  * variety of contexts (thread termination, context switch, thread mode
56  * change etc.). In order to clearly specify the policy expectation and
57  * the locking behavior, all calls to thread_set_work_interval() pass
58  * in a set of flags.
59  */
60 
__options_decl(thread_work_interval_options_t, uint32_t, {
	/* Change the work interval using the explicit join rules */
	THREAD_WI_EXPLICIT_JOIN_POLICY = 0x1,
	/* Change the work interval using the auto-join rules */
	THREAD_WI_AUTO_JOIN_POLICY     = 0x2,
	/* Caller already holds the thread lock */
	THREAD_WI_THREAD_LOCK_HELD     = 0x4,
	/* Caller does not hold the thread lock */
	THREAD_WI_THREAD_LOCK_NEEDED   = 0x8,
	/* Change the work interval from the context switch path (thread may not be running or on a runq) */
	THREAD_WI_THREAD_CTX_SWITCH    = 0x10,
});

/* Central routine for all work interval join/unjoin operations on a thread */
static kern_return_t thread_set_work_interval(thread_t, struct work_interval *, thread_work_interval_options_t);
/* No-senders notification handler for work interval kobject ports */
static void work_interval_port_no_senders(ipc_port_t, mach_port_mscount_t);

/* Register the work interval kobject type with the IPC kobject subsystem */
IPC_KOBJECT_DEFINE(IKOT_WORK_INTERVAL,
    .iko_op_stable     = true,
    .iko_op_no_senders = work_interval_port_no_senders);
80 
#if CONFIG_SCHED_AUTO_JOIN
/* MPSC queue used to defer deallocate work intervals */
static struct mpsc_daemon_queue work_interval_deallocate_queue;

static void work_interval_deferred_release(struct work_interval *);

/*
 * Work Interval Auto-Join Status
 *
 * work_interval_auto_join_status_t represents the state of auto-join for a given work interval.
 * It packs the following information:
 * - A bit representing if a "finish" is deferred on the work interval
 * - Count of number of threads auto-joined to the work interval
 */
/* Top bit: a "finish" operation is deferred on this work interval */
#define WORK_INTERVAL_STATUS_DEFERRED_FINISH_MASK    ((uint32_t)(1 << 31))
/* Low 31 bits: count of threads currently auto-joined to this work interval */
#define WORK_INTERVAL_STATUS_AUTO_JOIN_COUNT_MASK    ((uint32_t)(WORK_INTERVAL_STATUS_DEFERRED_FINISH_MASK - 1))
#define WORK_INTERVAL_STATUS_AUTO_JOIN_COUNT_MAX     WORK_INTERVAL_STATUS_AUTO_JOIN_COUNT_MASK
typedef uint32_t work_interval_auto_join_status_t;
99 
100 static inline bool __unused
work_interval_status_deferred_finish(work_interval_auto_join_status_t status)101 work_interval_status_deferred_finish(work_interval_auto_join_status_t status)
102 {
103 	return (status & WORK_INTERVAL_STATUS_DEFERRED_FINISH_MASK) ? true : false;
104 }
105 
106 static inline uint32_t __unused
work_interval_status_auto_join_count(work_interval_auto_join_status_t status)107 work_interval_status_auto_join_count(work_interval_auto_join_status_t status)
108 {
109 	return (uint32_t)(status & WORK_INTERVAL_STATUS_AUTO_JOIN_COUNT_MASK);
110 }
111 
/*
 * struct work_interval_deferred_finish_state
 *
 * Contains the parameters of the finish operation which is being deferred.
 */
struct work_interval_deferred_finish_state {
	uint64_t instance_id;
	uint64_t start;
	uint64_t deadline;
	uint64_t complexity;
};

/* Auto-join bookkeeping embedded in each work interval */
struct work_interval_auto_join_info {
	/* Snapshot of finish parameters, replayed when the last auto-joined thread leaves */
	struct work_interval_deferred_finish_state deferred_finish_state;
	/* Packed deferred-finish bit + auto-join thread count (see masks above) */
	work_interval_auto_join_status_t _Atomic status;
};
#endif /* CONFIG_SCHED_AUTO_JOIN */
129 
#if CONFIG_THREAD_GROUPS
/* Flags atomically set in wi_group_flags */
#define WORK_INTERVAL_GROUP_FLAGS_THREAD_JOINED 0x1
#endif
134 
/*
 * Work Interval struct
 *
 * This struct represents a thread group and/or work interval context
 * in a mechanism that is represented with a kobject.
 *
 * Every thread that has joined a WI has a +1 ref, and the port
 * has a +1 ref as well.
 *
 * TODO: groups need to have a 'is for WI' flag
 *      and they need a flag to create that says 'for WI'
 *      This would allow CLPC to avoid allocating WI support
 *      data unless it is needed
 *
 * TODO: Enforce not having more than one non-group joinable work
 *      interval per thread group.
 *      CLPC only wants to see one WI-notify callout per group.
 */
struct work_interval {
	/* Work interval identifier (traced on destroy) */
	uint64_t wi_id;
	/* Refcount: one per joined thread, plus one held by the port */
	struct os_refcnt wi_ref_count;
	/* WORK_INTERVAL_FLAG_* / WORK_INTERVAL_TYPE_* flags supplied at creation */
	uint32_t wi_create_flags;

	/* for debugging purposes only, does not hold a ref on port */
	ipc_port_t wi_port;

	/*
	 * holds uniqueid and version of creating process,
	 * used to permission-gate notify
	 * TODO: you'd think there would be a better way to do this
	 */
	uint64_t wi_creator_uniqueid;
	uint32_t wi_creator_pid;
	int wi_creator_pidversion;

	/* flags set by work_interval_set_workload_id and reflected onto
	 *  thread->th_work_interval_flags upon join */
	uint32_t wi_wlid_flags;

#if CONFIG_THREAD_GROUPS
	/* WORK_INTERVAL_GROUP_FLAGS_* bits, set atomically */
	uint32_t wi_group_flags;
	struct thread_group *wi_group;  /* holds +1 ref on group */
#endif /* CONFIG_THREAD_GROUPS */

#if CONFIG_SCHED_AUTO_JOIN
	/* Information related to auto-join and deferred finish for work interval */
	struct work_interval_auto_join_info wi_auto_join_info;

	/*
	 * Since the deallocation of auto-join work intervals
	 * can happen in the scheduler when the last thread in
	 * the WI blocks and the thread lock is held, the deallocation
	 * might have to be done on a separate thread.
	 */
	struct mpsc_queue_chain   wi_deallocate_link;
#endif /* CONFIG_SCHED_AUTO_JOIN */

	/*
	 * Work interval class info - determines thread priority for threads
	 * with a work interval driven policy.
	 */
	wi_class_t wi_class;
	uint8_t wi_class_offset;

	/* Telemetry accounting state; deinitialized only when telemetry is enabled */
	struct recount_work_interval wi_recount;
};
201 
202 /*
203  * work_interval_telemetry_data_enabled()
204  *
205  * Helper routine to check if work interval has the collection of telemetry data enabled.
206  */
207 static inline bool
work_interval_telemetry_data_enabled(struct work_interval * work_interval)208 work_interval_telemetry_data_enabled(struct work_interval *work_interval)
209 {
210 	return (work_interval->wi_create_flags & WORK_INTERVAL_FLAG_ENABLE_TELEMETRY_DATA) != 0;
211 }
212 
213 /*
214  * work_interval_should_collect_telemetry_from_thread()
215  *
216  * Helper routine to determine whether any work interval telemetry should be collected
217  * for a thread.
218  */
219 static inline bool
work_interval_should_collect_telemetry_from_thread(thread_t thread)220 work_interval_should_collect_telemetry_from_thread(thread_t thread)
221 {
222 	if (thread->th_work_interval == NULL) {
223 		return false;
224 	}
225 	return work_interval_telemetry_data_enabled(thread->th_work_interval);
226 }
227 
228 /*
229  * work_interval_get_recount_tracks()
230  *
231  * Returns the recount tracks associated with a work interval, or NULL
232  * if the work interval is NULL or has telemetry disabled.
233  */
234 inline struct recount_track *
work_interval_get_recount_tracks(struct work_interval * work_interval)235 work_interval_get_recount_tracks(struct work_interval *work_interval)
236 {
237 	if (work_interval != NULL && work_interval_telemetry_data_enabled(work_interval)) {
238 		return work_interval->wi_recount.rwi_current_instance;
239 	}
240 	return NULL;
241 }
242 
243 #if CONFIG_SCHED_AUTO_JOIN
244 
/*
 * work_interval_perform_deferred_finish()
 *
 * Perform a deferred finish for a work interval. The routine accepts the deferred_finish_state as an
 * argument rather than looking at the work_interval since the deferred finish can race with another
 * start-finish cycle. To address that, the caller ensures that it gets a consistent snapshot of the
 * deferred state before calling this routine. This allows the racing start-finish cycle to overwrite
 * the deferred state without issues.
 */
static inline void
work_interval_perform_deferred_finish(__unused struct work_interval_deferred_finish_state *deferred_finish_state,
    __unused struct work_interval *work_interval, __unused thread_t thread)
{
	/* Trace the deferred finish against the thread and its work interval's thread group */
	KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_WI_DEFERRED_FINISH),
	    thread_tid(thread), thread_group_get_id(work_interval->wi_group));
}
262 
263 /*
264  * work_interval_auto_join_increment()
265  *
266  * Routine to increment auto-join counter when a new thread is auto-joined to
267  * the work interval.
268  */
269 static void
work_interval_auto_join_increment(struct work_interval * work_interval)270 work_interval_auto_join_increment(struct work_interval *work_interval)
271 {
272 	struct work_interval_auto_join_info *join_info = &work_interval->wi_auto_join_info;
273 	__assert_only work_interval_auto_join_status_t old_status = os_atomic_add_orig(&join_info->status, 1, relaxed);
274 	assert(work_interval_status_auto_join_count(old_status) < WORK_INTERVAL_STATUS_AUTO_JOIN_COUNT_MAX);
275 }
276 
/*
 * work_interval_auto_join_decrement()
 *
 * Routine to decrement the auto-join counter when a thread unjoins the work interval (due to
 * blocking or termination). If this was the last auto-joined thread in the work interval and
 * there was a deferred finish, performs the finish operation for the work interval.
 *
 * The deferred finish parameters are snapshotted inside the RMW loop so a
 * racing start-finish cycle can safely overwrite join_info->deferred_finish_state.
 */
static void
work_interval_auto_join_decrement(struct work_interval *work_interval, thread_t thread)
{
	struct work_interval_auto_join_info *join_info = &work_interval->wi_auto_join_info;
	work_interval_auto_join_status_t old_status, new_status;
	struct work_interval_deferred_finish_state deferred_finish_state;
	bool perform_finish;

	/* Update the auto-join count for the work interval atomically */
	os_atomic_rmw_loop(&join_info->status, old_status, new_status, acquire, {
		perform_finish = false;
		new_status = old_status;
		assert(work_interval_status_auto_join_count(old_status) > 0);
		/* Drop one auto-joined thread from the packed count */
		new_status -= 1;
		if (new_status == WORK_INTERVAL_STATUS_DEFERRED_FINISH_MASK) {
		        /* No auto-joined threads remaining and finish is deferred */
		        new_status = 0;
		        perform_finish = true;
		        /*
		         * Its important to copy the deferred finish state here so that this works
		         * when racing with another start-finish cycle.
		         */
		        deferred_finish_state = join_info->deferred_finish_state;
		}
	});

	if (perform_finish == true) {
		/*
		 * Since work_interval_perform_deferred_finish() calls down to
		 * the machine layer callout for finish which gets the thread
		 * group from the thread passed in here, it is important to
		 * make sure that the thread still has the work interval thread
		 * group here.
		 */
		assert(thread->thread_group == work_interval->wi_group);
		work_interval_perform_deferred_finish(&deferred_finish_state, work_interval, thread);
	}
}
322 
323 /*
324  * work_interval_auto_join_enabled()
325  *
326  * Helper routine to check if work interval has auto-join enabled.
327  */
328 static inline bool
work_interval_auto_join_enabled(struct work_interval * work_interval)329 work_interval_auto_join_enabled(struct work_interval *work_interval)
330 {
331 	return (work_interval->wi_create_flags & WORK_INTERVAL_FLAG_ENABLE_AUTO_JOIN) != 0;
332 }
333 
334 /*
335  * work_interval_deferred_finish_enabled()
336  *
337  * Helper routine to check if work interval has deferred finish enabled.
338  */
339 static inline bool __unused
work_interval_deferred_finish_enabled(struct work_interval * work_interval)340 work_interval_deferred_finish_enabled(struct work_interval *work_interval)
341 {
342 	return (work_interval->wi_create_flags & WORK_INTERVAL_FLAG_ENABLE_DEFERRED_FINISH) != 0;
343 }
344 
345 #endif /* CONFIG_SCHED_AUTO_JOIN */
346 
/*
 * work_interval_retain()
 *
 * Takes a +1 ref on the work interval.
 */
static inline void
work_interval_retain(struct work_interval *work_interval)
{
	/*
	 * Even though wi_retain is called under a port lock, we have
	 * to use os_ref_retain instead of os_ref_retain_locked
	 * because wi_release is not synchronized. wi_release calls
	 * os_ref_release which is unsafe to pair with os_ref_retain_locked.
	 */
	os_ref_retain(&work_interval->wi_ref_count);
}
358 
359 static inline void
work_interval_deallocate(struct work_interval * work_interval)360 work_interval_deallocate(struct work_interval *work_interval)
361 {
362 	KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_WORKGROUP, WORKGROUP_INTERVAL_DESTROY),
363 	    work_interval->wi_id);
364 	if (work_interval_telemetry_data_enabled(work_interval)) {
365 		recount_work_interval_deinit(&work_interval->wi_recount);
366 	}
367 	kfree_type(struct work_interval, work_interval);
368 }
369 
/*
 * work_interval_release()
 *
 * Routine to release a ref count on the work interval. If the refcount goes down
 * to zero, the work interval needs to be de-allocated.
 *
 * For non auto-join work intervals, they are de-allocated in this context.
 *
 * For auto-join work intervals, the de-allocation cannot be done from this context
 * since that might need the kernel memory allocator lock. In that case, the
 * deallocation is done via a thread-call based mpsc queue.
 */
static void
work_interval_release(struct work_interval *work_interval, __unused thread_work_interval_options_t options)
{
	if (os_ref_release(&work_interval->wi_ref_count) == 0) {
#if CONFIG_SCHED_AUTO_JOIN
		/* Cannot free while the thread lock is held; defer to the mpsc daemon queue */
		if (options & THREAD_WI_THREAD_LOCK_HELD) {
			work_interval_deferred_release(work_interval);
		} else {
			work_interval_deallocate(work_interval);
		}
#else /* CONFIG_SCHED_AUTO_JOIN */
		work_interval_deallocate(work_interval);
#endif /* CONFIG_SCHED_AUTO_JOIN */
	}
}
397 
398 #if CONFIG_SCHED_AUTO_JOIN
399 
/*
 * work_interval_deferred_release()
 *
 * Routine to enqueue the work interval on the deallocation mpsc queue.
 * The actual free happens later in work_interval_deallocate_queue_invoke().
 */
static void
work_interval_deferred_release(struct work_interval *work_interval)
{
	mpsc_daemon_enqueue(&work_interval_deallocate_queue,
	    &work_interval->wi_deallocate_link, MPSC_QUEUE_NONE);
}
411 
412 /*
413  * work_interval_should_propagate()
414  *
415  * Main policy routine to decide if a thread should be auto-joined to
416  * another thread's work interval. The conditions are arranged such that
417  * the most common bailout condition are checked the earliest. This routine
418  * is called from the scheduler context; so it needs to be efficient and
419  * be careful when taking locks or performing wakeups.
420  */
421 inline bool
work_interval_should_propagate(thread_t cthread,thread_t thread)422 work_interval_should_propagate(thread_t cthread, thread_t thread)
423 {
424 	/* Only allow propagation if the current thread has a work interval and the woken up thread does not */
425 	if ((cthread->th_work_interval == NULL) || (thread->th_work_interval != NULL)) {
426 		return false;
427 	}
428 
429 	/* Only propagate work intervals which have auto-join enabled */
430 	if (work_interval_auto_join_enabled(cthread->th_work_interval) == false) {
431 		return false;
432 	}
433 
434 	/* Work interval propagation is enabled for realtime threads only */
435 	if ((cthread->sched_mode != TH_MODE_REALTIME) || (thread->sched_mode != TH_MODE_REALTIME)) {
436 		return false;
437 	}
438 
439 
440 	/* Work interval propagation only works for threads with the same home thread group */
441 	struct thread_group *thread_home_tg = thread_group_get_home_group(thread);
442 	if (thread_group_get_home_group(cthread) != thread_home_tg) {
443 		return false;
444 	}
445 
446 	/* If woken up thread has adopted vouchers and other thread groups, it does not get propagation */
447 	if (thread->thread_group != thread_home_tg) {
448 		return false;
449 	}
450 
451 	/* If either thread is inactive (in the termination path), do not propagate auto-join */
452 	if ((!cthread->active) || (!thread->active)) {
453 		return false;
454 	}
455 
456 	return true;
457 }
458 
459 /*
460  * work_interval_auto_join_propagate()
461  *
462  * Routine to auto-join a thread into another thread's work interval
463  *
464  * Should only be invoked if work_interval_should_propagate() returns
465  * true. Also expects "from" thread to be current thread and "to" thread
466  * to be locked.
467  */
468 void
work_interval_auto_join_propagate(thread_t from,thread_t to)469 work_interval_auto_join_propagate(thread_t from, thread_t to)
470 {
471 	assert(from == current_thread());
472 	work_interval_retain(from->th_work_interval);
473 	work_interval_auto_join_increment(from->th_work_interval);
474 	__assert_only kern_return_t kr = thread_set_work_interval(to, from->th_work_interval,
475 	    THREAD_WI_AUTO_JOIN_POLICY | THREAD_WI_THREAD_LOCK_HELD | THREAD_WI_THREAD_CTX_SWITCH);
476 	assert(kr == KERN_SUCCESS);
477 }
478 
479 /*
480  * work_interval_auto_join_unwind()
481  *
482  * Routine to un-join an auto-joined work interval for a thread that is blocking.
483  *
484  * Expects thread to be locked.
485  */
486 void
work_interval_auto_join_unwind(thread_t thread)487 work_interval_auto_join_unwind(thread_t thread)
488 {
489 	__assert_only kern_return_t kr = thread_set_work_interval(thread, NULL,
490 	    THREAD_WI_AUTO_JOIN_POLICY | THREAD_WI_THREAD_LOCK_HELD | THREAD_WI_THREAD_CTX_SWITCH);
491 	assert(kr == KERN_SUCCESS);
492 }
493 
494 /*
495  * work_interval_auto_join_demote()
496  *
497  * Routine to un-join an auto-joined work interval when a thread is changing from
498  * realtime to non-realtime scheduling mode. This could happen due to multiple
499  * reasons such as RT failsafe, thread backgrounding or thread termination. Also,
500  * the thread being demoted may not be the current thread.
501  *
502  * Expects thread to be locked.
503  */
504 void
work_interval_auto_join_demote(thread_t thread)505 work_interval_auto_join_demote(thread_t thread)
506 {
507 	__assert_only kern_return_t kr = thread_set_work_interval(thread, NULL,
508 	    THREAD_WI_AUTO_JOIN_POLICY | THREAD_WI_THREAD_LOCK_HELD);
509 	assert(kr == KERN_SUCCESS);
510 }
511 
512 static void
work_interval_deallocate_queue_invoke(mpsc_queue_chain_t e,__assert_only mpsc_daemon_queue_t dq)513 work_interval_deallocate_queue_invoke(mpsc_queue_chain_t e,
514     __assert_only mpsc_daemon_queue_t dq)
515 {
516 	struct work_interval *work_interval = NULL;
517 	work_interval = mpsc_queue_element(e, struct work_interval, wi_deallocate_link);
518 	assert(dq == &work_interval_deallocate_queue);
519 	assert(os_ref_get_count(&work_interval->wi_ref_count) == 0);
520 	work_interval_deallocate(work_interval);
521 }
522 
523 #endif /* CONFIG_SCHED_AUTO_JOIN */
524 
525 #if CONFIG_SCHED_AUTO_JOIN
__startup_func
static void
work_interval_subsystem_init(void)
{
	/*
	 * The work interval deallocation queue must be a thread call based queue
	 * because it is woken up from contexts where the thread lock is held. The
	 * only way to perform wakeups safely in those contexts is to wakeup a
	 * thread call which is guaranteed to be on a different waitq and would
	 * not hash onto the same global waitq which might be currently locked.
	 */
	mpsc_daemon_queue_init_with_thread_call(&work_interval_deallocate_queue,
	    work_interval_deallocate_queue_invoke, THREAD_CALL_PRIORITY_KERNEL,
	    MPSC_DAEMON_INIT_NONE);
}
/* Registered to run in the THREAD_CALL startup phase */
STARTUP(THREAD_CALL, STARTUP_RANK_MIDDLE, work_interval_subsystem_init);
542 #endif /* CONFIG_SCHED_AUTO_JOIN */
543 
544 /*
545  * work_interval_port_convert
546  *
547  * Called with port locked, returns reference to work interval
548  * if indeed the port is a work interval kobject port
549  */
550 static struct work_interval *
work_interval_port_convert_locked(ipc_port_t port)551 work_interval_port_convert_locked(ipc_port_t port)
552 {
553 	struct work_interval *work_interval = NULL;
554 
555 	if (IP_VALID(port)) {
556 		work_interval = ipc_kobject_get_stable(port, IKOT_WORK_INTERVAL);
557 		if (work_interval) {
558 			work_interval_retain(work_interval);
559 		}
560 	}
561 
562 	return work_interval;
563 }
564 
565 /*
566  * port_name_to_work_interval
567  *
568  * Description: Obtain a reference to the work_interval associated with a given port.
569  *
570  * Parameters:  name    A Mach port name to translate.
571  *
572  * Returns:     NULL    The given Mach port did not reference a work_interval.
573  *              !NULL   The work_interval that is associated with the Mach port.
574  */
575 static kern_return_t
port_name_to_work_interval(mach_port_name_t name,struct work_interval ** work_interval)576 port_name_to_work_interval(mach_port_name_t     name,
577     struct work_interval **work_interval)
578 {
579 	if (!MACH_PORT_VALID(name)) {
580 		return KERN_INVALID_NAME;
581 	}
582 
583 	ipc_port_t port = IP_NULL;
584 	kern_return_t kr = KERN_SUCCESS;
585 
586 	kr = ipc_port_translate_send(current_space(), name, &port);
587 	if (kr != KERN_SUCCESS) {
588 		return kr;
589 	}
590 	/* port is locked */
591 
592 	assert(IP_VALID(port));
593 
594 	struct work_interval *converted_work_interval;
595 
596 	converted_work_interval = work_interval_port_convert_locked(port);
597 
598 	/* the port is valid, but doesn't denote a work_interval */
599 	if (converted_work_interval == NULL) {
600 		kr = KERN_INVALID_CAPABILITY;
601 	}
602 
603 	ip_mq_unlock(port);
604 
605 	if (kr == KERN_SUCCESS) {
606 		*work_interval = converted_work_interval;
607 	}
608 
609 	return kr;
610 }
611 
612 
613 /*
614  * work_interval_port_no_senders
615  *
616  * Description: Handle a no-senders notification for a work interval port.
617  *              Destroys the port and releases its reference on the work interval.
618  *
619  * Parameters:  msg     A Mach no-senders notification message.
620  *
621  * Note: This assumes that there is only one create-right-from-work-interval point,
622  *       if the ability to extract another send right after creation is added,
623  *       this will have to change to handle make-send counts correctly.
624  */
625 static void
work_interval_port_no_senders(ipc_port_t port,mach_port_mscount_t mscount)626 work_interval_port_no_senders(ipc_port_t port, mach_port_mscount_t mscount)
627 {
628 	struct work_interval *work_interval = NULL;
629 
630 	work_interval = ipc_kobject_dealloc_port(port, mscount,
631 	    IKOT_WORK_INTERVAL);
632 
633 	work_interval->wi_port = MACH_PORT_NULL;
634 
635 	work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
636 }
637 
638 /*
639  * work_interval_port_type()
640  *
641  * Converts a port name into the work interval object and returns its type.
642  *
643  * For invalid ports, it returns WORK_INTERVAL_TYPE_LAST (which is not a
644  * valid type for work intervals).
645  */
646 static uint32_t
work_interval_port_type(mach_port_name_t port_name)647 work_interval_port_type(mach_port_name_t port_name)
648 {
649 	struct work_interval *work_interval = NULL;
650 	kern_return_t kr;
651 	uint32_t work_interval_type;
652 
653 	if (port_name == MACH_PORT_NULL) {
654 		return WORK_INTERVAL_TYPE_LAST;
655 	}
656 
657 	kr = port_name_to_work_interval(port_name, &work_interval);
658 	if (kr != KERN_SUCCESS) {
659 		return WORK_INTERVAL_TYPE_LAST;
660 	}
661 	/* work_interval has a +1 ref */
662 
663 	assert(work_interval != NULL);
664 	work_interval_type = work_interval->wi_create_flags & WORK_INTERVAL_TYPE_MASK;
665 	work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
666 	return work_interval_type;
667 }
668 
/*
 * Sparse - not all work interval classes imply a scheduling policy change.
 * The REALTIME_CRITICAL class *also* requires the thread to have explicitly
 * adopted the REALTIME sched mode to take effect.
 *
 * Classes without an entry are zero-filled: priority 0 and TH_MODE_NONE
 * (i.e. no scheduler effect; see work_interval_set_policy()).
 */
static const struct {
	int          priority;
	sched_mode_t sched_mode;
} work_interval_class_data[WI_CLASS_COUNT] = {
	[WI_CLASS_BEST_EFFORT] = {
		BASEPRI_DEFAULT,        // 31
		TH_MODE_TIMESHARE,
	},

	[WI_CLASS_APP_SUPPORT] = {
		BASEPRI_DEFAULT,        // 31
		TH_MODE_TIMESHARE,
	},

	[WI_CLASS_SYSTEM] = {
		BASEPRI_FOREGROUND + 1, // 48
		TH_MODE_FIXED,
	},

	[WI_CLASS_SYSTEM_CRITICAL] = {
		MAXPRI_USER + 1,        // 64
		TH_MODE_FIXED,
	},

	[WI_CLASS_REALTIME_CRITICAL] = {
		BASEPRI_RTQUEUES + 1,   // 98
		TH_MODE_REALTIME,
	},
};
703 
704 /*
705  * Called when a thread gets its scheduling priority from its associated work
706  * interval.
707  */
708 int
work_interval_get_priority(thread_t thread)709 work_interval_get_priority(thread_t thread)
710 {
711 	const struct work_interval *work_interval = thread->th_work_interval;
712 	assert(work_interval != NULL);
713 
714 	assert3u(work_interval->wi_class, !=, WI_CLASS_NONE);
715 	assert3u(work_interval->wi_class, <, WI_CLASS_COUNT);
716 	int priority = work_interval_class_data[work_interval->wi_class].priority;
717 	assert(priority != 0);
718 
719 	priority += work_interval->wi_class_offset;
720 	assert3u(priority, <=, MAXPRI);
721 
722 	return priority;
723 }
724 
725 /*
726  * Switch to a policy driven by the work interval (if applicable).
727  */
728 static void
work_interval_set_policy(thread_t thread)729 work_interval_set_policy(thread_t thread)
730 {
731 	assert3p(thread, ==, current_thread());
732 
733 	/*
734 	 * Ignore policy changes if the workload context shouldn't affect the
735 	 * scheduling policy.
736 	 */
737 	workload_config_flags_t flags = WLC_F_NONE;
738 
739 	/* There may be no config at all. That's ok. */
740 	if (workload_config_get_flags(&flags) != KERN_SUCCESS ||
741 	    (flags & WLC_F_THREAD_POLICY) == 0) {
742 		return;
743 	}
744 
745 	const struct work_interval *work_interval = thread->th_work_interval;
746 	assert(work_interval != NULL);
747 
748 	assert3u(work_interval->wi_class, <, WI_CLASS_COUNT);
749 	const sched_mode_t mode = work_interval_class_data[work_interval->wi_class].sched_mode;
750 
751 	/*
752 	 * A mode of TH_MODE_NONE implies that this work interval has no
753 	 * associated scheduler effects.
754 	 */
755 	if (mode == TH_MODE_NONE) {
756 		return;
757 	}
758 
759 	proc_set_thread_policy_ext(thread, TASK_POLICY_ATTRIBUTE,
760 	    TASK_POLICY_WI_DRIVEN, true, mode);
761 	assert(thread->requested_policy.thrp_wi_driven);
762 
763 	return;
764 }
765 
766 /*
767  * Clear a work interval driven policy.
768  */
769 static void
work_interval_clear_policy(thread_t thread)770 work_interval_clear_policy(thread_t thread)
771 {
772 	assert3p(thread, ==, current_thread());
773 
774 	if (!thread->requested_policy.thrp_wi_driven) {
775 		return;
776 	}
777 
778 	const sched_mode_t mode = sched_get_thread_mode_user(thread);
779 
780 	proc_set_thread_policy_ext(thread, TASK_POLICY_ATTRIBUTE,
781 	    TASK_POLICY_WI_DRIVEN, false,
782 	    mode == TH_MODE_REALTIME ? mode : TH_MODE_TIMESHARE);
783 
784 	assert(!thread->requested_policy.thrp_wi_driven);
785 
786 	return;
787 }
788 
789 /*
790  * thread_set_work_interval()
791  *
792  * Change thread's bound work interval to the passed-in work interval
793  * Consumes +1 ref on work_interval upon success.
794  *
795  * May also pass NULL to un-set work_interval on the thread
796  * Will deallocate any old work interval on the thread
797  * Return error if thread does not satisfy requirements to join work interval
798  *
799  * For non auto-join work intervals, deallocate any old work interval on the thread
800  * For auto-join work intervals, the routine may wakeup the work interval deferred
801  * deallocation queue since thread locks might be currently held.
802  */
static kern_return_t
thread_set_work_interval(thread_t thread,
    struct work_interval *work_interval, thread_work_interval_options_t options)
{
	/* All explicit work interval operations should always be from the current thread */
	if (options & THREAD_WI_EXPLICIT_JOIN_POLICY) {
		assert(thread == current_thread());
	}

	/* All cases of needing the thread lock should be from explicit join scenarios */
	if (options & THREAD_WI_THREAD_LOCK_NEEDED) {
		assert((options & THREAD_WI_EXPLICIT_JOIN_POLICY) != 0);
	}

	/* For all cases of auto join must come in with the thread lock held */
	if (options & THREAD_WI_AUTO_JOIN_POLICY) {
		assert((options & THREAD_WI_THREAD_LOCK_HELD) != 0);
	}

#if CONFIG_THREAD_GROUPS
	if (work_interval && !work_interval->wi_group) {
		/* Reject join on work intervals with deferred thread group creation
		 * (i.e. kern_work_interval_set_workload_id has not run yet) */
		return KERN_INVALID_ARGUMENT;
	}
#endif /* CONFIG_THREAD_GROUPS */

	if (work_interval) {
		uint32_t work_interval_type = work_interval->wi_create_flags & WORK_INTERVAL_TYPE_MASK;

		if (options & THREAD_WI_EXPLICIT_JOIN_POLICY) {
			/* Ensure no kern_work_interval_set_workload_id can happen after this point
			 * by latching the ALREADY_JOINED flag (only succeeds if no flags set yet) */
			uint32_t wlid_flags;
			(void)os_atomic_cmpxchgv(&work_interval->wi_wlid_flags, 0,
			    WORK_INTERVAL_WORKLOAD_ID_ALREADY_JOINED, &wlid_flags, relaxed);
			if (wlid_flags & WORK_INTERVAL_WORKLOAD_ID_RT_ALLOWED) {
				/* For workload IDs with rt-allowed, neuter the check below to
				 * enable joining before the thread has become realtime for all
				 * work interval types */
				work_interval_type = WORK_INTERVAL_TYPE_DEFAULT;
			}
		}

		/* COREAUDIO intervals may only be joined by (currently or previously) realtime threads */
		if ((work_interval_type == WORK_INTERVAL_TYPE_COREAUDIO) &&
		    (thread->sched_mode != TH_MODE_REALTIME) && (thread->saved_mode != TH_MODE_REALTIME)) {
			return KERN_INVALID_ARGUMENT;
		}
	}

	/*
	 * Ensure a work interval scheduling policy is not used if the thread is
	 * leaving the work interval.
	 */
	if (work_interval == NULL &&
	    (options & THREAD_WI_EXPLICIT_JOIN_POLICY) != 0) {
		work_interval_clear_policy(thread);
	}

	/* Snapshot the outgoing interval; its reference is released at the end. */
	struct work_interval *old_th_wi = thread->th_work_interval;
#if CONFIG_SCHED_AUTO_JOIN
	spl_t s;
	/* Take the thread lock if needed */
	if (options & THREAD_WI_THREAD_LOCK_NEEDED) {
		s = splsched();
		thread_lock(thread);
	}

	/*
	 * Work interval auto-join leak to non-RT threads.
	 *
	 * If thread might be running on a remote core and it's not in the context switch path (where
	 * thread is neither running, blocked or in the runq), its not possible to update the
	 * work interval & thread group remotely since its not possible to update CLPC for a remote
	 * core. This situation might happen when a thread is transitioning from realtime to
	 * non-realtime due to backgrounding etc., which would mean that non-RT threads would now
	 * be part of the work interval.
	 *
	 * Since there is no immediate mitigation to this issue, the policy is to set a new
	 * flag on the thread which indicates that such a "leak" has happened. This flag will
	 * be cleared when the remote thread eventually blocks and unjoins from the work interval.
	 */
	bool thread_on_remote_core = ((thread != current_thread()) && (thread->state & TH_RUN) && (thread->runq == PROCESSOR_NULL));

	if (thread_on_remote_core && ((options & THREAD_WI_THREAD_CTX_SWITCH) == 0)) {
		assert((options & THREAD_WI_THREAD_LOCK_NEEDED) == 0);
		os_atomic_or(&thread->th_work_interval_flags, TH_WORK_INTERVAL_FLAGS_AUTO_JOIN_LEAK, relaxed);
		return KERN_SUCCESS;
	}

	const bool old_wi_auto_joined = ((thread->sched_flags & TH_SFLAG_THREAD_GROUP_AUTO_JOIN) != 0);

	if ((options & THREAD_WI_AUTO_JOIN_POLICY) || old_wi_auto_joined) {
		__kdebug_only uint64_t old_tg_id = (old_th_wi && old_th_wi->wi_group) ? thread_group_get_id(old_th_wi->wi_group) : ~0;
		__kdebug_only uint64_t new_tg_id = (work_interval && work_interval->wi_group) ? thread_group_get_id(work_interval->wi_group) : ~0;
		KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_WI_AUTO_JOIN),
		    thread_tid(thread), old_tg_id, new_tg_id, options);
	}

	if (old_wi_auto_joined) {
		/*
		 * If thread was auto-joined to a work interval and is not realtime, make sure it
		 * happened due to the "leak" described above.
		 */
		if (thread->sched_mode != TH_MODE_REALTIME) {
			assert((thread->th_work_interval_flags & TH_WORK_INTERVAL_FLAGS_AUTO_JOIN_LEAK) != 0);
		}

		/* Undo the previous auto-join before installing the new interval. */
		os_atomic_andnot(&thread->th_work_interval_flags, TH_WORK_INTERVAL_FLAGS_AUTO_JOIN_LEAK, relaxed);
		work_interval_auto_join_decrement(old_th_wi, thread);
		thread->sched_flags &= ~TH_SFLAG_THREAD_GROUP_AUTO_JOIN;
	}

#endif /* CONFIG_SCHED_AUTO_JOIN */

	KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_WORKGROUP, WORKGROUP_INTERVAL_CHANGE),
	    thread_tid(thread), (old_th_wi ? old_th_wi->wi_id : 0), (work_interval ? work_interval->wi_id : 0), !!(options & THREAD_WI_AUTO_JOIN_POLICY));

	/* transfer +1 ref to thread */
	thread->th_work_interval = work_interval;

#if CONFIG_SCHED_AUTO_JOIN

	if ((options & THREAD_WI_AUTO_JOIN_POLICY) && work_interval) {
		assert(work_interval_auto_join_enabled(work_interval) == true);
		thread->sched_flags |= TH_SFLAG_THREAD_GROUP_AUTO_JOIN;
	}

	if (options & THREAD_WI_THREAD_LOCK_NEEDED) {
		thread_unlock(thread);
		splx(s);
	}
#endif /* CONFIG_SCHED_AUTO_JOIN */

	/*
	 * The thread got a new work interval. It may come with a work interval
	 * scheduling policy that needs to be applied.
	 */
	if (work_interval != NULL &&
	    (options & THREAD_WI_EXPLICIT_JOIN_POLICY) != 0) {
		work_interval_set_policy(thread);
	}

#if CONFIG_THREAD_GROUPS
	if (work_interval) {
		/* Prevent thread_group_set_name after CLPC may have already heard
		 * about the thread group */
		(void)os_atomic_cmpxchg(&work_interval->wi_group_flags, 0,
		    WORK_INTERVAL_GROUP_FLAGS_THREAD_JOINED, relaxed);
	}
	struct thread_group *new_tg = (work_interval) ? (work_interval->wi_group) : NULL;

	if (options & THREAD_WI_AUTO_JOIN_POLICY) {
#if CONFIG_SCHED_AUTO_JOIN
		thread_set_autojoin_thread_group_locked(thread, new_tg);
#endif
	} else {
		thread_set_work_interval_thread_group(thread, new_tg);
	}
#endif /* CONFIG_THREAD_GROUPS */

	if (options & THREAD_WI_EXPLICIT_JOIN_POLICY) {
		/* Construct mask to XOR with th_work_interval_flags to clear the
		* currently present flags and set the new flags in wlid_flags. */
		uint32_t wlid_flags = 0;
		if (work_interval) {
			wlid_flags = os_atomic_load(&work_interval->wi_wlid_flags, relaxed);
		}
		thread_work_interval_flags_t th_wi_xor_mask = os_atomic_load(
			&thread->th_work_interval_flags, relaxed);
		th_wi_xor_mask &= (TH_WORK_INTERVAL_FLAGS_HAS_WORKLOAD_ID |
		    TH_WORK_INTERVAL_FLAGS_RT_ALLOWED);
		if (wlid_flags & WORK_INTERVAL_WORKLOAD_ID_HAS_ID) {
			th_wi_xor_mask ^= TH_WORK_INTERVAL_FLAGS_HAS_WORKLOAD_ID;
			if (wlid_flags & WORK_INTERVAL_WORKLOAD_ID_RT_ALLOWED) {
				th_wi_xor_mask ^= TH_WORK_INTERVAL_FLAGS_RT_ALLOWED;
			}
		}
		if (th_wi_xor_mask) {
			os_atomic_xor(&thread->th_work_interval_flags, th_wi_xor_mask, relaxed);
		}

		/*
		 * Now that the interval flags have been set, re-evaluate
		 * whether the thread needs to be undemoted - the new work
		 * interval may have the RT_ALLOWED flag. and the thread may
		 * have have a realtime policy but be demoted.
		 */
		thread_rt_evaluate(thread);
	}

	/* Drop the reference the thread held on its previous work interval. */
	if (old_th_wi != NULL) {
		work_interval_release(old_th_wi, options);
	}

	return KERN_SUCCESS;
}
998 
999 static kern_return_t
thread_set_work_interval_explicit_join(thread_t thread,struct work_interval * work_interval)1000 thread_set_work_interval_explicit_join(thread_t thread, struct work_interval *work_interval)
1001 {
1002 	assert(thread == current_thread());
1003 	return thread_set_work_interval(thread, work_interval, THREAD_WI_EXPLICIT_JOIN_POLICY | THREAD_WI_THREAD_LOCK_NEEDED);
1004 }
1005 
1006 kern_return_t
work_interval_thread_terminate(thread_t thread)1007 work_interval_thread_terminate(thread_t thread)
1008 {
1009 	assert(thread == current_thread());
1010 	if (thread->th_work_interval != NULL) {
1011 		return thread_set_work_interval(thread, NULL, THREAD_WI_EXPLICIT_JOIN_POLICY | THREAD_WI_THREAD_LOCK_NEEDED);
1012 	}
1013 	return KERN_SUCCESS;
1014 }
1015 
1016 kern_return_t
kern_work_interval_notify(thread_t thread,struct kern_work_interval_args * kwi_args)1017 kern_work_interval_notify(thread_t thread, struct kern_work_interval_args* kwi_args)
1018 {
1019 	assert(thread == current_thread());
1020 	assert(kwi_args->work_interval_id != 0);
1021 
1022 	struct work_interval *work_interval = thread->th_work_interval;
1023 
1024 	if (work_interval == NULL ||
1025 	    work_interval->wi_id != kwi_args->work_interval_id) {
1026 		/* This thread must have adopted the work interval to be able to notify */
1027 		return KERN_INVALID_ARGUMENT;
1028 	}
1029 
1030 	task_t notifying_task = current_task();
1031 
1032 	if (work_interval->wi_creator_uniqueid != get_task_uniqueid(notifying_task) ||
1033 	    work_interval->wi_creator_pidversion != get_task_version(notifying_task)) {
1034 		/* Only the creating task can do a notify */
1035 		return KERN_INVALID_ARGUMENT;
1036 	}
1037 
1038 	spl_t s = splsched();
1039 
1040 #if CONFIG_THREAD_GROUPS
1041 	assert(work_interval->wi_group == thread->thread_group);
1042 #endif /* CONFIG_THREAD_GROUPS */
1043 
1044 	uint64_t urgency_param1, urgency_param2;
1045 	kwi_args->urgency = (uint16_t)thread_get_urgency(thread, &urgency_param1, &urgency_param2);
1046 
1047 	splx(s);
1048 
1049 	/* called without interrupts disabled */
1050 	machine_work_interval_notify(thread, kwi_args);
1051 
1052 	return KERN_SUCCESS;
1053 }
1054 
/* Monotonic generator for work interval IDs, advanced with os_atomic_inc.
 * Start at 1, 0 is not a valid work interval ID */
static _Atomic uint64_t unique_work_interval_id = 1;
1057 
/*
 * kern_work_interval_create()
 *
 * Allocate a new work interval for the calling thread. Depending on
 * create_flags, the interval is either made JOINABLE (a send right is
 * returned in create_params->wica_port) or immediately joined by the
 * calling thread (legacy combined create-and-join). With
 * CONFIG_THREAD_GROUPS, the interval either creates its own thread group,
 * retains the thread's home group, or defers group creation until
 * kern_work_interval_set_workload_id().
 */
kern_return_t
kern_work_interval_create(thread_t thread,
    struct kern_work_interval_create_args *create_params)
{
	assert(thread == current_thread());

	uint32_t create_flags = create_params->wica_create_flags;

	if (((create_flags & WORK_INTERVAL_FLAG_JOINABLE) == 0) &&
	    thread->th_work_interval != NULL) {
		/*
		 * If the thread is doing a legacy combined create and join,
		 * it shouldn't already be part of a work interval.
		 *
		 * (Creating a joinable WI is allowed anytime.)
		 */
		return KERN_FAILURE;
	}

	/*
	 * Check the validity of the create flags before allocating the work
	 * interval.
	 */
	task_t creating_task = current_task();
	if ((create_flags & WORK_INTERVAL_TYPE_MASK) == WORK_INTERVAL_TYPE_CA_CLIENT) {
		/*
		 * CA_CLIENT work intervals do not create new thread groups.
		 * There can only be one CA_CLIENT work interval (created by UIKit or AppKit)
		 * per each application task
		 */
		if (create_flags & WORK_INTERVAL_FLAG_GROUP) {
			return KERN_FAILURE;
		}
		if (!task_is_app(creating_task)) {
#if XNU_TARGET_OS_OSX
			/*
			 * Soft-fail the case of a non-app pretending to be an
			 * app, by allowing it to press the buttons, but they're
			 * not actually connected to anything.
			 */
			create_flags |= WORK_INTERVAL_FLAG_IGNORED;
#else
			/*
			 * On iOS, it's a hard failure to get your apptype
			 * wrong and then try to render something.
			 */
			return KERN_NOT_SUPPORTED;
#endif /* XNU_TARGET_OS_OSX */
		}
		if (task_set_ca_client_wi(creating_task, true) == false) {
			return KERN_FAILURE;
		}
	}

#if CONFIG_SCHED_AUTO_JOIN
	/* Auto-join is only valid for grouped COREAUDIO work intervals. */
	if (create_flags & WORK_INTERVAL_FLAG_ENABLE_AUTO_JOIN) {
		uint32_t type = (create_flags & WORK_INTERVAL_TYPE_MASK);
		if (type != WORK_INTERVAL_TYPE_COREAUDIO) {
			return KERN_NOT_SUPPORTED;
		}
		if ((create_flags & WORK_INTERVAL_FLAG_GROUP) == 0) {
			return KERN_NOT_SUPPORTED;
		}
	}

	/* Deferred finish requires auto-join to be enabled as well. */
	if (create_flags & WORK_INTERVAL_FLAG_ENABLE_DEFERRED_FINISH) {
		if ((create_flags & WORK_INTERVAL_FLAG_ENABLE_AUTO_JOIN) == 0) {
			return KERN_NOT_SUPPORTED;
		}
	}
#endif /* CONFIG_SCHED_AUTO_JOIN */

	struct work_interval *work_interval = kalloc_type(struct work_interval,
	    Z_WAITOK | Z_ZERO | Z_NOFAIL);

	uint64_t work_interval_id = os_atomic_inc(&unique_work_interval_id, relaxed);

	*work_interval = (struct work_interval) {
		.wi_id                  = work_interval_id,
		.wi_ref_count           = {},
		.wi_create_flags        = create_flags,
		.wi_creator_pid         = pid_from_task(creating_task),
		.wi_creator_uniqueid    = get_task_uniqueid(creating_task),
		.wi_creator_pidversion  = get_task_version(creating_task),
	};
	os_ref_init(&work_interval->wi_ref_count, NULL);

	if (work_interval_telemetry_data_enabled(work_interval)) {
		recount_work_interval_init(&work_interval->wi_recount);
	}

	__kdebug_only uint64_t tg_id = 0;
#if CONFIG_THREAD_GROUPS
	struct thread_group *tg;
	if ((create_flags &
	    (WORK_INTERVAL_FLAG_GROUP | WORK_INTERVAL_FLAG_HAS_WORKLOAD_ID)) ==
	    (WORK_INTERVAL_FLAG_GROUP | WORK_INTERVAL_FLAG_HAS_WORKLOAD_ID)) {
		/* defer creation of the thread group until the
		 * kern_work_interval_set_workload_id() call */
		work_interval->wi_group = NULL;
	} else if (create_flags & WORK_INTERVAL_FLAG_GROUP) {
		/* create a new group for the interval to represent */
		char name[THREAD_GROUP_MAXNAME] = "";

		snprintf(name, sizeof(name), "WI%lld (pid %d)", work_interval_id,
		    work_interval->wi_creator_pid);

		tg = thread_group_create_and_retain(THREAD_GROUP_FLAGS_DEFAULT);

		thread_group_set_name(tg, name);

		work_interval->wi_group = tg;
	} else {
		/* the interval represents the thread's home group */
		tg = thread_group_get_home_group(thread);

		thread_group_retain(tg);

		work_interval->wi_group = tg;
	}

	/* Capture the tg_id for tracing purposes */
	tg_id = work_interval->wi_group ? thread_group_get_id(work_interval->wi_group) : ~0;

#endif /* CONFIG_THREAD_GROUPS */

	if (create_flags & WORK_INTERVAL_FLAG_JOINABLE) {
		mach_port_name_t name = MACH_PORT_NULL;

		/* work_interval has a +1 ref, moves to the port */
		work_interval->wi_port = ipc_kobject_alloc_port(
			(ipc_kobject_t)work_interval, IKOT_WORK_INTERVAL,
			IPC_KOBJECT_ALLOC_MAKE_SEND | IPC_KOBJECT_ALLOC_NSREQUEST);


		name = ipc_port_copyout_send(work_interval->wi_port, current_space());

		if (!MACH_PORT_VALID(name)) {
			/*
			 * copyout failed (port is already deallocated)
			 * Because of the port-destroyed magic,
			 * the work interval is already deallocated too.
			 */
			return KERN_RESOURCE_SHORTAGE;
		}

		create_params->wica_port = name;
	} else {
		/* work_interval has a +1 ref, moves to the thread */
		kern_return_t kr = thread_set_work_interval_explicit_join(thread, work_interval);
		if (kr != KERN_SUCCESS) {
			/* No other thread can join this work interval since it isn't
			 * JOINABLE so release the reference on work interval */
			work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
			return kr;
		}

		create_params->wica_port = MACH_PORT_NULL;
	}

	create_params->wica_id = work_interval_id;

	/* tg_id stays ~0 when group creation was deferred; trace only real groups */
	if (tg_id != ~0) {
		KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_WORKGROUP, WORKGROUP_INTERVAL_CREATE),
		    work_interval_id, create_flags, pid_from_task(creating_task), tg_id);
	}
	return KERN_SUCCESS;
}
1226 
1227 kern_return_t
kern_work_interval_get_flags_from_port(mach_port_name_t port_name,uint32_t * flags)1228 kern_work_interval_get_flags_from_port(mach_port_name_t port_name, uint32_t *flags)
1229 {
1230 	assert(flags != NULL);
1231 
1232 	kern_return_t kr;
1233 	struct work_interval *work_interval;
1234 
1235 	kr = port_name_to_work_interval(port_name, &work_interval);
1236 	if (kr != KERN_SUCCESS) {
1237 		return kr;
1238 	}
1239 
1240 	assert(work_interval != NULL);
1241 	*flags = work_interval->wi_create_flags;
1242 
1243 	work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
1244 
1245 	return KERN_SUCCESS;
1246 }
1247 
#if CONFIG_THREAD_GROUPS
/* Work interval names are formatted into thread group names (see
 * kern_work_interval_set_name), so the two size limits must stay in sync. */
_Static_assert(WORK_INTERVAL_NAME_MAX == THREAD_GROUP_MAXNAME,
    "WORK_INTERVAL_NAME_MAX does not match THREAD_GROUP_MAXNAME");
#endif /* CONFIG_THREAD_GROUPS */
1252 
/*
 * kern_work_interval_set_name()
 *
 * Rename the thread group backing the work interval named by port_name.
 * Only allowed before any thread has joined the interval and only for
 * intervals that own a group (WORK_INTERVAL_FLAG_GROUP). `name` is marked
 * __unused because it is only referenced under CONFIG_THREAD_GROUPS.
 * Without CONFIG_THREAD_GROUPS this is a no-op returning the port lookup
 * status (KERN_SUCCESS).
 */
kern_return_t
kern_work_interval_set_name(mach_port_name_t port_name, __unused char *name,
    size_t len)
{
	kern_return_t kr;
	struct work_interval *work_interval;

	if (len > WORK_INTERVAL_NAME_MAX) {
		return KERN_INVALID_ARGUMENT;
	}
	/* Takes a +1 reference on success; released below. */
	kr = port_name_to_work_interval(port_name, &work_interval);
	if (kr != KERN_SUCCESS) {
		return kr;
	}

	assert(work_interval != NULL);

#if CONFIG_THREAD_GROUPS
	/* Renaming races with joins: once a thread has joined (CLPC may have
	 * observed the group) the name is frozen. */
	uint32_t wi_group_flags = os_atomic_load(
		&work_interval->wi_group_flags, relaxed);
	if (wi_group_flags & WORK_INTERVAL_GROUP_FLAGS_THREAD_JOINED) {
		kr = KERN_INVALID_ARGUMENT;
		goto out;
	}
	if (!work_interval->wi_group) {
		/* Group creation was deferred and hasn't happened yet. */
		kr = KERN_INVALID_ARGUMENT;
		goto out;
	}

	if (name[0] && (work_interval->wi_create_flags & WORK_INTERVAL_FLAG_GROUP)) {
		char tgname[THREAD_GROUP_MAXNAME];
		snprintf(tgname, sizeof(tgname), "WI%lld %s", work_interval->wi_id,
		    name);
		thread_group_set_name(work_interval->wi_group, tgname);
	}

out:
#endif /* CONFIG_THREAD_GROUPS */
	work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);

	return kr;
}
1295 
/*
 * kern_work_interval_set_workload_id()
 *
 * Attach a workload ID (by name) to a joinable work interval, at most once
 * and only before any thread joins. When the ID is found in the workload
 * config table, its flags/class are applied; otherwise basic create-flag
 * validation is performed and fallback flags are chosen. May perform the
 * thread group creation that kern_work_interval_create() deferred.
 * On a repeat call, acts as a query for the previously-set flags.
 */
kern_return_t
kern_work_interval_set_workload_id(mach_port_name_t port_name,
    struct kern_work_interval_workload_id_args *workload_id_args)
{
	kern_return_t kr;
	struct work_interval *work_interval;
	uint32_t wlida_flags = 0;
	uint32_t wlid_flags = 0;
#if CONFIG_THREAD_GROUPS
	uint32_t tg_flags = 0;
#endif
	bool from_workload_config = false;

	/* Ensure workload ID name is non-empty. */
	if (!workload_id_args->wlida_name[0]) {
		return KERN_INVALID_ARGUMENT;
	}

	/* Takes a +1 reference on success; released at `out`. */
	kr = port_name_to_work_interval(port_name, &work_interval);
	if (kr != KERN_SUCCESS) {
		return kr;
	}

	assert(work_interval != NULL);
	if (!(work_interval->wi_create_flags & WORK_INTERVAL_FLAG_JOINABLE)) {
		kr = KERN_INVALID_ARGUMENT;
		goto out;
	}

	if (!(work_interval->wi_create_flags & WORK_INTERVAL_FLAG_HAS_WORKLOAD_ID)) {
		/* Reject work intervals that didn't indicate they will have a workload ID
		 * at creation. In particular if the work interval has its own thread group,
		 * its creation must have been deferred in kern_work_interval_create */
		kr = KERN_INVALID_ARGUMENT;
		goto out;
	}

	workload_config_t wl_config = {};
	kr = workload_config_lookup_default(workload_id_args->wlida_name, &wl_config);
	if (kr == KERN_SUCCESS) {
		/* ID found in the workload config table: types must match
		 * (modulo the FRAME_COMPOSITOR/CA_RENDER_SERVER pairing). */
		if ((wl_config.wc_create_flags & WORK_INTERVAL_TYPE_MASK) !=
		    (work_interval->wi_create_flags & WORK_INTERVAL_TYPE_MASK)) {
			if ((wl_config.wc_create_flags & WORK_INTERVAL_TYPE_MASK) == WORK_INTERVAL_TYPE_CA_RENDER_SERVER &&
			    (work_interval->wi_create_flags & WORK_INTERVAL_TYPE_MASK) == WORK_INTERVAL_TYPE_FRAME_COMPOSITOR) {
				/* WORK_INTERVAL_TYPE_FRAME_COMPOSITOR is a valid related type of WORK_INTERVAL_TYPE_CA_RENDER_SERVER */
			} else {
				kr = KERN_INVALID_ARGUMENT;
				goto out;
			}
		}

		wlida_flags = wl_config.wc_flags;

		wlida_flags &= ~WORK_INTERVAL_WORKLOAD_ID_RT_CRITICAL;

#if CONFIG_THREAD_GROUPS
		/* Custom thread group flags require the interval to own a group. */
		tg_flags = wl_config.wc_thread_group_flags;
		if (tg_flags != THREAD_GROUP_FLAGS_ABSENT &&
		    (work_interval->wi_create_flags & WORK_INTERVAL_FLAG_GROUP) == 0) {
			kr = KERN_INVALID_ARGUMENT;
			goto out;
		}
#endif /* CONFIG_THREAD_GROUPS */

		from_workload_config = true;
	} else {
		/* If the workload is not present in the table, perform basic validation
		 * that the create flags passed in match the ones used at work interval
		 * create time */
		if ((workload_id_args->wlida_wicreate_flags & WORK_INTERVAL_TYPE_MASK) !=
		    (work_interval->wi_create_flags & WORK_INTERVAL_TYPE_MASK)) {
			kr = KERN_INVALID_ARGUMENT;
			goto out;
		}

		const bool wc_avail = workload_config_available();
		if (!wc_avail) {
			wlida_flags = WORK_INTERVAL_WORKLOAD_ID_RT_ALLOWED;
		}

		/*
		 * If the workload config wasn't even loaded then fallback to
		 * older behaviour where the new thread group gets the default
		 * thread group flags (when WORK_INTERVAL_FLAG_GROUP is set).
		 */
#if CONFIG_THREAD_GROUPS
		if (!wc_avail) {
			tg_flags = THREAD_GROUP_FLAGS_DEFAULT;
		} else {
			struct thread_group *home_group =
			    thread_group_get_home_group(current_thread());
			if (home_group != NULL) {
				tg_flags = thread_group_get_flags(home_group);
			}
		}
#endif /* CONFIG_THREAD_GROUPS */
	}

	workload_id_args->wlida_wicreate_flags = work_interval->wi_create_flags;

	/* cmpxchg a non-zero workload ID flags value (indicating that workload ID
	 * has been set). */
	wlida_flags |= WORK_INTERVAL_WORKLOAD_ID_HAS_ID;
	if (os_atomic_cmpxchgv(&work_interval->wi_wlid_flags, 0, wlida_flags,
	    &wlid_flags, relaxed)) {
		/* We won the race: this call owns the one-time initialization. */
		if (from_workload_config) {
			work_interval->wi_class = wl_config.wc_class;
			work_interval->wi_class_offset = wl_config.wc_class_offset;
		}
#if CONFIG_THREAD_GROUPS
		if (work_interval->wi_create_flags & WORK_INTERVAL_FLAG_GROUP) {
			/* Perform deferred thread group creation, now that tgflags are known */
			struct thread_group *tg;
			tg = thread_group_create_and_retain(tg_flags == THREAD_GROUP_FLAGS_ABSENT ?
			    THREAD_GROUP_FLAGS_DEFAULT : tg_flags);

			char tgname[THREAD_GROUP_MAXNAME] = "";
			snprintf(tgname, sizeof(tgname), "WI%lld %s", work_interval->wi_id,
			    workload_id_args->wlida_name);
			thread_group_set_name(tg, tgname);

			assert(work_interval->wi_group == NULL);
			work_interval->wi_group = tg;
			KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_WORKGROUP, WORKGROUP_INTERVAL_CREATE),
			    work_interval->wi_id, work_interval->wi_create_flags,
			    work_interval->wi_creator_pid, thread_group_get_id(tg));
		}
#endif /* CONFIG_THREAD_GROUPS */
	} else {
		/* Workload ID has previously been set (or a thread has already joined). */
		if (wlid_flags & WORK_INTERVAL_WORKLOAD_ID_ALREADY_JOINED) {
			kr = KERN_INVALID_ARGUMENT;
			goto out;
		}
		/* Treat this request as a query for the out parameters of the ID */
		workload_id_args->wlida_flags = wlid_flags;
	}

	/*
	 * Emit tracepoints for successfully setting the workload ID.
	 *
	 * After rdar://89342390 has been fixed and a new work interval ktrace
	 * provider has been added, it will be possible to associate a numeric
	 * ID with an ID name. Thus, for those cases where the ID name has been
	 * looked up successfully (`from_workload_config` is true) it will no
	 * longer be necessary to emit a tracepoint with the full ID name.
	 */
	KDBG(MACHDBG_CODE(DBG_MACH_WORKGROUP, WORKGROUP_INTERVAL_SET_WORKLOAD_ID),
	    work_interval->wi_id, from_workload_config);
	kernel_debug_string_simple(
		MACHDBG_CODE(DBG_MACH_WORKGROUP, WORKGROUP_INTERVAL_SET_WORKLOAD_ID_NAME),
		workload_id_args->wlida_name);

	kr = KERN_SUCCESS;

out:
	work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);

	return kr;
}
1456 
1457 
1458 kern_return_t
kern_work_interval_destroy(thread_t thread,uint64_t work_interval_id)1459 kern_work_interval_destroy(thread_t thread, uint64_t work_interval_id)
1460 {
1461 	if (work_interval_id == 0) {
1462 		return KERN_INVALID_ARGUMENT;
1463 	}
1464 
1465 	if (thread->th_work_interval == NULL ||
1466 	    thread->th_work_interval->wi_id != work_interval_id) {
1467 		/* work ID isn't valid or doesn't match joined work interval ID */
1468 		return KERN_INVALID_ARGUMENT;
1469 	}
1470 
1471 	return thread_set_work_interval_explicit_join(thread, NULL);
1472 }
1473 
1474 kern_return_t
kern_work_interval_join(thread_t thread,mach_port_name_t port_name)1475 kern_work_interval_join(thread_t            thread,
1476     mach_port_name_t    port_name)
1477 {
1478 	struct work_interval *work_interval = NULL;
1479 	kern_return_t kr;
1480 
1481 	if (port_name == MACH_PORT_NULL) {
1482 		/* 'Un-join' the current work interval */
1483 		return thread_set_work_interval_explicit_join(thread, NULL);
1484 	}
1485 
1486 	kr = port_name_to_work_interval(port_name, &work_interval);
1487 	if (kr != KERN_SUCCESS) {
1488 		return kr;
1489 	}
1490 	/* work_interval has a +1 ref */
1491 
1492 	assert(work_interval != NULL);
1493 
1494 	kr = thread_set_work_interval_explicit_join(thread, work_interval);
1495 	/* ref was consumed by passing it to the thread in the successful case */
1496 	if (kr != KERN_SUCCESS) {
1497 		work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
1498 	}
1499 	return kr;
1500 }
1501 
1502 /*
1503  * work_interval_port_type_render_server()
1504  *
1505  * Helper routine to determine if the port points to a
1506  * WORK_INTERVAL_TYPE_CA_RENDER_SERVER work interval.
1507  */
1508 bool
work_interval_port_type_render_server(mach_port_name_t port_name)1509 work_interval_port_type_render_server(mach_port_name_t port_name)
1510 {
1511 	return work_interval_port_type(port_name) == WORK_INTERVAL_TYPE_CA_RENDER_SERVER;
1512 }
1513