xref: /xnu-10063.101.15/osfmk/kern/sched_prim.c (revision 94d3b452840153a99b38a3a9659680b2a006908e)
1 /*
2  * Copyright (c) 2000-2016 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * @OSF_FREE_COPYRIGHT@
30  */
31 /*
32  * Mach Operating System
33  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34  * All Rights Reserved.
35  *
36  * Permission to use, copy, modify and distribute this software and its
37  * documentation is hereby granted, provided that both the copyright
38  * notice and this permission notice appear in all copies of the
39  * software, derivative works or modified versions, and any portions
40  * thereof, and that both notices appear in supporting documentation.
41  *
42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45  *
46  * Carnegie Mellon requests users of this software to return to
47  *
48  *  Software Distribution Coordinator  or  [email protected]
49  *  School of Computer Science
50  *  Carnegie Mellon University
51  *  Pittsburgh PA 15213-3890
52  *
53  * any improvements or extensions that they make and grant Carnegie Mellon
54  * the rights to redistribute these changes.
55  */
56 /*
57  */
58 /*
59  *	File:	sched_prim.c
60  *	Author:	Avadis Tevanian, Jr.
61  *	Date:	1986
62  *
63  *	Scheduling primitives
64  *
65  */
66 
67 #include <debug.h>
68 
69 #include <mach/mach_types.h>
70 #include <mach/machine.h>
71 #include <mach/policy.h>
72 #include <mach/sync_policy.h>
73 #include <mach/thread_act.h>
74 
75 #include <machine/machine_routines.h>
76 #include <machine/sched_param.h>
77 #include <machine/machine_cpu.h>
78 #include <machine/limits.h>
79 #include <machine/atomic.h>
80 
81 #include <machine/commpage.h>
82 
83 #include <kern/kern_types.h>
84 #include <kern/backtrace.h>
85 #include <kern/clock.h>
86 #include <kern/cpu_number.h>
87 #include <kern/cpu_data.h>
88 #include <kern/smp.h>
89 #include <kern/debug.h>
90 #include <kern/macro_help.h>
91 #include <kern/machine.h>
92 #include <kern/misc_protos.h>
93 #include <kern/monotonic.h>
94 #include <kern/processor.h>
95 #include <kern/queue.h>
96 #include <kern/recount.h>
97 #include <kern/restartable.h>
98 #include <kern/sched.h>
99 #include <kern/sched_prim.h>
100 #include <kern/sfi.h>
101 #include <kern/syscall_subr.h>
102 #include <kern/task.h>
103 #include <kern/thread.h>
104 #include <kern/thread_group.h>
105 #include <kern/ledger.h>
106 #include <kern/timer_queue.h>
107 #include <kern/waitq.h>
108 #include <kern/policy_internal.h>
109 
110 #include <vm/pmap.h>
111 #include <vm/vm_kern.h>
112 #include <vm/vm_map.h>
113 #include <vm/vm_pageout.h>
114 
115 #include <mach/sdt.h>
116 #include <mach/mach_host.h>
117 #include <mach/host_info.h>
118 
119 #include <sys/kdebug.h>
120 #include <kperf/kperf.h>
121 #include <kern/kpc.h>
122 #include <san/kasan.h>
123 #include <kern/pms.h>
124 #include <kern/host.h>
125 #include <stdatomic.h>
126 #include <os/atomic_private.h>
127 
/*
 * KTRC: kdebug tracepoint emitter used by this file.
 * Prefer KDBG_MACOS_RELEASE when the build provides it (presumably the
 * macOS-release-only variant — confirm in sys/kdebug.h), otherwise fall
 * back to the generic release tracepoint macro.
 */
#ifdef KDBG_MACOS_RELEASE
#define KTRC KDBG_MACOS_RELEASE
#else
#define KTRC KDBG_RELEASE
#endif
133 
134 struct sched_statistics PERCPU_DATA(sched_stats);
135 bool sched_stats_active;
136 
137 static uint64_t
deadline_add(uint64_t d,uint64_t e)138 deadline_add(uint64_t d, uint64_t e)
139 {
140 	uint64_t sum;
141 	return os_add_overflow(d, e, &sum) ? UINT64_MAX : sum;
142 }
143 
144 int
rt_runq_count(processor_set_t pset)145 rt_runq_count(processor_set_t pset)
146 {
147 	return os_atomic_load(&SCHED(rt_runq)(pset)->count, relaxed);
148 }
149 
150 uint64_t
rt_runq_earliest_deadline(processor_set_t pset)151 rt_runq_earliest_deadline(processor_set_t pset)
152 {
153 	return os_atomic_load_wide(&SCHED(rt_runq)(pset)->earliest_deadline, relaxed);
154 }
155 
156 static int
rt_runq_priority(processor_set_t pset)157 rt_runq_priority(processor_set_t pset)
158 {
159 	pset_assert_locked(pset);
160 	rt_queue_t rt_run_queue = SCHED(rt_runq)(pset);
161 
162 	bitmap_t *map = rt_run_queue->bitmap;
163 	int i = bitmap_first(map, NRTQS);
164 	assert(i < NRTQS);
165 
166 	if (i >= 0) {
167 		return i + BASEPRI_RTQUEUES;
168 	}
169 
170 	return i;
171 }
172 
173 static thread_t rt_runq_first(rt_queue_t rt_runq);
174 
#if DEBUG
/*
 * check_rt_runq_consistency:
 *
 * DEBUG-only invariant checker for a realtime run queue.  Walks every
 * per-priority sub-queue and asserts that:
 *   - each queued thread has the priority of the queue it sits on,
 *     valid (non-sentinel) deadline/constraint, and sorted deadlines;
 *   - the head of each sub-queue matches the cached pri_earliest_deadline
 *     and pri_constraint;
 *   - per-priority counts, the occupancy bitmap, and the queue-wide
 *     aggregate (earliest_deadline, count, constraint, ed_index) all
 *     agree with what a full recount produces.
 * If 'thread' is non-NULL, additionally asserts that it is present
 * somewhere on the queue.
 */
static void
check_rt_runq_consistency(rt_queue_t rt_run_queue, thread_t thread)
{
	bitmap_t *map = rt_run_queue->bitmap;

	/* Aggregates recomputed from scratch, to compare with the cached ones. */
	uint64_t earliest_deadline = RT_DEADLINE_NONE;
	uint32_t constraint = RT_CONSTRAINT_NONE;
	int ed_index = NOPRI;
	int count = 0;
	bool found_thread = false;

	for (int pri = BASEPRI_RTQUEUES; pri <= MAXPRI; pri++) {
		int i = pri - BASEPRI_RTQUEUES;
		rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i];
		queue_t queue = &rt_runq->pri_queue;
		queue_entry_t iter;
		int n = 0;
		uint64_t previous_deadline = 0;
		qe_foreach(iter, queue) {
			thread_t iter_thread = qe_element(iter, struct thread, runq_links);
			assert_thread_magic(iter_thread);
			if (iter_thread == thread) {
				found_thread = true;
			}
			assert(iter_thread->sched_pri == (i + BASEPRI_RTQUEUES));
			/* Sentinel values must never appear on a queued thread. */
			assert(iter_thread->realtime.deadline < RT_DEADLINE_NONE);
			assert(iter_thread->realtime.constraint < RT_CONSTRAINT_NONE);
			/* Sub-queues are kept sorted by deadline, earliest first. */
			assert(previous_deadline <= iter_thread->realtime.deadline);
			n++;
			if (iter == queue_first(queue)) {
				/* Cached per-priority summary must match the queue head. */
				assert(rt_runq->pri_earliest_deadline == iter_thread->realtime.deadline);
				assert(rt_runq->pri_constraint == iter_thread->realtime.constraint);
			}
			previous_deadline = iter_thread->realtime.deadline;
		}
		assert(n == rt_runq->pri_count);
		if (n == 0) {
			/* Empty sub-queue: bit clear and summary fields reset. */
			assert(bitmap_test(map, i) == false);
			assert(rt_runq->pri_earliest_deadline == RT_DEADLINE_NONE);
			assert(rt_runq->pri_constraint == RT_CONSTRAINT_NONE);
		} else {
			assert(bitmap_test(map, i) == true);
		}
		if (rt_runq->pri_earliest_deadline < earliest_deadline) {
			earliest_deadline = rt_runq->pri_earliest_deadline;
			constraint = rt_runq->pri_constraint;
			ed_index = i;
		}
		count += n;
	}
	/* Queue-wide cached aggregates must match the recount. */
	assert(os_atomic_load_wide(&rt_run_queue->earliest_deadline, relaxed) == earliest_deadline);
	assert(os_atomic_load(&rt_run_queue->count, relaxed) == count);
	assert(os_atomic_load(&rt_run_queue->constraint, relaxed) == constraint);
	assert(os_atomic_load(&rt_run_queue->ed_index, relaxed) == ed_index);
	if (thread) {
		assert(found_thread);
	}
}
#define CHECK_RT_RUNQ_CONSISTENCY(q, th)    check_rt_runq_consistency(q, th)
#else
#define CHECK_RT_RUNQ_CONSISTENCY(q, th)    do {} while (0)
#endif
238 
239 uint32_t rt_constraint_threshold;
240 
241 static bool
rt_runq_is_low_latency(processor_set_t pset)242 rt_runq_is_low_latency(processor_set_t pset)
243 {
244 	return os_atomic_load(&SCHED(rt_runq)(pset)->constraint, relaxed) <= rt_constraint_threshold;
245 }
246 
247 TUNABLE(bool, cpulimit_affects_quantum, "cpulimit_affects_quantum", true);
248 
249 /* TODO: enable this, to 50us (less than the deferred IPI latency, to beat a spill) */
250 TUNABLE(uint32_t, nonurgent_preemption_timer_us, "nonurgent_preemption_timer", 0); /* microseconds */
251 static uint64_t nonurgent_preemption_timer_abs = 0;
252 
253 #define         DEFAULT_PREEMPTION_RATE         100             /* (1/s) */
254 TUNABLE(int, default_preemption_rate, "preempt", DEFAULT_PREEMPTION_RATE);
255 
256 #define         DEFAULT_BG_PREEMPTION_RATE      400             /* (1/s) */
257 TUNABLE(int, default_bg_preemption_rate, "bg_preempt", DEFAULT_BG_PREEMPTION_RATE);
258 
259 #define         MAX_UNSAFE_RT_QUANTA               100
260 #define         SAFE_RT_MULTIPLIER                 2
261 
262 #define         MAX_UNSAFE_FIXED_QUANTA               100
263 #define         SAFE_FIXED_MULTIPLIER                 2
264 
265 TUNABLE_DEV_WRITEABLE(int, max_unsafe_rt_quanta, "max_unsafe_rt_quanta", MAX_UNSAFE_RT_QUANTA);
266 TUNABLE_DEV_WRITEABLE(int, max_unsafe_fixed_quanta, "max_unsafe_fixed_quanta", MAX_UNSAFE_FIXED_QUANTA);
267 
268 TUNABLE_DEV_WRITEABLE(int, safe_rt_multiplier, "safe_rt_multiplier", SAFE_RT_MULTIPLIER);
269 TUNABLE_DEV_WRITEABLE(int, safe_fixed_multiplier, "safe_fixed_multiplier", SAFE_RT_MULTIPLIER);
270 
271 #define         MAX_POLL_QUANTA                 2
272 TUNABLE(int, max_poll_quanta, "poll", MAX_POLL_QUANTA);
273 
274 #define         SCHED_POLL_YIELD_SHIFT          4               /* 1/16 */
275 int             sched_poll_yield_shift = SCHED_POLL_YIELD_SHIFT;
276 
277 uint64_t        max_poll_computation;
278 
279 uint64_t        max_unsafe_rt_computation;
280 uint64_t        max_unsafe_fixed_computation;
281 uint64_t        sched_safe_rt_duration;
282 uint64_t        sched_safe_fixed_duration;
283 
284 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
285 
286 uint32_t        std_quantum;
287 uint32_t        min_std_quantum;
288 uint32_t        bg_quantum;
289 
290 uint32_t        std_quantum_us;
291 uint32_t        bg_quantum_us;
292 
293 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
294 
295 uint32_t        thread_depress_time;
296 uint32_t        default_timeshare_computation;
297 uint32_t        default_timeshare_constraint;
298 
299 uint32_t        max_rt_quantum;
300 uint32_t        min_rt_quantum;
301 
302 uint32_t        rt_deadline_epsilon;
303 
304 uint32_t        rt_constraint_threshold;
305 
306 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
307 
308 unsigned                sched_tick;
309 uint32_t                sched_tick_interval;
310 
311 /* Timeshare load calculation interval (15ms) */
312 uint32_t                sched_load_compute_interval_us = 15000;
313 uint64_t                sched_load_compute_interval_abs;
314 static _Atomic uint64_t sched_load_compute_deadline;
315 
316 uint32_t        sched_pri_shifts[TH_BUCKET_MAX];
317 uint32_t        sched_fixed_shift;
318 
319 uint32_t        sched_decay_usage_age_factor = 1; /* accelerate 5/8^n usage aging */
320 
321 /* Allow foreground to decay past default to resolve inversions */
322 #define DEFAULT_DECAY_BAND_LIMIT ((BASEPRI_FOREGROUND - BASEPRI_DEFAULT) + 2)
323 int             sched_pri_decay_band_limit = DEFAULT_DECAY_BAND_LIMIT;
324 
325 /* Defaults for timer deadline profiling */
326 #define TIMER_DEADLINE_TRACKING_BIN_1_DEFAULT 2000000 /* Timers with deadlines <=
327 	                                               * 2ms */
328 #define TIMER_DEADLINE_TRACKING_BIN_2_DEFAULT 5000000 /* Timers with deadlines
329 	                                               *   <= 5ms */
330 
331 uint64_t timer_deadline_tracking_bin_1;
332 uint64_t timer_deadline_tracking_bin_2;
333 
334 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
335 
336 thread_t sched_maintenance_thread;
337 
338 /* interrupts disabled lock to guard recommended cores state */
339 decl_simple_lock_data(, sched_available_cores_lock);
340 uint64_t        perfcontrol_requested_recommended_cores = ALL_CORES_RECOMMENDED;
341 uint64_t        perfcontrol_system_requested_recommended_cores = ALL_CORES_RECOMMENDED;
342 uint64_t        perfcontrol_user_requested_recommended_cores = ALL_CORES_RECOMMENDED;
343 static uint64_t usercontrol_requested_recommended_cores = ALL_CORES_RECOMMENDED;
344 static uint64_t sched_online_processors = 0;
345 static void sched_update_recommended_cores(uint64_t recommended_cores, processor_reason_t reason, uint32_t flags);
346 static void sched_update_powered_cores(uint64_t reqested_powered_cores, processor_reason_t reason, uint32_t flags);
347 
348 #if __arm64__
349 static void sched_recommended_cores_maintenance(void);
350 uint64_t    perfcontrol_failsafe_starvation_threshold;
351 extern char *proc_name_address(struct proc *p);
352 #endif /* __arm64__ */
353 
354 uint64_t        sched_one_second_interval;
355 boolean_t       allow_direct_handoff = TRUE;
356 
357 /* Forwards */
358 
359 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
360 
361 static void load_shift_init(void);
362 static void preempt_pri_init(void);
363 
364 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
365 
366 thread_t        processor_idle(
367 	thread_t                        thread,
368 	processor_t                     processor);
369 
370 static ast_t
371 csw_check_locked(
372 	thread_t        thread,
373 	processor_t     processor,
374 	processor_set_t pset,
375 	ast_t           check_reason);
376 
377 static void processor_setrun(
378 	processor_t                    processor,
379 	thread_t                       thread,
380 	integer_t                      options);
381 
382 static void
383 sched_realtime_timebase_init(void);
384 
385 static void
386 sched_timer_deadline_tracking_init(void);
387 
388 #if     DEBUG
389 extern int debug_task;
390 #define TLOG(a, fmt, args...) if(debug_task & a) kprintf(fmt, ## args)
391 #else
392 #define TLOG(a, fmt, args...) do {} while (0)
393 #endif
394 
395 static processor_t
396 thread_bind_internal(
397 	thread_t                thread,
398 	processor_t             processor);
399 
400 static void
401 sched_vm_group_maintenance(void);
402 
403 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
404 int8_t          sched_load_shifts[NRQS];
405 bitmap_t        sched_preempt_pri[BITMAP_LEN(NRQS_MAX)];
406 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
407 
408 /*
409  * Statically allocate a buffer to hold the longest possible
410  * scheduler description string, as currently implemented.
411  * bsd/kern/kern_sysctl.c has a corresponding definition in bsd/
412  * to export to userspace via sysctl(3). If either version
413  * changes, update the other.
414  *
415  * Note that in addition to being an upper bound on the strings
416  * in the kernel, it's also an exact parameter to PE_get_default(),
417  * which interrogates the device tree on some platforms. That
418  * API requires the caller know the exact size of the device tree
419  * property, so we need both a legacy size (32) and the current size
420  * (48) to deal with old and new device trees. The device tree property
421  * is similarly padded to a fixed size so that the same kernel image
422  * can run on multiple devices with different schedulers configured
423  * in the device tree.
424  */
425 char sched_string[SCHED_STRING_MAX_LENGTH];
426 
427 uint32_t sched_debug_flags = SCHED_DEBUG_FLAG_CHOOSE_PROCESSOR_TRACEPOINTS;
428 
429 /* Global flag which indicates whether Background Stepper Context is enabled */
430 static int cpu_throttle_enabled = 1;
431 
432 #if DEVELOPMENT || DEBUG
433 int enable_task_set_cluster_type = 0;
434 bool system_ecore_only = false;
435 #endif /* DEVELOPMENT || DEBUG */
436 
/*
 * sched_init:
 *
 * Boot-time scheduler initialization.  Reads boot-args / device-tree
 * overrides, then initializes the active scheduler policy, the boot
 * pset (pset0), and the master processor, in that order.  The call
 * ordering here matters: SCHED(init) runs before the rt/pset/processor
 * hooks that depend on it.
 */
void
sched_init(void)
{
	boolean_t direct_handoff = FALSE;
	kprintf("Scheduler: Default of %s\n", SCHED(sched_name));

	/* Priority decay band limit: boot-arg wins, then device tree, then default. */
	if (!PE_parse_boot_argn("sched_pri_decay_limit", &sched_pri_decay_band_limit, sizeof(sched_pri_decay_band_limit))) {
		/* No boot-args, check in device tree */
		if (!PE_get_default("kern.sched_pri_decay_limit",
		    &sched_pri_decay_band_limit,
		    sizeof(sched_pri_decay_band_limit))) {
			/* Allow decay all the way to normal limits */
			sched_pri_decay_band_limit = DEFAULT_DECAY_BAND_LIMIT;
		}
	}

	kprintf("Setting scheduler priority decay band limit %d\n", sched_pri_decay_band_limit);

	if (PE_parse_boot_argn("sched_debug", &sched_debug_flags, sizeof(sched_debug_flags))) {
		kprintf("Scheduler: Debug flags 0x%08x\n", sched_debug_flags);
	}
	/* Export the scheduler name (see sched_string's comment above). */
	strlcpy(sched_string, SCHED(sched_name), sizeof(sched_string));

#if __arm64__
	clock_interval_to_absolutetime_interval(expecting_ipi_wfe_timeout_usec, NSEC_PER_USEC, &expecting_ipi_wfe_timeout_mt);
#endif /* __arm64__ */

	/* Policy init must precede the rt/pset/processor hooks below. */
	SCHED(init)();
	SCHED(rt_init)(&pset0);
	sched_timer_deadline_tracking_init();

	SCHED(pset_init)(&pset0);
	SCHED(processor_init)(master_processor);

	if (PE_parse_boot_argn("direct_handoff", &direct_handoff, sizeof(direct_handoff))) {
		allow_direct_handoff = direct_handoff;
	}

#if DEVELOPMENT || DEBUG
	/* enable_skstsct == 2 additionally restricts the system to E-cores only. */
	if (PE_parse_boot_argn("enable_skstsct", &enable_task_set_cluster_type, sizeof(enable_task_set_cluster_type))) {
		system_ecore_only = (enable_task_set_cluster_type == 2);
	}
#endif /* DEVELOPMENT || DEBUG */

	simple_lock_init(&sched_available_cores_lock, 0);
}
483 
484 void
sched_timebase_init(void)485 sched_timebase_init(void)
486 {
487 	uint64_t        abstime;
488 
489 	clock_interval_to_absolutetime_interval(1, NSEC_PER_SEC, &abstime);
490 	sched_one_second_interval = abstime;
491 
492 	SCHED(timebase_init)();
493 	sched_realtime_timebase_init();
494 }
495 
496 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
497 
498 void
sched_timeshare_init(void)499 sched_timeshare_init(void)
500 {
501 	/*
502 	 * Calculate the timeslicing quantum
503 	 * in us.
504 	 */
505 	if (default_preemption_rate < 1) {
506 		default_preemption_rate = DEFAULT_PREEMPTION_RATE;
507 	}
508 	std_quantum_us = (1000 * 1000) / default_preemption_rate;
509 
510 	printf("standard timeslicing quantum is %d us\n", std_quantum_us);
511 
512 	if (default_bg_preemption_rate < 1) {
513 		default_bg_preemption_rate = DEFAULT_BG_PREEMPTION_RATE;
514 	}
515 	bg_quantum_us = (1000 * 1000) / default_bg_preemption_rate;
516 
517 	printf("standard background quantum is %d us\n", bg_quantum_us);
518 
519 	load_shift_init();
520 	preempt_pri_init();
521 	sched_tick = 0;
522 }
523 
524 void
sched_set_max_unsafe_rt_quanta(int max)525 sched_set_max_unsafe_rt_quanta(int max)
526 {
527 	const uint32_t quantum_size = SCHED(initial_quantum_size)(THREAD_NULL);
528 
529 	max_unsafe_rt_computation = ((uint64_t)max) * quantum_size;
530 
531 	const int mult = safe_rt_multiplier <= 0 ? 2 : safe_rt_multiplier;
532 	sched_safe_rt_duration = mult * ((uint64_t)max) * quantum_size;
533 
534 
535 #if DEVELOPMENT || DEBUG
536 	max_unsafe_rt_quanta = max;
537 #else
538 	/*
539 	 * On RELEASE kernels, this is only called on boot where
540 	 * max is already equal to max_unsafe_rt_quanta.
541 	 */
542 	assert3s(max, ==, max_unsafe_rt_quanta);
543 #endif
544 }
545 
546 void
sched_set_max_unsafe_fixed_quanta(int max)547 sched_set_max_unsafe_fixed_quanta(int max)
548 {
549 	const uint32_t quantum_size = SCHED(initial_quantum_size)(THREAD_NULL);
550 
551 	max_unsafe_fixed_computation = ((uint64_t)max) * quantum_size;
552 
553 	const int mult = safe_fixed_multiplier <= 0 ? 2 : safe_fixed_multiplier;
554 	sched_safe_fixed_duration = mult * ((uint64_t)max) * quantum_size;
555 
556 #if DEVELOPMENT || DEBUG
557 	max_unsafe_fixed_quanta = max;
558 #else
559 	/*
560 	 * On RELEASE kernels, this is only called on boot where
561 	 * max is already equal to max_unsafe_fixed_quanta.
562 	 */
563 	assert3s(max, ==, max_unsafe_fixed_quanta);
564 #endif
565 }
566 
/*
 * sched_timeshare_timebase_init:
 *
 * Convert all timeshare scheduling intervals (quanta, tick, load-compute
 * period) into mach absolute time, derive the usage-to-priority shift,
 * and recompute the fail-safe durations.  Each converted interval must
 * fit in 32 bits and be non-zero (asserted below).
 */
void
sched_timeshare_timebase_init(void)
{
	uint64_t        abstime;
	uint32_t        shift;

	/* standard timeslicing quantum */
	clock_interval_to_absolutetime_interval(
		std_quantum_us, NSEC_PER_USEC, &abstime);
	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	std_quantum = (uint32_t)abstime;

	/* smallest remaining quantum (250 us) */
	clock_interval_to_absolutetime_interval(250, NSEC_PER_USEC, &abstime);
	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	min_std_quantum = (uint32_t)abstime;

	/* quantum for background tasks */
	clock_interval_to_absolutetime_interval(
		bg_quantum_us, NSEC_PER_USEC, &abstime);
	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	bg_quantum = (uint32_t)abstime;

	/* scheduler tick interval */
	clock_interval_to_absolutetime_interval(USEC_PER_SEC >> SCHED_TICK_SHIFT,
	    NSEC_PER_USEC, &abstime);
	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	sched_tick_interval = (uint32_t)abstime;

	/* timeshare load calculation interval & deadline initialization */
	clock_interval_to_absolutetime_interval(sched_load_compute_interval_us, NSEC_PER_USEC, &sched_load_compute_interval_abs);
	os_atomic_init(&sched_load_compute_deadline, sched_load_compute_interval_abs);

	/*
	 * Compute conversion factor from usage to
	 * timesharing priorities with 5/8 ** n aging.
	 *
	 * NOTE: 'abstime' still holds the sched-tick interval from the
	 * conversion above (the load-compute conversion wrote elsewhere).
	 * NOTE(review): the comment says 5/8 aging but the factor applied
	 * is 5/3 — longstanding upstream behavior; confirm before changing.
	 */
	abstime = (abstime * 5) / 3;
	for (shift = 0; abstime > BASEPRI_DEFAULT; ++shift) {
		abstime >>= 1;
	}
	sched_fixed_shift = shift;

	/* Disable per-bucket pri shifts until load calculation sets them. */
	for (uint32_t i = 0; i < TH_BUCKET_MAX; i++) {
		sched_pri_shifts[i] = INT8_MAX;
	}

	/* Fail-safe durations depend on the (re)computed quantum size. */
	sched_set_max_unsafe_rt_quanta(max_unsafe_rt_quanta);
	sched_set_max_unsafe_fixed_quanta(max_unsafe_fixed_quanta);

	max_poll_computation = ((uint64_t)max_poll_quanta) * std_quantum;
	thread_depress_time = 1 * std_quantum;
	default_timeshare_computation = std_quantum / 2;
	default_timeshare_constraint = std_quantum;

#if __arm64__
	perfcontrol_failsafe_starvation_threshold = (2 * sched_tick_interval);
#endif /* __arm64__ */

	/* Optional timer to bound non-urgent preemption latency (boot-arg). */
	if (nonurgent_preemption_timer_us) {
		clock_interval_to_absolutetime_interval(nonurgent_preemption_timer_us, NSEC_PER_USEC, &abstime);
		nonurgent_preemption_timer_abs = abstime;
	}
}
631 
632 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
633 
634 void
pset_rt_init(processor_set_t pset)635 pset_rt_init(processor_set_t pset)
636 {
637 	for (int pri = BASEPRI_RTQUEUES; pri <= MAXPRI; pri++) {
638 		int i = pri - BASEPRI_RTQUEUES;
639 		rt_queue_pri_t *rqi = &pset->rt_runq.rt_queue_pri[i];
640 		queue_init(&rqi->pri_queue);
641 		rqi->pri_count = 0;
642 		rqi->pri_earliest_deadline = RT_DEADLINE_NONE;
643 		rqi->pri_constraint = RT_CONSTRAINT_NONE;
644 	}
645 	os_atomic_init(&pset->rt_runq.count, 0);
646 	os_atomic_init(&pset->rt_runq.earliest_deadline, RT_DEADLINE_NONE);
647 	os_atomic_init(&pset->rt_runq.constraint, RT_CONSTRAINT_NONE);
648 	os_atomic_init(&pset->rt_runq.ed_index, NOPRI);
649 	memset(&pset->rt_runq.runq_stats, 0, sizeof pset->rt_runq.runq_stats);
650 }
651 
/* Epsilon for comparing RT deadlines, in microseconds. */
int rt_deadline_epsilon_us = 100;

/*
 * sched_get_rt_deadline_epsilon:
 *
 * Return the currently configured RT deadline comparison epsilon (us).
 */
int
sched_get_rt_deadline_epsilon(void)
{
	return rt_deadline_epsilon_us;
}
660 
661 void
sched_set_rt_deadline_epsilon(int new_epsilon_us)662 sched_set_rt_deadline_epsilon(int new_epsilon_us)
663 {
664 	rt_deadline_epsilon_us = new_epsilon_us;
665 
666 	uint64_t abstime;
667 	clock_interval_to_absolutetime_interval(rt_deadline_epsilon_us, NSEC_PER_USEC, &abstime);
668 	assert((abstime >> 32) == 0 && ((rt_deadline_epsilon_us == 0) || (uint32_t)abstime != 0));
669 	rt_deadline_epsilon = (uint32_t)abstime;
670 }
671 
672 static void
sched_realtime_timebase_init(void)673 sched_realtime_timebase_init(void)
674 {
675 	uint64_t abstime;
676 
677 	/* smallest rt computation (50 us) */
678 	clock_interval_to_absolutetime_interval(50, NSEC_PER_USEC, &abstime);
679 	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
680 	min_rt_quantum = (uint32_t)abstime;
681 
682 	/* maximum rt computation (50 ms) */
683 	clock_interval_to_absolutetime_interval(
684 		50, 1000 * NSEC_PER_USEC, &abstime);
685 	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
686 	max_rt_quantum = (uint32_t)abstime;
687 
688 	/* constraint threshold for sending backup IPIs (4 ms) */
689 	clock_interval_to_absolutetime_interval(4, NSEC_PER_MSEC, &abstime);
690 	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
691 	rt_constraint_threshold = (uint32_t)abstime;
692 
693 	/* epsilon for comparing deadlines */
694 	sched_set_rt_deadline_epsilon(rt_deadline_epsilon_us);
695 }
696 
697 void
sched_check_spill(processor_set_t pset,thread_t thread)698 sched_check_spill(processor_set_t pset, thread_t thread)
699 {
700 	(void)pset;
701 	(void)thread;
702 
703 	return;
704 }
705 
706 bool
sched_thread_should_yield(processor_t processor,thread_t thread)707 sched_thread_should_yield(processor_t processor, thread_t thread)
708 {
709 	(void)thread;
710 
711 	return !SCHED(processor_queue_empty)(processor) || rt_runq_count(processor->processor_set) > 0;
712 }
713 
714 /* Default implementations of .steal_thread_enabled */
715 bool
sched_steal_thread_DISABLED(processor_set_t pset)716 sched_steal_thread_DISABLED(processor_set_t pset)
717 {
718 	(void)pset;
719 	return false;
720 }
721 
722 bool
sched_steal_thread_enabled(processor_set_t pset)723 sched_steal_thread_enabled(processor_set_t pset)
724 {
725 	return bit_count(pset->node->pset_map) > 1;
726 }
727 
728 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
729 
/*
 * Set up values for timeshare
 * loading factors.
 *
 * load_shift_init builds sched_load_shifts[], indexed by system load:
 * sched_load_shifts[load] is the shift 'k' such that a thread's priority
 * penalty per quantum of CPU usage is ~2^k levels at that load.  The
 * sched_decay_penalty boot-arg widens or narrows the bands of loads
 * sharing the same shift; zero disables usage decay entirely.
 */
static void
load_shift_init(void)
{
	int8_t          k, *p = sched_load_shifts;
	uint32_t        i, j;

	uint32_t        sched_decay_penalty = 1;

	if (PE_parse_boot_argn("sched_decay_penalty", &sched_decay_penalty, sizeof(sched_decay_penalty))) {
		kprintf("Overriding scheduler decay penalty %u\n", sched_decay_penalty);
	}

	if (PE_parse_boot_argn("sched_decay_usage_age_factor", &sched_decay_usage_age_factor, sizeof(sched_decay_usage_age_factor))) {
		kprintf("Overriding scheduler decay usage age factor %u\n", sched_decay_usage_age_factor);
	}

	if (sched_decay_penalty == 0) {
		/*
		 * There is no penalty for timeshare threads for using too much
		 * CPU, so set all load shifts to INT8_MIN. Even under high load,
		 * sched_pri_shift will be >INT8_MAX, and there will be no
		 * penalty applied to threads (nor will sched_usage be updated per
		 * thread).
		 */
		for (i = 0; i < NRQS; i++) {
			sched_load_shifts[i] = INT8_MIN;
		}

		return;
	}

	/* Loads 0 and 1: no penalty (INT8_MIN) and shift 0 respectively. */
	*p++ = INT8_MIN; *p++ = 0;

	/*
	 * For a given system load "i", the per-thread priority
	 * penalty per quantum of CPU usage is ~2^k priority
	 * levels. "sched_decay_penalty" can cause more
	 * array entries to be filled with smaller "k" values
	 */
	/*
	 * Outer loop advances the shift k; inner loop fills all loads i up
	 * to the next power-of-two boundary j (scaled by the penalty) with
	 * that shift.  i, j, k are shared across both loops by design.
	 */
	for (i = 2, j = 1 << sched_decay_penalty, k = 1; i < NRQS; ++k) {
		for (j <<= 1; (i < j) && (i < NRQS); ++i) {
			*p++ = k;
		}
	}
}
779 
780 static void
preempt_pri_init(void)781 preempt_pri_init(void)
782 {
783 	bitmap_t *p = sched_preempt_pri;
784 
785 	for (int i = BASEPRI_FOREGROUND; i < MINPRI_KERNEL; ++i) {
786 		bitmap_set(p, i);
787 	}
788 
789 	for (int i = BASEPRI_PREEMPT; i <= MAXPRI; ++i) {
790 		bitmap_set(p, i);
791 	}
792 }
793 
794 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
795 
796 void
check_monotonic_time(uint64_t ctime)797 check_monotonic_time(uint64_t ctime)
798 {
799 	processor_t processor = current_processor();
800 	uint64_t last_dispatch = processor->last_dispatch;
801 
802 	if (last_dispatch > ctime) {
803 		panic("Non-monotonic time: last_dispatch at 0x%llx, ctime 0x%llx",
804 		    last_dispatch, ctime);
805 	}
806 }
807 
808 
/*
 *	Thread wait timer expiration.
 *	Runs in timer interrupt context with interrupts disabled.
 *
 *	p0 is the thread whose wait timed out; p1 is unused.  Wakes the
 *	thread with THREAD_TIMED_OUT if the timer is still armed, and
 *	balances the wait_timer_active count for this expired callout.
 */
void
thread_timer_expire(void *p0, __unused void *p1)
{
	thread_t thread = (thread_t)p0;

	assert_thread_magic(thread);

	assert(ml_get_interrupts_enabled() == FALSE);

	thread_lock(thread);

	/* Only act if a cancellation path hasn't already disarmed the timer. */
	if (thread->wait_timer_armed) {
		thread->wait_timer_armed = false;
		clear_wait_internal(thread, THREAD_TIMED_OUT);
		/* clear_wait_internal may have dropped and retaken the thread lock */
	}

	/* This callout has fired: one fewer outstanding timer for the thread. */
	thread->wait_timer_active--;

	thread_unlock(thread);
}
834 
835 /*
836  *	thread_unblock:
837  *
838  *	Unblock thread on wake up.
839  *
840  *	Returns TRUE if the thread should now be placed on the runqueue.
841  *
842  *	Thread must be locked.
843  *
844  *	Called at splsched().
845  */
846 boolean_t
thread_unblock(thread_t thread,wait_result_t wresult)847 thread_unblock(
848 	thread_t                thread,
849 	wait_result_t   wresult)
850 {
851 	boolean_t               ready_for_runq = FALSE;
852 	thread_t                cthread = current_thread();
853 	uint32_t                new_run_count;
854 	int                             old_thread_state;
855 
856 	/*
857 	 *	Set wait_result.
858 	 */
859 	thread->wait_result = wresult;
860 
861 	/*
862 	 *	Cancel pending wait timer.
863 	 */
864 	if (thread->wait_timer_armed) {
865 		if (timer_call_cancel(thread->wait_timer)) {
866 			thread->wait_timer_active--;
867 		}
868 		thread->wait_timer_armed = false;
869 	}
870 
871 	boolean_t aticontext, pidle;
872 	ml_get_power_state(&aticontext, &pidle);
873 
874 	/*
875 	 *	Update scheduling state: not waiting,
876 	 *	set running.
877 	 */
878 	old_thread_state = thread->state;
879 	thread->state = (old_thread_state | TH_RUN) &
880 	    ~(TH_WAIT | TH_UNINT | TH_WAIT_REPORT | TH_WAKING);
881 
882 	if ((old_thread_state & TH_RUN) == 0) {
883 		uint64_t ctime = mach_approximate_time();
884 
885 		check_monotonic_time(ctime);
886 
887 		thread->last_made_runnable_time = thread->last_basepri_change_time = ctime;
888 		timer_start(&thread->runnable_timer, ctime);
889 
890 		ready_for_runq = TRUE;
891 
892 		if (old_thread_state & TH_WAIT_REPORT) {
893 			(*thread->sched_call)(SCHED_CALL_UNBLOCK, thread);
894 		}
895 
896 		/* Update the runnable thread count */
897 		new_run_count = SCHED(run_count_incr)(thread);
898 
899 #if CONFIG_SCHED_AUTO_JOIN
900 		if (aticontext == FALSE && work_interval_should_propagate(cthread, thread)) {
901 			work_interval_auto_join_propagate(cthread, thread);
902 		}
903 #endif /*CONFIG_SCHED_AUTO_JOIN */
904 
905 	} else {
906 		/*
907 		 * Either the thread is idling in place on another processor,
908 		 * or it hasn't finished context switching yet.
909 		 */
910 		assert((thread->state & TH_IDLE) == 0);
911 		/*
912 		 * The run count is only dropped after the context switch completes
913 		 * and the thread is still waiting, so we should not run_incr here
914 		 */
915 		new_run_count = os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
916 	}
917 
918 	/*
919 	 * Calculate deadline for real-time threads.
920 	 */
921 	if (thread->sched_mode == TH_MODE_REALTIME) {
922 		uint64_t ctime = mach_absolute_time();
923 		thread->realtime.deadline = thread->realtime.constraint + ctime;
924 		KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SET_RT_DEADLINE) | DBG_FUNC_NONE,
925 		    (uintptr_t)thread_tid(thread), thread->realtime.deadline, thread->realtime.computation, 0);
926 	}
927 
928 	/*
929 	 * Clear old quantum, fail-safe computation, etc.
930 	 */
931 	thread->quantum_remaining = 0;
932 	thread->computation_metered = 0;
933 	thread->reason = AST_NONE;
934 	thread->block_hint = kThreadWaitNone;
935 
936 	/* Obtain power-relevant interrupt and "platform-idle exit" statistics.
937 	 * We also account for "double hop" thread signaling via
938 	 * the thread callout infrastructure.
939 	 * DRK: consider removing the callout wakeup counters in the future
940 	 * they're present for verification at the moment.
941 	 */
942 
943 	if (__improbable(aticontext && !(thread_get_tag_internal(thread) & THREAD_TAG_CALLOUT))) {
944 		DTRACE_SCHED2(iwakeup, struct thread *, thread, struct proc *, current_proc());
945 
946 		uint64_t ttd = current_processor()->timer_call_ttd;
947 
948 		if (ttd) {
949 			if (ttd <= timer_deadline_tracking_bin_1) {
950 				thread->thread_timer_wakeups_bin_1++;
951 			} else if (ttd <= timer_deadline_tracking_bin_2) {
952 				thread->thread_timer_wakeups_bin_2++;
953 			}
954 		}
955 
956 		ledger_credit_thread(thread, thread->t_ledger,
957 		    task_ledgers.interrupt_wakeups, 1);
958 		if (pidle) {
959 			ledger_credit_thread(thread, thread->t_ledger,
960 			    task_ledgers.platform_idle_wakeups, 1);
961 		}
962 	} else if (thread_get_tag_internal(cthread) & THREAD_TAG_CALLOUT) {
963 		/* TODO: what about an interrupt that does a wake taken on a callout thread? */
964 		if (cthread->callout_woken_from_icontext) {
965 			ledger_credit_thread(thread, thread->t_ledger,
966 			    task_ledgers.interrupt_wakeups, 1);
967 			thread->thread_callout_interrupt_wakeups++;
968 
969 			if (cthread->callout_woken_from_platform_idle) {
970 				ledger_credit_thread(thread, thread->t_ledger,
971 				    task_ledgers.platform_idle_wakeups, 1);
972 				thread->thread_callout_platform_idle_wakeups++;
973 			}
974 
975 			cthread->callout_woke_thread = TRUE;
976 		}
977 	}
978 
979 	if (thread_get_tag_internal(thread) & THREAD_TAG_CALLOUT) {
980 		thread->callout_woken_from_icontext = !!aticontext;
981 		thread->callout_woken_from_platform_idle = !!pidle;
982 		thread->callout_woke_thread = FALSE;
983 	}
984 
985 #if KPERF
986 	if (ready_for_runq) {
987 		kperf_make_runnable(thread, aticontext);
988 	}
989 #endif /* KPERF */
990 
991 	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
992 	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_MAKE_RUNNABLE) | DBG_FUNC_NONE,
993 	    (uintptr_t)thread_tid(thread), thread->sched_pri, thread->wait_result,
994 	    sched_run_buckets[TH_BUCKET_RUN], 0);
995 
996 	DTRACE_SCHED2(wakeup, struct thread *, thread, struct proc *, current_proc());
997 
998 	return ready_for_runq;
999 }
1000 
1001 /*
1002  *	Routine:	thread_allowed_for_handoff
1003  *	Purpose:
1004  *		Check if the thread is allowed for handoff operation
1005  *	Conditions:
1006  *		thread lock held, IPC locks may be held.
1007  *	TODO: In future, do not allow handoff if threads have different cluster
1008  *	recommendations.
1009  */
1010 boolean_t
thread_allowed_for_handoff(thread_t thread)1011 thread_allowed_for_handoff(
1012 	thread_t         thread)
1013 {
1014 	thread_t self = current_thread();
1015 
1016 	if (allow_direct_handoff &&
1017 	    thread->sched_mode == TH_MODE_REALTIME &&
1018 	    self->sched_mode == TH_MODE_REALTIME) {
1019 		return TRUE;
1020 	}
1021 
1022 	return FALSE;
1023 }
1024 
1025 /*
1026  *	Routine:	thread_go
1027  *	Purpose:
1028  *		Unblock and dispatch thread.
1029  *	Conditions:
1030  *		thread lock held, IPC locks may be held.
1031  *		thread must have been waiting
1032  */
void
thread_go(
	thread_t                thread,
	wait_result_t           wresult,
	bool                    try_handoff)
{
	thread_t self = current_thread();

	assert_thread_magic(thread);

	/*
	 * The caller must already have detached the thread from its wait
	 * queue and event before calling thread_go().
	 */
	assert(thread->at_safe_point == FALSE);
	assert(thread->wait_event == NO_EVENT64);
	assert(waitq_is_null(thread->waitq));

	assert(!(thread->state & (TH_TERMINATE | TH_TERMINATE2)));
	assert(thread->state & TH_WAIT);

	if (thread->started) {
		/* A started thread being go'ed must be mid-wakeup (TH_WAKING). */
		assert(thread->state & TH_WAKING);
	}

	thread_lock_assert(thread, LCK_ASSERT_OWNED);

	assert(ml_get_interrupts_enabled() == false);

	/* thread_unblock() reports whether the thread should go on a run queue. */
	if (thread_unblock(thread, wresult)) {
#if SCHED_TRACE_THREAD_WAKEUPS
		backtrace(&thread->thread_wakeup_bt[0],
		    (sizeof(thread->thread_wakeup_bt) / sizeof(uintptr_t)), NULL,
		    NULL);
#endif /* SCHED_TRACE_THREAD_WAKEUPS */
		if (try_handoff && thread_allowed_for_handoff(thread)) {
			/*
			 * Direct handoff: take a reference and stash the wakee
			 * on the current thread instead of enqueueing it; the
			 * next block/switch will run it directly.
			 */
			thread_reference(thread);
			assert(self->handoff_thread == NULL);
			self->handoff_thread = thread;
		} else {
			/* Normal path: enqueue and preempt if warranted. */
			thread_setrun(thread, SCHED_PREEMPT | SCHED_TAILQ);
		}
	}
}
1073 
1074 /*
1075  *	Routine:	thread_mark_wait_locked
1076  *	Purpose:
1077  *		Mark a thread as waiting.  If, given the circumstances,
1078  *		it doesn't want to wait (i.e. already aborted), then
1079  *		indicate that in the return value.
1080  *	Conditions:
1081  *		at splsched() and thread is locked.
1082  */
__private_extern__
wait_result_t
thread_mark_wait_locked(
	thread_t                        thread,
	wait_interrupt_t        interruptible_orig)
{
	boolean_t                       at_safe_point;
	wait_interrupt_t        interruptible = interruptible_orig;

	if (thread->state & TH_IDLE) {
		panic("Invalid attempt to wait while running the idle thread");
	}

	/* The thread must not already be waiting, waking, or otherwise in transition. */
	assert(!(thread->state & (TH_WAIT | TH_WAKING | TH_IDLE | TH_UNINT | TH_TERMINATE2 | TH_WAIT_REPORT)));

	/*
	 *	The thread may have certain types of interrupts/aborts masked
	 *	off.  Even if the wait location says these types of interrupts
	 *	are OK, we have to honor mask settings (outer-scoped code may
	 *	not be able to handle aborts at the moment).
	 */
	interruptible &= TH_OPT_INTMASK;
	if (interruptible > (thread->options & TH_OPT_INTMASK)) {
		interruptible = thread->options & TH_OPT_INTMASK;
	}

	at_safe_point = (interruptible == THREAD_ABORTSAFE);

	/* Commit to waiting unless an abort is pending and deliverable here. */
	if (interruptible == THREAD_UNINT ||
	    !(thread->sched_flags & TH_SFLAG_ABORT) ||
	    (!at_safe_point &&
	    (thread->sched_flags & TH_SFLAG_ABORTSAFELY))) {
		if (!(thread->state & TH_TERMINATE)) {
			DTRACE_SCHED(sleep);
		}

		int state_bits = TH_WAIT;
		if (!interruptible) {
			state_bits |= TH_UNINT;
		}
		if (thread->sched_call) {
			/* Honor the caller's request to suppress the sched_call wait report. */
			wait_interrupt_t mask = THREAD_WAIT_NOREPORT_USER;
			if (is_kerneltask(get_threadtask(thread))) {
				mask = THREAD_WAIT_NOREPORT_KERNEL;
			}
			if ((interruptible_orig & mask) == 0) {
				state_bits |= TH_WAIT_REPORT;
			}
		}
		thread->state |= state_bits;
		thread->at_safe_point = at_safe_point;

		/* TODO: pass this through assert_wait instead, have
		 * assert_wait just take a struct as an argument */
		assert(!thread->block_hint);
		thread->block_hint = thread->pending_block_hint;
		thread->pending_block_hint = kThreadWaitNone;

		return thread->wait_result = THREAD_WAITING;
	} else {
		/* Abort pending and deliverable: consume the safely-abort flag and refuse. */
		if (thread->sched_flags & TH_SFLAG_ABORTSAFELY) {
			thread->sched_flags &= ~TH_SFLAG_ABORTED_MASK;
		}
	}
	thread->pending_block_hint = kThreadWaitNone;

	return thread->wait_result = THREAD_INTERRUPTED;
}
1151 
1152 /*
1153  *	Routine:	thread_interrupt_level
1154  *	Purpose:
1155  *	        Set the maximum interruptible state for the
1156  *		current thread.  The effective value of any
1157  *		interruptible flag passed into assert_wait
1158  *		will never exceed this.
1159  *
1160  *		Useful for code that must not be interrupted,
1161  *		but which calls code that doesn't know that.
1162  *	Returns:
1163  *		The old interrupt level for the thread.
1164  */
1165 __private_extern__
1166 wait_interrupt_t
thread_interrupt_level(wait_interrupt_t new_level)1167 thread_interrupt_level(
1168 	wait_interrupt_t new_level)
1169 {
1170 	thread_t thread = current_thread();
1171 	wait_interrupt_t result = thread->options & TH_OPT_INTMASK;
1172 
1173 	thread->options = (thread->options & ~TH_OPT_INTMASK) | (new_level & TH_OPT_INTMASK);
1174 
1175 	return result;
1176 }
1177 
1178 /*
1179  *	assert_wait:
1180  *
1181  *	Assert that the current thread is about to go to
1182  *	sleep until the specified event occurs.
1183  */
1184 wait_result_t
assert_wait(event_t event,wait_interrupt_t interruptible)1185 assert_wait(
1186 	event_t                         event,
1187 	wait_interrupt_t        interruptible)
1188 {
1189 	if (__improbable(event == NO_EVENT)) {
1190 		panic("%s() called with NO_EVENT", __func__);
1191 	}
1192 
1193 	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1194 	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT) | DBG_FUNC_NONE,
1195 	    VM_KERNEL_UNSLIDE_OR_PERM(event), 0, 0, 0, 0);
1196 
1197 	struct waitq *waitq;
1198 	waitq = global_eventq(event);
1199 	return waitq_assert_wait64(waitq, CAST_EVENT64_T(event), interruptible, TIMEOUT_WAIT_FOREVER);
1200 }
1201 
1202 /*
1203  *	assert_wait_queue:
1204  *
1205  *	Return the global waitq for the specified event
1206  */
1207 struct waitq *
assert_wait_queue(event_t event)1208 assert_wait_queue(
1209 	event_t                         event)
1210 {
1211 	return global_eventq(event);
1212 }
1213 
1214 wait_result_t
assert_wait_timeout(event_t event,wait_interrupt_t interruptible,uint32_t interval,uint32_t scale_factor)1215 assert_wait_timeout(
1216 	event_t                         event,
1217 	wait_interrupt_t        interruptible,
1218 	uint32_t                        interval,
1219 	uint32_t                        scale_factor)
1220 {
1221 	thread_t                        thread = current_thread();
1222 	wait_result_t           wresult;
1223 	uint64_t                        deadline;
1224 	spl_t                           s;
1225 
1226 	if (__improbable(event == NO_EVENT)) {
1227 		panic("%s() called with NO_EVENT", __func__);
1228 	}
1229 
1230 	struct waitq *waitq;
1231 	waitq = global_eventq(event);
1232 
1233 	s = splsched();
1234 	waitq_lock(waitq);
1235 
1236 	clock_interval_to_deadline(interval, scale_factor, &deadline);
1237 
1238 	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1239 	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT) | DBG_FUNC_NONE,
1240 	    VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);
1241 
1242 	wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
1243 	    interruptible,
1244 	    TIMEOUT_URGENCY_SYS_NORMAL,
1245 	    deadline, TIMEOUT_NO_LEEWAY,
1246 	    thread);
1247 
1248 	waitq_unlock(waitq);
1249 	splx(s);
1250 	return wresult;
1251 }
1252 
1253 wait_result_t
assert_wait_timeout_with_leeway(event_t event,wait_interrupt_t interruptible,wait_timeout_urgency_t urgency,uint32_t interval,uint32_t leeway,uint32_t scale_factor)1254 assert_wait_timeout_with_leeway(
1255 	event_t                         event,
1256 	wait_interrupt_t        interruptible,
1257 	wait_timeout_urgency_t  urgency,
1258 	uint32_t                        interval,
1259 	uint32_t                        leeway,
1260 	uint32_t                        scale_factor)
1261 {
1262 	thread_t                        thread = current_thread();
1263 	wait_result_t           wresult;
1264 	uint64_t                        deadline;
1265 	uint64_t                        abstime;
1266 	uint64_t                        slop;
1267 	uint64_t                        now;
1268 	spl_t                           s;
1269 
1270 	if (__improbable(event == NO_EVENT)) {
1271 		panic("%s() called with NO_EVENT", __func__);
1272 	}
1273 
1274 	now = mach_absolute_time();
1275 	clock_interval_to_absolutetime_interval(interval, scale_factor, &abstime);
1276 	deadline = now + abstime;
1277 
1278 	clock_interval_to_absolutetime_interval(leeway, scale_factor, &slop);
1279 
1280 	struct waitq *waitq;
1281 	waitq = global_eventq(event);
1282 
1283 	s = splsched();
1284 	waitq_lock(waitq);
1285 
1286 	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1287 	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT) | DBG_FUNC_NONE,
1288 	    VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);
1289 
1290 	wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
1291 	    interruptible,
1292 	    urgency, deadline, slop,
1293 	    thread);
1294 
1295 	waitq_unlock(waitq);
1296 	splx(s);
1297 	return wresult;
1298 }
1299 
1300 wait_result_t
assert_wait_deadline(event_t event,wait_interrupt_t interruptible,uint64_t deadline)1301 assert_wait_deadline(
1302 	event_t                         event,
1303 	wait_interrupt_t        interruptible,
1304 	uint64_t                        deadline)
1305 {
1306 	thread_t                        thread = current_thread();
1307 	wait_result_t           wresult;
1308 	spl_t                           s;
1309 
1310 	if (__improbable(event == NO_EVENT)) {
1311 		panic("%s() called with NO_EVENT", __func__);
1312 	}
1313 
1314 	struct waitq *waitq;
1315 	waitq = global_eventq(event);
1316 
1317 	s = splsched();
1318 	waitq_lock(waitq);
1319 
1320 	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1321 	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT) | DBG_FUNC_NONE,
1322 	    VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);
1323 
1324 	wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
1325 	    interruptible,
1326 	    TIMEOUT_URGENCY_SYS_NORMAL, deadline,
1327 	    TIMEOUT_NO_LEEWAY, thread);
1328 	waitq_unlock(waitq);
1329 	splx(s);
1330 	return wresult;
1331 }
1332 
1333 wait_result_t
assert_wait_deadline_with_leeway(event_t event,wait_interrupt_t interruptible,wait_timeout_urgency_t urgency,uint64_t deadline,uint64_t leeway)1334 assert_wait_deadline_with_leeway(
1335 	event_t                         event,
1336 	wait_interrupt_t        interruptible,
1337 	wait_timeout_urgency_t  urgency,
1338 	uint64_t                        deadline,
1339 	uint64_t                        leeway)
1340 {
1341 	thread_t                        thread = current_thread();
1342 	wait_result_t           wresult;
1343 	spl_t                           s;
1344 
1345 	if (__improbable(event == NO_EVENT)) {
1346 		panic("%s() called with NO_EVENT", __func__);
1347 	}
1348 
1349 	struct waitq *waitq;
1350 	waitq = global_eventq(event);
1351 
1352 	s = splsched();
1353 	waitq_lock(waitq);
1354 
1355 	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1356 	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT) | DBG_FUNC_NONE,
1357 	    VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);
1358 
1359 	wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
1360 	    interruptible,
1361 	    urgency, deadline, leeway,
1362 	    thread);
1363 	waitq_unlock(waitq);
1364 	splx(s);
1365 	return wresult;
1366 }
1367 
/*
 * Initialize a scheduling condition variable to its idle, no-wakeup state.
 */
void
sched_cond_init(
	sched_cond_atomic_t *cond)
{
	os_atomic_init(cond, SCHED_COND_INIT);
}
1374 
/*
 * Assert a wait on the condition and block, unless a wakeup raced in.
 *
 * Returns THREAD_AWAKENED immediately (without blocking) when a wakeup
 * was already pending; otherwise blocks via thread_block_parameter()
 * with the given continuation/parameter and returns its wait result.
 */
wait_result_t
sched_cond_wait_parameter(
	sched_cond_atomic_t *cond,
	wait_interrupt_t interruptible,
	thread_continue_t continuation,
	void *parameter)
{
	assert_wait((event_t) cond, interruptible);
	/* clear active bit to indicate future wakeups will have to unblock this thread */
	sched_cond_t new_state = (sched_cond_t) os_atomic_andnot(cond, SCHED_COND_ACTIVE, relaxed);
	if (__improbable(new_state & SCHED_COND_WAKEUP)) {
		/* a wakeup has been issued; undo wait assertion, ack the wakeup, and return */
		thread_t thread = current_thread();
		clear_wait(thread, THREAD_AWAKENED);
		sched_cond_ack(cond);
		return THREAD_AWAKENED;
	}
	return thread_block_parameter(continuation, parameter);
}
1394 
/*
 * Wait on a scheduling condition; convenience wrapper equivalent to
 * sched_cond_wait_parameter() with a NULL parameter.
 */
wait_result_t
sched_cond_wait(
	sched_cond_atomic_t *cond,
	wait_interrupt_t interruptible,
	thread_continue_t continuation)
{
	return sched_cond_wait_parameter(cond, interruptible, continuation, NULL);
}
1403 
/*
 * Acknowledge a pending wakeup: a single XOR atomically sets the ACTIVE
 * bit and clears the WAKEUP bit (valid provided ACTIVE is clear and
 * WAKEUP set on entry — the assert checks the former; presumably callers
 * guarantee the latter).  The acquire ordering pairs with the release in
 * sched_cond_signal().  Returns the updated condition value.
 */
sched_cond_t
sched_cond_ack(
	sched_cond_atomic_t *cond)
{
	sched_cond_t new_cond = (sched_cond_t) os_atomic_xor(cond, SCHED_COND_ACTIVE | SCHED_COND_WAKEUP, acquire);
	assert(new_cond & SCHED_COND_ACTIVE);
	return new_cond;
}
1412 
/*
 * Issue a wakeup on the condition, unblocking `thread` only when it has
 * gone inactive and no prior wakeup is still pending (so wakeups are
 * coalesced).  Always returns KERN_SUCCESS.
 */
kern_return_t
sched_cond_signal(
	sched_cond_atomic_t  *cond,
	thread_t thread)
{
	disable_preemption();
	/* Release ordering pairs with the acquire in sched_cond_ack(). */
	sched_cond_t old_cond = (sched_cond_t) os_atomic_or_orig(cond, SCHED_COND_WAKEUP, release);
	if (!(old_cond & (SCHED_COND_WAKEUP | SCHED_COND_ACTIVE))) {
		/* this was the first wakeup to be issued AND the thread was inactive */
		thread_wakeup_thread((event_t) cond, thread);
	}
	enable_preemption();
	return KERN_SUCCESS;
}
1427 
1428 /*
1429  * thread_isoncpu:
1430  *
1431  * Return TRUE if a thread is running on a processor such that an AST
1432  * is needed to pull it out of userspace execution, or if executing in
1433  * the kernel, bring to a context switch boundary that would cause
1434  * thread state to be serialized in the thread PCB.
1435  *
1436  * Thread locked, returns the same way. While locked, fields
1437  * like "state" cannot change. "runq" can change only from set to unset.
1438  */
static inline boolean_t
thread_isoncpu(thread_t thread)
{
	/* Not running or runnable */
	if (!(thread->state & TH_RUN)) {
		return FALSE;
	}

	/* Waiting on a runqueue, not currently running */
	/* TODO: This is invalid - it can get dequeued without thread lock, but not context switched. */
	if (thread_get_runq(thread) != PROCESSOR_NULL) {
		return FALSE;
	}

	/*
	 * Thread does not have a stack yet
	 * It could be on the stack alloc queue or preparing to be invoked
	 */
	if (!thread->kernel_stack) {
		return FALSE;
	}

	/*
	 * All three gates passed: runnable, not enqueued, owns a stack.
	 *
	 * Thread must be running on a processor, or
	 * about to run, or just did run. In all these
	 * cases, an AST to the processor is needed
	 * to guarantee that the thread is kicked out
	 * of userspace and the processor has
	 * context switched (and saved register state).
	 */
	return TRUE;
}
1471 
1472 /*
1473  * thread_stop:
1474  *
1475  * Force a preemption point for a thread and wait
1476  * for it to stop running on a CPU. If a stronger
1477  * guarantee is requested, wait until no longer
1478  * runnable. Arbitrates access among
1479  * multiple stop requests. (released by unstop)
1480  *
1481  * The thread must enter a wait state and stop via a
1482  * separate means.
1483  *
1484  * Returns FALSE if interrupted.
1485  */
boolean_t
thread_stop(
	thread_t                thread,
	boolean_t       until_not_runnable)
{
	wait_result_t   wresult;
	spl_t                   s = splsched();
	boolean_t               oncpu;

	wake_lock(thread);
	thread_lock(thread);

	/* Arbitrate with any other in-flight stop: wait for TH_SUSP to clear. */
	while (thread->state & TH_SUSP) {
		thread->wake_active = TRUE;
		thread_unlock(thread);

		wresult = assert_wait(&thread->wake_active, THREAD_ABORTSAFE);
		wake_unlock(thread);
		splx(s);

		if (wresult == THREAD_WAITING) {
			wresult = thread_block(THREAD_CONTINUE_NULL);
		}

		/* Interrupted before we owned the stop: just bail. */
		if (wresult != THREAD_AWAKENED) {
			return FALSE;
		}

		s = splsched();
		wake_lock(thread);
		thread_lock(thread);
	}

	/* We now own the stop request (released later by thread_unstop()). */
	thread->state |= TH_SUSP;

	/* Chase the thread off its CPU (and, if requested, out of TH_RUN). */
	while ((oncpu = thread_isoncpu(thread)) ||
	    (until_not_runnable && (thread->state & TH_RUN))) {
		processor_t             processor;

		if (oncpu) {
			assert(thread->state & TH_RUN);
			processor = thread->chosen_processor;
			/* Force the CPU to a context-switch boundary. */
			cause_ast_check(processor);
		}

		thread->wake_active = TRUE;
		thread_unlock(thread);

		wresult = assert_wait(&thread->wake_active, THREAD_ABORTSAFE);
		wake_unlock(thread);
		splx(s);

		if (wresult == THREAD_WAITING) {
			wresult = thread_block(THREAD_CONTINUE_NULL);
		}

		if (wresult != THREAD_AWAKENED) {
			/* Interrupted: release the stop request we took above. */
			thread_unstop(thread);
			return FALSE;
		}

		s = splsched();
		wake_lock(thread);
		thread_lock(thread);
	}

	thread_unlock(thread);
	wake_unlock(thread);
	splx(s);

	/*
	 * We return with the thread unlocked. To prevent it from
	 * transitioning to a runnable state (or from TH_RUN to
	 * being on the CPU), the caller must ensure the thread
	 * is stopped via an external means (such as an AST)
	 */

	return TRUE;
}
1565 
1566 /*
1567  * thread_unstop:
1568  *
1569  * Release a previous stop request and set
1570  * the thread running if appropriate.
1571  *
1572  * Use only after a successful stop operation.
1573  */
void
thread_unstop(
	thread_t        thread)
{
	spl_t           s = splsched();

	wake_lock(thread);
	thread_lock(thread);

	/* A bare TH_SUSP (neither running nor waiting) should be impossible here. */
	assert((thread->state & (TH_RUN | TH_WAIT | TH_SUSP)) != TH_SUSP);

	if (thread->state & TH_SUSP) {
		thread->state &= ~TH_SUSP;

		if (thread->wake_active) {
			/* Rouse waiters parked in thread_stop()/thread_wait(). */
			thread->wake_active = FALSE;
			thread_unlock(thread);

			thread_wakeup(&thread->wake_active);
			wake_unlock(thread);
			splx(s);

			return;
		}
	}

	thread_unlock(thread);
	wake_unlock(thread);
	splx(s);
}
1604 
1605 /*
1606  * thread_wait:
1607  *
1608  * Wait for a thread to stop running. (non-interruptible)
1609  *
1610  */
void
thread_wait(
	thread_t        thread,
	boolean_t       until_not_runnable)
{
	wait_result_t   wresult;
	boolean_t       oncpu;
	processor_t     processor;
	spl_t           s = splsched();

	wake_lock(thread);
	thread_lock(thread);

	/*
	 * Wait until not running on a CPU.  If stronger requirement
	 * desired, wait until not runnable.  Assumption: if thread is
	 * on CPU, then TH_RUN is set, so we're not waiting in any case
	 * where the original, pure "TH_RUN" check would have let us
	 * finish.
	 */
	while ((oncpu = thread_isoncpu(thread)) ||
	    (until_not_runnable && (thread->state & TH_RUN))) {
		if (oncpu) {
			assert(thread->state & TH_RUN);
			processor = thread->chosen_processor;
			/* Force the target CPU to a context-switch boundary. */
			cause_ast_check(processor);
		}

		/* Ask to be woken when the thread's run state changes. */
		thread->wake_active = TRUE;
		thread_unlock(thread);

		wresult = assert_wait(&thread->wake_active, THREAD_UNINT);
		wake_unlock(thread);
		splx(s);

		if (wresult == THREAD_WAITING) {
			thread_block(THREAD_CONTINUE_NULL);
		}

		s = splsched();
		wake_lock(thread);
		thread_lock(thread);
	}

	thread_unlock(thread);
	wake_unlock(thread);
	splx(s);
}
1659 
1660 /*
1661  *	Routine: clear_wait_internal
1662  *
1663  *		Clear the wait condition for the specified thread.
1664  *		Start the thread executing if that is appropriate.
1665  *	Arguments:
1666  *		thread		thread to awaken
1667  *		result		Wakeup result the thread should see
1668  *	Conditions:
1669  *		At splsched
1670  *		the thread is locked.
1671  *	Returns:
1672  *		KERN_SUCCESS		thread was rousted out a wait
1673  *		KERN_FAILURE		thread was waiting but could not be rousted
1674  *		KERN_NOT_WAITING	thread was not waiting
1675  */
__private_extern__ kern_return_t
clear_wait_internal(
	thread_t        thread,
	wait_result_t   wresult)
{
	waitq_t waitq = thread->waitq;

	/* An uninterruptible wait cannot be broken by an interrupt. */
	if (wresult == THREAD_INTERRUPTED && (thread->state & TH_UNINT)) {
		return KERN_FAILURE;
	}

	/*
	 * Check that the thread is waiting and not waking, as a waking thread
	 * has already cleared its waitq, and is destined to be go'ed, don't
	 * need to do it again.
	 */
	if ((thread->state & (TH_WAIT | TH_TERMINATE | TH_WAKING)) != TH_WAIT) {
		assert(waitq_is_null(thread->waitq));
		return KERN_NOT_WAITING;
	}

	/* may drop and retake the thread lock */
	if (!waitq_is_null(waitq) && !waitq_pull_thread_locked(waitq, thread)) {
		return KERN_NOT_WAITING;
	}

	/* The thread is ours to wake: unblock and dispatch it. */
	thread_go(thread, wresult, /* handoff */ false);

	return KERN_SUCCESS;
}
1706 
1707 
1708 /*
1709  *	clear_wait:
1710  *
1711  *	Clear the wait condition for the specified thread.  Start the thread
1712  *	executing if that is appropriate.
1713  *
1714  *	parameters:
1715  *	  thread		thread to awaken
1716  *	  result		Wakeup result the thread should see
1717  */
1718 kern_return_t
clear_wait(thread_t thread,wait_result_t result)1719 clear_wait(
1720 	thread_t                thread,
1721 	wait_result_t   result)
1722 {
1723 	kern_return_t ret;
1724 	spl_t           s;
1725 
1726 	s = splsched();
1727 	thread_lock(thread);
1728 
1729 	ret = clear_wait_internal(thread, result);
1730 
1731 	if (thread == current_thread()) {
1732 		/*
1733 		 * The thread must be ready to wait again immediately
1734 		 * after clearing its own wait.
1735 		 */
1736 		assert((thread->state & TH_WAKING) == 0);
1737 	}
1738 
1739 	thread_unlock(thread);
1740 	splx(s);
1741 	return ret;
1742 }
1743 
1744 
1745 /*
1746  *	thread_wakeup_prim:
1747  *
1748  *	Common routine for thread_wakeup, thread_wakeup_with_result,
1749  *	and thread_wakeup_one.
1750  *
1751  */
1752 kern_return_t
thread_wakeup_prim(event_t event,boolean_t one_thread,wait_result_t result)1753 thread_wakeup_prim(
1754 	event_t          event,
1755 	boolean_t        one_thread,
1756 	wait_result_t    result)
1757 {
1758 	if (__improbable(event == NO_EVENT)) {
1759 		panic("%s() called with NO_EVENT", __func__);
1760 	}
1761 
1762 	struct waitq *wq = global_eventq(event);
1763 
1764 	if (one_thread) {
1765 		return waitq_wakeup64_one(wq, CAST_EVENT64_T(event), result, WAITQ_WAKEUP_DEFAULT);
1766 	} else {
1767 		return waitq_wakeup64_all(wq, CAST_EVENT64_T(event), result, WAITQ_WAKEUP_DEFAULT);
1768 	}
1769 }
1770 
1771 /*
1772  * Wakeup a specified thread if and only if it's waiting for this event
1773  */
1774 kern_return_t
thread_wakeup_thread(event_t event,thread_t thread)1775 thread_wakeup_thread(
1776 	event_t         event,
1777 	thread_t        thread)
1778 {
1779 	if (__improbable(event == NO_EVENT)) {
1780 		panic("%s() called with NO_EVENT", __func__);
1781 	}
1782 
1783 	if (__improbable(thread == THREAD_NULL)) {
1784 		panic("%s() called with THREAD_NULL", __func__);
1785 	}
1786 
1787 	struct waitq *wq = global_eventq(event);
1788 
1789 	return waitq_wakeup64_thread(wq, CAST_EVENT64_T(event), thread, THREAD_AWAKENED);
1790 }
1791 
1792 /*
1793  * Wakeup a thread waiting on an event and promote it to a priority.
1794  *
1795  * Requires woken thread to un-promote itself when done.
1796  */
1797 kern_return_t
thread_wakeup_one_with_pri(event_t event,int priority)1798 thread_wakeup_one_with_pri(
1799 	event_t      event,
1800 	int          priority)
1801 {
1802 	if (__improbable(event == NO_EVENT)) {
1803 		panic("%s() called with NO_EVENT", __func__);
1804 	}
1805 
1806 	struct waitq *wq = global_eventq(event);
1807 
1808 	return waitq_wakeup64_one(wq, CAST_EVENT64_T(event), THREAD_AWAKENED, priority);
1809 }
1810 
1811 /*
1812  * Wakeup a thread waiting on an event,
1813  * promote it to a priority,
1814  * and return a reference to the woken thread.
1815  *
1816  * Requires woken thread to un-promote itself when done.
1817  */
1818 thread_t
thread_wakeup_identify(event_t event,int priority)1819 thread_wakeup_identify(event_t  event,
1820     int      priority)
1821 {
1822 	if (__improbable(event == NO_EVENT)) {
1823 		panic("%s() called with NO_EVENT", __func__);
1824 	}
1825 
1826 	struct waitq *wq = global_eventq(event);
1827 
1828 	return waitq_wakeup64_identify(wq, CAST_EVENT64_T(event), THREAD_AWAKENED, priority);
1829 }
1830 
1831 /*
1832  *	thread_bind:
1833  *
1834  *	Force the current thread to execute on the specified processor.
1835  *	Takes effect after the next thread_block().
1836  *
1837  *	Returns the previous binding.  PROCESSOR_NULL means
1838  *	not bound.
1839  *
1840  *	XXX - DO NOT export this to users - XXX
1841  */
1842 processor_t
thread_bind(processor_t processor)1843 thread_bind(
1844 	processor_t             processor)
1845 {
1846 	thread_t                self = current_thread();
1847 	processor_t             prev;
1848 	spl_t                   s;
1849 
1850 	s = splsched();
1851 	thread_lock(self);
1852 
1853 	prev = thread_bind_internal(self, processor);
1854 
1855 	thread_unlock(self);
1856 	splx(s);
1857 
1858 	return prev;
1859 }
1860 
/*
 * Bind a thread to `processor` while it is mid-wakeup.
 *
 * Conditions: interrupts disabled, thread lock held, and the thread must
 * have both TH_WAIT and TH_WAKING set.
 */
void
thread_bind_during_wakeup(thread_t thread, processor_t processor)
{
	assert(!ml_get_interrupts_enabled());
	assert((thread->state & (TH_WAIT | TH_WAKING)) == (TH_WAIT | TH_WAKING));
#if MACH_ASSERT
	thread_lock_assert(thread, LCK_ASSERT_OWNED);
#endif

	/* Avoid the rebind (and its tracepoint) when the binding already matches. */
	if (thread->bound_processor != processor) {
		thread_bind_internal(thread, processor);
	}
}
1874 
/*
 * Unbind a thread from a processor whose run queue has been shut down,
 * re-enqueueing it so the scheduler can place it elsewhere.
 *
 * Conditions: interrupts disabled; takes and releases the thread lock.
 */
void
thread_unbind_after_queue_shutdown(
	thread_t                thread,
	processor_t             processor __assert_only)
{
	assert(!ml_get_interrupts_enabled());

	thread_lock(thread);

	if (thread->bound_processor) {
		bool removed;

		/* The thread should only have been bound to the shut-down processor. */
		assert(thread->bound_processor == processor);

		removed = thread_run_queue_remove(thread);
		/*
		 * we can always unbind even if we didn't really remove the
		 * thread from the runqueue
		 */
		thread_bind_internal(thread, PROCESSOR_NULL);
		if (removed) {
			/* Put it back on a run queue now that it is unbound. */
			thread_run_queue_reinsert(thread, SCHED_TAILQ);
		}
	}

	thread_unlock(thread);
}
1902 
1903 /*
1904  * thread_bind_internal:
1905  *
1906  * If the specified thread is not the current thread, and it is currently
1907  * running on another CPU, a remote AST must be sent to that CPU to cause
1908  * the thread to migrate to its bound processor. Otherwise, the migration
1909  * will occur at the next quantum expiration or blocking point.
1910  *
 * When the thread is the current thread, an explicit thread_block() should
1912  * be used to force the current processor to context switch away and
1913  * let the thread migrate to the bound processor.
1914  *
1915  * Thread must be locked, and at splsched.
1916  */
1917 
static processor_t
thread_bind_internal(
	thread_t                thread,
	processor_t             processor)
{
	processor_t             prev;

	/* <rdar://problem/15102234> */
	assert(thread->sched_pri < BASEPRI_RTQUEUES);
	/* A thread can't be bound if it's sitting on a (potentially incorrect) runqueue */
	thread_assert_runq_null(thread);

	/* Trace the new binding; ~0 denotes unbinding (PROCESSOR_NULL) */
	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_THREAD_BIND),
	    thread_tid(thread), processor ? processor->cpu_id : ~0ul, 0, 0, 0);

	/* Return the previous binding so callers can restore it later */
	prev = thread->bound_processor;
	thread->bound_processor = processor;

	return prev;
}
1938 
1939 /*
1940  * thread_vm_bind_group_add:
1941  *
1942  * The "VM bind group" is a special mechanism to mark a collection
1943  * of threads from the VM subsystem that, in general, should be scheduled
1944  * with only one CPU of parallelism. To accomplish this, we initially
1945  * bind all the threads to the master processor, which has the effect
1946  * that only one of the threads in the group can execute at once, including
1947  * preempting threads in the group that are a lower priority. Future
1948  * mechanisms may use more dynamic mechanisms to prevent the collection
1949  * of VM threads from using more CPU time than desired.
1950  *
1951  * The current implementation can result in priority inversions where
1952  * compute-bound priority 95 or realtime threads that happen to have
1953  * landed on the master processor prevent the VM threads from running.
1954  * When this situation is detected, we unbind the threads for one
 * scheduler tick to allow the scheduler to run the threads on
 * additional CPUs, before restoring the binding (assuming high latency
1957  * is no longer a problem).
1958  */
1959 
1960 /*
1961  * The current max is provisioned for:
1962  * vm_compressor_swap_trigger_thread (92)
1963  * 2 x vm_pageout_iothread_internal (92) when vm_restricted_to_single_processor==TRUE
1964  * vm_pageout_continue (92)
1965  * memorystatus_thread (95)
1966  */
#define MAX_VM_BIND_GROUP_COUNT (5)
/* Protects the thread list and count below against concurrent registration */
decl_simple_lock_data(static, sched_vm_group_list_lock);
/* Threads registered via thread_vm_bind_group_add(); slots [0, count) are valid */
static thread_t sched_vm_group_thread_list[MAX_VM_BIND_GROUP_COUNT];
static int sched_vm_group_thread_count;
/* TRUE while the group's binding is suspended to recover from high latency */
static boolean_t sched_vm_group_temporarily_unbound = FALSE;
1972 
void
thread_vm_bind_group_add(void)
{
	/*
	 * Register the calling thread as a member of the VM bind group and
	 * bind it to the master processor (see the block comment above).
	 * Threads are never removed from the group.
	 */
	thread_t self = current_thread();

	/* Take a reference: the list entry keeps the thread alive indefinitely */
	thread_reference(self);
	self->options |= TH_OPT_SCHED_VM_GROUP;

	simple_lock(&sched_vm_group_list_lock, LCK_GRP_NULL);
	assert(sched_vm_group_thread_count < MAX_VM_BIND_GROUP_COUNT);
	sched_vm_group_thread_list[sched_vm_group_thread_count++] = self;
	simple_unlock(&sched_vm_group_list_lock);

	thread_bind(master_processor);

	/* Switch to bound processor if not already there */
	thread_block(THREAD_CONTINUE_NULL);
}
1991 
static void
sched_vm_group_maintenance(void)
{
	/*
	 * Periodic check (from the scheduler maintenance path) of whether the
	 * VM bind group is suffering high dispatch latency; toggles the
	 * group's binding to master_processor accordingly (see the block
	 * comment above thread_vm_bind_group_add).
	 */
	uint64_t ctime = mach_absolute_time();
	/* Threads runnable since before this cutoff count as high latency */
	uint64_t longtime = ctime - sched_tick_interval;
	int i;
	spl_t s;
	boolean_t high_latency_observed = FALSE;
	boolean_t runnable_and_not_on_runq_observed = FALSE;
	boolean_t bind_target_changed = FALSE;
	processor_t bind_target = PROCESSOR_NULL;

	/* Make sure nobody attempts to add new threads while we are enumerating them */
	simple_lock(&sched_vm_group_list_lock, LCK_GRP_NULL);

	s = splsched();

	/* Pass 1: observe the state of every thread in the group */
	for (i = 0; i < sched_vm_group_thread_count; i++) {
		thread_t thread = sched_vm_group_thread_list[i];
		assert(thread != THREAD_NULL);
		thread_lock(thread);
		if ((thread->state & (TH_RUN | TH_WAIT)) == TH_RUN) {
			if (thread_get_runq(thread) != PROCESSOR_NULL && thread->last_made_runnable_time < longtime) {
				high_latency_observed = TRUE;
			} else if (thread_get_runq(thread) == PROCESSOR_NULL) {
				/* There are some cases where a thread may be transitioning that also fall into this case */
				runnable_and_not_on_runq_observed = TRUE;
			}
		}
		thread_unlock(thread);

		if (high_latency_observed && runnable_and_not_on_runq_observed) {
			/* All the things we are looking for are true, stop looking */
			break;
		}
	}

	splx(s);

	/* Decide whether the bind target should flip */
	if (sched_vm_group_temporarily_unbound) {
		/* If we turned off binding, make sure everything is OK before rebinding */
		if (!high_latency_observed) {
			/* rebind */
			bind_target_changed = TRUE;
			bind_target = master_processor;
			sched_vm_group_temporarily_unbound = FALSE; /* might be reset to TRUE if change cannot be completed */
		}
	} else {
		/*
		 * Check if we're in a bad state, which is defined by high
		 * latency with no core currently executing a thread. If a
		 * single thread is making progress on a CPU, that means the
		 * binding concept to reduce parallelism is working as
		 * designed.
		 */
		if (high_latency_observed && !runnable_and_not_on_runq_observed) {
			/* unbind */
			bind_target_changed = TRUE;
			bind_target = PROCESSOR_NULL;
			sched_vm_group_temporarily_unbound = TRUE;
		}
	}

	/* Pass 2: apply the new bind target to every thread in the group */
	if (bind_target_changed) {
		s = splsched();
		for (i = 0; i < sched_vm_group_thread_count; i++) {
			thread_t thread = sched_vm_group_thread_list[i];
			boolean_t removed;
			assert(thread != THREAD_NULL);

			thread_lock(thread);
			removed = thread_run_queue_remove(thread);
			if (removed || ((thread->state & (TH_RUN | TH_WAIT)) == TH_WAIT)) {
				thread_bind_internal(thread, bind_target);
			} else {
				/*
				 * Thread was in the middle of being context-switched-to,
				 * or was in the process of blocking. To avoid switching the bind
				 * state out mid-flight, defer the change if possible.
				 */
				if (bind_target == PROCESSOR_NULL) {
					thread_bind_internal(thread, bind_target);
				} else {
					sched_vm_group_temporarily_unbound = TRUE; /* next pass will try again */
				}
			}

			if (removed) {
				thread_run_queue_reinsert(thread, SCHED_PREEMPT | SCHED_TAILQ);
			}
			thread_unlock(thread);
		}
		splx(s);
	}

	simple_unlock(&sched_vm_group_list_lock);
}
2089 
#if defined(__x86_64__)
#define SCHED_AVOID_CPU0 1
#else
#define SCHED_AVOID_CPU0 0
#endif

/* Nonzero allows realtime threads to run on SMT secondary CPUs */
int sched_allow_rt_smt = 1;
/* 1: prefer cpu0 only as a backup for RT; 2: prefer SMT secondaries as backup (see thread_select) */
int sched_avoid_cpu0 = SCHED_AVOID_CPU0;
/* Nonzero enables signalling other psets to steal (spill) queued RT threads */
int sched_allow_rt_steal = 1;
int sched_backup_cpu_timeout_count = 5; /* The maximum number of 10us delays to wait before using a backup cpu */

/* Number of extra (backup) processors to signal for RT scheduling */
int sched_rt_n_backup_processors = SCHED_DEFAULT_BACKUP_PROCESSORS;
2102 
/* Return the current number of RT backup processors (see sched_rt_n_backup_processors) */
int
sched_get_rt_n_backup_processors(void)
{
	return sched_rt_n_backup_processors;
}
2108 
2109 void
sched_set_rt_n_backup_processors(int n)2110 sched_set_rt_n_backup_processors(int n)
2111 {
2112 	if (n < 0) {
2113 		n = 0;
2114 	} else if (n > SCHED_MAX_BACKUP_PROCESSORS) {
2115 		n = SCHED_MAX_BACKUP_PROCESSORS;
2116 	}
2117 
2118 	sched_rt_n_backup_processors = n;
2119 }
2120 
/* When true, a higher-priority queued RT thread always preempts a lower-priority running one (see thread_select) */
int sched_rt_runq_strict_priority = false;
2122 
2123 inline static processor_set_t
change_locked_pset(processor_set_t current_pset,processor_set_t new_pset)2124 change_locked_pset(processor_set_t current_pset, processor_set_t new_pset)
2125 {
2126 	if (current_pset != new_pset) {
2127 		pset_unlock(current_pset);
2128 		pset_lock(new_pset);
2129 	}
2130 
2131 	return new_pset;
2132 }
2133 
2134 /*
2135  * Invoked prior to idle entry to determine if, on SMT capable processors, an SMT
2136  * rebalancing opportunity exists when a core is (instantaneously) idle, but
2137  * other SMT-capable cores may be over-committed. TODO: some possible negatives:
2138  * IPI thrash if this core does not remain idle following the load balancing ASTs
2139  * Idle "thrash", when IPI issue is followed by idle entry/core power down
2140  * followed by a wakeup shortly thereafter.
2141  */
2142 
2143 #if (DEVELOPMENT || DEBUG)
2144 int sched_smt_balance = 1;
2145 #endif
2146 
/* Invoked with pset locked, returns with pset unlocked */
bool
sched_SMT_balance(processor_t cprocessor, processor_set_t cpset)
{
	processor_t ast_processor = NULL;

#if (DEVELOPMENT || DEBUG)
	if (__improbable(sched_smt_balance == 0)) {
		goto smt_balance_exit;
	}
#endif

	assert(cprocessor == current_processor());
	if (cprocessor->is_SMT == FALSE) {
		goto smt_balance_exit;
	}

	/* The other hyperthread of this core, whether we are the primary or the secondary */
	processor_t sib_processor = cprocessor->processor_secondary ? cprocessor->processor_secondary : cprocessor->processor_primary;

	/* Determine if both this processor and its sibling are idle,
	 * indicating an SMT rebalancing opportunity.
	 */
	if (sib_processor->state != PROCESSOR_IDLE) {
		goto smt_balance_exit;
	}

	processor_t sprocessor;

	sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
	/* Secondaries running while their primary also runs are over-committed cores */
	uint64_t running_secondary_map = (cpset->cpu_state_map[PROCESSOR_RUNNING] &
	    ~cpset->primary_map);
	for (int cpuid = lsb_first(running_secondary_map); cpuid >= 0; cpuid = lsb_next(running_secondary_map, cpuid)) {
		sprocessor = processor_array[cpuid];
		/* Only rebalance threads below realtime priority */
		if ((sprocessor->processor_primary->state == PROCESSOR_RUNNING) &&
		    (sprocessor->current_pri < BASEPRI_RTQUEUES)) {
			ipi_type = sched_ipi_action(sprocessor, NULL, SCHED_IPI_EVENT_SMT_REBAL);
			if (ipi_type != SCHED_IPI_NONE) {
				assert(sprocessor != cprocessor);
				ast_processor = sprocessor;
				break;
			}
		}
	}

smt_balance_exit:
	pset_unlock(cpset);

	/* Send the rebalance IPI only after dropping the pset lock */
	if (ast_processor) {
		KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_SMT_BALANCE), ast_processor->cpu_id, ast_processor->state, ast_processor->processor_primary->state, 0, 0);
		sched_ipi_perform(ast_processor, ipi_type);
	}
	return false;
}
2200 
/* CPUs in this pset that are both available and recommended for scheduling */
static cpumap_t
pset_available_cpumap(processor_set_t pset)
{
	return pset->cpu_available_map & pset->recommended_bitmask;
}
2206 
/* Number of available, recommended CPUs in this pset */
int
pset_available_cpu_count(processor_set_t pset)
{
	return bit_count(pset_available_cpumap(pset));
}
2212 
2213 bool
pset_is_recommended(processor_set_t pset)2214 pset_is_recommended(processor_set_t pset)
2215 {
2216 	if (!pset) {
2217 		return false;
2218 	}
2219 	return pset_available_cpu_count(pset) > 0;
2220 }
2221 
/* Recommended CPUs in this pset that are idle or dispatching (not yet running a thread) */
static cpumap_t
pset_available_but_not_running_cpumap(processor_set_t pset)
{
	return (pset->cpu_state_map[PROCESSOR_IDLE] | pset->cpu_state_map[PROCESSOR_DISPATCHING]) &
	       pset->recommended_bitmask;
}
2228 
/* True when this pset's run queue holds more work than its free primary CPUs can absorb */
bool
pset_has_stealable_threads(processor_set_t pset)
{
	pset_assert_locked(pset);

	cpumap_t avail_map = pset_available_but_not_running_cpumap(pset);
	/*
	 * Secondary CPUs never steal, so allow stealing of threads if there are more threads than
	 * available primary CPUs
	 */
	avail_map &= pset->primary_map;

	/* Queued RT threads also consume the free CPUs, hence the rt_runq_count term */
	return (pset->pset_runq.count > 0) && ((pset->pset_runq.count + rt_runq_count(pset)) > bit_count(avail_map));
}
2243 
/* Available CPUs eligible to take an RT thread that are not already running one */
static cpumap_t
pset_available_but_not_running_rt_threads_cpumap(processor_set_t pset)
{
	cpumap_t avail_map = pset_available_cpumap(pset);
	if (!sched_allow_rt_smt) {
		/*
		 * Secondary CPUs are not allowed to run RT threads, so
		 * only primary CPUs should be included
		 */
		avail_map &= pset->primary_map;
	}

	return avail_map & ~pset->realtime_map;
}
2258 
2259 static bool
pset_needs_a_followup_IPI(processor_set_t pset)2260 pset_needs_a_followup_IPI(processor_set_t pset)
2261 {
2262 	int nbackup_cpus = 0;
2263 
2264 	if (rt_runq_is_low_latency(pset)) {
2265 		nbackup_cpus = sched_rt_n_backup_processors;
2266 	}
2267 
2268 	int rt_rq_count = rt_runq_count(pset);
2269 
2270 	return (rt_rq_count > 0) && ((rt_rq_count + nbackup_cpus - bit_count(pset->pending_AST_URGENT_cpu_mask)) > 0);
2271 }
2272 
/* True when this pset has more queued RT threads than CPUs able to take them */
bool
pset_has_stealable_rt_threads(processor_set_t pset)
{
	pset_node_t node = pset->node;
	if (bit_count(node->pset_map) == 1) {
		/* Single-pset node: no other pset exists to steal them */
		return false;
	}

	cpumap_t avail_map = pset_available_but_not_running_rt_threads_cpumap(pset);

	return rt_runq_count(pset) > bit_count(avail_map);
}
2285 
2286 static void
pset_update_rt_stealable_state(processor_set_t pset)2287 pset_update_rt_stealable_state(processor_set_t pset)
2288 {
2289 	if (pset_has_stealable_rt_threads(pset)) {
2290 		pset->stealable_rt_threads_earliest_deadline = rt_runq_earliest_deadline(pset);
2291 	} else {
2292 		pset->stealable_rt_threads_earliest_deadline = RT_DEADLINE_NONE;
2293 	}
2294 }
2295 
/*
 * Acknowledge and clear any pending AST/IPI bits for 'processor'.
 * trace_point_number identifies the call site in the trace stream.
 */
static void
clear_pending_AST_bits(processor_set_t pset, processor_t processor, __kdebug_only const int trace_point_number)
{
	/* Acknowledge any pending IPIs here with pset lock held */
	pset_assert_locked(pset);
	if (bit_clear_if_set(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
		/* Trace only on an actual 1->0 transition of the urgent bit */
		KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_END,
		    processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, 0, trace_point_number);
	}
	bit_clear(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);

#if defined(CONFIG_SCHED_DEFERRED_AST)
	bit_clear(pset->pending_deferred_AST_cpu_mask, processor->cpu_id);
#endif
}
2311 
/*
 * Called with pset locked, on a processor that is committing to run a new thread
 * Will transition an idle or dispatching processor to running as it picks up
 * the first new thread from the idle thread.
 */
static void
pset_commit_processor_to_new_thread(processor_set_t pset, processor_t processor, thread_t new_thread)
{
	pset_assert_locked(pset);

	if (processor->state == PROCESSOR_DISPATCHING || processor->state == PROCESSOR_IDLE) {
		assert(current_thread() == processor->idle_thread);

		/*
		 * Dispatching processor is now committed to running new_thread,
		 * so change its state to PROCESSOR_RUNNING.
		 */
		pset_update_processor_state(pset, processor, PROCESSOR_RUNNING);
	} else {
		assert((processor->state == PROCESSOR_RUNNING) || (processor->state == PROCESSOR_SHUTDOWN));
	}

	processor_state_update_from_thread(processor, new_thread, true);

	/* Track which CPUs of this pset are running realtime threads */
	if (new_thread->sched_pri >= BASEPRI_RTQUEUES) {
		bit_set(pset->realtime_map, processor->cpu_id);
	} else {
		bit_clear(pset->realtime_map, processor->cpu_id);
	}
	pset_update_rt_stealable_state(pset);

	pset_node_t node = pset->node;

	if (bit_count(node->pset_map) == 1) {
		/* Node has only a single pset, so skip node pset map updates */
		return;
	}

	cpumap_t avail_map = pset_available_cpumap(pset);

	/* Keep the node-level non-RT pset maps consistent with this pset's RT usage */
	if (new_thread->sched_pri >= BASEPRI_RTQUEUES) {
		if ((avail_map & pset->realtime_map) == avail_map) {
			/* No more non-RT CPUs in this pset */
			atomic_bit_clear(&node->pset_non_rt_map, pset->pset_id, memory_order_relaxed);
		}
		avail_map &= pset->primary_map;
		if ((avail_map & pset->realtime_map) == avail_map) {
			/* No more non-RT primary CPUs in this pset */
			atomic_bit_clear(&node->pset_non_rt_primary_map, pset->pset_id, memory_order_relaxed);
		}
	} else {
		if ((avail_map & pset->realtime_map) != avail_map) {
			/* Test first to avoid a redundant atomic RMW when the bit is already set */
			if (!bit_test(atomic_load(&node->pset_non_rt_map), pset->pset_id)) {
				atomic_bit_set(&node->pset_non_rt_map, pset->pset_id, memory_order_relaxed);
			}
		}
		avail_map &= pset->primary_map;
		if ((avail_map & pset->realtime_map) != avail_map) {
			if (!bit_test(atomic_load(&node->pset_non_rt_primary_map), pset->pset_id)) {
				atomic_bit_set(&node->pset_non_rt_primary_map, pset->pset_id, memory_order_relaxed);
			}
		}
	}
}
2376 
2377 static processor_t choose_processor_for_realtime_thread(processor_set_t pset, processor_t skip_processor, bool consider_secondaries, bool skip_spills);
2378 static processor_t choose_furthest_deadline_processor_for_realtime_thread(processor_set_t pset, int max_pri, uint64_t minimum_deadline,
2379     processor_t skip_processor, bool skip_spills, bool include_ast_urgent_pending_cpus);
2380 static processor_t choose_next_processor_for_realtime_thread(processor_set_t pset, int max_pri, uint64_t minimum_deadline, processor_t skip_processor, bool consider_secondaries);
2381 #if defined(__x86_64__)
2382 static bool all_available_primaries_are_running_realtime_threads(processor_set_t pset, bool include_backups);
2383 static bool these_processors_are_running_realtime_threads(processor_set_t pset, uint64_t these_map, bool include_backups);
2384 #endif
2385 static bool sched_ok_to_run_realtime_thread(processor_set_t pset, processor_t processor, bool as_backup);
2386 static bool processor_is_fast_track_candidate_for_realtime_thread(processor_set_t pset, processor_t processor);
2387 
/*
 * True when some other pset on this node advertises a stealable RT thread
 * whose deadline (plus epsilon) beats 'earliest_deadline'.
 */
static bool
other_psets_have_earlier_rt_threads_pending(processor_set_t stealing_pset, uint64_t earliest_deadline)
{
	pset_map_t pset_map = stealing_pset->node->pset_map;

	/* Only consider the other psets (local copy of the map, not the shared one) */
	bit_clear(pset_map, stealing_pset->pset_id);

	for (int pset_id = lsb_first(pset_map); pset_id >= 0; pset_id = lsb_next(pset_map, pset_id)) {
		processor_set_t nset = pset_array[pset_id];

		if (deadline_add(nset->stealable_rt_threads_earliest_deadline, rt_deadline_epsilon) < earliest_deadline) {
			return true;
		}
	}

	return false;
}
2405 
2406 /*
2407  * starting_pset must be locked, but returns true if it is unlocked before return
2408  */
static bool
choose_next_rt_processor_for_IPI(processor_set_t starting_pset, processor_t chosen_processor, bool spill_ipi,
    processor_t *result_processor, sched_ipi_type_t *result_ipi_type)
{
	/*
	 * Find the next processor that should be signalled to pick up an RT
	 * thread from starting_pset's run queue: within starting_pset for a
	 * followup IPI, or in the node's other psets for a spill IPI.
	 * Outputs through result_processor/result_ipi_type; returns true if
	 * starting_pset was unlocked at some point during the search.
	 */
	bool starting_pset_is_unlocked = false;
	uint64_t earliest_deadline = rt_runq_earliest_deadline(starting_pset);
	int max_pri = rt_runq_priority(starting_pset);
	__kdebug_only uint64_t spill_tid = thread_tid(rt_runq_first(&starting_pset->rt_runq));
	processor_set_t pset = starting_pset;
	processor_t next_rt_processor = PROCESSOR_NULL;
	if (spill_ipi) {
		/* Spill targets other psets, so begin the search at the next pset */
		processor_set_t nset = next_pset(pset);
		assert(nset != starting_pset);
		pset = change_locked_pset(pset, nset);
		starting_pset_is_unlocked = true;
	}
	do {
		const bool consider_secondaries = true;
		next_rt_processor = choose_next_processor_for_realtime_thread(pset, max_pri, earliest_deadline, chosen_processor, consider_secondaries);
		if (next_rt_processor == PROCESSOR_NULL) {
			if (!spill_ipi) {
				break;
			}
			/* Advance to the next pset, stopping after a full circuit of the node */
			processor_set_t nset = next_pset(pset);
			if (nset == starting_pset) {
				break;
			}
			pset = change_locked_pset(pset, nset);
			starting_pset_is_unlocked = true;
		}
	} while (next_rt_processor == PROCESSOR_NULL);
	if (next_rt_processor) {
		if (pset != starting_pset) {
			/* Mark the remote CPU as a pending spill target (traced on 0->1 transition) */
			if (bit_set_if_clear(pset->rt_pending_spill_cpu_mask, next_rt_processor->cpu_id)) {
				KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_SIGNAL_SPILL) | DBG_FUNC_START,
				    next_rt_processor->cpu_id, pset->rt_pending_spill_cpu_mask, starting_pset->cpu_set_low, (uintptr_t)spill_tid);
			}
		}
		*result_ipi_type = sched_ipi_action(next_rt_processor, NULL, SCHED_IPI_EVENT_RT_PREEMPT);
		*result_processor = next_rt_processor;
	}
	if (pset != starting_pset) {
		pset_unlock(pset);
	}

	return starting_pset_is_unlocked;
}
2456 
2457 /*
2458  * backup processor - used by choose_processor to send a backup IPI to in case the preferred processor can't immediately respond
2459  * followup processor - used in thread_select when there are still threads on the run queue and available processors
2460  * spill processor - a processor in a different processor set that is signalled to steal a thread from this run queue
2461  */
/* Tag (used for tracing) describing why an additional processor was signalled */
typedef enum {
	none,
	backup,
	followup,
	spill
} next_processor_type_t;
2468 
2469 #undef LOOP_COUNT
2470 #ifdef LOOP_COUNT
2471 int max_loop_count[MAX_SCHED_CPUS] = { 0 };
2472 #endif
2473 
2474 /*
2475  *	thread_select:
2476  *
2477  *	Select a new thread for the current processor to execute.
2478  *
2479  *	May select the current thread, which must be locked.
2480  */
2481 static thread_t
thread_select(thread_t thread,processor_t processor,ast_t * reason)2482 thread_select(thread_t          thread,
2483     processor_t       processor,
2484     ast_t            *reason)
2485 {
2486 	processor_set_t         pset = processor->processor_set;
2487 	thread_t                        new_thread = THREAD_NULL;
2488 
2489 	assert(processor == current_processor());
2490 	assert((thread->state & (TH_RUN | TH_TERMINATE2)) == TH_RUN);
2491 
2492 	KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_START,
2493 	    0, pset->pending_AST_URGENT_cpu_mask, 0, 0);
2494 
2495 	__kdebug_only int idle_reason = 0;
2496 	__kdebug_only int delay_count = 0;
2497 
2498 #if defined(__x86_64__)
2499 	int timeout_count = sched_backup_cpu_timeout_count;
2500 	if ((sched_avoid_cpu0 == 1) && (processor->cpu_id == 0)) {
2501 		/* Prefer cpu0 as backup */
2502 		timeout_count--;
2503 	} else if ((sched_avoid_cpu0 == 2) && (processor->processor_primary != processor)) {
2504 		/* Prefer secondary cpu as backup */
2505 		timeout_count--;
2506 	}
2507 #endif
2508 	bool pending_AST_URGENT = false;
2509 	bool pending_AST_PREEMPT = false;
2510 
2511 #ifdef LOOP_COUNT
2512 	int loop_count = -1;
2513 #endif
2514 
2515 	do {
2516 		/*
2517 		 *	Update the priority.
2518 		 */
2519 		if (SCHED(can_update_priority)(thread)) {
2520 			SCHED(update_priority)(thread);
2521 		}
2522 
2523 		pset_lock(pset);
2524 
2525 restart:
2526 #ifdef LOOP_COUNT
2527 		loop_count++;
2528 		if (loop_count > max_loop_count[processor->cpu_id]) {
2529 			max_loop_count[processor->cpu_id] = loop_count;
2530 			if (bit_count(loop_count) == 1) {
2531 				kprintf("[%d]%s>max_loop_count = %d\n", processor->cpu_id, __FUNCTION__, loop_count);
2532 			}
2533 		}
2534 #endif
2535 		pending_AST_URGENT = bit_test(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id);
2536 		pending_AST_PREEMPT = bit_test(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
2537 
2538 		processor_state_update_from_thread(processor, thread, true);
2539 
2540 		idle_reason = 0;
2541 
2542 		processor_t ast_processor = PROCESSOR_NULL;
2543 		processor_t next_rt_processor = PROCESSOR_NULL;
2544 		sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
2545 		sched_ipi_type_t next_rt_ipi_type = SCHED_IPI_NONE;
2546 
2547 		assert(processor->state != PROCESSOR_OFF_LINE);
2548 
2549 		/*
2550 		 * Bound threads are dispatched to a processor without going through
2551 		 * choose_processor(), so in those cases we must continue trying to dequeue work
2552 		 * as we are the only option.
2553 		 */
2554 		if (!SCHED(processor_bound_count)(processor)) {
2555 			if (!processor->is_recommended) {
2556 				/*
2557 				 * The performance controller has provided a hint to not dispatch more threads,
2558 				 */
2559 				idle_reason = 1;
2560 				goto send_followup_ipi_before_idle;
2561 			} else if (rt_runq_count(pset)) {
2562 				bool ok_to_run_realtime_thread = sched_ok_to_run_realtime_thread(pset, processor, false);
2563 				/* Give the current RT thread a chance to complete */
2564 				ok_to_run_realtime_thread |= (thread->sched_pri >= BASEPRI_RTQUEUES && processor->first_timeslice);
2565 #if defined(__x86_64__)
2566 				/*
2567 				 * On Intel we want to avoid SMT secondary processors and processor 0
2568 				 * but allow them to be used as backup processors in case the preferred chosen
2569 				 * processor is delayed by interrupts or processor stalls.  So if it is
2570 				 * not ok_to_run_realtime_thread as preferred (sched_ok_to_run_realtime_thread(pset, processor, as_backup=false))
2571 				 * but ok_to_run_realtime_thread as backup (sched_ok_to_run_realtime_thread(pset, processor, as_backup=true))
2572 				 * we delay up to (timeout_count * 10us) to give the preferred processor chance
2573 				 * to grab the thread before the (current) backup processor does.
2574 				 *
2575 				 * timeout_count defaults to 5 but can be tuned using sysctl kern.sched_backup_cpu_timeout_count
2576 				 * on DEVELOPMENT || DEBUG kernels.  It is also adjusted (see above) depending on whether we want to use
2577 				 * cpu0 before secondary cpus or not.
2578 				 */
2579 				if (!ok_to_run_realtime_thread) {
2580 					if (sched_ok_to_run_realtime_thread(pset, processor, true)) {
2581 						if (timeout_count-- > 0) {
2582 							pset_unlock(pset);
2583 							thread_unlock(thread);
2584 							delay(10);
2585 							delay_count++;
2586 							thread_lock(thread);
2587 							pset_lock(pset);
2588 							goto restart;
2589 						}
2590 						ok_to_run_realtime_thread = true;
2591 					}
2592 				}
2593 #endif
2594 				if (!ok_to_run_realtime_thread) {
2595 					idle_reason = 2;
2596 					goto send_followup_ipi_before_idle;
2597 				}
2598 			} else if (processor->processor_primary != processor) {
2599 				/*
2600 				 * Should this secondary SMT processor attempt to find work? For pset runqueue systems,
2601 				 * we should look for work only under the same conditions that choose_processor()
2602 				 * would have assigned work, which is when all primary processors have been assigned work.
2603 				 */
2604 				if ((pset->recommended_bitmask & pset->primary_map & pset->cpu_state_map[PROCESSOR_IDLE]) != 0) {
2605 					/* There are idle primaries */
2606 					idle_reason = 3;
2607 					goto idle;
2608 				}
2609 			}
2610 		}
2611 
2612 		/*
2613 		 *	Test to see if the current thread should continue
2614 		 *	to run on this processor.  Must not be attempting to wait, and not
2615 		 *	bound to a different processor, nor be in the wrong
2616 		 *	processor set, nor be forced to context switch by TH_SUSP.
2617 		 *
2618 		 *	Note that there are never any RT threads in the regular runqueue.
2619 		 *
2620 		 *	This code is very insanely tricky.
2621 		 */
2622 
2623 		/* i.e. not waiting, not TH_SUSP'ed */
2624 		bool still_running = ((thread->state & (TH_TERMINATE | TH_IDLE | TH_WAIT | TH_RUN | TH_SUSP)) == TH_RUN);
2625 
2626 		/*
2627 		 * Threads running on SMT processors are forced to context switch. Don't rebalance realtime threads.
2628 		 * TODO: This should check if it's worth it to rebalance, i.e. 'are there any idle primary processors'
2629 		 *       <rdar://problem/47907700>
2630 		 *
2631 		 * A yielding thread shouldn't be forced to context switch.
2632 		 */
2633 
2634 		bool is_yielding         = (*reason & AST_YIELD) == AST_YIELD;
2635 
2636 		bool needs_smt_rebalance = !is_yielding && thread->sched_pri < BASEPRI_RTQUEUES && processor->processor_primary != processor;
2637 
2638 		bool affinity_mismatch   = thread->affinity_set != AFFINITY_SET_NULL && thread->affinity_set->aset_pset != pset;
2639 
2640 		bool bound_elsewhere     = thread->bound_processor != PROCESSOR_NULL && thread->bound_processor != processor;
2641 
2642 		bool avoid_processor     = !is_yielding && SCHED(avoid_processor_enabled) && SCHED(thread_avoid_processor)(processor, thread, *reason);
2643 
2644 		bool ok_to_run_realtime_thread = sched_ok_to_run_realtime_thread(pset, processor, true);
2645 
2646 		bool current_thread_can_keep_running = (still_running && !needs_smt_rebalance && !affinity_mismatch && !bound_elsewhere && !avoid_processor);
2647 		if (current_thread_can_keep_running) {
2648 			/*
2649 			 * This thread is eligible to keep running on this processor.
2650 			 *
2651 			 * RT threads with un-expired quantum stay on processor,
2652 			 * unless there's a valid RT thread with an earlier deadline
2653 			 * and it is still ok_to_run_realtime_thread.
2654 			 */
2655 			if (thread->sched_pri >= BASEPRI_RTQUEUES && processor->first_timeslice) {
2656 				/*
2657 				 * Pick a new RT thread only if ok_to_run_realtime_thread
2658 				 * (but the current thread is allowed to complete).
2659 				 */
2660 				if (ok_to_run_realtime_thread) {
2661 					if (bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) {
2662 						goto pick_new_rt_thread;
2663 					}
2664 					if (rt_runq_priority(pset) > thread->sched_pri) {
2665 						if (sched_rt_runq_strict_priority) {
2666 							/* The next RT thread is better, so pick it off the runqueue. */
2667 							goto pick_new_rt_thread;
2668 						}
2669 
2670 						/*
2671 						 * See if the current lower priority thread can continue to run without causing
2672 						 * the higher priority thread on the runq queue to miss its deadline.
2673 						 */
2674 						thread_t hi_thread = rt_runq_first(SCHED(rt_runq)(pset));
2675 						if (thread->realtime.computation + hi_thread->realtime.computation + rt_deadline_epsilon >= hi_thread->realtime.constraint) {
2676 							/* The next RT thread is better, so pick it off the runqueue. */
2677 							goto pick_new_rt_thread;
2678 						}
2679 					} else if ((rt_runq_count(pset) > 0) && (deadline_add(rt_runq_earliest_deadline(pset), rt_deadline_epsilon) < thread->realtime.deadline)) {
2680 						/* The next RT thread is better, so pick it off the runqueue. */
2681 						goto pick_new_rt_thread;
2682 					}
2683 					if (other_psets_have_earlier_rt_threads_pending(pset, thread->realtime.deadline)) {
2684 						goto pick_new_rt_thread;
2685 					}
2686 				}
2687 
2688 				/* This is still the best RT thread to run. */
2689 				processor->deadline = thread->realtime.deadline;
2690 
2691 				sched_update_pset_load_average(pset, 0);
2692 
2693 				clear_pending_AST_bits(pset, processor, 1);
2694 
2695 				next_rt_processor = PROCESSOR_NULL;
2696 				next_rt_ipi_type = SCHED_IPI_NONE;
2697 
2698 				bool pset_unlocked = false;
2699 				__kdebug_only next_processor_type_t nptype = none;
2700 				if (sched_allow_rt_steal && pset_has_stealable_rt_threads(pset)) {
2701 					nptype = spill;
2702 					pset_unlocked = choose_next_rt_processor_for_IPI(pset, processor, true, &next_rt_processor, &next_rt_ipi_type);
2703 				} else if (pset_needs_a_followup_IPI(pset)) {
2704 					nptype = followup;
2705 					pset_unlocked = choose_next_rt_processor_for_IPI(pset, processor, false, &next_rt_processor, &next_rt_ipi_type);
2706 				}
2707 				if (!pset_unlocked) {
2708 					pset_unlock(pset);
2709 				}
2710 
2711 				if (next_rt_processor) {
2712 					KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_NEXT_PROCESSOR) | DBG_FUNC_NONE,
2713 					    next_rt_processor->cpu_id, next_rt_processor->state, nptype, 2);
2714 					sched_ipi_perform(next_rt_processor, next_rt_ipi_type);
2715 				}
2716 
2717 				KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
2718 				    (uintptr_t)thread_tid(thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 1);
2719 				return thread;
2720 			}
2721 
2722 			if ((rt_runq_count(pset) == 0) &&
2723 			    SCHED(processor_queue_has_priority)(processor, thread->sched_pri, TRUE) == FALSE) {
2724 				/* This thread is still the highest priority runnable (non-idle) thread */
2725 				processor->deadline = RT_DEADLINE_NONE;
2726 
2727 				sched_update_pset_load_average(pset, 0);
2728 
2729 				clear_pending_AST_bits(pset, processor, 2);
2730 
2731 				pset_unlock(pset);
2732 
2733 				KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
2734 				    (uintptr_t)thread_tid(thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 2);
2735 				return thread;
2736 			}
2737 		} else {
2738 			/*
2739 			 * This processor must context switch.
2740 			 * If it's due to a rebalance, we should aggressively find this thread a new home.
2741 			 */
2742 			if (needs_smt_rebalance || affinity_mismatch || bound_elsewhere || avoid_processor) {
2743 				*reason |= AST_REBALANCE;
2744 			}
2745 		}
2746 
2747 		bool secondary_forced_idle = ((processor->processor_secondary != PROCESSOR_NULL) &&
2748 		    (thread_no_smt(thread) || (thread->sched_pri >= BASEPRI_RTQUEUES)) &&
2749 		    (processor->processor_secondary->state == PROCESSOR_IDLE));
2750 
2751 		/* OK, so we're not going to run the current thread. Look at the RT queue. */
2752 		if (ok_to_run_realtime_thread) {
2753 pick_new_rt_thread:
2754 			new_thread = sched_rt_choose_thread(pset);
2755 			if (new_thread != THREAD_NULL) {
2756 				processor->deadline = new_thread->realtime.deadline;
2757 				pset_commit_processor_to_new_thread(pset, processor, new_thread);
2758 
2759 				clear_pending_AST_bits(pset, processor, 3);
2760 
2761 				if (processor->processor_secondary != NULL) {
2762 					processor_t sprocessor = processor->processor_secondary;
2763 					if ((sprocessor->state == PROCESSOR_RUNNING) || (sprocessor->state == PROCESSOR_DISPATCHING)) {
2764 						ipi_type = sched_ipi_action(sprocessor, NULL, SCHED_IPI_EVENT_SMT_REBAL);
2765 						ast_processor = sprocessor;
2766 					}
2767 				}
2768 			}
2769 		}
2770 
2771 send_followup_ipi_before_idle:
2772 		/* This might not have been cleared if we didn't call sched_rt_choose_thread() */
2773 		if (bit_clear_if_set(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) {
2774 			KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_SIGNAL_SPILL) | DBG_FUNC_END, processor->cpu_id, pset->rt_pending_spill_cpu_mask, 0, 5);
2775 		}
2776 		__kdebug_only next_processor_type_t nptype = none;
2777 		bool pset_unlocked = false;
2778 		if (sched_allow_rt_steal && pset_has_stealable_rt_threads(pset)) {
2779 			nptype = spill;
2780 			pset_unlocked = choose_next_rt_processor_for_IPI(pset, processor, true, &next_rt_processor, &next_rt_ipi_type);
2781 		} else if (pset_needs_a_followup_IPI(pset)) {
2782 			nptype = followup;
2783 			pset_unlocked = choose_next_rt_processor_for_IPI(pset, processor, false, &next_rt_processor, &next_rt_ipi_type);
2784 		}
2785 
2786 		assert(new_thread || !ast_processor);
2787 		if (new_thread || next_rt_processor) {
2788 			if (!pset_unlocked) {
2789 				pset_unlock(pset);
2790 				pset_unlocked = true;
2791 			}
2792 			if (ast_processor == next_rt_processor) {
2793 				ast_processor = PROCESSOR_NULL;
2794 				ipi_type = SCHED_IPI_NONE;
2795 			}
2796 
2797 			if (ast_processor) {
2798 				sched_ipi_perform(ast_processor, ipi_type);
2799 			}
2800 
2801 			if (next_rt_processor) {
2802 				KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_NEXT_PROCESSOR) | DBG_FUNC_NONE,
2803 				    next_rt_processor->cpu_id, next_rt_processor->state, nptype, 3);
2804 				sched_ipi_perform(next_rt_processor, next_rt_ipi_type);
2805 			}
2806 
2807 			if (new_thread) {
2808 				KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
2809 				    (uintptr_t)thread_tid(new_thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 3);
2810 				return new_thread;
2811 			}
2812 		}
2813 
2814 		if (pset_unlocked) {
2815 			pset_lock(pset);
2816 		}
2817 
2818 		if (!pending_AST_URGENT && bit_test(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
2819 			/* Things changed while we dropped the lock */
2820 			goto restart;
2821 		}
2822 
2823 		if (processor->is_recommended) {
2824 			bool spill_pending = bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id);
2825 			if (sched_ok_to_run_realtime_thread(pset, processor, true) && (spill_pending || rt_runq_count(pset))) {
2826 				/* Things changed while we dropped the lock */
2827 				goto restart;
2828 			}
2829 
2830 			if ((processor->processor_primary != processor) && (processor->processor_primary->current_pri >= BASEPRI_RTQUEUES)) {
2831 				/* secondary can only run realtime thread */
2832 				if (idle_reason == 0) {
2833 					idle_reason = 4;
2834 				}
2835 				goto idle;
2836 			}
2837 		} else if (!SCHED(processor_bound_count)(processor)) {
2838 			/* processor not recommended and no bound threads */
2839 			if (idle_reason == 0) {
2840 				idle_reason = 5;
2841 			}
2842 			goto idle;
2843 		}
2844 
2845 		processor->deadline = RT_DEADLINE_NONE;
2846 
2847 		/* No RT threads, so let's look at the regular threads. */
2848 		if ((new_thread = SCHED(choose_thread)(processor, MINPRI, *reason)) != THREAD_NULL) {
2849 			pset_commit_processor_to_new_thread(pset, processor, new_thread);
2850 
2851 			clear_pending_AST_bits(pset, processor, 4);
2852 
2853 			ast_processor = PROCESSOR_NULL;
2854 			ipi_type = SCHED_IPI_NONE;
2855 
2856 			processor_t sprocessor = processor->processor_secondary;
2857 			if (sprocessor != NULL) {
2858 				if (sprocessor->state == PROCESSOR_RUNNING) {
2859 					if (thread_no_smt(new_thread)) {
2860 						ipi_type = sched_ipi_action(sprocessor, NULL, SCHED_IPI_EVENT_SMT_REBAL);
2861 						ast_processor = sprocessor;
2862 					}
2863 				} else if (secondary_forced_idle && !thread_no_smt(new_thread) && pset_has_stealable_threads(pset)) {
2864 					ipi_type = sched_ipi_action(sprocessor, NULL, SCHED_IPI_EVENT_PREEMPT);
2865 					ast_processor = sprocessor;
2866 				}
2867 			}
2868 			pset_unlock(pset);
2869 
2870 			if (ast_processor) {
2871 				sched_ipi_perform(ast_processor, ipi_type);
2872 			}
2873 			KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
2874 			    (uintptr_t)thread_tid(new_thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 4);
2875 			return new_thread;
2876 		}
2877 
2878 		if (processor->must_idle) {
2879 			processor->must_idle = false;
2880 			*reason |= AST_REBALANCE;
2881 			idle_reason = 6;
2882 			goto idle;
2883 		}
2884 
2885 		if (SCHED(steal_thread_enabled)(pset) && (processor->processor_primary == processor)) {
2886 			/*
2887 			 * No runnable threads, attempt to steal
2888 			 * from other processors. Returns with pset lock dropped.
2889 			 */
2890 
2891 			if ((new_thread = SCHED(steal_thread)(pset)) != THREAD_NULL) {
2892 				pset_lock(pset);
2893 				pset_commit_processor_to_new_thread(pset, processor, new_thread);
2894 				if (!pending_AST_URGENT && bit_test(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
2895 					/*
2896 					 * A realtime thread choose this processor while it was DISPATCHING
2897 					 * and the pset lock was dropped
2898 					 */
2899 					ast_on(AST_URGENT | AST_PREEMPT);
2900 				}
2901 
2902 				clear_pending_AST_bits(pset, processor, 5);
2903 
2904 				pset_unlock(pset);
2905 
2906 				KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
2907 				    (uintptr_t)thread_tid(new_thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 5);
2908 				return new_thread;
2909 			}
2910 
2911 			/*
2912 			 * If other threads have appeared, shortcut
2913 			 * around again.
2914 			 */
2915 			if (SCHED(processor_bound_count)(processor)) {
2916 				continue;
2917 			}
2918 			if (processor->is_recommended) {
2919 				if (!SCHED(processor_queue_empty)(processor) || (sched_ok_to_run_realtime_thread(pset, processor, true) && (rt_runq_count(pset) > 0))) {
2920 					continue;
2921 				}
2922 			}
2923 
2924 			pset_lock(pset);
2925 		}
2926 
2927 idle:
2928 		/* Someone selected this processor while we had dropped the lock */
2929 		if ((!pending_AST_URGENT && bit_test(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) ||
2930 		    (!pending_AST_PREEMPT && bit_test(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id))) {
2931 			goto restart;
2932 		}
2933 
2934 		if ((idle_reason == 0) && current_thread_can_keep_running) {
2935 			/* This thread is the only runnable (non-idle) thread */
2936 			if (thread->sched_pri >= BASEPRI_RTQUEUES) {
2937 				processor->deadline = thread->realtime.deadline;
2938 			} else {
2939 				processor->deadline = RT_DEADLINE_NONE;
2940 			}
2941 
2942 			sched_update_pset_load_average(pset, 0);
2943 
2944 			clear_pending_AST_bits(pset, processor, 6);
2945 
2946 			KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
2947 			    (uintptr_t)thread_tid(thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 6);
2948 			pset_unlock(pset);
2949 			return thread;
2950 		}
2951 
2952 		/*
2953 		 *	Nothing is runnable, or this processor must be forced idle,
2954 		 *	so set this processor idle if it was running.
2955 		 */
2956 		if ((processor->state == PROCESSOR_RUNNING) || (processor->state == PROCESSOR_DISPATCHING)) {
2957 			pset_update_processor_state(pset, processor, PROCESSOR_IDLE);
2958 			processor_state_update_idle(processor);
2959 		}
2960 		pset_update_rt_stealable_state(pset);
2961 
2962 		clear_pending_AST_bits(pset, processor, 7);
2963 
2964 		/* Invoked with pset locked, returns with pset unlocked */
2965 		processor->next_idle_short = SCHED(processor_balance)(processor, pset);
2966 
2967 		new_thread = processor->idle_thread;
2968 	} while (new_thread == THREAD_NULL);
2969 
2970 	KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
2971 	    (uintptr_t)thread_tid(new_thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 10 + idle_reason);
2972 	return new_thread;
2973 }
2974 
2975 /*
2976  * thread_invoke
2977  *
2978  * Called at splsched with neither thread locked.
2979  *
2980  * Perform a context switch and start executing the new thread.
2981  *
2982  * Returns FALSE when the context switch didn't happen.
2983  * The reference to the new thread is still consumed.
2984  *
2985  * "self" is what is currently running on the processor,
2986  * "thread" is the new thread to context switch to
2987  * (which may be the same thread in some cases)
2988  */
static boolean_t
thread_invoke(
	thread_t                        self,
	thread_t                        thread,
	ast_t                           reason)
{
	/*
	 * Context switching with preemption disabled (other than by this
	 * routine itself) indicates a caller bug: either a lock imbalance or
	 * blocking in a context that must not block.
	 */
	if (__improbable(get_preemption_level() != 0)) {
		int pl = get_preemption_level();
		panic("thread_invoke: preemption_level %d, possible cause: %s",
		    pl, (pl < 0 ? "unlocking an unlocked mutex or spinlock" :
		    "blocking while holding a spinlock, or within interrupt context"));
	}

	/*
	 * Capture the outgoing thread's parked continuation (if any) now;
	 * it selects the stack-handoff path below.
	 */
	thread_continue_t       continuation = self->continuation;
	void                    *parameter   = self->parameter;

	/* Snapshot time/usage counters once; 'ctime' is reused throughout. */
	struct recount_snap snap = { 0 };
	recount_snapshot(&snap);
	uint64_t ctime = snap.rsn_time_mach;

	check_monotonic_time(ctime);

#ifdef CONFIG_MACH_APPROXIMATE_TIME
	commpage_update_mach_approximate_time(ctime);
#endif

	if (ctime < thread->last_made_runnable_time) {
		panic("Non-monotonic time: invoke at 0x%llx, runnable at 0x%llx",
		    ctime, thread->last_made_runnable_time);
	}

#if defined(CONFIG_SCHED_TIMESHARE_CORE)
	/*
	 * Skip periodic timeshare maintenance when switching to the idle
	 * thread, or on a direct realtime handoff, to keep those latency-
	 * sensitive paths short.
	 */
	if (!((thread->state & TH_IDLE) != 0 ||
	    ((reason & AST_HANDOFF) && self->sched_mode == TH_MODE_REALTIME))) {
		sched_timeshare_consider_maintenance(ctime, true);
	}
#endif

	recount_log_switch_thread(&snap);

	/* Sanity: 'self' is on core, runnable, and not on any run queue. */
	assert_thread_magic(self);
	assert(self == current_thread());
	thread_assert_runq_null(self);
	assert((self->state & (TH_RUN | TH_TERMINATE2)) == TH_RUN);

	thread_lock(thread);

	/* Sanity: incoming thread is runnable, unqueued, and either unbound
	 * or bound to this processor. */
	assert_thread_magic(thread);
	assert((thread->state & (TH_RUN | TH_WAIT | TH_UNINT | TH_TERMINATE | TH_TERMINATE2)) == TH_RUN);
	assert(thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == current_processor());
	thread_assert_runq_null(thread);

	/* Update SFI class based on other factors */
	thread->sfi_class = sfi_thread_classify(thread);

	/* Update the same_pri_latency for the thread (used by perfcontrol callouts) */
	thread->same_pri_latency = ctime - thread->last_basepri_change_time;
	/*
	 * In case a base_pri update happened between the timestamp and
	 * taking the thread lock
	 */
	if (ctime <= thread->last_basepri_change_time) {
		thread->same_pri_latency = ctime - thread->last_made_runnable_time;
	}

	/* Allow realtime threads to hang onto a stack. */
	if ((self->sched_mode == TH_MODE_REALTIME) && !self->reserved_stack) {
		self->reserved_stack = self->kernel_stack;
	}

	/* Prepare for spin debugging */
#if SCHED_HYGIENE_DEBUG
	ml_spin_debug_clear(thread);
#endif

	if (continuation != NULL) {
		if (!thread->kernel_stack) {
			/*
			 * If we are using a privileged stack,
			 * check to see whether we can exchange it with
			 * that of the other thread.
			 */
			if (self->kernel_stack == self->reserved_stack && !thread->reserved_stack) {
				goto need_stack;
			}

			/*
			 * Context switch by performing a stack handoff.
			 * Requires both threads to be parked in a continuation.
			 *
			 * The incoming thread inherits this CPU's current
			 * stack; no register state is saved for 'self'.
			 */
			continuation = thread->continuation;
			parameter = thread->parameter;

			processor_t processor = current_processor();
			processor->active_thread = thread;
			processor_state_update_from_thread(processor, thread, false);

			/* Track processor and pset migrations for the incoming thread. */
			if (thread->last_processor != processor && thread->last_processor != NULL) {
				if (thread->last_processor->processor_set != processor->processor_set) {
					thread->ps_switch++;
				}
				thread->p_switch++;
			}
			thread->last_processor = processor;
			thread->c_switch++;
			ast_context(thread);

			thread_unlock(thread);

			self->reason = reason;

			processor->last_dispatch = ctime;
			self->last_run_time = ctime;
			timer_update(&thread->runnable_timer, ctime);
			recount_switch_thread(&snap, self, get_threadtask(self));

			KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
			    MACHDBG_CODE(DBG_MACH_SCHED, MACH_STACK_HANDOFF) | DBG_FUNC_NONE,
			    self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);

			/* Trace when the thread runs somewhere other than where it was chosen to run. */
			if ((thread->chosen_processor != processor) && (thread->chosen_processor != PROCESSOR_NULL)) {
				SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT_IST(MACHDBG_CODE(DBG_MACH_SCHED, MACH_MOVED) | DBG_FUNC_NONE,
				    (uintptr_t)thread_tid(thread), (uintptr_t)thread->chosen_processor->cpu_id, 0, 0, 0);
			}

			DTRACE_SCHED2(off__cpu, struct thread *, thread, struct proc *, current_proc());

			SCHED_STATS_CSW(processor, self->reason, self->sched_pri, thread->sched_pri);

#if KPERF
			kperf_off_cpu(self);
#endif /* KPERF */

			/*
			 * This is where we actually switch thread identity,
			 * and address space if required.  However, register
			 * state is not switched - this routine leaves the
			 * stack and register state active on the current CPU.
			 */
			TLOG(1, "thread_invoke: calling stack_handoff\n");
			stack_handoff(self, thread);

			/* 'self' is now off core */
			assert(thread == current_thread_volatile());

			DTRACE_SCHED(on__cpu);

#if KPERF
			kperf_on_cpu(thread, continuation, NULL);
#endif /* KPERF */

			recount_log_switch_thread_on(&snap);

			thread_dispatch(self, thread);

#if KASAN
			/* Old thread's stack has been moved to the new thread, so explicitly
			 * unpoison it. */
			kasan_unpoison_stack(thread->kernel_stack, kernel_stack_size);
#endif

			thread->continuation = thread->parameter = NULL;

			boolean_t enable_interrupts = TRUE;

			/* idle thread needs to stay interrupts-disabled */
			if ((thread->state & TH_IDLE)) {
				enable_interrupts = FALSE;
			}

			/* Resume the incoming thread at its continuation; never returns. */
			assert(continuation);
			call_continuation(continuation, parameter,
			    thread->wait_result, enable_interrupts);
			/*NOTREACHED*/
		} else if (thread == self) {
			/* same thread but with continuation */
			ast_context(self);

			thread_unlock(self);

#if KPERF
			kperf_on_cpu(thread, continuation, NULL);
#endif /* KPERF */

			recount_log_switch_thread_on(&snap);

			KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
			    MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED) | DBG_FUNC_NONE,
			    self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);

#if KASAN
			/* stack handoff to self - no thread_dispatch(), so clear the stack
			 * and free the fakestack directly */
#if KASAN_CLASSIC
			kasan_fakestack_drop(self);
			kasan_fakestack_gc(self);
#endif /* KASAN_CLASSIC */
			kasan_unpoison_stack(self->kernel_stack, kernel_stack_size);
#endif /* KASAN */

			self->continuation = self->parameter = NULL;

			boolean_t enable_interrupts = TRUE;

			/* idle thread needs to stay interrupts-disabled */
			if ((self->state & TH_IDLE)) {
				enable_interrupts = FALSE;
			}

			/* Re-enter this same thread at its continuation; never returns. */
			call_continuation(continuation, parameter,
			    self->wait_result, enable_interrupts);
			/*NOTREACHED*/
		}
	} else {
		/*
		 * Check that the other thread has a stack
		 */
		if (!thread->kernel_stack) {
need_stack:
			if (!stack_alloc_try(thread)) {
				/*
				 * No stack available right now: park the thread on
				 * the stack-allocation queue and report that the
				 * switch did not happen.
				 */
				thread_unlock(thread);
				thread_stack_enqueue(thread);
				return FALSE;
			}
		} else if (thread == self) {
			/* Switching to ourselves with no continuation: nothing to do. */
			ast_context(self);
			thread_unlock(self);

			KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
			    MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED) | DBG_FUNC_NONE,
			    self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);

			return TRUE;
		}
	}

	/*
	 * Context switch by full context save.
	 */
	processor_t processor = current_processor();
	processor->active_thread = thread;
	processor_state_update_from_thread(processor, thread, false);

	/* Track processor and pset migrations for the incoming thread. */
	if (thread->last_processor != processor && thread->last_processor != NULL) {
		if (thread->last_processor->processor_set != processor->processor_set) {
			thread->ps_switch++;
		}
		thread->p_switch++;
	}
	thread->last_processor = processor;
	thread->c_switch++;
	ast_context(thread);

	thread_unlock(thread);

	self->reason = reason;

	processor->last_dispatch = ctime;
	self->last_run_time = ctime;
	timer_update(&thread->runnable_timer, ctime);
	recount_switch_thread(&snap, self, get_threadtask(self));

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED) | DBG_FUNC_NONE,
	    self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);

	if ((thread->chosen_processor != processor) && (thread->chosen_processor != NULL)) {
		SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT_IST(MACHDBG_CODE(DBG_MACH_SCHED, MACH_MOVED) | DBG_FUNC_NONE,
		    (uintptr_t)thread_tid(thread), (uintptr_t)thread->chosen_processor->cpu_id, 0, 0, 0);
	}

	DTRACE_SCHED2(off__cpu, struct thread *, thread, struct proc *, current_proc());

	SCHED_STATS_CSW(processor, self->reason, self->sched_pri, thread->sched_pri);

#if KPERF
	kperf_off_cpu(self);
#endif /* KPERF */

	/*
	 * This is where we actually switch register context,
	 * and address space if required.  We will next run
	 * as a result of a subsequent context switch.
	 *
	 * Once registers are switched and the processor is running "thread",
	 * the stack variables and non-volatile registers will contain whatever
	 * was there the last time that thread blocked. No local variables should
	 * be used after this point, except for the special case of "thread", which
	 * the platform layer returns as the previous thread running on the processor
	 * via the function call ABI as a return register, and "self", which may have
	 * been stored on the stack or a non-volatile register, but a stale idea of
	 * what was on the CPU is newly-accurate because that thread is again
	 * running on the CPU.
	 *
	 * If one of the threads is using a continuation, thread_continue
	 * is used to stitch up its context.
	 *
	 * If we are invoking a thread which is resuming from a continuation,
	 * the CPU will invoke thread_continue next.
	 *
	 * If the current thread is parking in a continuation, then its state
	 * won't be saved and the stack will be discarded. When the stack is
	 * re-allocated, it will be configured to resume from thread_continue.
	 */

	assert(continuation == self->continuation);
	thread = machine_switch_context(self, continuation, thread);
	assert(self == current_thread_volatile());
	TLOG(1, "thread_invoke: returning machine_switch_context: self %p continuation %p thread %p\n", self, continuation, thread);

	assert(continuation == NULL && self->continuation == NULL);

	DTRACE_SCHED(on__cpu);

#if KPERF
	kperf_on_cpu(self, NULL, __builtin_frame_address(0));
#endif /* KPERF */

	/* Previous snap on the old stack is gone. */
	recount_log_switch_thread_on(NULL);

	/* We have been resumed and are set to run. */
	thread_dispatch(thread, self);

	return TRUE;
}
3315 
3316 #if defined(CONFIG_SCHED_DEFERRED_AST)
3317 /*
3318  *	pset_cancel_deferred_dispatch:
3319  *
3320  *	Cancels all ASTs that we can cancel for the given processor set
3321  *	if the current processor is running the last runnable thread in the
3322  *	system.
3323  *
3324  *	This function assumes the current thread is runnable.  This must
3325  *	be called with the pset unlocked.
3326  */
3327 static void
pset_cancel_deferred_dispatch(processor_set_t pset,processor_t processor)3328 pset_cancel_deferred_dispatch(
3329 	processor_set_t         pset,
3330 	processor_t             processor)
3331 {
3332 	processor_t             active_processor = NULL;
3333 	uint32_t                sampled_sched_run_count;
3334 
3335 	pset_lock(pset);
3336 	sampled_sched_run_count = os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
3337 
3338 	/*
3339 	 * If we have emptied the run queue, and our current thread is runnable, we
3340 	 * should tell any processors that are still DISPATCHING that they will
3341 	 * probably not have any work to do.  In the event that there are no
3342 	 * pending signals that we can cancel, this is also uninteresting.
3343 	 *
3344 	 * In the unlikely event that another thread becomes runnable while we are
3345 	 * doing this (sched_run_count is atomically updated, not guarded), the
3346 	 * codepath making it runnable SHOULD (a dangerous word) need the pset lock
3347 	 * in order to dispatch it to a processor in our pset.  So, the other
3348 	 * codepath will wait while we squash all cancelable ASTs, get the pset
3349 	 * lock, and then dispatch the freshly runnable thread.  So this should be
3350 	 * correct (we won't accidentally have a runnable thread that hasn't been
3351 	 * dispatched to an idle processor), if not ideal (we may be restarting the
3352 	 * dispatch process, which could have some overhead).
3353 	 */
3354 
3355 	if ((sampled_sched_run_count == 1) && (pset->pending_deferred_AST_cpu_mask)) {
3356 		uint64_t dispatching_map = (pset->cpu_state_map[PROCESSOR_DISPATCHING] &
3357 		    pset->pending_deferred_AST_cpu_mask &
3358 		    ~pset->pending_AST_URGENT_cpu_mask);
3359 		for (int cpuid = lsb_first(dispatching_map); cpuid >= 0; cpuid = lsb_next(dispatching_map, cpuid)) {
3360 			active_processor = processor_array[cpuid];
3361 			/*
3362 			 * If a processor is DISPATCHING, it could be because of
3363 			 * a cancelable signal.
3364 			 *
3365 			 * IF the processor is not our
3366 			 * current processor (the current processor should not
3367 			 * be DISPATCHING, so this is a bit paranoid), AND there
3368 			 * is a cancelable signal pending on the processor, AND
3369 			 * there is no non-cancelable signal pending (as there is
3370 			 * no point trying to backtrack on bringing the processor
3371 			 * up if a signal we cannot cancel is outstanding), THEN
3372 			 * it should make sense to roll back the processor state
3373 			 * to the IDLE state.
3374 			 *
3375 			 * If the racey nature of this approach (as the signal
3376 			 * will be arbitrated by hardware, and can fire as we
3377 			 * roll back state) results in the core responding
3378 			 * despite being pushed back to the IDLE state, it
3379 			 * should be no different than if the core took some
3380 			 * interrupt while IDLE.
3381 			 */
3382 			if (active_processor != processor) {
3383 				/*
3384 				 * Squash all of the processor state back to some
3385 				 * reasonable facsimile of PROCESSOR_IDLE.
3386 				 */
3387 
3388 				processor_state_update_idle(active_processor);
3389 				active_processor->deadline = RT_DEADLINE_NONE;
3390 				pset_update_processor_state(pset, active_processor, PROCESSOR_IDLE);
3391 				bit_clear(pset->pending_deferred_AST_cpu_mask, active_processor->cpu_id);
3392 				machine_signal_idle_cancel(active_processor);
3393 			}
3394 		}
3395 	}
3396 
3397 	pset_unlock(pset);
3398 }
3399 #else
3400 /* We don't support deferred ASTs; everything is candycanes and sunshine. */
3401 #endif
3402 
3403 static void
thread_csw_callout(thread_t old,thread_t new,uint64_t timestamp)3404 thread_csw_callout(
3405 	thread_t            old,
3406 	thread_t            new,
3407 	uint64_t            timestamp)
3408 {
3409 	perfcontrol_event event = (new->state & TH_IDLE) ? IDLE : CONTEXT_SWITCH;
3410 	uint64_t same_pri_latency = (new->state & TH_IDLE) ? 0 : new->same_pri_latency;
3411 	machine_switch_perfcontrol_context(event, timestamp, 0,
3412 	    same_pri_latency, old, new);
3413 }
3414 
3415 
3416 /*
3417  *	thread_dispatch:
3418  *
3419  *	Handle threads at context switch.  Re-dispatch other thread
3420  *	if still running, otherwise update run state and perform
3421  *	special actions.  Update quantum for other thread and begin
3422  *	the quantum for ourselves.
3423  *
3424  *      "thread" is the old thread that we have switched away from.
3425  *      "self" is the new current thread that we have context switched to
3426  *
3427  *	Called at splsched.
3428  *
3429  */
3430 void
thread_dispatch(thread_t thread,thread_t self)3431 thread_dispatch(
3432 	thread_t                thread,
3433 	thread_t                self)
3434 {
3435 	processor_t             processor = self->last_processor;
3436 	bool was_idle = false;
3437 
3438 	assert(processor == current_processor());
3439 	assert(self == current_thread_volatile());
3440 	assert(thread != self);
3441 
3442 	if (thread != THREAD_NULL) {
3443 		/*
3444 		 * Do the perfcontrol callout for context switch.
3445 		 * The reason we do this here is:
3446 		 * - thread_dispatch() is called from various places that are not
3447 		 *   the direct context switch path for eg. processor shutdown etc.
3448 		 *   So adding the callout here covers all those cases.
3449 		 * - We want this callout as early as possible to be close
3450 		 *   to the timestamp taken in thread_invoke()
3451 		 * - We want to avoid holding the thread lock while doing the
3452 		 *   callout
3453 		 * - We do not want to callout if "thread" is NULL.
3454 		 */
3455 		thread_csw_callout(thread, self, processor->last_dispatch);
3456 
3457 #if KASAN
3458 		if (thread->continuation != NULL) {
3459 			/*
3460 			 * Thread has a continuation and the normal stack is going away.
3461 			 * Unpoison the stack and mark all fakestack objects as unused.
3462 			 */
3463 #if KASAN_CLASSIC
3464 			kasan_fakestack_drop(thread);
3465 #endif /* KASAN_CLASSIC */
3466 			if (thread->kernel_stack) {
3467 				kasan_unpoison_stack(thread->kernel_stack, kernel_stack_size);
3468 			}
3469 		}
3470 
3471 
3472 #if KASAN_CLASSIC
3473 		/*
3474 		 * Free all unused fakestack objects.
3475 		 */
3476 		kasan_fakestack_gc(thread);
3477 #endif /* KASAN_CLASSIC */
3478 #endif /* KASAN */
3479 
3480 		/*
3481 		 *	If blocked at a continuation, discard
3482 		 *	the stack.
3483 		 */
3484 		if (thread->continuation != NULL && thread->kernel_stack != 0) {
3485 			stack_free(thread);
3486 		}
3487 
3488 		if (thread->state & TH_IDLE) {
3489 			was_idle = true;
3490 			KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3491 			    MACHDBG_CODE(DBG_MACH_SCHED, MACH_DISPATCH) | DBG_FUNC_NONE,
3492 			    (uintptr_t)thread_tid(thread), 0, thread->state,
3493 			    sched_run_buckets[TH_BUCKET_RUN], 0);
3494 		} else {
3495 			int64_t consumed;
3496 			int64_t remainder = 0;
3497 
3498 			if (processor->quantum_end > processor->last_dispatch) {
3499 				remainder = processor->quantum_end -
3500 				    processor->last_dispatch;
3501 			}
3502 
3503 			consumed = thread->quantum_remaining - remainder;
3504 
3505 			if ((thread->reason & AST_LEDGER) == 0) {
3506 				/*
3507 				 * Bill CPU time to both the task and
3508 				 * the individual thread.
3509 				 */
3510 				ledger_credit_thread(thread, thread->t_ledger,
3511 				    task_ledgers.cpu_time, consumed);
3512 				ledger_credit_thread(thread, thread->t_threadledger,
3513 				    thread_ledgers.cpu_time, consumed);
3514 				if (thread->t_bankledger) {
3515 					ledger_credit_thread(thread, thread->t_bankledger,
3516 					    bank_ledgers.cpu_time,
3517 					    (consumed - thread->t_deduct_bank_ledger_time));
3518 				}
3519 				thread->t_deduct_bank_ledger_time = 0;
3520 				if (consumed > 0) {
3521 					/*
3522 					 * This should never be negative, but in traces we are seeing some instances
3523 					 * of consumed being negative.
3524 					 * <rdar://problem/57782596> thread_dispatch() thread CPU consumed calculation sometimes results in negative value
3525 					 */
3526 					sched_update_pset_avg_execution_time(current_processor()->processor_set, consumed, processor->last_dispatch, thread->th_sched_bucket);
3527 				}
3528 			}
3529 
3530 			/* For the thread that we just context switched away from, figure
3531 			 * out if we have expired the wq quantum and set the AST if we have
3532 			 */
3533 			if (thread_get_tag(thread) & THREAD_TAG_WORKQUEUE) {
3534 				thread_evaluate_workqueue_quantum_expiry(thread);
3535 			}
3536 
3537 			if (__improbable(thread->rwlock_count != 0)) {
3538 				smr_mark_active_trackers_stalled(thread);
3539 			}
3540 
3541 			/*
3542 			 * Pairs with task_restartable_ranges_synchronize
3543 			 */
3544 			wake_lock(thread);
3545 			thread_lock(thread);
3546 
3547 			/*
3548 			 * Same as ast_check(), in case we missed the IPI
3549 			 */
3550 			thread_reset_pcs_ack_IPI(thread);
3551 
3552 			/*
3553 			 * Apply a priority floor if the thread holds a kernel resource
3554 			 * or explicitly requested it.
3555 			 * Do this before checking starting_pri to avoid overpenalizing
3556 			 * repeated rwlock blockers.
3557 			 */
3558 			if (__improbable(thread->rwlock_count != 0)) {
3559 				lck_rw_set_promotion_locked(thread);
3560 			}
3561 			if (__improbable(thread->priority_floor_count != 0)) {
3562 				thread_floor_boost_set_promotion_locked(thread);
3563 			}
3564 
3565 			boolean_t keep_quantum = processor->first_timeslice;
3566 
3567 			/*
3568 			 * Treat a thread which has dropped priority since it got on core
3569 			 * as having expired its quantum.
3570 			 */
3571 			if (processor->starting_pri > thread->sched_pri) {
3572 				keep_quantum = FALSE;
3573 			}
3574 
3575 			/* Compute remainder of current quantum. */
3576 			if (keep_quantum &&
3577 			    processor->quantum_end > processor->last_dispatch) {
3578 				thread->quantum_remaining = (uint32_t)remainder;
3579 			} else {
3580 				thread->quantum_remaining = 0;
3581 			}
3582 
3583 			if (thread->sched_mode == TH_MODE_REALTIME) {
3584 				/*
3585 				 *	Cancel the deadline if the thread has
3586 				 *	consumed the entire quantum.
3587 				 */
3588 				if (thread->quantum_remaining == 0) {
3589 					KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_CANCEL_RT_DEADLINE) | DBG_FUNC_NONE,
3590 					    (uintptr_t)thread_tid(thread), thread->realtime.deadline, thread->realtime.computation, 0);
3591 					thread->realtime.deadline = RT_DEADLINE_QUANTUM_EXPIRED;
3592 				}
3593 			} else {
3594 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
3595 				/*
3596 				 *	For non-realtime threads treat a tiny
3597 				 *	remaining quantum as an expired quantum
3598 				 *	but include what's left next time.
3599 				 */
3600 				if (thread->quantum_remaining < min_std_quantum) {
3601 					thread->reason |= AST_QUANTUM;
3602 					thread->quantum_remaining += SCHED(initial_quantum_size)(thread);
3603 				}
3604 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
3605 			}
3606 
3607 			/*
3608 			 *	If we are doing a direct handoff then
3609 			 *	take the remainder of the quantum.
3610 			 */
3611 			if ((thread->reason & (AST_HANDOFF | AST_QUANTUM)) == AST_HANDOFF) {
3612 				self->quantum_remaining = thread->quantum_remaining;
3613 				thread->reason |= AST_QUANTUM;
3614 				thread->quantum_remaining = 0;
3615 			} else {
3616 #if defined(CONFIG_SCHED_MULTIQ)
3617 				if (SCHED(sched_groups_enabled) &&
3618 				    thread->sched_group == self->sched_group) {
3619 					KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3620 					    MACHDBG_CODE(DBG_MACH_SCHED, MACH_QUANTUM_HANDOFF),
3621 					    self->reason, (uintptr_t)thread_tid(thread),
3622 					    self->quantum_remaining, thread->quantum_remaining, 0);
3623 
3624 					self->quantum_remaining = thread->quantum_remaining;
3625 					thread->quantum_remaining = 0;
3626 					/* Don't set AST_QUANTUM here - old thread might still want to preempt someone else */
3627 				}
3628 #endif /* defined(CONFIG_SCHED_MULTIQ) */
3629 			}
3630 
3631 			thread->computation_metered += (processor->last_dispatch - thread->computation_epoch);
3632 
3633 			if (!(thread->state & TH_WAIT)) {
3634 				/*
3635 				 *	Still runnable.
3636 				 */
3637 				thread->last_made_runnable_time = thread->last_basepri_change_time = processor->last_dispatch;
3638 
3639 				machine_thread_going_off_core(thread, FALSE, processor->last_dispatch, TRUE);
3640 
3641 				ast_t reason = thread->reason;
3642 				sched_options_t options = SCHED_NONE;
3643 
3644 				if (reason & AST_REBALANCE) {
3645 					options |= SCHED_REBALANCE;
3646 					if (reason & AST_QUANTUM) {
3647 						/*
3648 						 * Having gone to the trouble of forcing this thread off a less preferred core,
3649 						 * we should force the preferable core to reschedule immediately to give this
3650 						 * thread a chance to run instead of just sitting on the run queue where
3651 						 * it may just be stolen back by the idle core we just forced it off.
3652 						 * But only do this at the end of a quantum to prevent cascading effects.
3653 						 */
3654 						options |= SCHED_PREEMPT;
3655 					}
3656 				}
3657 
3658 				if (reason & AST_QUANTUM) {
3659 					options |= SCHED_TAILQ;
3660 				} else if (reason & AST_PREEMPT) {
3661 					options |= SCHED_HEADQ;
3662 				} else {
3663 					options |= (SCHED_PREEMPT | SCHED_TAILQ);
3664 				}
3665 
3666 				thread_setrun(thread, options);
3667 
3668 				KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3669 				    MACHDBG_CODE(DBG_MACH_SCHED, MACH_DISPATCH) | DBG_FUNC_NONE,
3670 				    (uintptr_t)thread_tid(thread), thread->reason, thread->state,
3671 				    sched_run_buckets[TH_BUCKET_RUN], 0);
3672 
3673 				if (thread->wake_active) {
3674 					thread->wake_active = FALSE;
3675 					thread_unlock(thread);
3676 
3677 					thread_wakeup(&thread->wake_active);
3678 				} else {
3679 					thread_unlock(thread);
3680 				}
3681 
3682 				wake_unlock(thread);
3683 			} else {
3684 				/*
3685 				 *	Waiting.
3686 				 */
3687 				boolean_t should_terminate = FALSE;
3688 				uint32_t new_run_count;
3689 				int thread_state = thread->state;
3690 
3691 				/* Only the first call to thread_dispatch
3692 				 * after explicit termination should add
3693 				 * the thread to the termination queue
3694 				 */
3695 				if ((thread_state & (TH_TERMINATE | TH_TERMINATE2)) == TH_TERMINATE) {
3696 					should_terminate = TRUE;
3697 					thread_state |= TH_TERMINATE2;
3698 				}
3699 
3700 				timer_stop(&thread->runnable_timer, processor->last_dispatch);
3701 
3702 				thread_state &= ~TH_RUN;
3703 				thread->state = thread_state;
3704 
3705 				thread->last_made_runnable_time = thread->last_basepri_change_time = THREAD_NOT_RUNNABLE;
3706 				thread->chosen_processor = PROCESSOR_NULL;
3707 
3708 				new_run_count = SCHED(run_count_decr)(thread);
3709 
3710 #if CONFIG_SCHED_AUTO_JOIN
3711 				if ((thread->sched_flags & TH_SFLAG_THREAD_GROUP_AUTO_JOIN) != 0) {
3712 					work_interval_auto_join_unwind(thread);
3713 				}
3714 #endif /* CONFIG_SCHED_AUTO_JOIN */
3715 
3716 #if CONFIG_SCHED_SFI
3717 				if (thread->reason & AST_SFI) {
3718 					thread->wait_sfi_begin_time = processor->last_dispatch;
3719 				}
3720 #endif
3721 				machine_thread_going_off_core(thread, should_terminate, processor->last_dispatch, FALSE);
3722 
3723 				KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3724 				    MACHDBG_CODE(DBG_MACH_SCHED, MACH_DISPATCH) | DBG_FUNC_NONE,
3725 				    (uintptr_t)thread_tid(thread), thread->reason, thread_state,
3726 				    new_run_count, 0);
3727 
3728 				if (thread_state & TH_WAIT_REPORT) {
3729 					(*thread->sched_call)(SCHED_CALL_BLOCK, thread);
3730 				}
3731 
3732 				if (thread->wake_active) {
3733 					thread->wake_active = FALSE;
3734 					thread_unlock(thread);
3735 
3736 					thread_wakeup(&thread->wake_active);
3737 				} else {
3738 					thread_unlock(thread);
3739 				}
3740 
3741 				wake_unlock(thread);
3742 
3743 				if (should_terminate) {
3744 					thread_terminate_enqueue(thread);
3745 				}
3746 			}
3747 		}
3748 		/*
3749 		 * The thread could have been added to the termination queue, so it's
3750 		 * unsafe to use after this point.
3751 		 */
3752 		thread = THREAD_NULL;
3753 	}
3754 
3755 	int urgency = THREAD_URGENCY_NONE;
3756 	uint64_t latency = 0;
3757 
3758 	/* Update (new) current thread and reprogram running timers */
3759 	thread_lock(self);
3760 
3761 	if (!(self->state & TH_IDLE)) {
3762 		uint64_t        arg1, arg2;
3763 
3764 #if CONFIG_SCHED_SFI
3765 		ast_t                   new_ast;
3766 
3767 		new_ast = sfi_thread_needs_ast(self, NULL);
3768 
3769 		if (new_ast != AST_NONE) {
3770 			ast_on(new_ast);
3771 		}
3772 #endif
3773 
3774 		if (processor->last_dispatch < self->last_made_runnable_time) {
3775 			panic("Non-monotonic time: dispatch at 0x%llx, runnable at 0x%llx",
3776 			    processor->last_dispatch, self->last_made_runnable_time);
3777 		}
3778 
3779 		assert(self->last_made_runnable_time <= self->last_basepri_change_time);
3780 
3781 		latency = processor->last_dispatch - self->last_made_runnable_time;
3782 		assert(latency >= self->same_pri_latency);
3783 
3784 		urgency = thread_get_urgency(self, &arg1, &arg2);
3785 
3786 		thread_tell_urgency(urgency, arg1, arg2, latency, self);
3787 
3788 		/*
3789 		 *	Start a new CPU limit interval if the previous one has
3790 		 *	expired. This should happen before initializing a new
3791 		 *	quantum.
3792 		 */
3793 		if (cpulimit_affects_quantum &&
3794 		    thread_cpulimit_interval_has_expired(processor->last_dispatch)) {
3795 			thread_cpulimit_restart(processor->last_dispatch);
3796 		}
3797 
3798 		/*
3799 		 *	Get a new quantum if none remaining.
3800 		 */
3801 		if (self->quantum_remaining == 0) {
3802 			thread_quantum_init(self, processor->last_dispatch);
3803 		}
3804 
3805 		/*
3806 		 *	Set up quantum timer and timeslice.
3807 		 */
3808 		processor->quantum_end = processor->last_dispatch +
3809 		    self->quantum_remaining;
3810 
3811 		running_timer_setup(processor, RUNNING_TIMER_QUANTUM, self,
3812 		    processor->quantum_end, processor->last_dispatch);
3813 		if (was_idle) {
3814 			/*
3815 			 * kperf's running timer is active whenever the idle thread for a
3816 			 * CPU is not running.
3817 			 */
3818 			kperf_running_setup(processor, processor->last_dispatch);
3819 		}
3820 		running_timers_activate(processor);
3821 		processor->first_timeslice = TRUE;
3822 	} else {
3823 		running_timers_deactivate(processor);
3824 		processor->first_timeslice = FALSE;
3825 		thread_tell_urgency(THREAD_URGENCY_NONE, 0, 0, 0, self);
3826 	}
3827 
3828 	assert(self->block_hint == kThreadWaitNone);
3829 	self->computation_epoch = processor->last_dispatch;
3830 	/*
3831 	 * This relies on the interrupt time being tallied up to the thread in the
3832 	 * exception handler epilogue, which is before AST context where preemption
3833 	 * is considered (and the scheduler is potentially invoked to
3834 	 * context switch, here).
3835 	 */
3836 	self->computation_interrupt_epoch = recount_current_thread_interrupt_time_mach();
3837 	self->reason = AST_NONE;
3838 	processor->starting_pri = self->sched_pri;
3839 
3840 	thread_unlock(self);
3841 
3842 	machine_thread_going_on_core(self, urgency, latency, self->same_pri_latency,
3843 	    processor->last_dispatch);
3844 
3845 #if defined(CONFIG_SCHED_DEFERRED_AST)
3846 	/*
3847 	 * TODO: Can we state that redispatching our old thread is also
3848 	 * uninteresting?
3849 	 */
3850 	if ((os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed) == 1) && !(self->state & TH_IDLE)) {
3851 		pset_cancel_deferred_dispatch(processor->processor_set, processor);
3852 	}
3853 #endif
3854 }
3855 
3856 /*
3857  *	thread_block_reason:
3858  *
3859  *	Forces a reschedule, blocking the caller if a wait
3860  *	has been asserted.
3861  *
3862  *	If a continuation is specified, then thread_invoke will
3863  *	attempt to discard the thread's kernel stack.  When the
3864  *	thread resumes, it will execute the continuation function
3865  *	on a new kernel stack.
3866  */
3867 wait_result_t
thread_block_reason(thread_continue_t continuation,void * parameter,ast_t reason)3868 thread_block_reason(
3869 	thread_continue_t       continuation,
3870 	void                            *parameter,
3871 	ast_t                           reason)
3872 {
3873 	thread_t        self = current_thread();
3874 	processor_t     processor;
3875 	thread_t        new_thread;
3876 	spl_t           s;
3877 
3878 	s = splsched();
3879 
3880 	processor = current_processor();
3881 
3882 	/* If we're explicitly yielding, force a subsequent quantum */
3883 	if (reason & AST_YIELD) {
3884 		processor->first_timeslice = FALSE;
3885 	}
3886 
3887 	/* We're handling all scheduling AST's */
3888 	ast_off(AST_SCHEDULING);
3889 
3890 	clear_pending_nonurgent_preemption(processor);
3891 
3892 #if PROC_REF_DEBUG
3893 	if ((continuation != NULL) && (get_threadtask(self) != kernel_task)) {
3894 		uthread_assert_zero_proc_refcount(get_bsdthread_info(self));
3895 	}
3896 #endif
3897 
3898 	self->continuation = continuation;
3899 	self->parameter = parameter;
3900 
3901 	if (self->state & ~(TH_RUN | TH_IDLE)) {
3902 		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3903 		    MACHDBG_CODE(DBG_MACH_SCHED, MACH_BLOCK),
3904 		    reason, VM_KERNEL_UNSLIDE(continuation), 0, 0, 0);
3905 	}
3906 
3907 	do {
3908 		thread_lock(self);
3909 		new_thread = thread_select(self, processor, &reason);
3910 		thread_unlock(self);
3911 	} while (!thread_invoke(self, new_thread, reason));
3912 
3913 	splx(s);
3914 
3915 	return self->wait_result;
3916 }
3917 
/*
 *	thread_block:
 *
 *	Block the current thread if a wait has been asserted.
 *
 *	Returns the wait result once the thread resumes (or immediately,
 *	if no wait was asserted and the thread keeps running).
 */
wait_result_t
thread_block(
	thread_continue_t       continuation)
{
	/* Shorthand for thread_block_reason() with no parameter and no AST reason. */
	return thread_block_reason(continuation, NULL, AST_NONE);
}
3929 
/*
 *	thread_block_parameter:
 *
 *	Block the current thread if a wait has been asserted, passing
 *	`parameter` through to the continuation when it later runs.
 */
wait_result_t
thread_block_parameter(
	thread_continue_t       continuation,
	void                            *parameter)
{
	return thread_block_reason(continuation, parameter, AST_NONE);
}
3937 
3938 /*
3939  *	thread_run:
3940  *
3941  *	Switch directly from the current thread to the
3942  *	new thread, handing off our quantum if appropriate.
3943  *
3944  *	New thread must be runnable, and not on a run queue.
3945  *
3946  *	Called at splsched.
3947  */
3948 int
thread_run(thread_t self,thread_continue_t continuation,void * parameter,thread_t new_thread)3949 thread_run(
3950 	thread_t                        self,
3951 	thread_continue_t       continuation,
3952 	void                            *parameter,
3953 	thread_t                        new_thread)
3954 {
3955 	ast_t reason = AST_NONE;
3956 
3957 	if ((self->state & TH_IDLE) == 0) {
3958 		reason = AST_HANDOFF;
3959 	}
3960 
3961 	/*
3962 	 * If this thread hadn't been setrun'ed, it
3963 	 * might not have a chosen processor, so give it one
3964 	 */
3965 	if (new_thread->chosen_processor == NULL) {
3966 		new_thread->chosen_processor = current_processor();
3967 	}
3968 
3969 	self->continuation = continuation;
3970 	self->parameter = parameter;
3971 
3972 	while (!thread_invoke(self, new_thread, reason)) {
3973 		/* the handoff failed, so we have to fall back to the normal block path */
3974 		processor_t processor = current_processor();
3975 
3976 		reason = AST_NONE;
3977 
3978 		thread_lock(self);
3979 		new_thread = thread_select(self, processor, &reason);
3980 		thread_unlock(self);
3981 	}
3982 
3983 	return self->wait_result;
3984 }
3985 
/*
 *	thread_continue:
 *
 *	Called at splsched when a thread first receives
 *	a new stack after a continuation.
 *
 *	Called with THREAD_NULL as the old thread when
 *	invoked by machine_load_context.
 */
void
thread_continue(
	thread_t        thread)
{
	thread_t                self = current_thread();
	thread_continue_t       continuation;
	void                    *parameter;

	DTRACE_SCHED(on__cpu);

	/* Capture the continuation and its argument before they are cleared below. */
	continuation = self->continuation;
	parameter = self->parameter;

	/* We only get here via a continuation, so one must be set. */
	assert(continuation != NULL);

#if KPERF
	kperf_on_cpu(self, continuation, NULL);
#endif

	/* Finish dispatching the thread we switched away from (THREAD_NULL at boot). */
	thread_dispatch(thread, self);

	self->continuation = self->parameter = NULL;

#if SCHED_HYGIENE_DEBUG
	/* Reset interrupt-masked spin debugging timeout */
	ml_spin_debug_clear(self);
#endif

	TLOG(1, "thread_continue: calling call_continuation\n");

	boolean_t enable_interrupts = TRUE;

	/* bootstrap thread, idle thread need to stay interrupts-disabled */
	if (thread == THREAD_NULL || (self->state & TH_IDLE)) {
		enable_interrupts = FALSE;
	}

#if KASAN_TBI
	/* Clear stale shadow state for the freshly attached stack. */
	kasan_unpoison_stack(self->kernel_stack, kernel_stack_size);
#endif /* KASAN_TBI */


	/* Runs the continuation on the new stack; never returns here. */
	call_continuation(continuation, parameter, self->wait_result, enable_interrupts);
	/*NOTREACHED*/
}
4040 
4041 void
thread_quantum_init(thread_t thread,uint64_t now)4042 thread_quantum_init(thread_t thread, uint64_t now)
4043 {
4044 	uint64_t new_quantum = 0;
4045 
4046 	switch (thread->sched_mode) {
4047 	case TH_MODE_REALTIME:
4048 		new_quantum = thread->realtime.computation;
4049 		new_quantum = MIN(new_quantum, max_unsafe_rt_computation);
4050 		break;
4051 
4052 	case TH_MODE_FIXED:
4053 		new_quantum = SCHED(initial_quantum_size)(thread);
4054 		new_quantum = MIN(new_quantum, max_unsafe_fixed_computation);
4055 		break;
4056 
4057 	default:
4058 		new_quantum = SCHED(initial_quantum_size)(thread);
4059 		break;
4060 	}
4061 
4062 	if (cpulimit_affects_quantum) {
4063 		const uint64_t cpulimit_remaining = thread_cpulimit_remaining(now);
4064 
4065 		/*
4066 		 * If there's no remaining CPU time, the ledger system will
4067 		 * notice and put the thread to sleep.
4068 		 */
4069 		if (cpulimit_remaining > 0) {
4070 			new_quantum = MIN(new_quantum, cpulimit_remaining);
4071 		}
4072 	}
4073 
4074 	assert3u(new_quantum, <, UINT32_MAX);
4075 	assert3u(new_quantum, >, 0);
4076 
4077 	thread->quantum_remaining = (uint32_t)new_quantum;
4078 }
4079 
4080 uint32_t
sched_timeshare_initial_quantum_size(thread_t thread)4081 sched_timeshare_initial_quantum_size(thread_t thread)
4082 {
4083 	if ((thread != THREAD_NULL) && thread->th_sched_bucket == TH_BUCKET_SHARE_BG) {
4084 		return bg_quantum;
4085 	} else {
4086 		return std_quantum;
4087 	}
4088 }
4089 
4090 /*
4091  *	run_queue_init:
4092  *
4093  *	Initialize a run queue before first use.
4094  */
4095 void
run_queue_init(run_queue_t rq)4096 run_queue_init(
4097 	run_queue_t             rq)
4098 {
4099 	rq->highq = NOPRI;
4100 	for (u_int i = 0; i < BITMAP_LEN(NRQS); i++) {
4101 		rq->bitmap[i] = 0;
4102 	}
4103 	rq->urgency = rq->count = 0;
4104 	for (int i = 0; i < NRQS; i++) {
4105 		circle_queue_init(&rq->queues[i]);
4106 	}
4107 }
4108 
4109 /*
4110  *	run_queue_dequeue:
4111  *
4112  *	Perform a dequeue operation on a run queue,
4113  *	and return the resulting thread.
4114  *
4115  *	The run queue must be locked (see thread_run_queue_remove()
4116  *	for more info), and not empty.
4117  */
4118 thread_t
run_queue_dequeue(run_queue_t rq,sched_options_t options)4119 run_queue_dequeue(
4120 	run_queue_t     rq,
4121 	sched_options_t options)
4122 {
4123 	thread_t        thread;
4124 	circle_queue_t  queue = &rq->queues[rq->highq];
4125 
4126 	if (options & SCHED_HEADQ) {
4127 		thread = cqe_dequeue_head(queue, struct thread, runq_links);
4128 	} else {
4129 		thread = cqe_dequeue_tail(queue, struct thread, runq_links);
4130 	}
4131 
4132 	assert(thread != THREAD_NULL);
4133 	assert_thread_magic(thread);
4134 
4135 	thread_clear_runq(thread);
4136 	SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
4137 	rq->count--;
4138 	if (SCHED(priority_is_urgent)(rq->highq)) {
4139 		rq->urgency--; assert(rq->urgency >= 0);
4140 	}
4141 	if (circle_queue_empty(queue)) {
4142 		bitmap_clear(rq->bitmap, rq->highq);
4143 		rq->highq = bitmap_first(rq->bitmap, NRQS);
4144 	}
4145 
4146 	return thread;
4147 }
4148 
4149 /*
4150  *	run_queue_enqueue:
4151  *
4152  *	Perform a enqueue operation on a run queue.
4153  *
4154  *	The run queue must be locked (see thread_run_queue_remove()
4155  *	for more info).
4156  */
4157 boolean_t
run_queue_enqueue(run_queue_t rq,thread_t thread,sched_options_t options)4158 run_queue_enqueue(
4159 	run_queue_t      rq,
4160 	thread_t         thread,
4161 	sched_options_t  options)
4162 {
4163 	circle_queue_t  queue = &rq->queues[thread->sched_pri];
4164 	boolean_t       result = FALSE;
4165 
4166 	assert_thread_magic(thread);
4167 
4168 	if (circle_queue_empty(queue)) {
4169 		circle_enqueue_tail(queue, &thread->runq_links);
4170 
4171 		rq_bitmap_set(rq->bitmap, thread->sched_pri);
4172 		if (thread->sched_pri > rq->highq) {
4173 			rq->highq = thread->sched_pri;
4174 			result = TRUE;
4175 		}
4176 	} else {
4177 		if (options & SCHED_TAILQ) {
4178 			circle_enqueue_tail(queue, &thread->runq_links);
4179 		} else {
4180 			circle_enqueue_head(queue, &thread->runq_links);
4181 		}
4182 	}
4183 	if (SCHED(priority_is_urgent)(thread->sched_pri)) {
4184 		rq->urgency++;
4185 	}
4186 	SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
4187 	rq->count++;
4188 
4189 	return result;
4190 }
4191 
4192 /*
4193  *	run_queue_remove:
4194  *
4195  *	Remove a specific thread from a runqueue.
4196  *
4197  *	The run queue must be locked.
4198  */
4199 void
run_queue_remove(run_queue_t rq,thread_t thread)4200 run_queue_remove(
4201 	run_queue_t    rq,
4202 	thread_t       thread)
4203 {
4204 	circle_queue_t  queue = &rq->queues[thread->sched_pri];
4205 
4206 	thread_assert_runq_nonnull(thread);
4207 	assert_thread_magic(thread);
4208 
4209 	circle_dequeue(queue, &thread->runq_links);
4210 	SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
4211 	rq->count--;
4212 	if (SCHED(priority_is_urgent)(thread->sched_pri)) {
4213 		rq->urgency--; assert(rq->urgency >= 0);
4214 	}
4215 
4216 	if (circle_queue_empty(queue)) {
4217 		/* update run queue status */
4218 		bitmap_clear(rq->bitmap, thread->sched_pri);
4219 		rq->highq = bitmap_first(rq->bitmap, NRQS);
4220 	}
4221 
4222 	thread_clear_runq(thread);
4223 }
4224 
4225 /*
4226  *      run_queue_peek
4227  *
4228  *      Peek at the runq and return the highest
4229  *      priority thread from the runq.
4230  *
4231  *	The run queue must be locked.
4232  */
4233 thread_t
run_queue_peek(run_queue_t rq)4234 run_queue_peek(
4235 	run_queue_t    rq)
4236 {
4237 	if (rq->count > 0) {
4238 		circle_queue_t queue = &rq->queues[rq->highq];
4239 		thread_t thread = cqe_queue_first(queue, struct thread, runq_links);
4240 		assert_thread_magic(thread);
4241 		return thread;
4242 	} else {
4243 		return THREAD_NULL;
4244 	}
4245 }
4246 
/*
 * Insert a realtime thread into this RT run queue, keeping each per-priority
 * queue sorted by increasing deadline.  Returns true when the caller should
 * consider preempting, i.e. the thread became the head (earliest deadline) of
 * its priority level's queue.
 *
 * Called with the run queue appropriately locked; marks the thread on-runq
 * via thread_set_runq_locked() before returning.
 */
static bool
rt_runq_enqueue(rt_queue_t rt_run_queue, thread_t thread, processor_t processor)
{
	int pri = thread->sched_pri;
	assert((pri >= BASEPRI_RTQUEUES) && (pri <= MAXPRI));
	int i = pri - BASEPRI_RTQUEUES;
	rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i];
	bitmap_t *map = rt_run_queue->bitmap;

	/* Mark this priority level occupied. */
	bitmap_set(map, i);

	queue_t     queue       = &rt_runq->pri_queue;
	uint64_t    deadline    = thread->realtime.deadline;
	bool        preempt     = false;
	bool        earliest    = false;

	if (queue_empty(queue)) {
		enqueue_tail(queue, &thread->runq_links);
		preempt = true;
		earliest = true;
		rt_runq->pri_earliest_deadline = deadline;
		rt_runq->pri_constraint = thread->realtime.constraint;
	} else {
		/* Insert into rt_runq in thread deadline order */
		queue_entry_t iter;
		qe_foreach(iter, queue) {
			thread_t iter_thread = qe_element(iter, struct thread, runq_links);
			assert_thread_magic(iter_thread);

			if (deadline < iter_thread->realtime.deadline) {
				/* Goes ahead of the first entry with a later deadline. */
				if (iter == queue_first(queue)) {
					preempt = true;
					earliest = true;
					rt_runq->pri_earliest_deadline = deadline;
					rt_runq->pri_constraint = thread->realtime.constraint;
				}
				insque(&thread->runq_links, queue_prev(iter));
				break;
			} else if (iter == queue_last(queue)) {
				/* Latest deadline of all: append at the tail. */
				enqueue_tail(queue, &thread->runq_links);
				break;
			}
		}
	}
	/* Publish a new queue-wide earliest deadline if this thread beat it. */
	if (earliest && (deadline < os_atomic_load_wide(&rt_run_queue->earliest_deadline, relaxed))) {
		os_atomic_store_wide(&rt_run_queue->earliest_deadline, deadline, relaxed);
		os_atomic_store(&rt_run_queue->constraint, thread->realtime.constraint, relaxed);
		os_atomic_store(&rt_run_queue->ed_index, pri - BASEPRI_RTQUEUES, relaxed);
	}

	SCHED_STATS_RUNQ_CHANGE(&rt_run_queue->runq_stats, os_atomic_load(&rt_run_queue->count, relaxed));
	rt_runq->pri_count++;
	os_atomic_inc(&rt_run_queue->count, relaxed);

	thread_set_runq_locked(thread, processor);

	CHECK_RT_RUNQ_CONSISTENCY(rt_run_queue, thread);

	return preempt;
}
4307 
/*
 * Remove and return the next realtime thread to run from this RT run queue.
 *
 * By default this is the head (earliest deadline) of the highest-priority
 * non-empty level.  When sched_rt_runq_strict_priority is disabled, the
 * globally earliest-deadline thread at a lower priority may be chosen
 * instead, provided both computations (plus rt_deadline_epsilon) still fit
 * within the higher-priority thread's constraint.
 *
 * Caller must hold the run-queue lock; the queue must not be empty.
 */
static thread_t
rt_runq_dequeue(rt_queue_t rt_run_queue)
{
	bitmap_t *map = rt_run_queue->bitmap;
	int i = bitmap_first(map, NRTQS);
	assert((i >= 0) && (i < NRTQS));

	rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i];

	if (!sched_rt_runq_strict_priority) {
		int ed_index = os_atomic_load(&rt_run_queue->ed_index, relaxed);
		if (ed_index != i) {
			assert((ed_index >= 0) && (ed_index < NRTQS));
			rt_queue_pri_t *ed_runq = &rt_run_queue->rt_queue_pri[ed_index];

			thread_t ed_thread = qe_queue_first(&ed_runq->pri_queue, struct thread, runq_links);
			thread_t hi_thread = qe_queue_first(&rt_runq->pri_queue, struct thread, runq_links);

			if (ed_thread->realtime.computation + hi_thread->realtime.computation + rt_deadline_epsilon < hi_thread->realtime.constraint) {
				/* choose the earliest deadline thread */
				rt_runq = ed_runq;
				i = ed_index;
			}
		}
	}

	assert(rt_runq->pri_count > 0);
	uint64_t earliest_deadline = RT_DEADLINE_NONE;
	uint32_t constraint = RT_CONSTRAINT_NONE;
	int ed_index = NOPRI;
	thread_t new_thread = qe_dequeue_head(&rt_runq->pri_queue, struct thread, runq_links);
	SCHED_STATS_RUNQ_CHANGE(&rt_run_queue->runq_stats, os_atomic_load(&rt_run_queue->count, relaxed));
	if (--rt_runq->pri_count > 0) {
		/* The new head defines this level's earliest deadline. */
		thread_t next_rt = qe_queue_first(&rt_runq->pri_queue, struct thread, runq_links);
		assert(next_rt != THREAD_NULL);
		earliest_deadline = next_rt->realtime.deadline;
		constraint = next_rt->realtime.constraint;
		ed_index = i;
	} else {
		/* This priority level is now empty. */
		bitmap_clear(map, i);
	}
	rt_runq->pri_earliest_deadline = earliest_deadline;
	rt_runq->pri_constraint = constraint;

	/* Re-derive the queue-wide earliest deadline across all remaining levels. */
	for (i = bitmap_first(map, NRTQS); i >= 0; i = bitmap_next(map, i)) {
		rt_runq = &rt_run_queue->rt_queue_pri[i];
		if (rt_runq->pri_earliest_deadline < earliest_deadline) {
			earliest_deadline = rt_runq->pri_earliest_deadline;
			constraint = rt_runq->pri_constraint;
			ed_index = i;
		}
	}
	os_atomic_store_wide(&rt_run_queue->earliest_deadline, earliest_deadline, relaxed);
	os_atomic_store(&rt_run_queue->constraint, constraint, relaxed);
	os_atomic_store(&rt_run_queue->ed_index, ed_index, relaxed);
	os_atomic_dec(&rt_run_queue->count, relaxed);

	thread_clear_runq(new_thread);

	CHECK_RT_RUNQ_CONSISTENCY(rt_run_queue, THREAD_NULL);

	return new_thread;
}
4371 
4372 static thread_t
rt_runq_first(rt_queue_t rt_run_queue)4373 rt_runq_first(rt_queue_t rt_run_queue)
4374 {
4375 	bitmap_t *map = rt_run_queue->bitmap;
4376 	int i = bitmap_first(map, NRTQS);
4377 	if (i < 0) {
4378 		return THREAD_NULL;
4379 	}
4380 	rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i];
4381 	thread_t next_rt = qe_queue_first(&rt_runq->pri_queue, struct thread, runq_links);
4382 
4383 	return next_rt;
4384 }
4385 
/*
 * Remove a specific thread from this RT run queue, then republish the
 * per-priority and queue-wide earliest-deadline/constraint state
 * (same recomputation as rt_runq_dequeue()).
 *
 * Caller must hold the run-queue lock; the thread's runq binding is
 * cleared via thread_clear_runq_locked().
 */
static void
rt_runq_remove(rt_queue_t rt_run_queue, thread_t thread)
{
	CHECK_RT_RUNQ_CONSISTENCY(rt_run_queue, thread);

	int pri = thread->sched_pri;
	assert((pri >= BASEPRI_RTQUEUES) && (pri <= MAXPRI));
	int i = pri - BASEPRI_RTQUEUES;
	rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i];
	bitmap_t *map = rt_run_queue->bitmap;

	assert(rt_runq->pri_count > 0);
	uint64_t earliest_deadline = RT_DEADLINE_NONE;
	uint32_t constraint = RT_CONSTRAINT_NONE;
	int ed_index = NOPRI;
	remqueue(&thread->runq_links);
	SCHED_STATS_RUNQ_CHANGE(&rt_run_queue->runq_stats, os_atomic_load(&rt_run_queue->count, relaxed));
	if (--rt_runq->pri_count > 0) {
		/* The remaining head defines this level's earliest deadline. */
		thread_t next_rt = qe_queue_first(&rt_runq->pri_queue, struct thread, runq_links);
		earliest_deadline = next_rt->realtime.deadline;
		constraint = next_rt->realtime.constraint;
		ed_index = i;
	} else {
		/* This priority level is now empty. */
		bitmap_clear(map, i);
	}
	rt_runq->pri_earliest_deadline = earliest_deadline;
	rt_runq->pri_constraint = constraint;

	/* Re-derive the queue-wide earliest deadline across all remaining levels. */
	for (i = bitmap_first(map, NRTQS); i >= 0; i = bitmap_next(map, i)) {
		rt_runq = &rt_run_queue->rt_queue_pri[i];
		if (rt_runq->pri_earliest_deadline < earliest_deadline) {
			earliest_deadline = rt_runq->pri_earliest_deadline;
			constraint = rt_runq->pri_constraint;
			ed_index = i;
		}
	}
	os_atomic_store_wide(&rt_run_queue->earliest_deadline, earliest_deadline, relaxed);
	os_atomic_store(&rt_run_queue->constraint, constraint, relaxed);
	os_atomic_store(&rt_run_queue->ed_index, ed_index, relaxed);
	os_atomic_dec(&rt_run_queue->count, relaxed);

	thread_clear_runq_locked(thread);

	CHECK_RT_RUNQ_CONSISTENCY(rt_run_queue, THREAD_NULL);
}
4431 
/* Return the realtime run queue local to this pset. */
rt_queue_t
sched_rtlocal_runq(processor_set_t pset)
{
	return &pset->rt_runq;
}
4437 
/* Initialize the pset-local realtime run queue state. */
void
sched_rtlocal_init(processor_set_t pset)
{
	pset_rt_init(pset);
}
4443 
/*
 * A processor in this pset is shutting down.  If no available processors
 * remain in the pset, drain its realtime run queue and re-setrun each
 * thread so it can migrate elsewhere.
 */
void
sched_rtlocal_queue_shutdown(processor_t processor)
{
	processor_set_t pset = processor->processor_set;
	thread_t        thread;
	queue_head_t    tqueue;

	pset_lock(pset);

	/* We only need to migrate threads if this is the last active or last recommended processor in the pset */
	if (bit_count(pset_available_cpumap(pset)) > 0) {
		pset_unlock(pset);
		return;
	}

	queue_init(&tqueue);

	/* Drain the RT run queue onto a local list while holding the pset lock. */
	while (rt_runq_count(pset) > 0) {
		thread = rt_runq_dequeue(&pset->rt_runq);
		enqueue_tail(&tqueue, &thread->runq_links);
	}
	sched_update_pset_load_average(pset, 0);
	pset_update_rt_stealable_state(pset);
	pset_unlock(pset);

	/* Re-dispatch each thread without the pset lock held. */
	qe_foreach_element_safe(thread, &tqueue, runq_links) {
		remqueue(&thread->runq_links);

		thread_lock(thread);

		thread_setrun(thread, SCHED_TAILQ);

		thread_unlock(thread);
	}
}
4479 
/* Assumes RT lock is not held, and acquires splsched/rt_lock itself */
/*
 * Scan every pset's realtime run queue and record, in scan_context, the
 * oldest last_made_runnable_time among all queued RT threads.
 */
void
sched_rtlocal_runq_scan(sched_update_scan_context_t scan_context)
{
	thread_t        thread;

	/* Walk all psets on all pset nodes, starting from pset_node0. */
	pset_node_t node = &pset_node0;
	processor_set_t pset = node->psets;

	spl_t s = splsched();
	do {
		while (pset != NULL) {
			pset_lock(pset);

			/* Visit each non-empty RT priority level in this pset. */
			bitmap_t *map = pset->rt_runq.bitmap;
			for (int i = bitmap_first(map, NRTQS); i >= 0; i = bitmap_next(map, i)) {
				rt_queue_pri_t *rt_runq = &pset->rt_runq.rt_queue_pri[i];

				qe_foreach_element_safe(thread, &rt_runq->pri_queue, runq_links) {
					/* Track the longest-waiting runnable RT thread seen so far. */
					if (thread->last_made_runnable_time < scan_context->earliest_rt_make_runnable_time) {
						scan_context->earliest_rt_make_runnable_time = thread->last_made_runnable_time;
					}
				}
			}

			pset_unlock(pset);

			pset = pset->pset_list;
		}
	} while (((node = node->node_list) != NULL) && ((pset = node->psets) != NULL));
	splx(s);
}
4512 
4513 int64_t
sched_rtlocal_runq_count_sum(void)4514 sched_rtlocal_runq_count_sum(void)
4515 {
4516 	pset_node_t node = &pset_node0;
4517 	processor_set_t pset = node->psets;
4518 	int64_t count = 0;
4519 
4520 	do {
4521 		while (pset != NULL) {
4522 			count += pset->rt_runq.runq_stats.count_sum;
4523 
4524 			pset = pset->pset_list;
4525 		}
4526 	} while (((node = node->node_list) != NULL) && ((pset = node->psets) != NULL));
4527 
4528 	return count;
4529 }
4530 
4531 /*
4532  * Called with stealing_pset locked and
4533  * returns with stealing_pset locked
4534  * but the lock will have been dropped
4535  * if a thread is returned.
4536  */
4537 thread_t
/*
 * sched_rtlocal_steal_thread:
 *
 * Try to steal a realtime thread from a sibling pset whose earliest
 * stealable deadline beats `earliest_deadline` by more than the epsilon.
 * Returns the stolen thread, or THREAD_NULL if stealing is disabled or no
 * better candidate exists.
 */
sched_rtlocal_steal_thread(processor_set_t stealing_pset,uint64_t earliest_deadline)4538 sched_rtlocal_steal_thread(processor_set_t stealing_pset, uint64_t earliest_deadline)
4539 {
4540 	if (!sched_allow_rt_steal) {
4541 		return THREAD_NULL;
4542 	}
	/* Candidate victims: every pset on this node except our own. */
4543 	pset_map_t pset_map = stealing_pset->node->pset_map;
4544 
4545 	bit_clear(pset_map, stealing_pset->pset_id);
4546 
4547 	processor_set_t pset = stealing_pset;
4548 
4549 	processor_set_t target_pset;
4550 	uint64_t target_deadline;
4551 
4552 retry:
4553 	target_pset = NULL;
	/* Only steal when the victim's deadline beats ours by more than the epsilon. */
4554 	target_deadline = earliest_deadline - rt_deadline_epsilon;
4555 
4556 	for (int pset_id = lsb_first(pset_map); pset_id >= 0; pset_id = lsb_next(pset_map, pset_id)) {
4557 		processor_set_t nset = pset_array[pset_id];
4558 
4559 		/*
4560 		 * During startup, while pset_array[] and node->pset_map are still being initialized,
4561 		 * the update to pset_map may become visible to this cpu before the update to pset_array[].
4562 		 * It would be good to avoid inserting a memory barrier here that is only needed during startup,
4563 		 * so just check nset is not NULL instead.
4564 		 */
4565 		if (nset && (nset->stealable_rt_threads_earliest_deadline < target_deadline)) {
4566 			target_deadline = nset->stealable_rt_threads_earliest_deadline;
4567 			target_pset = nset;
4568 		}
4569 	}
4570 
4571 	if (target_pset != NULL) {
		/* Switch the held lock to the victim pset; recheck, since its state may have changed meanwhile. */
4572 		pset = change_locked_pset(pset, target_pset);
4573 		if (pset->stealable_rt_threads_earliest_deadline <= target_deadline) {
4574 			thread_t new_thread = rt_runq_dequeue(&pset->rt_runq);
4575 			pset_update_rt_stealable_state(pset);
4576 			KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_STEAL) | DBG_FUNC_NONE, (uintptr_t)thread_tid(new_thread), pset->pset_id, pset->cpu_set_low, 0);
4577 
4578 			pset = change_locked_pset(pset, stealing_pset);
4579 			return new_thread;
4580 		}
		/* Victim no longer qualifies: re-read our own earliest deadline and retry. */
4581 		pset = change_locked_pset(pset, stealing_pset);
4582 		earliest_deadline = rt_runq_earliest_deadline(pset);
4583 		goto retry;
4584 	}
4585 
4586 	pset = change_locked_pset(pset, stealing_pset);
4587 	return THREAD_NULL;
4588 }
4589 
4590 /*
4591  * pset is locked
4592  */
4593 thread_t
/*
 * sched_rt_choose_thread:
 *
 * Select the next realtime thread for the current processor.  When stealing
 * is enabled, prefer a stolen thread with an earlier deadline over the head
 * of the local RT run queue, acknowledging any pending spill signal for this
 * CPU along the way.  Returns THREAD_NULL when no RT work is available.
 */
sched_rt_choose_thread(processor_set_t pset)4594 sched_rt_choose_thread(processor_set_t pset)
4595 {
4596 	processor_t processor = current_processor();
4597 
4598 	if (SCHED(steal_thread_enabled)(pset)) {
4599 		do {
			/* Acknowledge (clear) a pending spill signal aimed at this CPU, if any. */
4600 			bool spill_pending = bit_clear_if_set(pset->rt_pending_spill_cpu_mask, processor->cpu_id);
4601 			if (spill_pending) {
4602 				KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_SIGNAL_SPILL) | DBG_FUNC_END, processor->cpu_id, pset->rt_pending_spill_cpu_mask, 0, 2);
4603 			}
4604 			thread_t new_thread = SCHED(rt_steal_thread)(pset, rt_runq_earliest_deadline(pset));
4605 			if (new_thread != THREAD_NULL) {
				/* The steal may have raced with a fresh spill signal; clear it too. */
4606 				if (bit_clear_if_set(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) {
4607 					KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_SIGNAL_SPILL) | DBG_FUNC_END, processor->cpu_id, pset->rt_pending_spill_cpu_mask, 0, 3);
4608 				}
4609 				return new_thread;
4610 			}
		/* Retry while another CPU is still signalling us to take spilled work. */
4611 		} while (bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id));
4612 	}
4613 
4614 	if (bit_clear_if_set(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) {
4615 		KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_SIGNAL_SPILL) | DBG_FUNC_END, processor->cpu_id, pset->rt_pending_spill_cpu_mask, 0, 4);
4616 	}
4617 
	/* Fall back to the head of this pset's own RT run queue. */
4618 	if (rt_runq_count(pset) > 0) {
4619 		thread_t new_thread = rt_runq_dequeue(SCHED(rt_runq)(pset));
4620 		assert(new_thread != THREAD_NULL);
4621 		pset_update_rt_stealable_state(pset);
4622 		return new_thread;
4623 	}
4624 
4625 	return THREAD_NULL;
4626 }
4627 
4628 /*
4629  *	realtime_queue_insert:
4630  *
4631  *	Enqueue a thread for realtime execution.
4632  */
4633 static bool
realtime_queue_insert(processor_t processor,processor_set_t pset,thread_t thread)4634 realtime_queue_insert(processor_t processor, processor_set_t pset, thread_t thread)
4635 {
4636 	pset_assert_locked(pset);
4637 
4638 	bool preempt = rt_runq_enqueue(SCHED(rt_runq)(pset), thread, processor);
4639 	pset_update_rt_stealable_state(pset);
4640 
4641 	return preempt;
4642 }
4643 
4644 /*
4645  *	realtime_setrun:
4646  *
4647  *	Dispatch a thread for realtime execution.
4648  *
4649  *	Thread must be locked.  Associated pset must
4650  *	be locked, and is returned unlocked.
4651  */
4652 static void
realtime_setrun(processor_t chosen_processor,thread_t thread)4653 realtime_setrun(
4654 	processor_t                     chosen_processor,
4655 	thread_t                        thread)
4656 {
4657 	processor_set_t pset = chosen_processor->processor_set;
4658 	pset_assert_locked(pset);
4659 	bool pset_is_locked = true;
4660 
4661 	int n_backup = 0;
4662 
	/* Tight-constraint threads get backup processors signalled as well. */
4663 	if (thread->realtime.constraint <= rt_constraint_threshold) {
4664 		n_backup = sched_rt_n_backup_processors;
4665 	}
4666 	assert((n_backup >= 0) && (n_backup <= SCHED_MAX_BACKUP_PROCESSORS));
4667 
	/*
	 * CPUs that already have an urgent AST pending beyond the number of
	 * queued RT threads count as backups already in flight; subtract them.
	 */
4668 	int existing_backups = bit_count(pset->pending_AST_URGENT_cpu_mask) - rt_runq_count(pset);
4669 	if (existing_backups > 0) {
4670 		n_backup = n_backup - existing_backups;
4671 		if (n_backup < 0) {
4672 			n_backup = 0;
4673 		}
4674 	}
4675 
4676 	sched_ipi_type_t ipi_type[SCHED_MAX_BACKUP_PROCESSORS + 1] = {};
4677 	processor_t ipi_processor[SCHED_MAX_BACKUP_PROCESSORS + 1] = {};
4678 
4679 	thread->chosen_processor = chosen_processor;
4680 
4681 	/* <rdar://problem/15102234> */
4682 	assert(thread->bound_processor == PROCESSOR_NULL);
4683 
4684 	realtime_queue_insert(chosen_processor, pset, thread);
4685 
4686 	processor_t processor = chosen_processor;
4687 
	/*
	 * Slot 0 is the chosen processor itself (notified directly below);
	 * slots 1..n_backup are additional processors signalled so one of them
	 * can pick the thread up if the chosen processor is delayed.
	 */
4688 	int count = 0;
4689 	for (int i = 0; i <= n_backup; i++) {
4690 		if (i == 0) {
4691 			ipi_type[i] = SCHED_IPI_NONE;
4692 			ipi_processor[i] = processor;
4693 			count++;
4694 
			/* Preempt urgently if strictly higher priority, or same priority with an earlier deadline (by epsilon). */
4695 			ast_t preempt = AST_NONE;
4696 			if (thread->sched_pri > processor->current_pri) {
4697 				preempt = (AST_PREEMPT | AST_URGENT);
4698 			} else if (thread->sched_pri == processor->current_pri) {
4699 				if (deadline_add(thread->realtime.deadline, rt_deadline_epsilon) < processor->deadline) {
4700 					preempt = (AST_PREEMPT | AST_URGENT);
4701 				}
4702 			}
4703 
4704 			if (preempt != AST_NONE) {
4705 				if (processor->state == PROCESSOR_IDLE) {
4706 					if (processor == current_processor()) {
						/* Self-notify: transition out of idle and latch the AST bits locally. */
4707 						pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING);
4708 						ast_on(preempt);
4709 
4710 						if ((preempt & AST_URGENT) == AST_URGENT) {
4711 							if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
4712 								KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
4713 								    processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 1);
4714 							}
4715 						}
4716 
4717 						if ((preempt & AST_PREEMPT) == AST_PREEMPT) {
4718 							bit_set(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
4719 						}
4720 					} else {
4721 						ipi_type[i] = sched_ipi_action(processor, thread, SCHED_IPI_EVENT_RT_PREEMPT);
4722 					}
4723 				} else if (processor->state == PROCESSOR_DISPATCHING) {
					/* Already waking up; just make sure the urgent-AST bit is pending. */
4724 					if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
4725 						KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
4726 						    processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 2);
4727 					}
4728 				} else {
4729 					if (processor == current_processor()) {
4730 						ast_on(preempt);
4731 
4732 						if ((preempt & AST_URGENT) == AST_URGENT) {
4733 							if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
4734 								KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
4735 								    processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 3);
4736 							}
4737 						}
4738 
4739 						if ((preempt & AST_PREEMPT) == AST_PREEMPT) {
4740 							bit_set(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
4741 						}
4742 					} else {
4743 						ipi_type[i] = sched_ipi_action(processor, thread, SCHED_IPI_EVENT_RT_PREEMPT);
4744 					}
4745 				}
4746 			} else {
4747 				/* Selected processor was too busy, just keep thread enqueued and let other processors drain it naturally. */
4748 			}
4749 		} else {
			/* Backup slots: choose_next_rt_processor_for_IPI() may drop the pset lock. */
4750 			if (!pset_is_locked) {
4751 				pset_lock(pset);
4752 			}
4753 			ipi_type[i] = SCHED_IPI_NONE;
4754 			ipi_processor[i] = PROCESSOR_NULL;
4755 			pset_is_locked = !choose_next_rt_processor_for_IPI(pset, chosen_processor, false, &ipi_processor[i], &ipi_type[i]);
4756 			if (ipi_processor[i] == PROCESSOR_NULL) {
4757 				break;
4758 			}
4759 			count++;
4760 
			/* NOTE(review): `backup` below is not declared anywhere in this function as shown — presumably a macro or global defined elsewhere in the file; verify before modifying this tracepoint. */
4761 			KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_NEXT_PROCESSOR) | DBG_FUNC_NONE,
4762 			    ipi_processor[i]->cpu_id, ipi_processor[i]->state, backup, 1);
4763 #if defined(__x86_64__)
4764 #define p_is_good(p) (((p)->processor_primary == (p)) && ((sched_avoid_cpu0 != 1) || ((p)->cpu_id != 0)))
4765 			if (n_backup == SCHED_DEFAULT_BACKUP_PROCESSORS_SMT) {
4766 				processor_t p0 = ipi_processor[0];
4767 				processor_t p1 = ipi_processor[1];
4768 				assert(p0 && p1);
4769 				if (p_is_good(p0) && p_is_good(p1)) {
4770 					/*
4771 					 * Both the chosen processor and the first backup are non-cpu0 primaries,
4772 					 * so there is no need for a 2nd backup processor.
4773 					 */
4774 					break;
4775 				}
4776 			}
4777 #endif
4778 		}
4779 	}
4780 
4781 	if (pset_is_locked) {
4782 		pset_unlock(pset);
4783 	}
4784 
	/* Deliver all recorded IPIs after the pset lock is dropped. */
4785 	assert((count > 0) && (count <= (n_backup + 1)));
4786 	for (int i = 0; i < count; i++) {
4787 		assert(ipi_processor[i] != PROCESSOR_NULL);
4788 		sched_ipi_perform(ipi_processor[i], ipi_type[i]);
4789 	}
4790 }
4791 
4792 
4793 sched_ipi_type_t
/*
 * sched_ipi_deferred_policy:
 *
 * Decide whether a deferred IPI may be used for `dst`: returns
 * SCHED_IPI_IMMEDIATE when the thread's group demands immediate IPIs,
 * SCHED_IPI_DEFERRED when no deferred AST is already pending for that CPU,
 * and SCHED_IPI_NONE otherwise.  Panics on platforms built without
 * deferred-AST support.
 */
sched_ipi_deferred_policy(processor_set_t pset,processor_t dst,thread_t thread,__unused sched_ipi_event_t event)4794 sched_ipi_deferred_policy(processor_set_t pset, processor_t dst,
4795     thread_t thread, __unused sched_ipi_event_t event)
4796 {
4797 #if defined(CONFIG_SCHED_DEFERRED_AST)
4798 #if CONFIG_THREAD_GROUPS
4799 	if (thread) {
4800 		struct thread_group *tg = thread_group_get(thread);
4801 		if (thread_group_uses_immediate_ipi(tg)) {
4802 			return SCHED_IPI_IMMEDIATE;
4803 		}
4804 	}
4805 #endif /* CONFIG_THREAD_GROUPS */
	/* Only defer if no deferred AST is already outstanding for this CPU. */
4806 	if (!bit_test(pset->pending_deferred_AST_cpu_mask, dst->cpu_id)) {
4807 		return SCHED_IPI_DEFERRED;
4808 	}
4809 #else /* CONFIG_SCHED_DEFERRED_AST */
4810 	(void) thread;
4811 	panic("Request for deferred IPI on an unsupported platform; pset: %p CPU: %d", pset, dst->cpu_id);
4812 #endif /* CONFIG_SCHED_DEFERRED_AST */
4813 	return SCHED_IPI_NONE;
4814 }
4815 
4816 sched_ipi_type_t
/*
 * sched_ipi_action:
 *
 * Choose (via the scheduler's ipi_policy) and record the IPI needed to get
 * `dst` to notice newly runnable work, updating the pset's pending-AST
 * bitmasks accordingly.  Returns the IPI type the caller must later deliver
 * with sched_ipi_perform().  No IPI is needed for the current processor.
 */
sched_ipi_action(processor_t dst,thread_t thread,sched_ipi_event_t event)4817 sched_ipi_action(processor_t dst, thread_t thread, sched_ipi_event_t event)
4818 {
4819 	sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
4820 	assert(dst != NULL);
4821 
4822 	processor_set_t pset = dst->processor_set;
	/* The current processor can just take an AST; no IPI required. */
4823 	if (current_processor() == dst) {
4824 		return SCHED_IPI_NONE;
4825 	}
4826 
	/* An idle target is moved to DISPATCHING so no one else also wakes it. */
4827 	bool dst_idle = (dst->state == PROCESSOR_IDLE);
4828 	if (dst_idle) {
4829 		pset_update_processor_state(pset, dst, PROCESSOR_DISPATCHING);
4830 	}
4831 
4832 	ipi_type = SCHED(ipi_policy)(dst, thread, dst_idle, event);
4833 	switch (ipi_type) {
4834 	case SCHED_IPI_NONE:
4835 		return SCHED_IPI_NONE;
4836 #if defined(CONFIG_SCHED_DEFERRED_AST)
4837 	case SCHED_IPI_DEFERRED:
4838 		bit_set(pset->pending_deferred_AST_cpu_mask, dst->cpu_id);
4839 		break;
4840 #endif /* CONFIG_SCHED_DEFERRED_AST */
4841 	default:
		/* Immediate/idle IPIs latch both the urgent and preempt pending bits. */
4842 		if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, dst->cpu_id)) {
4843 			KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
4844 			    dst->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 4);
4845 		}
4846 		bit_set(pset->pending_AST_PREEMPT_cpu_mask, dst->cpu_id);
4847 		break;
4848 	}
4849 	return ipi_type;
4850 }
4851 
4852 sched_ipi_type_t
/*
 * sched_ipi_policy:
 *
 * Default policy mapping a scheduler event (plus the target's idle state)
 * to an IPI type: RT/spill/rebalance/bound events always use immediate
 * delivery; non-RT preemption of an idle core may use a deferred IPI when
 * the platform supports it.  Never returns SCHED_IPI_NONE for a recognized
 * event (asserted below); panics on an unknown event.
 */
sched_ipi_policy(processor_t dst,thread_t thread,boolean_t dst_idle,sched_ipi_event_t event)4853 sched_ipi_policy(processor_t dst, thread_t thread, boolean_t dst_idle, sched_ipi_event_t event)
4854 {
4855 	sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
4856 	boolean_t deferred_ipi_supported = false;
4857 	processor_set_t pset = dst->processor_set;
4858 
4859 #if defined(CONFIG_SCHED_DEFERRED_AST)
4860 	deferred_ipi_supported = true;
4861 #endif /* CONFIG_SCHED_DEFERRED_AST */
4862 
4863 	switch (event) {
4864 	case SCHED_IPI_EVENT_SPILL:
4865 	case SCHED_IPI_EVENT_SMT_REBAL:
4866 	case SCHED_IPI_EVENT_REBALANCE:
4867 	case SCHED_IPI_EVENT_BOUND_THR:
4868 	case SCHED_IPI_EVENT_RT_PREEMPT:
4869 		/*
4870 		 * The RT preempt, spill, SMT rebalance, rebalance and the bound thread
4871 		 * scenarios use immediate IPIs always.
4872 		 */
4873 		ipi_type = dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
4874 		break;
4875 	case SCHED_IPI_EVENT_PREEMPT:
4876 		/* In the preemption case, use immediate IPIs for RT threads */
4877 		if (thread && (thread->sched_pri >= BASEPRI_RTQUEUES)) {
4878 			ipi_type = dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
4879 			break;
4880 		}
4881 
4882 		/*
4883 		 * For Non-RT threads preemption,
4884 		 * If the core is active, use immediate IPIs.
4885 		 * If the core is idle, use deferred IPIs if supported; otherwise immediate IPI.
4886 		 */
4887 		if (deferred_ipi_supported && dst_idle) {
4888 			return sched_ipi_deferred_policy(pset, dst, thread, event);
4889 		}
4890 		ipi_type = dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
4891 		break;
4892 	default:
4893 		panic("Unrecognized scheduler IPI event type %d", event);
4894 	}
4895 	assert(ipi_type != SCHED_IPI_NONE);
4896 	return ipi_type;
4897 }
4898 
4899 void
sched_ipi_perform(processor_t dst,sched_ipi_type_t ipi)4900 sched_ipi_perform(processor_t dst, sched_ipi_type_t ipi)
4901 {
4902 	switch (ipi) {
4903 	case SCHED_IPI_NONE:
4904 		break;
4905 	case SCHED_IPI_IDLE:
4906 		machine_signal_idle(dst);
4907 		break;
4908 	case SCHED_IPI_IMMEDIATE:
4909 		cause_ast_check(dst);
4910 		break;
4911 	case SCHED_IPI_DEFERRED:
4912 		machine_signal_idle_deferred(dst);
4913 		break;
4914 	default:
4915 		panic("Unrecognized scheduler IPI type: %d", ipi);
4916 	}
4917 }
4918 
4919 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
4920 
4921 boolean_t
priority_is_urgent(int priority)4922 priority_is_urgent(int priority)
4923 {
4924 	return bitmap_test(sched_preempt_pri, priority) ? TRUE : FALSE;
4925 }
4926 
4927 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
4928 
4929 /*
4930  *	processor_setrun:
4931  *
4932  *	Dispatch a thread for execution on a
4933  *	processor.
4934  *
4935  *	Thread must be locked.  Associated pset must
4936  *	be locked, and is returned unlocked.
4937  */
4938 static void
processor_setrun(processor_t processor,thread_t thread,integer_t options)4939 processor_setrun(
4940 	processor_t                     processor,
4941 	thread_t                        thread,
4942 	integer_t                       options)
4943 {
4944 	processor_set_t pset = processor->processor_set;
4945 	pset_assert_locked(pset);
4946 	ast_t preempt = AST_NONE;
4947 	enum { eExitIdle, eInterruptRunning, eDoNothing } ipi_action = eDoNothing;
4948 
4949 	sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
4950 
4951 	thread->chosen_processor = processor;
4952 
4953 	/*
4954 	 *	Set preemption mode.
4955 	 */
4956 #if defined(CONFIG_SCHED_DEFERRED_AST)
4957 	/* TODO: Do we need to care about urgency (see rdar://problem/20136239)? */
4958 #endif
	/* Urgent-priority or eager-preempt targets get AST_URGENT; depressed timeshare threads generally don't preempt. */
4959 	if (SCHED(priority_is_urgent)(thread->sched_pri) && thread->sched_pri > processor->current_pri) {
4960 		preempt = (AST_PREEMPT | AST_URGENT);
4961 	} else if (processor->current_is_eagerpreempt) {
4962 		preempt = (AST_PREEMPT | AST_URGENT);
4963 	} else if ((thread->sched_mode == TH_MODE_TIMESHARE) && (thread->sched_pri < thread->base_pri)) {
4964 		if (SCHED(priority_is_urgent)(thread->base_pri) && thread->sched_pri > processor->current_pri) {
4965 			preempt = (options & SCHED_PREEMPT)? AST_PREEMPT: AST_NONE;
4966 		} else {
4967 			preempt = AST_NONE;
4968 		}
4969 	} else {
4970 		preempt = (options & SCHED_PREEMPT)? AST_PREEMPT: AST_NONE;
4971 	}
4972 
4973 	if ((options & (SCHED_PREEMPT | SCHED_REBALANCE)) == (SCHED_PREEMPT | SCHED_REBALANCE)) {
4974 		/*
4975 		 * Having gone to the trouble of forcing this thread off a less preferred core,
4976 		 * we should force the preferable core to reschedule immediately to give this
4977 		 * thread a chance to run instead of just sitting on the run queue where
4978 		 * it may just be stolen back by the idle core we just forced it off.
4979 		 */
4980 		preempt |= AST_PREEMPT;
4981 	}
4982 
4983 	SCHED(processor_enqueue)(processor, thread, options);
4984 	sched_update_pset_load_average(pset, 0);
4985 
	/* Translate the preempt decision + target processor state into an IPI action. */
4986 	if (preempt != AST_NONE) {
4987 		if (processor->state == PROCESSOR_IDLE) {
4988 			ipi_action = eExitIdle;
4989 		} else if (processor->state == PROCESSOR_DISPATCHING) {
4990 			if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
4991 				KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
4992 				    processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 5);
4993 			}
4994 		} else if ((processor->state == PROCESSOR_RUNNING ||
4995 		    processor->state == PROCESSOR_SHUTDOWN) &&
4996 		    (thread->sched_pri >= processor->current_pri)) {
4997 			ipi_action = eInterruptRunning;
4998 		}
4999 	} else {
5000 		/*
5001 		 * New thread is not important enough to preempt what is running, but
5002 		 * special processor states may need special handling
5003 		 */
5004 		if (processor->state == PROCESSOR_SHUTDOWN &&
5005 		    thread->sched_pri >= processor->current_pri) {
5006 			ipi_action = eInterruptRunning;
5007 		} else if (processor->state == PROCESSOR_IDLE) {
5008 			ipi_action = eExitIdle;
5009 		} else if (processor->state == PROCESSOR_DISPATCHING) {
5010 			if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
5011 				KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
5012 				    processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 6);
5013 			}
5014 		}
5015 	}
5016 
5017 	if (ipi_action != eDoNothing) {
5018 		if (processor == current_processor()) {
			/* Self-dispatch: recompute the AST locally instead of sending an IPI, and sync the pending-AST bitmasks to the result. */
5019 			if (ipi_action == eExitIdle) {
5020 				pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING);
5021 			}
5022 			if ((preempt = csw_check_locked(processor->active_thread, processor, pset, AST_NONE)) != AST_NONE) {
5023 				ast_on(preempt);
5024 			}
5025 
5026 			if ((preempt & AST_URGENT) == AST_URGENT) {
5027 				if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
5028 					KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
5029 					    processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 7);
5030 				}
5031 			} else {
5032 				if (bit_clear_if_set(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
5033 					KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_END, processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, 0, 7);
5034 				}
5035 			}
5036 
5037 			if ((preempt & AST_PREEMPT) == AST_PREEMPT) {
5038 				bit_set(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
5039 			} else {
5040 				bit_clear(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
5041 			}
5042 		} else {
5043 			sched_ipi_event_t event = (options & SCHED_REBALANCE) ? SCHED_IPI_EVENT_REBALANCE : SCHED_IPI_EVENT_PREEMPT;
5044 			ipi_type = sched_ipi_action(processor, thread, event);
5045 		}
5046 	}
5047 
	/* Drop the pset lock before delivering the IPI. */
5048 	pset_unlock(pset);
5049 	sched_ipi_perform(processor, ipi_type);
5050 
5051 	if (ipi_action != eDoNothing && processor == current_processor()) {
5052 		ast_t new_preempt = update_pending_nonurgent_preemption(processor, preempt);
5053 		ast_on(new_preempt);
5054 	}
5055 }
5056 
5057 /*
5058  *	choose_next_pset:
5059  *
5060  *	Return the next sibling pset containing
5061  *	available processors.
5062  *
5063  *	Returns the original pset if none other is
5064  *	suitable.
5065  */
5066 static processor_set_t
choose_next_pset(processor_set_t pset)5067 choose_next_pset(
5068 	processor_set_t         pset)
5069 {
5070 	processor_set_t         nset = pset;
5071 
5072 	do {
5073 		nset = next_pset(nset);
5074 
5075 		/*
5076 		 * Sometimes during startup the pset_map can contain a bit
5077 		 * for a pset that isn't fully published in pset_array because
5078 		 * the pset_map read isn't an acquire load.
5079 		 *
5080 		 * In order to avoid needing an acquire barrier here, just bail
5081 		 * out.
5082 		 */
5083 		if (nset == PROCESSOR_SET_NULL) {
5084 			return pset;
5085 		}
5086 	} while (nset->online_processor_count < 1 && nset != pset);
5087 
5088 	return nset;
5089 }
5090 
5091 /*
5092  *	choose_processor:
5093  *
5094  *	Choose a processor for the thread, beginning at
5095  *	the pset.  Accepts an optional processor hint in
5096  *	the pset.
5097  *
5098  *	Returns a processor, possibly from a different pset.
5099  *
5100  *	The thread must be locked.  The pset must be locked,
5101  *	and the resulting pset is locked on return.
5102  */
5103 processor_t
choose_processor(processor_set_t starting_pset,processor_t processor,thread_t thread)5104 choose_processor(
5105 	processor_set_t         starting_pset,
5106 	processor_t             processor,
5107 	thread_t                thread)
5108 {
5109 	processor_set_t pset = starting_pset;
5110 	processor_set_t nset;
5111 
5112 	assert(thread->sched_pri <= MAXPRI);
5113 
5114 	/*
5115 	 * Prefer the hinted processor, when appropriate.
5116 	 */
5117 
5118 	/* Fold last processor hint from secondary processor to its primary */
5119 	if (processor != PROCESSOR_NULL) {
5120 		processor = processor->processor_primary;
5121 	}
5122 
5123 	/*
5124 	 * Only consult platform layer if pset is active, which
5125 	 * it may not be in some cases when a multi-set system
5126 	 * is going to sleep.
5127 	 */
5128 	if (pset->online_processor_count) {
5129 		if ((processor == PROCESSOR_NULL) || (processor->processor_set == pset && processor->state == PROCESSOR_IDLE)) {
5130 			processor_t mc_processor = machine_choose_processor(pset, processor);
5131 			if (mc_processor != PROCESSOR_NULL) {
5132 				processor = mc_processor->processor_primary;
5133 			}
5134 		}
5135 	}
5136 
5137 	/*
5138 	 * At this point, we may have a processor hint, and we may have
5139 	 * an initial starting pset. If the hint is not in the pset, or
5140 	 * if the hint is for a processor in an invalid state, discard
5141 	 * the hint.
5142 	 */
5143 	if (processor != PROCESSOR_NULL) {
5144 		if (processor->processor_set != pset) {
5145 			processor = PROCESSOR_NULL;
5146 		} else if (!processor->is_recommended) {
5147 			processor = PROCESSOR_NULL;
5148 		} else {
5149 			switch (processor->state) {
5150 			case PROCESSOR_START:
5151 			case PROCESSOR_SHUTDOWN:
5152 			case PROCESSOR_PENDING_OFFLINE:
5153 			case PROCESSOR_OFF_LINE:
5154 				/*
5155 				 * Hint is for a processor that cannot support running new threads.
5156 				 */
5157 				processor = PROCESSOR_NULL;
5158 				break;
5159 			case PROCESSOR_IDLE:
5160 				/*
5161 				 * Hint is for an idle processor. Assume it is no worse than any other
5162 				 * idle processor. The platform layer had an opportunity to provide
5163 				 * the "least cost idle" processor above.
5164 				 */
5165 				if ((thread->sched_pri < BASEPRI_RTQUEUES) || processor_is_fast_track_candidate_for_realtime_thread(pset, processor)) {
5166 					uint64_t idle_primary_map = (pset->cpu_state_map[PROCESSOR_IDLE] & pset->primary_map & pset->recommended_bitmask);
5167 					uint64_t non_avoided_idle_primary_map = idle_primary_map & ~pset->perfcontrol_cpu_migration_bitmask;
5168 					/*
5169 					 * If the rotation bitmask to force a migration is set for this core and there's an idle core that
5170 					 * that needn't be avoided, don't continue running on the same core.
5171 					 */
5172 					if (!(bit_test(processor->processor_set->perfcontrol_cpu_migration_bitmask, processor->cpu_id) && non_avoided_idle_primary_map != 0)) {
5173 						return processor;
5174 					}
5175 				}
5176 				processor = PROCESSOR_NULL;
5177 				break;
5178 			case PROCESSOR_RUNNING:
5179 			case PROCESSOR_DISPATCHING:
5180 				/*
5181 				 * Hint is for an active CPU. This fast-path allows
5182 				 * realtime threads to preempt non-realtime threads
5183 				 * to regain their previous executing processor.
5184 				 */
5185 				if (thread->sched_pri >= BASEPRI_RTQUEUES) {
5186 					if (processor_is_fast_track_candidate_for_realtime_thread(pset, processor)) {
5187 						return processor;
5188 					}
5189 					processor = PROCESSOR_NULL;
5190 				}
5191 
5192 				/* Otherwise, use hint as part of search below */
5193 				break;
5194 			default:
5195 				processor = PROCESSOR_NULL;
5196 				break;
5197 			}
5198 		}
5199 	}
5200 
5201 	/*
5202 	 * Iterate through the processor sets to locate
5203 	 * an appropriate processor. Seed results with
5204 	 * a last-processor hint, if available, so that
5205 	 * a search must find something strictly better
5206 	 * to replace it.
5207 	 *
5208 	 * A primary/secondary pair of SMT processors are
5209 	 * "unpaired" if the primary is busy but its
5210 	 * corresponding secondary is idle (so the physical
5211 	 * core has full use of its resources).
5212 	 */
5213 
5214 	integer_t lowest_priority = MAXPRI + 1;
5215 	integer_t lowest_secondary_priority = MAXPRI + 1;
5216 	integer_t lowest_unpaired_primary_priority = MAXPRI + 1;
5217 	integer_t lowest_idle_secondary_priority = MAXPRI + 1;
5218 	integer_t lowest_count = INT_MAX;
5219 	processor_t lp_processor = PROCESSOR_NULL;
5220 	processor_t lp_unpaired_primary_processor = PROCESSOR_NULL;
5221 	processor_t lp_idle_secondary_processor = PROCESSOR_NULL;
5222 	processor_t lp_paired_secondary_processor = PROCESSOR_NULL;
5223 	processor_t lc_processor = PROCESSOR_NULL;
5224 
5225 	if (processor != PROCESSOR_NULL) {
5226 		/* All other states should be enumerated above. */
5227 		assert(processor->state == PROCESSOR_RUNNING || processor->state == PROCESSOR_DISPATCHING);
5228 		assert(thread->sched_pri < BASEPRI_RTQUEUES);
5229 
5230 		lowest_priority = processor->current_pri;
5231 		lp_processor = processor;
5232 
5233 		lowest_count = SCHED(processor_runq_count)(processor);
5234 		lc_processor = processor;
5235 	}
5236 
5237 	if (thread->sched_pri >= BASEPRI_RTQUEUES) {
5238 		pset_node_t node = pset->node;
5239 		bool include_ast_urgent_pending_cpus = false;
5240 		cpumap_t ast_urgent_pending;
5241 try_again:
5242 		ast_urgent_pending = 0;
5243 		int consider_secondaries = (!pset->is_SMT) || (bit_count(node->pset_map) == 1) || (node->pset_non_rt_primary_map == 0) || include_ast_urgent_pending_cpus;
5244 		for (; consider_secondaries < 2; consider_secondaries++) {
5245 			pset = change_locked_pset(pset, starting_pset);
5246 			do {
5247 				cpumap_t available_map = pset_available_cpumap(pset);
5248 				if (available_map == 0) {
5249 					goto no_available_cpus;
5250 				}
5251 
5252 				processor = choose_processor_for_realtime_thread(pset, PROCESSOR_NULL, consider_secondaries, false);
5253 				if (processor) {
5254 					return processor;
5255 				}
5256 
5257 				if (consider_secondaries) {
5258 					processor = choose_furthest_deadline_processor_for_realtime_thread(pset, thread->sched_pri, thread->realtime.deadline, PROCESSOR_NULL, false, include_ast_urgent_pending_cpus);
5259 					if (processor) {
5260 						/*
5261 						 * Instead of looping through all the psets to find the global
5262 						 * furthest deadline processor, preempt the first candidate found.
5263 						 * The preempted thread will then find any other available far deadline
5264 						 * processors to preempt.
5265 						 */
5266 						return processor;
5267 					}
5268 
5269 					ast_urgent_pending |= pset->pending_AST_URGENT_cpu_mask;
5270 
5271 					if (rt_runq_count(pset) < lowest_count) {
5272 						int cpuid = bit_first(available_map);
5273 						assert(cpuid >= 0);
5274 						lc_processor = processor_array[cpuid];
5275 						lowest_count = rt_runq_count(pset);
5276 					}
5277 				}
5278 
5279 no_available_cpus:
5280 				nset = next_pset(pset);
5281 
5282 				if (nset != starting_pset) {
5283 					pset = change_locked_pset(pset, nset);
5284 				}
5285 			} while (nset != starting_pset);
5286 		}
5287 
5288 		/* Short cut for single pset nodes */
5289 		if (bit_count(node->pset_map) == 1) {
5290 			if (lc_processor) {
5291 				pset_assert_locked(lc_processor->processor_set);
5292 				return lc_processor;
5293 			}
5294 		} else {
5295 			if (ast_urgent_pending && !include_ast_urgent_pending_cpus) {
5296 				/* See the comment in choose_furthest_deadline_processor_for_realtime_thread() */
5297 				include_ast_urgent_pending_cpus = true;
5298 				goto try_again;
5299 			}
5300 		}
5301 
5302 		processor = lc_processor;
5303 
5304 		if (processor) {
5305 			pset = change_locked_pset(pset, processor->processor_set);
5306 			/* Check that chosen processor is still usable */
5307 			cpumap_t available_map = pset_available_cpumap(pset);
5308 			if (bit_test(available_map, processor->cpu_id)) {
5309 				return processor;
5310 			}
5311 
5312 			/* processor is no longer usable */
5313 			processor = PROCESSOR_NULL;
5314 		}
5315 
5316 		pset_assert_locked(pset);
5317 		pset_unlock(pset);
5318 		return PROCESSOR_NULL;
5319 	}
5320 
5321 	/* No realtime threads from this point on */
5322 	assert(thread->sched_pri < BASEPRI_RTQUEUES);
5323 
5324 	do {
5325 		/*
5326 		 * Choose an idle processor, in pset traversal order
5327 		 */
5328 		uint64_t idle_primary_map = (pset->cpu_state_map[PROCESSOR_IDLE] & pset->primary_map & pset->recommended_bitmask);
5329 		uint64_t preferred_idle_primary_map = idle_primary_map & pset->perfcontrol_cpu_preferred_bitmask;
5330 
5331 		/* there shouldn't be a pending AST if the processor is idle */
5332 		assert((idle_primary_map & pset->pending_AST_URGENT_cpu_mask) == 0);
5333 
5334 		/*
5335 		 * Look at the preferred cores first.
5336 		 */
5337 		int cpuid = lsb_next(preferred_idle_primary_map, pset->cpu_preferred_last_chosen);
5338 		if (cpuid < 0) {
5339 			cpuid = lsb_first(preferred_idle_primary_map);
5340 		}
5341 		if (cpuid >= 0) {
5342 			processor = processor_array[cpuid];
5343 			pset->cpu_preferred_last_chosen = cpuid;
5344 			return processor;
5345 		}
5346 
5347 		/*
5348 		 * Look at the cores that don't need to be avoided next.
5349 		 */
5350 		if (pset->perfcontrol_cpu_migration_bitmask != 0) {
5351 			uint64_t non_avoided_idle_primary_map = idle_primary_map & ~pset->perfcontrol_cpu_migration_bitmask;
5352 			cpuid = lsb_next(non_avoided_idle_primary_map, pset->cpu_preferred_last_chosen);
5353 			if (cpuid < 0) {
5354 				cpuid = lsb_first(non_avoided_idle_primary_map);
5355 			}
5356 			if (cpuid >= 0) {
5357 				processor = processor_array[cpuid];
5358 				pset->cpu_preferred_last_chosen = cpuid;
5359 				return processor;
5360 			}
5361 		}
5362 
5363 		/*
5364 		 * Fall back to any remaining idle cores if none of the preferred ones and non-avoided ones are available.
5365 		 */
5366 		cpuid = lsb_first(idle_primary_map);
5367 		if (cpuid >= 0) {
5368 			processor = processor_array[cpuid];
5369 			return processor;
5370 		}
5371 
5372 		/*
5373 		 * Otherwise, enumerate active and idle processors to find primary candidates
5374 		 * with lower priority/etc.
5375 		 */
5376 
5377 		uint64_t active_map = ((pset->cpu_state_map[PROCESSOR_RUNNING] | pset->cpu_state_map[PROCESSOR_DISPATCHING]) &
5378 		    pset->recommended_bitmask &
5379 		    ~pset->pending_AST_URGENT_cpu_mask);
5380 
5381 		if (SCHED(priority_is_urgent)(thread->sched_pri) == FALSE) {
5382 			active_map &= ~pset->pending_AST_PREEMPT_cpu_mask;
5383 		}
5384 
5385 		active_map = bit_ror64(active_map, (pset->last_chosen + 1));
5386 		for (int rotid = lsb_first(active_map); rotid >= 0; rotid = lsb_next(active_map, rotid)) {
5387 			cpuid = ((rotid + pset->last_chosen + 1) & 63);
5388 			processor = processor_array[cpuid];
5389 
5390 			integer_t cpri = processor->current_pri;
5391 			processor_t primary = processor->processor_primary;
5392 			if (primary != processor) {
5393 				/* If primary is running a NO_SMT thread, don't choose its secondary */
5394 				if (!((primary->state == PROCESSOR_RUNNING) && processor_active_thread_no_smt(primary))) {
5395 					if (cpri < lowest_secondary_priority) {
5396 						lowest_secondary_priority = cpri;
5397 						lp_paired_secondary_processor = processor;
5398 					}
5399 				}
5400 			} else {
5401 				if (cpri < lowest_priority) {
5402 					lowest_priority = cpri;
5403 					lp_processor = processor;
5404 				}
5405 			}
5406 
5407 			integer_t ccount = SCHED(processor_runq_count)(processor);
5408 			if (ccount < lowest_count) {
5409 				lowest_count = ccount;
5410 				lc_processor = processor;
5411 			}
5412 		}
5413 
5414 		/*
5415 		 * For SMT configs, these idle secondary processors must have active primary. Otherwise
5416 		 * the idle primary would have short-circuited the loop above
5417 		 */
5418 		uint64_t idle_secondary_map = (pset->cpu_state_map[PROCESSOR_IDLE] &
5419 		    ~pset->primary_map &
5420 		    pset->recommended_bitmask);
5421 
5422 		/* there shouldn't be a pending AST if the processor is idle */
5423 		assert((idle_secondary_map & pset->pending_AST_URGENT_cpu_mask) == 0);
5424 		assert((idle_secondary_map & pset->pending_AST_PREEMPT_cpu_mask) == 0);
5425 
5426 		for (cpuid = lsb_first(idle_secondary_map); cpuid >= 0; cpuid = lsb_next(idle_secondary_map, cpuid)) {
5427 			processor = processor_array[cpuid];
5428 
5429 			processor_t cprimary = processor->processor_primary;
5430 
5431 			integer_t primary_pri = cprimary->current_pri;
5432 
5433 			/*
5434 			 * TODO: This should also make the same decisions
5435 			 * as secondary_can_run_realtime_thread
5436 			 *
5437 			 * TODO: Keep track of the pending preemption priority
5438 			 * of the primary to make this more accurate.
5439 			 */
5440 
5441 			/* If the primary is running a no-smt thread, then don't choose its secondary */
5442 			if (cprimary->state == PROCESSOR_RUNNING &&
5443 			    processor_active_thread_no_smt(cprimary)) {
5444 				continue;
5445 			}
5446 
5447 			/*
5448 			 * Find the idle secondary processor with the lowest priority primary
5449 			 *
5450 			 * We will choose this processor as a fallback if we find no better
5451 			 * primary to preempt.
5452 			 */
5453 			if (primary_pri < lowest_idle_secondary_priority) {
5454 				lp_idle_secondary_processor = processor;
5455 				lowest_idle_secondary_priority = primary_pri;
5456 			}
5457 
			/* Find the lowest priority active primary with idle secondary */
5459 			if (primary_pri < lowest_unpaired_primary_priority) {
5460 				/* If the primary processor is offline or starting up, it's not a candidate for this path */
5461 				if (cprimary->state != PROCESSOR_RUNNING &&
5462 				    cprimary->state != PROCESSOR_DISPATCHING) {
5463 					continue;
5464 				}
5465 
5466 				if (!cprimary->is_recommended) {
5467 					continue;
5468 				}
5469 
5470 				/* if the primary is pending preemption, don't try to re-preempt it */
5471 				if (bit_test(pset->pending_AST_URGENT_cpu_mask, cprimary->cpu_id)) {
5472 					continue;
5473 				}
5474 
5475 				if (SCHED(priority_is_urgent)(thread->sched_pri) == FALSE &&
5476 				    bit_test(pset->pending_AST_PREEMPT_cpu_mask, cprimary->cpu_id)) {
5477 					continue;
5478 				}
5479 
5480 				lowest_unpaired_primary_priority = primary_pri;
5481 				lp_unpaired_primary_processor = cprimary;
5482 			}
5483 		}
5484 
5485 		/*
5486 		 * We prefer preempting a primary processor over waking up its secondary.
5487 		 * The secondary will then be woken up by the preempted thread.
5488 		 */
5489 		if (thread->sched_pri > lowest_unpaired_primary_priority) {
5490 			pset->last_chosen = lp_unpaired_primary_processor->cpu_id;
5491 			return lp_unpaired_primary_processor;
5492 		}
5493 
5494 		/*
5495 		 * We prefer preempting a lower priority active processor over directly
5496 		 * waking up an idle secondary.
5497 		 * The preempted thread will then find the idle secondary.
5498 		 */
5499 		if (thread->sched_pri > lowest_priority) {
5500 			pset->last_chosen = lp_processor->cpu_id;
5501 			return lp_processor;
5502 		}
5503 
5504 		/*
5505 		 * lc_processor is used to indicate the best processor set run queue
5506 		 * on which to enqueue a thread when all available CPUs are busy with
5507 		 * higher priority threads, so try to make sure it is initialized.
5508 		 */
5509 		if (lc_processor == PROCESSOR_NULL) {
5510 			cpumap_t available_map = pset_available_cpumap(pset);
5511 			cpuid = lsb_first(available_map);
5512 			if (cpuid >= 0) {
5513 				lc_processor = processor_array[cpuid];
5514 				lowest_count = SCHED(processor_runq_count)(lc_processor);
5515 			}
5516 		}
5517 
5518 		/*
5519 		 * Move onto the next processor set.
5520 		 *
5521 		 * If all primary processors in this pset are running a higher
5522 		 * priority thread, move on to next pset. Only when we have
5523 		 * exhausted the search for primary processors do we
5524 		 * fall back to secondaries.
5525 		 */
5526 #if CONFIG_SCHED_EDGE
5527 		/*
5528 		 * The edge scheduler expects a CPU to be selected from the pset it passed in
5529 		 * as the starting pset for non-RT workloads. The edge migration algorithm
5530 		 * should already have considered idle CPUs and loads to decide the starting_pset;
		 * which means that this loop can be short-circuited.
5532 		 */
5533 		nset = starting_pset;
5534 #else /* CONFIG_SCHED_EDGE */
5535 		nset = next_pset(pset);
5536 #endif /* CONFIG_SCHED_EDGE */
5537 
5538 		if (nset != starting_pset) {
5539 			pset = change_locked_pset(pset, nset);
5540 		}
5541 	} while (nset != starting_pset);
5542 
5543 	/*
5544 	 * Make sure that we pick a running processor,
5545 	 * and that the correct processor set is locked.
5546 	 * Since we may have unlocked the candidate processor's
5547 	 * pset, it may have changed state.
5548 	 *
5549 	 * All primary processors are running a higher priority
5550 	 * thread, so the only options left are enqueuing on
5551 	 * the secondary processor that would perturb the least priority
5552 	 * primary, or the least busy primary.
5553 	 */
5554 
5555 	/* lowest_priority is evaluated in the main loops above */
5556 	if (lp_idle_secondary_processor != PROCESSOR_NULL) {
5557 		processor = lp_idle_secondary_processor;
5558 	} else if (lp_paired_secondary_processor != PROCESSOR_NULL) {
5559 		processor = lp_paired_secondary_processor;
5560 	} else if (lc_processor != PROCESSOR_NULL) {
5561 		processor = lc_processor;
5562 	} else {
5563 		processor = PROCESSOR_NULL;
5564 	}
5565 
5566 	if (processor) {
5567 		pset = change_locked_pset(pset, processor->processor_set);
5568 		/* Check that chosen processor is still usable */
5569 		cpumap_t available_map = pset_available_cpumap(pset);
5570 		if (bit_test(available_map, processor->cpu_id)) {
5571 			pset->last_chosen = processor->cpu_id;
5572 			return processor;
5573 		}
5574 
5575 		/* processor is no longer usable */
5576 		processor = PROCESSOR_NULL;
5577 	}
5578 
5579 	pset_assert_locked(pset);
5580 	pset_unlock(pset);
5581 	return PROCESSOR_NULL;
5582 }
5583 
5584 /*
5585  * Default implementation of SCHED(choose_node)()
5586  * for single node systems
5587  */
5588 pset_node_t
sched_choose_node(__unused thread_t thread)5589 sched_choose_node(__unused thread_t thread)
5590 {
5591 	return &pset_node0;
5592 }
5593 
5594 /*
5595  *	choose_starting_pset:
5596  *
5597  *	Choose a starting processor set for the thread.
5598  *	May return a processor hint within the pset.
5599  *
5600  *	Returns a starting processor set, to be used by
5601  *      choose_processor.
5602  *
5603  *	The thread must be locked.  The resulting pset is unlocked on return,
5604  *      and is chosen without taking any pset locks.
5605  */
processor_set_t
choose_starting_pset(pset_node_t node, thread_t thread, processor_t *processor_hint)
{
	processor_set_t pset;
	processor_t processor = PROCESSOR_NULL;

	if (thread->affinity_set != AFFINITY_SET_NULL) {
		/*
		 * Use affinity set policy hint.
		 */
		pset = thread->affinity_set->aset_pset;
	} else if (thread->last_processor != PROCESSOR_NULL) {
		/*
		 *	Simple (last processor) affinity case.
		 */
		processor = thread->last_processor;
		pset = processor->processor_set;
	} else {
		/*
		 *	No Affinity case:
		 *
		 *	Utilize a per task hint to spread threads
		 *	among the available processor sets.
		 * NRG this seems like the wrong thing to do.
		 * See also task->pset_hint = pset in thread_setrun()
		 */
		pset = get_threadtask(thread)->pset_hint;
		if (pset == PROCESSOR_SET_NULL) {
			/* No task hint recorded yet: start from the current pset. */
			pset = current_processor()->processor_set;
		}

		pset = choose_next_pset(pset);
	}

	if (!bit_test(node->pset_map, pset->pset_id)) {
		/* pset is not from this node so choose one that is */
		int id = lsb_first(node->pset_map);
		if (id < 0) {
			/* startup race, so check again under the node lock */
			lck_spin_lock(&pset_node_lock);
			if (bit_test(node->pset_map, pset->pset_id)) {
				id = pset->pset_id;
			} else {
				id = lsb_first(node->pset_map);
			}
			lck_spin_unlock(&pset_node_lock);
		}
		assert(id >= 0);
		pset = pset_array[id];
	}

	if (bit_count(node->pset_map) == 1) {
		/* Only a single pset in this node */
		goto out;
	}

	bool avoid_cpu0 = false;

#if defined(__x86_64__)
	if ((thread->sched_pri >= BASEPRI_RTQUEUES) && sched_avoid_cpu0) {
		/* Avoid the pset containing cpu0 */
		avoid_cpu0 = true;
		/* Assert that cpu0 is in pset0.  I expect this to be true on __x86_64__ */
		assert(bit_test(pset_array[0]->cpu_bitmask, 0));
	}
#endif

	if (thread->sched_pri >= BASEPRI_RTQUEUES) {
		/*
		 * Realtime thread: prefer a pset whose primary CPUs are not all
		 * busy with realtime work.  When avoiding cpu0, the candidate
		 * map is rotated right by one so pset 0 is considered last.
		 */
		pset_map_t rt_target_map = atomic_load(&node->pset_non_rt_primary_map);
		if ((avoid_cpu0 && pset->pset_id == 0) || !bit_test(rt_target_map, pset->pset_id)) {
			if (avoid_cpu0) {
				rt_target_map = bit_ror64(rt_target_map, 1);
			}
			int rotid = lsb_first(rt_target_map);
			if (rotid >= 0) {
				/* Undo the rotation to recover the real pset id. */
				int id = avoid_cpu0 ? ((rotid + 1) & 63) : rotid;
				pset = pset_array[id];
				goto out;
			}
		}
		if (!pset->is_SMT || !sched_allow_rt_smt) {
			/* All psets are full of RT threads - fall back to choose processor to find the furthest deadline RT thread */
			goto out;
		}
		/* Second pass: also consider psets with non-RT secondary (SMT) CPUs. */
		rt_target_map = atomic_load(&node->pset_non_rt_map);
		if ((avoid_cpu0 && pset->pset_id == 0) || !bit_test(rt_target_map, pset->pset_id)) {
			if (avoid_cpu0) {
				rt_target_map = bit_ror64(rt_target_map, 1);
			}
			int rotid = lsb_first(rt_target_map);
			if (rotid >= 0) {
				int id = avoid_cpu0 ? ((rotid + 1) & 63) : rotid;
				pset = pset_array[id];
				goto out;
			}
		}
		/* All psets are full of RT threads - fall back to choose processor to find the furthest deadline RT thread */
	} else {
		/* Non-realtime: prefer a pset that currently has an idle CPU. */
		pset_map_t idle_map = atomic_load(&node->pset_idle_map);
		if (!bit_test(idle_map, pset->pset_id)) {
			int next_idle_pset_id = lsb_first(idle_map);
			if (next_idle_pset_id >= 0) {
				pset = pset_array[next_idle_pset_id];
			}
		}
	}

out:
	/* Only pass the hint along if it still belongs to the chosen pset. */
	if ((processor != PROCESSOR_NULL) && (processor->processor_set != pset)) {
		processor = PROCESSOR_NULL;
	}
	if (processor != PROCESSOR_NULL) {
		*processor_hint = processor;
	}

	assert(pset != NULL);
	return pset;
}
5723 }
5724 
5725 /*
5726  *	thread_setrun:
5727  *
5728  *	Dispatch thread for execution, onto an idle
5729  *	processor or run queue, and signal a preemption
5730  *	as appropriate.
5731  *
5732  *	Thread must be locked.
5733  */
void
thread_setrun(
	thread_t                        thread,
	sched_options_t                 options)
{
	processor_t                     processor = PROCESSOR_NULL;
	processor_set_t         pset;

	/* The thread must be runnable and must not already be on a run queue. */
	assert((thread->state & (TH_RUN | TH_WAIT | TH_UNINT | TH_TERMINATE | TH_TERMINATE2)) == TH_RUN);
	thread_assert_runq_null(thread);

#if CONFIG_PREADOPT_TG
	/* We know that the thread is not in the runq by virtue of being in this
	 * function and the thread is not self since we are running. We can safely
	 * resolve the thread group hierarchy and modify the thread's thread group
	 * here. */
	thread_resolve_and_enforce_thread_group_hierarchy_if_needed(thread);
#endif

	/*
	 *	Update priority if needed.
	 */
	if (SCHED(can_update_priority)(thread)) {
		SCHED(update_priority)(thread);
	}
	thread->sfi_class = sfi_thread_classify(thread);

	if (thread->bound_processor == PROCESSOR_NULL) {
		/*
		 * Unbound case.
		 *
		 * Usually, this loop will only be executed once,
		 * but if CLPC derecommends a processor after it has been chosen,
		 * or if a processor is shut down after it is chosen,
		 * choose_processor() may return NULL, so a retry
		 * may be necessary.  A single retry will usually
		 * be enough, and we can't afford to retry too many times
		 * because interrupts are disabled.
		 */
#define CHOOSE_PROCESSOR_MAX_RETRIES 3
		for (int retry = 0; retry <= CHOOSE_PROCESSOR_MAX_RETRIES; retry++) {
			processor_t processor_hint = PROCESSOR_NULL;
			pset_node_t node = SCHED(choose_node)(thread);
			processor_set_t starting_pset = choose_starting_pset(node, thread, &processor_hint);

			pset_lock(starting_pset);

			/* On success, returns with the chosen processor's pset locked. */
			processor = SCHED(choose_processor)(starting_pset, processor_hint, thread);
			if (processor != PROCESSOR_NULL) {
				pset = processor->processor_set;
				pset_assert_locked(pset);
				break;
			}
		}
		/*
		 * If choose_processor() still returns NULL,
		 * which is very unlikely,
		 * choose the master_processor, which is always
		 * safe to choose.
		 */
		if (processor == PROCESSOR_NULL) {
			/* Choose fallback processor */
			processor = master_processor;
			pset = processor->processor_set;
			pset_lock(pset);
			assert((pset_available_cpu_count(pset) > 0) || (processor->state != PROCESSOR_OFF_LINE && processor->is_recommended));
		}
		/* Remember where this task's threads are landing, for future spreading. */
		task_t task = get_threadtask(thread);
		if (!(task->t_flags & TF_USE_PSET_HINT_CLUSTER_TYPE)) {
			task->pset_hint = pset; /* NRG this is done without holding the task lock */
		}
		SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT_IST(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR) | DBG_FUNC_NONE,
		    (uintptr_t)thread_tid(thread), (uintptr_t)-1, processor->cpu_id, processor->state, 0);
		assert((pset_available_cpu_count(pset) > 0) || (processor->state != PROCESSOR_OFF_LINE && processor->is_recommended));
	} else {
		/*
		 *	Bound case:
		 *
		 *	Unconditionally dispatch on the processor.
		 */
		processor = thread->bound_processor;
		pset = processor->processor_set;
		pset_lock(pset);

		SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT_IST(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR) | DBG_FUNC_NONE,
		    (uintptr_t)thread_tid(thread), (uintptr_t)-2, processor->cpu_id, processor->state, 0);
	}

	/*
	 *	Dispatch the thread on the chosen processor.
	 *	TODO: This should be based on sched_mode, not sched_pri
	 */
	if (thread->sched_pri >= BASEPRI_RTQUEUES) {
		realtime_setrun(processor, thread);
	} else {
		processor_setrun(processor, thread, options);
	}
	/* pset is now unlocked */
	if (thread->bound_processor == PROCESSOR_NULL) {
		SCHED(check_spill)(pset, thread);
	}
}
5836 
5837 processor_set_t
task_choose_pset(task_t task)5838 task_choose_pset(
5839 	task_t          task)
5840 {
5841 	processor_set_t         pset = task->pset_hint;
5842 
5843 	if (pset != PROCESSOR_SET_NULL) {
5844 		pset = choose_next_pset(pset);
5845 	}
5846 
5847 	return pset;
5848 }
5849 
5850 /*
5851  *	Check for a preemption point in
5852  *	the current context.
5853  *
5854  *	Called at splsched with thread locked.
5855  */
ast_t
csw_check(
	thread_t                thread,
	processor_t             processor,
	ast_t                   check_reason)
{
	processor_set_t pset = processor->processor_set;

	/* Only valid for the thread currently running on this processor. */
	assert(thread == processor->active_thread);

	pset_lock(pset);

	/* Publish the running thread's priority/state into the processor. */
	processor_state_update_from_thread(processor, thread, true);

	ast_t preempt = csw_check_locked(thread, processor, pset, check_reason);

	/* Acknowledge the IPI if we decided not to preempt */

	if ((preempt & AST_URGENT) == 0) {
		if (bit_clear_if_set(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
			KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_END, processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, 0, 8);
		}
	}

	if ((preempt & AST_PREEMPT) == 0) {
		bit_clear(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
	}

	pset_unlock(pset);

	/* May arm the deferred-preemption timer, or upgrade the AST to urgent. */
	return update_pending_nonurgent_preemption(processor, preempt);
}
5888 
5889 void
clear_pending_nonurgent_preemption(processor_t processor)5890 clear_pending_nonurgent_preemption(processor_t processor)
5891 {
5892 	if (!processor->pending_nonurgent_preemption) {
5893 		return;
5894 	}
5895 
5896 	KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_PREEMPT_TIMER_ACTIVE) | DBG_FUNC_END);
5897 
5898 	processor->pending_nonurgent_preemption = false;
5899 	running_timer_clear(processor, RUNNING_TIMER_PREEMPT);
5900 }
5901 
5902 ast_t
update_pending_nonurgent_preemption(processor_t processor,ast_t reason)5903 update_pending_nonurgent_preemption(processor_t processor, ast_t reason)
5904 {
5905 	if ((reason & (AST_URGENT | AST_PREEMPT)) != (AST_PREEMPT)) {
5906 		clear_pending_nonurgent_preemption(processor);
5907 		return reason;
5908 	}
5909 
5910 	if (nonurgent_preemption_timer_abs == 0) {
5911 		/* Preemption timer not enabled */
5912 		return reason;
5913 	}
5914 
5915 	if (current_thread()->state & TH_IDLE) {
5916 		/* idle threads don't need nonurgent preemption */
5917 		return reason;
5918 	}
5919 
5920 	if (processor->pending_nonurgent_preemption) {
5921 		/* Timer is already armed, no need to do it again */
5922 		return reason;
5923 	}
5924 
5925 	if (ml_did_interrupt_userspace()) {
5926 		/*
5927 		 * We're preempting userspace here, so we don't need
5928 		 * to defer the preemption.  Force AST_URGENT
5929 		 * so that we can avoid arming this timer without risking
5930 		 * ast_taken_user deciding to spend too long in kernel
5931 		 * space to handle other ASTs.
5932 		 */
5933 
5934 		return reason | AST_URGENT;
5935 	}
5936 
5937 	/*
5938 	 * We've decided to do a nonurgent preemption when running in
5939 	 * kernelspace. We defer the preemption until reaching userspace boundary
5940 	 * to give a grace period for locks etc to be dropped and to reach
5941 	 * a clean preemption point, so that the preempting thread doesn't
5942 	 * always immediately hit the lock that the waking thread still holds.
5943 	 *
5944 	 * Arm a timer to enforce that the preemption executes within a bounded
5945 	 * time if the thread doesn't block or return to userspace quickly.
5946 	 */
5947 
5948 	processor->pending_nonurgent_preemption = true;
5949 	KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_PREEMPT_TIMER_ACTIVE) | DBG_FUNC_START,
5950 	    reason);
5951 
5952 	uint64_t now = mach_absolute_time();
5953 
5954 	uint64_t deadline = now + nonurgent_preemption_timer_abs;
5955 
5956 	running_timer_enter(processor, RUNNING_TIMER_PREEMPT, NULL,
5957 	    now, deadline);
5958 
5959 	return reason;
5960 }
5961 
5962 /*
5963  * Check for preemption at splsched with
5964  * pset and thread locked
5965  */
ast_t
csw_check_locked(
	thread_t                thread,
	processor_t             processor,
	processor_set_t         pset,
	ast_t                   check_reason)
{
	/*
	 * If the current thread is running on a processor that is no longer recommended,
	 * urgently preempt it, at which point thread_select() should
	 * try to idle the processor and re-dispatch the thread to a recommended processor.
	 */
	if (!processor->is_recommended) {
		return check_reason | AST_PREEMPT | AST_URGENT;
	}

	/* A realtime thread is waiting to spill onto this processor. */
	if (bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) {
		return check_reason | AST_PREEMPT | AST_URGENT;
	}

	if (rt_runq_count(pset) > 0) {
		/*
		 * Queued realtime work: preempt urgently if a queued RT thread
		 * outranks us, our timeslice is spent, or its deadline (plus
		 * epsilon) beats this processor's current deadline; otherwise
		 * request an ordinary preemption.
		 */
		if ((rt_runq_priority(pset) > processor->current_pri) || !processor->first_timeslice) {
			return check_reason | AST_PREEMPT | AST_URGENT;
		} else if (deadline_add(rt_runq_earliest_deadline(pset), rt_deadline_epsilon) < processor->deadline) {
			return check_reason | AST_PREEMPT | AST_URGENT;
		} else {
			return check_reason | AST_PREEMPT;
		}
	}

	/* Ask the scheduler policy whether its run queues warrant preemption. */
	ast_t result = SCHED(processor_csw_check)(processor);
	if (result != AST_NONE) {
		/* Eager-preempt threads escalate any policy preemption to urgent. */
		return check_reason | result | (thread_is_eager_preempt(thread) ? AST_URGENT : AST_NONE);
	}

	/*
	 * Same for avoid-processor
	 *
	 * TODO: Should these set AST_REBALANCE?
	 */
	if (SCHED(avoid_processor_enabled) && SCHED(thread_avoid_processor)(processor, thread, check_reason)) {
		return check_reason | AST_PREEMPT;
	}

	/*
	 * Even though we could continue executing on this processor, a
	 * secondary SMT core should try to shed load to another primary core.
	 *
	 * TODO: Should this do the same check that thread_select does? i.e.
	 * if no bound threads target this processor, and idle primaries exist, preempt
	 * The case of RT threads existing is already taken care of above
	 */

	if (processor->current_pri < BASEPRI_RTQUEUES &&
	    processor->processor_primary != processor) {
		return check_reason | AST_PREEMPT;
	}

	/* A suspended thread should yield the processor. */
	if (thread->state & TH_SUSP) {
		return check_reason | AST_PREEMPT;
	}

#if CONFIG_SCHED_SFI
	/*
	 * Current thread may not need to be preempted, but maybe needs
	 * an SFI wait?
	 */
	result = sfi_thread_needs_ast(thread, NULL);
	if (result != AST_NONE) {
		return result;
	}
#endif

	return AST_NONE;
}
6041 
6042 /*
6043  * Handle preemption IPI or IPI in response to setting an AST flag
6044  * Triggered by cause_ast_check
6045  * Called at splsched
6046  */
void
ast_check(processor_t processor)
{
	/* Acknowledge a pending SMR IPI, if any. */
	smr_ack_ipi();

	if (processor->state != PROCESSOR_RUNNING &&
	    processor->state != PROCESSOR_SHUTDOWN) {
		/* Processor isn't executing threads; nothing to re-evaluate. */
		return;
	}

	SCHED_DEBUG_AST_CHECK_KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_SCHED,
	    MACH_SCHED_AST_CHECK) | DBG_FUNC_START);

	thread_t thread = processor->active_thread;

	assert(thread == current_thread());

	/*
	 * Pairs with task_restartable_ranges_synchronize
	 */
	thread_lock(thread);

	thread_reset_pcs_ack_IPI(thread);

	/*
	 * Propagate thread ast to processor.
	 * (handles IPI in response to setting AST flag)
	 */
	ast_propagate(thread);

	/*
	 * Stash the old urgency and perfctl values to find out if
	 * csw_check updates them.
	 */
	thread_urgency_t old_urgency = processor->current_urgency;
	perfcontrol_class_t old_perfctl_class = processor->current_perfctl_class;

	ast_t preempt;

	/* Re-run the preemption check and latch any resulting AST. */
	if ((preempt = csw_check(thread, processor, AST_NONE)) != AST_NONE) {
		ast_on(preempt);
	}

	if (old_urgency != processor->current_urgency) {
		/*
		 * Urgency updates happen with the thread lock held (ugh).
		 * TODO: This doesn't notice QoS changes...
		 */
		uint64_t urgency_param1, urgency_param2;

		thread_urgency_t urgency = thread_get_urgency(thread, &urgency_param1, &urgency_param2);
		thread_tell_urgency(urgency, urgency_param1, urgency_param2, 0, thread);
	}

	thread_unlock(thread);

	if (old_perfctl_class != processor->current_perfctl_class) {
		/*
		 * We updated the perfctl class of this thread from another core.
		 * Let CLPC know that the currently running thread has a new
		 * class.
		 */

		machine_switch_perfcontrol_state_update(PERFCONTROL_ATTR_UPDATE,
		    mach_approximate_time(), 0, thread);
	}

	SCHED_DEBUG_AST_CHECK_KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_SCHED,
	    MACH_SCHED_AST_CHECK) | DBG_FUNC_END, preempt);
}
6117 
6118 
6119 void
thread_preempt_expire(timer_call_param_t p0,__unused timer_call_param_t p1)6120 thread_preempt_expire(
6121 	timer_call_param_t      p0,
6122 	__unused timer_call_param_t      p1)
6123 {
6124 	processor_t processor = p0;
6125 
6126 	assert(processor == current_processor());
6127 	assert(p1 == NULL);
6128 
6129 	thread_t thread = current_thread();
6130 
6131 	/*
6132 	 * This is set and cleared by the current core, so we will
6133 	 * never see a race with running timer expiration
6134 	 */
6135 	assert(processor->pending_nonurgent_preemption);
6136 
6137 	clear_pending_nonurgent_preemption(processor);
6138 
6139 	thread_lock(thread);
6140 
6141 	/*
6142 	 * Check again to see if it's still worth a
6143 	 * context switch, but this time force enable kernel preemption
6144 	 */
6145 
6146 	ast_t preempt = csw_check(thread, processor, AST_URGENT);
6147 
6148 	if (preempt) {
6149 		ast_on(preempt);
6150 	}
6151 
6152 	thread_unlock(thread);
6153 
6154 	KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_PREEMPT_TIMER_ACTIVE), preempt);
6155 }
6156 
6157 
6158 /*
6159  *	set_sched_pri:
6160  *
6161  *	Set the scheduled priority of the specified thread.
6162  *
6163  *	This may cause the thread to change queues.
6164  *
6165  *	Thread must be locked.
6166  */
void
set_sched_pri(
	thread_t        thread,
	int16_t         new_priority,
	set_sched_pri_options_t options)
{
	bool is_current_thread = (thread == current_thread());
	bool removed_from_runq = false;
	bool lazy_update = ((options & SETPRI_LAZY) == SETPRI_LAZY);

	int16_t old_priority = thread->sched_pri;

	/* If we're already at this priority, no need to mess with the runqueue */
	if (new_priority == old_priority) {
#if CONFIG_SCHED_CLUTCH
		/* For the first thread in the system, the priority is correct but
		 * th_sched_bucket is still TH_BUCKET_RUN. Since the clutch
		 * scheduler relies on the bucket being set for all threads, update
		 * its bucket here.
		 */
		if (thread->th_sched_bucket == TH_BUCKET_RUN) {
			assert(thread == vm_pageout_scan_thread);
			SCHED(update_thread_bucket)(thread);
		}
#endif /* CONFIG_SCHED_CLUTCH */

		return;
	}

	if (is_current_thread) {
		/* A running thread is never on a run queue. */
		assert(thread->state & TH_RUN);
		thread_assert_runq_null(thread);
	} else {
		/* Pull the thread off its run queue (if any) before re-prioritizing. */
		removed_from_runq = thread_run_queue_remove(thread);
	}

	thread->sched_pri = new_priority;

#if CONFIG_SCHED_CLUTCH
	/*
	 * Since for the clutch scheduler, the thread's bucket determines its runq
	 * in the hierarchy it is important to update the bucket when the thread
	 * lock is held and the thread has been removed from the runq hierarchy.
	 */
	SCHED(update_thread_bucket)(thread);

#endif /* CONFIG_SCHED_CLUTCH */

	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHANGE_PRIORITY),
	    (uintptr_t)thread_tid(thread),
	    thread->base_pri,
	    thread->sched_pri,
	    thread->sched_usage,
	    0);

	if (removed_from_runq) {
		/* Requeue at the new priority and preempt if warranted. */
		thread_run_queue_reinsert(thread, SCHED_PREEMPT | SCHED_TAILQ);
	} else if (is_current_thread) {
		processor_t processor = thread->last_processor;
		assert(processor == current_processor());

		thread_urgency_t old_urgency = processor->current_urgency;

		/*
		 * When dropping in priority, check if the thread no longer belongs on core.
		 * If a thread raises its own priority, don't aggressively rebalance it.
		 * <rdar://problem/31699165>
		 *
		 * csw_check does a processor_state_update_from_thread, but
		 * we should do our own if we're being lazy.
		 */
		if (!lazy_update && new_priority < old_priority) {
			ast_t preempt;

			if ((preempt = csw_check(thread, processor, AST_NONE)) != AST_NONE) {
				ast_on(preempt);
			}
		} else {
			processor_state_update_from_thread(processor, thread, false);
		}

		/*
		 * set_sched_pri doesn't alter RT params. We expect direct base priority/QoS
		 * class alterations from user space to occur relatively infrequently, hence
		 * those are lazily handled. QoS classes have distinct priority bands, and QoS
		 * inheritance is expected to involve priority changes.
		 */
		if (processor->current_urgency != old_urgency) {
			uint64_t urgency_param1, urgency_param2;

			thread_urgency_t new_urgency = thread_get_urgency(thread,
			    &urgency_param1, &urgency_param2);

			thread_tell_urgency(new_urgency, urgency_param1,
			    urgency_param2, 0, thread);
		}

		/* TODO: only call this if current_perfctl_class changed */
		uint64_t ctime = mach_approximate_time();
		machine_thread_going_on_core(thread, processor->current_urgency, 0, 0, ctime);
	} else if (thread->state & TH_RUN) {
		processor_t processor = thread->last_processor;

		/*
		 * The thread is running on another core: poke it with an AST
		 * check so that core re-evaluates preemption.
		 */
		if (!lazy_update &&
		    processor != PROCESSOR_NULL &&
		    processor != current_processor() &&
		    processor->active_thread == thread) {
			cause_ast_check(processor);
		}
	}
}
6278 
6279 /*
6280  * thread_run_queue_remove_for_handoff
6281  *
6282  * Pull a thread or its (recursive) push target out of the runqueue
6283  * so that it is ready for thread_run()
6284  *
6285  * Called at splsched
6286  *
6287  * Returns the thread that was pulled or THREAD_NULL if no thread could be pulled.
6288  * This may be different than the thread that was passed in.
6289  */
6290 thread_t
thread_run_queue_remove_for_handoff(thread_t thread)6291 thread_run_queue_remove_for_handoff(thread_t thread)
6292 {
6293 	thread_t pulled_thread = THREAD_NULL;
6294 
6295 	thread_lock(thread);
6296 
6297 	/*
6298 	 * Check that the thread is not bound to a different processor,
6299 	 * NO_SMT flag is not set on the thread, cluster type of
6300 	 * processor matches with thread if the thread is pinned to a
6301 	 * particular cluster and that realtime is not involved.
6302 	 *
6303 	 * Next, pull it off its run queue.  If it doesn't come, it's not eligible.
6304 	 */
6305 	processor_t processor = current_processor();
6306 	if ((thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == processor)
6307 	    && (!thread_no_smt(thread))
6308 	    && (processor->current_pri < BASEPRI_RTQUEUES)
6309 	    && (thread->sched_pri < BASEPRI_RTQUEUES)
6310 #if __AMP__
6311 	    && ((thread->th_bound_cluster_id == THREAD_BOUND_CLUSTER_NONE) ||
6312 	    processor->processor_set->pset_id == thread->th_bound_cluster_id)
6313 #endif /* __AMP__ */
6314 	    ) {
6315 		if (thread_run_queue_remove(thread)) {
6316 			pulled_thread = thread;
6317 		}
6318 	}
6319 
6320 	thread_unlock(thread);
6321 
6322 	return pulled_thread;
6323 }
6324 
6325 /*
6326  * thread_prepare_for_handoff
6327  *
6328  * Make the thread ready for handoff.
6329  * If the thread was runnable then pull it off the runq, if the thread could
6330  * not be pulled, return NULL.
6331  *
6332  * If the thread was woken up from wait for handoff, make sure it is not bound to
6333  * different processor.
6334  *
6335  * Called at splsched
6336  *
6337  * Returns the thread that was pulled or THREAD_NULL if no thread could be pulled.
6338  * This may be different than the thread that was passed in.
6339  */
6340 thread_t
thread_prepare_for_handoff(thread_t thread,thread_handoff_option_t option)6341 thread_prepare_for_handoff(thread_t thread, thread_handoff_option_t option)
6342 {
6343 	thread_t pulled_thread = THREAD_NULL;
6344 
6345 	if (option & THREAD_HANDOFF_SETRUN_NEEDED) {
6346 		processor_t processor = current_processor();
6347 		thread_lock(thread);
6348 
6349 		/*
6350 		 * Check that the thread is not bound to a different processor,
6351 		 * NO_SMT flag is not set on the thread and cluster type of
6352 		 * processor matches with thread if the thread is pinned to a
6353 		 * particular cluster. Call setrun instead if above conditions
6354 		 * are not satisfied.
6355 		 */
6356 		if ((thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == processor)
6357 		    && (!thread_no_smt(thread))
6358 #if __AMP__
6359 		    && ((thread->th_bound_cluster_id == THREAD_BOUND_CLUSTER_NONE) ||
6360 		    processor->processor_set->pset_id == thread->th_bound_cluster_id)
6361 #endif /* __AMP__ */
6362 		    ) {
6363 			pulled_thread = thread;
6364 		} else {
6365 			thread_setrun(thread, SCHED_PREEMPT | SCHED_TAILQ);
6366 		}
6367 		thread_unlock(thread);
6368 	} else {
6369 		pulled_thread = thread_run_queue_remove_for_handoff(thread);
6370 	}
6371 
6372 	return pulled_thread;
6373 }
6374 
6375 /*
6376  *	thread_run_queue_remove:
6377  *
6378  *	Remove a thread from its current run queue and
6379  *	return TRUE if successful.
6380  *
6381  *	Thread must be locked.
6382  *
6383  *	If thread->runq is PROCESSOR_NULL, the thread will not re-enter the
6384  *	run queues because the caller locked the thread.  Otherwise
6385  *	the thread is on a run queue, but could be chosen for dispatch
6386  *	and removed by another processor under a different lock, which
6387  *	will set thread->runq to PROCESSOR_NULL.
6388  *
6389  *	Hence the thread select path must not rely on anything that could
6390  *	be changed under the thread lock after calling this function,
6391  *	most importantly thread->sched_pri.
6392  */
boolean_t
thread_run_queue_remove(
	thread_t        thread)
{
	boolean_t removed = FALSE;

	if ((thread->state & (TH_RUN | TH_WAIT)) == TH_WAIT) {
		/* Thread isn't runnable */
		thread_assert_runq_null(thread);
		return FALSE;
	}

	processor_t processor = thread_get_runq(thread);
	if (processor == PROCESSOR_NULL) {
		/*
		 * The thread is either not on the runq,
		 * or is in the midst of being removed from the runq.
		 *
		 * runq is set to NULL under the pset lock, not the thread
		 * lock, so the thread may still be in the process of being dequeued
		 * from the runq. It will wait in invoke for the thread lock to be
		 * dropped.
		 */

		return FALSE;
	}

	if (thread->sched_pri < BASEPRI_RTQUEUES) {
		/* Non-realtime: delegate removal to the active scheduler policy. */
		return SCHED(processor_queue_remove)(processor, thread);
	}

	/* Realtime: the RT run queue is protected by the pset lock. */
	processor_set_t pset = processor->processor_set;

	pset_lock(pset);

	/*
	 * Must re-read the thread runq after acquiring the pset lock, in
	 * case another core swooped in before us to dequeue the thread.
	 */
	if (thread_get_runq_locked(thread) != PROCESSOR_NULL) {
		/*
		 *	Thread is on the RT run queue and we have a lock on
		 *	that run queue.
		 */
		rt_runq_remove(SCHED(rt_runq)(pset), thread);
		pset_update_rt_stealable_state(pset);

		removed = TRUE;
	}

	pset_unlock(pset);

	return removed;
}
6447 
6448 /*
6449  * Put the thread back where it goes after a thread_run_queue_remove
6450  *
6451  * Thread must have been removed under the same thread lock hold
6452  *
6453  * thread locked, at splsched
6454  */
6455 void
thread_run_queue_reinsert(thread_t thread,sched_options_t options)6456 thread_run_queue_reinsert(thread_t thread, sched_options_t options)
6457 {
6458 	thread_assert_runq_null(thread);
6459 	assert(thread->state & (TH_RUN));
6460 
6461 	thread_setrun(thread, options);
6462 }
6463 
6464 void
sys_override_cpu_throttle(boolean_t enable_override)6465 sys_override_cpu_throttle(boolean_t enable_override)
6466 {
6467 	if (enable_override) {
6468 		cpu_throttle_enabled = 0;
6469 	} else {
6470 		cpu_throttle_enabled = 1;
6471 	}
6472 }
6473 
thread_urgency_t
thread_get_urgency(thread_t thread, uint64_t *arg1, uint64_t *arg2)
{
	uint64_t urgency_param1 = 0, urgency_param2 = 0;
	task_t task = get_threadtask_early(thread);

	thread_urgency_t urgency;

	if (thread == NULL || task == TASK_NULL || (thread->state & TH_IDLE)) {
		/* No thread/task, or the idle thread: no urgency to report. */
		urgency_param1 = 0;
		urgency_param2 = 0;

		urgency = THREAD_URGENCY_NONE;
	} else if (thread->sched_mode == TH_MODE_REALTIME) {
		/* Realtime threads report their RT period and deadline. */
		urgency_param1 = thread->realtime.period;
		urgency_param2 = thread->realtime.deadline;

		urgency = THREAD_URGENCY_REAL_TIME;
	} else if (cpu_throttle_enabled &&
	    (thread->sched_pri <= MAXPRI_THROTTLE) &&
	    (thread->base_pri <= MAXPRI_THROTTLE)) {
		/*
		 * Threads that are running at low priority but are not
		 * tagged with a specific QoS are separated out from
		 * the "background" urgency. Performance management
		 * subsystem can decide to either treat these threads
		 * as normal threads or look at other signals like thermal
		 * levels for optimal power/perf tradeoffs for a platform.
		 */
		boolean_t thread_lacks_qos = (proc_get_effective_thread_policy(thread, TASK_POLICY_QOS) == THREAD_QOS_UNSPECIFIED); /* i.e. !thread_has_qos_policy(thread) */
		boolean_t task_is_suppressed = (proc_get_effective_task_policy(task, TASK_POLICY_SUP_ACTIVE) == 0x1);

		/*
		 * BACKGROUND urgency applies when the thread (at or below
		 * MAXPRI_THROTTLE) either carries an explicit QoS or belongs
		 * to a suppressed task; otherwise report LOWPRI.
		 */
		urgency_param1 = thread->sched_pri;
		urgency_param2 = thread->base_pri;

		if (thread_lacks_qos && !task_is_suppressed) {
			urgency = THREAD_URGENCY_LOWPRI;
		} else {
			urgency = THREAD_URGENCY_BACKGROUND;
		}
	} else {
		/* For otherwise unclassified threads, report throughput QoS parameters */
		urgency_param1 = proc_get_effective_thread_policy(thread, TASK_POLICY_THROUGH_QOS);
		urgency_param2 = proc_get_effective_task_policy(task, TASK_POLICY_THROUGH_QOS);
		urgency = THREAD_URGENCY_NORMAL;
	}

	/* Output parameters are optional. */
	if (arg1 != NULL) {
		*arg1 = urgency_param1;
	}
	if (arg2 != NULL) {
		*arg2 = urgency_param2;
	}

	return urgency;
}
6535 
6536 perfcontrol_class_t
thread_get_perfcontrol_class(thread_t thread)6537 thread_get_perfcontrol_class(thread_t thread)
6538 {
6539 	/* Special case handling */
6540 	if (thread->state & TH_IDLE) {
6541 		return PERFCONTROL_CLASS_IDLE;
6542 	}
6543 
6544 	if (thread->sched_mode == TH_MODE_REALTIME) {
6545 		return PERFCONTROL_CLASS_REALTIME;
6546 	}
6547 
6548 	/* perfcontrol_class based on base_pri */
6549 	if (thread->base_pri <= MAXPRI_THROTTLE) {
6550 		return PERFCONTROL_CLASS_BACKGROUND;
6551 	} else if (thread->base_pri <= BASEPRI_UTILITY) {
6552 		return PERFCONTROL_CLASS_UTILITY;
6553 	} else if (thread->base_pri <= BASEPRI_DEFAULT) {
6554 		return PERFCONTROL_CLASS_NONUI;
6555 	} else if (thread->base_pri <= BASEPRI_USER_INITIATED) {
6556 		return PERFCONTROL_CLASS_USER_INITIATED;
6557 	} else if (thread->base_pri <= BASEPRI_FOREGROUND) {
6558 		return PERFCONTROL_CLASS_UI;
6559 	} else {
6560 		if (get_threadtask(thread) == kernel_task) {
6561 			/*
6562 			 * Classify Above UI kernel threads as PERFCONTROL_CLASS_KERNEL.
6563 			 * All other lower priority kernel threads should be treated
6564 			 * as regular threads for performance control purposes.
6565 			 */
6566 			return PERFCONTROL_CLASS_KERNEL;
6567 		}
6568 		return PERFCONTROL_CLASS_ABOVEUI;
6569 	}
6570 }
6571 
6572 /*
6573  *	This is the processor idle loop, which just looks for other threads
6574  *	to execute.  Processor idle threads invoke this without supplying a
6575  *	current thread to idle without an asserted wait state.
6576  *
6577  *	Returns a the next thread to execute if dispatched directly.
6578  */
6579 
6580 #if 0
6581 #define IDLE_KERNEL_DEBUG_CONSTANT(...) KERNEL_DEBUG_CONSTANT(__VA_ARGS__)
6582 #else
6583 #define IDLE_KERNEL_DEBUG_CONSTANT(...) do { } while(0)
6584 #endif
6585 
6586 #if (DEVELOPMENT || DEBUG)
6587 int sched_idle_delay_cpuid = -1;
6588 #endif
6589 
thread_t
processor_idle(
	thread_t                        thread,
	processor_t                     processor)
{
	processor_set_t         pset = processor->processor_set;
	struct recount_snap snap = { 0 };

	(void)splsched();

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_IDLE) | DBG_FUNC_START,
	    (uintptr_t)thread_tid(thread), 0, 0, 0, 0);

	SCHED_STATS_INC(idle_transitions);
	assert(processor->running_timers_active == false);

	/* Switch recount CPU-time accounting into the idle state. */
	recount_snapshot(&snap);
	recount_processor_idle(&processor->pr_recount, &snap);

	while (1) {
		/*
		 * Ensure that updates to my processor and pset state,
		 * made by the IPI source processor before sending the IPI,
		 * are visible on this processor now (even though we don't
		 * take the pset lock yet).
		 */
		atomic_thread_fence(memory_order_acquire);

		/* Exit the idle loop if anything needs this processor. */
		if (processor->state != PROCESSOR_IDLE) {
			break;
		}
		if (bit_test(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
			break;
		}
#if defined(CONFIG_SCHED_DEFERRED_AST)
		if (bit_test(pset->pending_deferred_AST_cpu_mask, processor->cpu_id)) {
			break;
		}
#endif
		if (bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) {
			break;
		}

		if (processor->is_recommended && (processor->processor_primary == processor)) {
			/* Recommended primary CPUs wake for pending realtime work. */
			if (rt_runq_count(pset)) {
				break;
			}
		} else {
			/* Secondary/derecommended CPUs wake only for bound work. */
			if (SCHED(processor_bound_count)(processor)) {
				break;
			}
		}

		IDLE_KERNEL_DEBUG_CONSTANT(
			MACHDBG_CODE(DBG_MACH_SCHED, MACH_IDLE) | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), rt_runq_count(pset), SCHED(processor_runq_count)(processor), -1, 0);

		machine_track_platform_idle(TRUE);

		machine_idle();
		/* returns with interrupts enabled */

		machine_track_platform_idle(FALSE);

#if (DEVELOPMENT || DEBUG)
		/* Test hook: artificially delay idle exit on the selected CPU. */
		if (processor->cpu_id == sched_idle_delay_cpuid) {
			delay(500);
		}
#endif

		(void)splsched();

		/* Re-sync with state other CPUs may have published while we slept. */
		atomic_thread_fence(memory_order_acquire);

		IDLE_KERNEL_DEBUG_CONSTANT(
			MACHDBG_CODE(DBG_MACH_SCHED, MACH_IDLE) | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), rt_runq_count(pset), SCHED(processor_runq_count)(processor), -2, 0);

		/*
		 * Check if we should call sched_timeshare_consider_maintenance() here.
		 * The CPU was woken out of idle due to an interrupt and we should do the
		 * call only if the processor is still idle. If the processor is non-idle,
		 * the threads running on the processor would do the call as part of
		 * context swithing.
		 */
		if (processor->state == PROCESSOR_IDLE) {
			sched_timeshare_consider_maintenance(mach_absolute_time(), true);
		}

		if (!SCHED(processor_queue_empty)(processor)) {
			/* Secondary SMT processors respond to directed wakeups
			 * exclusively. Some platforms induce 'spurious' SMT wakeups.
			 */
			if (processor->processor_primary == processor) {
				break;
			}
		}
	}

	/* Idle period over: switch accounting back to "run" and rejoin SMR. */
	recount_snapshot(&snap);
	recount_processor_run(&processor->pr_recount, &snap);
	smr_cpu_join(processor, snap.rsn_time_mach);

	ast_t reason = AST_NONE;

	/* We're handling all scheduling AST's */
	ast_off(AST_SCHEDULING);

	/*
	 * thread_select will move the processor from dispatching to running,
	 * or put it in idle if there's nothing to do.
	 */
	thread_t cur_thread = current_thread();

	thread_lock(cur_thread);
	thread_t new_thread = thread_select(cur_thread, processor, &reason);
	thread_unlock(cur_thread);

	assert(processor->running_timers_active == false);

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_IDLE) | DBG_FUNC_END,
	    (uintptr_t)thread_tid(thread), processor->state, (uintptr_t)thread_tid(new_thread), reason, 0);

	return new_thread;
}
6715 
6716 /*
6717  *	Each processor has a dedicated thread which
6718  *	executes the idle loop when there is no suitable
6719  *	previous context.
6720  *
6721  *	This continuation is entered with interrupts disabled.
6722  */
void
idle_thread(__assert_only void* parameter,
    __unused wait_result_t result)
{
	assert(ml_get_interrupts_enabled() == FALSE);
	assert(parameter == NULL);

	processor_t processor = current_processor();

	smr_cpu_leave(processor, processor->last_dispatch);

	/*
	 * Ensure that anything running in idle context triggers
	 * preemption-disabled checks.
	 */
	disable_preemption_without_measurements();

	/*
	 * Enable interrupts temporarily to handle any pending interrupts
	 * or IPIs before deciding to sleep
	 */
	spllo();

	thread_t new_thread = processor_idle(THREAD_NULL, processor);
	/* returns with interrupts disabled */

	enable_preemption();

	if (new_thread != THREAD_NULL) {
		/*
		 * Hand off directly to the selected thread; this continuation
		 * restarts from the top when the idle thread next runs.
		 */
		thread_run(processor->idle_thread,
		    idle_thread, NULL, new_thread);
		/*NOTREACHED*/
	}

	/* Nothing to run: block, re-entering this continuation next time. */
	thread_block(idle_thread);
	/*NOTREACHED*/
}
6760 
6761 kern_return_t
idle_thread_create(processor_t processor)6762 idle_thread_create(
6763 	processor_t             processor)
6764 {
6765 	kern_return_t   result;
6766 	thread_t                thread;
6767 	spl_t                   s;
6768 	char                    name[MAXTHREADNAMESIZE];
6769 
6770 	result = kernel_thread_create(idle_thread, NULL, MAXPRI_KERNEL, &thread);
6771 	if (result != KERN_SUCCESS) {
6772 		return result;
6773 	}
6774 
6775 	snprintf(name, sizeof(name), "idle #%d", processor->cpu_id);
6776 	thread_set_thread_name(thread, name);
6777 
6778 	s = splsched();
6779 	thread_lock(thread);
6780 	thread->bound_processor = processor;
6781 	processor->idle_thread = thread;
6782 	thread->sched_pri = thread->base_pri = IDLEPRI;
6783 	thread->state = (TH_RUN | TH_IDLE);
6784 	thread->options |= TH_OPT_IDLE_THREAD;
6785 	thread->last_made_runnable_time = thread->last_basepri_change_time = mach_absolute_time();
6786 	thread_unlock(thread);
6787 	splx(s);
6788 
6789 	thread_deallocate(thread);
6790 
6791 	return KERN_SUCCESS;
6792 }
6793 
6794 static void sched_update_powered_cores_continue(void);
6795 
6796 /*
6797  * sched_startup:
6798  *
6799  * Kicks off scheduler services.
6800  *
6801  * Called at splsched.
6802  */
6803 void
sched_startup(void)6804 sched_startup(void)
6805 {
6806 	kern_return_t   result;
6807 	thread_t                thread;
6808 
6809 	simple_lock_init(&sched_vm_group_list_lock, 0);
6810 
6811 	result = kernel_thread_start_priority((thread_continue_t)sched_init_thread,
6812 	    NULL, MAXPRI_KERNEL, &thread);
6813 	if (result != KERN_SUCCESS) {
6814 		panic("sched_startup");
6815 	}
6816 
6817 	thread_deallocate(thread);
6818 
6819 	assert_thread_magic(thread);
6820 
6821 	/*
6822 	 * Yield to the sched_init_thread once, to
6823 	 * initialize our own thread after being switched
6824 	 * back to.
6825 	 *
6826 	 * The current thread is the only other thread
6827 	 * active at this point.
6828 	 */
6829 	thread_block(THREAD_CONTINUE_NULL);
6830 
6831 	result = kernel_thread_start_priority((thread_continue_t)sched_update_powered_cores_continue,
6832 	    NULL, MAXPRI_KERNEL, &thread);
6833 	if (result != KERN_SUCCESS) {
6834 		panic("sched_startup");
6835 	}
6836 
6837 	thread_deallocate(thread);
6838 
6839 	assert_thread_magic(thread);
6840 }
6841 
6842 #if __arm64__
6843 static _Atomic uint64_t sched_perfcontrol_callback_deadline;
6844 #endif /* __arm64__ */
6845 
6846 
6847 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
6848 
6849 static volatile uint64_t                sched_maintenance_deadline;
6850 static uint64_t                         sched_tick_last_abstime;
6851 static uint64_t                         sched_tick_delta;
6852 uint64_t                                sched_tick_max_delta;
6853 
6854 
6855 /*
6856  *	sched_init_thread:
6857  *
6858  *	Perform periodic bookkeeping functions about ten
6859  *	times per second.
6860  */
void
sched_timeshare_maintenance_continue(void)
{
	uint64_t        sched_tick_ctime, late_time;

	/* Latency scan results start out "no runnable thread seen". */
	struct sched_update_scan_context scan_context = {
		.earliest_bg_make_runnable_time = UINT64_MAX,
		.earliest_normal_make_runnable_time = UINT64_MAX,
		.earliest_rt_make_runnable_time = UINT64_MAX
	};

	sched_tick_ctime = mach_absolute_time();

	if (__improbable(sched_tick_last_abstime == 0)) {
		/* First invocation: establish the timebase reference. */
		sched_tick_last_abstime = sched_tick_ctime;
		late_time = 0;
		sched_tick_delta = 1;
	} else {
		late_time = sched_tick_ctime - sched_tick_last_abstime;
		sched_tick_delta = late_time / sched_tick_interval;
		/* Ensure a delta of 1, since the interval could be slightly
		 * smaller than the sched_tick_interval due to dispatch
		 * latencies.
		 */
		sched_tick_delta = MAX(sched_tick_delta, 1);

		/* In the event interrupt latencies or platform
		 * idle events that advanced the timebase resulted
		 * in periods where no threads were dispatched,
		 * cap the maximum "tick delta" at SCHED_TICK_MAX_DELTA
		 * iterations.
		 */
		sched_tick_delta = MIN(sched_tick_delta, SCHED_TICK_MAX_DELTA);

		sched_tick_last_abstime = sched_tick_ctime;
		sched_tick_max_delta = MAX(sched_tick_delta, sched_tick_max_delta);
	}

	scan_context.sched_tick_last_abstime = sched_tick_last_abstime;
	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_MAINTENANCE) | DBG_FUNC_START,
	    sched_tick_delta, late_time, 0, 0, 0);

	/* Add a number of pseudo-ticks corresponding to the elapsed interval
	 * This could be greater than 1 if substantial intervals where
	 * all processors are idle occur, which rarely occurs in practice.
	 */

	sched_tick += sched_tick_delta;

	update_vm_info();

	/*
	 *  Compute various averages.
	 */
	compute_averages(sched_tick_delta);

	/*
	 *  Scan the run queues for threads which
	 *  may need to be updated, and find the earliest runnable thread on the runqueue
	 *  to report its latency.
	 */
	SCHED(thread_update_scan)(&scan_context);

	SCHED(rt_runq_scan)(&scan_context);

	uint64_t ctime = mach_absolute_time();

	/* Convert earliest make-runnable timestamps into worst-case latencies. */
	uint64_t bg_max_latency       = (ctime > scan_context.earliest_bg_make_runnable_time) ?
	    ctime - scan_context.earliest_bg_make_runnable_time : 0;

	uint64_t default_max_latency  = (ctime > scan_context.earliest_normal_make_runnable_time) ?
	    ctime - scan_context.earliest_normal_make_runnable_time : 0;

	uint64_t realtime_max_latency = (ctime > scan_context.earliest_rt_make_runnable_time) ?
	    ctime - scan_context.earliest_rt_make_runnable_time : 0;

	machine_max_runnable_latency(bg_max_latency, default_max_latency, realtime_max_latency);

	/*
	 * Check to see if the special sched VM group needs attention.
	 */
	sched_vm_group_maintenance();

#if __arm64__
	/* Check to see if the recommended cores failsafe is active */
	sched_recommended_cores_maintenance();
#endif /* __arm64__ */


#if DEBUG || DEVELOPMENT
#if __x86_64__
#include <i386/misc_protos.h>
	/* Check for long-duration interrupts */
	mp_interrupt_watchdog();
#endif /* __x86_64__ */
#endif /* DEBUG || DEVELOPMENT */

	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_MAINTENANCE) | DBG_FUNC_END,
	    sched_pri_shifts[TH_BUCKET_SHARE_FG], sched_pri_shifts[TH_BUCKET_SHARE_BG],
	    sched_pri_shifts[TH_BUCKET_SHARE_UT], sched_pri_shifts[TH_BUCKET_SHARE_DF], 0);

	/* Sleep until sched_timeshare_consider_maintenance() wakes us again. */
	assert_wait((event_t)sched_timeshare_maintenance_continue, THREAD_UNINT);
	thread_block((thread_continue_t)sched_timeshare_maintenance_continue);
	/*NOTREACHED*/
}
6966 
6967 static uint64_t sched_maintenance_wakeups;
6968 
6969 /*
6970  * Determine if the set of routines formerly driven by a maintenance timer
6971  * must be invoked, based on a deadline comparison. Signals the scheduler
6972  * maintenance thread on deadline expiration. Must be invoked at an interval
6973  * lower than the "sched_tick_interval", currently accomplished by
6974  * invocation via the quantum expiration timer and at context switch time.
6975  * Performance matters: this routine reuses a timestamp approximating the
6976  * current absolute time received from the caller, and should perform
6977  * no more than a comparison against the deadline in the common case.
6978  */
void
sched_timeshare_consider_maintenance(uint64_t ctime, bool safe_point)
{
	uint64_t deadline = sched_maintenance_deadline;

	if (__improbable(ctime >= deadline)) {
		/* The maintenance thread never signals itself. */
		if (__improbable(current_thread() == sched_maintenance_thread)) {
			return;
		}
		OSMemoryBarrier();

		uint64_t ndeadline = ctime + sched_tick_interval;

		/* Only the CPU that wins the CAS on the deadline issues the wakeup. */
		if (__probable(os_atomic_cmpxchg(&sched_maintenance_deadline, deadline, ndeadline, seq_cst))) {
			thread_wakeup((event_t)sched_timeshare_maintenance_continue);
			sched_maintenance_wakeups++;
			smr_maintenance(ctime);
		}
	}

	smr_cpu_tick(ctime, safe_point);

#if !CONFIG_SCHED_CLUTCH
	/*
	 * Only non-clutch schedulers use the global load calculation EWMA algorithm. For clutch
	 * scheduler, the load is maintained at the thread group and bucket level.
	 */
	uint64_t load_compute_deadline = os_atomic_load_wide(&sched_load_compute_deadline, relaxed);

	if (__improbable(load_compute_deadline && ctime >= load_compute_deadline)) {
		uint64_t new_deadline = 0;
		/* CAS the deadline to 0 so exactly one CPU computes the load. */
		if (os_atomic_cmpxchg(&sched_load_compute_deadline, load_compute_deadline, new_deadline, relaxed)) {
			compute_sched_load();
			new_deadline = ctime + sched_load_compute_interval_abs;
			os_atomic_store_wide(&sched_load_compute_deadline, new_deadline, relaxed);
		}
	}
#endif /* CONFIG_SCHED_CLUTCH */

#if __arm64__
	uint64_t perf_deadline = os_atomic_load(&sched_perfcontrol_callback_deadline, relaxed);

	if (__improbable(perf_deadline && ctime >= perf_deadline)) {
		/* CAS in 0, if success, make callback. Otherwise let the next context switch check again. */
		if (os_atomic_cmpxchg(&sched_perfcontrol_callback_deadline, perf_deadline, 0, relaxed)) {
			machine_perfcontrol_deadline_passed(perf_deadline);
		}
	}
#endif /* __arm64__ */
}
7029 
7030 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
7031 
7032 void
sched_init_thread(void)7033 sched_init_thread(void)
7034 {
7035 	thread_block(THREAD_CONTINUE_NULL);
7036 
7037 	thread_t thread = current_thread();
7038 
7039 	thread_set_thread_name(thread, "sched_maintenance_thread");
7040 
7041 	sched_maintenance_thread = thread;
7042 
7043 	SCHED(maintenance_continuation)();
7044 
7045 	/*NOTREACHED*/
7046 }
7047 
7048 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
7049 
7050 /*
7051  *	thread_update_scan / runq_scan:
7052  *
7053  *	Scan the run queues to account for timesharing threads
7054  *	which need to be updated.
7055  *
7056  *	Scanner runs in two passes.  Pass one squirrels likely
7057  *	threads away in an array, pass two does the update.
7058  *
7059  *	This is necessary because the run queue is locked for
7060  *	the candidate scan, but	the thread is locked for the update.
7061  *
7062  *	Array should be sized to make forward progress, without
7063  *	disabling preemption for long periods.
7064  */
7065 
7066 #define THREAD_UPDATE_SIZE              128
7067 
7068 static thread_t thread_update_array[THREAD_UPDATE_SIZE];
7069 static uint32_t thread_update_count = 0;
7070 
7071 /* Returns TRUE if thread was added, FALSE if thread_update_array is full */
7072 boolean_t
thread_update_add_thread(thread_t thread)7073 thread_update_add_thread(thread_t thread)
7074 {
7075 	if (thread_update_count == THREAD_UPDATE_SIZE) {
7076 		return FALSE;
7077 	}
7078 
7079 	thread_update_array[thread_update_count++] = thread;
7080 	thread_reference(thread);
7081 	return TRUE;
7082 }
7083 
7084 void
thread_update_process_threads(void)7085 thread_update_process_threads(void)
7086 {
7087 	assert(thread_update_count <= THREAD_UPDATE_SIZE);
7088 
7089 	for (uint32_t i = 0; i < thread_update_count; i++) {
7090 		thread_t thread = thread_update_array[i];
7091 		assert_thread_magic(thread);
7092 		thread_update_array[i] = THREAD_NULL;
7093 
7094 		spl_t s = splsched();
7095 		thread_lock(thread);
7096 		if (!(thread->state & (TH_WAIT)) && thread->sched_stamp != sched_tick) {
7097 			SCHED(update_priority)(thread);
7098 		}
7099 		thread_unlock(thread);
7100 		splx(s);
7101 
7102 		thread_deallocate(thread);
7103 	}
7104 
7105 	thread_update_count = 0;
7106 }
7107 
7108 static boolean_t
runq_scan_thread(thread_t thread,sched_update_scan_context_t scan_context)7109 runq_scan_thread(
7110 	thread_t thread,
7111 	sched_update_scan_context_t scan_context)
7112 {
7113 	assert_thread_magic(thread);
7114 
7115 	if (thread->sched_stamp != sched_tick &&
7116 	    thread->sched_mode == TH_MODE_TIMESHARE) {
7117 		if (thread_update_add_thread(thread) == FALSE) {
7118 			return TRUE;
7119 		}
7120 	}
7121 
7122 	if (cpu_throttle_enabled && ((thread->sched_pri <= MAXPRI_THROTTLE) && (thread->base_pri <= MAXPRI_THROTTLE))) {
7123 		if (thread->last_made_runnable_time < scan_context->earliest_bg_make_runnable_time) {
7124 			scan_context->earliest_bg_make_runnable_time = thread->last_made_runnable_time;
7125 		}
7126 	} else {
7127 		if (thread->last_made_runnable_time < scan_context->earliest_normal_make_runnable_time) {
7128 			scan_context->earliest_normal_make_runnable_time = thread->last_made_runnable_time;
7129 		}
7130 	}
7131 
7132 	return FALSE;
7133 }
7134 
7135 /*
7136  *	Scan a runq for candidate threads.
7137  *
7138  *	Returns TRUE if retry is needed.
7139  */
7140 boolean_t
runq_scan(run_queue_t runq,sched_update_scan_context_t scan_context)7141 runq_scan(
7142 	run_queue_t                   runq,
7143 	sched_update_scan_context_t   scan_context)
7144 {
7145 	int count       = runq->count;
7146 	int queue_index;
7147 
7148 	assert(count >= 0);
7149 
7150 	if (count == 0) {
7151 		return FALSE;
7152 	}
7153 
7154 	for (queue_index = bitmap_first(runq->bitmap, NRQS);
7155 	    queue_index >= 0;
7156 	    queue_index = bitmap_next(runq->bitmap, queue_index)) {
7157 		thread_t thread;
7158 		circle_queue_t queue = &runq->queues[queue_index];
7159 
7160 		cqe_foreach_element(thread, queue, runq_links) {
7161 			assert(count > 0);
7162 			if (runq_scan_thread(thread, scan_context) == TRUE) {
7163 				return TRUE;
7164 			}
7165 			count--;
7166 		}
7167 	}
7168 
7169 	return FALSE;
7170 }
7171 
7172 #if CONFIG_SCHED_CLUTCH
7173 
7174 boolean_t
sched_clutch_timeshare_scan(queue_t thread_queue,uint16_t thread_count,sched_update_scan_context_t scan_context)7175 sched_clutch_timeshare_scan(
7176 	queue_t thread_queue,
7177 	uint16_t thread_count,
7178 	sched_update_scan_context_t scan_context)
7179 {
7180 	if (thread_count == 0) {
7181 		return FALSE;
7182 	}
7183 
7184 	thread_t thread;
7185 	qe_foreach_element_safe(thread, thread_queue, th_clutch_timeshare_link) {
7186 		if (runq_scan_thread(thread, scan_context) == TRUE) {
7187 			return TRUE;
7188 		}
7189 		thread_count--;
7190 	}
7191 
7192 	assert(thread_count == 0);
7193 	return FALSE;
7194 }
7195 
7196 
7197 #endif /* CONFIG_SCHED_CLUTCH */
7198 
7199 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
7200 
7201 bool
thread_is_eager_preempt(thread_t thread)7202 thread_is_eager_preempt(thread_t thread)
7203 {
7204 	return thread->sched_flags & TH_SFLAG_EAGERPREEMPT;
7205 }
7206 
void
thread_set_eager_preempt(thread_t thread)
{
	spl_t s = splsched();
	thread_lock(thread);

	assert(!thread_is_eager_preempt(thread));

	thread->sched_flags |= TH_SFLAG_EAGERPREEMPT;

	if (thread == current_thread()) {
		/* csw_check updates current_is_eagerpreempt on the processor */
		ast_t ast = csw_check(thread, current_processor(), AST_NONE);

		thread_unlock(thread);

		if (ast != AST_NONE) {
			/* Preempt ourselves right away to honor the new flag. */
			thread_block_reason(THREAD_CONTINUE_NULL, NULL, ast);
		}
	} else {
		processor_t last_processor = thread->last_processor;

		/*
		 * If the thread appears to be on-core elsewhere, poke that
		 * processor so it re-evaluates preemption with the new flag.
		 */
		if (last_processor != PROCESSOR_NULL &&
		    last_processor->state == PROCESSOR_RUNNING &&
		    last_processor->active_thread == thread) {
			cause_ast_check(last_processor);
		}

		thread_unlock(thread);
	}

	splx(s);
}
7240 
7241 void
thread_clear_eager_preempt(thread_t thread)7242 thread_clear_eager_preempt(thread_t thread)
7243 {
7244 	spl_t s = splsched();
7245 	thread_lock(thread);
7246 
7247 	assert(thread_is_eager_preempt(thread));
7248 
7249 	thread->sched_flags &= ~TH_SFLAG_EAGERPREEMPT;
7250 
7251 	if (thread == current_thread()) {
7252 		current_processor()->current_is_eagerpreempt = false;
7253 	}
7254 
7255 	thread_unlock(thread);
7256 	splx(s);
7257 }
7258 
7259 /*
7260  * Scheduling statistics
7261  */
7262 void
sched_stats_handle_csw(processor_t processor,int reasons,int selfpri,int otherpri)7263 sched_stats_handle_csw(processor_t processor, int reasons, int selfpri, int otherpri)
7264 {
7265 	struct sched_statistics *stats;
7266 	boolean_t to_realtime = FALSE;
7267 
7268 	stats = PERCPU_GET_RELATIVE(sched_stats, processor, processor);
7269 	stats->csw_count++;
7270 
7271 	if (otherpri >= BASEPRI_REALTIME) {
7272 		stats->rt_sched_count++;
7273 		to_realtime = TRUE;
7274 	}
7275 
7276 	if ((reasons & AST_PREEMPT) != 0) {
7277 		stats->preempt_count++;
7278 
7279 		if (selfpri >= BASEPRI_REALTIME) {
7280 			stats->preempted_rt_count++;
7281 		}
7282 
7283 		if (to_realtime) {
7284 			stats->preempted_by_rt_count++;
7285 		}
7286 	}
7287 }
7288 
7289 void
sched_stats_handle_runq_change(struct runq_stats * stats,int old_count)7290 sched_stats_handle_runq_change(struct runq_stats *stats, int old_count)
7291 {
7292 	uint64_t timestamp = mach_absolute_time();
7293 
7294 	stats->count_sum += (timestamp - stats->last_change_timestamp) * old_count;
7295 	stats->last_change_timestamp = timestamp;
7296 }
7297 
/*
 * For calls from assembly code, which cannot expand the thread_wakeup()
 * macro: provide a real out-of-line definition (hence the #undef).
 */
#undef thread_wakeup
void
thread_wakeup(
	event_t         x);

void
thread_wakeup(
	event_t         x)
{
	/* Wake waiters on event x with the default THREAD_AWAKENED result. */
	thread_wakeup_with_result(x, THREAD_AWAKENED);
}
7312 
7313 boolean_t
preemption_enabled(void)7314 preemption_enabled(void)
7315 {
7316 	return get_preemption_level() == 0 && ml_get_interrupts_enabled();
7317 }
7318 
static void
sched_timer_deadline_tracking_init(void)
{
	/*
	 * Convert the default timer-deadline tracking bin boundaries from
	 * nanoseconds into mach absolute-time units.
	 */
	nanoseconds_to_absolutetime(TIMER_DEADLINE_TRACKING_BIN_1_DEFAULT, &timer_deadline_tracking_bin_1);
	nanoseconds_to_absolutetime(TIMER_DEADLINE_TRACKING_BIN_2_DEFAULT, &timer_deadline_tracking_bin_2);
}
7325 
/* Most recent powered-cores request from the perf controller. */
static uint64_t latest_requested_powered_cores = ALL_CORES_POWERED;
/* Reason that accompanied latest_requested_powered_cores. */
processor_reason_t latest_requested_reason = REASON_NONE;
/* Powered-cores request most recently picked up for application. */
static uint64_t current_requested_powered_cores = ALL_CORES_POWERED;
/* Set while sleep entry has forced all cores powered and recommended. */
bool perfcontrol_sleep_override = false;

LCK_GRP_DECLARE(cluster_powerdown_grp, "cluster_powerdown");
LCK_MTX_DECLARE(cluster_powerdown_lock, &cluster_powerdown_grp);
/* Nesting count of outstanding suspend_cluster_powerdown() calls. */
int32_t cluster_powerdown_suspend_count = 0;
7334 
bool
sched_is_in_sleep(void)
{
	/*
	 * Report whether the sleep override is currently in effect.
	 * The flag itself is written under sched_available_cores_lock
	 * elsewhere in this file; this lock-free read is preceded by an
	 * acquire fence to order it against subsequent loads.
	 */
	os_atomic_thread_fence(acquire);
	return perfcontrol_sleep_override;
}
7341 
static void
sched_update_powered_cores_continue(void)
{
	/*
	 * Body of the worker thread that applies powered-cores requests.
	 * It snapshots the latest request under the spinlock, applies it
	 * with all locks dropped, and repeats until the applied state
	 * matches the latest request; it then blocks until
	 * sched_perfcontrol_update_powered_cores() wakes it again.
	 */
	lck_mtx_lock(&cluster_powerdown_lock);

	if (!cluster_powerdown_suspend_count) {
		spl_t s = splsched();
		simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);

		uint64_t latest = latest_requested_powered_cores;
		processor_reason_t reason = latest_requested_reason;
		uint64_t current = current_requested_powered_cores;
		current_requested_powered_cores = latest;
		bool in_sleep = perfcontrol_sleep_override;

		simple_unlock(&sched_available_cores_lock);
		splx(s);

		while (latest != current) {
			if (!in_sleep) {
				assert((reason == REASON_CLPC_SYSTEM) || (reason == REASON_CLPC_USER));
				/* Apply with interrupts enabled and no spinlock held. */
				sched_update_powered_cores(latest, reason, SHUTDOWN_TEMPORARY | WAIT_FOR_LAST_START);
			}

			/* Re-snapshot: a newer request may have arrived meanwhile. */
			s = splsched();
			simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);

			latest = latest_requested_powered_cores;
			reason = latest_requested_reason;
			current = current_requested_powered_cores;
			current_requested_powered_cores = latest;
			in_sleep = perfcontrol_sleep_override;

			simple_unlock(&sched_available_cores_lock);
			splx(s);
		}

		assert_wait((event_t)sched_update_powered_cores_continue, THREAD_UNINT);

		/* If a request raced in after asserting the wait, cancel it. */
		s = splsched();
		simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
		if (latest_requested_powered_cores != current_requested_powered_cores) {
			clear_wait(current_thread(), THREAD_AWAKENED);
		}
		simple_unlock(&sched_available_cores_lock);
		splx(s);
	}

	lck_mtx_unlock(&cluster_powerdown_lock);

	thread_block((thread_continue_t)sched_update_powered_cores_continue);
	/*NOTREACHED*/
}
7395 
void
sched_perfcontrol_update_powered_cores(uint64_t requested_powered_cores, processor_reason_t reason, __unused uint32_t flags)
{
	/*
	 * Perf-controller entry point: record a new powered-cores request
	 * and wake sched_update_powered_cores_continue() to apply it
	 * asynchronously.  Requests are dropped while cluster powerdown
	 * is suspended.
	 */
	assert((reason == REASON_CLPC_SYSTEM) || (reason == REASON_CLPC_USER));

#if DEVELOPMENT || DEBUG
	/* Test-only flags: assert on current state instead of requesting. */
	if (flags & (ASSERT_IN_SLEEP | ASSERT_POWERDOWN_SUSPENDED)) {
		if (flags & ASSERT_POWERDOWN_SUSPENDED) {
			assert(cluster_powerdown_suspend_count > 0);
		}
		if (flags & ASSERT_IN_SLEEP) {
			assert(perfcontrol_sleep_override == true);
		}
		return;
	}
#endif

	spl_t s = splsched();
	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);

	bool should_wakeup = !cluster_powerdown_suspend_count;
	if (should_wakeup) {
		latest_requested_powered_cores = requested_powered_cores;
		latest_requested_reason = reason;
	}

	simple_unlock(&sched_available_cores_lock);
	splx(s);

	if (should_wakeup) {
		/* Wake the worker thread outside the spinlock. */
		thread_wakeup((event_t)sched_update_powered_cores_continue);
	}
}
7429 
void
suspend_cluster_powerdown(void)
{
	/*
	 * Take one (nestable) suspension of cluster powerdown.  The first
	 * suspender resets the pending request state to ALL_CORES_POWERED
	 * and powers every core up, locking the state until the matching
	 * resume_cluster_powerdown().
	 */
	lck_mtx_lock(&cluster_powerdown_lock);

	assert(cluster_powerdown_suspend_count >= 0);

	bool first_suspend = (cluster_powerdown_suspend_count == 0);
	if (first_suspend) {
		/* Discard any outstanding perf-controller request. */
		spl_t s = splsched();
		simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
		latest_requested_powered_cores = ALL_CORES_POWERED;
		current_requested_powered_cores = ALL_CORES_POWERED;
		latest_requested_reason = REASON_SYSTEM;
		simple_unlock(&sched_available_cores_lock);
		splx(s);
	}

	cluster_powerdown_suspend_count++;

	if (first_suspend) {
		kprintf("%s>calling sched_update_powered_cores(ALL_CORES_POWERED, REASON_SYSTEM, LOCK_STATE | WAIT_FOR_START)\n", __FUNCTION__);
		sched_update_powered_cores(ALL_CORES_POWERED, REASON_SYSTEM, LOCK_STATE | WAIT_FOR_START);
	}

	lck_mtx_unlock(&cluster_powerdown_lock);
}
7457 
void
resume_cluster_powerdown(void)
{
	/*
	 * Drop one suspension taken by suspend_cluster_powerdown().  The
	 * last resumer resets the request state and unlocks powered-core
	 * control.  Unbalanced calls are a fatal error.
	 */
	lck_mtx_lock(&cluster_powerdown_lock);

	if (cluster_powerdown_suspend_count <= 0) {
		panic("resume_cluster_powerdown() called with cluster_powerdown_suspend_count=%d\n", cluster_powerdown_suspend_count);
	}

	cluster_powerdown_suspend_count--;

	bool last_resume = (cluster_powerdown_suspend_count == 0);

	if (last_resume) {
		spl_t s = splsched();
		simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
		latest_requested_powered_cores = ALL_CORES_POWERED;
		current_requested_powered_cores = ALL_CORES_POWERED;
		latest_requested_reason = REASON_SYSTEM;
		simple_unlock(&sched_available_cores_lock);
		splx(s);

		kprintf("%s>calling sched_update_powered_cores(ALL_CORES_POWERED, REASON_SYSTEM, UNLOCK_STATE)\n", __FUNCTION__);
		sched_update_powered_cores(ALL_CORES_POWERED, REASON_SYSTEM, UNLOCK_STATE);
	}

	lck_mtx_unlock(&cluster_powerdown_lock);
}
7486 
/* Serializes user-initiated suspend/resume of cluster powerdown. */
LCK_MTX_DECLARE(user_cluster_powerdown_lock, &cluster_powerdown_grp);
/* True while userspace currently holds a powerdown suspension. */
static bool user_suspended_cluster_powerdown = false;
7489 
7490 kern_return_t
suspend_cluster_powerdown_from_user(void)7491 suspend_cluster_powerdown_from_user(void)
7492 {
7493 	kern_return_t ret = KERN_FAILURE;
7494 
7495 	lck_mtx_lock(&user_cluster_powerdown_lock);
7496 
7497 	if (!user_suspended_cluster_powerdown) {
7498 		suspend_cluster_powerdown();
7499 		user_suspended_cluster_powerdown = true;
7500 		ret = KERN_SUCCESS;
7501 	}
7502 
7503 	lck_mtx_unlock(&user_cluster_powerdown_lock);
7504 
7505 	return ret;
7506 }
7507 
7508 kern_return_t
resume_cluster_powerdown_from_user(void)7509 resume_cluster_powerdown_from_user(void)
7510 {
7511 	kern_return_t ret = KERN_FAILURE;
7512 
7513 	lck_mtx_lock(&user_cluster_powerdown_lock);
7514 
7515 	if (user_suspended_cluster_powerdown) {
7516 		resume_cluster_powerdown();
7517 		user_suspended_cluster_powerdown = false;
7518 		ret = KERN_SUCCESS;
7519 	}
7520 
7521 	lck_mtx_unlock(&user_cluster_powerdown_lock);
7522 
7523 	return ret;
7524 }
7525 
7526 int
get_cluster_powerdown_user_suspended(void)7527 get_cluster_powerdown_user_suspended(void)
7528 {
7529 	lck_mtx_lock(&user_cluster_powerdown_lock);
7530 
7531 	int ret = (int)user_suspended_cluster_powerdown;
7532 
7533 	lck_mtx_unlock(&user_cluster_powerdown_lock);
7534 
7535 	return ret;
7536 }
7537 
#if DEVELOPMENT || DEBUG
/* Functions to support the temporary sysctl */
/* Last raw value passed to sched_set_powered_cores(), for readback. */
static uint64_t saved_requested_powered_cores = ALL_CORES_POWERED;
7541 void
sched_set_powered_cores(int requested_powered_cores)7542 sched_set_powered_cores(int requested_powered_cores)
7543 {
7544 	processor_reason_t reason = bit_test(requested_powered_cores, 31) ? REASON_CLPC_USER : REASON_CLPC_SYSTEM;
7545 	uint32_t flags = requested_powered_cores & 0x30000000;
7546 
7547 	saved_requested_powered_cores = requested_powered_cores;
7548 
7549 	requested_powered_cores = bits(requested_powered_cores, 28, 0);
7550 
7551 	sched_perfcontrol_update_powered_cores(requested_powered_cores, reason, flags);
7552 }
int
sched_get_powered_cores(void)
{
	/* Report the raw value last recorded by sched_set_powered_cores(). */
	return (int)saved_requested_powered_cores;
}
7558 #endif
7559 
/*
 * Ensure that all cores are powered and recommended before sleep
 */
void
sched_override_available_cores_for_sleep(void)
{
	spl_t s = splsched();
	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);

	if (perfcontrol_sleep_override == false) {
		perfcontrol_sleep_override = true;
#if __arm__ || __arm64__
		/* Recommend every core, overriding perfcontrol/user masks. */
		sched_update_recommended_cores(ALL_CORES_RECOMMENDED, REASON_SYSTEM, 0);
#endif
	}

	simple_unlock(&sched_available_cores_lock);
	splx(s);

	/* Keep all cores powered until the matching restore call resumes. */
	suspend_cluster_powerdown();
}
7581 
/*
 * Restore the previously recommended cores, but leave all cores powered
 * after sleep
 */
void
sched_restore_available_cores_after_sleep(void)
{
	spl_t s = splsched();
	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);

	if (perfcontrol_sleep_override == true) {
		perfcontrol_sleep_override = false;
#if __arm__ || __arm64__
		/* Reapply the intersection of perfcontrol and user masks. */
		sched_update_recommended_cores(perfcontrol_requested_recommended_cores & usercontrol_requested_recommended_cores,
		    REASON_NONE, 0);
#endif
	}

	simple_unlock(&sched_available_cores_lock);
	splx(s);

	/* Balances the suspend taken in sched_override_available_cores_for_sleep(). */
	resume_cluster_powerdown();
}
7605 
#if __arm__ || __arm64__

/* Number of cores in the perf controller's current recommendation. */
uint32_t    perfcontrol_requested_recommended_core_count = MAX_CPUS;
/* True while the starvation failsafe has overridden the recommendation. */
bool        perfcontrol_failsafe_active = false;

uint64_t    perfcontrol_failsafe_maintenance_runnable_time;
uint64_t    perfcontrol_failsafe_activation_time;
uint64_t    perfcontrol_failsafe_deactivation_time;

/* data covering who likely caused it and how long they ran */
#define FAILSAFE_NAME_LEN       33 /* (2*MAXCOMLEN)+1 from size of p_name */
char        perfcontrol_failsafe_name[FAILSAFE_NAME_LEN];
int         perfcontrol_failsafe_pid;
uint64_t    perfcontrol_failsafe_tid;
uint64_t    perfcontrol_failsafe_thread_timer_at_start;
uint64_t    perfcontrol_failsafe_thread_timer_last_seen;
uint64_t    perfcontrol_failsafe_recommended_at_trigger;
7623 
7624 /*
7625  * Perf controller calls here to update the recommended core bitmask.
7626  * If the failsafe is active, we don't immediately apply the new value.
7627  * Instead, we store the new request and use it after the failsafe deactivates.
7628  *
7629  * If the failsafe is not active, immediately apply the update.
7630  *
7631  * No scheduler locks are held, no other locks are held that scheduler might depend on,
7632  * interrupts are enabled
7633  *
7634  * currently prototype is in osfmk/arm/machine_routines.h
7635  */
void
sched_perfcontrol_update_recommended_cores_reason(uint64_t recommended_cores, processor_reason_t reason, uint32_t flags)
{
	assert(preemption_enabled());

	spl_t s = splsched();
	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);

	/* Record the request in its per-reason slot. */
	if (reason == REASON_CLPC_SYSTEM) {
		perfcontrol_system_requested_recommended_cores = recommended_cores;
	} else {
		assert(reason == REASON_CLPC_USER);
		perfcontrol_user_requested_recommended_cores = recommended_cores;
	}

	/* The effective perfcontrol request is the intersection of both slots. */
	perfcontrol_requested_recommended_cores = perfcontrol_system_requested_recommended_cores & perfcontrol_user_requested_recommended_cores;
	perfcontrol_requested_recommended_core_count = __builtin_popcountll(perfcontrol_requested_recommended_cores);

	if ((perfcontrol_failsafe_active == false) && (perfcontrol_sleep_override == false)) {
		sched_update_recommended_cores(perfcontrol_requested_recommended_cores & usercontrol_requested_recommended_cores, reason, flags);
	} else {
		/* Deferred: the failsafe/sleep paths reapply on deactivation. */
		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
		    MACHDBG_CODE(DBG_MACH_SCHED, MACH_REC_CORES_FAILSAFE) | DBG_FUNC_NONE,
		    perfcontrol_requested_recommended_cores,
		    sched_maintenance_thread->last_made_runnable_time, 0, 0, 0);
	}

	simple_unlock(&sched_available_cores_lock);
	splx(s);
}
7666 
void
sched_perfcontrol_update_recommended_cores(uint32_t recommended_cores)
{
	/* Compatibility wrapper: 32-bit mask, REASON_CLPC_USER, no flags. */
	sched_perfcontrol_update_recommended_cores_reason(recommended_cores, REASON_CLPC_USER, 0);
}
7672 
7673 /*
7674  * Consider whether we need to activate the recommended cores failsafe
7675  *
7676  * Called from quantum timer interrupt context of a realtime thread
7677  * No scheduler locks are held, interrupts are disabled
7678  */
void
sched_consider_recommended_cores(uint64_t ctime, thread_t cur_thread)
{
	/*
	 * Check if a realtime thread is starving the system
	 * and bringing up non-recommended cores would help
	 *
	 * TODO: Is this the correct check for recommended == possible cores?
	 * TODO: Validate the checks without the relevant lock are OK.
	 */

	if (__improbable(perfcontrol_failsafe_active == TRUE)) {
		/* keep track of how long the responsible thread runs */
		uint64_t cur_th_time = recount_current_thread_time_mach();

		simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);

		/* Re-check under the lock before touching failsafe state. */
		if (perfcontrol_failsafe_active == TRUE &&
		    cur_thread->thread_id == perfcontrol_failsafe_tid) {
			perfcontrol_failsafe_thread_timer_last_seen = cur_th_time;
		}

		simple_unlock(&sched_available_cores_lock);

		/* we're already trying to solve the problem, so bail */
		return;
	}

	/* The failsafe won't help if there are no more processors to enable */
	if (__probable(perfcontrol_requested_recommended_core_count >= processor_count)) {
		return;
	}

	uint64_t too_long_ago = ctime - perfcontrol_failsafe_starvation_threshold;

	/* Use the maintenance thread as our canary in the coal mine */
	thread_t m_thread = sched_maintenance_thread;

	/* If it doesn't look bad, nothing to see here */
	if (__probable(m_thread->last_made_runnable_time >= too_long_ago)) {
		return;
	}

	/* It looks bad, take the lock to be sure */
	thread_lock(m_thread);

	if (thread_get_runq(m_thread) == PROCESSOR_NULL ||
	    (m_thread->state & (TH_RUN | TH_WAIT)) != TH_RUN ||
	    m_thread->last_made_runnable_time >= too_long_ago) {
		/*
		 * Maintenance thread is either on cpu or blocked, and
		 * therefore wouldn't benefit from more cores
		 */
		thread_unlock(m_thread);
		return;
	}

	uint64_t maintenance_runnable_time = m_thread->last_made_runnable_time;

	thread_unlock(m_thread);

	/*
	 * There are cores disabled at perfcontrol's recommendation, but the
	 * system is so overloaded that the maintenance thread can't run.
	 * That likely means that perfcontrol can't run either, so it can't fix
	 * the recommendation.  We have to kick in a failsafe to keep from starving.
	 *
	 * When the maintenance thread has been starved for too long,
	 * ignore the recommendation from perfcontrol and light up all the cores.
	 *
	 * TODO: Consider weird states like boot, sleep, or debugger
	 */

	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);

	/* Someone else may have activated the failsafe while we were unlocked. */
	if (perfcontrol_failsafe_active == TRUE) {
		simple_unlock(&sched_available_cores_lock);
		return;
	}

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_REC_CORES_FAILSAFE) | DBG_FUNC_START,
	    perfcontrol_requested_recommended_cores, maintenance_runnable_time, 0, 0, 0);

	perfcontrol_failsafe_active = TRUE;
	perfcontrol_failsafe_activation_time = mach_absolute_time();
	perfcontrol_failsafe_maintenance_runnable_time = maintenance_runnable_time;
	perfcontrol_failsafe_recommended_at_trigger = perfcontrol_requested_recommended_cores;

	/* Capture some data about who screwed up (assuming that the thread on core is at fault) */
	task_t task = get_threadtask(cur_thread);
	perfcontrol_failsafe_pid = task_pid(task);
	strlcpy(perfcontrol_failsafe_name, proc_name_address(get_bsdtask_info(task)), sizeof(perfcontrol_failsafe_name));

	perfcontrol_failsafe_tid = cur_thread->thread_id;

	/* Blame the thread for time it has run recently */
	uint64_t recent_computation = (ctime - cur_thread->computation_epoch) + cur_thread->computation_metered;

	uint64_t last_seen = recount_current_thread_time_mach();

	/* Compute the start time of the bad behavior in terms of the thread's on core time */
	perfcontrol_failsafe_thread_timer_at_start  = last_seen - recent_computation;
	perfcontrol_failsafe_thread_timer_last_seen = last_seen;

	/* Ignore the previously recommended core configuration */
	sched_update_recommended_cores(ALL_CORES_RECOMMENDED, REASON_SYSTEM, 0);

	simple_unlock(&sched_available_cores_lock);
}
7789 
7790 /*
7791  * Now that our bacon has been saved by the failsafe, consider whether to turn it off
7792  *
7793  * Runs in the context of the maintenance thread, no locks held
7794  */
static void
sched_recommended_cores_maintenance(void)
{
	/*
	 * Deactivate the recommended-cores failsafe once the starvation
	 * threshold has elapsed since activation, restoring the requested
	 * recommendation and printing a diagnostic about the culprit.
	 */

	/* Common case - no failsafe, nothing to be done here */
	if (__probable(perfcontrol_failsafe_active == FALSE)) {
		return;
	}

	uint64_t ctime = mach_absolute_time();

	boolean_t print_diagnostic = FALSE;
	char p_name[FAILSAFE_NAME_LEN] = "";

	spl_t s = splsched();
	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);

	/* Check again, under the lock, to avoid races */
	if (perfcontrol_failsafe_active == FALSE) {
		goto out;
	}

	/*
	 * Ensure that the other cores get another few ticks to run some threads
	 * If we don't have this hysteresis, the maintenance thread is the first
	 * to run, and then it immediately kills the other cores
	 */
	if ((ctime - perfcontrol_failsafe_activation_time) < perfcontrol_failsafe_starvation_threshold) {
		goto out;
	}

	/* Capture some diagnostic state under the lock so we can print it out later */

	int      pid = perfcontrol_failsafe_pid;
	uint64_t tid = perfcontrol_failsafe_tid;

	uint64_t thread_usage       = perfcontrol_failsafe_thread_timer_last_seen -
	    perfcontrol_failsafe_thread_timer_at_start;
	uint64_t rec_cores_before   = perfcontrol_failsafe_recommended_at_trigger;
	uint64_t rec_cores_after    = perfcontrol_requested_recommended_cores;
	uint64_t failsafe_duration  = ctime - perfcontrol_failsafe_activation_time;
	strlcpy(p_name, perfcontrol_failsafe_name, sizeof(p_name));

	print_diagnostic = TRUE;

	/* Deactivate the failsafe and reinstate the requested recommendation settings */

	perfcontrol_failsafe_deactivation_time = ctime;
	perfcontrol_failsafe_active = FALSE;

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_REC_CORES_FAILSAFE) | DBG_FUNC_END,
	    perfcontrol_requested_recommended_cores, failsafe_duration, 0, 0, 0);

	sched_update_recommended_cores(perfcontrol_requested_recommended_cores & usercontrol_requested_recommended_cores,
	    REASON_NONE, 0);

out:
	simple_unlock(&sched_available_cores_lock);
	splx(s);

	/* Print outside the lock; kprintf/printf can be slow. */
	if (print_diagnostic) {
		uint64_t failsafe_duration_ms = 0, thread_usage_ms = 0;

		absolutetime_to_nanoseconds(failsafe_duration, &failsafe_duration_ms);
		failsafe_duration_ms = failsafe_duration_ms / NSEC_PER_MSEC;

		absolutetime_to_nanoseconds(thread_usage, &thread_usage_ms);
		thread_usage_ms = thread_usage_ms / NSEC_PER_MSEC;

		printf("recommended core failsafe kicked in for %lld ms "
		    "likely due to %s[%d] thread 0x%llx spending "
		    "%lld ms on cpu at realtime priority - "
		    "new recommendation: 0x%llx -> 0x%llx\n",
		    failsafe_duration_ms, p_name, pid, tid, thread_usage_ms,
		    rec_cores_before, rec_cores_after);
	}
}
7872 
#endif /* __arm__ || __arm64__ */
7874 
kern_return_t
sched_processor_enable(processor_t processor, boolean_t enable)
{
	/*
	 * User-driven enable/disable of a processor, implemented by editing
	 * usercontrol_requested_recommended_cores and reapplying the
	 * effective recommendation.  The master processor cannot be
	 * disabled.
	 */
	assert(preemption_enabled());

	if (processor == master_processor) {
		/* The system can hang if this is allowed */
		return KERN_NOT_SUPPORTED;
	}

	spl_t s = splsched();
	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);

	if (enable) {
		bit_set(usercontrol_requested_recommended_cores, processor->cpu_id);
	} else {
		bit_clear(usercontrol_requested_recommended_cores, processor->cpu_id);
	}

#if __arm64__
	if ((perfcontrol_failsafe_active == false) && (perfcontrol_sleep_override == false)) {
		sched_update_recommended_cores(perfcontrol_requested_recommended_cores & usercontrol_requested_recommended_cores,
		    REASON_USER, 0);
	} else {
		/* Deferred while the failsafe or sleep override is in force. */
		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
		    MACHDBG_CODE(DBG_MACH_SCHED, MACH_REC_CORES_FAILSAFE) | DBG_FUNC_NONE,
		    perfcontrol_requested_recommended_cores,
		    sched_maintenance_thread->last_made_runnable_time, 0, 0, 0);
	}
#else /* __arm64__ */
	sched_update_recommended_cores(usercontrol_requested_recommended_cores, REASON_USER, 0);
#endif /* ! __arm64__ */

	simple_unlock(&sched_available_cores_lock);
	splx(s);

	return KERN_SUCCESS;
}
7913 
void
sched_mark_processor_online_locked(processor_t processor, __assert_only processor_reason_t reason)
{
	/*
	 * Add a processor to the scheduler's online mask.  The master
	 * processor may only be brought online by the system.
	 * NOTE(review): the "_locked" suffix suggests callers hold
	 * sched_available_cores_lock — confirm at call sites.
	 */
	assert((processor != master_processor) || (reason == REASON_SYSTEM));

	bit_set(sched_online_processors, processor->cpu_id);
}
7921 
7922 kern_return_t
sched_mark_processor_offline(processor_t processor,processor_reason_t reason)7923 sched_mark_processor_offline(processor_t processor, processor_reason_t reason)
7924 {
7925 	assert((processor != master_processor) || (reason == REASON_SYSTEM));
7926 	kern_return_t ret = KERN_SUCCESS;
7927 
7928 	spl_t s = splsched();
7929 	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
7930 
7931 	if (reason == REASON_SYSTEM) {
7932 		bit_clear(sched_online_processors, processor->cpu_id);
7933 		simple_unlock(&sched_available_cores_lock);
7934 		splx(s);
7935 		return ret;
7936 	}
7937 
7938 	uint64_t available_cores = sched_online_processors & perfcontrol_requested_recommended_cores & usercontrol_requested_recommended_cores;
7939 
7940 	if (!bit_test(sched_online_processors, processor->cpu_id)) {
7941 		/* Processor is already offline */
7942 		ret = KERN_NOT_IN_SET;
7943 	} else if (available_cores == BIT(processor->cpu_id)) {
7944 		ret = KERN_RESOURCE_SHORTAGE;
7945 	} else {
7946 		bit_clear(sched_online_processors, processor->cpu_id);
7947 		ret = KERN_SUCCESS;
7948 	}
7949 
7950 	simple_unlock(&sched_available_cores_lock);
7951 	splx(s);
7952 
7953 	return ret;
7954 }
7955 
7956 /*
7957  * Apply a new recommended cores mask to the processors it affects
7958  * Runs after considering failsafes and such
7959  *
7960  * Iterate over processors and update their ->is_recommended field.
7961  * If a processor is running, we let it drain out at its next
7962  * quantum expiration or blocking point. If a processor is idle, there
7963  * may be more work for it to do, so IPI it.
7964  *
7965  * interrupts disabled, sched_available_cores_lock is held
7966  */
static void
sched_update_recommended_cores(uint64_t recommended_cores, processor_reason_t reason, __unused uint32_t flags)
{
	uint64_t        needs_exit_idle_mask = 0x0;

	KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_UPDATE_REC_CORES) | DBG_FUNC_START,
	    recommended_cores,
#if __arm64__
	    perfcontrol_failsafe_active, 0, 0);
#else /* __arm64__ */
	    0, 0, 0);
#endif /* ! __arm64__ */

	if (__builtin_popcountll(recommended_cores & sched_online_processors) == 0) {
		bit_set(recommended_cores, master_processor->cpu_id); /* add boot processor or we hang */
	}

	/* First set recommended cores */
	for (pset_node_t node = &pset_node0; node != NULL; node = node->node_list) {
		for (int pset_id = lsb_first(node->pset_map); pset_id >= 0; pset_id = lsb_next(node->pset_map, pset_id)) {
			processor_set_t pset = pset_array[pset_id];

			/* CPUs in this pset whose recommendation flips on. */
			cpumap_t changed_recommendations = (recommended_cores & pset->cpu_bitmask) ^ pset->recommended_bitmask;
			cpumap_t newly_recommended = changed_recommendations & recommended_cores;

			if (newly_recommended == 0) {
				/* Nothing to do */
				continue;
			}

			pset_lock(pset);

			for (int cpu_id = lsb_first(newly_recommended); cpu_id >= 0; cpu_id = lsb_next(newly_recommended, cpu_id)) {
				processor_t processor = processor_array[cpu_id];
				processor->is_recommended = TRUE;
				processor->last_recommend_reason = reason;
				bit_set(pset->recommended_bitmask, processor->cpu_id);

				if (processor->state == PROCESSOR_IDLE) {
					/* Idle CPUs (other than ours) must be poked to pick up work. */
					if (processor != current_processor()) {
						bit_set(needs_exit_idle_mask, processor->cpu_id);
					}
				}
				if ((processor->state != PROCESSOR_OFF_LINE) && (processor->state != PROCESSOR_PENDING_OFFLINE)) {
					os_atomic_inc(&processor_avail_count_user, relaxed);
					if (processor->processor_primary == processor) {
						os_atomic_inc(&primary_processor_avail_count_user, relaxed);
					}
					SCHED(pset_made_schedulable)(processor, pset, false);
				}
			}
			pset_update_rt_stealable_state(pset);

			pset_unlock(pset);

			/* Notify SMR of newly-usable CPUs with the pset lock dropped. */
			for (int cpu_id = lsb_first(newly_recommended); cpu_id >= 0;
			    cpu_id = lsb_next(newly_recommended, cpu_id)) {
				smr_cpu_up(processor_array[cpu_id],
				    SMR_CPU_REASON_IGNORED);
			}
		}
	}

	/* Now shutdown not recommended cores */
	for (pset_node_t node = &pset_node0; node != NULL; node = node->node_list) {
		for (int pset_id = lsb_first(node->pset_map); pset_id >= 0; pset_id = lsb_next(node->pset_map, pset_id)) {
			processor_set_t pset = pset_array[pset_id];

			/* CPUs in this pset whose recommendation flips off. */
			cpumap_t changed_recommendations = (recommended_cores & pset->cpu_bitmask) ^ pset->recommended_bitmask;
			cpumap_t newly_unrecommended = changed_recommendations & ~recommended_cores;

			if (newly_unrecommended == 0) {
				/* Nothing to do */
				continue;
			}

			pset_lock(pset);

			for (int cpu_id = lsb_first(newly_unrecommended); cpu_id >= 0; cpu_id = lsb_next(newly_unrecommended, cpu_id)) {
				processor_t processor = processor_array[cpu_id];
				sched_ipi_type_t ipi_type = SCHED_IPI_NONE;

				processor->is_recommended = FALSE;
				if (reason != REASON_NONE) {
					processor->last_derecommend_reason = reason;
				}
				bit_clear(pset->recommended_bitmask, processor->cpu_id);
				if ((processor->state != PROCESSOR_OFF_LINE) && (processor->state != PROCESSOR_PENDING_OFFLINE)) {
					os_atomic_dec(&processor_avail_count_user, relaxed);
					if (processor->processor_primary == processor) {
						os_atomic_dec(&primary_processor_avail_count_user, relaxed);
					}
				}
				pset_update_rt_stealable_state(pset);

				/* A running/dispatching CPU must be preempted immediately. */
				if ((processor->state == PROCESSOR_RUNNING) || (processor->state == PROCESSOR_DISPATCHING)) {
					ipi_type = SCHED_IPI_IMMEDIATE;
				}
				SCHED(processor_queue_shutdown)(processor);
				/* pset unlocked */

				SCHED(rt_queue_shutdown)(processor);

				if (ipi_type == SCHED_IPI_NONE) {
					/*
					 * If the core is idle,
					 * we can directly mark the processor
					 * as "Ignored"
					 *
					 * Otherwise, smr will detect this
					 * during smr_cpu_leave() when the
					 * processor actually idles.
					 */
					smr_cpu_down(processor, SMR_CPU_REASON_IGNORED);
				} else if (processor == current_processor()) {
					ast_on(AST_PREEMPT);
				} else {
					sched_ipi_perform(processor, ipi_type);
				}

				pset_lock(pset);
			}
			pset_unlock(pset);
		}
	}

#if defined(__x86_64__)
	commpage_update_active_cpus();
#endif
	/* Issue all pending IPIs now that the pset lock has been dropped */
	for (int cpuid = lsb_first(needs_exit_idle_mask); cpuid >= 0; cpuid = lsb_next(needs_exit_idle_mask, cpuid)) {
		processor_t processor = processor_array[cpuid];
		machine_signal_idle(processor);
	}

	KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_UPDATE_REC_CORES) | DBG_FUNC_END,
	    needs_exit_idle_mask, 0, 0, 0);
}
8105 
/*
 * sched_update_powered_cores()
 *
 * Reconcile the set of powered (started) CPUs with requested_powered_cores:
 * first start every newly-requested core, then shut down every core that is
 * powered but no longer requested.  LOCK_STATE/UNLOCK_STATE flags are only
 * valid for a REASON_SYSTEM request covering ALL_CORES_POWERED (asserted
 * below).  Pset locks are taken only briefly to snapshot state; the actual
 * start/exit calls are made without any pset lock held.
 */
static void
sched_update_powered_cores(uint64_t requested_powered_cores, processor_reason_t reason, uint32_t flags)
{
	KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_UPDATE_POWERED_CORES) | DBG_FUNC_START,
	    requested_powered_cores, reason, flags, 0);

	assert((flags & (LOCK_STATE | UNLOCK_STATE)) ? (reason == REASON_SYSTEM) && (requested_powered_cores == ALL_CORES_POWERED) : 1);

	/*
	 * Loop through newly set requested_powered_cores and start them.
	 * Loop through newly cleared requested_powered_cores and shut them down.
	 */

	if ((reason == REASON_CLPC_SYSTEM) || (reason == REASON_CLPC_USER)) {
		/* CLPC-driven power-downs are temporary, not administrative shutdowns */
		flags |= SHUTDOWN_TEMPORARY;
	}

	/* First set powered cores */
	cpumap_t started_cores = 0ull;
	for (pset_node_t node = &pset_node0; node != NULL; node = node->node_list) {
		for (int pset_id = lsb_first(node->pset_map); pset_id >= 0; pset_id = lsb_next(node->pset_map, pset_id)) {
			processor_set_t pset = pset_array[pset_id];

			/* Snapshot the pset's current powered-core state under the lock */
			spl_t s = splsched();
			pset_lock(pset);
			cpumap_t pset_requested_powered_cores = requested_powered_cores & pset->cpu_bitmask;
			cpumap_t powered_cores = (pset->cpu_state_map[PROCESSOR_START] | pset->cpu_state_map[PROCESSOR_IDLE] | pset->cpu_state_map[PROCESSOR_DISPATCHING] | pset->cpu_state_map[PROCESSOR_RUNNING]);
			cpumap_t requested_changes = pset_requested_powered_cores ^ powered_cores;
			pset_unlock(pset);
			splx(s);

			cpumap_t newly_powered = requested_changes & requested_powered_cores;

			cpumap_t cpu_map = newly_powered;

			if (flags & (LOCK_STATE | UNLOCK_STATE)) {
				/*
				 * We need to change the lock state even if
				 * we don't need to change the actual state.
				 */
				cpu_map = pset_requested_powered_cores;
				/* But not the master_processor, which is always implicitly locked */
				bit_clear(cpu_map, master_processor->cpu_id);
			}

			if (cpu_map == 0) {
				/* Nothing to do */
				continue;
			}

			/* Start each selected core; no pset lock held across the call */
			for (int cpu_id = lsb_first(cpu_map); cpu_id >= 0; cpu_id = lsb_next(cpu_map, cpu_id)) {
				processor_t processor = processor_array[cpu_id];
				processor_start_reason(processor, reason, flags);
				bit_set(started_cores, cpu_id);
			}
		}
	}
	if (flags & WAIT_FOR_LAST_START) {
		/* Block until every core we kicked off above has finished starting */
		for (int cpu_id = lsb_first(started_cores); cpu_id >= 0; cpu_id = lsb_next(started_cores, cpu_id)) {
			processor_t processor = processor_array[cpu_id];
			processor_wait_for_start(processor);
		}
	}

	/* Now shutdown not powered cores */
	for (pset_node_t node = &pset_node0; node != NULL; node = node->node_list) {
		for (int pset_id = lsb_first(node->pset_map); pset_id >= 0; pset_id = lsb_next(node->pset_map, pset_id)) {
			processor_set_t pset = pset_array[pset_id];

			/* Re-snapshot: state may have changed since the start pass */
			spl_t s = splsched();
			pset_lock(pset);
			cpumap_t powered_cores = (pset->cpu_state_map[PROCESSOR_START] | pset->cpu_state_map[PROCESSOR_IDLE] | pset->cpu_state_map[PROCESSOR_DISPATCHING] | pset->cpu_state_map[PROCESSOR_RUNNING]);
			cpumap_t requested_changes = (requested_powered_cores & pset->cpu_bitmask) ^ powered_cores;
			pset_unlock(pset);
			splx(s);

			cpumap_t newly_unpowered = requested_changes & ~requested_powered_cores;

			if (newly_unpowered == 0) {
				/* Nothing to do */
				continue;
			}

			for (int cpu_id = lsb_first(newly_unpowered); cpu_id >= 0; cpu_id = lsb_next(newly_unpowered, cpu_id)) {
				processor_t processor = processor_array[cpu_id];

				processor_exit_reason(processor, reason, flags);
			}
		}
	}

	KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_UPDATE_POWERED_CORES) | DBG_FUNC_END, 0, 0, 0, 0);
}
8199 
8200 void
thread_set_options(uint32_t thopt)8201 thread_set_options(uint32_t thopt)
8202 {
8203 	spl_t x;
8204 	thread_t t = current_thread();
8205 
8206 	x = splsched();
8207 	thread_lock(t);
8208 
8209 	t->options |= thopt;
8210 
8211 	thread_unlock(t);
8212 	splx(x);
8213 }
8214 
/*
 * Record a block hint on the thread.  No locking is performed here;
 * the hint is presumably consumed when the thread next blocks —
 * NOTE(review): the consumer is outside this file, confirm there.
 */
void
thread_set_pending_block_hint(thread_t thread, block_hint_t block_hint)
{
	thread->pending_block_hint = block_hint;
}
8220 
/*
 * Forward to the active scheduler's qos_max_parallelism() implementation
 * via the SCHED() dispatch macro.
 */
uint32_t
qos_max_parallelism(int qos, uint64_t options)
{
	return SCHED(qos_max_parallelism)(qos, options);
}
8226 
8227 uint32_t
sched_qos_max_parallelism(__unused int qos,uint64_t options)8228 sched_qos_max_parallelism(__unused int qos, uint64_t options)
8229 {
8230 	host_basic_info_data_t hinfo;
8231 	mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT;
8232 
8233 
8234 	/*
8235 	 * The QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE should be used on AMP platforms only which
8236 	 * implement their own qos_max_parallelism() interfaces.
8237 	 */
8238 	assert((options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) == 0);
8239 
8240 	/* Query the machine layer for core information */
8241 	__assert_only kern_return_t kret = host_info(host_self(), HOST_BASIC_INFO,
8242 	    (host_info_t)&hinfo, &count);
8243 	assert(kret == KERN_SUCCESS);
8244 
8245 	if (options & QOS_PARALLELISM_COUNT_LOGICAL) {
8246 		return hinfo.logical_cpu;
8247 	} else {
8248 		return hinfo.physical_cpu;
8249 	}
8250 }
8251 
8252 int sched_allow_NO_SMT_threads = 1;
8253 bool
thread_no_smt(thread_t thread)8254 thread_no_smt(thread_t thread)
8255 {
8256 	return sched_allow_NO_SMT_threads &&
8257 	       (thread->bound_processor == PROCESSOR_NULL) &&
8258 	       ((thread->sched_flags & TH_SFLAG_NO_SMT) || (get_threadtask(thread)->t_flags & TF_NO_SMT));
8259 }
8260 
8261 bool
processor_active_thread_no_smt(processor_t processor)8262 processor_active_thread_no_smt(processor_t processor)
8263 {
8264 	return sched_allow_NO_SMT_threads && !processor->current_is_bound && processor->current_is_NO_SMT;
8265 }
8266 
8267 #if __arm64__
8268 
8269 /*
8270  * Set up or replace old timer with new timer
8271  *
8272  * Returns true if canceled old timer, false if it did not
8273  */
8274 boolean_t
sched_perfcontrol_update_callback_deadline(uint64_t new_deadline)8275 sched_perfcontrol_update_callback_deadline(uint64_t new_deadline)
8276 {
8277 	/*
8278 	 * Exchange deadline for new deadline, if old deadline was nonzero,
8279 	 * then I cancelled the callback, otherwise I didn't
8280 	 */
8281 
8282 	return os_atomic_xchg(&sched_perfcontrol_callback_deadline, new_deadline,
8283 	           relaxed) != 0;
8284 }
8285 
8286 /*
8287  * Set global SFI window (in usec)
8288  */
8289 kern_return_t
sched_perfcontrol_sfi_set_window(uint64_t window_usecs)8290 sched_perfcontrol_sfi_set_window(uint64_t window_usecs)
8291 {
8292 	kern_return_t ret = KERN_NOT_SUPPORTED;
8293 #if CONFIG_THREAD_GROUPS
8294 	if (window_usecs == 0ULL) {
8295 		ret = sfi_window_cancel();
8296 	} else {
8297 		ret = sfi_set_window(window_usecs);
8298 	}
8299 #endif // CONFIG_THREAD_GROUPS
8300 	return ret;
8301 }
8302 
8303 /*
8304  * Set background and maintenance SFI class offtimes
8305  */
8306 kern_return_t
sched_perfcontrol_sfi_set_bg_offtime(uint64_t offtime_usecs)8307 sched_perfcontrol_sfi_set_bg_offtime(uint64_t offtime_usecs)
8308 {
8309 	kern_return_t ret = KERN_NOT_SUPPORTED;
8310 #if CONFIG_THREAD_GROUPS
8311 	if (offtime_usecs == 0ULL) {
8312 		ret = sfi_class_offtime_cancel(SFI_CLASS_MAINTENANCE);
8313 		ret |= sfi_class_offtime_cancel(SFI_CLASS_DARWIN_BG);
8314 	} else {
8315 		ret = sfi_set_class_offtime(SFI_CLASS_MAINTENANCE, offtime_usecs);
8316 		ret |= sfi_set_class_offtime(SFI_CLASS_DARWIN_BG, offtime_usecs);
8317 	}
8318 #endif // CONFIG_THREAD_GROUPS
8319 	return ret;
8320 }
8321 
8322 /*
8323  * Set utility SFI class offtime
8324  */
8325 kern_return_t
sched_perfcontrol_sfi_set_utility_offtime(uint64_t offtime_usecs)8326 sched_perfcontrol_sfi_set_utility_offtime(uint64_t offtime_usecs)
8327 {
8328 	kern_return_t ret = KERN_NOT_SUPPORTED;
8329 #if CONFIG_THREAD_GROUPS
8330 	if (offtime_usecs == 0ULL) {
8331 		ret = sfi_class_offtime_cancel(SFI_CLASS_UTILITY);
8332 	} else {
8333 		ret = sfi_set_class_offtime(SFI_CLASS_UTILITY, offtime_usecs);
8334 	}
8335 #endif // CONFIG_THREAD_GROUPS
8336 	return ret;
8337 }
8338 
8339 #endif /* __arm64__ */
8340 
8341 #if CONFIG_SCHED_EDGE
8342 
8343 #define SCHED_PSET_LOAD_EWMA_TC_NSECS 10000000u
8344 
8345 /*
8346  * sched_edge_pset_running_higher_bucket()
8347  *
8348  * Routine to calculate cumulative running counts for each scheduling
8349  * bucket. This effectively lets the load calculation calculate if a
8350  * cluster is running any threads at a QoS lower than the thread being
8351  * migrated etc.
8352  */
8353 
/*
 * For every CPU in this pset currently in PROCESSOR_RUNNING, reads the
 * bucket of the thread it is running (relaxed atomic load) and increments
 * running_higher[] for that bucket and every numerically-higher bucket.
 * Caller supplies running_higher[] zeroed with TH_BUCKET_SCHED_MAX slots
 * and holds the pset lock (per the caller's documented contract).
 */
static void
sched_edge_pset_running_higher_bucket(processor_set_t pset, uint32_t *running_higher)
{
	bitmap_t *active_map = &pset->cpu_state_map[PROCESSOR_RUNNING];

	/* Edge Scheduler Optimization */
	for (int cpu = bitmap_first(active_map, MAX_CPUS); cpu >= 0; cpu = bitmap_next(active_map, cpu)) {
		sched_bucket_t cpu_bucket = os_atomic_load(&pset->cpu_running_buckets[cpu], relaxed);
		/* Cumulative: a CPU running bucket b counts toward b and all buckets after it */
		for (sched_bucket_t bucket = cpu_bucket; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
			running_higher[bucket]++;
		}
	}
}
8367 
8368 /*
8369  * sched_update_pset_load_average()
8370  *
8371  * Updates the load average for each sched bucket for a cluster.
8372  * This routine must be called with the pset lock held.
8373  */
/*
 * Edge-scheduler implementation: maintains a per-bucket EWMA (24.8 fixed
 * point) of cluster load, plus per-type shared-resource load.  Must be
 * called with the pset lock held (see header comment above).
 */
void
sched_update_pset_load_average(processor_set_t pset, uint64_t curtime)
{
	int avail_cpu_count = pset_available_cpu_count(pset);
	if (avail_cpu_count == 0) {
		/* Looks like the pset is not runnable any more; nothing to do here */
		return;
	}

	/*
	 * Edge Scheduler Optimization
	 *
	 * See if more callers of this routine can pass in timestamps to avoid the
	 * mach_absolute_time() call here.
	 */

	if (!curtime) {
		curtime = mach_absolute_time();
	}
	uint64_t last_update = os_atomic_load(&pset->pset_load_last_update, relaxed);
	int64_t delta_ticks = curtime - last_update;
	if (delta_ticks < 0) {
		/* A concurrent update already advanced past curtime; skip this stale sample */
		return;
	}

	uint64_t delta_nsecs = 0;
	absolutetime_to_nanoseconds(delta_ticks, &delta_nsecs);

	/* Clamp so the later (depth * delta_nsecs) products cannot overflow badly */
	if (__improbable(delta_nsecs > UINT32_MAX)) {
		delta_nsecs = UINT32_MAX;
	}

#if CONFIG_SCHED_EDGE
	/* NOTE(review): redundant guard — this whole function is already under CONFIG_SCHED_EDGE */
	/* Update the shared resource load on the pset */
	for (cluster_shared_rsrc_type_t shared_rsrc_type = CLUSTER_SHARED_RSRC_TYPE_MIN; shared_rsrc_type < CLUSTER_SHARED_RSRC_TYPE_COUNT; shared_rsrc_type++) {
		uint64_t shared_rsrc_runnable_load = sched_edge_shared_rsrc_runnable_load(&pset->pset_clutch_root, shared_rsrc_type);
		uint64_t shared_rsrc_running_load = bit_count(pset->cpu_running_cluster_shared_rsrc_thread[shared_rsrc_type]);
		uint64_t new_shared_load = shared_rsrc_runnable_load + shared_rsrc_running_load;
		uint64_t old_shared_load = os_atomic_xchg(&pset->pset_cluster_shared_rsrc_load[shared_rsrc_type], new_shared_load, relaxed);
		if (old_shared_load != new_shared_load) {
			KTRC(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_CLUSTER_SHARED_LOAD) | DBG_FUNC_NONE, pset->pset_cluster_id, shared_rsrc_type, new_shared_load, shared_rsrc_running_load);
		}
	}
#endif /* CONFIG_SCHED_EDGE */

	uint32_t running_higher[TH_BUCKET_SCHED_MAX] = {0};
	sched_edge_pset_running_higher_bucket(pset, running_higher);

	for (sched_bucket_t sched_bucket = TH_BUCKET_FIXPRI; sched_bucket < TH_BUCKET_SCHED_MAX; sched_bucket++) {
		uint64_t old_load_average = os_atomic_load(&pset->pset_load_average[sched_bucket], relaxed);
		uint64_t old_load_average_factor = old_load_average * SCHED_PSET_LOAD_EWMA_TC_NSECS;
		uint32_t current_runq_depth = (sched_edge_cluster_cumulative_count(&pset->pset_clutch_root, sched_bucket) +  rt_runq_count(pset) + running_higher[sched_bucket]) / avail_cpu_count;

		/*
		 * For the new load average multiply current_runq_depth by delta_nsecs (which results in a 32.0 value).
		 * Since we want to maintain the load average as a 24.8 fixed arithmetic value for precision, the
		 * new load average needs to be shifted before it can be added to the old load average.
		 */
		uint64_t new_load_average_factor = (current_runq_depth * delta_nsecs) << SCHED_PSET_LOAD_EWMA_FRACTION_BITS;

		/*
		 * For extremely parallel workloads, it is important that the load average on a cluster moves zero to non-zero
		 * instantly to allow threads to be migrated to other (potentially idle) clusters quickly. Hence use the EWMA
		 * when the system is already loaded; otherwise for an idle system use the latest load average immediately.
		 */
		int old_load_shifted = (int)((old_load_average + SCHED_PSET_LOAD_EWMA_ROUND_BIT) >> SCHED_PSET_LOAD_EWMA_FRACTION_BITS);
		boolean_t load_uptick = (old_load_shifted == 0) && (current_runq_depth != 0);
		boolean_t load_downtick = (old_load_shifted != 0) && (current_runq_depth == 0);
		uint64_t load_average;
		if (load_uptick || load_downtick) {
			/* Zero <-> non-zero transition: take the new value immediately */
			load_average = (current_runq_depth << SCHED_PSET_LOAD_EWMA_FRACTION_BITS);
		} else {
			/* Indicates a loaded system; use EWMA for load average calculation */
			load_average = (old_load_average_factor + new_load_average_factor) / (delta_nsecs + SCHED_PSET_LOAD_EWMA_TC_NSECS);
		}
		os_atomic_store(&pset->pset_load_average[sched_bucket], load_average, relaxed);
		if (load_average != old_load_average) {
			KTRC(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_LOAD_AVG) | DBG_FUNC_NONE, pset->pset_cluster_id, (load_average >> SCHED_PSET_LOAD_EWMA_FRACTION_BITS), load_average & SCHED_PSET_LOAD_EWMA_FRACTION_MASK, sched_bucket);
		}
	}
	os_atomic_store(&pset->pset_load_last_update, curtime, relaxed);
}
8456 
8457 void
sched_update_pset_avg_execution_time(processor_set_t pset,uint64_t execution_time,uint64_t curtime,sched_bucket_t sched_bucket)8458 sched_update_pset_avg_execution_time(processor_set_t pset, uint64_t execution_time, uint64_t curtime, sched_bucket_t sched_bucket)
8459 {
8460 	pset_execution_time_t old_execution_time_packed, new_execution_time_packed;
8461 	uint64_t avg_thread_execution_time = 0;
8462 
8463 	os_atomic_rmw_loop(&pset->pset_execution_time[sched_bucket].pset_execution_time_packed,
8464 	    old_execution_time_packed.pset_execution_time_packed,
8465 	    new_execution_time_packed.pset_execution_time_packed, relaxed, {
8466 		uint64_t last_update = old_execution_time_packed.pset_execution_time_last_update;
8467 		int64_t delta_ticks = curtime - last_update;
8468 		if (delta_ticks < 0) {
8469 		        /*
8470 		         * Its possible that another CPU came in and updated the pset_execution_time
8471 		         * before this CPU could do it. Since the average execution time is meant to
8472 		         * be an approximate measure per cluster, ignore the older update.
8473 		         */
8474 		        os_atomic_rmw_loop_give_up(return );
8475 		}
8476 		uint64_t delta_nsecs = 0;
8477 		absolutetime_to_nanoseconds(delta_ticks, &delta_nsecs);
8478 
8479 		uint64_t nanotime = 0;
8480 		absolutetime_to_nanoseconds(execution_time, &nanotime);
8481 		uint64_t execution_time_us = nanotime / NSEC_PER_USEC;
8482 
8483 		uint64_t old_execution_time = (old_execution_time_packed.pset_avg_thread_execution_time * SCHED_PSET_LOAD_EWMA_TC_NSECS);
8484 		uint64_t new_execution_time = (execution_time_us * delta_nsecs);
8485 
8486 		avg_thread_execution_time = (old_execution_time + new_execution_time) / (delta_nsecs + SCHED_PSET_LOAD_EWMA_TC_NSECS);
8487 		new_execution_time_packed.pset_avg_thread_execution_time = avg_thread_execution_time;
8488 		new_execution_time_packed.pset_execution_time_last_update = curtime;
8489 	});
8490 	if (new_execution_time_packed.pset_avg_thread_execution_time != old_execution_time_packed.pset_execution_time_packed) {
8491 		KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PSET_AVG_EXEC_TIME) | DBG_FUNC_NONE, pset->pset_cluster_id, avg_thread_execution_time, sched_bucket);
8492 	}
8493 }
8494 
/*
 * Return the cluster's current shared-resource load for the given type,
 * as last published by sched_update_pset_load_average() (relaxed load).
 */
uint64_t
sched_pset_cluster_shared_rsrc_load(processor_set_t pset, cluster_shared_rsrc_type_t shared_rsrc_type)
{
	return os_atomic_load(&pset->pset_cluster_shared_rsrc_load[shared_rsrc_type], relaxed);
}
8500 
8501 #else /* CONFIG_SCHED_EDGE */
8502 
/*
 * Non-edge implementation: single load average per pset, computed as a
 * simple decaying average — (previous + instantaneous) / 2 — where the
 * instantaneous load is running CPUs + runnable non-RT + RT threads,
 * scaled by PSET_LOAD_NUMERATOR_SHIFT.
 */
void
sched_update_pset_load_average(processor_set_t pset, __unused uint64_t curtime)
{
	int non_rt_load = pset->pset_runq.count;
	int load = ((bit_count(pset->cpu_state_map[PROCESSOR_RUNNING]) + non_rt_load + rt_runq_count(pset)) << PSET_LOAD_NUMERATOR_SHIFT);
	int new_load_average = ((int)pset->load_average + load) >> 1;

	pset->load_average = new_load_average;
#if (DEVELOPMENT || DEBUG)
#if __AMP__
	/* Trace P-cluster load on development kernels only */
	if (pset->pset_cluster_type == PSET_AMP_P) {
		KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PSET_LOAD_AVERAGE) | DBG_FUNC_NONE, sched_get_pset_load_average(pset, 0), (bit_count(pset->cpu_state_map[PROCESSOR_RUNNING]) + pset->pset_runq.count + rt_runq_count(pset)));
	}
#endif
#endif
}
8519 
/*
 * Average execution-time tracking is only maintained by the Edge
 * scheduler; in this configuration the interface is a no-op.
 */
void
sched_update_pset_avg_execution_time(__unused processor_set_t pset, __unused uint64_t execution_time, __unused uint64_t curtime, __unused sched_bucket_t sched_bucket)
{
}
8524 
8525 #endif /* CONFIG_SCHED_EDGE */
8526 
8527 /* pset is locked */
8528 static bool
processor_is_fast_track_candidate_for_realtime_thread(processor_set_t pset,processor_t processor)8529 processor_is_fast_track_candidate_for_realtime_thread(processor_set_t pset, processor_t processor)
8530 {
8531 	int cpuid = processor->cpu_id;
8532 #if defined(__x86_64__)
8533 	if (sched_avoid_cpu0 && (cpuid == 0)) {
8534 		return false;
8535 	}
8536 #endif
8537 
8538 	cpumap_t fasttrack_map = pset_available_cpumap(pset) & ~pset->pending_AST_URGENT_cpu_mask & ~pset->realtime_map;
8539 
8540 	return bit_test(fasttrack_map, cpuid);
8541 }
8542 
8543 /* pset is locked */
/*
 * Pick a CPU in this pset for a realtime thread.  Preference order:
 *   1. idle-for-RT primary CPUs (available, no urgent AST pending, not
 *      already running realtime),
 *   2. secondaries whose primary is running realtime,
 *   3. any eligible secondary,
 *   4. (no skip_processor only) some non-RT CPU so the thread at least
 *      enqueues on this pset, when there are more such CPUs than queued
 *      RT threads.
 * With sched_avoid_cpu0, maps are rotated so cpu0 (and cpu1 for
 * secondaries) sort last; sched_avoid_cpu0 == 2 excludes cpu0/cpu1
 * outright, retrying once without the exclusion if nothing was found.
 */
static processor_t
choose_processor_for_realtime_thread(processor_set_t pset, processor_t skip_processor, bool consider_secondaries, bool skip_spills)
{
#if defined(__x86_64__)
	bool avoid_cpu0 = sched_avoid_cpu0 && bit_test(pset->cpu_bitmask, 0);
#else
	const bool avoid_cpu0 = false;
#endif
	cpumap_t cpu_map;

try_again:
	/* Candidates: available, no urgent AST pending, not already running RT */
	cpu_map = pset_available_cpumap(pset) & ~pset->pending_AST_URGENT_cpu_mask & ~pset->realtime_map;
	if (skip_processor) {
		bit_clear(cpu_map, skip_processor->cpu_id);
	}
	if (skip_spills) {
		cpu_map &= ~pset->rt_pending_spill_cpu_mask;
	}

	if (avoid_cpu0 && (sched_avoid_cpu0 == 2)) {
		/* Strict mode: exclude cpu0 entirely (first pass) */
		bit_clear(cpu_map, 0);
	}

	cpumap_t primary_map = cpu_map & pset->primary_map;
	if (avoid_cpu0) {
		/* Rotate right by 1 so cpu0's bit lands highest and is chosen last */
		primary_map = bit_ror64(primary_map, 1);
	}

	int rotid = lsb_first(primary_map);
	if (rotid >= 0) {
		/* Undo the rotation to recover the real cpu id */
		int cpuid = avoid_cpu0 ? ((rotid + 1) & 63) : rotid;

		processor_t processor = processor_array[cpuid];

		return processor;
	}

	if (!pset->is_SMT || !sched_allow_rt_smt || !consider_secondaries) {
		goto out;
	}

	if (avoid_cpu0 && (sched_avoid_cpu0 == 2)) {
		/* Also avoid cpu1 */
		bit_clear(cpu_map, 1);
	}

	/* Consider secondary processors whose primary is actually running a realtime thread */
	cpumap_t secondary_map = cpu_map & ~pset->primary_map & (pset->realtime_map << 1);
	if (avoid_cpu0) {
		/* Also avoid cpu1 */
		secondary_map = bit_ror64(secondary_map, 2);
	}
	rotid = lsb_first(secondary_map);
	if (rotid >= 0) {
		int cpuid = avoid_cpu0 ?  ((rotid + 2) & 63) : rotid;

		processor_t processor = processor_array[cpuid];

		return processor;
	}

	/* Consider secondary processors */
	secondary_map = cpu_map & ~pset->primary_map;
	if (avoid_cpu0) {
		/* Also avoid cpu1 */
		secondary_map = bit_ror64(secondary_map, 2);
	}
	rotid = lsb_first(secondary_map);
	if (rotid >= 0) {
		int cpuid = avoid_cpu0 ?  ((rotid + 2) & 63) : rotid;

		processor_t processor = processor_array[cpuid];

		return processor;
	}

	/*
	 * I was hoping the compiler would optimize
	 * this away when avoid_cpu0 is const bool false
	 * but it still complains about the assignment
	 * in that case.
	 */
	if (avoid_cpu0 && (sched_avoid_cpu0 == 2)) {
#if defined(__x86_64__)
		/* Retry once with cpu0/cpu1 allowed again */
		avoid_cpu0 = false;
#else
		assert(0);
#endif
		goto try_again;
	}

out:
	if (skip_processor) {
		return PROCESSOR_NULL;
	}

	/*
	 * If we didn't find an obvious processor to choose, but there are still more CPUs
	 * not already running realtime threads than realtime threads in the realtime run queue,
	 * this thread belongs in this pset, so choose some other processor in this pset
	 * to ensure the thread is enqueued here.
	 */
	cpumap_t non_realtime_map = pset_available_cpumap(pset) & pset->primary_map & ~pset->realtime_map;
	if (bit_count(non_realtime_map) > rt_runq_count(pset)) {
		cpu_map = non_realtime_map;
		assert(cpu_map != 0);
		int cpuid = bit_first(cpu_map);
		assert(cpuid >= 0);
		return processor_array[cpuid];
	}

	if (!pset->is_SMT || !sched_allow_rt_smt || !consider_secondaries) {
		goto skip_secondaries;
	}

	/* Same fallback, but secondaries count too */
	non_realtime_map = pset_available_cpumap(pset) & ~pset->realtime_map;
	if (bit_count(non_realtime_map) > rt_runq_count(pset)) {
		cpu_map = non_realtime_map;
		assert(cpu_map != 0);
		int cpuid = bit_first(cpu_map);
		assert(cpuid >= 0);
		return processor_array[cpuid];
	}

skip_secondaries:
	return PROCESSOR_NULL;
}
8671 
8672 /*
8673  * Choose the processor with (1) the lowest priority less than max_pri and (2) the furthest deadline for that priority.
8674  * If all available processors are at max_pri, choose the furthest deadline that is greater than minimum_deadline.
8675  *
8676  * pset is locked.
8677  */
8678 static processor_t
choose_furthest_deadline_processor_for_realtime_thread(processor_set_t pset,int max_pri,uint64_t minimum_deadline,processor_t skip_processor,bool skip_spills,bool include_ast_urgent_pending_cpus)8679 choose_furthest_deadline_processor_for_realtime_thread(processor_set_t pset, int max_pri, uint64_t minimum_deadline, processor_t skip_processor, bool skip_spills, bool include_ast_urgent_pending_cpus)
8680 {
8681 	uint64_t  furthest_deadline = deadline_add(minimum_deadline, rt_deadline_epsilon);
8682 	processor_t fd_processor = PROCESSOR_NULL;
8683 	int lowest_priority = max_pri;
8684 
8685 	cpumap_t cpu_map = pset_available_cpumap(pset) & ~pset->pending_AST_URGENT_cpu_mask;
8686 	if (skip_processor) {
8687 		bit_clear(cpu_map, skip_processor->cpu_id);
8688 	}
8689 	if (skip_spills) {
8690 		cpu_map &= ~pset->rt_pending_spill_cpu_mask;
8691 	}
8692 
8693 	for (int cpuid = bit_first(cpu_map); cpuid >= 0; cpuid = bit_next(cpu_map, cpuid)) {
8694 		processor_t processor = processor_array[cpuid];
8695 
8696 		if (processor->current_pri > lowest_priority) {
8697 			continue;
8698 		}
8699 
8700 		if (processor->current_pri < lowest_priority) {
8701 			lowest_priority = processor->current_pri;
8702 			furthest_deadline = processor->deadline;
8703 			fd_processor = processor;
8704 			continue;
8705 		}
8706 
8707 		if (processor->deadline > furthest_deadline) {
8708 			furthest_deadline = processor->deadline;
8709 			fd_processor = processor;
8710 		}
8711 	}
8712 
8713 	if (fd_processor) {
8714 		return fd_processor;
8715 	}
8716 
8717 	/*
8718 	 * There is a race condition possible when there are multiple processor sets.
8719 	 * choose_processor() takes pset lock A, sees the pending_AST_URGENT_cpu_mask set for a processor in that set and finds no suitable candiate CPU,
8720 	 * so it drops pset lock A and tries to take pset lock B.  Meanwhile the pending_AST_URGENT_cpu_mask CPU is looking for a thread to run and holds
8721 	 * pset lock B. It doesn't find any threads (because the candidate thread isn't yet on any run queue), so drops lock B, takes lock A again to clear
8722 	 * the pending_AST_URGENT_cpu_mask bit, and keeps running the current (far deadline) thread. choose_processor() now has lock B and can only find
8723 	 * the lowest count processor in set B so enqueues it on set B's run queue but doesn't IPI anyone. (The lowest count includes all threads,
8724 	 * near and far deadlines, so will prefer a low count of earlier deadlines to a high count of far deadlines, which is suboptimal for EDF scheduling.
8725 	 * To make a better choice we would need to know how many threads with earlier deadlines than the candidate thread exist on each pset's run queue.
8726 	 * But even if we chose the better run queue, we still wouldn't send an IPI in this case.)
8727 	 *
8728 	 * The migitation is to also look for suitable CPUs that have their pending_AST_URGENT_cpu_mask bit set where there are no earlier deadline threads
8729 	 * on the run queue of that pset.
8730 	 */
8731 	if (include_ast_urgent_pending_cpus && (rt_runq_earliest_deadline(pset) > furthest_deadline)) {
8732 		cpu_map = pset_available_cpumap(pset) & pset->pending_AST_URGENT_cpu_mask;
8733 		assert(skip_processor == PROCESSOR_NULL);
8734 		assert(skip_spills == false);
8735 
8736 		for (int cpuid = bit_first(cpu_map); cpuid >= 0; cpuid = bit_next(cpu_map, cpuid)) {
8737 			processor_t processor = processor_array[cpuid];
8738 
8739 			if (processor->current_pri > lowest_priority) {
8740 				continue;
8741 			}
8742 
8743 			if (processor->current_pri < lowest_priority) {
8744 				lowest_priority = processor->current_pri;
8745 				furthest_deadline = processor->deadline;
8746 				fd_processor = processor;
8747 				continue;
8748 			}
8749 
8750 			if (processor->deadline > furthest_deadline) {
8751 				furthest_deadline = processor->deadline;
8752 				fd_processor = processor;
8753 			}
8754 		}
8755 	}
8756 
8757 	return fd_processor;
8758 }
8759 
8760 /* pset is locked */
8761 static processor_t
choose_next_processor_for_realtime_thread(processor_set_t pset,int max_pri,uint64_t minimum_deadline,processor_t skip_processor,bool consider_secondaries)8762 choose_next_processor_for_realtime_thread(processor_set_t pset, int max_pri, uint64_t minimum_deadline, processor_t skip_processor, bool consider_secondaries)
8763 {
8764 	bool skip_spills = true;
8765 	bool include_ast_urgent_pending_cpus = false;
8766 
8767 	processor_t next_processor = choose_processor_for_realtime_thread(pset, skip_processor, consider_secondaries, skip_spills);
8768 	if (next_processor != PROCESSOR_NULL) {
8769 		return next_processor;
8770 	}
8771 
8772 	next_processor = choose_furthest_deadline_processor_for_realtime_thread(pset, max_pri, minimum_deadline, skip_processor, skip_spills, include_ast_urgent_pending_cpus);
8773 	return next_processor;
8774 }
8775 
8776 #if defined(__x86_64__)
8777 /* pset is locked */
8778 static bool
all_available_primaries_are_running_realtime_threads(processor_set_t pset,bool include_backups)8779 all_available_primaries_are_running_realtime_threads(processor_set_t pset, bool include_backups)
8780 {
8781 	bool avoid_cpu0 = sched_avoid_cpu0 && bit_test(pset->cpu_bitmask, 0);
8782 	int nbackup_cpus = 0;
8783 
8784 	if (include_backups && rt_runq_is_low_latency(pset)) {
8785 		nbackup_cpus = sched_rt_n_backup_processors;
8786 	}
8787 
8788 	cpumap_t cpu_map = pset_available_cpumap(pset) & pset->primary_map & ~pset->realtime_map;
8789 	if (avoid_cpu0 && (sched_avoid_cpu0 == 2)) {
8790 		bit_clear(cpu_map, 0);
8791 	}
8792 	return (rt_runq_count(pset) + nbackup_cpus) > bit_count(cpu_map);
8793 }
8794 
8795 /* pset is locked */
8796 static bool
these_processors_are_running_realtime_threads(processor_set_t pset,uint64_t these_map,bool include_backups)8797 these_processors_are_running_realtime_threads(processor_set_t pset, uint64_t these_map, bool include_backups)
8798 {
8799 	int nbackup_cpus = 0;
8800 
8801 	if (include_backups && rt_runq_is_low_latency(pset)) {
8802 		nbackup_cpus = sched_rt_n_backup_processors;
8803 	}
8804 
8805 	cpumap_t cpu_map = pset_available_cpumap(pset) & these_map & ~pset->realtime_map;
8806 	return (rt_runq_count(pset) + nbackup_cpus) > bit_count(cpu_map);
8807 }
8808 #endif
8809 
/*
 * Decide whether this processor should pick up a realtime thread.
 * Always false for unrecommended CPUs.  On x86, a pending RT spill
 * overrides everything; otherwise cpu0/cpu1 (under sched_avoid_cpu0)
 * and SMT secondaries only run RT when the preferred CPUs are already
 * saturated with realtime work.  Non-x86 configs always allow it.
 */
static bool
sched_ok_to_run_realtime_thread(processor_set_t pset, processor_t processor, bool as_backup)
{
	if (!processor->is_recommended) {
		return false;
	}
	bool ok_to_run_realtime_thread = true;
#if defined(__x86_64__)
	bool spill_pending = bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id);
	if (spill_pending) {
		/* A spill was already targeted at this CPU; honor it */
		return true;
	}
	if (processor->cpu_id == 0) {
		if (sched_avoid_cpu0 == 1) {
			/* cpu0 runs RT only if all other primaries are busy with RT */
			ok_to_run_realtime_thread = these_processors_are_running_realtime_threads(pset, pset->primary_map & ~0x1, as_backup);
		} else if (sched_avoid_cpu0 == 2) {
			/* strict mode: only if every CPU other than cpu0/cpu1 is busy with RT */
			ok_to_run_realtime_thread = these_processors_are_running_realtime_threads(pset, ~0x3, as_backup);
		}
	} else if (sched_avoid_cpu0 && (processor->cpu_id == 1) && processor->is_SMT) {
		/* cpu1 (cpu0's sibling) runs RT only if everything but itself is busy with RT */
		ok_to_run_realtime_thread = sched_allow_rt_smt && these_processors_are_running_realtime_threads(pset, ~0x2, as_backup);
	} else if (processor->processor_primary != processor) {
		/* SMT secondary: only once all available primaries run RT */
		ok_to_run_realtime_thread = (sched_allow_rt_smt && all_available_primaries_are_running_realtime_threads(pset, as_backup));
	}
#else
	(void)pset;
	(void)processor;
	(void)as_backup;
#endif
	return ok_to_run_realtime_thread;
}
8840 
/*
 * Default hook invoked when a pset becomes schedulable.  This
 * implementation has no work to do beyond honoring the drop_lock
 * contract: the caller holds the pset lock and asks us to release it.
 */
void
sched_pset_made_schedulable(__unused processor_t processor, processor_set_t pset, boolean_t drop_lock)
{
	if (drop_lock) {
		pset_unlock(pset);
	}
}
8848 
8849 void
thread_set_no_smt(bool set)8850 thread_set_no_smt(bool set)
8851 {
8852 	if (!system_is_SMT) {
8853 		/* Not a machine that supports SMT */
8854 		return;
8855 	}
8856 
8857 	thread_t thread = current_thread();
8858 
8859 	spl_t s = splsched();
8860 	thread_lock(thread);
8861 	if (set) {
8862 		thread->sched_flags |= TH_SFLAG_NO_SMT;
8863 	}
8864 	thread_unlock(thread);
8865 	splx(s);
8866 }
8867 
8868 bool
thread_get_no_smt(void)8869 thread_get_no_smt(void)
8870 {
8871 	return current_thread()->sched_flags & TH_SFLAG_NO_SMT;
8872 }
8873 
8874 extern void task_set_no_smt(task_t);
8875 void
task_set_no_smt(task_t task)8876 task_set_no_smt(task_t task)
8877 {
8878 	if (!system_is_SMT) {
8879 		/* Not a machine that supports SMT */
8880 		return;
8881 	}
8882 
8883 	if (task == TASK_NULL) {
8884 		task = current_task();
8885 	}
8886 
8887 	task_lock(task);
8888 	task->t_flags |= TF_NO_SMT;
8889 	task_unlock(task);
8890 }
8891 
8892 #if DEBUG || DEVELOPMENT
8893 extern void sysctl_task_set_no_smt(char no_smt);
8894 void
sysctl_task_set_no_smt(char no_smt)8895 sysctl_task_set_no_smt(char no_smt)
8896 {
8897 	if (!system_is_SMT) {
8898 		/* Not a machine that supports SMT */
8899 		return;
8900 	}
8901 
8902 	task_t task = current_task();
8903 
8904 	task_lock(task);
8905 	if (no_smt == '1') {
8906 		task->t_flags |= TF_NO_SMT;
8907 	}
8908 	task_unlock(task);
8909 }
8910 
8911 extern char sysctl_task_get_no_smt(void);
8912 char
sysctl_task_get_no_smt(void)8913 sysctl_task_get_no_smt(void)
8914 {
8915 	task_t task = current_task();
8916 
8917 	if (task->t_flags & TF_NO_SMT) {
8918 		return '1';
8919 	}
8920 	return '0';
8921 }
8922 #endif /* DEVELOPMENT || DEBUG */
8923 
8924 
8925 __private_extern__ void
thread_bind_cluster_type(thread_t thread,char cluster_type,bool soft_bound)8926 thread_bind_cluster_type(thread_t thread, char cluster_type, bool soft_bound)
8927 {
8928 #if __AMP__
8929 	spl_t s = splsched();
8930 	thread_lock(thread);
8931 	thread->sched_flags &= ~(TH_SFLAG_BOUND_SOFT);
8932 	thread->th_bound_cluster_id = THREAD_BOUND_CLUSTER_NONE;
8933 	if (soft_bound) {
8934 		thread->sched_flags |= TH_SFLAG_BOUND_SOFT;
8935 	}
8936 	switch (cluster_type) {
8937 	case 'e':
8938 	case 'E':
8939 		if (pset0.pset_cluster_type == PSET_AMP_E) {
8940 			thread->th_bound_cluster_id = pset0.pset_id;
8941 		} else if (pset_node1.psets != PROCESSOR_SET_NULL) {
8942 			thread->th_bound_cluster_id = pset_node1.psets->pset_id;
8943 		}
8944 		break;
8945 	case 'p':
8946 	case 'P':
8947 		if (pset0.pset_cluster_type == PSET_AMP_P) {
8948 			thread->th_bound_cluster_id = pset0.pset_id;
8949 		} else if (pset_node1.psets != PROCESSOR_SET_NULL) {
8950 			thread->th_bound_cluster_id = pset_node1.psets->pset_id;
8951 		}
8952 		break;
8953 	default:
8954 		break;
8955 	}
8956 	thread_unlock(thread);
8957 	splx(s);
8958 
8959 	if (thread == current_thread()) {
8960 		thread_block(THREAD_CONTINUE_NULL);
8961 	}
8962 #else /* __AMP__ */
8963 	(void)thread;
8964 	(void)cluster_type;
8965 	(void)soft_bound;
8966 #endif /* __AMP__ */
8967 }
8968 
8969 extern uint32_t thread_bound_cluster_id(thread_t thread);
8970 uint32_t
thread_bound_cluster_id(thread_t thread)8971 thread_bound_cluster_id(thread_t thread)
8972 {
8973 	return thread->th_bound_cluster_id;
8974 }
8975 
8976 __private_extern__ kern_return_t
thread_bind_cluster_id(thread_t thread,uint32_t cluster_id,thread_bind_option_t options)8977 thread_bind_cluster_id(thread_t thread, uint32_t cluster_id, thread_bind_option_t options)
8978 {
8979 #if __AMP__
8980 
8981 	processor_set_t pset = NULL;
8982 
8983 	/* Treat binding to THREAD_BOUND_CLUSTER_NONE as a request to unbind. */
8984 	if ((options & THREAD_UNBIND) || cluster_id == THREAD_BOUND_CLUSTER_NONE) {
8985 		/* If the thread was actually not bound to some cluster, nothing to do here */
8986 		if (thread_bound_cluster_id(thread) == THREAD_BOUND_CLUSTER_NONE) {
8987 			return KERN_SUCCESS;
8988 		}
8989 	} else {
8990 		/* Validate the inputs for the bind case */
8991 		int max_clusters = ml_get_cluster_count();
8992 		if (cluster_id >= max_clusters) {
8993 			/* Invalid cluster id */
8994 			return KERN_INVALID_VALUE;
8995 		}
8996 		pset = pset_array[cluster_id];
8997 		if (pset == NULL) {
8998 			/* Cluster has not been initialized yet */
8999 			return KERN_INVALID_VALUE;
9000 		}
9001 		if (options & THREAD_BIND_ELIGIBLE_ONLY) {
9002 			if (SCHED(thread_eligible_for_pset(thread, pset)) == false) {
9003 				/* Thread is not recommended for the cluster type */
9004 				return KERN_INVALID_POLICY;
9005 			}
9006 		}
9007 	}
9008 
9009 	spl_t s = splsched();
9010 	thread_lock(thread);
9011 
9012 	/* Unbind the thread from its previous bound state */
9013 	thread->sched_flags &= ~(TH_SFLAG_BOUND_SOFT);
9014 	thread->th_bound_cluster_id = THREAD_BOUND_CLUSTER_NONE;
9015 
9016 	if (options & THREAD_UNBIND) {
9017 		/* Nothing more to do here */
9018 		goto thread_bind_cluster_complete;
9019 	}
9020 
9021 	if (options & THREAD_BIND_SOFT) {
9022 		thread->sched_flags |= TH_SFLAG_BOUND_SOFT;
9023 	}
9024 	thread->th_bound_cluster_id = cluster_id;
9025 
9026 thread_bind_cluster_complete:
9027 	thread_unlock(thread);
9028 	splx(s);
9029 
9030 	if (thread == current_thread()) {
9031 		thread_block(THREAD_CONTINUE_NULL);
9032 	}
9033 #else /* __AMP__ */
9034 	(void)thread;
9035 	(void)cluster_id;
9036 	(void)options;
9037 #endif /* __AMP__ */
9038 	return KERN_SUCCESS;
9039 }
9040 
9041 #if DEVELOPMENT || DEBUG
9042 extern int32_t sysctl_get_bound_cpuid(void);
9043 int32_t
sysctl_get_bound_cpuid(void)9044 sysctl_get_bound_cpuid(void)
9045 {
9046 	int32_t cpuid = -1;
9047 	thread_t self = current_thread();
9048 
9049 	processor_t processor = self->bound_processor;
9050 	if (processor == NULL) {
9051 		cpuid = -1;
9052 	} else {
9053 		cpuid = processor->cpu_id;
9054 	}
9055 
9056 	return cpuid;
9057 }
9058 
9059 extern kern_return_t sysctl_thread_bind_cpuid(int32_t cpuid);
9060 kern_return_t
sysctl_thread_bind_cpuid(int32_t cpuid)9061 sysctl_thread_bind_cpuid(int32_t cpuid)
9062 {
9063 	processor_t processor = PROCESSOR_NULL;
9064 
9065 	if (cpuid == -1) {
9066 		goto unbind;
9067 	}
9068 
9069 	if (cpuid < 0 || cpuid >= MAX_SCHED_CPUS) {
9070 		return KERN_INVALID_VALUE;
9071 	}
9072 
9073 	processor = processor_array[cpuid];
9074 	if (processor == PROCESSOR_NULL) {
9075 		return KERN_INVALID_VALUE;
9076 	}
9077 
9078 #if __AMP__
9079 
9080 	thread_t thread = current_thread();
9081 
9082 	if (thread->th_bound_cluster_id != THREAD_BOUND_CLUSTER_NONE) {
9083 		if ((thread->sched_flags & TH_SFLAG_BOUND_SOFT) == 0) {
9084 			/* Cannot hard-bind an already hard-cluster-bound thread */
9085 			return KERN_NOT_SUPPORTED;
9086 		}
9087 	}
9088 
9089 #endif /* __AMP__ */
9090 
9091 unbind:
9092 	thread_bind(processor);
9093 
9094 	thread_block(THREAD_CONTINUE_NULL);
9095 	return KERN_SUCCESS;
9096 }
9097 
9098 extern char sysctl_get_task_cluster_type(void);
9099 char
sysctl_get_task_cluster_type(void)9100 sysctl_get_task_cluster_type(void)
9101 {
9102 	task_t task = current_task();
9103 	processor_set_t pset_hint = task->pset_hint;
9104 
9105 	if (!pset_hint) {
9106 		return '0';
9107 	}
9108 
9109 #if __AMP__
9110 	if (pset_hint->pset_cluster_type == PSET_AMP_E) {
9111 		return 'E';
9112 	} else if (pset_hint->pset_cluster_type == PSET_AMP_P) {
9113 		return 'P';
9114 	}
9115 #endif
9116 
9117 	return '0';
9118 }
9119 
9120 #if __AMP__
9121 static processor_set_t
find_pset_of_type(pset_cluster_type_t t)9122 find_pset_of_type(pset_cluster_type_t t)
9123 {
9124 	for (pset_node_t node = &pset_node0; node != NULL; node = node->node_list) {
9125 		if (node->pset_cluster_type != t) {
9126 			continue;
9127 		}
9128 
9129 		processor_set_t pset = PROCESSOR_SET_NULL;
9130 		for (int pset_id = lsb_first(node->pset_map); pset_id >= 0; pset_id = lsb_next(node->pset_map, pset_id)) {
9131 			pset = pset_array[pset_id];
9132 			/* Prefer one with recommended processsors */
9133 			if (pset->recommended_bitmask != 0) {
9134 				assert(pset->pset_cluster_type == t);
9135 				return pset;
9136 			}
9137 		}
9138 		/* Otherwise return whatever was found last */
9139 		return pset;
9140 	}
9141 
9142 	return PROCESSOR_SET_NULL;
9143 }
9144 #endif
9145 
9146 extern void sysctl_task_set_cluster_type(char cluster_type);
9147 void
sysctl_task_set_cluster_type(char cluster_type)9148 sysctl_task_set_cluster_type(char cluster_type)
9149 {
9150 	task_t task = current_task();
9151 	processor_set_t pset_hint = PROCESSOR_SET_NULL;
9152 
9153 #if __AMP__
9154 	switch (cluster_type) {
9155 	case 'e':
9156 	case 'E':
9157 		pset_hint = find_pset_of_type(PSET_AMP_E);
9158 		break;
9159 	case 'p':
9160 	case 'P':
9161 		pset_hint = find_pset_of_type(PSET_AMP_P);
9162 		break;
9163 	default:
9164 		break;
9165 	}
9166 
9167 	if (pset_hint) {
9168 		task_lock(task);
9169 		task->t_flags |= TF_USE_PSET_HINT_CLUSTER_TYPE;
9170 		task->pset_hint = pset_hint;
9171 		task_unlock(task);
9172 
9173 		thread_block(THREAD_CONTINUE_NULL);
9174 	}
9175 #else
9176 	(void)cluster_type;
9177 	(void)task;
9178 	(void)pset_hint;
9179 #endif
9180 }
9181 
9182 /*
9183  * The quantum length used for Fixed and RT sched modes. In general the quantum
9184  * can vary - for example for background or QOS.
9185  */
9186 extern uint64_t sysctl_get_quantum_us(void);
9187 uint64_t
sysctl_get_quantum_us(void)9188 sysctl_get_quantum_us(void)
9189 {
9190 	uint32_t quantum;
9191 	uint64_t quantum_ns;
9192 
9193 	quantum = SCHED(initial_quantum_size)(THREAD_NULL);
9194 	absolutetime_to_nanoseconds(quantum, &quantum_ns);
9195 
9196 	return quantum_ns / 1000;
9197 }
9198 
9199 #endif /* DEVELOPMENT || DEBUG */
9200