xref: /xnu-10063.121.3/osfmk/kern/sched_prim.c (revision 2c2f96dc2b9a4408a43d3150ae9c105355ca3daa)
1 /*
2  * Copyright (c) 2000-2016 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * @OSF_FREE_COPYRIGHT@
30  */
31 /*
32  * Mach Operating System
33  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34  * All Rights Reserved.
35  *
36  * Permission to use, copy, modify and distribute this software and its
37  * documentation is hereby granted, provided that both the copyright
38  * notice and this permission notice appear in all copies of the
39  * software, derivative works or modified versions, and any portions
40  * thereof, and that both notices appear in supporting documentation.
41  *
42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45  *
46  * Carnegie Mellon requests users of this software to return to
47  *
48  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
49  *  School of Computer Science
50  *  Carnegie Mellon University
51  *  Pittsburgh PA 15213-3890
52  *
53  * any improvements or extensions that they make and grant Carnegie Mellon
54  * the rights to redistribute these changes.
55  */
56 /*
57  */
58 /*
59  *	File:	sched_prim.c
60  *	Author:	Avadis Tevanian, Jr.
61  *	Date:	1986
62  *
63  *	Scheduling primitives
64  *
65  */
66 
67 #include <debug.h>
68 
69 #include <mach/mach_types.h>
70 #include <mach/machine.h>
71 #include <mach/policy.h>
72 #include <mach/sync_policy.h>
73 #include <mach/thread_act.h>
74 
75 #include <machine/machine_routines.h>
76 #include <machine/sched_param.h>
77 #include <machine/machine_cpu.h>
78 #include <machine/limits.h>
79 #include <machine/atomic.h>
80 
81 #include <machine/commpage.h>
82 
83 #include <kern/kern_types.h>
84 #include <kern/backtrace.h>
85 #include <kern/clock.h>
86 #include <kern/cpu_number.h>
87 #include <kern/cpu_data.h>
88 #include <kern/smp.h>
89 #include <kern/debug.h>
90 #include <kern/macro_help.h>
91 #include <kern/machine.h>
92 #include <kern/misc_protos.h>
93 #include <kern/monotonic.h>
94 #include <kern/processor.h>
95 #include <kern/queue.h>
96 #include <kern/recount.h>
97 #include <kern/restartable.h>
98 #include <kern/sched.h>
99 #include <kern/sched_prim.h>
100 #include <kern/sfi.h>
101 #include <kern/syscall_subr.h>
102 #include <kern/task.h>
103 #include <kern/thread.h>
104 #include <kern/thread_group.h>
105 #include <kern/ledger.h>
106 #include <kern/timer_queue.h>
107 #include <kern/waitq.h>
108 #include <kern/policy_internal.h>
109 
110 #include <vm/pmap.h>
111 #include <vm/vm_kern.h>
112 #include <vm/vm_map.h>
113 #include <vm/vm_pageout.h>
114 
115 #include <mach/sdt.h>
116 #include <mach/mach_host.h>
117 #include <mach/host_info.h>
118 
119 #include <sys/kdebug.h>
120 #include <kperf/kperf.h>
121 #include <kern/kpc.h>
122 #include <san/kasan.h>
123 #include <kern/pms.h>
124 #include <kern/host.h>
125 #include <stdatomic.h>
126 #include <os/atomic_private.h>
127 
128 #ifdef KDBG_MACOS_RELEASE
129 #define KTRC KDBG_MACOS_RELEASE
130 #else
131 #define KTRC KDBG_RELEASE
132 #endif
133 
134 struct sched_statistics PERCPU_DATA(sched_stats);
135 bool sched_stats_active;
136 
/*
 * Saturating add for deadline arithmetic: yields d + e, or UINT64_MAX
 * when os_add_overflow() reports that the sum does not fit in 64 bits.
 */
static uint64_t
deadline_add(uint64_t d, uint64_t e)
{
	uint64_t total;

	if (os_add_overflow(d, e, &total)) {
		/* pin to the largest representable value instead of wrapping */
		return UINT64_MAX;
	}

	return total;
}
143 
144 int
rt_runq_count(processor_set_t pset)145 rt_runq_count(processor_set_t pset)
146 {
147 	return os_atomic_load(&SCHED(rt_runq)(pset)->count, relaxed);
148 }
149 
150 uint64_t
rt_runq_earliest_deadline(processor_set_t pset)151 rt_runq_earliest_deadline(processor_set_t pset)
152 {
153 	return os_atomic_load_wide(&SCHED(rt_runq)(pset)->earliest_deadline, relaxed);
154 }
155 
156 static int
rt_runq_priority(processor_set_t pset)157 rt_runq_priority(processor_set_t pset)
158 {
159 	pset_assert_locked(pset);
160 	rt_queue_t rt_run_queue = SCHED(rt_runq)(pset);
161 
162 	bitmap_t *map = rt_run_queue->bitmap;
163 	int i = bitmap_first(map, NRTQS);
164 	assert(i < NRTQS);
165 
166 	if (i >= 0) {
167 		return i + BASEPRI_RTQUEUES;
168 	}
169 
170 	return i;
171 }
172 
173 static thread_t rt_runq_first(rt_queue_t rt_runq);
174 
175 #if DEBUG
176 static void
check_rt_runq_consistency(rt_queue_t rt_run_queue,thread_t thread)177 check_rt_runq_consistency(rt_queue_t rt_run_queue, thread_t thread)
178 {
179 	bitmap_t *map = rt_run_queue->bitmap;
180 
181 	uint64_t earliest_deadline = RT_DEADLINE_NONE;
182 	uint32_t constraint = RT_CONSTRAINT_NONE;
183 	int ed_index = NOPRI;
184 	int count = 0;
185 	bool found_thread = false;
186 
187 	for (int pri = BASEPRI_RTQUEUES; pri <= MAXPRI; pri++) {
188 		int i = pri - BASEPRI_RTQUEUES;
189 		rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i];
190 		queue_t queue = &rt_runq->pri_queue;
191 		queue_entry_t iter;
192 		int n = 0;
193 		uint64_t previous_deadline = 0;
194 		qe_foreach(iter, queue) {
195 			thread_t iter_thread = qe_element(iter, struct thread, runq_links);
196 			assert_thread_magic(iter_thread);
197 			if (iter_thread == thread) {
198 				found_thread = true;
199 			}
200 			assert(iter_thread->sched_pri == (i + BASEPRI_RTQUEUES));
201 			assert(iter_thread->realtime.deadline < RT_DEADLINE_NONE);
202 			assert(iter_thread->realtime.constraint < RT_CONSTRAINT_NONE);
203 			assert(previous_deadline <= iter_thread->realtime.deadline);
204 			n++;
205 			if (iter == queue_first(queue)) {
206 				assert(rt_runq->pri_earliest_deadline == iter_thread->realtime.deadline);
207 				assert(rt_runq->pri_constraint == iter_thread->realtime.constraint);
208 			}
209 			previous_deadline = iter_thread->realtime.deadline;
210 		}
211 		assert(n == rt_runq->pri_count);
212 		if (n == 0) {
213 			assert(bitmap_test(map, i) == false);
214 			assert(rt_runq->pri_earliest_deadline == RT_DEADLINE_NONE);
215 			assert(rt_runq->pri_constraint == RT_CONSTRAINT_NONE);
216 		} else {
217 			assert(bitmap_test(map, i) == true);
218 		}
219 		if (rt_runq->pri_earliest_deadline < earliest_deadline) {
220 			earliest_deadline = rt_runq->pri_earliest_deadline;
221 			constraint = rt_runq->pri_constraint;
222 			ed_index = i;
223 		}
224 		count += n;
225 	}
226 	assert(os_atomic_load_wide(&rt_run_queue->earliest_deadline, relaxed) == earliest_deadline);
227 	assert(os_atomic_load(&rt_run_queue->count, relaxed) == count);
228 	assert(os_atomic_load(&rt_run_queue->constraint, relaxed) == constraint);
229 	assert(os_atomic_load(&rt_run_queue->ed_index, relaxed) == ed_index);
230 	if (thread) {
231 		assert(found_thread);
232 	}
233 }
234 #define CHECK_RT_RUNQ_CONSISTENCY(q, th)    check_rt_runq_consistency(q, th)
235 #else
236 #define CHECK_RT_RUNQ_CONSISTENCY(q, th)    do {} while (0)
237 #endif
238 
239 uint32_t rt_constraint_threshold;
240 
241 static bool
rt_runq_is_low_latency(processor_set_t pset)242 rt_runq_is_low_latency(processor_set_t pset)
243 {
244 	return os_atomic_load(&SCHED(rt_runq)(pset)->constraint, relaxed) <= rt_constraint_threshold;
245 }
246 
247 TUNABLE(bool, cpulimit_affects_quantum, "cpulimit_affects_quantum", true);
248 
249 /* TODO: enable this, to 50us (less than the deferred IPI latency, to beat a spill) */
250 TUNABLE(uint32_t, nonurgent_preemption_timer_us, "nonurgent_preemption_timer", 0); /* microseconds */
251 static uint64_t nonurgent_preemption_timer_abs = 0;
252 
253 #define         DEFAULT_PREEMPTION_RATE         100             /* (1/s) */
254 TUNABLE(int, default_preemption_rate, "preempt", DEFAULT_PREEMPTION_RATE);
255 
256 #define         DEFAULT_BG_PREEMPTION_RATE      400             /* (1/s) */
257 TUNABLE(int, default_bg_preemption_rate, "bg_preempt", DEFAULT_BG_PREEMPTION_RATE);
258 
259 #define         MAX_UNSAFE_RT_QUANTA               100
260 #define         SAFE_RT_MULTIPLIER                 2
261 
262 #define         MAX_UNSAFE_FIXED_QUANTA               100
263 #define         SAFE_FIXED_MULTIPLIER                 2
264 
265 TUNABLE_DEV_WRITEABLE(int, max_unsafe_rt_quanta, "max_unsafe_rt_quanta", MAX_UNSAFE_RT_QUANTA);
266 TUNABLE_DEV_WRITEABLE(int, max_unsafe_fixed_quanta, "max_unsafe_fixed_quanta", MAX_UNSAFE_FIXED_QUANTA);
267 
268 TUNABLE_DEV_WRITEABLE(int, safe_rt_multiplier, "safe_rt_multiplier", SAFE_RT_MULTIPLIER);
269 TUNABLE_DEV_WRITEABLE(int, safe_fixed_multiplier, "safe_fixed_multiplier", SAFE_RT_MULTIPLIER);
270 
271 #define         MAX_POLL_QUANTA                 2
272 TUNABLE(int, max_poll_quanta, "poll", MAX_POLL_QUANTA);
273 
274 #define         SCHED_POLL_YIELD_SHIFT          4               /* 1/16 */
275 int             sched_poll_yield_shift = SCHED_POLL_YIELD_SHIFT;
276 
277 uint64_t        max_poll_computation;
278 
279 uint64_t        max_unsafe_rt_computation;
280 uint64_t        max_unsafe_fixed_computation;
281 uint64_t        sched_safe_rt_duration;
282 uint64_t        sched_safe_fixed_duration;
283 
284 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
285 
286 uint32_t        std_quantum;
287 uint32_t        min_std_quantum;
288 uint32_t        bg_quantum;
289 
290 uint32_t        std_quantum_us;
291 uint32_t        bg_quantum_us;
292 
293 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
294 
295 uint32_t        thread_depress_time;
296 uint32_t        default_timeshare_computation;
297 uint32_t        default_timeshare_constraint;
298 
299 uint32_t        max_rt_quantum;
300 uint32_t        min_rt_quantum;
301 
302 uint32_t        rt_deadline_epsilon;
303 
304 uint32_t        rt_constraint_threshold;
305 
306 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
307 
308 unsigned                sched_tick;
309 uint32_t                sched_tick_interval;
310 
311 /* Timeshare load calculation interval (15ms) */
312 uint32_t                sched_load_compute_interval_us = 15000;
313 uint64_t                sched_load_compute_interval_abs;
314 static _Atomic uint64_t sched_load_compute_deadline;
315 
316 uint32_t        sched_pri_shifts[TH_BUCKET_MAX];
317 uint32_t        sched_fixed_shift;
318 
319 uint32_t        sched_decay_usage_age_factor = 1; /* accelerate 5/8^n usage aging */
320 
321 /* Allow foreground to decay past default to resolve inversions */
322 #define DEFAULT_DECAY_BAND_LIMIT ((BASEPRI_FOREGROUND - BASEPRI_DEFAULT) + 2)
323 int             sched_pri_decay_band_limit = DEFAULT_DECAY_BAND_LIMIT;
324 
325 /* Defaults for timer deadline profiling */
326 #define TIMER_DEADLINE_TRACKING_BIN_1_DEFAULT 2000000 /* Timers with deadlines <=
327 	                                               * 2ms */
328 #define TIMER_DEADLINE_TRACKING_BIN_2_DEFAULT 5000000 /* Timers with deadlines
329 	                                               *   <= 5ms */
330 
331 uint64_t timer_deadline_tracking_bin_1;
332 uint64_t timer_deadline_tracking_bin_2;
333 
334 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
335 
336 thread_t sched_maintenance_thread;
337 
338 /* interrupts disabled lock to guard recommended cores state */
339 decl_simple_lock_data(, sched_available_cores_lock);
340 uint64_t        perfcontrol_requested_recommended_cores = ALL_CORES_RECOMMENDED;
341 uint64_t        perfcontrol_system_requested_recommended_cores = ALL_CORES_RECOMMENDED;
342 uint64_t        perfcontrol_user_requested_recommended_cores = ALL_CORES_RECOMMENDED;
343 static uint64_t usercontrol_requested_recommended_cores = ALL_CORES_RECOMMENDED;
344 static uint64_t sched_online_processors = 0;
345 static void sched_update_recommended_cores(uint64_t recommended_cores, processor_reason_t reason, uint32_t flags);
346 static void sched_update_powered_cores(uint64_t reqested_powered_cores, processor_reason_t reason, uint32_t flags);
347 
348 #if __arm64__
349 static void sched_recommended_cores_maintenance(void);
350 uint64_t    perfcontrol_failsafe_starvation_threshold;
351 extern char *proc_name_address(struct proc *p);
352 #endif /* __arm64__ */
353 
354 uint64_t        sched_one_second_interval;
355 boolean_t       allow_direct_handoff = TRUE;
356 
357 /* Forwards */
358 
359 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
360 
361 static void load_shift_init(void);
362 static void preempt_pri_init(void);
363 
364 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
365 
366 thread_t        processor_idle(
367 	thread_t                        thread,
368 	processor_t                     processor);
369 
370 static ast_t
371 csw_check_locked(
372 	thread_t        thread,
373 	processor_t     processor,
374 	processor_set_t pset,
375 	ast_t           check_reason);
376 
377 static void processor_setrun(
378 	processor_t                    processor,
379 	thread_t                       thread,
380 	integer_t                      options);
381 
382 static void
383 sched_realtime_timebase_init(void);
384 
385 static void
386 sched_timer_deadline_tracking_init(void);
387 
/*
 * TLOG(a, fmt, ...): on DEBUG kernels, kprintf the message when any bit of
 * `a` is set in debug_task; compiles to an empty statement otherwise.
 *
 * Both variants expand to a single do { } while (0) statement so the macro
 * is safe inside unbraced if/else bodies, and `a` is parenthesized so that
 * compound mask expressions are evaluated as a unit.
 */
#if     DEBUG
extern int debug_task;
#define TLOG(a, fmt, args...) do { if (debug_task & (a)) { kprintf(fmt, ## args); } } while (0)
#else
#define TLOG(a, fmt, args...) do {} while (0)
#endif
394 
395 static processor_t
396 thread_bind_internal(
397 	thread_t                thread,
398 	processor_t             processor);
399 
400 static void
401 sched_vm_group_maintenance(void);
402 
403 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
404 int8_t          sched_load_shifts[NRQS];
405 bitmap_t        sched_preempt_pri[BITMAP_LEN(NRQS_MAX)];
406 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
407 
408 /*
409  * Statically allocate a buffer to hold the longest possible
410  * scheduler description string, as currently implemented.
411  * bsd/kern/kern_sysctl.c has a corresponding definition in bsd/
412  * to export to userspace via sysctl(3). If either version
413  * changes, update the other.
414  *
415  * Note that in addition to being an upper bound on the strings
416  * in the kernel, it's also an exact parameter to PE_get_default(),
417  * which interrogates the device tree on some platforms. That
418  * API requires the caller know the exact size of the device tree
419  * property, so we need both a legacy size (32) and the current size
420  * (48) to deal with old and new device trees. The device tree property
421  * is similarly padded to a fixed size so that the same kernel image
422  * can run on multiple devices with different schedulers configured
423  * in the device tree.
424  */
425 char sched_string[SCHED_STRING_MAX_LENGTH];
426 
427 uint32_t sched_debug_flags = SCHED_DEBUG_FLAG_CHOOSE_PROCESSOR_TRACEPOINTS;
428 
429 /* Global flag which indicates whether Background Stepper Context is enabled */
430 static int cpu_throttle_enabled = 1;
431 
432 #if DEVELOPMENT || DEBUG
433 int enable_task_set_cluster_type = 0;
434 bool system_ecore_only = false;
435 #endif /* DEVELOPMENT || DEBUG */
436 
437 void
sched_init(void)438 sched_init(void)
439 {
440 	boolean_t direct_handoff = FALSE;
441 	kprintf("Scheduler: Default of %s\n", SCHED(sched_name));
442 
443 	if (!PE_parse_boot_argn("sched_pri_decay_limit", &sched_pri_decay_band_limit, sizeof(sched_pri_decay_band_limit))) {
444 		/* No boot-args, check in device tree */
445 		if (!PE_get_default("kern.sched_pri_decay_limit",
446 		    &sched_pri_decay_band_limit,
447 		    sizeof(sched_pri_decay_band_limit))) {
448 			/* Allow decay all the way to normal limits */
449 			sched_pri_decay_band_limit = DEFAULT_DECAY_BAND_LIMIT;
450 		}
451 	}
452 
453 	kprintf("Setting scheduler priority decay band limit %d\n", sched_pri_decay_band_limit);
454 
455 	if (PE_parse_boot_argn("sched_debug", &sched_debug_flags, sizeof(sched_debug_flags))) {
456 		kprintf("Scheduler: Debug flags 0x%08x\n", sched_debug_flags);
457 	}
458 	strlcpy(sched_string, SCHED(sched_name), sizeof(sched_string));
459 
460 #if __arm64__
461 	clock_interval_to_absolutetime_interval(expecting_ipi_wfe_timeout_usec, NSEC_PER_USEC, &expecting_ipi_wfe_timeout_mt);
462 #endif /* __arm64__ */
463 
464 	SCHED(init)();
465 	SCHED(rt_init)(&pset0);
466 	sched_timer_deadline_tracking_init();
467 
468 	SCHED(pset_init)(&pset0);
469 	SCHED(processor_init)(master_processor);
470 
471 	if (PE_parse_boot_argn("direct_handoff", &direct_handoff, sizeof(direct_handoff))) {
472 		allow_direct_handoff = direct_handoff;
473 	}
474 
475 #if DEVELOPMENT || DEBUG
476 	if (PE_parse_boot_argn("enable_skstsct", &enable_task_set_cluster_type, sizeof(enable_task_set_cluster_type))) {
477 		system_ecore_only = (enable_task_set_cluster_type == 2);
478 	}
479 #endif /* DEVELOPMENT || DEBUG */
480 
481 	simple_lock_init(&sched_available_cores_lock, 0);
482 }
483 
484 void
sched_timebase_init(void)485 sched_timebase_init(void)
486 {
487 	uint64_t        abstime;
488 
489 	clock_interval_to_absolutetime_interval(1, NSEC_PER_SEC, &abstime);
490 	sched_one_second_interval = abstime;
491 
492 	SCHED(timebase_init)();
493 	sched_realtime_timebase_init();
494 }
495 
496 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
497 
498 void
sched_timeshare_init(void)499 sched_timeshare_init(void)
500 {
501 	/*
502 	 * Calculate the timeslicing quantum
503 	 * in us.
504 	 */
505 	if (default_preemption_rate < 1) {
506 		default_preemption_rate = DEFAULT_PREEMPTION_RATE;
507 	}
508 	std_quantum_us = (1000 * 1000) / default_preemption_rate;
509 
510 	printf("standard timeslicing quantum is %d us\n", std_quantum_us);
511 
512 	if (default_bg_preemption_rate < 1) {
513 		default_bg_preemption_rate = DEFAULT_BG_PREEMPTION_RATE;
514 	}
515 	bg_quantum_us = (1000 * 1000) / default_bg_preemption_rate;
516 
517 	printf("standard background quantum is %d us\n", bg_quantum_us);
518 
519 	load_shift_init();
520 	preempt_pri_init();
521 	sched_tick = 0;
522 }
523 
524 void
sched_set_max_unsafe_rt_quanta(int max)525 sched_set_max_unsafe_rt_quanta(int max)
526 {
527 	const uint32_t quantum_size = SCHED(initial_quantum_size)(THREAD_NULL);
528 
529 	max_unsafe_rt_computation = ((uint64_t)max) * quantum_size;
530 
531 	const int mult = safe_rt_multiplier <= 0 ? 2 : safe_rt_multiplier;
532 	sched_safe_rt_duration = mult * ((uint64_t)max) * quantum_size;
533 
534 
535 #if DEVELOPMENT || DEBUG
536 	max_unsafe_rt_quanta = max;
537 #else
538 	/*
539 	 * On RELEASE kernels, this is only called on boot where
540 	 * max is already equal to max_unsafe_rt_quanta.
541 	 */
542 	assert3s(max, ==, max_unsafe_rt_quanta);
543 #endif
544 }
545 
546 void
sched_set_max_unsafe_fixed_quanta(int max)547 sched_set_max_unsafe_fixed_quanta(int max)
548 {
549 	const uint32_t quantum_size = SCHED(initial_quantum_size)(THREAD_NULL);
550 
551 	max_unsafe_fixed_computation = ((uint64_t)max) * quantum_size;
552 
553 	const int mult = safe_fixed_multiplier <= 0 ? 2 : safe_fixed_multiplier;
554 	sched_safe_fixed_duration = mult * ((uint64_t)max) * quantum_size;
555 
556 #if DEVELOPMENT || DEBUG
557 	max_unsafe_fixed_quanta = max;
558 #else
559 	/*
560 	 * On RELEASE kernels, this is only called on boot where
561 	 * max is already equal to max_unsafe_fixed_quanta.
562 	 */
563 	assert3s(max, ==, max_unsafe_fixed_quanta);
564 #endif
565 }
566 
567 void
sched_timeshare_timebase_init(void)568 sched_timeshare_timebase_init(void)
569 {
570 	uint64_t        abstime;
571 	uint32_t        shift;
572 
573 	/* standard timeslicing quantum */
574 	clock_interval_to_absolutetime_interval(
575 		std_quantum_us, NSEC_PER_USEC, &abstime);
576 	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
577 	std_quantum = (uint32_t)abstime;
578 
579 	/* smallest remaining quantum (250 us) */
580 	clock_interval_to_absolutetime_interval(250, NSEC_PER_USEC, &abstime);
581 	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
582 	min_std_quantum = (uint32_t)abstime;
583 
584 	/* quantum for background tasks */
585 	clock_interval_to_absolutetime_interval(
586 		bg_quantum_us, NSEC_PER_USEC, &abstime);
587 	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
588 	bg_quantum = (uint32_t)abstime;
589 
590 	/* scheduler tick interval */
591 	clock_interval_to_absolutetime_interval(USEC_PER_SEC >> SCHED_TICK_SHIFT,
592 	    NSEC_PER_USEC, &abstime);
593 	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
594 	sched_tick_interval = (uint32_t)abstime;
595 
596 	/* timeshare load calculation interval & deadline initialization */
597 	clock_interval_to_absolutetime_interval(sched_load_compute_interval_us, NSEC_PER_USEC, &sched_load_compute_interval_abs);
598 	os_atomic_init(&sched_load_compute_deadline, sched_load_compute_interval_abs);
599 
600 	/*
601 	 * Compute conversion factor from usage to
602 	 * timesharing priorities with 5/8 ** n aging.
603 	 */
604 	abstime = (abstime * 5) / 3;
605 	for (shift = 0; abstime > BASEPRI_DEFAULT; ++shift) {
606 		abstime >>= 1;
607 	}
608 	sched_fixed_shift = shift;
609 
610 	for (uint32_t i = 0; i < TH_BUCKET_MAX; i++) {
611 		sched_pri_shifts[i] = INT8_MAX;
612 	}
613 
614 	sched_set_max_unsafe_rt_quanta(max_unsafe_rt_quanta);
615 	sched_set_max_unsafe_fixed_quanta(max_unsafe_fixed_quanta);
616 
617 	max_poll_computation = ((uint64_t)max_poll_quanta) * std_quantum;
618 	thread_depress_time = 1 * std_quantum;
619 	default_timeshare_computation = std_quantum / 2;
620 	default_timeshare_constraint = std_quantum;
621 
622 #if __arm64__
623 	perfcontrol_failsafe_starvation_threshold = (2 * sched_tick_interval);
624 #endif /* __arm64__ */
625 
626 	if (nonurgent_preemption_timer_us) {
627 		clock_interval_to_absolutetime_interval(nonurgent_preemption_timer_us, NSEC_PER_USEC, &abstime);
628 		nonurgent_preemption_timer_abs = abstime;
629 	}
630 }
631 
632 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
633 
634 void
pset_rt_init(processor_set_t pset)635 pset_rt_init(processor_set_t pset)
636 {
637 	for (int pri = BASEPRI_RTQUEUES; pri <= MAXPRI; pri++) {
638 		int i = pri - BASEPRI_RTQUEUES;
639 		rt_queue_pri_t *rqi = &pset->rt_runq.rt_queue_pri[i];
640 		queue_init(&rqi->pri_queue);
641 		rqi->pri_count = 0;
642 		rqi->pri_earliest_deadline = RT_DEADLINE_NONE;
643 		rqi->pri_constraint = RT_CONSTRAINT_NONE;
644 	}
645 	os_atomic_init(&pset->rt_runq.count, 0);
646 	os_atomic_init(&pset->rt_runq.earliest_deadline, RT_DEADLINE_NONE);
647 	os_atomic_init(&pset->rt_runq.constraint, RT_CONSTRAINT_NONE);
648 	os_atomic_init(&pset->rt_runq.ed_index, NOPRI);
649 	memset(&pset->rt_runq.runq_stats, 0, sizeof pset->rt_runq.runq_stats);
650 }
651 
/* epsilon, in microseconds, for comparing RT deadlines */
int rt_deadline_epsilon_us = 100;

/*
 * Report the current RT deadline comparison epsilon in microseconds.
 */
int
sched_get_rt_deadline_epsilon(void)
{
	const int epsilon_us = rt_deadline_epsilon_us;

	return epsilon_us;
}
660 
661 void
sched_set_rt_deadline_epsilon(int new_epsilon_us)662 sched_set_rt_deadline_epsilon(int new_epsilon_us)
663 {
664 	rt_deadline_epsilon_us = new_epsilon_us;
665 
666 	uint64_t abstime;
667 	clock_interval_to_absolutetime_interval(rt_deadline_epsilon_us, NSEC_PER_USEC, &abstime);
668 	assert((abstime >> 32) == 0 && ((rt_deadline_epsilon_us == 0) || (uint32_t)abstime != 0));
669 	rt_deadline_epsilon = (uint32_t)abstime;
670 }
671 
672 static void
sched_realtime_timebase_init(void)673 sched_realtime_timebase_init(void)
674 {
675 	uint64_t abstime;
676 
677 	/* smallest rt computation (50 us) */
678 	clock_interval_to_absolutetime_interval(50, NSEC_PER_USEC, &abstime);
679 	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
680 	min_rt_quantum = (uint32_t)abstime;
681 
682 	/* maximum rt computation (50 ms) */
683 	clock_interval_to_absolutetime_interval(
684 		50, 1000 * NSEC_PER_USEC, &abstime);
685 	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
686 	max_rt_quantum = (uint32_t)abstime;
687 
688 	/* constraint threshold for sending backup IPIs (4 ms) */
689 	clock_interval_to_absolutetime_interval(4, NSEC_PER_MSEC, &abstime);
690 	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
691 	rt_constraint_threshold = (uint32_t)abstime;
692 
693 	/* epsilon for comparing deadlines */
694 	sched_set_rt_deadline_epsilon(rt_deadline_epsilon_us);
695 }
696 
697 void
sched_check_spill(processor_set_t pset,thread_t thread)698 sched_check_spill(processor_set_t pset, thread_t thread)
699 {
700 	(void)pset;
701 	(void)thread;
702 
703 	return;
704 }
705 
706 bool
sched_thread_should_yield(processor_t processor,thread_t thread)707 sched_thread_should_yield(processor_t processor, thread_t thread)
708 {
709 	(void)thread;
710 
711 	return !SCHED(processor_queue_empty)(processor) || rt_runq_count(processor->processor_set) > 0;
712 }
713 
714 /* Default implementations of .steal_thread_enabled */
715 bool
sched_steal_thread_DISABLED(processor_set_t pset)716 sched_steal_thread_DISABLED(processor_set_t pset)
717 {
718 	(void)pset;
719 	return false;
720 }
721 
722 bool
sched_steal_thread_enabled(processor_set_t pset)723 sched_steal_thread_enabled(processor_set_t pset)
724 {
725 	return bit_count(pset->node->pset_map) > 1;
726 }
727 
728 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
729 
730 /*
731  * Set up values for timeshare
732  * loading factors.
733  */
734 static void
load_shift_init(void)735 load_shift_init(void)
736 {
737 	int8_t          k, *p = sched_load_shifts;
738 	uint32_t        i, j;
739 
740 	uint32_t        sched_decay_penalty = 1;
741 
742 	if (PE_parse_boot_argn("sched_decay_penalty", &sched_decay_penalty, sizeof(sched_decay_penalty))) {
743 		kprintf("Overriding scheduler decay penalty %u\n", sched_decay_penalty);
744 	}
745 
746 	if (PE_parse_boot_argn("sched_decay_usage_age_factor", &sched_decay_usage_age_factor, sizeof(sched_decay_usage_age_factor))) {
747 		kprintf("Overriding scheduler decay usage age factor %u\n", sched_decay_usage_age_factor);
748 	}
749 
750 	if (sched_decay_penalty == 0) {
751 		/*
752 		 * There is no penalty for timeshare threads for using too much
753 		 * CPU, so set all load shifts to INT8_MIN. Even under high load,
754 		 * sched_pri_shift will be >INT8_MAX, and there will be no
755 		 * penalty applied to threads (nor will sched_usage be updated per
756 		 * thread).
757 		 */
758 		for (i = 0; i < NRQS; i++) {
759 			sched_load_shifts[i] = INT8_MIN;
760 		}
761 
762 		return;
763 	}
764 
765 	*p++ = INT8_MIN; *p++ = 0;
766 
767 	/*
768 	 * For a given system load "i", the per-thread priority
769 	 * penalty per quantum of CPU usage is ~2^k priority
770 	 * levels. "sched_decay_penalty" can cause more
771 	 * array entries to be filled with smaller "k" values
772 	 */
773 	for (i = 2, j = 1 << sched_decay_penalty, k = 1; i < NRQS; ++k) {
774 		for (j <<= 1; (i < j) && (i < NRQS); ++i) {
775 			*p++ = k;
776 		}
777 	}
778 }
779 
780 static void
preempt_pri_init(void)781 preempt_pri_init(void)
782 {
783 	bitmap_t *p = sched_preempt_pri;
784 
785 	for (int i = BASEPRI_FOREGROUND; i < MINPRI_KERNEL; ++i) {
786 		bitmap_set(p, i);
787 	}
788 
789 	for (int i = BASEPRI_PREEMPT; i <= MAXPRI; ++i) {
790 		bitmap_set(p, i);
791 	}
792 }
793 
794 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
795 
796 void
check_monotonic_time(uint64_t ctime)797 check_monotonic_time(uint64_t ctime)
798 {
799 	processor_t processor = current_processor();
800 	uint64_t last_dispatch = processor->last_dispatch;
801 
802 	if (last_dispatch > ctime) {
803 		panic("Non-monotonic time: last_dispatch at 0x%llx, ctime 0x%llx",
804 		    last_dispatch, ctime);
805 	}
806 }
807 
808 
809 /*
810  *	Thread wait timer expiration.
811  *	Runs in timer interrupt context with interrupts disabled.
812  */
813 void
thread_timer_expire(void * p0,__unused void * p1)814 thread_timer_expire(void *p0, __unused void *p1)
815 {
816 	thread_t thread = (thread_t)p0;
817 
818 	assert_thread_magic(thread);
819 
820 	assert(ml_get_interrupts_enabled() == FALSE);
821 
822 	thread_lock(thread);
823 
824 	if (thread->wait_timer_armed) {
825 		thread->wait_timer_armed = false;
826 		clear_wait_internal(thread, THREAD_TIMED_OUT);
827 		/* clear_wait_internal may have dropped and retaken the thread lock */
828 	}
829 
830 	thread->wait_timer_active--;
831 
832 	thread_unlock(thread);
833 }
834 
835 /*
836  *	thread_unblock:
837  *
838  *	Unblock thread on wake up.
839  *
840  *	Returns TRUE if the thread should now be placed on the runqueue.
841  *
842  *	Thread must be locked.
843  *
844  *	Called at splsched().
845  */
846 boolean_t
thread_unblock(thread_t thread,wait_result_t wresult)847 thread_unblock(
848 	thread_t                thread,
849 	wait_result_t   wresult)
850 {
851 	boolean_t               ready_for_runq = FALSE;
852 	thread_t                cthread = current_thread();
853 	uint32_t                new_run_count;
854 	int                             old_thread_state;
855 
856 	/*
857 	 *	Set wait_result.
858 	 */
859 	thread->wait_result = wresult;
860 
861 	/*
862 	 *	Cancel pending wait timer.
863 	 */
864 	if (thread->wait_timer_armed) {
865 		if (timer_call_cancel(thread->wait_timer)) {
866 			thread->wait_timer_active--;
867 		}
868 		thread->wait_timer_armed = false;
869 	}
870 
871 	boolean_t aticontext, pidle;
872 	ml_get_power_state(&aticontext, &pidle);
873 
874 	/*
875 	 *	Update scheduling state: not waiting,
876 	 *	set running.
877 	 */
878 	old_thread_state = thread->state;
879 	thread->state = (old_thread_state | TH_RUN) &
880 	    ~(TH_WAIT | TH_UNINT | TH_WAIT_REPORT | TH_WAKING);
881 
882 	if ((old_thread_state & TH_RUN) == 0) {
883 		uint64_t ctime = mach_approximate_time();
884 
885 		check_monotonic_time(ctime);
886 
887 		thread->last_made_runnable_time = thread->last_basepri_change_time = ctime;
888 		timer_start(&thread->runnable_timer, ctime);
889 
890 		ready_for_runq = TRUE;
891 
892 		if (old_thread_state & TH_WAIT_REPORT) {
893 			(*thread->sched_call)(SCHED_CALL_UNBLOCK, thread);
894 		}
895 
896 		/* Update the runnable thread count */
897 		new_run_count = SCHED(run_count_incr)(thread);
898 
899 #if CONFIG_SCHED_AUTO_JOIN
900 		if (aticontext == FALSE && work_interval_should_propagate(cthread, thread)) {
901 			work_interval_auto_join_propagate(cthread, thread);
902 		}
903 #endif /*CONFIG_SCHED_AUTO_JOIN */
904 
905 	} else {
906 		/*
907 		 * Either the thread is idling in place on another processor,
908 		 * or it hasn't finished context switching yet.
909 		 */
910 		assert((thread->state & TH_IDLE) == 0);
911 		/*
912 		 * The run count is only dropped after the context switch completes
913 		 * and the thread is still waiting, so we should not run_incr here
914 		 */
915 		new_run_count = os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
916 	}
917 
918 	/*
919 	 * Calculate deadline for real-time threads.
920 	 */
921 	if (thread->sched_mode == TH_MODE_REALTIME) {
922 		uint64_t ctime = mach_absolute_time();
923 		thread->realtime.deadline = thread->realtime.constraint + ctime;
924 		KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SET_RT_DEADLINE) | DBG_FUNC_NONE,
925 		    (uintptr_t)thread_tid(thread), thread->realtime.deadline, thread->realtime.computation, 0);
926 	}
927 
928 	/*
929 	 * Clear old quantum, fail-safe computation, etc.
930 	 */
931 	thread->quantum_remaining = 0;
932 	thread->computation_metered = 0;
933 	thread->reason = AST_NONE;
934 	thread->block_hint = kThreadWaitNone;
935 
936 	/* Obtain power-relevant interrupt and "platform-idle exit" statistics.
937 	 * We also account for "double hop" thread signaling via
938 	 * the thread callout infrastructure.
939 	 * DRK: consider removing the callout wakeup counters in the future
940 	 * they're present for verification at the moment.
941 	 */
942 
943 	if (__improbable(aticontext && !(thread_get_tag_internal(thread) & THREAD_TAG_CALLOUT))) {
944 		DTRACE_SCHED2(iwakeup, struct thread *, thread, struct proc *, current_proc());
945 
946 		uint64_t ttd = current_processor()->timer_call_ttd;
947 
948 		if (ttd) {
949 			if (ttd <= timer_deadline_tracking_bin_1) {
950 				thread->thread_timer_wakeups_bin_1++;
951 			} else if (ttd <= timer_deadline_tracking_bin_2) {
952 				thread->thread_timer_wakeups_bin_2++;
953 			}
954 		}
955 
956 		ledger_credit_thread(thread, thread->t_ledger,
957 		    task_ledgers.interrupt_wakeups, 1);
958 		if (pidle) {
959 			ledger_credit_thread(thread, thread->t_ledger,
960 			    task_ledgers.platform_idle_wakeups, 1);
961 		}
962 	} else if (thread_get_tag_internal(cthread) & THREAD_TAG_CALLOUT) {
963 		/* TODO: what about an interrupt that does a wake taken on a callout thread? */
964 		if (cthread->callout_woken_from_icontext) {
965 			ledger_credit_thread(thread, thread->t_ledger,
966 			    task_ledgers.interrupt_wakeups, 1);
967 			thread->thread_callout_interrupt_wakeups++;
968 
969 			if (cthread->callout_woken_from_platform_idle) {
970 				ledger_credit_thread(thread, thread->t_ledger,
971 				    task_ledgers.platform_idle_wakeups, 1);
972 				thread->thread_callout_platform_idle_wakeups++;
973 			}
974 
975 			cthread->callout_woke_thread = TRUE;
976 		}
977 	}
978 
979 	if (thread_get_tag_internal(thread) & THREAD_TAG_CALLOUT) {
980 		thread->callout_woken_from_icontext = !!aticontext;
981 		thread->callout_woken_from_platform_idle = !!pidle;
982 		thread->callout_woke_thread = FALSE;
983 	}
984 
985 #if KPERF
986 	if (ready_for_runq) {
987 		kperf_make_runnable(thread, aticontext);
988 	}
989 #endif /* KPERF */
990 
991 	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
992 	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_MAKE_RUNNABLE) | DBG_FUNC_NONE,
993 	    (uintptr_t)thread_tid(thread), thread->sched_pri, thread->wait_result,
994 	    sched_run_buckets[TH_BUCKET_RUN], 0);
995 
996 	DTRACE_SCHED2(wakeup, struct thread *, thread, struct proc *, current_proc());
997 
998 	return ready_for_runq;
999 }
1000 
1001 /*
1002  *	Routine:	thread_allowed_for_handoff
1003  *	Purpose:
1004  *		Check if the thread is allowed for handoff operation
1005  *	Conditions:
1006  *		thread lock held, IPC locks may be held.
1007  *	TODO: In future, do not allow handoff if threads have different cluster
1008  *	recommendations.
1009  */
1010 boolean_t
thread_allowed_for_handoff(thread_t thread)1011 thread_allowed_for_handoff(
1012 	thread_t         thread)
1013 {
1014 	thread_t self = current_thread();
1015 
1016 	if (allow_direct_handoff &&
1017 	    thread->sched_mode == TH_MODE_REALTIME &&
1018 	    self->sched_mode == TH_MODE_REALTIME) {
1019 		return TRUE;
1020 	}
1021 
1022 	return FALSE;
1023 }
1024 
1025 /*
1026  *	Routine:	thread_go
1027  *	Purpose:
1028  *		Unblock and dispatch thread.
1029  *	Conditions:
1030  *		thread lock held, IPC locks may be held.
1031  *		thread must have been waiting
1032  */
1033 void
thread_go(thread_t thread,wait_result_t wresult,bool try_handoff)1034 thread_go(
1035 	thread_t                thread,
1036 	wait_result_t           wresult,
1037 	bool                    try_handoff)
1038 {
1039 	thread_t self = current_thread();
1040 
1041 	assert_thread_magic(thread);
1042 
1043 	assert(thread->at_safe_point == FALSE);
1044 	assert(thread->wait_event == NO_EVENT64);
1045 	assert(waitq_is_null(thread->waitq));
1046 
1047 	assert(!(thread->state & (TH_TERMINATE | TH_TERMINATE2)));
1048 	assert(thread->state & TH_WAIT);
1049 
1050 	if (thread->started) {
1051 		assert(thread->state & TH_WAKING);
1052 	}
1053 
1054 	thread_lock_assert(thread, LCK_ASSERT_OWNED);
1055 
1056 	assert(ml_get_interrupts_enabled() == false);
1057 
1058 	if (thread_unblock(thread, wresult)) {
1059 #if SCHED_TRACE_THREAD_WAKEUPS
1060 		backtrace(&thread->thread_wakeup_bt[0],
1061 		    (sizeof(thread->thread_wakeup_bt) / sizeof(uintptr_t)), NULL,
1062 		    NULL);
1063 #endif /* SCHED_TRACE_THREAD_WAKEUPS */
1064 		if (try_handoff && thread_allowed_for_handoff(thread)) {
1065 			thread_reference(thread);
1066 			assert(self->handoff_thread == NULL);
1067 			self->handoff_thread = thread;
1068 		} else {
1069 			thread_setrun(thread, SCHED_PREEMPT | SCHED_TAILQ);
1070 		}
1071 	}
1072 }
1073 
1074 /*
1075  *	Routine:	thread_mark_wait_locked
1076  *	Purpose:
1077  *		Mark a thread as waiting.  If, given the circumstances,
1078  *		it doesn't want to wait (i.e. already aborted), then
1079  *		indicate that in the return value.
1080  *	Conditions:
1081  *		at splsched() and thread is locked.
1082  */
1083 __private_extern__
1084 wait_result_t
thread_mark_wait_locked(thread_t thread,wait_interrupt_t interruptible_orig)1085 thread_mark_wait_locked(
1086 	thread_t                        thread,
1087 	wait_interrupt_t        interruptible_orig)
1088 {
1089 	boolean_t                       at_safe_point;
1090 	wait_interrupt_t        interruptible = interruptible_orig;
1091 
1092 	if (thread->state & TH_IDLE) {
1093 		panic("Invalid attempt to wait while running the idle thread");
1094 	}
1095 
1096 	assert(!(thread->state & (TH_WAIT | TH_WAKING | TH_IDLE | TH_UNINT | TH_TERMINATE2 | TH_WAIT_REPORT)));
1097 
1098 	/*
1099 	 *	The thread may have certain types of interrupts/aborts masked
1100 	 *	off.  Even if the wait location says these types of interrupts
1101 	 *	are OK, we have to honor mask settings (outer-scoped code may
1102 	 *	not be able to handle aborts at the moment).
1103 	 */
1104 	interruptible &= TH_OPT_INTMASK;
1105 	if (interruptible > (thread->options & TH_OPT_INTMASK)) {
1106 		interruptible = thread->options & TH_OPT_INTMASK;
1107 	}
1108 
1109 	at_safe_point = (interruptible == THREAD_ABORTSAFE);
1110 
1111 	if (interruptible == THREAD_UNINT ||
1112 	    !(thread->sched_flags & TH_SFLAG_ABORT) ||
1113 	    (!at_safe_point &&
1114 	    (thread->sched_flags & TH_SFLAG_ABORTSAFELY))) {
1115 		if (!(thread->state & TH_TERMINATE)) {
1116 			DTRACE_SCHED(sleep);
1117 		}
1118 
1119 		int state_bits = TH_WAIT;
1120 		if (!interruptible) {
1121 			state_bits |= TH_UNINT;
1122 		}
1123 		if (thread->sched_call) {
1124 			wait_interrupt_t mask = THREAD_WAIT_NOREPORT_USER;
1125 			if (is_kerneltask(get_threadtask(thread))) {
1126 				mask = THREAD_WAIT_NOREPORT_KERNEL;
1127 			}
1128 			if ((interruptible_orig & mask) == 0) {
1129 				state_bits |= TH_WAIT_REPORT;
1130 			}
1131 		}
1132 		thread->state |= state_bits;
1133 		thread->at_safe_point = at_safe_point;
1134 
1135 		/* TODO: pass this through assert_wait instead, have
1136 		 * assert_wait just take a struct as an argument */
1137 		assert(!thread->block_hint);
1138 		thread->block_hint = thread->pending_block_hint;
1139 		thread->pending_block_hint = kThreadWaitNone;
1140 
1141 		return thread->wait_result = THREAD_WAITING;
1142 	} else {
1143 		if (thread->sched_flags & TH_SFLAG_ABORTSAFELY) {
1144 			thread->sched_flags &= ~TH_SFLAG_ABORTED_MASK;
1145 		}
1146 	}
1147 	thread->pending_block_hint = kThreadWaitNone;
1148 
1149 	return thread->wait_result = THREAD_INTERRUPTED;
1150 }
1151 
1152 /*
1153  *	Routine:	thread_interrupt_level
1154  *	Purpose:
1155  *	        Set the maximum interruptible state for the
1156  *		current thread.  The effective value of any
1157  *		interruptible flag passed into assert_wait
1158  *		will never exceed this.
1159  *
1160  *		Useful for code that must not be interrupted,
1161  *		but which calls code that doesn't know that.
1162  *	Returns:
1163  *		The old interrupt level for the thread.
1164  */
1165 __private_extern__
1166 wait_interrupt_t
thread_interrupt_level(wait_interrupt_t new_level)1167 thread_interrupt_level(
1168 	wait_interrupt_t new_level)
1169 {
1170 	thread_t thread = current_thread();
1171 	wait_interrupt_t result = thread->options & TH_OPT_INTMASK;
1172 
1173 	thread->options = (thread->options & ~TH_OPT_INTMASK) | (new_level & TH_OPT_INTMASK);
1174 
1175 	return result;
1176 }
1177 
1178 /*
1179  *	assert_wait:
1180  *
1181  *	Assert that the current thread is about to go to
1182  *	sleep until the specified event occurs.
1183  */
1184 wait_result_t
assert_wait(event_t event,wait_interrupt_t interruptible)1185 assert_wait(
1186 	event_t                         event,
1187 	wait_interrupt_t        interruptible)
1188 {
1189 	if (__improbable(event == NO_EVENT)) {
1190 		panic("%s() called with NO_EVENT", __func__);
1191 	}
1192 
1193 	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1194 	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT) | DBG_FUNC_NONE,
1195 	    VM_KERNEL_UNSLIDE_OR_PERM(event), 0, 0, 0, 0);
1196 
1197 	struct waitq *waitq;
1198 	waitq = global_eventq(event);
1199 	return waitq_assert_wait64(waitq, CAST_EVENT64_T(event), interruptible, TIMEOUT_WAIT_FOREVER);
1200 }
1201 
1202 /*
1203  *	assert_wait_queue:
1204  *
1205  *	Return the global waitq for the specified event
1206  */
1207 struct waitq *
assert_wait_queue(event_t event)1208 assert_wait_queue(
1209 	event_t                         event)
1210 {
1211 	return global_eventq(event);
1212 }
1213 
1214 wait_result_t
assert_wait_timeout(event_t event,wait_interrupt_t interruptible,uint32_t interval,uint32_t scale_factor)1215 assert_wait_timeout(
1216 	event_t                         event,
1217 	wait_interrupt_t        interruptible,
1218 	uint32_t                        interval,
1219 	uint32_t                        scale_factor)
1220 {
1221 	thread_t                        thread = current_thread();
1222 	wait_result_t           wresult;
1223 	uint64_t                        deadline;
1224 	spl_t                           s;
1225 
1226 	if (__improbable(event == NO_EVENT)) {
1227 		panic("%s() called with NO_EVENT", __func__);
1228 	}
1229 
1230 	struct waitq *waitq;
1231 	waitq = global_eventq(event);
1232 
1233 	s = splsched();
1234 	waitq_lock(waitq);
1235 
1236 	clock_interval_to_deadline(interval, scale_factor, &deadline);
1237 
1238 	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1239 	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT) | DBG_FUNC_NONE,
1240 	    VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);
1241 
1242 	wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
1243 	    interruptible,
1244 	    TIMEOUT_URGENCY_SYS_NORMAL,
1245 	    deadline, TIMEOUT_NO_LEEWAY,
1246 	    thread);
1247 
1248 	waitq_unlock(waitq);
1249 	splx(s);
1250 	return wresult;
1251 }
1252 
1253 wait_result_t
assert_wait_timeout_with_leeway(event_t event,wait_interrupt_t interruptible,wait_timeout_urgency_t urgency,uint32_t interval,uint32_t leeway,uint32_t scale_factor)1254 assert_wait_timeout_with_leeway(
1255 	event_t                         event,
1256 	wait_interrupt_t        interruptible,
1257 	wait_timeout_urgency_t  urgency,
1258 	uint32_t                        interval,
1259 	uint32_t                        leeway,
1260 	uint32_t                        scale_factor)
1261 {
1262 	thread_t                        thread = current_thread();
1263 	wait_result_t           wresult;
1264 	uint64_t                        deadline;
1265 	uint64_t                        abstime;
1266 	uint64_t                        slop;
1267 	uint64_t                        now;
1268 	spl_t                           s;
1269 
1270 	if (__improbable(event == NO_EVENT)) {
1271 		panic("%s() called with NO_EVENT", __func__);
1272 	}
1273 
1274 	now = mach_absolute_time();
1275 	clock_interval_to_absolutetime_interval(interval, scale_factor, &abstime);
1276 	deadline = now + abstime;
1277 
1278 	clock_interval_to_absolutetime_interval(leeway, scale_factor, &slop);
1279 
1280 	struct waitq *waitq;
1281 	waitq = global_eventq(event);
1282 
1283 	s = splsched();
1284 	waitq_lock(waitq);
1285 
1286 	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1287 	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT) | DBG_FUNC_NONE,
1288 	    VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);
1289 
1290 	wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
1291 	    interruptible,
1292 	    urgency, deadline, slop,
1293 	    thread);
1294 
1295 	waitq_unlock(waitq);
1296 	splx(s);
1297 	return wresult;
1298 }
1299 
1300 wait_result_t
assert_wait_deadline(event_t event,wait_interrupt_t interruptible,uint64_t deadline)1301 assert_wait_deadline(
1302 	event_t                         event,
1303 	wait_interrupt_t        interruptible,
1304 	uint64_t                        deadline)
1305 {
1306 	thread_t                        thread = current_thread();
1307 	wait_result_t           wresult;
1308 	spl_t                           s;
1309 
1310 	if (__improbable(event == NO_EVENT)) {
1311 		panic("%s() called with NO_EVENT", __func__);
1312 	}
1313 
1314 	struct waitq *waitq;
1315 	waitq = global_eventq(event);
1316 
1317 	s = splsched();
1318 	waitq_lock(waitq);
1319 
1320 	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1321 	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT) | DBG_FUNC_NONE,
1322 	    VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);
1323 
1324 	wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
1325 	    interruptible,
1326 	    TIMEOUT_URGENCY_SYS_NORMAL, deadline,
1327 	    TIMEOUT_NO_LEEWAY, thread);
1328 	waitq_unlock(waitq);
1329 	splx(s);
1330 	return wresult;
1331 }
1332 
1333 wait_result_t
assert_wait_deadline_with_leeway(event_t event,wait_interrupt_t interruptible,wait_timeout_urgency_t urgency,uint64_t deadline,uint64_t leeway)1334 assert_wait_deadline_with_leeway(
1335 	event_t                         event,
1336 	wait_interrupt_t        interruptible,
1337 	wait_timeout_urgency_t  urgency,
1338 	uint64_t                        deadline,
1339 	uint64_t                        leeway)
1340 {
1341 	thread_t                        thread = current_thread();
1342 	wait_result_t           wresult;
1343 	spl_t                           s;
1344 
1345 	if (__improbable(event == NO_EVENT)) {
1346 		panic("%s() called with NO_EVENT", __func__);
1347 	}
1348 
1349 	struct waitq *waitq;
1350 	waitq = global_eventq(event);
1351 
1352 	s = splsched();
1353 	waitq_lock(waitq);
1354 
1355 	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1356 	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT) | DBG_FUNC_NONE,
1357 	    VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);
1358 
1359 	wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
1360 	    interruptible,
1361 	    urgency, deadline, leeway,
1362 	    thread);
1363 	waitq_unlock(waitq);
1364 	splx(s);
1365 	return wresult;
1366 }
1367 
1368 void
sched_cond_init(sched_cond_atomic_t * cond)1369 sched_cond_init(
1370 	sched_cond_atomic_t *cond)
1371 {
1372 	os_atomic_init(cond, SCHED_COND_INIT);
1373 }
1374 
1375 wait_result_t
sched_cond_wait_parameter(sched_cond_atomic_t * cond,wait_interrupt_t interruptible,thread_continue_t continuation,void * parameter)1376 sched_cond_wait_parameter(
1377 	sched_cond_atomic_t *cond,
1378 	wait_interrupt_t interruptible,
1379 	thread_continue_t continuation,
1380 	void *parameter)
1381 {
1382 	assert_wait((event_t) cond, interruptible);
1383 	/* clear active bit to indicate future wakeups will have to unblock this thread */
1384 	sched_cond_t new_state = (sched_cond_t) os_atomic_andnot(cond, SCHED_COND_ACTIVE, relaxed);
1385 	if (__improbable(new_state & SCHED_COND_WAKEUP)) {
1386 		/* a wakeup has been issued; undo wait assertion, ack the wakeup, and return */
1387 		thread_t thread = current_thread();
1388 		clear_wait(thread, THREAD_AWAKENED);
1389 		sched_cond_ack(cond);
1390 		return THREAD_AWAKENED;
1391 	}
1392 	return thread_block_parameter(continuation, parameter);
1393 }
1394 
1395 wait_result_t
sched_cond_wait(sched_cond_atomic_t * cond,wait_interrupt_t interruptible,thread_continue_t continuation)1396 sched_cond_wait(
1397 	sched_cond_atomic_t *cond,
1398 	wait_interrupt_t interruptible,
1399 	thread_continue_t continuation)
1400 {
1401 	return sched_cond_wait_parameter(cond, interruptible, continuation, NULL);
1402 }
1403 
1404 sched_cond_t
sched_cond_ack(sched_cond_atomic_t * cond)1405 sched_cond_ack(
1406 	sched_cond_atomic_t *cond)
1407 {
1408 	sched_cond_t new_cond = (sched_cond_t) os_atomic_xor(cond, SCHED_COND_ACTIVE | SCHED_COND_WAKEUP, acquire);
1409 	assert(new_cond & SCHED_COND_ACTIVE);
1410 	return new_cond;
1411 }
1412 
1413 kern_return_t
sched_cond_signal(sched_cond_atomic_t * cond,thread_t thread)1414 sched_cond_signal(
1415 	sched_cond_atomic_t  *cond,
1416 	thread_t thread)
1417 {
1418 	disable_preemption();
1419 	sched_cond_t old_cond = (sched_cond_t) os_atomic_or_orig(cond, SCHED_COND_WAKEUP, release);
1420 	if (!(old_cond & (SCHED_COND_WAKEUP | SCHED_COND_ACTIVE))) {
1421 		/* this was the first wakeup to be issued AND the thread was inactive */
1422 		thread_wakeup_thread((event_t) cond, thread);
1423 	}
1424 	enable_preemption();
1425 	return KERN_SUCCESS;
1426 }
1427 
1428 /*
1429  * thread_isoncpu:
1430  *
1431  * Return TRUE if a thread is running on a processor such that an AST
1432  * is needed to pull it out of userspace execution, or if executing in
1433  * the kernel, bring to a context switch boundary that would cause
1434  * thread state to be serialized in the thread PCB.
1435  *
1436  * Thread locked, returns the same way. While locked, fields
1437  * like "state" cannot change. "runq" can change only from set to unset.
1438  */
1439 static inline boolean_t
thread_isoncpu(thread_t thread)1440 thread_isoncpu(thread_t thread)
1441 {
1442 	/* Not running or runnable */
1443 	if (!(thread->state & TH_RUN)) {
1444 		return FALSE;
1445 	}
1446 
1447 	/* Waiting on a runqueue, not currently running */
1448 	/* TODO: This is invalid - it can get dequeued without thread lock, but not context switched. */
1449 	if (thread_get_runq(thread) != PROCESSOR_NULL) {
1450 		return FALSE;
1451 	}
1452 
1453 	/*
1454 	 * Thread does not have a stack yet
1455 	 * It could be on the stack alloc queue or preparing to be invoked
1456 	 */
1457 	if (!thread->kernel_stack) {
1458 		return FALSE;
1459 	}
1460 
1461 	/*
1462 	 * Thread must be running on a processor, or
1463 	 * about to run, or just did run. In all these
1464 	 * cases, an AST to the processor is needed
1465 	 * to guarantee that the thread is kicked out
1466 	 * of userspace and the processor has
1467 	 * context switched (and saved register state).
1468 	 */
1469 	return TRUE;
1470 }
1471 
1472 /*
1473  * thread_stop:
1474  *
1475  * Force a preemption point for a thread and wait
1476  * for it to stop running on a CPU. If a stronger
1477  * guarantee is requested, wait until no longer
1478  * runnable. Arbitrates access among
1479  * multiple stop requests. (released by unstop)
1480  *
1481  * The thread must enter a wait state and stop via a
1482  * separate means.
1483  *
1484  * Returns FALSE if interrupted.
1485  */
1486 boolean_t
thread_stop(thread_t thread,boolean_t until_not_runnable)1487 thread_stop(
1488 	thread_t                thread,
1489 	boolean_t       until_not_runnable)
1490 {
1491 	wait_result_t   wresult;
1492 	spl_t                   s = splsched();
1493 	boolean_t               oncpu;
1494 
1495 	wake_lock(thread);
1496 	thread_lock(thread);
1497 
1498 	while (thread->state & TH_SUSP) {
1499 		thread->wake_active = TRUE;
1500 		thread_unlock(thread);
1501 
1502 		wresult = assert_wait(&thread->wake_active, THREAD_ABORTSAFE);
1503 		wake_unlock(thread);
1504 		splx(s);
1505 
1506 		if (wresult == THREAD_WAITING) {
1507 			wresult = thread_block(THREAD_CONTINUE_NULL);
1508 		}
1509 
1510 		if (wresult != THREAD_AWAKENED) {
1511 			return FALSE;
1512 		}
1513 
1514 		s = splsched();
1515 		wake_lock(thread);
1516 		thread_lock(thread);
1517 	}
1518 
1519 	thread->state |= TH_SUSP;
1520 
1521 	while ((oncpu = thread_isoncpu(thread)) ||
1522 	    (until_not_runnable && (thread->state & TH_RUN))) {
1523 		processor_t             processor;
1524 
1525 		if (oncpu) {
1526 			assert(thread->state & TH_RUN);
1527 			processor = thread->chosen_processor;
1528 			cause_ast_check(processor);
1529 		}
1530 
1531 		thread->wake_active = TRUE;
1532 		thread_unlock(thread);
1533 
1534 		wresult = assert_wait(&thread->wake_active, THREAD_ABORTSAFE);
1535 		wake_unlock(thread);
1536 		splx(s);
1537 
1538 		if (wresult == THREAD_WAITING) {
1539 			wresult = thread_block(THREAD_CONTINUE_NULL);
1540 		}
1541 
1542 		if (wresult != THREAD_AWAKENED) {
1543 			thread_unstop(thread);
1544 			return FALSE;
1545 		}
1546 
1547 		s = splsched();
1548 		wake_lock(thread);
1549 		thread_lock(thread);
1550 	}
1551 
1552 	thread_unlock(thread);
1553 	wake_unlock(thread);
1554 	splx(s);
1555 
1556 	/*
1557 	 * We return with the thread unlocked. To prevent it from
1558 	 * transitioning to a runnable state (or from TH_RUN to
1559 	 * being on the CPU), the caller must ensure the thread
1560 	 * is stopped via an external means (such as an AST)
1561 	 */
1562 
1563 	return TRUE;
1564 }
1565 
1566 /*
1567  * thread_unstop:
1568  *
1569  * Release a previous stop request and set
1570  * the thread running if appropriate.
1571  *
1572  * Use only after a successful stop operation.
1573  */
1574 void
thread_unstop(thread_t thread)1575 thread_unstop(
1576 	thread_t        thread)
1577 {
1578 	spl_t           s = splsched();
1579 
1580 	wake_lock(thread);
1581 	thread_lock(thread);
1582 
1583 	assert((thread->state & (TH_RUN | TH_WAIT | TH_SUSP)) != TH_SUSP);
1584 
1585 	if (thread->state & TH_SUSP) {
1586 		thread->state &= ~TH_SUSP;
1587 
1588 		if (thread->wake_active) {
1589 			thread->wake_active = FALSE;
1590 			thread_unlock(thread);
1591 
1592 			thread_wakeup(&thread->wake_active);
1593 			wake_unlock(thread);
1594 			splx(s);
1595 
1596 			return;
1597 		}
1598 	}
1599 
1600 	thread_unlock(thread);
1601 	wake_unlock(thread);
1602 	splx(s);
1603 }
1604 
1605 /*
1606  * thread_wait:
1607  *
1608  * Wait for a thread to stop running. (non-interruptible)
1609  *
1610  */
1611 void
thread_wait(thread_t thread,boolean_t until_not_runnable)1612 thread_wait(
1613 	thread_t        thread,
1614 	boolean_t       until_not_runnable)
1615 {
1616 	wait_result_t   wresult;
1617 	boolean_t       oncpu;
1618 	processor_t     processor;
1619 	spl_t           s = splsched();
1620 
1621 	wake_lock(thread);
1622 	thread_lock(thread);
1623 
1624 	/*
1625 	 * Wait until not running on a CPU.  If stronger requirement
1626 	 * desired, wait until not runnable.  Assumption: if thread is
1627 	 * on CPU, then TH_RUN is set, so we're not waiting in any case
1628 	 * where the original, pure "TH_RUN" check would have let us
1629 	 * finish.
1630 	 */
1631 	while ((oncpu = thread_isoncpu(thread)) ||
1632 	    (until_not_runnable && (thread->state & TH_RUN))) {
1633 		if (oncpu) {
1634 			assert(thread->state & TH_RUN);
1635 			processor = thread->chosen_processor;
1636 			cause_ast_check(processor);
1637 		}
1638 
1639 		thread->wake_active = TRUE;
1640 		thread_unlock(thread);
1641 
1642 		wresult = assert_wait(&thread->wake_active, THREAD_UNINT);
1643 		wake_unlock(thread);
1644 		splx(s);
1645 
1646 		if (wresult == THREAD_WAITING) {
1647 			thread_block(THREAD_CONTINUE_NULL);
1648 		}
1649 
1650 		s = splsched();
1651 		wake_lock(thread);
1652 		thread_lock(thread);
1653 	}
1654 
1655 	thread_unlock(thread);
1656 	wake_unlock(thread);
1657 	splx(s);
1658 }
1659 
1660 /*
1661  *	Routine: clear_wait_internal
1662  *
1663  *		Clear the wait condition for the specified thread.
1664  *		Start the thread executing if that is appropriate.
1665  *	Arguments:
1666  *		thread		thread to awaken
1667  *		result		Wakeup result the thread should see
1668  *	Conditions:
1669  *		At splsched
1670  *		the thread is locked.
1671  *	Returns:
1672  *		KERN_SUCCESS		thread was rousted out a wait
1673  *		KERN_FAILURE		thread was waiting but could not be rousted
1674  *		KERN_NOT_WAITING	thread was not waiting
1675  */
1676 __private_extern__ kern_return_t
clear_wait_internal(thread_t thread,wait_result_t wresult)1677 clear_wait_internal(
1678 	thread_t        thread,
1679 	wait_result_t   wresult)
1680 {
1681 	waitq_t waitq = thread->waitq;
1682 
1683 	if (wresult == THREAD_INTERRUPTED && (thread->state & TH_UNINT)) {
1684 		return KERN_FAILURE;
1685 	}
1686 
1687 	/*
1688 	 * Check that the thread is waiting and not waking, as a waking thread
1689 	 * has already cleared its waitq, and is destined to be go'ed, don't
1690 	 * need to do it again.
1691 	 */
1692 	if ((thread->state & (TH_WAIT | TH_TERMINATE | TH_WAKING)) != TH_WAIT) {
1693 		assert(waitq_is_null(thread->waitq));
1694 		return KERN_NOT_WAITING;
1695 	}
1696 
1697 	/* may drop and retake the thread lock */
1698 	if (!waitq_is_null(waitq) && !waitq_pull_thread_locked(waitq, thread)) {
1699 		return KERN_NOT_WAITING;
1700 	}
1701 
1702 	thread_go(thread, wresult, /* handoff */ false);
1703 
1704 	return KERN_SUCCESS;
1705 }
1706 
1707 
1708 /*
1709  *	clear_wait:
1710  *
1711  *	Clear the wait condition for the specified thread.  Start the thread
1712  *	executing if that is appropriate.
1713  *
1714  *	parameters:
1715  *	  thread		thread to awaken
1716  *	  result		Wakeup result the thread should see
1717  */
1718 kern_return_t
clear_wait(thread_t thread,wait_result_t result)1719 clear_wait(
1720 	thread_t                thread,
1721 	wait_result_t   result)
1722 {
1723 	kern_return_t ret;
1724 	spl_t           s;
1725 
1726 	s = splsched();
1727 	thread_lock(thread);
1728 
1729 	ret = clear_wait_internal(thread, result);
1730 
1731 	if (thread == current_thread()) {
1732 		/*
1733 		 * The thread must be ready to wait again immediately
1734 		 * after clearing its own wait.
1735 		 */
1736 		assert((thread->state & TH_WAKING) == 0);
1737 	}
1738 
1739 	thread_unlock(thread);
1740 	splx(s);
1741 	return ret;
1742 }
1743 
1744 
1745 /*
1746  *	thread_wakeup_prim:
1747  *
1748  *	Common routine for thread_wakeup, thread_wakeup_with_result,
1749  *	and thread_wakeup_one.
1750  *
1751  */
1752 kern_return_t
thread_wakeup_prim(event_t event,boolean_t one_thread,wait_result_t result)1753 thread_wakeup_prim(
1754 	event_t          event,
1755 	boolean_t        one_thread,
1756 	wait_result_t    result)
1757 {
1758 	if (__improbable(event == NO_EVENT)) {
1759 		panic("%s() called with NO_EVENT", __func__);
1760 	}
1761 
1762 	struct waitq *wq = global_eventq(event);
1763 
1764 	if (one_thread) {
1765 		return waitq_wakeup64_one(wq, CAST_EVENT64_T(event), result, WAITQ_WAKEUP_DEFAULT);
1766 	} else {
1767 		return waitq_wakeup64_all(wq, CAST_EVENT64_T(event), result, WAITQ_WAKEUP_DEFAULT);
1768 	}
1769 }
1770 
1771 /*
1772  * Wakeup a specified thread if and only if it's waiting for this event
1773  */
1774 kern_return_t
thread_wakeup_thread(event_t event,thread_t thread)1775 thread_wakeup_thread(
1776 	event_t         event,
1777 	thread_t        thread)
1778 {
1779 	if (__improbable(event == NO_EVENT)) {
1780 		panic("%s() called with NO_EVENT", __func__);
1781 	}
1782 
1783 	if (__improbable(thread == THREAD_NULL)) {
1784 		panic("%s() called with THREAD_NULL", __func__);
1785 	}
1786 
1787 	struct waitq *wq = global_eventq(event);
1788 
1789 	return waitq_wakeup64_thread(wq, CAST_EVENT64_T(event), thread, THREAD_AWAKENED);
1790 }
1791 
1792 /*
1793  * Wakeup a thread waiting on an event and promote it to a priority.
1794  *
1795  * Requires woken thread to un-promote itself when done.
1796  */
1797 kern_return_t
thread_wakeup_one_with_pri(event_t event,int priority)1798 thread_wakeup_one_with_pri(
1799 	event_t      event,
1800 	int          priority)
1801 {
1802 	if (__improbable(event == NO_EVENT)) {
1803 		panic("%s() called with NO_EVENT", __func__);
1804 	}
1805 
1806 	struct waitq *wq = global_eventq(event);
1807 
1808 	return waitq_wakeup64_one(wq, CAST_EVENT64_T(event), THREAD_AWAKENED, priority);
1809 }
1810 
1811 /*
1812  * Wakeup a thread waiting on an event,
1813  * promote it to a priority,
1814  * and return a reference to the woken thread.
1815  *
1816  * Requires woken thread to un-promote itself when done.
1817  */
1818 thread_t
thread_wakeup_identify(event_t event,int priority)1819 thread_wakeup_identify(event_t  event,
1820     int      priority)
1821 {
1822 	if (__improbable(event == NO_EVENT)) {
1823 		panic("%s() called with NO_EVENT", __func__);
1824 	}
1825 
1826 	struct waitq *wq = global_eventq(event);
1827 
1828 	return waitq_wakeup64_identify(wq, CAST_EVENT64_T(event), THREAD_AWAKENED, priority);
1829 }
1830 
1831 /*
1832  *	thread_bind:
1833  *
1834  *	Force the current thread to execute on the specified processor.
1835  *	Takes effect after the next thread_block().
1836  *
1837  *	Returns the previous binding.  PROCESSOR_NULL means
1838  *	not bound.
1839  *
1840  *	XXX - DO NOT export this to users - XXX
1841  */
1842 processor_t
thread_bind(processor_t processor)1843 thread_bind(
1844 	processor_t             processor)
1845 {
1846 	thread_t                self = current_thread();
1847 	processor_t             prev;
1848 	spl_t                   s;
1849 
1850 	s = splsched();
1851 	thread_lock(self);
1852 
1853 	prev = thread_bind_internal(self, processor);
1854 
1855 	thread_unlock(self);
1856 	splx(s);
1857 
1858 	return prev;
1859 }
1860 
1861 void
thread_bind_during_wakeup(thread_t thread,processor_t processor)1862 thread_bind_during_wakeup(thread_t thread, processor_t processor)
1863 {
1864 	assert(!ml_get_interrupts_enabled());
1865 	assert((thread->state & (TH_WAIT | TH_WAKING)) == (TH_WAIT | TH_WAKING));
1866 #if MACH_ASSERT
1867 	thread_lock_assert(thread, LCK_ASSERT_OWNED);
1868 #endif
1869 
1870 	if (thread->bound_processor != processor) {
1871 		thread_bind_internal(thread, processor);
1872 	}
1873 }
1874 
1875 void
thread_unbind_after_queue_shutdown(thread_t thread,processor_t processor __assert_only)1876 thread_unbind_after_queue_shutdown(
1877 	thread_t                thread,
1878 	processor_t             processor __assert_only)
1879 {
1880 	assert(!ml_get_interrupts_enabled());
1881 
1882 	thread_lock(thread);
1883 
1884 	if (thread->bound_processor) {
1885 		bool removed;
1886 
1887 		assert(thread->bound_processor == processor);
1888 
1889 		removed = thread_run_queue_remove(thread);
1890 		/*
1891 		 * we can always unbind even if we didn't really remove the
1892 		 * thread from the runqueue
1893 		 */
1894 		thread_bind_internal(thread, PROCESSOR_NULL);
1895 		if (removed) {
1896 			thread_run_queue_reinsert(thread, SCHED_TAILQ);
1897 		}
1898 	}
1899 
1900 	thread_unlock(thread);
1901 }
1902 
1903 /*
1904  * thread_bind_internal:
1905  *
1906  * If the specified thread is not the current thread, and it is currently
1907  * running on another CPU, a remote AST must be sent to that CPU to cause
1908  * the thread to migrate to its bound processor. Otherwise, the migration
1909  * will occur at the next quantum expiration or blocking point.
1910  *
1911  * When the thread is the current thread, and explicit thread_block() should
1912  * be used to force the current processor to context switch away and
1913  * let the thread migrate to the bound processor.
1914  *
1915  * Thread must be locked, and at splsched.
1916  */
1917 
1918 static processor_t
thread_bind_internal(thread_t thread,processor_t processor)1919 thread_bind_internal(
1920 	thread_t                thread,
1921 	processor_t             processor)
1922 {
1923 	processor_t             prev;
1924 
1925 	/* <rdar://problem/15102234> */
1926 	assert(thread->sched_pri < BASEPRI_RTQUEUES);
1927 	/* A thread can't be bound if it's sitting on a (potentially incorrect) runqueue */
1928 	thread_assert_runq_null(thread);
1929 
1930 	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_THREAD_BIND),
1931 	    thread_tid(thread), processor ? processor->cpu_id : ~0ul, 0, 0, 0);
1932 
1933 	prev = thread->bound_processor;
1934 	thread->bound_processor = processor;
1935 
1936 	return prev;
1937 }
1938 
1939 /*
1940  * thread_vm_bind_group_add:
1941  *
1942  * The "VM bind group" is a special mechanism to mark a collection
1943  * of threads from the VM subsystem that, in general, should be scheduled
1944  * with only one CPU of parallelism. To accomplish this, we initially
1945  * bind all the threads to the master processor, which has the effect
1946  * that only one of the threads in the group can execute at once, including
1947  * preempting threads in the group that are a lower priority. Future
1948  * mechanisms may use more dynamic mechanisms to prevent the collection
1949  * of VM threads from using more CPU time than desired.
1950  *
1951  * The current implementation can result in priority inversions where
1952  * compute-bound priority 95 or realtime threads that happen to have
1953  * landed on the master processor prevent the VM threads from running.
1954  * When this situation is detected, we unbind the threads for one
 * scheduler tick to allow the scheduler to run the threads on
1956  * additional CPUs, before restoring the binding (assuming high latency
1957  * is no longer a problem).
1958  */
1959 
1960 /*
1961  * The current max is provisioned for:
1962  * vm_compressor_swap_trigger_thread (92)
1963  * 2 x vm_pageout_iothread_internal (92) when vm_restricted_to_single_processor==TRUE
1964  * vm_pageout_continue (92)
1965  * memorystatus_thread (95)
1966  */
/* Capacity of sched_vm_group_thread_list[]. */
#define MAX_VM_BIND_GROUP_COUNT (5)
/* Held while threads are appended to the list and while the maintenance pass walks it. */
decl_simple_lock_data(static, sched_vm_group_list_lock);
/* Members of the VM bind group; entries [0, sched_vm_group_thread_count) are in use. */
static thread_t sched_vm_group_thread_list[MAX_VM_BIND_GROUP_COUNT];
/* Number of valid entries in sched_vm_group_thread_list[]. */
static int sched_vm_group_thread_count;
/* TRUE while sched_vm_group_maintenance() has the group unbound from master_processor. */
static boolean_t sched_vm_group_temporarily_unbound = FALSE;
1972 
1973 void
thread_vm_bind_group_add(void)1974 thread_vm_bind_group_add(void)
1975 {
1976 	thread_t self = current_thread();
1977 
1978 	thread_reference(self);
1979 	self->options |= TH_OPT_SCHED_VM_GROUP;
1980 
1981 	simple_lock(&sched_vm_group_list_lock, LCK_GRP_NULL);
1982 	assert(sched_vm_group_thread_count < MAX_VM_BIND_GROUP_COUNT);
1983 	sched_vm_group_thread_list[sched_vm_group_thread_count++] = self;
1984 	simple_unlock(&sched_vm_group_list_lock);
1985 
1986 	thread_bind(master_processor);
1987 
1988 	/* Switch to bound processor if not already there */
1989 	thread_block(THREAD_CONTINUE_NULL);
1990 }
1991 
/*
 * sched_vm_group_maintenance:
 *
 * Inspects every thread in the VM bind group and decides whether the group
 * should be temporarily unbound from, or rebound to, master_processor.
 *
 * Pass 1 (observe): for each runnable, non-waiting thread, note whether it
 * has been sitting on a run queue since before (now - sched_tick_interval)
 * ("high latency"), and whether any thread is runnable but not on a run queue.
 *
 * Decision:
 *  - currently unbound, no high latency seen          -> rebind to master_processor
 *  - currently bound, high latency and no thread that
 *    is runnable-but-off-runqueue                      -> unbind (PROCESSOR_NULL)
 *
 * Pass 2 (apply): change each thread's binding, pulling it off its run queue
 * first and reinserting it afterwards. A rebind that cannot be applied to a
 * thread right now is deferred by leaving sched_vm_group_temporarily_unbound
 * set so that a later call retries.
 *
 * sched_vm_group_list_lock is held for the whole function; each pass runs
 * at splsched and takes the individual thread locks.
 */
static void
sched_vm_group_maintenance(void)
{
	uint64_t ctime = mach_absolute_time();
	/* Threads made runnable before this timestamp count as "high latency" */
	uint64_t longtime = ctime - sched_tick_interval;
	int i;
	spl_t s;
	boolean_t high_latency_observed = FALSE;
	boolean_t runnable_and_not_on_runq_observed = FALSE;
	boolean_t bind_target_changed = FALSE;
	processor_t bind_target = PROCESSOR_NULL;

	/* Make sure nobody attempts to add new threads while we are enumerating them */
	simple_lock(&sched_vm_group_list_lock, LCK_GRP_NULL);

	s = splsched();

	/* Pass 1: observe the state of every group member */
	for (i = 0; i < sched_vm_group_thread_count; i++) {
		thread_t thread = sched_vm_group_thread_list[i];
		assert(thread != THREAD_NULL);
		thread_lock(thread);
		/* Only look at threads that are runnable and not waiting */
		if ((thread->state & (TH_RUN | TH_WAIT)) == TH_RUN) {
			if (thread_get_runq(thread) != PROCESSOR_NULL && thread->last_made_runnable_time < longtime) {
				high_latency_observed = TRUE;
			} else if (thread_get_runq(thread) == PROCESSOR_NULL) {
				/* There are some cases where a thread may be transitioning that also fall into this case */
				runnable_and_not_on_runq_observed = TRUE;
			}
		}
		thread_unlock(thread);

		if (high_latency_observed && runnable_and_not_on_runq_observed) {
			/* All the things we are looking for are true, stop looking */
			break;
		}
	}

	splx(s);

	if (sched_vm_group_temporarily_unbound) {
		/* If we turned off binding, make sure everything is OK before rebinding */
		if (!high_latency_observed) {
			/* rebind */
			bind_target_changed = TRUE;
			bind_target = master_processor;
			sched_vm_group_temporarily_unbound = FALSE; /* might be reset to TRUE if change cannot be completed */
		}
	} else {
		/*
		 * Check if we're in a bad state, which is defined by high
		 * latency with no core currently executing a thread. If a
		 * single thread is making progress on a CPU, that means the
		 * binding concept to reduce parallelism is working as
		 * designed.
		 */
		if (high_latency_observed && !runnable_and_not_on_runq_observed) {
			/* unbind */
			bind_target_changed = TRUE;
			bind_target = PROCESSOR_NULL;
			sched_vm_group_temporarily_unbound = TRUE;
		}
	}

	/* Pass 2: apply the new bind target to every group member */
	if (bind_target_changed) {
		s = splsched();
		for (i = 0; i < sched_vm_group_thread_count; i++) {
			thread_t thread = sched_vm_group_thread_list[i];
			boolean_t removed;
			assert(thread != THREAD_NULL);

			thread_lock(thread);
			/* thread_bind_internal() asserts that the thread is not on a run queue */
			removed = thread_run_queue_remove(thread);
			/* Safe to change now if we dequeued it, or if it is waiting and not running */
			if (removed || ((thread->state & (TH_RUN | TH_WAIT)) == TH_WAIT)) {
				thread_bind_internal(thread, bind_target);
			} else {
				/*
				 * Thread was in the middle of being context-switched-to,
				 * or was in the process of blocking. To avoid switching the bind
				 * state out mid-flight, defer the change if possible.
				 */
				if (bind_target == PROCESSOR_NULL) {
					/* Unbinding is applied immediately even in this state */
					thread_bind_internal(thread, bind_target);
				} else {
					sched_vm_group_temporarily_unbound = TRUE; /* next pass will try again */
				}
			}

			/* Put a dequeued thread back on a run queue */
			if (removed) {
				thread_run_queue_reinsert(thread, SCHED_PREEMPT | SCHED_TAILQ);
			}
			thread_unlock(thread);
		}
		splx(s);
	}

	simple_unlock(&sched_vm_group_list_lock);
}
2089 
/* Default for sched_avoid_cpu0: 1 when building for x86_64, 0 otherwise. */
#if defined(__x86_64__)
#define SCHED_AVOID_CPU0 1
#else
#define SCHED_AVOID_CPU0 0
#endif

/* When zero, only primary CPUs are counted by pset_available_but_not_running_rt_threads_cpumap(). */
int sched_allow_rt_smt = 1;
/* Consulted by thread_select() on x86_64: 1 prefers cpu0 as the backup, 2 prefers SMT secondaries as backups. */
int sched_avoid_cpu0 = SCHED_AVOID_CPU0;
/* When nonzero, thread_select() may signal a CPU in another pset to pick up (spill) stealable RT threads. */
int sched_allow_rt_steal = 1;
int sched_backup_cpu_timeout_count = 5; /* The maximum number of 10us delays to wait before using a backup cpu */

/* Extra CPUs counted by pset_needs_a_followup_IPI() when the RT run queue is low latency. */
int sched_rt_n_backup_processors = SCHED_DEFAULT_BACKUP_PROCESSORS;
2102 
2103 int
sched_get_rt_n_backup_processors(void)2104 sched_get_rt_n_backup_processors(void)
2105 {
2106 	return sched_rt_n_backup_processors;
2107 }
2108 
2109 void
sched_set_rt_n_backup_processors(int n)2110 sched_set_rt_n_backup_processors(int n)
2111 {
2112 	if (n < 0) {
2113 		n = 0;
2114 	} else if (n > SCHED_MAX_BACKUP_PROCESSORS) {
2115 		n = SCHED_MAX_BACKUP_PROCESSORS;
2116 	}
2117 
2118 	sched_rt_n_backup_processors = n;
2119 }
2120 
/*
 * When nonzero, thread_select() always picks a higher-priority RT thread off
 * the run queue over the current RT thread. When zero, it first checks
 * whether the current thread can keep running without the queued thread
 * missing its constraint.
 */
int sched_rt_runq_strict_priority = false;
2122 
2123 inline static processor_set_t
change_locked_pset(processor_set_t current_pset,processor_set_t new_pset)2124 change_locked_pset(processor_set_t current_pset, processor_set_t new_pset)
2125 {
2126 	if (current_pset != new_pset) {
2127 		pset_unlock(current_pset);
2128 		pset_lock(new_pset);
2129 	}
2130 
2131 	return new_pset;
2132 }
2133 
2134 /*
2135  * Invoked prior to idle entry to determine if, on SMT capable processors, an SMT
2136  * rebalancing opportunity exists when a core is (instantaneously) idle, but
2137  * other SMT-capable cores may be over-committed. TODO: some possible negatives:
2138  * IPI thrash if this core does not remain idle following the load balancing ASTs
2139  * Idle "thrash", when IPI issue is followed by idle entry/core power down
2140  * followed by a wakeup shortly thereafter.
2141  */
2142 
#if (DEVELOPMENT || DEBUG)
/* Set to 0 to make sched_SMT_balance() skip rebalancing; only exists on DEVELOPMENT/DEBUG builds. */
int sched_smt_balance = 1;
#endif
2146 
/*
 * sched_SMT_balance:
 *
 * Invoked with pset locked, returns with pset unlocked.
 *
 * If cprocessor is an SMT processor whose sibling is idle, looks through the
 * running secondary CPUs of cpset for one whose primary is also running and
 * whose current priority is below BASEPRI_RTQUEUES, and sends it an
 * SMT-rebalance IPI. At most one CPU is signalled per call.
 *
 * cprocessor must be the current processor.
 * Always returns false.
 */
bool
sched_SMT_balance(processor_t cprocessor, processor_set_t cpset)
{
	/* CPU chosen to receive the rebalance IPI, if any */
	processor_t ast_processor = NULL;

#if (DEVELOPMENT || DEBUG)
	if (__improbable(sched_smt_balance == 0)) {
		goto smt_balance_exit;
	}
#endif

	assert(cprocessor == current_processor());
	if (cprocessor->is_SMT == FALSE) {
		goto smt_balance_exit;
	}

	/* The sibling is our secondary if we have one, otherwise it is our primary */
	processor_t sib_processor = cprocessor->processor_secondary ? cprocessor->processor_secondary : cprocessor->processor_primary;

	/* Determine if both this processor and its sibling are idle,
	 * indicating an SMT rebalancing opportunity.
	 */
	if (sib_processor->state != PROCESSOR_IDLE) {
		goto smt_balance_exit;
	}

	processor_t sprocessor;

	sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
	/* Running CPUs in this pset that are not primaries */
	uint64_t running_secondary_map = (cpset->cpu_state_map[PROCESSOR_RUNNING] &
	    ~cpset->primary_map);
	for (int cpuid = lsb_first(running_secondary_map); cpuid >= 0; cpuid = lsb_next(running_secondary_map, cpuid)) {
		sprocessor = processor_array[cpuid];
		/* Candidate: its primary is running too, and it is not at RT priority */
		if ((sprocessor->processor_primary->state == PROCESSOR_RUNNING) &&
		    (sprocessor->current_pri < BASEPRI_RTQUEUES)) {
			ipi_type = sched_ipi_action(sprocessor, NULL, SCHED_IPI_EVENT_SMT_REBAL);
			if (ipi_type != SCHED_IPI_NONE) {
				assert(sprocessor != cprocessor);
				ast_processor = sprocessor;
				break;
			}
		}
	}

smt_balance_exit:
	pset_unlock(cpset);

	/* The IPI itself is delivered after the pset lock has been dropped */
	if (ast_processor) {
		KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_SMT_BALANCE), ast_processor->cpu_id, ast_processor->state, ast_processor->processor_primary->state, 0, 0);
		sched_ipi_perform(ast_processor, ipi_type);
	}
	return false;
}
2200 
2201 static cpumap_t
pset_available_cpumap(processor_set_t pset)2202 pset_available_cpumap(processor_set_t pset)
2203 {
2204 	return pset->cpu_available_map & pset->recommended_bitmask;
2205 }
2206 
2207 int
pset_available_cpu_count(processor_set_t pset)2208 pset_available_cpu_count(processor_set_t pset)
2209 {
2210 	return bit_count(pset_available_cpumap(pset));
2211 }
2212 
2213 bool
pset_is_recommended(processor_set_t pset)2214 pset_is_recommended(processor_set_t pset)
2215 {
2216 	if (!pset) {
2217 		return false;
2218 	}
2219 	return pset_available_cpu_count(pset) > 0;
2220 }
2221 
2222 static cpumap_t
pset_available_but_not_running_cpumap(processor_set_t pset)2223 pset_available_but_not_running_cpumap(processor_set_t pset)
2224 {
2225 	return (pset->cpu_state_map[PROCESSOR_IDLE] | pset->cpu_state_map[PROCESSOR_DISPATCHING]) &
2226 	       pset->recommended_bitmask;
2227 }
2228 
2229 bool
pset_has_stealable_threads(processor_set_t pset)2230 pset_has_stealable_threads(processor_set_t pset)
2231 {
2232 	pset_assert_locked(pset);
2233 
2234 	cpumap_t avail_map = pset_available_but_not_running_cpumap(pset);
2235 	/*
2236 	 * Secondary CPUs never steal, so allow stealing of threads if there are more threads than
2237 	 * available primary CPUs
2238 	 */
2239 	avail_map &= pset->primary_map;
2240 
2241 	return (pset->pset_runq.count > 0) && ((pset->pset_runq.count + rt_runq_count(pset)) > bit_count(avail_map));
2242 }
2243 
2244 static cpumap_t
pset_available_but_not_running_rt_threads_cpumap(processor_set_t pset)2245 pset_available_but_not_running_rt_threads_cpumap(processor_set_t pset)
2246 {
2247 	cpumap_t avail_map = pset_available_cpumap(pset);
2248 	if (!sched_allow_rt_smt) {
2249 		/*
2250 		 * Secondary CPUs are not allowed to run RT threads, so
2251 		 * only primary CPUs should be included
2252 		 */
2253 		avail_map &= pset->primary_map;
2254 	}
2255 
2256 	return avail_map & ~pset->realtime_map;
2257 }
2258 
2259 static bool
pset_needs_a_followup_IPI(processor_set_t pset)2260 pset_needs_a_followup_IPI(processor_set_t pset)
2261 {
2262 	int nbackup_cpus = 0;
2263 
2264 	if (rt_runq_is_low_latency(pset)) {
2265 		nbackup_cpus = sched_rt_n_backup_processors;
2266 	}
2267 
2268 	int rt_rq_count = rt_runq_count(pset);
2269 
2270 	return (rt_rq_count > 0) && ((rt_rq_count + nbackup_cpus - bit_count(pset->pending_AST_URGENT_cpu_mask)) > 0);
2271 }
2272 
2273 bool
pset_has_stealable_rt_threads(processor_set_t pset)2274 pset_has_stealable_rt_threads(processor_set_t pset)
2275 {
2276 	pset_node_t node = pset->node;
2277 	if (bit_count(node->pset_map) == 1) {
2278 		return false;
2279 	}
2280 
2281 	cpumap_t avail_map = pset_available_but_not_running_rt_threads_cpumap(pset);
2282 
2283 	return rt_runq_count(pset) > bit_count(avail_map);
2284 }
2285 
2286 static void
pset_update_rt_stealable_state(processor_set_t pset)2287 pset_update_rt_stealable_state(processor_set_t pset)
2288 {
2289 	if (pset_has_stealable_rt_threads(pset)) {
2290 		pset->stealable_rt_threads_earliest_deadline = rt_runq_earliest_deadline(pset);
2291 	} else {
2292 		pset->stealable_rt_threads_earliest_deadline = RT_DEADLINE_NONE;
2293 	}
2294 }
2295 
2296 static void
clear_pending_AST_bits(processor_set_t pset,processor_t processor,__kdebug_only const int trace_point_number)2297 clear_pending_AST_bits(processor_set_t pset, processor_t processor, __kdebug_only const int trace_point_number)
2298 {
2299 	/* Acknowledge any pending IPIs here with pset lock held */
2300 	pset_assert_locked(pset);
2301 	if (bit_clear_if_set(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
2302 		KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_END,
2303 		    processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, 0, trace_point_number);
2304 	}
2305 	bit_clear(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
2306 
2307 #if defined(CONFIG_SCHED_DEFERRED_AST)
2308 	bit_clear(pset->pending_deferred_AST_cpu_mask, processor->cpu_id);
2309 #endif
2310 }
2311 
/*
 * Called with pset locked, on a processor that is committing to run a new thread
 * Will transition an idle or dispatching processor to running as it picks up
 * the first new thread from the idle thread.
 *
 * Also:
 *  - updates the processor's cached state from new_thread,
 *  - sets or clears the processor's bit in pset->realtime_map depending on
 *    whether new_thread is at or above BASEPRI_RTQUEUES,
 *  - refreshes the pset's stealable-RT deadline,
 *  - on nodes with more than one pset, updates the node's pset_non_rt_map and
 *    pset_non_rt_primary_map bits for this pset.
 */
static void
pset_commit_processor_to_new_thread(processor_set_t pset, processor_t processor, thread_t new_thread)
{
	pset_assert_locked(pset);

	if (processor->state == PROCESSOR_DISPATCHING || processor->state == PROCESSOR_IDLE) {
		assert(current_thread() == processor->idle_thread);

		/*
		 * Dispatching processor is now committed to running new_thread,
		 * so change its state to PROCESSOR_RUNNING.
		 */
		pset_update_processor_state(pset, processor, PROCESSOR_RUNNING);
	} else {
		assert((processor->state == PROCESSOR_RUNNING) || (processor->state == PROCESSOR_SHUTDOWN));
	}

	processor_state_update_from_thread(processor, new_thread, true);

	/* Track whether this CPU is now running an RT-priority thread */
	if (new_thread->sched_pri >= BASEPRI_RTQUEUES) {
		bit_set(pset->realtime_map, processor->cpu_id);
	} else {
		bit_clear(pset->realtime_map, processor->cpu_id);
	}
	/* realtime_map feeds pset_has_stealable_rt_threads(), so refresh the advertised deadline */
	pset_update_rt_stealable_state(pset);

	pset_node_t node = pset->node;

	if (bit_count(node->pset_map) == 1) {
		/* Node has only a single pset, so skip node pset map updates */
		return;
	}

	cpumap_t avail_map = pset_available_cpumap(pset);

	if (new_thread->sched_pri >= BASEPRI_RTQUEUES) {
		/* Going RT: clear the node bits if every available CPU is now running RT */
		if ((avail_map & pset->realtime_map) == avail_map) {
			/* No more non-RT CPUs in this pset */
			atomic_bit_clear(&node->pset_non_rt_map, pset->pset_id, memory_order_relaxed);
		}
		avail_map &= pset->primary_map;
		if ((avail_map & pset->realtime_map) == avail_map) {
			/* No more non-RT primary CPUs in this pset */
			atomic_bit_clear(&node->pset_non_rt_primary_map, pset->pset_id, memory_order_relaxed);
		}
	} else {
		/* Going non-RT: set the node bits if some available CPU is not running RT */
		if ((avail_map & pset->realtime_map) != avail_map) {
			/* Test first so the atomic set is only issued when the bit is currently clear */
			if (!bit_test(atomic_load(&node->pset_non_rt_map), pset->pset_id)) {
				atomic_bit_set(&node->pset_non_rt_map, pset->pset_id, memory_order_relaxed);
			}
		}
		avail_map &= pset->primary_map;
		if ((avail_map & pset->realtime_map) != avail_map) {
			if (!bit_test(atomic_load(&node->pset_non_rt_primary_map), pset->pset_id)) {
				atomic_bit_set(&node->pset_non_rt_primary_map, pset->pset_id, memory_order_relaxed);
			}
		}
	}
}
2376 
/*
 * Forward declarations of file-local realtime helpers that are used below
 * and defined elsewhere in this file.
 */
static processor_t choose_processor_for_realtime_thread(processor_set_t pset, processor_t skip_processor, bool consider_secondaries, bool skip_spills);
static processor_t choose_furthest_deadline_processor_for_realtime_thread(processor_set_t pset, int max_pri, uint64_t minimum_deadline,
    processor_t skip_processor, bool skip_spills, bool include_ast_urgent_pending_cpus);
static processor_t choose_next_processor_for_realtime_thread(processor_set_t pset, int max_pri, uint64_t minimum_deadline, processor_t skip_processor, bool consider_secondaries);
/* These two are only declared when building for x86_64 */
#if defined(__x86_64__)
static bool all_available_primaries_are_running_realtime_threads(processor_set_t pset, bool include_backups);
static bool these_processors_are_running_realtime_threads(processor_set_t pset, uint64_t these_map, bool include_backups);
#endif
static bool sched_ok_to_run_realtime_thread(processor_set_t pset, processor_t processor, bool as_backup);
static bool processor_is_fast_track_candidate_for_realtime_thread(processor_set_t pset, processor_t processor);
2387 
2388 static bool
other_psets_have_earlier_rt_threads_pending(processor_set_t stealing_pset,uint64_t earliest_deadline)2389 other_psets_have_earlier_rt_threads_pending(processor_set_t stealing_pset, uint64_t earliest_deadline)
2390 {
2391 	pset_map_t pset_map = stealing_pset->node->pset_map;
2392 
2393 	bit_clear(pset_map, stealing_pset->pset_id);
2394 
2395 	for (int pset_id = lsb_first(pset_map); pset_id >= 0; pset_id = lsb_next(pset_map, pset_id)) {
2396 		processor_set_t nset = pset_array[pset_id];
2397 
2398 		if (deadline_add(nset->stealable_rt_threads_earliest_deadline, rt_deadline_epsilon) < earliest_deadline) {
2399 			return true;
2400 		}
2401 	}
2402 
2403 	return false;
2404 }
2405 
/*
 * choose_next_rt_processor_for_IPI:
 *
 * Pick another processor to signal so that it can run an RT thread queued on
 * starting_pset.
 *
 * starting_pset must be locked, but returns true if it is unlocked before return
 *
 * starting_pset     pset whose RT run queue supplies the priority and deadline
 * chosen_processor  passed to the search as the processor to skip
 * spill_ipi         false: search starting_pset only.
 *                   true: search the other psets, starting with
 *                   next_pset(starting_pset) and stopping once the walk
 *                   comes back around to starting_pset.
 * result_processor  set to the chosen processor; left untouched if none is found
 * result_ipi_type   set to the IPI type for that processor; left untouched
 *                   if none is found
 *
 * The IPI is not sent here; the caller gets the processor and the IPI type
 * back through the two out-parameters. Any other pset lock taken during the
 * walk is released before returning.
 */
static bool
choose_next_rt_processor_for_IPI(processor_set_t starting_pset, processor_t chosen_processor, bool spill_ipi,
    processor_t *result_processor, sched_ipi_type_t *result_ipi_type)
{
	bool starting_pset_is_unlocked = false;
	/* Sample the starting pset's RT run queue before any lock hand-off below */
	uint64_t earliest_deadline = rt_runq_earliest_deadline(starting_pset);
	int max_pri = rt_runq_priority(starting_pset);
	__kdebug_only uint64_t spill_tid = thread_tid(rt_runq_first(&starting_pset->rt_runq));
	/* pset is the processor set whose lock we currently hold */
	processor_set_t pset = starting_pset;
	processor_t next_rt_processor = PROCESSOR_NULL;
	if (spill_ipi) {
		/* A spill targets a different pset, so move on from the starting pset right away */
		processor_set_t nset = next_pset(pset);
		assert(nset != starting_pset);
		pset = change_locked_pset(pset, nset);
		starting_pset_is_unlocked = true;
	}
	do {
		const bool consider_secondaries = true;
		next_rt_processor = choose_next_processor_for_realtime_thread(pset, max_pri, earliest_deadline, chosen_processor, consider_secondaries);
		if (next_rt_processor == PROCESSOR_NULL) {
			if (!spill_ipi) {
				/* Not spilling: only the starting pset is searched */
				break;
			}
			processor_set_t nset = next_pset(pset);
			if (nset == starting_pset) {
				/* Wrapped around without finding a processor */
				break;
			}
			pset = change_locked_pset(pset, nset);
			starting_pset_is_unlocked = true;
		}
	} while (next_rt_processor == PROCESSOR_NULL);
	if (next_rt_processor) {
		if (pset != starting_pset) {
			/* Mark the target CPU as having a spill pending; trace only when the bit was newly set */
			if (bit_set_if_clear(pset->rt_pending_spill_cpu_mask, next_rt_processor->cpu_id)) {
				KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_SIGNAL_SPILL) | DBG_FUNC_START,
				    next_rt_processor->cpu_id, pset->rt_pending_spill_cpu_mask, starting_pset->cpu_set_low, (uintptr_t)spill_tid);
			}
		}
		*result_ipi_type = sched_ipi_action(next_rt_processor, NULL, SCHED_IPI_EVENT_RT_PREEMPT);
		*result_processor = next_rt_processor;
	}
	/* Drop the lock of whichever other pset we ended on; starting_pset is not re-locked */
	if (pset != starting_pset) {
		pset_unlock(pset);
	}

	return starting_pset_is_unlocked;
}
2456 
2457 /*
2458  * backup processor - used by choose_processor to send a backup IPI to in case the preferred processor can't immediately respond
2459  * followup processor - used in thread_select when there are still threads on the run queue and available processors
2460  * spill processor - a processor in a different processor set that is signalled to steal a thread from this run queue
2461  */
/* Kind of "next processor" chosen; thread_select() records it in its MACH_SCHED_NEXT_PROCESSOR trace point. */
typedef enum {
	none,           /* no next processor was chosen */
	backup,         /* backup processor, in case the preferred processor can't respond immediately */
	followup,       /* follow-up processor within the same pset */
	spill           /* processor in a different pset, signalled to steal from this run queue */
} next_processor_type_t;
2468 
/*
 * LOOP_COUNT is forced off here. Removing the #undef (and defining
 * LOOP_COUNT) compiles in the restart-loop counters used by thread_select().
 */
#undef LOOP_COUNT
#ifdef LOOP_COUNT
/* Highest restart-loop count seen so far, indexed by cpu_id */
int max_loop_count[MAX_SCHED_CPUS] = { 0 };
#endif
2473 
2474 /*
2475  *	thread_select:
2476  *
2477  *	Select a new thread for the current processor to execute.
2478  *
2479  *	May select the current thread, which must be locked.
2480  */
2481 static thread_t
thread_select(thread_t thread,processor_t processor,ast_t * reason)2482 thread_select(thread_t          thread,
2483     processor_t       processor,
2484     ast_t            *reason)
2485 {
2486 	processor_set_t         pset = processor->processor_set;
2487 	thread_t                        new_thread = THREAD_NULL;
2488 
2489 	assert(processor == current_processor());
2490 	assert((thread->state & (TH_RUN | TH_TERMINATE2)) == TH_RUN);
2491 
2492 	KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_START,
2493 	    0, pset->pending_AST_URGENT_cpu_mask, 0, 0);
2494 
2495 	__kdebug_only int idle_reason = 0;
2496 	__kdebug_only int delay_count = 0;
2497 
2498 #if defined(__x86_64__)
2499 	int timeout_count = sched_backup_cpu_timeout_count;
2500 	if ((sched_avoid_cpu0 == 1) && (processor->cpu_id == 0)) {
2501 		/* Prefer cpu0 as backup */
2502 		timeout_count--;
2503 	} else if ((sched_avoid_cpu0 == 2) && (processor->processor_primary != processor)) {
2504 		/* Prefer secondary cpu as backup */
2505 		timeout_count--;
2506 	}
2507 #endif
2508 	bool pending_AST_URGENT = false;
2509 	bool pending_AST_PREEMPT = false;
2510 
2511 #ifdef LOOP_COUNT
2512 	int loop_count = -1;
2513 #endif
2514 
2515 	do {
2516 		/*
2517 		 *	Update the priority.
2518 		 */
2519 		if (SCHED(can_update_priority)(thread)) {
2520 			SCHED(update_priority)(thread);
2521 		}
2522 
2523 		pset_lock(pset);
2524 
2525 restart:
2526 #ifdef LOOP_COUNT
2527 		loop_count++;
2528 		if (loop_count > max_loop_count[processor->cpu_id]) {
2529 			max_loop_count[processor->cpu_id] = loop_count;
2530 			if (bit_count(loop_count) == 1) {
2531 				kprintf("[%d]%s>max_loop_count = %d\n", processor->cpu_id, __FUNCTION__, loop_count);
2532 			}
2533 		}
2534 #endif
2535 		pending_AST_URGENT = bit_test(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id);
2536 		pending_AST_PREEMPT = bit_test(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
2537 
2538 		processor_state_update_from_thread(processor, thread, true);
2539 
2540 		idle_reason = 0;
2541 
2542 		processor_t ast_processor = PROCESSOR_NULL;
2543 		processor_t next_rt_processor = PROCESSOR_NULL;
2544 		sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
2545 		sched_ipi_type_t next_rt_ipi_type = SCHED_IPI_NONE;
2546 
2547 		assert(processor->state != PROCESSOR_OFF_LINE);
2548 
2549 		/*
2550 		 * Bound threads are dispatched to a processor without going through
2551 		 * choose_processor(), so in those cases we must continue trying to dequeue work
2552 		 * as we are the only option.
2553 		 */
2554 		if (!SCHED(processor_bound_count)(processor)) {
2555 			if (!processor->is_recommended) {
2556 				/*
2557 				 * The performance controller has provided a hint to not dispatch more threads,
2558 				 */
2559 				idle_reason = 1;
2560 				goto send_followup_ipi_before_idle;
2561 			} else if (rt_runq_count(pset)) {
2562 				bool ok_to_run_realtime_thread = sched_ok_to_run_realtime_thread(pset, processor, false);
2563 				/* Give the current RT thread a chance to complete */
2564 				ok_to_run_realtime_thread |= (thread->sched_pri >= BASEPRI_RTQUEUES && processor->first_timeslice);
2565 #if defined(__x86_64__)
2566 				/*
2567 				 * On Intel we want to avoid SMT secondary processors and processor 0
2568 				 * but allow them to be used as backup processors in case the preferred chosen
2569 				 * processor is delayed by interrupts or processor stalls.  So if it is
2570 				 * not ok_to_run_realtime_thread as preferred (sched_ok_to_run_realtime_thread(pset, processor, as_backup=false))
2571 				 * but ok_to_run_realtime_thread as backup (sched_ok_to_run_realtime_thread(pset, processor, as_backup=true))
2572 				 * we delay up to (timeout_count * 10us) to give the preferred processor chance
2573 				 * to grab the thread before the (current) backup processor does.
2574 				 *
2575 				 * timeout_count defaults to 5 but can be tuned using sysctl kern.sched_backup_cpu_timeout_count
2576 				 * on DEVELOPMENT || DEBUG kernels.  It is also adjusted (see above) depending on whether we want to use
2577 				 * cpu0 before secondary cpus or not.
2578 				 */
2579 				if (!ok_to_run_realtime_thread) {
2580 					if (sched_ok_to_run_realtime_thread(pset, processor, true)) {
2581 						if (timeout_count-- > 0) {
2582 							pset_unlock(pset);
2583 							thread_unlock(thread);
2584 							delay(10);
2585 							delay_count++;
2586 							thread_lock(thread);
2587 							pset_lock(pset);
2588 							goto restart;
2589 						}
2590 						ok_to_run_realtime_thread = true;
2591 					}
2592 				}
2593 #endif
2594 				if (!ok_to_run_realtime_thread) {
2595 					idle_reason = 2;
2596 					goto send_followup_ipi_before_idle;
2597 				}
2598 			} else if (processor->processor_primary != processor) {
2599 				/*
2600 				 * Should this secondary SMT processor attempt to find work? For pset runqueue systems,
2601 				 * we should look for work only under the same conditions that choose_processor()
2602 				 * would have assigned work, which is when all primary processors have been assigned work.
2603 				 */
2604 				if ((pset->recommended_bitmask & pset->primary_map & pset->cpu_state_map[PROCESSOR_IDLE]) != 0) {
2605 					/* There are idle primaries */
2606 					idle_reason = 3;
2607 					goto idle;
2608 				}
2609 			}
2610 		}
2611 
2612 		/*
2613 		 *	Test to see if the current thread should continue
2614 		 *	to run on this processor.  Must not be attempting to wait, and not
2615 		 *	bound to a different processor, nor be in the wrong
2616 		 *	processor set, nor be forced to context switch by TH_SUSP.
2617 		 *
2618 		 *	Note that there are never any RT threads in the regular runqueue.
2619 		 *
2620 		 *	This code is very insanely tricky.
2621 		 */
2622 
2623 		/* i.e. not waiting, not TH_SUSP'ed */
2624 		bool still_running = ((thread->state & (TH_TERMINATE | TH_IDLE | TH_WAIT | TH_RUN | TH_SUSP)) == TH_RUN);
2625 
2626 		/*
2627 		 * Threads running on SMT processors are forced to context switch. Don't rebalance realtime threads.
2628 		 * TODO: This should check if it's worth it to rebalance, i.e. 'are there any idle primary processors'
2629 		 *       <rdar://problem/47907700>
2630 		 *
2631 		 * A yielding thread shouldn't be forced to context switch.
2632 		 */
2633 
2634 		bool is_yielding         = (*reason & AST_YIELD) == AST_YIELD;
2635 
2636 		bool needs_smt_rebalance = !is_yielding && thread->sched_pri < BASEPRI_RTQUEUES && processor->processor_primary != processor;
2637 
2638 		bool affinity_mismatch   = thread->affinity_set != AFFINITY_SET_NULL && thread->affinity_set->aset_pset != pset;
2639 
2640 		bool bound_elsewhere     = thread->bound_processor != PROCESSOR_NULL && thread->bound_processor != processor;
2641 
2642 		bool avoid_processor     = !is_yielding && SCHED(avoid_processor_enabled) && SCHED(thread_avoid_processor)(processor, thread, *reason);
2643 
2644 		bool ok_to_run_realtime_thread = sched_ok_to_run_realtime_thread(pset, processor, true);
2645 
2646 		bool current_thread_can_keep_running = (still_running && !needs_smt_rebalance && !affinity_mismatch && !bound_elsewhere && !avoid_processor);
2647 		if (current_thread_can_keep_running) {
2648 			/*
2649 			 * This thread is eligible to keep running on this processor.
2650 			 *
2651 			 * RT threads with un-expired quantum stay on processor,
2652 			 * unless there's a valid RT thread with an earlier deadline
2653 			 * and it is still ok_to_run_realtime_thread.
2654 			 */
2655 			if (thread->sched_pri >= BASEPRI_RTQUEUES && processor->first_timeslice) {
2656 				/*
2657 				 * Pick a new RT thread only if ok_to_run_realtime_thread
2658 				 * (but the current thread is allowed to complete).
2659 				 */
2660 				if (ok_to_run_realtime_thread) {
2661 					if (bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) {
2662 						goto pick_new_rt_thread;
2663 					}
2664 					if (rt_runq_priority(pset) > thread->sched_pri) {
2665 						if (sched_rt_runq_strict_priority) {
2666 							/* The next RT thread is better, so pick it off the runqueue. */
2667 							goto pick_new_rt_thread;
2668 						}
2669 
2670 						/*
2671 						 * See if the current lower priority thread can continue to run without causing
2672 						 * the higher priority thread on the runq queue to miss its deadline.
2673 						 */
2674 						thread_t hi_thread = rt_runq_first(SCHED(rt_runq)(pset));
2675 						if (thread->realtime.computation + hi_thread->realtime.computation + rt_deadline_epsilon >= hi_thread->realtime.constraint) {
2676 							/* The next RT thread is better, so pick it off the runqueue. */
2677 							goto pick_new_rt_thread;
2678 						}
2679 					} else if ((rt_runq_count(pset) > 0) && (deadline_add(rt_runq_earliest_deadline(pset), rt_deadline_epsilon) < thread->realtime.deadline)) {
2680 						/* The next RT thread is better, so pick it off the runqueue. */
2681 						goto pick_new_rt_thread;
2682 					}
2683 					if (other_psets_have_earlier_rt_threads_pending(pset, thread->realtime.deadline)) {
2684 						goto pick_new_rt_thread;
2685 					}
2686 				}
2687 
2688 				/* This is still the best RT thread to run. */
2689 				processor->deadline = thread->realtime.deadline;
2690 
2691 				sched_update_pset_load_average(pset, 0);
2692 
2693 				clear_pending_AST_bits(pset, processor, 1);
2694 
2695 				next_rt_processor = PROCESSOR_NULL;
2696 				next_rt_ipi_type = SCHED_IPI_NONE;
2697 
2698 				bool pset_unlocked = false;
2699 				__kdebug_only next_processor_type_t nptype = none;
2700 				if (sched_allow_rt_steal && pset_has_stealable_rt_threads(pset)) {
2701 					nptype = spill;
2702 					pset_unlocked = choose_next_rt_processor_for_IPI(pset, processor, true, &next_rt_processor, &next_rt_ipi_type);
2703 				} else if (pset_needs_a_followup_IPI(pset)) {
2704 					nptype = followup;
2705 					pset_unlocked = choose_next_rt_processor_for_IPI(pset, processor, false, &next_rt_processor, &next_rt_ipi_type);
2706 				}
2707 				if (!pset_unlocked) {
2708 					pset_unlock(pset);
2709 				}
2710 
2711 				if (next_rt_processor) {
2712 					KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_NEXT_PROCESSOR) | DBG_FUNC_NONE,
2713 					    next_rt_processor->cpu_id, next_rt_processor->state, nptype, 2);
2714 					sched_ipi_perform(next_rt_processor, next_rt_ipi_type);
2715 				}
2716 
2717 				KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
2718 				    (uintptr_t)thread_tid(thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 1);
2719 				return thread;
2720 			}
2721 
2722 			if ((rt_runq_count(pset) == 0) &&
2723 			    SCHED(processor_queue_has_priority)(processor, thread->sched_pri, TRUE) == FALSE) {
2724 				/* This thread is still the highest priority runnable (non-idle) thread */
2725 				processor->deadline = RT_DEADLINE_NONE;
2726 
2727 				sched_update_pset_load_average(pset, 0);
2728 
2729 				clear_pending_AST_bits(pset, processor, 2);
2730 
2731 				pset_unlock(pset);
2732 
2733 				KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
2734 				    (uintptr_t)thread_tid(thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 2);
2735 				return thread;
2736 			}
2737 		} else {
2738 			/*
2739 			 * This processor must context switch.
2740 			 * If it's due to a rebalance, we should aggressively find this thread a new home.
2741 			 */
2742 			if (needs_smt_rebalance || affinity_mismatch || bound_elsewhere || avoid_processor) {
2743 				*reason |= AST_REBALANCE;
2744 			}
2745 		}
2746 
2747 		bool secondary_forced_idle = ((processor->processor_secondary != PROCESSOR_NULL) &&
2748 		    (thread_no_smt(thread) || (thread->sched_pri >= BASEPRI_RTQUEUES)) &&
2749 		    (processor->processor_secondary->state == PROCESSOR_IDLE));
2750 
2751 		/* OK, so we're not going to run the current thread. Look at the RT queue. */
2752 		if (ok_to_run_realtime_thread) {
2753 pick_new_rt_thread:
2754 			new_thread = sched_rt_choose_thread(pset);
2755 			if (new_thread != THREAD_NULL) {
2756 				processor->deadline = new_thread->realtime.deadline;
2757 				pset_commit_processor_to_new_thread(pset, processor, new_thread);
2758 
2759 				clear_pending_AST_bits(pset, processor, 3);
2760 
2761 				if (processor->processor_secondary != NULL) {
2762 					processor_t sprocessor = processor->processor_secondary;
2763 					if ((sprocessor->state == PROCESSOR_RUNNING) || (sprocessor->state == PROCESSOR_DISPATCHING)) {
2764 						ipi_type = sched_ipi_action(sprocessor, NULL, SCHED_IPI_EVENT_SMT_REBAL);
2765 						ast_processor = sprocessor;
2766 					}
2767 				}
2768 			}
2769 		}
2770 
2771 send_followup_ipi_before_idle:
2772 		/* This might not have been cleared if we didn't call sched_rt_choose_thread() */
2773 		if (bit_clear_if_set(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) {
2774 			KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_SIGNAL_SPILL) | DBG_FUNC_END, processor->cpu_id, pset->rt_pending_spill_cpu_mask, 0, 5);
2775 		}
2776 		__kdebug_only next_processor_type_t nptype = none;
2777 		bool pset_unlocked = false;
2778 		if (sched_allow_rt_steal && pset_has_stealable_rt_threads(pset)) {
2779 			nptype = spill;
2780 			pset_unlocked = choose_next_rt_processor_for_IPI(pset, processor, true, &next_rt_processor, &next_rt_ipi_type);
2781 		} else if (pset_needs_a_followup_IPI(pset)) {
2782 			nptype = followup;
2783 			pset_unlocked = choose_next_rt_processor_for_IPI(pset, processor, false, &next_rt_processor, &next_rt_ipi_type);
2784 		}
2785 
2786 		assert(new_thread || !ast_processor);
2787 		if (new_thread || next_rt_processor) {
2788 			if (!pset_unlocked) {
2789 				pset_unlock(pset);
2790 				pset_unlocked = true;
2791 			}
2792 			if (ast_processor == next_rt_processor) {
2793 				ast_processor = PROCESSOR_NULL;
2794 				ipi_type = SCHED_IPI_NONE;
2795 			}
2796 
2797 			if (ast_processor) {
2798 				sched_ipi_perform(ast_processor, ipi_type);
2799 			}
2800 
2801 			if (next_rt_processor) {
2802 				KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_NEXT_PROCESSOR) | DBG_FUNC_NONE,
2803 				    next_rt_processor->cpu_id, next_rt_processor->state, nptype, 3);
2804 				sched_ipi_perform(next_rt_processor, next_rt_ipi_type);
2805 			}
2806 
2807 			if (new_thread) {
2808 				KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
2809 				    (uintptr_t)thread_tid(new_thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 3);
2810 				return new_thread;
2811 			}
2812 		}
2813 
2814 		if (pset_unlocked) {
2815 			pset_lock(pset);
2816 		}
2817 
2818 		if (!pending_AST_URGENT && bit_test(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
2819 			/* Things changed while we dropped the lock */
2820 			goto restart;
2821 		}
2822 
2823 		if (processor->is_recommended) {
2824 			bool spill_pending = bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id);
2825 			if (sched_ok_to_run_realtime_thread(pset, processor, true) && (spill_pending || rt_runq_count(pset))) {
2826 				/* Things changed while we dropped the lock */
2827 				goto restart;
2828 			}
2829 
2830 			if ((processor->processor_primary != processor) && (processor->processor_primary->current_pri >= BASEPRI_RTQUEUES)) {
2831 				/* secondary can only run realtime thread */
2832 				if (idle_reason == 0) {
2833 					idle_reason = 4;
2834 				}
2835 				goto idle;
2836 			}
2837 		} else if (!SCHED(processor_bound_count)(processor)) {
2838 			/* processor not recommended and no bound threads */
2839 			if (idle_reason == 0) {
2840 				idle_reason = 5;
2841 			}
2842 			goto idle;
2843 		}
2844 
2845 		processor->deadline = RT_DEADLINE_NONE;
2846 
2847 		/* No RT threads, so let's look at the regular threads. */
2848 		if ((new_thread = SCHED(choose_thread)(processor, MINPRI, *reason)) != THREAD_NULL) {
2849 			pset_commit_processor_to_new_thread(pset, processor, new_thread);
2850 
2851 			clear_pending_AST_bits(pset, processor, 4);
2852 
2853 			ast_processor = PROCESSOR_NULL;
2854 			ipi_type = SCHED_IPI_NONE;
2855 
2856 			processor_t sprocessor = processor->processor_secondary;
2857 			if (sprocessor != NULL) {
2858 				if (sprocessor->state == PROCESSOR_RUNNING) {
2859 					if (thread_no_smt(new_thread)) {
2860 						ipi_type = sched_ipi_action(sprocessor, NULL, SCHED_IPI_EVENT_SMT_REBAL);
2861 						ast_processor = sprocessor;
2862 					}
2863 				} else if (secondary_forced_idle && !thread_no_smt(new_thread) && pset_has_stealable_threads(pset)) {
2864 					ipi_type = sched_ipi_action(sprocessor, NULL, SCHED_IPI_EVENT_PREEMPT);
2865 					ast_processor = sprocessor;
2866 				}
2867 			}
2868 			pset_unlock(pset);
2869 
2870 			if (ast_processor) {
2871 				sched_ipi_perform(ast_processor, ipi_type);
2872 			}
2873 			KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
2874 			    (uintptr_t)thread_tid(new_thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 4);
2875 			return new_thread;
2876 		}
2877 
2878 		if (processor->must_idle) {
2879 			processor->must_idle = false;
2880 			*reason |= AST_REBALANCE;
2881 			idle_reason = 6;
2882 			goto idle;
2883 		}
2884 
2885 		if (SCHED(steal_thread_enabled)(pset) && (processor->processor_primary == processor)) {
2886 			/*
2887 			 * No runnable threads, attempt to steal
2888 			 * from other processors. Returns with pset lock dropped.
2889 			 */
2890 
2891 			if ((new_thread = SCHED(steal_thread)(pset)) != THREAD_NULL) {
2892 				pset_lock(pset);
2893 				pset_commit_processor_to_new_thread(pset, processor, new_thread);
2894 				if (!pending_AST_URGENT && bit_test(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
2895 					/*
2896 					 * A realtime thread choose this processor while it was DISPATCHING
2897 					 * and the pset lock was dropped
2898 					 */
2899 					ast_on(AST_URGENT | AST_PREEMPT);
2900 				}
2901 
2902 				clear_pending_AST_bits(pset, processor, 5);
2903 
2904 				pset_unlock(pset);
2905 
2906 				KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
2907 				    (uintptr_t)thread_tid(new_thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 5);
2908 				return new_thread;
2909 			}
2910 
2911 			/*
2912 			 * If other threads have appeared, shortcut
2913 			 * around again.
2914 			 */
2915 			if (SCHED(processor_bound_count)(processor)) {
2916 				continue;
2917 			}
2918 			if (processor->is_recommended) {
2919 				if (!SCHED(processor_queue_empty)(processor) || (sched_ok_to_run_realtime_thread(pset, processor, true) && (rt_runq_count(pset) > 0))) {
2920 					continue;
2921 				}
2922 			}
2923 
2924 			pset_lock(pset);
2925 		}
2926 
2927 idle:
2928 		/* Someone selected this processor while we had dropped the lock */
2929 		if ((!pending_AST_URGENT && bit_test(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) ||
2930 		    (!pending_AST_PREEMPT && bit_test(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id))) {
2931 			goto restart;
2932 		}
2933 
2934 		if ((idle_reason == 0) && current_thread_can_keep_running) {
2935 			/* This thread is the only runnable (non-idle) thread */
2936 			if (thread->sched_pri >= BASEPRI_RTQUEUES) {
2937 				processor->deadline = thread->realtime.deadline;
2938 			} else {
2939 				processor->deadline = RT_DEADLINE_NONE;
2940 			}
2941 
2942 			sched_update_pset_load_average(pset, 0);
2943 
2944 			clear_pending_AST_bits(pset, processor, 6);
2945 
2946 			KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
2947 			    (uintptr_t)thread_tid(thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 6);
2948 			pset_unlock(pset);
2949 			return thread;
2950 		}
2951 
2952 		/*
2953 		 *	Nothing is runnable, or this processor must be forced idle,
2954 		 *	so set this processor idle if it was running.
2955 		 */
2956 		if ((processor->state == PROCESSOR_RUNNING) || (processor->state == PROCESSOR_DISPATCHING)) {
2957 			pset_update_processor_state(pset, processor, PROCESSOR_IDLE);
2958 			processor_state_update_idle(processor);
2959 		}
2960 		pset_update_rt_stealable_state(pset);
2961 
2962 		clear_pending_AST_bits(pset, processor, 7);
2963 
2964 		/* Invoked with pset locked, returns with pset unlocked */
2965 		processor->next_idle_short = SCHED(processor_balance)(processor, pset);
2966 
2967 		new_thread = processor->idle_thread;
2968 	} while (new_thread == THREAD_NULL);
2969 
2970 	KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
2971 	    (uintptr_t)thread_tid(new_thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 10 + idle_reason);
2972 	return new_thread;
2973 }
2974 
2975 /*
2976  * thread_invoke
2977  *
2978  * Called at splsched with neither thread locked.
2979  *
2980  * Perform a context switch and start executing the new thread.
2981  *
2982  * Returns FALSE when the context switch didn't happen.
2983  * The reference to the new thread is still consumed.
2984  *
2985  * "self" is what is currently running on the processor,
2986  * "thread" is the new thread to context switch to
2987  * (which may be the same thread in some cases)
2988  */
2989 static boolean_t
thread_invoke(thread_t self,thread_t thread,ast_t reason)2990 thread_invoke(
2991 	thread_t                        self,
2992 	thread_t                        thread,
2993 	ast_t                           reason)
2994 {
2995 	if (__improbable(get_preemption_level() != 0)) {
2996 		int pl = get_preemption_level();
2997 		panic("thread_invoke: preemption_level %d, possible cause: %s",
2998 		    pl, (pl < 0 ? "unlocking an unlocked mutex or spinlock" :
2999 		    "blocking while holding a spinlock, or within interrupt context"));
3000 	}
3001 
3002 	thread_continue_t       continuation = self->continuation;
3003 	void                    *parameter   = self->parameter;
3004 
3005 	struct recount_snap snap = { 0 };
3006 	recount_snapshot(&snap);
3007 	uint64_t ctime = snap.rsn_time_mach;
3008 
3009 	check_monotonic_time(ctime);
3010 
3011 #ifdef CONFIG_MACH_APPROXIMATE_TIME
3012 	commpage_update_mach_approximate_time(ctime);
3013 #endif
3014 
3015 	if (ctime < thread->last_made_runnable_time) {
3016 		panic("Non-monotonic time: invoke at 0x%llx, runnable at 0x%llx",
3017 		    ctime, thread->last_made_runnable_time);
3018 	}
3019 
3020 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
3021 	if (!((thread->state & TH_IDLE) != 0 ||
3022 	    ((reason & AST_HANDOFF) && self->sched_mode == TH_MODE_REALTIME))) {
3023 		sched_timeshare_consider_maintenance(ctime, true);
3024 	}
3025 #endif
3026 
3027 	recount_log_switch_thread(&snap);
3028 
3029 	assert_thread_magic(self);
3030 	assert(self == current_thread());
3031 	thread_assert_runq_null(self);
3032 	assert((self->state & (TH_RUN | TH_TERMINATE2)) == TH_RUN);
3033 
3034 	thread_lock(thread);
3035 
3036 	assert_thread_magic(thread);
3037 	assert((thread->state & (TH_RUN | TH_WAIT | TH_UNINT | TH_TERMINATE | TH_TERMINATE2)) == TH_RUN);
3038 	assert(thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == current_processor());
3039 	thread_assert_runq_null(thread);
3040 
3041 	/* Update SFI class based on other factors */
3042 	thread->sfi_class = sfi_thread_classify(thread);
3043 
3044 	/* Update the same_pri_latency for the thread (used by perfcontrol callouts) */
3045 	thread->same_pri_latency = ctime - thread->last_basepri_change_time;
3046 	/*
3047 	 * In case a base_pri update happened between the timestamp and
3048 	 * taking the thread lock
3049 	 */
3050 	if (ctime <= thread->last_basepri_change_time) {
3051 		thread->same_pri_latency = ctime - thread->last_made_runnable_time;
3052 	}
3053 
3054 	/* Allow realtime threads to hang onto a stack. */
3055 	if ((self->sched_mode == TH_MODE_REALTIME) && !self->reserved_stack) {
3056 		self->reserved_stack = self->kernel_stack;
3057 	}
3058 
3059 	/* Prepare for spin debugging */
3060 #if SCHED_HYGIENE_DEBUG
3061 	ml_spin_debug_clear(thread);
3062 #endif
3063 
3064 	if (continuation != NULL) {
3065 		if (!thread->kernel_stack) {
3066 			/*
3067 			 * If we are using a privileged stack,
3068 			 * check to see whether we can exchange it with
3069 			 * that of the other thread.
3070 			 */
3071 			if (self->kernel_stack == self->reserved_stack && !thread->reserved_stack) {
3072 				goto need_stack;
3073 			}
3074 
3075 			/*
3076 			 * Context switch by performing a stack handoff.
3077 			 * Requires both threads to be parked in a continuation.
3078 			 */
3079 			continuation = thread->continuation;
3080 			parameter = thread->parameter;
3081 
3082 			processor_t processor = current_processor();
3083 			processor->active_thread = thread;
3084 			processor_state_update_from_thread(processor, thread, false);
3085 
3086 			if (thread->last_processor != processor && thread->last_processor != NULL) {
3087 				if (thread->last_processor->processor_set != processor->processor_set) {
3088 					thread->ps_switch++;
3089 				}
3090 				thread->p_switch++;
3091 			}
3092 			thread->last_processor = processor;
3093 			thread->c_switch++;
3094 			ast_context(thread);
3095 
3096 			thread_unlock(thread);
3097 
3098 			self->reason = reason;
3099 
3100 			processor->last_dispatch = ctime;
3101 			self->last_run_time = ctime;
3102 			timer_update(&thread->runnable_timer, ctime);
3103 			recount_switch_thread(&snap, self, get_threadtask(self));
3104 
3105 			KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3106 			    MACHDBG_CODE(DBG_MACH_SCHED, MACH_STACK_HANDOFF) | DBG_FUNC_NONE,
3107 			    self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);
3108 
3109 			if ((thread->chosen_processor != processor) && (thread->chosen_processor != PROCESSOR_NULL)) {
3110 				SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT_IST(MACHDBG_CODE(DBG_MACH_SCHED, MACH_MOVED) | DBG_FUNC_NONE,
3111 				    (uintptr_t)thread_tid(thread), (uintptr_t)thread->chosen_processor->cpu_id, 0, 0, 0);
3112 			}
3113 
3114 			DTRACE_SCHED2(off__cpu, struct thread *, thread, struct proc *, current_proc());
3115 
3116 			SCHED_STATS_CSW(processor, self->reason, self->sched_pri, thread->sched_pri);
3117 
3118 #if KPERF
3119 			kperf_off_cpu(self);
3120 #endif /* KPERF */
3121 
3122 			/*
3123 			 * This is where we actually switch thread identity,
3124 			 * and address space if required.  However, register
3125 			 * state is not switched - this routine leaves the
3126 			 * stack and register state active on the current CPU.
3127 			 */
3128 			TLOG(1, "thread_invoke: calling stack_handoff\n");
3129 			stack_handoff(self, thread);
3130 
3131 			/* 'self' is now off core */
3132 			assert(thread == current_thread_volatile());
3133 
3134 			DTRACE_SCHED(on__cpu);
3135 
3136 #if KPERF
3137 			kperf_on_cpu(thread, continuation, NULL);
3138 #endif /* KPERF */
3139 
3140 			recount_log_switch_thread_on(&snap);
3141 
3142 			thread_dispatch(self, thread);
3143 
3144 #if KASAN
3145 			/* Old thread's stack has been moved to the new thread, so explicitly
3146 			 * unpoison it. */
3147 			kasan_unpoison_stack(thread->kernel_stack, kernel_stack_size);
3148 #endif
3149 
3150 			thread->continuation = thread->parameter = NULL;
3151 
3152 			boolean_t enable_interrupts = TRUE;
3153 
3154 			/* idle thread needs to stay interrupts-disabled */
3155 			if ((thread->state & TH_IDLE)) {
3156 				enable_interrupts = FALSE;
3157 			}
3158 
3159 			assert(continuation);
3160 			call_continuation(continuation, parameter,
3161 			    thread->wait_result, enable_interrupts);
3162 			/*NOTREACHED*/
3163 		} else if (thread == self) {
3164 			/* same thread but with continuation */
3165 			ast_context(self);
3166 
3167 			thread_unlock(self);
3168 
3169 #if KPERF
3170 			kperf_on_cpu(thread, continuation, NULL);
3171 #endif /* KPERF */
3172 
3173 			recount_log_switch_thread_on(&snap);
3174 
3175 			KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3176 			    MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED) | DBG_FUNC_NONE,
3177 			    self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);
3178 
3179 #if KASAN
3180 			/* stack handoff to self - no thread_dispatch(), so clear the stack
3181 			 * and free the fakestack directly */
3182 #if KASAN_CLASSIC
3183 			kasan_fakestack_drop(self);
3184 			kasan_fakestack_gc(self);
3185 #endif /* KASAN_CLASSIC */
3186 			kasan_unpoison_stack(self->kernel_stack, kernel_stack_size);
3187 #endif /* KASAN */
3188 
3189 			self->continuation = self->parameter = NULL;
3190 
3191 			boolean_t enable_interrupts = TRUE;
3192 
3193 			/* idle thread needs to stay interrupts-disabled */
3194 			if ((self->state & TH_IDLE)) {
3195 				enable_interrupts = FALSE;
3196 			}
3197 
3198 			call_continuation(continuation, parameter,
3199 			    self->wait_result, enable_interrupts);
3200 			/*NOTREACHED*/
3201 		}
3202 	} else {
3203 		/*
3204 		 * Check that the other thread has a stack
3205 		 */
3206 		if (!thread->kernel_stack) {
3207 need_stack:
3208 			if (!stack_alloc_try(thread)) {
3209 				thread_unlock(thread);
3210 				thread_stack_enqueue(thread);
3211 				return FALSE;
3212 			}
3213 		} else if (thread == self) {
3214 			ast_context(self);
3215 			thread_unlock(self);
3216 
3217 			KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3218 			    MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED) | DBG_FUNC_NONE,
3219 			    self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);
3220 
3221 			return TRUE;
3222 		}
3223 	}
3224 
3225 	/*
3226 	 * Context switch by full context save.
3227 	 */
3228 	processor_t processor = current_processor();
3229 	processor->active_thread = thread;
3230 	processor_state_update_from_thread(processor, thread, false);
3231 
3232 	if (thread->last_processor != processor && thread->last_processor != NULL) {
3233 		if (thread->last_processor->processor_set != processor->processor_set) {
3234 			thread->ps_switch++;
3235 		}
3236 		thread->p_switch++;
3237 	}
3238 	thread->last_processor = processor;
3239 	thread->c_switch++;
3240 	ast_context(thread);
3241 
3242 	thread_unlock(thread);
3243 
3244 	self->reason = reason;
3245 
3246 	processor->last_dispatch = ctime;
3247 	self->last_run_time = ctime;
3248 	timer_update(&thread->runnable_timer, ctime);
3249 	recount_switch_thread(&snap, self, get_threadtask(self));
3250 
3251 	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3252 	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED) | DBG_FUNC_NONE,
3253 	    self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);
3254 
3255 	if ((thread->chosen_processor != processor) && (thread->chosen_processor != NULL)) {
3256 		SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT_IST(MACHDBG_CODE(DBG_MACH_SCHED, MACH_MOVED) | DBG_FUNC_NONE,
3257 		    (uintptr_t)thread_tid(thread), (uintptr_t)thread->chosen_processor->cpu_id, 0, 0, 0);
3258 	}
3259 
3260 	DTRACE_SCHED2(off__cpu, struct thread *, thread, struct proc *, current_proc());
3261 
3262 	SCHED_STATS_CSW(processor, self->reason, self->sched_pri, thread->sched_pri);
3263 
3264 #if KPERF
3265 	kperf_off_cpu(self);
3266 #endif /* KPERF */
3267 
3268 	/*
3269 	 * This is where we actually switch register context,
3270 	 * and address space if required.  We will next run
3271 	 * as a result of a subsequent context switch.
3272 	 *
3273 	 * Once registers are switched and the processor is running "thread",
3274 	 * the stack variables and non-volatile registers will contain whatever
3275 	 * was there the last time that thread blocked. No local variables should
3276 	 * be used after this point, except for the special case of "thread", which
3277 	 * the platform layer returns as the previous thread running on the processor
3278 	 * via the function call ABI as a return register, and "self", which may have
3279 	 * been stored on the stack or a non-volatile register, but a stale idea of
3280 	 * what was on the CPU is newly-accurate because that thread is again
3281 	 * running on the CPU.
3282 	 *
3283 	 * If one of the threads is using a continuation, thread_continue
3284 	 * is used to stitch up its context.
3285 	 *
3286 	 * If we are invoking a thread which is resuming from a continuation,
3287 	 * the CPU will invoke thread_continue next.
3288 	 *
3289 	 * If the current thread is parking in a continuation, then its state
3290 	 * won't be saved and the stack will be discarded. When the stack is
3291 	 * re-allocated, it will be configured to resume from thread_continue.
3292 	 */
3293 
3294 	assert(continuation == self->continuation);
3295 	thread = machine_switch_context(self, continuation, thread);
3296 	assert(self == current_thread_volatile());
3297 	TLOG(1, "thread_invoke: returning machine_switch_context: self %p continuation %p thread %p\n", self, continuation, thread);
3298 
3299 	assert(continuation == NULL && self->continuation == NULL);
3300 
3301 	DTRACE_SCHED(on__cpu);
3302 
3303 #if KPERF
3304 	kperf_on_cpu(self, NULL, __builtin_frame_address(0));
3305 #endif /* KPERF */
3306 
3307 	/* Previous snap on the old stack is gone. */
3308 	recount_log_switch_thread_on(NULL);
3309 
3310 	/* We have been resumed and are set to run. */
3311 	thread_dispatch(thread, self);
3312 
3313 	return TRUE;
3314 }
3315 
#if defined(CONFIG_SCHED_DEFERRED_AST)
/*
 *	pset_cancel_deferred_dispatch:
 *
 *	Cancels all ASTs that we can cancel for the given processor set
 *	if the current processor is running the last runnable thread in the
 *	system.
 *
 *	"pset" is the processor set to scan; it is locked for the duration
 *	of the call and unlocked before returning.
 *	"processor" is the processor to leave untouched (the caller's current
 *	processor).
 *
 *	This function assumes the current thread is runnable.  This must
 *	be called with the pset unlocked.
 */
static void
pset_cancel_deferred_dispatch(
	processor_set_t         pset,
	processor_t             processor)
{
	pset_lock(pset);
	uint32_t sampled_sched_run_count = os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed);

	/*
	 * If we have emptied the run queue, and our current thread is runnable, we
	 * should tell any processors that are still DISPATCHING that they will
	 * probably not have any work to do.  In the event that there are no
	 * pending signals that we can cancel, this is also uninteresting.
	 *
	 * In the unlikely event that another thread becomes runnable while we are
	 * doing this (sched_run_count is atomically updated, not guarded), the
	 * codepath making it runnable SHOULD (a dangerous word) need the pset lock
	 * in order to dispatch it to a processor in our pset.  So, the other
	 * codepath will wait while we squash all cancelable ASTs, get the pset
	 * lock, and then dispatch the freshly runnable thread.  So this should be
	 * correct (we won't accidentally have a runnable thread that hasn't been
	 * dispatched to an idle processor), if not ideal (we may be restarting the
	 * dispatch process, which could have some overhead).
	 */

	if ((sampled_sched_run_count == 1) && (pset->pending_deferred_AST_cpu_mask)) {
		/*
		 * Candidates: DISPATCHING, with a deferred AST pending, and
		 * without an urgent AST pending.
		 */
		uint64_t dispatching_map = (pset->cpu_state_map[PROCESSOR_DISPATCHING] &
		    pset->pending_deferred_AST_cpu_mask &
		    ~pset->pending_AST_URGENT_cpu_mask);
		for (int cpuid = lsb_first(dispatching_map); cpuid >= 0; cpuid = lsb_next(dispatching_map, cpuid)) {
			processor_t active_processor = processor_array[cpuid];
			/*
			 * If a processor is DISPATCHING, it could be because of
			 * a cancelable signal.
			 *
			 * IF the processor is not our
			 * current processor (the current processor should not
			 * be DISPATCHING, so this is a bit paranoid), AND there
			 * is a cancelable signal pending on the processor, AND
			 * there is no non-cancelable signal pending (as there is
			 * no point trying to backtrack on bringing the processor
			 * up if a signal we cannot cancel is outstanding), THEN
			 * it should make sense to roll back the processor state
			 * to the IDLE state.
			 *
			 * If the racy nature of this approach (as the signal
			 * will be arbitrated by hardware, and can fire as we
			 * roll back state) results in the core responding
			 * despite being pushed back to the IDLE state, it
			 * should be no different than if the core took some
			 * interrupt while IDLE.
			 */
			if (active_processor != processor) {
				/*
				 * Squash all of the processor state back to some
				 * reasonable facsimile of PROCESSOR_IDLE.
				 */

				processor_state_update_idle(active_processor);
				active_processor->deadline = RT_DEADLINE_NONE;
				pset_update_processor_state(pset, active_processor, PROCESSOR_IDLE);
				bit_clear(pset->pending_deferred_AST_cpu_mask, active_processor->cpu_id);
				machine_signal_idle_cancel(active_processor);
			}
		}
	}

	pset_unlock(pset);
}
#else
/* We don't support deferred ASTs; everything is candycanes and sunshine. */
#endif
3402 
3403 static void
thread_csw_callout(thread_t old,thread_t new,uint64_t timestamp)3404 thread_csw_callout(
3405 	thread_t            old,
3406 	thread_t            new,
3407 	uint64_t            timestamp)
3408 {
3409 	perfcontrol_event event = (new->state & TH_IDLE) ? IDLE : CONTEXT_SWITCH;
3410 	uint64_t same_pri_latency = (new->state & TH_IDLE) ? 0 : new->same_pri_latency;
3411 	machine_switch_perfcontrol_context(event, timestamp, 0,
3412 	    same_pri_latency, old, new);
3413 }
3414 
3415 
3416 /*
3417  *	thread_dispatch:
3418  *
3419  *	Handle threads at context switch.  Re-dispatch other thread
3420  *	if still running, otherwise update run state and perform
3421  *	special actions.  Update quantum for other thread and begin
3422  *	the quantum for ourselves.
3423  *
3424  *      "thread" is the old thread that we have switched away from.
3425  *      "self" is the new current thread that we have context switched to
3426  *
3427  *	Called at splsched.
3428  *
3429  */
3430 void
thread_dispatch(thread_t thread,thread_t self)3431 thread_dispatch(
3432 	thread_t                thread,
3433 	thread_t                self)
3434 {
3435 	processor_t             processor = self->last_processor;
3436 	bool was_idle = false;
3437 
3438 	assert(processor == current_processor());
3439 	assert(self == current_thread_volatile());
3440 	assert(thread != self);
3441 
3442 	if (thread != THREAD_NULL) {
3443 		/*
3444 		 * Do the perfcontrol callout for context switch.
3445 		 * The reason we do this here is:
3446 		 * - thread_dispatch() is called from various places that are not
3447 		 *   the direct context switch path for eg. processor shutdown etc.
3448 		 *   So adding the callout here covers all those cases.
3449 		 * - We want this callout as early as possible to be close
3450 		 *   to the timestamp taken in thread_invoke()
3451 		 * - We want to avoid holding the thread lock while doing the
3452 		 *   callout
3453 		 * - We do not want to callout if "thread" is NULL.
3454 		 */
3455 		thread_csw_callout(thread, self, processor->last_dispatch);
3456 
3457 #if KASAN
3458 		if (thread->continuation != NULL) {
3459 			/*
3460 			 * Thread has a continuation and the normal stack is going away.
3461 			 * Unpoison the stack and mark all fakestack objects as unused.
3462 			 */
3463 #if KASAN_CLASSIC
3464 			kasan_fakestack_drop(thread);
3465 #endif /* KASAN_CLASSIC */
3466 			if (thread->kernel_stack) {
3467 				kasan_unpoison_stack(thread->kernel_stack, kernel_stack_size);
3468 			}
3469 		}
3470 
3471 
3472 #if KASAN_CLASSIC
3473 		/*
3474 		 * Free all unused fakestack objects.
3475 		 */
3476 		kasan_fakestack_gc(thread);
3477 #endif /* KASAN_CLASSIC */
3478 #endif /* KASAN */
3479 
3480 		/*
3481 		 *	If blocked at a continuation, discard
3482 		 *	the stack.
3483 		 */
3484 		if (thread->continuation != NULL && thread->kernel_stack != 0) {
3485 			stack_free(thread);
3486 		}
3487 
3488 		if (thread->state & TH_IDLE) {
3489 			was_idle = true;
3490 			KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3491 			    MACHDBG_CODE(DBG_MACH_SCHED, MACH_DISPATCH) | DBG_FUNC_NONE,
3492 			    (uintptr_t)thread_tid(thread), 0, thread->state,
3493 			    sched_run_buckets[TH_BUCKET_RUN], 0);
3494 		} else {
3495 			int64_t consumed;
3496 			int64_t remainder = 0;
3497 
3498 			if (processor->quantum_end > processor->last_dispatch) {
3499 				remainder = processor->quantum_end -
3500 				    processor->last_dispatch;
3501 			}
3502 
3503 			consumed = thread->quantum_remaining - remainder;
3504 
3505 			if ((thread->reason & AST_LEDGER) == 0) {
3506 				/*
3507 				 * Bill CPU time to both the task and
3508 				 * the individual thread.
3509 				 */
3510 				ledger_credit_thread(thread, thread->t_ledger,
3511 				    task_ledgers.cpu_time, consumed);
3512 				ledger_credit_thread(thread, thread->t_threadledger,
3513 				    thread_ledgers.cpu_time, consumed);
3514 				if (thread->t_bankledger) {
3515 					ledger_credit_thread(thread, thread->t_bankledger,
3516 					    bank_ledgers.cpu_time,
3517 					    (consumed - thread->t_deduct_bank_ledger_time));
3518 				}
3519 				thread->t_deduct_bank_ledger_time = 0;
3520 				if (consumed > 0) {
3521 					/*
3522 					 * This should never be negative, but in traces we are seeing some instances
3523 					 * of consumed being negative.
3524 					 * <rdar://problem/57782596> thread_dispatch() thread CPU consumed calculation sometimes results in negative value
3525 					 */
3526 					sched_update_pset_avg_execution_time(current_processor()->processor_set, consumed, processor->last_dispatch, thread->th_sched_bucket);
3527 				}
3528 			}
3529 
3530 			/* For the thread that we just context switched away from, figure
3531 			 * out if we have expired the wq quantum and set the AST if we have
3532 			 */
3533 			if (thread_get_tag(thread) & THREAD_TAG_WORKQUEUE) {
3534 				thread_evaluate_workqueue_quantum_expiry(thread);
3535 			}
3536 
3537 			if (__improbable(thread->rwlock_count != 0)) {
3538 				smr_mark_active_trackers_stalled(thread);
3539 			}
3540 
3541 			/*
3542 			 * Pairs with task_restartable_ranges_synchronize
3543 			 */
3544 			wake_lock(thread);
3545 			thread_lock(thread);
3546 
3547 			/*
3548 			 * Same as ast_check(), in case we missed the IPI
3549 			 */
3550 			thread_reset_pcs_ack_IPI(thread);
3551 
3552 			/*
3553 			 * Apply a priority floor if the thread holds a kernel resource
3554 			 * or explicitly requested it.
3555 			 * Do this before checking starting_pri to avoid overpenalizing
3556 			 * repeated rwlock blockers.
3557 			 */
3558 			if (__improbable(thread->rwlock_count != 0)) {
3559 				lck_rw_set_promotion_locked(thread);
3560 			}
3561 			if (__improbable(thread->priority_floor_count != 0)) {
3562 				thread_floor_boost_set_promotion_locked(thread);
3563 			}
3564 
3565 			boolean_t keep_quantum = processor->first_timeslice;
3566 
3567 			/*
3568 			 * Treat a thread which has dropped priority since it got on core
3569 			 * as having expired its quantum.
3570 			 */
3571 			if (processor->starting_pri > thread->sched_pri) {
3572 				keep_quantum = FALSE;
3573 			}
3574 
3575 			/* Compute remainder of current quantum. */
3576 			if (keep_quantum &&
3577 			    processor->quantum_end > processor->last_dispatch) {
3578 				thread->quantum_remaining = (uint32_t)remainder;
3579 			} else {
3580 				thread->quantum_remaining = 0;
3581 			}
3582 
3583 			if (thread->sched_mode == TH_MODE_REALTIME) {
3584 				/*
3585 				 *	Cancel the deadline if the thread has
3586 				 *	consumed the entire quantum.
3587 				 */
3588 				if (thread->quantum_remaining == 0) {
3589 					KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_CANCEL_RT_DEADLINE) | DBG_FUNC_NONE,
3590 					    (uintptr_t)thread_tid(thread), thread->realtime.deadline, thread->realtime.computation, 0);
3591 					thread->realtime.deadline = RT_DEADLINE_QUANTUM_EXPIRED;
3592 				}
3593 			} else {
3594 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
3595 				/*
3596 				 *	For non-realtime threads treat a tiny
3597 				 *	remaining quantum as an expired quantum
3598 				 *	but include what's left next time.
3599 				 */
3600 				if (thread->quantum_remaining < min_std_quantum) {
3601 					thread->reason |= AST_QUANTUM;
3602 					thread->quantum_remaining += SCHED(initial_quantum_size)(thread);
3603 				}
3604 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
3605 			}
3606 
3607 			/*
3608 			 *	If we are doing a direct handoff then
3609 			 *	take the remainder of the quantum.
3610 			 */
3611 			if ((thread->reason & (AST_HANDOFF | AST_QUANTUM)) == AST_HANDOFF) {
3612 				self->quantum_remaining = thread->quantum_remaining;
3613 				thread->reason |= AST_QUANTUM;
3614 				thread->quantum_remaining = 0;
3615 			} else {
3616 #if defined(CONFIG_SCHED_MULTIQ)
3617 				if (SCHED(sched_groups_enabled) &&
3618 				    thread->sched_group == self->sched_group) {
3619 					KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3620 					    MACHDBG_CODE(DBG_MACH_SCHED, MACH_QUANTUM_HANDOFF),
3621 					    self->reason, (uintptr_t)thread_tid(thread),
3622 					    self->quantum_remaining, thread->quantum_remaining, 0);
3623 
3624 					self->quantum_remaining = thread->quantum_remaining;
3625 					thread->quantum_remaining = 0;
3626 					/* Don't set AST_QUANTUM here - old thread might still want to preempt someone else */
3627 				}
3628 #endif /* defined(CONFIG_SCHED_MULTIQ) */
3629 			}
3630 
3631 			thread->computation_metered += (processor->last_dispatch - thread->computation_epoch);
3632 
3633 			if (!(thread->state & TH_WAIT)) {
3634 				/*
3635 				 *	Still runnable.
3636 				 */
3637 				thread->last_made_runnable_time = thread->last_basepri_change_time = processor->last_dispatch;
3638 
3639 				machine_thread_going_off_core(thread, FALSE, processor->last_dispatch, TRUE);
3640 
3641 				ast_t reason = thread->reason;
3642 				sched_options_t options = SCHED_NONE;
3643 
3644 				if (reason & AST_REBALANCE) {
3645 					options |= SCHED_REBALANCE;
3646 					if (reason & AST_QUANTUM) {
3647 						/*
3648 						 * Having gone to the trouble of forcing this thread off a less preferred core,
3649 						 * we should force the preferable core to reschedule immediately to give this
3650 						 * thread a chance to run instead of just sitting on the run queue where
3651 						 * it may just be stolen back by the idle core we just forced it off.
3652 						 * But only do this at the end of a quantum to prevent cascading effects.
3653 						 */
3654 						options |= SCHED_PREEMPT;
3655 					}
3656 				}
3657 
3658 				if (reason & AST_QUANTUM) {
3659 					options |= SCHED_TAILQ;
3660 				} else if (reason & AST_PREEMPT) {
3661 					options |= SCHED_HEADQ;
3662 				} else {
3663 					options |= (SCHED_PREEMPT | SCHED_TAILQ);
3664 				}
3665 
3666 				thread_setrun(thread, options);
3667 
3668 				KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3669 				    MACHDBG_CODE(DBG_MACH_SCHED, MACH_DISPATCH) | DBG_FUNC_NONE,
3670 				    (uintptr_t)thread_tid(thread), thread->reason, thread->state,
3671 				    sched_run_buckets[TH_BUCKET_RUN], 0);
3672 
3673 				if (thread->wake_active) {
3674 					thread->wake_active = FALSE;
3675 					thread_unlock(thread);
3676 
3677 					thread_wakeup(&thread->wake_active);
3678 				} else {
3679 					thread_unlock(thread);
3680 				}
3681 
3682 				wake_unlock(thread);
3683 			} else {
3684 				/*
3685 				 *	Waiting.
3686 				 */
3687 				boolean_t should_terminate = FALSE;
3688 				uint32_t new_run_count;
3689 				int thread_state = thread->state;
3690 
3691 				/* Only the first call to thread_dispatch
3692 				 * after explicit termination should add
3693 				 * the thread to the termination queue
3694 				 */
3695 				if ((thread_state & (TH_TERMINATE | TH_TERMINATE2)) == TH_TERMINATE) {
3696 					should_terminate = TRUE;
3697 					thread_state |= TH_TERMINATE2;
3698 				}
3699 
3700 				timer_stop(&thread->runnable_timer, processor->last_dispatch);
3701 
3702 				thread_state &= ~TH_RUN;
3703 				thread->state = thread_state;
3704 
3705 				thread->last_made_runnable_time = thread->last_basepri_change_time = THREAD_NOT_RUNNABLE;
3706 				thread->chosen_processor = PROCESSOR_NULL;
3707 
3708 				new_run_count = SCHED(run_count_decr)(thread);
3709 
3710 #if CONFIG_SCHED_AUTO_JOIN
3711 				if ((thread->sched_flags & TH_SFLAG_THREAD_GROUP_AUTO_JOIN) != 0) {
3712 					work_interval_auto_join_unwind(thread);
3713 				}
3714 #endif /* CONFIG_SCHED_AUTO_JOIN */
3715 
3716 #if CONFIG_SCHED_SFI
3717 				if (thread->reason & AST_SFI) {
3718 					thread->wait_sfi_begin_time = processor->last_dispatch;
3719 				}
3720 #endif
3721 				machine_thread_going_off_core(thread, should_terminate, processor->last_dispatch, FALSE);
3722 
3723 				KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3724 				    MACHDBG_CODE(DBG_MACH_SCHED, MACH_DISPATCH) | DBG_FUNC_NONE,
3725 				    (uintptr_t)thread_tid(thread), thread->reason, thread_state,
3726 				    new_run_count, 0);
3727 
3728 				if (thread_state & TH_WAIT_REPORT) {
3729 					(*thread->sched_call)(SCHED_CALL_BLOCK, thread);
3730 				}
3731 
3732 				if (thread->wake_active) {
3733 					thread->wake_active = FALSE;
3734 					thread_unlock(thread);
3735 
3736 					thread_wakeup(&thread->wake_active);
3737 				} else {
3738 					thread_unlock(thread);
3739 				}
3740 
3741 				wake_unlock(thread);
3742 
3743 				if (should_terminate) {
3744 					thread_terminate_enqueue(thread);
3745 				}
3746 			}
3747 		}
3748 		/*
3749 		 * The thread could have been added to the termination queue, so it's
3750 		 * unsafe to use after this point.
3751 		 */
3752 		thread = THREAD_NULL;
3753 	}
3754 
3755 	int urgency = THREAD_URGENCY_NONE;
3756 	uint64_t latency = 0;
3757 
3758 	/* Update (new) current thread and reprogram running timers */
3759 	thread_lock(self);
3760 
3761 	if (!(self->state & TH_IDLE)) {
3762 		uint64_t        arg1, arg2;
3763 
3764 #if CONFIG_SCHED_SFI
3765 		ast_t                   new_ast;
3766 
3767 		new_ast = sfi_thread_needs_ast(self, NULL);
3768 
3769 		if (new_ast != AST_NONE) {
3770 			ast_on(new_ast);
3771 		}
3772 #endif
3773 
3774 		if (processor->last_dispatch < self->last_made_runnable_time) {
3775 			panic("Non-monotonic time: dispatch at 0x%llx, runnable at 0x%llx",
3776 			    processor->last_dispatch, self->last_made_runnable_time);
3777 		}
3778 
3779 		assert(self->last_made_runnable_time <= self->last_basepri_change_time);
3780 
3781 		latency = processor->last_dispatch - self->last_made_runnable_time;
3782 		assert(latency >= self->same_pri_latency);
3783 
3784 		urgency = thread_get_urgency(self, &arg1, &arg2);
3785 
3786 		thread_tell_urgency(urgency, arg1, arg2, latency, self);
3787 
3788 		/*
3789 		 *	Start a new CPU limit interval if the previous one has
3790 		 *	expired. This should happen before initializing a new
3791 		 *	quantum.
3792 		 */
3793 		if (cpulimit_affects_quantum &&
3794 		    thread_cpulimit_interval_has_expired(processor->last_dispatch)) {
3795 			thread_cpulimit_restart(processor->last_dispatch);
3796 		}
3797 
3798 		/*
3799 		 *	Get a new quantum if none remaining.
3800 		 */
3801 		if (self->quantum_remaining == 0) {
3802 			thread_quantum_init(self, processor->last_dispatch);
3803 		}
3804 
3805 		/*
3806 		 *	Set up quantum timer and timeslice.
3807 		 */
3808 		processor->quantum_end = processor->last_dispatch +
3809 		    self->quantum_remaining;
3810 
3811 		running_timer_setup(processor, RUNNING_TIMER_QUANTUM, self,
3812 		    processor->quantum_end, processor->last_dispatch);
3813 		if (was_idle) {
3814 			/*
3815 			 * kperf's running timer is active whenever the idle thread for a
3816 			 * CPU is not running.
3817 			 */
3818 			kperf_running_setup(processor, processor->last_dispatch);
3819 		}
3820 		running_timers_activate(processor);
3821 		processor->first_timeslice = TRUE;
3822 	} else {
3823 		running_timers_deactivate(processor);
3824 		processor->first_timeslice = FALSE;
3825 		thread_tell_urgency(THREAD_URGENCY_NONE, 0, 0, 0, self);
3826 	}
3827 
3828 	assert(self->block_hint == kThreadWaitNone);
3829 	self->computation_epoch = processor->last_dispatch;
3830 	/*
3831 	 * This relies on the interrupt time being tallied up to the thread in the
3832 	 * exception handler epilogue, which is before AST context where preemption
3833 	 * is considered (and the scheduler is potentially invoked to
3834 	 * context switch, here).
3835 	 */
3836 	self->computation_interrupt_epoch = recount_current_thread_interrupt_time_mach();
3837 	self->reason = AST_NONE;
3838 	processor->starting_pri = self->sched_pri;
3839 
3840 	thread_unlock(self);
3841 
3842 	machine_thread_going_on_core(self, urgency, latency, self->same_pri_latency,
3843 	    processor->last_dispatch);
3844 
3845 #if defined(CONFIG_SCHED_DEFERRED_AST)
3846 	/*
3847 	 * TODO: Can we state that redispatching our old thread is also
3848 	 * uninteresting?
3849 	 */
3850 	if ((os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed) == 1) && !(self->state & TH_IDLE)) {
3851 		pset_cancel_deferred_dispatch(processor->processor_set, processor);
3852 	}
3853 #endif
3854 }
3855 
3856 /*
3857  *	thread_block_reason:
3858  *
3859  *	Forces a reschedule, blocking the caller if a wait
3860  *	has been asserted.
3861  *
3862  *	If a continuation is specified, then thread_invoke will
3863  *	attempt to discard the thread's kernel stack.  When the
3864  *	thread resumes, it will execute the continuation function
3865  *	on a new kernel stack.
3866  */
3867 wait_result_t
thread_block_reason(thread_continue_t continuation,void * parameter,ast_t reason)3868 thread_block_reason(
3869 	thread_continue_t       continuation,
3870 	void                            *parameter,
3871 	ast_t                           reason)
3872 {
3873 	thread_t        self = current_thread();
3874 	processor_t     processor;
3875 	thread_t        new_thread;
3876 	spl_t           s;
3877 
3878 	s = splsched();
3879 
3880 	processor = current_processor();
3881 
3882 	/* If we're explicitly yielding, force a subsequent quantum */
3883 	if (reason & AST_YIELD) {
3884 		processor->first_timeslice = FALSE;
3885 	}
3886 
3887 	/* We're handling all scheduling AST's */
3888 	ast_off(AST_SCHEDULING);
3889 
3890 	clear_pending_nonurgent_preemption(processor);
3891 
3892 #if PROC_REF_DEBUG
3893 	if ((continuation != NULL) && (get_threadtask(self) != kernel_task)) {
3894 		uthread_assert_zero_proc_refcount(get_bsdthread_info(self));
3895 	}
3896 #endif
3897 
3898 #if CONFIG_EXCLAVES
3899 	if (continuation != NULL) {
3900 		assert3u(self->th_exclaves_state & TH_EXCLAVES_STATE_ANY, ==, 0);
3901 	}
3902 #endif /* CONFIG_EXCLAVES */
3903 
3904 	self->continuation = continuation;
3905 	self->parameter = parameter;
3906 
3907 	if (self->state & ~(TH_RUN | TH_IDLE)) {
3908 		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3909 		    MACHDBG_CODE(DBG_MACH_SCHED, MACH_BLOCK),
3910 		    reason, VM_KERNEL_UNSLIDE(continuation), 0, 0, 0);
3911 	}
3912 
3913 	do {
3914 		thread_lock(self);
3915 		new_thread = thread_select(self, processor, &reason);
3916 		thread_unlock(self);
3917 	} while (!thread_invoke(self, new_thread, reason));
3918 
3919 	splx(s);
3920 
3921 	return self->wait_result;
3922 }
3923 
3924 /*
3925  *	thread_block:
3926  *
3927  *	Block the current thread if a wait has been asserted.
3928  */
3929 wait_result_t
thread_block(thread_continue_t continuation)3930 thread_block(
3931 	thread_continue_t       continuation)
3932 {
3933 	return thread_block_reason(continuation, NULL, AST_NONE);
3934 }
3935 
3936 wait_result_t
thread_block_parameter(thread_continue_t continuation,void * parameter)3937 thread_block_parameter(
3938 	thread_continue_t       continuation,
3939 	void                            *parameter)
3940 {
3941 	return thread_block_reason(continuation, parameter, AST_NONE);
3942 }
3943 
3944 /*
3945  *	thread_run:
3946  *
3947  *	Switch directly from the current thread to the
3948  *	new thread, handing off our quantum if appropriate.
3949  *
3950  *	New thread must be runnable, and not on a run queue.
3951  *
3952  *	Called at splsched.
3953  */
3954 int
thread_run(thread_t self,thread_continue_t continuation,void * parameter,thread_t new_thread)3955 thread_run(
3956 	thread_t                        self,
3957 	thread_continue_t       continuation,
3958 	void                            *parameter,
3959 	thread_t                        new_thread)
3960 {
3961 	ast_t reason = AST_NONE;
3962 
3963 	if ((self->state & TH_IDLE) == 0) {
3964 		reason = AST_HANDOFF;
3965 	}
3966 
3967 	/*
3968 	 * If this thread hadn't been setrun'ed, it
3969 	 * might not have a chosen processor, so give it one
3970 	 */
3971 	if (new_thread->chosen_processor == NULL) {
3972 		new_thread->chosen_processor = current_processor();
3973 	}
3974 
3975 	self->continuation = continuation;
3976 	self->parameter = parameter;
3977 
3978 	while (!thread_invoke(self, new_thread, reason)) {
3979 		/* the handoff failed, so we have to fall back to the normal block path */
3980 		processor_t processor = current_processor();
3981 
3982 		reason = AST_NONE;
3983 
3984 		thread_lock(self);
3985 		new_thread = thread_select(self, processor, &reason);
3986 		thread_unlock(self);
3987 	}
3988 
3989 	return self->wait_result;
3990 }
3991 
3992 /*
3993  *	thread_continue:
3994  *
3995  *	Called at splsched when a thread first receives
3996  *	a new stack after a continuation.
3997  *
3998  *	Called with THREAD_NULL as the old thread when
3999  *	invoked by machine_load_context.
4000  */
4001 void
thread_continue(thread_t thread)4002 thread_continue(
4003 	thread_t        thread)
4004 {
4005 	thread_t                self = current_thread();
4006 	thread_continue_t       continuation;
4007 	void                    *parameter;
4008 
4009 	DTRACE_SCHED(on__cpu);
4010 
4011 	continuation = self->continuation;
4012 	parameter = self->parameter;
4013 
4014 	assert(continuation != NULL);
4015 
4016 #if KPERF
4017 	kperf_on_cpu(self, continuation, NULL);
4018 #endif
4019 
4020 	thread_dispatch(thread, self);
4021 
4022 	self->continuation = self->parameter = NULL;
4023 
4024 #if SCHED_HYGIENE_DEBUG
4025 	/* Reset interrupt-masked spin debugging timeout */
4026 	ml_spin_debug_clear(self);
4027 #endif
4028 
4029 	TLOG(1, "thread_continue: calling call_continuation\n");
4030 
4031 	boolean_t enable_interrupts = TRUE;
4032 
4033 	/* bootstrap thread, idle thread need to stay interrupts-disabled */
4034 	if (thread == THREAD_NULL || (self->state & TH_IDLE)) {
4035 		enable_interrupts = FALSE;
4036 	}
4037 
4038 #if KASAN_TBI
4039 	kasan_unpoison_stack(self->kernel_stack, kernel_stack_size);
4040 #endif /* KASAN_TBI */
4041 
4042 
4043 	call_continuation(continuation, parameter, self->wait_result, enable_interrupts);
4044 	/*NOTREACHED*/
4045 }
4046 
4047 void
thread_quantum_init(thread_t thread,uint64_t now)4048 thread_quantum_init(thread_t thread, uint64_t now)
4049 {
4050 	uint64_t new_quantum = 0;
4051 
4052 	switch (thread->sched_mode) {
4053 	case TH_MODE_REALTIME:
4054 		new_quantum = thread->realtime.computation;
4055 		new_quantum = MIN(new_quantum, max_unsafe_rt_computation);
4056 		break;
4057 
4058 	case TH_MODE_FIXED:
4059 		new_quantum = SCHED(initial_quantum_size)(thread);
4060 		new_quantum = MIN(new_quantum, max_unsafe_fixed_computation);
4061 		break;
4062 
4063 	default:
4064 		new_quantum = SCHED(initial_quantum_size)(thread);
4065 		break;
4066 	}
4067 
4068 	if (cpulimit_affects_quantum) {
4069 		const uint64_t cpulimit_remaining = thread_cpulimit_remaining(now);
4070 
4071 		/*
4072 		 * If there's no remaining CPU time, the ledger system will
4073 		 * notice and put the thread to sleep.
4074 		 */
4075 		if (cpulimit_remaining > 0) {
4076 			new_quantum = MIN(new_quantum, cpulimit_remaining);
4077 		}
4078 	}
4079 
4080 	assert3u(new_quantum, <, UINT32_MAX);
4081 	assert3u(new_quantum, >, 0);
4082 
4083 	thread->quantum_remaining = (uint32_t)new_quantum;
4084 }
4085 
4086 uint32_t
sched_timeshare_initial_quantum_size(thread_t thread)4087 sched_timeshare_initial_quantum_size(thread_t thread)
4088 {
4089 	if ((thread != THREAD_NULL) && thread->th_sched_bucket == TH_BUCKET_SHARE_BG) {
4090 		return bg_quantum;
4091 	} else {
4092 		return std_quantum;
4093 	}
4094 }
4095 
4096 /*
4097  *	run_queue_init:
4098  *
4099  *	Initialize a run queue before first use.
4100  */
4101 void
run_queue_init(run_queue_t rq)4102 run_queue_init(
4103 	run_queue_t             rq)
4104 {
4105 	rq->highq = NOPRI;
4106 	for (u_int i = 0; i < BITMAP_LEN(NRQS); i++) {
4107 		rq->bitmap[i] = 0;
4108 	}
4109 	rq->urgency = rq->count = 0;
4110 	for (int i = 0; i < NRQS; i++) {
4111 		circle_queue_init(&rq->queues[i]);
4112 	}
4113 }
4114 
4115 /*
4116  *	run_queue_dequeue:
4117  *
4118  *	Perform a dequeue operation on a run queue,
4119  *	and return the resulting thread.
4120  *
4121  *	The run queue must be locked (see thread_run_queue_remove()
4122  *	for more info), and not empty.
4123  */
4124 thread_t
run_queue_dequeue(run_queue_t rq,sched_options_t options)4125 run_queue_dequeue(
4126 	run_queue_t     rq,
4127 	sched_options_t options)
4128 {
4129 	thread_t        thread;
4130 	circle_queue_t  queue = &rq->queues[rq->highq];
4131 
4132 	if (options & SCHED_HEADQ) {
4133 		thread = cqe_dequeue_head(queue, struct thread, runq_links);
4134 	} else {
4135 		thread = cqe_dequeue_tail(queue, struct thread, runq_links);
4136 	}
4137 
4138 	assert(thread != THREAD_NULL);
4139 	assert_thread_magic(thread);
4140 
4141 	thread_clear_runq(thread);
4142 	SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
4143 	rq->count--;
4144 	if (SCHED(priority_is_urgent)(rq->highq)) {
4145 		rq->urgency--; assert(rq->urgency >= 0);
4146 	}
4147 	if (circle_queue_empty(queue)) {
4148 		bitmap_clear(rq->bitmap, rq->highq);
4149 		rq->highq = bitmap_first(rq->bitmap, NRQS);
4150 	}
4151 
4152 	return thread;
4153 }
4154 
4155 /*
4156  *	run_queue_enqueue:
4157  *
4158  *	Perform a enqueue operation on a run queue.
4159  *
4160  *	The run queue must be locked (see thread_run_queue_remove()
4161  *	for more info).
4162  */
4163 boolean_t
run_queue_enqueue(run_queue_t rq,thread_t thread,sched_options_t options)4164 run_queue_enqueue(
4165 	run_queue_t      rq,
4166 	thread_t         thread,
4167 	sched_options_t  options)
4168 {
4169 	circle_queue_t  queue = &rq->queues[thread->sched_pri];
4170 	boolean_t       result = FALSE;
4171 
4172 	assert_thread_magic(thread);
4173 
4174 	if (circle_queue_empty(queue)) {
4175 		circle_enqueue_tail(queue, &thread->runq_links);
4176 
4177 		rq_bitmap_set(rq->bitmap, thread->sched_pri);
4178 		if (thread->sched_pri > rq->highq) {
4179 			rq->highq = thread->sched_pri;
4180 			result = TRUE;
4181 		}
4182 	} else {
4183 		if (options & SCHED_TAILQ) {
4184 			circle_enqueue_tail(queue, &thread->runq_links);
4185 		} else {
4186 			circle_enqueue_head(queue, &thread->runq_links);
4187 		}
4188 	}
4189 	if (SCHED(priority_is_urgent)(thread->sched_pri)) {
4190 		rq->urgency++;
4191 	}
4192 	SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
4193 	rq->count++;
4194 
4195 	return result;
4196 }
4197 
4198 /*
4199  *	run_queue_remove:
4200  *
4201  *	Remove a specific thread from a runqueue.
4202  *
4203  *	The run queue must be locked.
4204  */
4205 void
run_queue_remove(run_queue_t rq,thread_t thread)4206 run_queue_remove(
4207 	run_queue_t    rq,
4208 	thread_t       thread)
4209 {
4210 	circle_queue_t  queue = &rq->queues[thread->sched_pri];
4211 
4212 	thread_assert_runq_nonnull(thread);
4213 	assert_thread_magic(thread);
4214 
4215 	circle_dequeue(queue, &thread->runq_links);
4216 	SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
4217 	rq->count--;
4218 	if (SCHED(priority_is_urgent)(thread->sched_pri)) {
4219 		rq->urgency--; assert(rq->urgency >= 0);
4220 	}
4221 
4222 	if (circle_queue_empty(queue)) {
4223 		/* update run queue status */
4224 		bitmap_clear(rq->bitmap, thread->sched_pri);
4225 		rq->highq = bitmap_first(rq->bitmap, NRQS);
4226 	}
4227 
4228 	thread_clear_runq(thread);
4229 }
4230 
4231 /*
4232  *      run_queue_peek
4233  *
4234  *      Peek at the runq and return the highest
4235  *      priority thread from the runq.
4236  *
4237  *	The run queue must be locked.
4238  */
4239 thread_t
run_queue_peek(run_queue_t rq)4240 run_queue_peek(
4241 	run_queue_t    rq)
4242 {
4243 	if (rq->count > 0) {
4244 		circle_queue_t queue = &rq->queues[rq->highq];
4245 		thread_t thread = cqe_queue_first(queue, struct thread, runq_links);
4246 		assert_thread_magic(thread);
4247 		return thread;
4248 	} else {
4249 		return THREAD_NULL;
4250 	}
4251 }
4252 
4253 static bool
rt_runq_enqueue(rt_queue_t rt_run_queue,thread_t thread,processor_t processor)4254 rt_runq_enqueue(rt_queue_t rt_run_queue, thread_t thread, processor_t processor)
4255 {
4256 	int pri = thread->sched_pri;
4257 	assert((pri >= BASEPRI_RTQUEUES) && (pri <= MAXPRI));
4258 	int i = pri - BASEPRI_RTQUEUES;
4259 	rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i];
4260 	bitmap_t *map = rt_run_queue->bitmap;
4261 
4262 	bitmap_set(map, i);
4263 
4264 	queue_t     queue       = &rt_runq->pri_queue;
4265 	uint64_t    deadline    = thread->realtime.deadline;
4266 	bool        preempt     = false;
4267 	bool        earliest    = false;
4268 
4269 	if (queue_empty(queue)) {
4270 		enqueue_tail(queue, &thread->runq_links);
4271 		preempt = true;
4272 		earliest = true;
4273 		rt_runq->pri_earliest_deadline = deadline;
4274 		rt_runq->pri_constraint = thread->realtime.constraint;
4275 	} else {
4276 		/* Insert into rt_runq in thread deadline order */
4277 		queue_entry_t iter;
4278 		qe_foreach(iter, queue) {
4279 			thread_t iter_thread = qe_element(iter, struct thread, runq_links);
4280 			assert_thread_magic(iter_thread);
4281 
4282 			if (deadline < iter_thread->realtime.deadline) {
4283 				if (iter == queue_first(queue)) {
4284 					preempt = true;
4285 					earliest = true;
4286 					rt_runq->pri_earliest_deadline = deadline;
4287 					rt_runq->pri_constraint = thread->realtime.constraint;
4288 				}
4289 				insque(&thread->runq_links, queue_prev(iter));
4290 				break;
4291 			} else if (iter == queue_last(queue)) {
4292 				enqueue_tail(queue, &thread->runq_links);
4293 				break;
4294 			}
4295 		}
4296 	}
4297 	if (earliest && (deadline < os_atomic_load_wide(&rt_run_queue->earliest_deadline, relaxed))) {
4298 		os_atomic_store_wide(&rt_run_queue->earliest_deadline, deadline, relaxed);
4299 		os_atomic_store(&rt_run_queue->constraint, thread->realtime.constraint, relaxed);
4300 		os_atomic_store(&rt_run_queue->ed_index, pri - BASEPRI_RTQUEUES, relaxed);
4301 	}
4302 
4303 	SCHED_STATS_RUNQ_CHANGE(&rt_run_queue->runq_stats, os_atomic_load(&rt_run_queue->count, relaxed));
4304 	rt_runq->pri_count++;
4305 	os_atomic_inc(&rt_run_queue->count, relaxed);
4306 
4307 	thread_set_runq_locked(thread, processor);
4308 
4309 	CHECK_RT_RUNQ_CONSISTENCY(rt_run_queue, thread);
4310 
4311 	return preempt;
4312 }
4313 
4314 static thread_t
rt_runq_dequeue(rt_queue_t rt_run_queue)4315 rt_runq_dequeue(rt_queue_t rt_run_queue)
4316 {
4317 	bitmap_t *map = rt_run_queue->bitmap;
4318 	int i = bitmap_first(map, NRTQS);
4319 	assert((i >= 0) && (i < NRTQS));
4320 
4321 	rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i];
4322 
4323 	if (!sched_rt_runq_strict_priority) {
4324 		int ed_index = os_atomic_load(&rt_run_queue->ed_index, relaxed);
4325 		if (ed_index != i) {
4326 			assert((ed_index >= 0) && (ed_index < NRTQS));
4327 			rt_queue_pri_t *ed_runq = &rt_run_queue->rt_queue_pri[ed_index];
4328 
4329 			thread_t ed_thread = qe_queue_first(&ed_runq->pri_queue, struct thread, runq_links);
4330 			thread_t hi_thread = qe_queue_first(&rt_runq->pri_queue, struct thread, runq_links);
4331 
4332 			if (ed_thread->realtime.computation + hi_thread->realtime.computation + rt_deadline_epsilon < hi_thread->realtime.constraint) {
4333 				/* choose the earliest deadline thread */
4334 				rt_runq = ed_runq;
4335 				i = ed_index;
4336 			}
4337 		}
4338 	}
4339 
4340 	assert(rt_runq->pri_count > 0);
4341 	uint64_t earliest_deadline = RT_DEADLINE_NONE;
4342 	uint32_t constraint = RT_CONSTRAINT_NONE;
4343 	int ed_index = NOPRI;
4344 	thread_t new_thread = qe_dequeue_head(&rt_runq->pri_queue, struct thread, runq_links);
4345 	SCHED_STATS_RUNQ_CHANGE(&rt_run_queue->runq_stats, os_atomic_load(&rt_run_queue->count, relaxed));
4346 	if (--rt_runq->pri_count > 0) {
4347 		thread_t next_rt = qe_queue_first(&rt_runq->pri_queue, struct thread, runq_links);
4348 		assert(next_rt != THREAD_NULL);
4349 		earliest_deadline = next_rt->realtime.deadline;
4350 		constraint = next_rt->realtime.constraint;
4351 		ed_index = i;
4352 	} else {
4353 		bitmap_clear(map, i);
4354 	}
4355 	rt_runq->pri_earliest_deadline = earliest_deadline;
4356 	rt_runq->pri_constraint = constraint;
4357 
4358 	for (i = bitmap_first(map, NRTQS); i >= 0; i = bitmap_next(map, i)) {
4359 		rt_runq = &rt_run_queue->rt_queue_pri[i];
4360 		if (rt_runq->pri_earliest_deadline < earliest_deadline) {
4361 			earliest_deadline = rt_runq->pri_earliest_deadline;
4362 			constraint = rt_runq->pri_constraint;
4363 			ed_index = i;
4364 		}
4365 	}
4366 	os_atomic_store_wide(&rt_run_queue->earliest_deadline, earliest_deadline, relaxed);
4367 	os_atomic_store(&rt_run_queue->constraint, constraint, relaxed);
4368 	os_atomic_store(&rt_run_queue->ed_index, ed_index, relaxed);
4369 	os_atomic_dec(&rt_run_queue->count, relaxed);
4370 
4371 	thread_clear_runq(new_thread);
4372 
4373 	CHECK_RT_RUNQ_CONSISTENCY(rt_run_queue, THREAD_NULL);
4374 
4375 	return new_thread;
4376 }
4377 
4378 static thread_t
rt_runq_first(rt_queue_t rt_run_queue)4379 rt_runq_first(rt_queue_t rt_run_queue)
4380 {
4381 	bitmap_t *map = rt_run_queue->bitmap;
4382 	int i = bitmap_first(map, NRTQS);
4383 	if (i < 0) {
4384 		return THREAD_NULL;
4385 	}
4386 	rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i];
4387 	thread_t next_rt = qe_queue_first(&rt_runq->pri_queue, struct thread, runq_links);
4388 
4389 	return next_rt;
4390 }
4391 
4392 static void
rt_runq_remove(rt_queue_t rt_run_queue,thread_t thread)4393 rt_runq_remove(rt_queue_t rt_run_queue, thread_t thread)
4394 {
4395 	CHECK_RT_RUNQ_CONSISTENCY(rt_run_queue, thread);
4396 
4397 	int pri = thread->sched_pri;
4398 	assert((pri >= BASEPRI_RTQUEUES) && (pri <= MAXPRI));
4399 	int i = pri - BASEPRI_RTQUEUES;
4400 	rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i];
4401 	bitmap_t *map = rt_run_queue->bitmap;
4402 
4403 	assert(rt_runq->pri_count > 0);
4404 	uint64_t earliest_deadline = RT_DEADLINE_NONE;
4405 	uint32_t constraint = RT_CONSTRAINT_NONE;
4406 	int ed_index = NOPRI;
4407 	remqueue(&thread->runq_links);
4408 	SCHED_STATS_RUNQ_CHANGE(&rt_run_queue->runq_stats, os_atomic_load(&rt_run_queue->count, relaxed));
4409 	if (--rt_runq->pri_count > 0) {
4410 		thread_t next_rt = qe_queue_first(&rt_runq->pri_queue, struct thread, runq_links);
4411 		earliest_deadline = next_rt->realtime.deadline;
4412 		constraint = next_rt->realtime.constraint;
4413 		ed_index = i;
4414 	} else {
4415 		bitmap_clear(map, i);
4416 	}
4417 	rt_runq->pri_earliest_deadline = earliest_deadline;
4418 	rt_runq->pri_constraint = constraint;
4419 
4420 	for (i = bitmap_first(map, NRTQS); i >= 0; i = bitmap_next(map, i)) {
4421 		rt_runq = &rt_run_queue->rt_queue_pri[i];
4422 		if (rt_runq->pri_earliest_deadline < earliest_deadline) {
4423 			earliest_deadline = rt_runq->pri_earliest_deadline;
4424 			constraint = rt_runq->pri_constraint;
4425 			ed_index = i;
4426 		}
4427 	}
4428 	os_atomic_store_wide(&rt_run_queue->earliest_deadline, earliest_deadline, relaxed);
4429 	os_atomic_store(&rt_run_queue->constraint, constraint, relaxed);
4430 	os_atomic_store(&rt_run_queue->ed_index, ed_index, relaxed);
4431 	os_atomic_dec(&rt_run_queue->count, relaxed);
4432 
4433 	thread_clear_runq_locked(thread);
4434 
4435 	CHECK_RT_RUNQ_CONSISTENCY(rt_run_queue, THREAD_NULL);
4436 }
4437 
4438 rt_queue_t
sched_rtlocal_runq(processor_set_t pset)4439 sched_rtlocal_runq(processor_set_t pset)
4440 {
4441 	return &pset->rt_runq;
4442 }
4443 
4444 void
sched_rtlocal_init(processor_set_t pset)4445 sched_rtlocal_init(processor_set_t pset)
4446 {
4447 	pset_rt_init(pset);
4448 }
4449 
4450 void
sched_rtlocal_queue_shutdown(processor_t processor)4451 sched_rtlocal_queue_shutdown(processor_t processor)
4452 {
4453 	processor_set_t pset = processor->processor_set;
4454 	thread_t        thread;
4455 	queue_head_t    tqueue;
4456 
4457 	pset_lock(pset);
4458 
4459 	/* We only need to migrate threads if this is the last active or last recommended processor in the pset */
4460 	if (bit_count(pset_available_cpumap(pset)) > 0) {
4461 		pset_unlock(pset);
4462 		return;
4463 	}
4464 
4465 	queue_init(&tqueue);
4466 
4467 	while (rt_runq_count(pset) > 0) {
4468 		thread = rt_runq_dequeue(&pset->rt_runq);
4469 		enqueue_tail(&tqueue, &thread->runq_links);
4470 	}
4471 	sched_update_pset_load_average(pset, 0);
4472 	pset_update_rt_stealable_state(pset);
4473 	pset_unlock(pset);
4474 
4475 	qe_foreach_element_safe(thread, &tqueue, runq_links) {
4476 		remqueue(&thread->runq_links);
4477 
4478 		thread_lock(thread);
4479 
4480 		thread_setrun(thread, SCHED_TAILQ);
4481 
4482 		thread_unlock(thread);
4483 	}
4484 }
4485 
4486 /* Assumes RT lock is not held, and acquires splsched/rt_lock itself */
4487 void
sched_rtlocal_runq_scan(sched_update_scan_context_t scan_context)4488 sched_rtlocal_runq_scan(sched_update_scan_context_t scan_context)
4489 {
4490 	thread_t        thread;
4491 
4492 	pset_node_t node = &pset_node0;
4493 	processor_set_t pset = node->psets;
4494 
4495 	spl_t s = splsched();
4496 	do {
4497 		while (pset != NULL) {
4498 			pset_lock(pset);
4499 
4500 			bitmap_t *map = pset->rt_runq.bitmap;
4501 			for (int i = bitmap_first(map, NRTQS); i >= 0; i = bitmap_next(map, i)) {
4502 				rt_queue_pri_t *rt_runq = &pset->rt_runq.rt_queue_pri[i];
4503 
4504 				qe_foreach_element_safe(thread, &rt_runq->pri_queue, runq_links) {
4505 					if (thread->last_made_runnable_time < scan_context->earliest_rt_make_runnable_time) {
4506 						scan_context->earliest_rt_make_runnable_time = thread->last_made_runnable_time;
4507 					}
4508 				}
4509 			}
4510 
4511 			pset_unlock(pset);
4512 
4513 			pset = pset->pset_list;
4514 		}
4515 	} while (((node = node->node_list) != NULL) && ((pset = node->psets) != NULL));
4516 	splx(s);
4517 }
4518 
4519 int64_t
sched_rtlocal_runq_count_sum(void)4520 sched_rtlocal_runq_count_sum(void)
4521 {
4522 	pset_node_t node = &pset_node0;
4523 	processor_set_t pset = node->psets;
4524 	int64_t count = 0;
4525 
4526 	do {
4527 		while (pset != NULL) {
4528 			count += pset->rt_runq.runq_stats.count_sum;
4529 
4530 			pset = pset->pset_list;
4531 		}
4532 	} while (((node = node->node_list) != NULL) && ((pset = node->psets) != NULL));
4533 
4534 	return count;
4535 }
4536 
4537 /*
4538  * Called with stealing_pset locked and
4539  * returns with stealing_pset locked
4540  * but the lock will have been dropped
4541  * if a thread is returned.
4542  */
4543 thread_t
sched_rtlocal_steal_thread(processor_set_t stealing_pset,uint64_t earliest_deadline)4544 sched_rtlocal_steal_thread(processor_set_t stealing_pset, uint64_t earliest_deadline)
4545 {
4546 	if (!sched_allow_rt_steal) {
4547 		return THREAD_NULL;
4548 	}
4549 	pset_map_t pset_map = stealing_pset->node->pset_map;
4550 
4551 	bit_clear(pset_map, stealing_pset->pset_id);
4552 
4553 	processor_set_t pset = stealing_pset;
4554 
4555 	processor_set_t target_pset;
4556 	uint64_t target_deadline;
4557 
4558 retry:
4559 	target_pset = NULL;
4560 	target_deadline = earliest_deadline - rt_deadline_epsilon;
4561 
4562 	for (int pset_id = lsb_first(pset_map); pset_id >= 0; pset_id = lsb_next(pset_map, pset_id)) {
4563 		processor_set_t nset = pset_array[pset_id];
4564 
4565 		/*
4566 		 * During startup, while pset_array[] and node->pset_map are still being initialized,
4567 		 * the update to pset_map may become visible to this cpu before the update to pset_array[].
4568 		 * It would be good to avoid inserting a memory barrier here that is only needed during startup,
4569 		 * so just check nset is not NULL instead.
4570 		 */
4571 		if (nset && (nset->stealable_rt_threads_earliest_deadline < target_deadline)) {
4572 			target_deadline = nset->stealable_rt_threads_earliest_deadline;
4573 			target_pset = nset;
4574 		}
4575 	}
4576 
4577 	if (target_pset != NULL) {
4578 		pset = change_locked_pset(pset, target_pset);
4579 		if (pset->stealable_rt_threads_earliest_deadline <= target_deadline) {
4580 			thread_t new_thread = rt_runq_dequeue(&pset->rt_runq);
4581 			pset_update_rt_stealable_state(pset);
4582 			KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_STEAL) | DBG_FUNC_NONE, (uintptr_t)thread_tid(new_thread), pset->pset_id, pset->cpu_set_low, 0);
4583 
4584 			pset = change_locked_pset(pset, stealing_pset);
4585 			return new_thread;
4586 		}
4587 		pset = change_locked_pset(pset, stealing_pset);
4588 		earliest_deadline = rt_runq_earliest_deadline(pset);
4589 		goto retry;
4590 	}
4591 
4592 	pset = change_locked_pset(pset, stealing_pset);
4593 	return THREAD_NULL;
4594 }
4595 
4596 /*
4597  * pset is locked
4598  */
4599 thread_t
sched_rt_choose_thread(processor_set_t pset)4600 sched_rt_choose_thread(processor_set_t pset)
4601 {
4602 	processor_t processor = current_processor();
4603 
4604 	if (SCHED(steal_thread_enabled)(pset)) {
4605 		do {
4606 			bool spill_pending = bit_clear_if_set(pset->rt_pending_spill_cpu_mask, processor->cpu_id);
4607 			if (spill_pending) {
4608 				KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_SIGNAL_SPILL) | DBG_FUNC_END, processor->cpu_id, pset->rt_pending_spill_cpu_mask, 0, 2);
4609 			}
4610 			thread_t new_thread = SCHED(rt_steal_thread)(pset, rt_runq_earliest_deadline(pset));
4611 			if (new_thread != THREAD_NULL) {
4612 				if (bit_clear_if_set(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) {
4613 					KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_SIGNAL_SPILL) | DBG_FUNC_END, processor->cpu_id, pset->rt_pending_spill_cpu_mask, 0, 3);
4614 				}
4615 				return new_thread;
4616 			}
4617 		} while (bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id));
4618 	}
4619 
4620 	if (bit_clear_if_set(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) {
4621 		KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_SIGNAL_SPILL) | DBG_FUNC_END, processor->cpu_id, pset->rt_pending_spill_cpu_mask, 0, 4);
4622 	}
4623 
4624 	if (rt_runq_count(pset) > 0) {
4625 		thread_t new_thread = rt_runq_dequeue(SCHED(rt_runq)(pset));
4626 		assert(new_thread != THREAD_NULL);
4627 		pset_update_rt_stealable_state(pset);
4628 		return new_thread;
4629 	}
4630 
4631 	return THREAD_NULL;
4632 }
4633 
4634 /*
4635  *	realtime_queue_insert:
4636  *
4637  *	Enqueue a thread for realtime execution.
4638  */
4639 static bool
realtime_queue_insert(processor_t processor,processor_set_t pset,thread_t thread)4640 realtime_queue_insert(processor_t processor, processor_set_t pset, thread_t thread)
4641 {
4642 	pset_assert_locked(pset);
4643 
4644 	bool preempt = rt_runq_enqueue(SCHED(rt_runq)(pset), thread, processor);
4645 	pset_update_rt_stealable_state(pset);
4646 
4647 	return preempt;
4648 }
4649 
4650 /*
4651  *	realtime_setrun:
4652  *
4653  *	Dispatch a thread for realtime execution.
4654  *
4655  *	Thread must be locked.  Associated pset must
4656  *	be locked, and is returned unlocked.
4657  */
4658 static void
realtime_setrun(processor_t chosen_processor,thread_t thread)4659 realtime_setrun(
4660 	processor_t                     chosen_processor,
4661 	thread_t                        thread)
4662 {
4663 	processor_set_t pset = chosen_processor->processor_set;
4664 	pset_assert_locked(pset);
4665 	bool pset_is_locked = true;
4666 
4667 	int n_backup = 0;
4668 
4669 	if (thread->realtime.constraint <= rt_constraint_threshold) {
4670 		n_backup = sched_rt_n_backup_processors;
4671 	}
4672 	assert((n_backup >= 0) && (n_backup <= SCHED_MAX_BACKUP_PROCESSORS));
4673 
4674 	int existing_backups = bit_count(pset->pending_AST_URGENT_cpu_mask) - rt_runq_count(pset);
4675 	if (existing_backups > 0) {
4676 		n_backup = n_backup - existing_backups;
4677 		if (n_backup < 0) {
4678 			n_backup = 0;
4679 		}
4680 	}
4681 
4682 	sched_ipi_type_t ipi_type[SCHED_MAX_BACKUP_PROCESSORS + 1] = {};
4683 	processor_t ipi_processor[SCHED_MAX_BACKUP_PROCESSORS + 1] = {};
4684 
4685 	thread->chosen_processor = chosen_processor;
4686 
4687 	/* <rdar://problem/15102234> */
4688 	assert(thread->bound_processor == PROCESSOR_NULL);
4689 
4690 	realtime_queue_insert(chosen_processor, pset, thread);
4691 
4692 	processor_t processor = chosen_processor;
4693 
4694 	int count = 0;
4695 	for (int i = 0; i <= n_backup; i++) {
4696 		if (i == 0) {
4697 			ipi_type[i] = SCHED_IPI_NONE;
4698 			ipi_processor[i] = processor;
4699 			count++;
4700 
4701 			ast_t preempt = AST_NONE;
4702 			if (thread->sched_pri > processor->current_pri) {
4703 				preempt = (AST_PREEMPT | AST_URGENT);
4704 			} else if (thread->sched_pri == processor->current_pri) {
4705 				if (deadline_add(thread->realtime.deadline, rt_deadline_epsilon) < processor->deadline) {
4706 					preempt = (AST_PREEMPT | AST_URGENT);
4707 				}
4708 			}
4709 
4710 			if (preempt != AST_NONE) {
4711 				if (processor->state == PROCESSOR_IDLE) {
4712 					if (processor == current_processor()) {
4713 						pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING);
4714 						ast_on(preempt);
4715 
4716 						if ((preempt & AST_URGENT) == AST_URGENT) {
4717 							if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
4718 								KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
4719 								    processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 1);
4720 							}
4721 						}
4722 
4723 						if ((preempt & AST_PREEMPT) == AST_PREEMPT) {
4724 							bit_set(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
4725 						}
4726 					} else {
4727 						ipi_type[i] = sched_ipi_action(processor, thread, SCHED_IPI_EVENT_RT_PREEMPT);
4728 					}
4729 				} else if (processor->state == PROCESSOR_DISPATCHING) {
4730 					if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
4731 						KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
4732 						    processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 2);
4733 					}
4734 				} else {
4735 					if (processor == current_processor()) {
4736 						ast_on(preempt);
4737 
4738 						if ((preempt & AST_URGENT) == AST_URGENT) {
4739 							if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
4740 								KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
4741 								    processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 3);
4742 							}
4743 						}
4744 
4745 						if ((preempt & AST_PREEMPT) == AST_PREEMPT) {
4746 							bit_set(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
4747 						}
4748 					} else {
4749 						ipi_type[i] = sched_ipi_action(processor, thread, SCHED_IPI_EVENT_RT_PREEMPT);
4750 					}
4751 				}
4752 			} else {
4753 				/* Selected processor was too busy, just keep thread enqueued and let other processors drain it naturally. */
4754 			}
4755 		} else {
4756 			if (!pset_is_locked) {
4757 				pset_lock(pset);
4758 			}
4759 			ipi_type[i] = SCHED_IPI_NONE;
4760 			ipi_processor[i] = PROCESSOR_NULL;
4761 			pset_is_locked = !choose_next_rt_processor_for_IPI(pset, chosen_processor, false, &ipi_processor[i], &ipi_type[i]);
4762 			if (ipi_processor[i] == PROCESSOR_NULL) {
4763 				break;
4764 			}
4765 			count++;
4766 
4767 			KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_NEXT_PROCESSOR) | DBG_FUNC_NONE,
4768 			    ipi_processor[i]->cpu_id, ipi_processor[i]->state, backup, 1);
4769 #if defined(__x86_64__)
4770 #define p_is_good(p) (((p)->processor_primary == (p)) && ((sched_avoid_cpu0 != 1) || ((p)->cpu_id != 0)))
4771 			if (n_backup == SCHED_DEFAULT_BACKUP_PROCESSORS_SMT) {
4772 				processor_t p0 = ipi_processor[0];
4773 				processor_t p1 = ipi_processor[1];
4774 				assert(p0 && p1);
4775 				if (p_is_good(p0) && p_is_good(p1)) {
4776 					/*
4777 					 * Both the chosen processor and the first backup are non-cpu0 primaries,
4778 					 * so there is no need for a 2nd backup processor.
4779 					 */
4780 					break;
4781 				}
4782 			}
4783 #endif
4784 		}
4785 	}
4786 
4787 	if (pset_is_locked) {
4788 		pset_unlock(pset);
4789 	}
4790 
4791 	assert((count > 0) && (count <= (n_backup + 1)));
4792 	for (int i = 0; i < count; i++) {
4793 		assert(ipi_processor[i] != PROCESSOR_NULL);
4794 		sched_ipi_perform(ipi_processor[i], ipi_type[i]);
4795 	}
4796 }
4797 
4798 
4799 sched_ipi_type_t
sched_ipi_deferred_policy(processor_set_t pset,processor_t dst,thread_t thread,__unused sched_ipi_event_t event)4800 sched_ipi_deferred_policy(processor_set_t pset, processor_t dst,
4801     thread_t thread, __unused sched_ipi_event_t event)
4802 {
4803 #if defined(CONFIG_SCHED_DEFERRED_AST)
4804 #if CONFIG_THREAD_GROUPS
4805 	if (thread) {
4806 		struct thread_group *tg = thread_group_get(thread);
4807 		if (thread_group_uses_immediate_ipi(tg)) {
4808 			return SCHED_IPI_IMMEDIATE;
4809 		}
4810 	}
4811 #endif /* CONFIG_THREAD_GROUPS */
4812 	if (!bit_test(pset->pending_deferred_AST_cpu_mask, dst->cpu_id)) {
4813 		return SCHED_IPI_DEFERRED;
4814 	}
4815 #else /* CONFIG_SCHED_DEFERRED_AST */
4816 	(void) thread;
4817 	panic("Request for deferred IPI on an unsupported platform; pset: %p CPU: %d", pset, dst->cpu_id);
4818 #endif /* CONFIG_SCHED_DEFERRED_AST */
4819 	return SCHED_IPI_NONE;
4820 }
4821 
4822 sched_ipi_type_t
sched_ipi_action(processor_t dst,thread_t thread,sched_ipi_event_t event)4823 sched_ipi_action(processor_t dst, thread_t thread, sched_ipi_event_t event)
4824 {
4825 	sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
4826 	assert(dst != NULL);
4827 
4828 	processor_set_t pset = dst->processor_set;
4829 	if (current_processor() == dst) {
4830 		return SCHED_IPI_NONE;
4831 	}
4832 
4833 	bool dst_idle = (dst->state == PROCESSOR_IDLE);
4834 	if (dst_idle) {
4835 		pset_update_processor_state(pset, dst, PROCESSOR_DISPATCHING);
4836 	}
4837 
4838 	ipi_type = SCHED(ipi_policy)(dst, thread, dst_idle, event);
4839 	switch (ipi_type) {
4840 	case SCHED_IPI_NONE:
4841 		return SCHED_IPI_NONE;
4842 #if defined(CONFIG_SCHED_DEFERRED_AST)
4843 	case SCHED_IPI_DEFERRED:
4844 		bit_set(pset->pending_deferred_AST_cpu_mask, dst->cpu_id);
4845 		break;
4846 #endif /* CONFIG_SCHED_DEFERRED_AST */
4847 	default:
4848 		if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, dst->cpu_id)) {
4849 			KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
4850 			    dst->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 4);
4851 		}
4852 		bit_set(pset->pending_AST_PREEMPT_cpu_mask, dst->cpu_id);
4853 		break;
4854 	}
4855 	return ipi_type;
4856 }
4857 
4858 sched_ipi_type_t
sched_ipi_policy(processor_t dst,thread_t thread,boolean_t dst_idle,sched_ipi_event_t event)4859 sched_ipi_policy(processor_t dst, thread_t thread, boolean_t dst_idle, sched_ipi_event_t event)
4860 {
4861 	sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
4862 	boolean_t deferred_ipi_supported = false;
4863 	processor_set_t pset = dst->processor_set;
4864 
4865 #if defined(CONFIG_SCHED_DEFERRED_AST)
4866 	deferred_ipi_supported = true;
4867 #endif /* CONFIG_SCHED_DEFERRED_AST */
4868 
4869 	switch (event) {
4870 	case SCHED_IPI_EVENT_SPILL:
4871 	case SCHED_IPI_EVENT_SMT_REBAL:
4872 	case SCHED_IPI_EVENT_REBALANCE:
4873 	case SCHED_IPI_EVENT_BOUND_THR:
4874 	case SCHED_IPI_EVENT_RT_PREEMPT:
4875 		/*
4876 		 * The RT preempt, spill, SMT rebalance, rebalance and the bound thread
4877 		 * scenarios use immediate IPIs always.
4878 		 */
4879 		ipi_type = dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
4880 		break;
4881 	case SCHED_IPI_EVENT_PREEMPT:
4882 		/* In the preemption case, use immediate IPIs for RT threads */
4883 		if (thread && (thread->sched_pri >= BASEPRI_RTQUEUES)) {
4884 			ipi_type = dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
4885 			break;
4886 		}
4887 
4888 		/*
4889 		 * For Non-RT threads preemption,
4890 		 * If the core is active, use immediate IPIs.
4891 		 * If the core is idle, use deferred IPIs if supported; otherwise immediate IPI.
4892 		 */
4893 		if (deferred_ipi_supported && dst_idle) {
4894 			return sched_ipi_deferred_policy(pset, dst, thread, event);
4895 		}
4896 		ipi_type = dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
4897 		break;
4898 	default:
4899 		panic("Unrecognized scheduler IPI event type %d", event);
4900 	}
4901 	assert(ipi_type != SCHED_IPI_NONE);
4902 	return ipi_type;
4903 }
4904 
4905 void
sched_ipi_perform(processor_t dst,sched_ipi_type_t ipi)4906 sched_ipi_perform(processor_t dst, sched_ipi_type_t ipi)
4907 {
4908 	switch (ipi) {
4909 	case SCHED_IPI_NONE:
4910 		break;
4911 	case SCHED_IPI_IDLE:
4912 		machine_signal_idle(dst);
4913 		break;
4914 	case SCHED_IPI_IMMEDIATE:
4915 		cause_ast_check(dst);
4916 		break;
4917 	case SCHED_IPI_DEFERRED:
4918 		machine_signal_idle_deferred(dst);
4919 		break;
4920 	default:
4921 		panic("Unrecognized scheduler IPI type: %d", ipi);
4922 	}
4923 }
4924 
#if defined(CONFIG_SCHED_TIMESHARE_CORE)

/*
 *	priority_is_urgent:
 *
 *	Return TRUE if the bit for "priority" is set in the
 *	sched_preempt_pri bitmap, FALSE otherwise.
 */
boolean_t
priority_is_urgent(int priority)
{
	if (bitmap_test(sched_preempt_pri, priority)) {
		return TRUE;
	}
	return FALSE;
}

#endif /* CONFIG_SCHED_TIMESHARE_CORE */
4934 
4935 /*
4936  *	processor_setrun:
4937  *
4938  *	Dispatch a thread for execution on a
4939  *	processor.
4940  *
4941  *	Thread must be locked.  Associated pset must
4942  *	be locked, and is returned unlocked.
4943  */
4944 static void
processor_setrun(processor_t processor,thread_t thread,integer_t options)4945 processor_setrun(
4946 	processor_t                     processor,
4947 	thread_t                        thread,
4948 	integer_t                       options)
4949 {
4950 	processor_set_t pset = processor->processor_set;
4951 	pset_assert_locked(pset);
4952 	ast_t preempt = AST_NONE;
4953 	enum { eExitIdle, eInterruptRunning, eDoNothing } ipi_action = eDoNothing;
4954 
4955 	sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
4956 
4957 	thread->chosen_processor = processor;
4958 
4959 	/*
4960 	 *	Set preemption mode.
4961 	 */
4962 #if defined(CONFIG_SCHED_DEFERRED_AST)
4963 	/* TODO: Do we need to care about urgency (see rdar://problem/20136239)? */
4964 #endif
4965 	if (SCHED(priority_is_urgent)(thread->sched_pri) && thread->sched_pri > processor->current_pri) {
4966 		preempt = (AST_PREEMPT | AST_URGENT);
4967 	} else if (processor->current_is_eagerpreempt) {
4968 		preempt = (AST_PREEMPT | AST_URGENT);
4969 	} else if ((thread->sched_mode == TH_MODE_TIMESHARE) && (thread->sched_pri < thread->base_pri)) {
4970 		if (SCHED(priority_is_urgent)(thread->base_pri) && thread->sched_pri > processor->current_pri) {
4971 			preempt = (options & SCHED_PREEMPT)? AST_PREEMPT: AST_NONE;
4972 		} else {
4973 			preempt = AST_NONE;
4974 		}
4975 	} else {
4976 		preempt = (options & SCHED_PREEMPT)? AST_PREEMPT: AST_NONE;
4977 	}
4978 
4979 	if ((options & (SCHED_PREEMPT | SCHED_REBALANCE)) == (SCHED_PREEMPT | SCHED_REBALANCE)) {
4980 		/*
4981 		 * Having gone to the trouble of forcing this thread off a less preferred core,
4982 		 * we should force the preferable core to reschedule immediately to give this
4983 		 * thread a chance to run instead of just sitting on the run queue where
4984 		 * it may just be stolen back by the idle core we just forced it off.
4985 		 */
4986 		preempt |= AST_PREEMPT;
4987 	}
4988 
4989 	SCHED(processor_enqueue)(processor, thread, options);
4990 	sched_update_pset_load_average(pset, 0);
4991 
4992 	if (preempt != AST_NONE) {
4993 		if (processor->state == PROCESSOR_IDLE) {
4994 			ipi_action = eExitIdle;
4995 		} else if (processor->state == PROCESSOR_DISPATCHING) {
4996 			if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
4997 				KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
4998 				    processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 5);
4999 			}
5000 		} else if ((processor->state == PROCESSOR_RUNNING ||
5001 		    processor->state == PROCESSOR_SHUTDOWN) &&
5002 		    (thread->sched_pri >= processor->current_pri)) {
5003 			ipi_action = eInterruptRunning;
5004 		}
5005 	} else {
5006 		/*
5007 		 * New thread is not important enough to preempt what is running, but
5008 		 * special processor states may need special handling
5009 		 */
5010 		if (processor->state == PROCESSOR_SHUTDOWN &&
5011 		    thread->sched_pri >= processor->current_pri) {
5012 			ipi_action = eInterruptRunning;
5013 		} else if (processor->state == PROCESSOR_IDLE) {
5014 			ipi_action = eExitIdle;
5015 		} else if (processor->state == PROCESSOR_DISPATCHING) {
5016 			if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
5017 				KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
5018 				    processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 6);
5019 			}
5020 		}
5021 	}
5022 
5023 	if (ipi_action != eDoNothing) {
5024 		if (processor == current_processor()) {
5025 			if (ipi_action == eExitIdle) {
5026 				pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING);
5027 			}
5028 			if ((preempt = csw_check_locked(processor->active_thread, processor, pset, AST_NONE)) != AST_NONE) {
5029 				ast_on(preempt);
5030 			}
5031 
5032 			if ((preempt & AST_URGENT) == AST_URGENT) {
5033 				if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
5034 					KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
5035 					    processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 7);
5036 				}
5037 			} else {
5038 				if (bit_clear_if_set(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
5039 					KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_END, processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, 0, 7);
5040 				}
5041 			}
5042 
5043 			if ((preempt & AST_PREEMPT) == AST_PREEMPT) {
5044 				bit_set(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
5045 			} else {
5046 				bit_clear(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
5047 			}
5048 		} else {
5049 			sched_ipi_event_t event = (options & SCHED_REBALANCE) ? SCHED_IPI_EVENT_REBALANCE : SCHED_IPI_EVENT_PREEMPT;
5050 			ipi_type = sched_ipi_action(processor, thread, event);
5051 		}
5052 	}
5053 
5054 	pset_unlock(pset);
5055 	sched_ipi_perform(processor, ipi_type);
5056 
5057 	if (ipi_action != eDoNothing && processor == current_processor()) {
5058 		ast_t new_preempt = update_pending_nonurgent_preemption(processor, preempt);
5059 		ast_on(new_preempt);
5060 	}
5061 }
5062 
5063 /*
5064  *	choose_next_pset:
5065  *
5066  *	Return the next sibling pset containing
5067  *	available processors.
5068  *
5069  *	Returns the original pset if none other is
5070  *	suitable.
5071  */
5072 static processor_set_t
choose_next_pset(processor_set_t pset)5073 choose_next_pset(
5074 	processor_set_t         pset)
5075 {
5076 	processor_set_t         nset = pset;
5077 
5078 	do {
5079 		nset = next_pset(nset);
5080 
5081 		/*
5082 		 * Sometimes during startup the pset_map can contain a bit
5083 		 * for a pset that isn't fully published in pset_array because
5084 		 * the pset_map read isn't an acquire load.
5085 		 *
5086 		 * In order to avoid needing an acquire barrier here, just bail
5087 		 * out.
5088 		 */
5089 		if (nset == PROCESSOR_SET_NULL) {
5090 			return pset;
5091 		}
5092 	} while (nset->online_processor_count < 1 && nset != pset);
5093 
5094 	return nset;
5095 }
5096 
5097 /*
5098  *	choose_processor:
5099  *
5100  *	Choose a processor for the thread, beginning at
5101  *	the pset.  Accepts an optional processor hint in
5102  *	the pset.
5103  *
5104  *	Returns a processor, possibly from a different pset.
5105  *
5106  *	The thread must be locked.  The pset must be locked,
5107  *	and the resulting pset is locked on return.
5108  */
5109 processor_t
choose_processor(processor_set_t starting_pset,processor_t processor,thread_t thread)5110 choose_processor(
5111 	processor_set_t         starting_pset,
5112 	processor_t             processor,
5113 	thread_t                thread)
5114 {
5115 	processor_set_t pset = starting_pset;
5116 	processor_set_t nset;
5117 
5118 	assert(thread->sched_pri <= MAXPRI);
5119 
5120 	/*
5121 	 * Prefer the hinted processor, when appropriate.
5122 	 */
5123 
5124 	/* Fold last processor hint from secondary processor to its primary */
5125 	if (processor != PROCESSOR_NULL) {
5126 		processor = processor->processor_primary;
5127 	}
5128 
5129 	/*
5130 	 * Only consult platform layer if pset is active, which
5131 	 * it may not be in some cases when a multi-set system
5132 	 * is going to sleep.
5133 	 */
5134 	if (pset->online_processor_count) {
5135 		if ((processor == PROCESSOR_NULL) || (processor->processor_set == pset && processor->state == PROCESSOR_IDLE)) {
5136 			processor_t mc_processor = machine_choose_processor(pset, processor);
5137 			if (mc_processor != PROCESSOR_NULL) {
5138 				processor = mc_processor->processor_primary;
5139 			}
5140 		}
5141 	}
5142 
5143 	/*
5144 	 * At this point, we may have a processor hint, and we may have
5145 	 * an initial starting pset. If the hint is not in the pset, or
5146 	 * if the hint is for a processor in an invalid state, discard
5147 	 * the hint.
5148 	 */
5149 	if (processor != PROCESSOR_NULL) {
5150 		if (processor->processor_set != pset) {
5151 			processor = PROCESSOR_NULL;
5152 		} else if (!processor->is_recommended) {
5153 			processor = PROCESSOR_NULL;
5154 		} else {
5155 			switch (processor->state) {
5156 			case PROCESSOR_START:
5157 			case PROCESSOR_SHUTDOWN:
5158 			case PROCESSOR_PENDING_OFFLINE:
5159 			case PROCESSOR_OFF_LINE:
5160 				/*
5161 				 * Hint is for a processor that cannot support running new threads.
5162 				 */
5163 				processor = PROCESSOR_NULL;
5164 				break;
5165 			case PROCESSOR_IDLE:
5166 				/*
5167 				 * Hint is for an idle processor. Assume it is no worse than any other
5168 				 * idle processor. The platform layer had an opportunity to provide
5169 				 * the "least cost idle" processor above.
5170 				 */
5171 				if ((thread->sched_pri < BASEPRI_RTQUEUES) || processor_is_fast_track_candidate_for_realtime_thread(pset, processor)) {
5172 					uint64_t idle_primary_map = (pset->cpu_state_map[PROCESSOR_IDLE] & pset->primary_map & pset->recommended_bitmask);
5173 					uint64_t non_avoided_idle_primary_map = idle_primary_map & ~pset->perfcontrol_cpu_migration_bitmask;
5174 					/*
5175 					 * If the rotation bitmask to force a migration is set for this core and there's an idle core that
5176 					 * that needn't be avoided, don't continue running on the same core.
5177 					 */
5178 					if (!(bit_test(processor->processor_set->perfcontrol_cpu_migration_bitmask, processor->cpu_id) && non_avoided_idle_primary_map != 0)) {
5179 						return processor;
5180 					}
5181 				}
5182 				processor = PROCESSOR_NULL;
5183 				break;
5184 			case PROCESSOR_RUNNING:
5185 			case PROCESSOR_DISPATCHING:
5186 				/*
5187 				 * Hint is for an active CPU. This fast-path allows
5188 				 * realtime threads to preempt non-realtime threads
5189 				 * to regain their previous executing processor.
5190 				 */
5191 				if (thread->sched_pri >= BASEPRI_RTQUEUES) {
5192 					if (processor_is_fast_track_candidate_for_realtime_thread(pset, processor)) {
5193 						return processor;
5194 					}
5195 					processor = PROCESSOR_NULL;
5196 				}
5197 
5198 				/* Otherwise, use hint as part of search below */
5199 				break;
5200 			default:
5201 				processor = PROCESSOR_NULL;
5202 				break;
5203 			}
5204 		}
5205 	}
5206 
5207 	/*
5208 	 * Iterate through the processor sets to locate
5209 	 * an appropriate processor. Seed results with
5210 	 * a last-processor hint, if available, so that
5211 	 * a search must find something strictly better
5212 	 * to replace it.
5213 	 *
5214 	 * A primary/secondary pair of SMT processors are
5215 	 * "unpaired" if the primary is busy but its
5216 	 * corresponding secondary is idle (so the physical
5217 	 * core has full use of its resources).
5218 	 */
5219 
5220 	integer_t lowest_priority = MAXPRI + 1;
5221 	integer_t lowest_secondary_priority = MAXPRI + 1;
5222 	integer_t lowest_unpaired_primary_priority = MAXPRI + 1;
5223 	integer_t lowest_idle_secondary_priority = MAXPRI + 1;
5224 	integer_t lowest_count = INT_MAX;
5225 	processor_t lp_processor = PROCESSOR_NULL;
5226 	processor_t lp_unpaired_primary_processor = PROCESSOR_NULL;
5227 	processor_t lp_idle_secondary_processor = PROCESSOR_NULL;
5228 	processor_t lp_paired_secondary_processor = PROCESSOR_NULL;
5229 	processor_t lc_processor = PROCESSOR_NULL;
5230 
5231 	if (processor != PROCESSOR_NULL) {
5232 		/* All other states should be enumerated above. */
5233 		assert(processor->state == PROCESSOR_RUNNING || processor->state == PROCESSOR_DISPATCHING);
5234 		assert(thread->sched_pri < BASEPRI_RTQUEUES);
5235 
5236 		lowest_priority = processor->current_pri;
5237 		lp_processor = processor;
5238 
5239 		lowest_count = SCHED(processor_runq_count)(processor);
5240 		lc_processor = processor;
5241 	}
5242 
5243 	if (thread->sched_pri >= BASEPRI_RTQUEUES) {
5244 		pset_node_t node = pset->node;
5245 		bool include_ast_urgent_pending_cpus = false;
5246 		cpumap_t ast_urgent_pending;
5247 try_again:
5248 		ast_urgent_pending = 0;
5249 		int consider_secondaries = (!pset->is_SMT) || (bit_count(node->pset_map) == 1) || (node->pset_non_rt_primary_map == 0) || include_ast_urgent_pending_cpus;
5250 		for (; consider_secondaries < 2; consider_secondaries++) {
5251 			pset = change_locked_pset(pset, starting_pset);
5252 			do {
5253 				cpumap_t available_map = pset_available_cpumap(pset);
5254 				if (available_map == 0) {
5255 					goto no_available_cpus;
5256 				}
5257 
5258 				processor = choose_processor_for_realtime_thread(pset, PROCESSOR_NULL, consider_secondaries, false);
5259 				if (processor) {
5260 					return processor;
5261 				}
5262 
5263 				if (consider_secondaries) {
5264 					processor = choose_furthest_deadline_processor_for_realtime_thread(pset, thread->sched_pri, thread->realtime.deadline, PROCESSOR_NULL, false, include_ast_urgent_pending_cpus);
5265 					if (processor) {
5266 						/*
5267 						 * Instead of looping through all the psets to find the global
5268 						 * furthest deadline processor, preempt the first candidate found.
5269 						 * The preempted thread will then find any other available far deadline
5270 						 * processors to preempt.
5271 						 */
5272 						return processor;
5273 					}
5274 
5275 					ast_urgent_pending |= pset->pending_AST_URGENT_cpu_mask;
5276 
5277 					if (rt_runq_count(pset) < lowest_count) {
5278 						int cpuid = bit_first(available_map);
5279 						assert(cpuid >= 0);
5280 						lc_processor = processor_array[cpuid];
5281 						lowest_count = rt_runq_count(pset);
5282 					}
5283 				}
5284 
5285 no_available_cpus:
5286 				nset = next_pset(pset);
5287 
5288 				if (nset != starting_pset) {
5289 					pset = change_locked_pset(pset, nset);
5290 				}
5291 			} while (nset != starting_pset);
5292 		}
5293 
5294 		/* Short cut for single pset nodes */
5295 		if (bit_count(node->pset_map) == 1) {
5296 			if (lc_processor) {
5297 				pset_assert_locked(lc_processor->processor_set);
5298 				return lc_processor;
5299 			}
5300 		} else {
5301 			if (ast_urgent_pending && !include_ast_urgent_pending_cpus) {
5302 				/* See the comment in choose_furthest_deadline_processor_for_realtime_thread() */
5303 				include_ast_urgent_pending_cpus = true;
5304 				goto try_again;
5305 			}
5306 		}
5307 
5308 		processor = lc_processor;
5309 
5310 		if (processor) {
5311 			pset = change_locked_pset(pset, processor->processor_set);
5312 			/* Check that chosen processor is still usable */
5313 			cpumap_t available_map = pset_available_cpumap(pset);
5314 			if (bit_test(available_map, processor->cpu_id)) {
5315 				return processor;
5316 			}
5317 
5318 			/* processor is no longer usable */
5319 			processor = PROCESSOR_NULL;
5320 		}
5321 
5322 		pset_assert_locked(pset);
5323 		pset_unlock(pset);
5324 		return PROCESSOR_NULL;
5325 	}
5326 
5327 	/* No realtime threads from this point on */
5328 	assert(thread->sched_pri < BASEPRI_RTQUEUES);
5329 
5330 	do {
5331 		/*
5332 		 * Choose an idle processor, in pset traversal order
5333 		 */
5334 		uint64_t idle_primary_map = (pset->cpu_state_map[PROCESSOR_IDLE] & pset->primary_map & pset->recommended_bitmask);
5335 		uint64_t preferred_idle_primary_map = idle_primary_map & pset->perfcontrol_cpu_preferred_bitmask;
5336 
5337 		/* there shouldn't be a pending AST if the processor is idle */
5338 		assert((idle_primary_map & pset->pending_AST_URGENT_cpu_mask) == 0);
5339 
5340 		/*
5341 		 * Look at the preferred cores first.
5342 		 */
5343 		int cpuid = lsb_next(preferred_idle_primary_map, pset->cpu_preferred_last_chosen);
5344 		if (cpuid < 0) {
5345 			cpuid = lsb_first(preferred_idle_primary_map);
5346 		}
5347 		if (cpuid >= 0) {
5348 			processor = processor_array[cpuid];
5349 			pset->cpu_preferred_last_chosen = cpuid;
5350 			return processor;
5351 		}
5352 
5353 		/*
5354 		 * Look at the cores that don't need to be avoided next.
5355 		 */
5356 		if (pset->perfcontrol_cpu_migration_bitmask != 0) {
5357 			uint64_t non_avoided_idle_primary_map = idle_primary_map & ~pset->perfcontrol_cpu_migration_bitmask;
5358 			cpuid = lsb_next(non_avoided_idle_primary_map, pset->cpu_preferred_last_chosen);
5359 			if (cpuid < 0) {
5360 				cpuid = lsb_first(non_avoided_idle_primary_map);
5361 			}
5362 			if (cpuid >= 0) {
5363 				processor = processor_array[cpuid];
5364 				pset->cpu_preferred_last_chosen = cpuid;
5365 				return processor;
5366 			}
5367 		}
5368 
5369 		/*
5370 		 * Fall back to any remaining idle cores if none of the preferred ones and non-avoided ones are available.
5371 		 */
5372 		cpuid = lsb_first(idle_primary_map);
5373 		if (cpuid >= 0) {
5374 			processor = processor_array[cpuid];
5375 			return processor;
5376 		}
5377 
5378 		/*
5379 		 * Otherwise, enumerate active and idle processors to find primary candidates
5380 		 * with lower priority/etc.
5381 		 */
5382 
5383 		uint64_t active_map = ((pset->cpu_state_map[PROCESSOR_RUNNING] | pset->cpu_state_map[PROCESSOR_DISPATCHING]) &
5384 		    pset->recommended_bitmask &
5385 		    ~pset->pending_AST_URGENT_cpu_mask);
5386 
5387 		if (SCHED(priority_is_urgent)(thread->sched_pri) == FALSE) {
5388 			active_map &= ~pset->pending_AST_PREEMPT_cpu_mask;
5389 		}
5390 
5391 		active_map = bit_ror64(active_map, (pset->last_chosen + 1));
5392 		for (int rotid = lsb_first(active_map); rotid >= 0; rotid = lsb_next(active_map, rotid)) {
5393 			cpuid = ((rotid + pset->last_chosen + 1) & 63);
5394 			processor = processor_array[cpuid];
5395 
5396 			integer_t cpri = processor->current_pri;
5397 			processor_t primary = processor->processor_primary;
5398 			if (primary != processor) {
5399 				/* If primary is running a NO_SMT thread, don't choose its secondary */
5400 				if (!((primary->state == PROCESSOR_RUNNING) && processor_active_thread_no_smt(primary))) {
5401 					if (cpri < lowest_secondary_priority) {
5402 						lowest_secondary_priority = cpri;
5403 						lp_paired_secondary_processor = processor;
5404 					}
5405 				}
5406 			} else {
5407 				if (cpri < lowest_priority) {
5408 					lowest_priority = cpri;
5409 					lp_processor = processor;
5410 				}
5411 			}
5412 
5413 			integer_t ccount = SCHED(processor_runq_count)(processor);
5414 			if (ccount < lowest_count) {
5415 				lowest_count = ccount;
5416 				lc_processor = processor;
5417 			}
5418 		}
5419 
5420 		/*
5421 		 * For SMT configs, these idle secondary processors must have active primary. Otherwise
5422 		 * the idle primary would have short-circuited the loop above
5423 		 */
5424 		uint64_t idle_secondary_map = (pset->cpu_state_map[PROCESSOR_IDLE] &
5425 		    ~pset->primary_map &
5426 		    pset->recommended_bitmask);
5427 
5428 		/* there shouldn't be a pending AST if the processor is idle */
5429 		assert((idle_secondary_map & pset->pending_AST_URGENT_cpu_mask) == 0);
5430 		assert((idle_secondary_map & pset->pending_AST_PREEMPT_cpu_mask) == 0);
5431 
5432 		for (cpuid = lsb_first(idle_secondary_map); cpuid >= 0; cpuid = lsb_next(idle_secondary_map, cpuid)) {
5433 			processor = processor_array[cpuid];
5434 
5435 			processor_t cprimary = processor->processor_primary;
5436 
5437 			integer_t primary_pri = cprimary->current_pri;
5438 
5439 			/*
5440 			 * TODO: This should also make the same decisions
5441 			 * as secondary_can_run_realtime_thread
5442 			 *
5443 			 * TODO: Keep track of the pending preemption priority
5444 			 * of the primary to make this more accurate.
5445 			 */
5446 
5447 			/* If the primary is running a no-smt thread, then don't choose its secondary */
5448 			if (cprimary->state == PROCESSOR_RUNNING &&
5449 			    processor_active_thread_no_smt(cprimary)) {
5450 				continue;
5451 			}
5452 
5453 			/*
5454 			 * Find the idle secondary processor with the lowest priority primary
5455 			 *
5456 			 * We will choose this processor as a fallback if we find no better
5457 			 * primary to preempt.
5458 			 */
5459 			if (primary_pri < lowest_idle_secondary_priority) {
5460 				lp_idle_secondary_processor = processor;
5461 				lowest_idle_secondary_priority = primary_pri;
5462 			}
5463 
5464 			/* Find the lowest priority active primary with idle secondary */
5465 			if (primary_pri < lowest_unpaired_primary_priority) {
5466 				/* If the primary processor is offline or starting up, it's not a candidate for this path */
5467 				if (cprimary->state != PROCESSOR_RUNNING &&
5468 				    cprimary->state != PROCESSOR_DISPATCHING) {
5469 					continue;
5470 				}
5471 
5472 				if (!cprimary->is_recommended) {
5473 					continue;
5474 				}
5475 
5476 				/* if the primary is pending preemption, don't try to re-preempt it */
5477 				if (bit_test(pset->pending_AST_URGENT_cpu_mask, cprimary->cpu_id)) {
5478 					continue;
5479 				}
5480 
5481 				if (SCHED(priority_is_urgent)(thread->sched_pri) == FALSE &&
5482 				    bit_test(pset->pending_AST_PREEMPT_cpu_mask, cprimary->cpu_id)) {
5483 					continue;
5484 				}
5485 
5486 				lowest_unpaired_primary_priority = primary_pri;
5487 				lp_unpaired_primary_processor = cprimary;
5488 			}
5489 		}
5490 
5491 		/*
5492 		 * We prefer preempting a primary processor over waking up its secondary.
5493 		 * The secondary will then be woken up by the preempted thread.
5494 		 */
5495 		if (thread->sched_pri > lowest_unpaired_primary_priority) {
5496 			pset->last_chosen = lp_unpaired_primary_processor->cpu_id;
5497 			return lp_unpaired_primary_processor;
5498 		}
5499 
5500 		/*
5501 		 * We prefer preempting a lower priority active processor over directly
5502 		 * waking up an idle secondary.
5503 		 * The preempted thread will then find the idle secondary.
5504 		 */
5505 		if (thread->sched_pri > lowest_priority) {
5506 			pset->last_chosen = lp_processor->cpu_id;
5507 			return lp_processor;
5508 		}
5509 
5510 		/*
5511 		 * lc_processor is used to indicate the best processor set run queue
5512 		 * on which to enqueue a thread when all available CPUs are busy with
5513 		 * higher priority threads, so try to make sure it is initialized.
5514 		 */
5515 		if (lc_processor == PROCESSOR_NULL) {
5516 			cpumap_t available_map = pset_available_cpumap(pset);
5517 			cpuid = lsb_first(available_map);
5518 			if (cpuid >= 0) {
5519 				lc_processor = processor_array[cpuid];
5520 				lowest_count = SCHED(processor_runq_count)(lc_processor);
5521 			}
5522 		}
5523 
5524 		/*
5525 		 * Move onto the next processor set.
5526 		 *
5527 		 * If all primary processors in this pset are running a higher
5528 		 * priority thread, move on to next pset. Only when we have
5529 		 * exhausted the search for primary processors do we
5530 		 * fall back to secondaries.
5531 		 */
5532 #if CONFIG_SCHED_EDGE
5533 		/*
5534 		 * The edge scheduler expects a CPU to be selected from the pset it passed in
5535 		 * as the starting pset for non-RT workloads. The edge migration algorithm
5536 		 * should already have considered idle CPUs and loads to decide the starting_pset;
5537 		 * which means that this loop can be short-circuted.
5538 		 * which means that this loop can be short-circuited.
5539 		nset = starting_pset;
5540 #else /* CONFIG_SCHED_EDGE */
5541 		nset = next_pset(pset);
5542 #endif /* CONFIG_SCHED_EDGE */
5543 
5544 		if (nset != starting_pset) {
5545 			pset = change_locked_pset(pset, nset);
5546 		}
5547 	} while (nset != starting_pset);
5548 
5549 	/*
5550 	 * Make sure that we pick a running processor,
5551 	 * and that the correct processor set is locked.
5552 	 * Since we may have unlocked the candidate processor's
5553 	 * pset, it may have changed state.
5554 	 *
5555 	 * All primary processors are running a higher priority
5556 	 * thread, so the only options left are enqueuing on
5557 	 * the secondary processor that would perturb the least priority
5558 	 * primary, or the least busy primary.
5559 	 */
5560 
5561 	/* lowest_priority is evaluated in the main loops above */
5562 	if (lp_idle_secondary_processor != PROCESSOR_NULL) {
5563 		processor = lp_idle_secondary_processor;
5564 	} else if (lp_paired_secondary_processor != PROCESSOR_NULL) {
5565 		processor = lp_paired_secondary_processor;
5566 	} else if (lc_processor != PROCESSOR_NULL) {
5567 		processor = lc_processor;
5568 	} else {
5569 		processor = PROCESSOR_NULL;
5570 	}
5571 
5572 	if (processor) {
5573 		pset = change_locked_pset(pset, processor->processor_set);
5574 		/* Check that chosen processor is still usable */
5575 		cpumap_t available_map = pset_available_cpumap(pset);
5576 		if (bit_test(available_map, processor->cpu_id)) {
5577 			pset->last_chosen = processor->cpu_id;
5578 			return processor;
5579 		}
5580 
5581 		/* processor is no longer usable */
5582 		processor = PROCESSOR_NULL;
5583 	}
5584 
5585 	pset_assert_locked(pset);
5586 	pset_unlock(pset);
5587 	return PROCESSOR_NULL;
5588 }
5589 
5590 /*
5591  * Default implementation of SCHED(choose_node)()
5592  * for single node systems
5593  */
5594 pset_node_t
sched_choose_node(__unused thread_t thread)5595 sched_choose_node(__unused thread_t thread)
5596 {
5597 	return &pset_node0;
5598 }
5599 
5600 /*
5601  *	choose_starting_pset:
5602  *
5603  *	Choose a starting processor set for the thread.
5604  *	May return a processor hint within the pset.
5605  *
5606  *	Returns a starting processor set, to be used by
5607  *      choose_processor.
5608  *
5609  *	The thread must be locked.  The resulting pset is unlocked on return,
5610  *      and is chosen without taking any pset locks.
5611  */
5612 processor_set_t
choose_starting_pset(pset_node_t node,thread_t thread,processor_t * processor_hint)5613 choose_starting_pset(pset_node_t node, thread_t thread, processor_t *processor_hint)
5614 {
5615 	processor_set_t pset;
5616 	processor_t processor = PROCESSOR_NULL;
5617 
5618 	if (thread->affinity_set != AFFINITY_SET_NULL) {
5619 		/*
5620 		 * Use affinity set policy hint.
5621 		 */
5622 		pset = thread->affinity_set->aset_pset;
5623 	} else if (thread->last_processor != PROCESSOR_NULL) {
5624 		/*
5625 		 *	Simple (last processor) affinity case.
5626 		 */
5627 		processor = thread->last_processor;
5628 		pset = processor->processor_set;
5629 	} else {
5630 		/*
5631 		 *	No Affinity case:
5632 		 *
5633 		 *	Utilitize a per task hint to spread threads
5634 		 *	among the available processor sets.
5635 		 * NRG this seems like the wrong thing to do.
5636 		 * See also task->pset_hint = pset in thread_setrun()
5637 		 */
5638 		pset = get_threadtask(thread)->pset_hint;
5639 		if (pset == PROCESSOR_SET_NULL) {
5640 			pset = current_processor()->processor_set;
5641 		}
5642 
5643 		pset = choose_next_pset(pset);
5644 	}
5645 
5646 	if (!bit_test(node->pset_map, pset->pset_id)) {
5647 		/* pset is not from this node so choose one that is */
5648 		int id = lsb_first(node->pset_map);
5649 		if (id < 0) {
5650 			/* startup race, so check again under the node lock */
5651 			lck_spin_lock(&pset_node_lock);
5652 			if (bit_test(node->pset_map, pset->pset_id)) {
5653 				id = pset->pset_id;
5654 			} else {
5655 				id = lsb_first(node->pset_map);
5656 			}
5657 			lck_spin_unlock(&pset_node_lock);
5658 		}
5659 		assert(id >= 0);
5660 		pset = pset_array[id];
5661 	}
5662 
5663 	if (bit_count(node->pset_map) == 1) {
5664 		/* Only a single pset in this node */
5665 		goto out;
5666 	}
5667 
5668 	bool avoid_cpu0 = false;
5669 
5670 #if defined(__x86_64__)
5671 	if ((thread->sched_pri >= BASEPRI_RTQUEUES) && sched_avoid_cpu0) {
5672 		/* Avoid the pset containing cpu0 */
5673 		avoid_cpu0 = true;
5674 		/* Assert that cpu0 is in pset0.  I expect this to be true on __x86_64__ */
5675 		assert(bit_test(pset_array[0]->cpu_bitmask, 0));
5676 	}
5677 #endif
5678 
5679 	if (thread->sched_pri >= BASEPRI_RTQUEUES) {
5680 		pset_map_t rt_target_map = atomic_load(&node->pset_non_rt_primary_map);
5681 		if ((avoid_cpu0 && pset->pset_id == 0) || !bit_test(rt_target_map, pset->pset_id)) {
5682 			if (avoid_cpu0) {
5683 				rt_target_map = bit_ror64(rt_target_map, 1);
5684 			}
5685 			int rotid = lsb_first(rt_target_map);
5686 			if (rotid >= 0) {
5687 				int id = avoid_cpu0 ? ((rotid + 1) & 63) : rotid;
5688 				pset = pset_array[id];
5689 				goto out;
5690 			}
5691 		}
5692 		if (!pset->is_SMT || !sched_allow_rt_smt) {
5693 			/* All psets are full of RT threads - fall back to choose processor to find the furthest deadline RT thread */
5694 			goto out;
5695 		}
5696 		rt_target_map = atomic_load(&node->pset_non_rt_map);
5697 		if ((avoid_cpu0 && pset->pset_id == 0) || !bit_test(rt_target_map, pset->pset_id)) {
5698 			if (avoid_cpu0) {
5699 				rt_target_map = bit_ror64(rt_target_map, 1);
5700 			}
5701 			int rotid = lsb_first(rt_target_map);
5702 			if (rotid >= 0) {
5703 				int id = avoid_cpu0 ? ((rotid + 1) & 63) : rotid;
5704 				pset = pset_array[id];
5705 				goto out;
5706 			}
5707 		}
5708 		/* All psets are full of RT threads - fall back to choose processor to find the furthest deadline RT thread */
5709 	} else {
5710 		pset_map_t idle_map = atomic_load(&node->pset_idle_map);
5711 		if (!bit_test(idle_map, pset->pset_id)) {
5712 			int next_idle_pset_id = lsb_first(idle_map);
5713 			if (next_idle_pset_id >= 0) {
5714 				pset = pset_array[next_idle_pset_id];
5715 			}
5716 		}
5717 	}
5718 
5719 out:
5720 	if ((processor != PROCESSOR_NULL) && (processor->processor_set != pset)) {
5721 		processor = PROCESSOR_NULL;
5722 	}
5723 	if (processor != PROCESSOR_NULL) {
5724 		*processor_hint = processor;
5725 	}
5726 
5727 	assert(pset != NULL);
5728 	return pset;
5729 }
5730 
5731 /*
5732  *	thread_setrun:
5733  *
5734  *	Dispatch thread for execution, onto an idle
5735  *	processor or run queue, and signal a preemption
5736  *	as appropriate.
5737  *
5738  *	Thread must be locked.
5739  */
5740 void
thread_setrun(thread_t thread,sched_options_t options)5741 thread_setrun(
5742 	thread_t                        thread,
5743 	sched_options_t                 options)
5744 {
5745 	processor_t                     processor = PROCESSOR_NULL;
5746 	processor_set_t         pset;
5747 
5748 	assert((thread->state & (TH_RUN | TH_WAIT | TH_UNINT | TH_TERMINATE | TH_TERMINATE2)) == TH_RUN);
5749 	thread_assert_runq_null(thread);
5750 
5751 #if CONFIG_PREADOPT_TG
5752 	/* We know that the thread is not in the runq by virtue of being in this
5753 	 * function and the thread is not self since we are running. We can safely
5754 	 * resolve the thread group hierarchy and modify the thread's thread group
5755 	 * here. */
5756 	thread_resolve_and_enforce_thread_group_hierarchy_if_needed(thread);
5757 #endif
5758 
5759 	/*
5760 	 *	Update priority if needed.
5761 	 */
5762 	if (SCHED(can_update_priority)(thread)) {
5763 		SCHED(update_priority)(thread);
5764 	}
5765 	thread->sfi_class = sfi_thread_classify(thread);
5766 
5767 	if (thread->bound_processor == PROCESSOR_NULL) {
5768 		/*
5769 		 * Unbound case.
5770 		 *
5771 		 * Usually, this loop will only be executed once,
5772 		 * but if CLPC derecommends a processor after it has been chosen,
5773 		 * or if a processor is shut down after it is chosen,
5774 		 * choose_processor() may return NULL, so a retry
5775 		 * may be necessary.  A single retry will usually
5776 		 * be enough, and we can't afford to retry too many times
5777 		 * because interrupts are disabled.
5778 		 */
5779 #define CHOOSE_PROCESSOR_MAX_RETRIES 3
5780 		for (int retry = 0; retry <= CHOOSE_PROCESSOR_MAX_RETRIES; retry++) {
5781 			processor_t processor_hint = PROCESSOR_NULL;
5782 			pset_node_t node = SCHED(choose_node)(thread);
5783 			processor_set_t starting_pset = choose_starting_pset(node, thread, &processor_hint);
5784 
5785 			pset_lock(starting_pset);
5786 
5787 			processor = SCHED(choose_processor)(starting_pset, processor_hint, thread);
5788 			if (processor != PROCESSOR_NULL) {
5789 				pset = processor->processor_set;
5790 				pset_assert_locked(pset);
5791 				break;
5792 			}
5793 		}
5794 		/*
5795 		 * If choose_processor() still returns NULL,
5796 		 * which is very unlikely,
5797 		 * choose the master_processor, which is always
5798 		 * safe to choose.
5799 		 */
5800 		if (processor == PROCESSOR_NULL) {
5801 			/* Choose fallback processor */
5802 			processor = master_processor;
5803 			pset = processor->processor_set;
5804 			pset_lock(pset);
5805 			assert((pset_available_cpu_count(pset) > 0) || (processor->state != PROCESSOR_OFF_LINE && processor->is_recommended));
5806 		}
5807 		task_t task = get_threadtask(thread);
5808 		if (!(task->t_flags & TF_USE_PSET_HINT_CLUSTER_TYPE)) {
5809 			task->pset_hint = pset; /* NRG this is done without holding the task lock */
5810 		}
5811 		SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT_IST(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR) | DBG_FUNC_NONE,
5812 		    (uintptr_t)thread_tid(thread), (uintptr_t)-1, processor->cpu_id, processor->state, 0);
5813 		assert((pset_available_cpu_count(pset) > 0) || (processor->state != PROCESSOR_OFF_LINE && processor->is_recommended));
5814 	} else {
5815 		/*
5816 		 *	Bound case:
5817 		 *
5818 		 *	Unconditionally dispatch on the processor.
5819 		 */
5820 		processor = thread->bound_processor;
5821 		pset = processor->processor_set;
5822 		pset_lock(pset);
5823 
5824 		SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT_IST(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR) | DBG_FUNC_NONE,
5825 		    (uintptr_t)thread_tid(thread), (uintptr_t)-2, processor->cpu_id, processor->state, 0);
5826 	}
5827 
5828 	/*
5829 	 *	Dispatch the thread on the chosen processor.
5830 	 *	TODO: This should be based on sched_mode, not sched_pri
5831 	 */
5832 	if (thread->sched_pri >= BASEPRI_RTQUEUES) {
5833 		realtime_setrun(processor, thread);
5834 	} else {
5835 		processor_setrun(processor, thread, options);
5836 	}
5837 	/* pset is now unlocked */
5838 	if (thread->bound_processor == PROCESSOR_NULL) {
5839 		SCHED(check_spill)(pset, thread);
5840 	}
5841 }
5842 
5843 processor_set_t
task_choose_pset(task_t task)5844 task_choose_pset(
5845 	task_t          task)
5846 {
5847 	processor_set_t         pset = task->pset_hint;
5848 
5849 	if (pset != PROCESSOR_SET_NULL) {
5850 		pset = choose_next_pset(pset);
5851 	}
5852 
5853 	return pset;
5854 }
5855 
5856 /*
5857  *	Check for a preemption point in
5858  *	the current context.
5859  *
5860  *	Called at splsched with thread locked.
5861  */
5862 ast_t
csw_check(thread_t thread,processor_t processor,ast_t check_reason)5863 csw_check(
5864 	thread_t                thread,
5865 	processor_t             processor,
5866 	ast_t                   check_reason)
5867 {
5868 	processor_set_t pset = processor->processor_set;
5869 
5870 	assert(thread == processor->active_thread);
5871 
5872 	pset_lock(pset);
5873 
5874 	processor_state_update_from_thread(processor, thread, true);
5875 
5876 	ast_t preempt = csw_check_locked(thread, processor, pset, check_reason);
5877 
5878 	/* Acknowledge the IPI if we decided not to preempt */
5879 
5880 	if ((preempt & AST_URGENT) == 0) {
5881 		if (bit_clear_if_set(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
5882 			KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_END, processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, 0, 8);
5883 		}
5884 	}
5885 
5886 	if ((preempt & AST_PREEMPT) == 0) {
5887 		bit_clear(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
5888 	}
5889 
5890 	pset_unlock(pset);
5891 
5892 	return update_pending_nonurgent_preemption(processor, preempt);
5893 }
5894 
5895 void
clear_pending_nonurgent_preemption(processor_t processor)5896 clear_pending_nonurgent_preemption(processor_t processor)
5897 {
5898 	if (!processor->pending_nonurgent_preemption) {
5899 		return;
5900 	}
5901 
5902 	KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_PREEMPT_TIMER_ACTIVE) | DBG_FUNC_END);
5903 
5904 	processor->pending_nonurgent_preemption = false;
5905 	running_timer_clear(processor, RUNNING_TIMER_PREEMPT);
5906 }
5907 
5908 ast_t
update_pending_nonurgent_preemption(processor_t processor,ast_t reason)5909 update_pending_nonurgent_preemption(processor_t processor, ast_t reason)
5910 {
5911 	if ((reason & (AST_URGENT | AST_PREEMPT)) != (AST_PREEMPT)) {
5912 		clear_pending_nonurgent_preemption(processor);
5913 		return reason;
5914 	}
5915 
5916 	if (nonurgent_preemption_timer_abs == 0) {
5917 		/* Preemption timer not enabled */
5918 		return reason;
5919 	}
5920 
5921 	if (current_thread()->state & TH_IDLE) {
5922 		/* idle threads don't need nonurgent preemption */
5923 		return reason;
5924 	}
5925 
5926 	if (processor->pending_nonurgent_preemption) {
5927 		/* Timer is already armed, no need to do it again */
5928 		return reason;
5929 	}
5930 
5931 	if (ml_did_interrupt_userspace()) {
5932 		/*
5933 		 * We're preempting userspace here, so we don't need
5934 		 * to defer the preemption.  Force AST_URGENT
5935 		 * so that we can avoid arming this timer without risking
5936 		 * ast_taken_user deciding to spend too long in kernel
5937 		 * space to handle other ASTs.
5938 		 */
5939 
5940 		return reason | AST_URGENT;
5941 	}
5942 
5943 	/*
5944 	 * We've decided to do a nonurgent preemption when running in
5945 	 * kernelspace. We defer the preemption until reaching userspace boundary
5946 	 * to give a grace period for locks etc to be dropped and to reach
5947 	 * a clean preemption point, so that the preempting thread doesn't
5948 	 * always immediately hit the lock that the waking thread still holds.
5949 	 *
5950 	 * Arm a timer to enforce that the preemption executes within a bounded
5951 	 * time if the thread doesn't block or return to userspace quickly.
5952 	 */
5953 
5954 	processor->pending_nonurgent_preemption = true;
5955 	KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_PREEMPT_TIMER_ACTIVE) | DBG_FUNC_START,
5956 	    reason);
5957 
5958 	uint64_t now = mach_absolute_time();
5959 
5960 	uint64_t deadline = now + nonurgent_preemption_timer_abs;
5961 
5962 	running_timer_enter(processor, RUNNING_TIMER_PREEMPT, NULL,
5963 	    now, deadline);
5964 
5965 	return reason;
5966 }
5967 
5968 /*
5969  * Check for preemption at splsched with
5970  * pset and thread locked
5971  */
5972 ast_t
csw_check_locked(thread_t thread,processor_t processor,processor_set_t pset,ast_t check_reason)5973 csw_check_locked(
5974 	thread_t                thread,
5975 	processor_t             processor,
5976 	processor_set_t         pset,
5977 	ast_t                   check_reason)
5978 {
5979 	/*
5980 	 * If the current thread is running on a processor that is no longer recommended,
5981 	 * urgently preempt it, at which point thread_select() should
5982 	 * try to idle the processor and re-dispatch the thread to a recommended processor.
5983 	 */
5984 	if (!processor->is_recommended) {
5985 		return check_reason | AST_PREEMPT | AST_URGENT;
5986 	}
5987 
5988 	if (bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) {
5989 		return check_reason | AST_PREEMPT | AST_URGENT;
5990 	}
5991 
5992 	if (rt_runq_count(pset) > 0) {
5993 		if ((rt_runq_priority(pset) > processor->current_pri) || !processor->first_timeslice) {
5994 			return check_reason | AST_PREEMPT | AST_URGENT;
5995 		} else if (deadline_add(rt_runq_earliest_deadline(pset), rt_deadline_epsilon) < processor->deadline) {
5996 			return check_reason | AST_PREEMPT | AST_URGENT;
5997 		} else {
5998 			return check_reason | AST_PREEMPT;
5999 		}
6000 	}
6001 
6002 	ast_t result = SCHED(processor_csw_check)(processor);
6003 	if (result != AST_NONE) {
6004 		return check_reason | result | (thread_is_eager_preempt(thread) ? AST_URGENT : AST_NONE);
6005 	}
6006 
6007 	/*
6008 	 * Same for avoid-processor
6009 	 *
6010 	 * TODO: Should these set AST_REBALANCE?
6011 	 */
6012 	if (SCHED(avoid_processor_enabled) && SCHED(thread_avoid_processor)(processor, thread, check_reason)) {
6013 		return check_reason | AST_PREEMPT;
6014 	}
6015 
6016 	/*
6017 	 * Even though we could continue executing on this processor, a
6018 	 * secondary SMT core should try to shed load to another primary core.
6019 	 *
6020 	 * TODO: Should this do the same check that thread_select does? i.e.
6021 	 * if no bound threads target this processor, and idle primaries exist, preempt
6022 	 * The case of RT threads existing is already taken care of above
6023 	 */
6024 
6025 	if (processor->current_pri < BASEPRI_RTQUEUES &&
6026 	    processor->processor_primary != processor) {
6027 		return check_reason | AST_PREEMPT;
6028 	}
6029 
6030 	if (thread->state & TH_SUSP) {
6031 		return check_reason | AST_PREEMPT;
6032 	}
6033 
6034 #if CONFIG_SCHED_SFI
6035 	/*
6036 	 * Current thread may not need to be preempted, but maybe needs
6037 	 * an SFI wait?
6038 	 */
6039 	result = sfi_thread_needs_ast(thread, NULL);
6040 	if (result != AST_NONE) {
6041 		return result;
6042 	}
6043 #endif
6044 
6045 	return AST_NONE;
6046 }
6047 
6048 /*
6049  * Handle preemption IPI or IPI in response to setting an AST flag
6050  * Triggered by cause_ast_check
6051  * Called at splsched
6052  */
6053 void
ast_check(processor_t processor)6054 ast_check(processor_t processor)
6055 {
6056 	smr_ack_ipi();
6057 
6058 	if (processor->state != PROCESSOR_RUNNING &&
6059 	    processor->state != PROCESSOR_SHUTDOWN) {
6060 		return;
6061 	}
6062 
6063 	SCHED_DEBUG_AST_CHECK_KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_SCHED,
6064 	    MACH_SCHED_AST_CHECK) | DBG_FUNC_START);
6065 
6066 	thread_t thread = processor->active_thread;
6067 
6068 	assert(thread == current_thread());
6069 
6070 	/*
6071 	 * Pairs with task_restartable_ranges_synchronize
6072 	 */
6073 	thread_lock(thread);
6074 
6075 	thread_reset_pcs_ack_IPI(thread);
6076 
6077 	/*
6078 	 * Propagate thread ast to processor.
6079 	 * (handles IPI in response to setting AST flag)
6080 	 */
6081 	ast_propagate(thread);
6082 
6083 	/*
6084 	 * Stash the old urgency and perfctl values to find out if
6085 	 * csw_check updates them.
6086 	 */
6087 	thread_urgency_t old_urgency = processor->current_urgency;
6088 	perfcontrol_class_t old_perfctl_class = processor->current_perfctl_class;
6089 
6090 	ast_t preempt;
6091 
6092 	if ((preempt = csw_check(thread, processor, AST_NONE)) != AST_NONE) {
6093 		ast_on(preempt);
6094 	}
6095 
6096 	if (old_urgency != processor->current_urgency) {
6097 		/*
6098 		 * Urgency updates happen with the thread lock held (ugh).
6099 		 * TODO: This doesn't notice QoS changes...
6100 		 */
6101 		uint64_t urgency_param1, urgency_param2;
6102 
6103 		thread_urgency_t urgency = thread_get_urgency(thread, &urgency_param1, &urgency_param2);
6104 		thread_tell_urgency(urgency, urgency_param1, urgency_param2, 0, thread);
6105 	}
6106 
6107 	thread_unlock(thread);
6108 
6109 	if (old_perfctl_class != processor->current_perfctl_class) {
6110 		/*
6111 		 * We updated the perfctl class of this thread from another core.
6112 		 * Let CLPC know that the currently running thread has a new
6113 		 * class.
6114 		 */
6115 
6116 		machine_switch_perfcontrol_state_update(PERFCONTROL_ATTR_UPDATE,
6117 		    mach_approximate_time(), 0, thread);
6118 	}
6119 
6120 	SCHED_DEBUG_AST_CHECK_KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_SCHED,
6121 	    MACH_SCHED_AST_CHECK) | DBG_FUNC_END, preempt);
6122 }
6123 
6124 
6125 void
thread_preempt_expire(timer_call_param_t p0,__unused timer_call_param_t p1)6126 thread_preempt_expire(
6127 	timer_call_param_t      p0,
6128 	__unused timer_call_param_t      p1)
6129 {
6130 	processor_t processor = p0;
6131 
6132 	assert(processor == current_processor());
6133 	assert(p1 == NULL);
6134 
6135 	thread_t thread = current_thread();
6136 
6137 	/*
6138 	 * This is set and cleared by the current core, so we will
6139 	 * never see a race with running timer expiration
6140 	 */
6141 	assert(processor->pending_nonurgent_preemption);
6142 
6143 	clear_pending_nonurgent_preemption(processor);
6144 
6145 	thread_lock(thread);
6146 
6147 	/*
6148 	 * Check again to see if it's still worth a
6149 	 * context switch, but this time force enable kernel preemption
6150 	 */
6151 
6152 	ast_t preempt = csw_check(thread, processor, AST_URGENT);
6153 
6154 	if (preempt) {
6155 		ast_on(preempt);
6156 	}
6157 
6158 	thread_unlock(thread);
6159 
6160 	KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_PREEMPT_TIMER_ACTIVE), preempt);
6161 }
6162 
6163 
6164 /*
6165  *	set_sched_pri:
6166  *
6167  *	Set the scheduled priority of the specified thread.
6168  *
6169  *	This may cause the thread to change queues.
6170  *
6171  *	Thread must be locked.
6172  */
6173 void
set_sched_pri(thread_t thread,int16_t new_priority,set_sched_pri_options_t options)6174 set_sched_pri(
6175 	thread_t        thread,
6176 	int16_t         new_priority,
6177 	set_sched_pri_options_t options)
6178 {
6179 	bool is_current_thread = (thread == current_thread());
6180 	bool removed_from_runq = false;
6181 	bool lazy_update = ((options & SETPRI_LAZY) == SETPRI_LAZY);
6182 
6183 	int16_t old_priority = thread->sched_pri;
6184 
6185 	/* If we're already at this priority, no need to mess with the runqueue */
6186 	if (new_priority == old_priority) {
6187 #if CONFIG_SCHED_CLUTCH
6188 		/* For the first thread in the system, the priority is correct but
6189 		 * th_sched_bucket is still TH_BUCKET_RUN. Since the clutch
6190 		 * scheduler relies on the bucket being set for all threads, update
6191 		 * its bucket here.
6192 		 */
6193 		if (thread->th_sched_bucket == TH_BUCKET_RUN) {
6194 			assert(thread == vm_pageout_scan_thread);
6195 			SCHED(update_thread_bucket)(thread);
6196 		}
6197 #endif /* CONFIG_SCHED_CLUTCH */
6198 
6199 		return;
6200 	}
6201 
6202 	if (is_current_thread) {
6203 		assert(thread->state & TH_RUN);
6204 		thread_assert_runq_null(thread);
6205 	} else {
6206 		removed_from_runq = thread_run_queue_remove(thread);
6207 	}
6208 
6209 	thread->sched_pri = new_priority;
6210 
6211 #if CONFIG_SCHED_CLUTCH
6212 	/*
6213 	 * Since for the clutch scheduler, the thread's bucket determines its runq
6214 	 * in the hierarchy it is important to update the bucket when the thread
6215 	 * lock is held and the thread has been removed from the runq hierarchy.
6216 	 */
6217 	SCHED(update_thread_bucket)(thread);
6218 
6219 #endif /* CONFIG_SCHED_CLUTCH */
6220 
6221 	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHANGE_PRIORITY),
6222 	    (uintptr_t)thread_tid(thread),
6223 	    thread->base_pri,
6224 	    thread->sched_pri,
6225 	    thread->sched_usage,
6226 	    0);
6227 
6228 	if (removed_from_runq) {
6229 		thread_run_queue_reinsert(thread, SCHED_PREEMPT | SCHED_TAILQ);
6230 	} else if (is_current_thread) {
6231 		processor_t processor = thread->last_processor;
6232 		assert(processor == current_processor());
6233 
6234 		thread_urgency_t old_urgency = processor->current_urgency;
6235 
6236 		/*
6237 		 * When dropping in priority, check if the thread no longer belongs on core.
6238 		 * If a thread raises its own priority, don't aggressively rebalance it.
6239 		 * <rdar://problem/31699165>
6240 		 *
6241 		 * csw_check does a processor_state_update_from_thread, but
6242 		 * we should do our own if we're being lazy.
6243 		 */
6244 		if (!lazy_update && new_priority < old_priority) {
6245 			ast_t preempt;
6246 
6247 			if ((preempt = csw_check(thread, processor, AST_NONE)) != AST_NONE) {
6248 				ast_on(preempt);
6249 			}
6250 		} else {
6251 			processor_state_update_from_thread(processor, thread, false);
6252 		}
6253 
6254 		/*
6255 		 * set_sched_pri doesn't alter RT params. We expect direct base priority/QoS
6256 		 * class alterations from user space to occur relatively infrequently, hence
6257 		 * those are lazily handled. QoS classes have distinct priority bands, and QoS
6258 		 * inheritance is expected to involve priority changes.
6259 		 */
6260 		if (processor->current_urgency != old_urgency) {
6261 			uint64_t urgency_param1, urgency_param2;
6262 
6263 			thread_urgency_t new_urgency = thread_get_urgency(thread,
6264 			    &urgency_param1, &urgency_param2);
6265 
6266 			thread_tell_urgency(new_urgency, urgency_param1,
6267 			    urgency_param2, 0, thread);
6268 		}
6269 
6270 		/* TODO: only call this if current_perfctl_class changed */
6271 		uint64_t ctime = mach_approximate_time();
6272 		machine_thread_going_on_core(thread, processor->current_urgency, 0, 0, ctime);
6273 	} else if (thread->state & TH_RUN) {
6274 		processor_t processor = thread->last_processor;
6275 
6276 		if (!lazy_update &&
6277 		    processor != PROCESSOR_NULL &&
6278 		    processor != current_processor() &&
6279 		    processor->active_thread == thread) {
6280 			cause_ast_check(processor);
6281 		}
6282 	}
6283 }
6284 
6285 /*
6286  * thread_run_queue_remove_for_handoff
6287  *
6288  * Pull a thread or its (recursive) push target out of the runqueue
6289  * so that it is ready for thread_run()
6290  *
6291  * Called at splsched
6292  *
6293  * Returns the thread that was pulled or THREAD_NULL if no thread could be pulled.
6294  * This may be different than the thread that was passed in.
6295  */
6296 thread_t
thread_run_queue_remove_for_handoff(thread_t thread)6297 thread_run_queue_remove_for_handoff(thread_t thread)
6298 {
6299 	thread_t pulled_thread = THREAD_NULL;
6300 
6301 	thread_lock(thread);
6302 
6303 	/*
6304 	 * Check that the thread is not bound to a different processor,
6305 	 * NO_SMT flag is not set on the thread, cluster type of
6306 	 * processor matches with thread if the thread is pinned to a
6307 	 * particular cluster and that realtime is not involved.
6308 	 *
6309 	 * Next, pull it off its run queue.  If it doesn't come, it's not eligible.
6310 	 */
6311 	processor_t processor = current_processor();
6312 	if ((thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == processor)
6313 	    && (!thread_no_smt(thread))
6314 	    && (processor->current_pri < BASEPRI_RTQUEUES)
6315 	    && (thread->sched_pri < BASEPRI_RTQUEUES)
6316 #if __AMP__
6317 	    && ((thread->th_bound_cluster_id == THREAD_BOUND_CLUSTER_NONE) ||
6318 	    processor->processor_set->pset_id == thread->th_bound_cluster_id)
6319 #endif /* __AMP__ */
6320 	    ) {
6321 		if (thread_run_queue_remove(thread)) {
6322 			pulled_thread = thread;
6323 		}
6324 	}
6325 
6326 	thread_unlock(thread);
6327 
6328 	return pulled_thread;
6329 }
6330 
/*
 * thread_prepare_for_handoff
 *
 * Make the thread ready for handoff.
 * If the thread was runnable then pull it off the runq; if the thread could
 * not be pulled, return THREAD_NULL.
 *
 * If the thread was woken up from wait for handoff, make sure it is not bound
 * to a different processor.
 *
 * Called at splsched
 *
 * Returns the thread that was pulled or THREAD_NULL if no thread could be pulled.
 * This may be different than the thread that was passed in.
 */
6346 thread_t
thread_prepare_for_handoff(thread_t thread,thread_handoff_option_t option)6347 thread_prepare_for_handoff(thread_t thread, thread_handoff_option_t option)
6348 {
6349 	thread_t pulled_thread = THREAD_NULL;
6350 
6351 	if (option & THREAD_HANDOFF_SETRUN_NEEDED) {
6352 		processor_t processor = current_processor();
6353 		thread_lock(thread);
6354 
6355 		/*
6356 		 * Check that the thread is not bound to a different processor,
6357 		 * NO_SMT flag is not set on the thread and cluster type of
6358 		 * processor matches with thread if the thread is pinned to a
6359 		 * particular cluster. Call setrun instead if above conditions
6360 		 * are not satisfied.
6361 		 */
6362 		if ((thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == processor)
6363 		    && (!thread_no_smt(thread))
6364 #if __AMP__
6365 		    && ((thread->th_bound_cluster_id == THREAD_BOUND_CLUSTER_NONE) ||
6366 		    processor->processor_set->pset_id == thread->th_bound_cluster_id)
6367 #endif /* __AMP__ */
6368 		    ) {
6369 			pulled_thread = thread;
6370 		} else {
6371 			thread_setrun(thread, SCHED_PREEMPT | SCHED_TAILQ);
6372 		}
6373 		thread_unlock(thread);
6374 	} else {
6375 		pulled_thread = thread_run_queue_remove_for_handoff(thread);
6376 	}
6377 
6378 	return pulled_thread;
6379 }
6380 
6381 /*
6382  *	thread_run_queue_remove:
6383  *
6384  *	Remove a thread from its current run queue and
6385  *	return TRUE if successful.
6386  *
6387  *	Thread must be locked.
6388  *
6389  *	If thread->runq is PROCESSOR_NULL, the thread will not re-enter the
6390  *	run queues because the caller locked the thread.  Otherwise
6391  *	the thread is on a run queue, but could be chosen for dispatch
6392  *	and removed by another processor under a different lock, which
6393  *	will set thread->runq to PROCESSOR_NULL.
6394  *
6395  *	Hence the thread select path must not rely on anything that could
6396  *	be changed under the thread lock after calling this function,
6397  *	most importantly thread->sched_pri.
6398  */
6399 boolean_t
thread_run_queue_remove(thread_t thread)6400 thread_run_queue_remove(
6401 	thread_t        thread)
6402 {
6403 	boolean_t removed = FALSE;
6404 
6405 	if ((thread->state & (TH_RUN | TH_WAIT)) == TH_WAIT) {
6406 		/* Thread isn't runnable */
6407 		thread_assert_runq_null(thread);
6408 		return FALSE;
6409 	}
6410 
6411 	processor_t processor = thread_get_runq(thread);
6412 	if (processor == PROCESSOR_NULL) {
6413 		/*
6414 		 * The thread is either not on the runq,
6415 		 * or is in the midst of being removed from the runq.
6416 		 *
6417 		 * runq is set to NULL under the pset lock, not the thread
6418 		 * lock, so the thread may still be in the process of being dequeued
6419 		 * from the runq. It will wait in invoke for the thread lock to be
6420 		 * dropped.
6421 		 */
6422 
6423 		return FALSE;
6424 	}
6425 
6426 	if (thread->sched_pri < BASEPRI_RTQUEUES) {
6427 		return SCHED(processor_queue_remove)(processor, thread);
6428 	}
6429 
6430 	processor_set_t pset = processor->processor_set;
6431 
6432 	pset_lock(pset);
6433 
6434 	/*
6435 	 * Must re-read the thread runq after acquiring the pset lock, in
6436 	 * case another core swooped in before us to dequeue the thread.
6437 	 */
6438 	if (thread_get_runq_locked(thread) != PROCESSOR_NULL) {
6439 		/*
6440 		 *	Thread is on the RT run queue and we have a lock on
6441 		 *	that run queue.
6442 		 */
6443 		rt_runq_remove(SCHED(rt_runq)(pset), thread);
6444 		pset_update_rt_stealable_state(pset);
6445 
6446 		removed = TRUE;
6447 	}
6448 
6449 	pset_unlock(pset);
6450 
6451 	return removed;
6452 }
6453 
6454 /*
6455  * Put the thread back where it goes after a thread_run_queue_remove
6456  *
6457  * Thread must have been removed under the same thread lock hold
6458  *
6459  * thread locked, at splsched
6460  */
6461 void
thread_run_queue_reinsert(thread_t thread,sched_options_t options)6462 thread_run_queue_reinsert(thread_t thread, sched_options_t options)
6463 {
6464 	thread_assert_runq_null(thread);
6465 	assert(thread->state & (TH_RUN));
6466 
6467 	thread_setrun(thread, options);
6468 }
6469 
6470 void
sys_override_cpu_throttle(boolean_t enable_override)6471 sys_override_cpu_throttle(boolean_t enable_override)
6472 {
6473 	if (enable_override) {
6474 		cpu_throttle_enabled = 0;
6475 	} else {
6476 		cpu_throttle_enabled = 1;
6477 	}
6478 }
6479 
6480 thread_urgency_t
thread_get_urgency(thread_t thread,uint64_t * arg1,uint64_t * arg2)6481 thread_get_urgency(thread_t thread, uint64_t *arg1, uint64_t *arg2)
6482 {
6483 	uint64_t urgency_param1 = 0, urgency_param2 = 0;
6484 	task_t task = get_threadtask_early(thread);
6485 
6486 	thread_urgency_t urgency;
6487 
6488 	if (thread == NULL || task == TASK_NULL || (thread->state & TH_IDLE)) {
6489 		urgency_param1 = 0;
6490 		urgency_param2 = 0;
6491 
6492 		urgency = THREAD_URGENCY_NONE;
6493 	} else if (thread->sched_mode == TH_MODE_REALTIME) {
6494 		urgency_param1 = thread->realtime.period;
6495 		urgency_param2 = thread->realtime.deadline;
6496 
6497 		urgency = THREAD_URGENCY_REAL_TIME;
6498 	} else if (cpu_throttle_enabled &&
6499 	    (thread->sched_pri <= MAXPRI_THROTTLE) &&
6500 	    (thread->base_pri <= MAXPRI_THROTTLE)) {
6501 		/*
6502 		 * Threads that are running at low priority but are not
6503 		 * tagged with a specific QoS are separated out from
6504 		 * the "background" urgency. Performance management
6505 		 * subsystem can decide to either treat these threads
6506 		 * as normal threads or look at other signals like thermal
6507 		 * levels for optimal power/perf tradeoffs for a platform.
6508 		 */
6509 		boolean_t thread_lacks_qos = (proc_get_effective_thread_policy(thread, TASK_POLICY_QOS) == THREAD_QOS_UNSPECIFIED); //thread_has_qos_policy(thread);
6510 		boolean_t task_is_suppressed = (proc_get_effective_task_policy(task, TASK_POLICY_SUP_ACTIVE) == 0x1);
6511 
6512 		/*
6513 		 * Background urgency applied when thread priority is
6514 		 * MAXPRI_THROTTLE or lower and thread is not promoted
6515 		 * and thread has a QoS specified
6516 		 */
6517 		urgency_param1 = thread->sched_pri;
6518 		urgency_param2 = thread->base_pri;
6519 
6520 		if (thread_lacks_qos && !task_is_suppressed) {
6521 			urgency = THREAD_URGENCY_LOWPRI;
6522 		} else {
6523 			urgency = THREAD_URGENCY_BACKGROUND;
6524 		}
6525 	} else {
6526 		/* For otherwise unclassified threads, report throughput QoS parameters */
6527 		urgency_param1 = proc_get_effective_thread_policy(thread, TASK_POLICY_THROUGH_QOS);
6528 		urgency_param2 = proc_get_effective_task_policy(task, TASK_POLICY_THROUGH_QOS);
6529 		urgency = THREAD_URGENCY_NORMAL;
6530 	}
6531 
6532 	if (arg1 != NULL) {
6533 		*arg1 = urgency_param1;
6534 	}
6535 	if (arg2 != NULL) {
6536 		*arg2 = urgency_param2;
6537 	}
6538 
6539 	return urgency;
6540 }
6541 
6542 perfcontrol_class_t
thread_get_perfcontrol_class(thread_t thread)6543 thread_get_perfcontrol_class(thread_t thread)
6544 {
6545 	/* Special case handling */
6546 	if (thread->state & TH_IDLE) {
6547 		return PERFCONTROL_CLASS_IDLE;
6548 	}
6549 
6550 	if (thread->sched_mode == TH_MODE_REALTIME) {
6551 		return PERFCONTROL_CLASS_REALTIME;
6552 	}
6553 
6554 	/* perfcontrol_class based on base_pri */
6555 	if (thread->base_pri <= MAXPRI_THROTTLE) {
6556 		return PERFCONTROL_CLASS_BACKGROUND;
6557 	} else if (thread->base_pri <= BASEPRI_UTILITY) {
6558 		return PERFCONTROL_CLASS_UTILITY;
6559 	} else if (thread->base_pri <= BASEPRI_DEFAULT) {
6560 		return PERFCONTROL_CLASS_NONUI;
6561 	} else if (thread->base_pri <= BASEPRI_USER_INITIATED) {
6562 		return PERFCONTROL_CLASS_USER_INITIATED;
6563 	} else if (thread->base_pri <= BASEPRI_FOREGROUND) {
6564 		return PERFCONTROL_CLASS_UI;
6565 	} else {
6566 		if (get_threadtask(thread) == kernel_task) {
6567 			/*
6568 			 * Classify Above UI kernel threads as PERFCONTROL_CLASS_KERNEL.
6569 			 * All other lower priority kernel threads should be treated
6570 			 * as regular threads for performance control purposes.
6571 			 */
6572 			return PERFCONTROL_CLASS_KERNEL;
6573 		}
6574 		return PERFCONTROL_CLASS_ABOVEUI;
6575 	}
6576 }
6577 
/*
 *	This is the processor idle loop, which just looks for other threads
 *	to execute.  Processor idle threads invoke this without supplying a
 *	current thread to idle without an asserted wait state.
 *
 *	Returns the next thread to execute if dispatched directly.
 */
6585 
/*
 * Tracing hook used inside the idle loop.  With the "#if 0" below, the
 * "#else" arm is selected and the macro expands to an empty statement;
 * flipping the condition forwards the arguments to KERNEL_DEBUG_CONSTANT.
 */
#if 0
#define IDLE_KERNEL_DEBUG_CONSTANT(...) KERNEL_DEBUG_CONSTANT(__VA_ARGS__)
#else
#define IDLE_KERNEL_DEBUG_CONSTANT(...) do { } while(0)
#endif

#if (DEVELOPMENT || DEBUG)
/*
 * processor_idle() calls delay(500) after each return from machine_idle()
 * on the processor whose cpu_id equals this value.
 * NOTE(review): the default of -1 presumably matches no cpu_id -- confirm.
 */
int sched_idle_delay_cpuid = -1;
#endif
6595 
thread_t
processor_idle(
	thread_t                        thread,
	processor_t                     processor)
{
	/*
	 * Parameters:
	 *   thread     only used here for its tid in trace points
	 *   processor  the processor being idled
	 *
	 * Returns whatever thread_select() picks once the idle loop exits.
	 */
	processor_set_t         pset = processor->processor_set;
	struct recount_snap snap = { 0 };

	(void)splsched();

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_IDLE) | DBG_FUNC_START,
	    (uintptr_t)thread_tid(thread), 0, 0, 0, 0);

	SCHED_STATS_INC(idle_transitions);
	assert(processor->running_timers_active == false);

	/* Account the transition into idle. */
	recount_snapshot(&snap);
	recount_processor_idle(&processor->pr_recount, &snap);

	while (1) {
		/*
		 * Ensure that updates to my processor and pset state,
		 * made by the IPI source processor before sending the IPI,
		 * are visible on this processor now (even though we don't
		 * take the pset lock yet).
		 */
		atomic_thread_fence(memory_order_acquire);

		/* Leave the loop as soon as any of the wake conditions below holds. */
		if (processor->state != PROCESSOR_IDLE) {
			break;
		}
		if (bit_test(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
			break;
		}
#if defined(CONFIG_SCHED_DEFERRED_AST)
		if (bit_test(pset->pending_deferred_AST_cpu_mask, processor->cpu_id)) {
			break;
		}
#endif
		if (bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) {
			break;
		}

		/*
		 * Recommended primary processors leave idle for any queued
		 * realtime work in the pset; all other processors only leave
		 * for threads counted by processor_bound_count.
		 */
		if (processor->is_recommended && (processor->processor_primary == processor)) {
			if (rt_runq_count(pset)) {
				break;
			}
		} else {
			if (SCHED(processor_bound_count)(processor)) {
				break;
			}
		}

		IDLE_KERNEL_DEBUG_CONSTANT(
			MACHDBG_CODE(DBG_MACH_SCHED, MACH_IDLE) | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), rt_runq_count(pset), SCHED(processor_runq_count)(processor), -1, 0);

		machine_track_platform_idle(TRUE);

		machine_idle();
		/* returns with interrupts enabled */

		machine_track_platform_idle(FALSE);

#if (DEVELOPMENT || DEBUG)
		/* Debug knob: add a delay on one selected CPU after it leaves machine_idle(). */
		if (processor->cpu_id == sched_idle_delay_cpuid) {
			delay(500);
		}
#endif

		/* Back to splsched before re-examining scheduler state. */
		(void)splsched();

		atomic_thread_fence(memory_order_acquire);

		IDLE_KERNEL_DEBUG_CONSTANT(
			MACHDBG_CODE(DBG_MACH_SCHED, MACH_IDLE) | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), rt_runq_count(pset), SCHED(processor_runq_count)(processor), -2, 0);

		/*
		 * Check if we should call sched_timeshare_consider_maintenance() here.
		 * The CPU was woken out of idle due to an interrupt and we should do the
		 * call only if the processor is still idle. If the processor is non-idle,
		 * the threads running on the processor would do the call as part of
		 * context switching.
		 */
		if (processor->state == PROCESSOR_IDLE) {
			sched_timeshare_consider_maintenance(mach_absolute_time(), true);
		}

		if (!SCHED(processor_queue_empty)(processor)) {
			/* Secondary SMT processors respond to directed wakeups
			 * exclusively. Some platforms induce 'spurious' SMT wakeups.
			 */
			if (processor->processor_primary == processor) {
				break;
			}
		}
	}

	/* Account the transition out of idle; the same snapshot time is passed to smr_cpu_join(). */
	recount_snapshot(&snap);
	recount_processor_run(&processor->pr_recount, &snap);
	smr_cpu_join(processor, snap.rsn_time_mach);

	ast_t reason = AST_NONE;

	/* We're handling all scheduling AST's */
	ast_off(AST_SCHEDULING);

	/*
	 * thread_select will move the processor from dispatching to running,
	 * or put it in idle if there's nothing to do.
	 */
	thread_t cur_thread = current_thread();

	/* thread_select() is called with the current thread locked. */
	thread_lock(cur_thread);
	thread_t new_thread = thread_select(cur_thread, processor, &reason);
	thread_unlock(cur_thread);

	assert(processor->running_timers_active == false);

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_IDLE) | DBG_FUNC_END,
	    (uintptr_t)thread_tid(thread), processor->state, (uintptr_t)thread_tid(new_thread), reason, 0);

	return new_thread;
}
6721 
6722 /*
6723  *	Each processor has a dedicated thread which
6724  *	executes the idle loop when there is no suitable
6725  *	previous context.
6726  *
6727  *	This continuation is entered with interrupts disabled.
6728  */
6729 void
idle_thread(__assert_only void * parameter,__unused wait_result_t result)6730 idle_thread(__assert_only void* parameter,
6731     __unused wait_result_t result)
6732 {
6733 	assert(ml_get_interrupts_enabled() == FALSE);
6734 	assert(parameter == NULL);
6735 
6736 	processor_t processor = current_processor();
6737 
6738 	smr_cpu_leave(processor, processor->last_dispatch);
6739 
6740 	/*
6741 	 * Ensure that anything running in idle context triggers
6742 	 * preemption-disabled checks.
6743 	 */
6744 	disable_preemption_without_measurements();
6745 
6746 	/*
6747 	 * Enable interrupts temporarily to handle any pending interrupts
6748 	 * or IPIs before deciding to sleep
6749 	 */
6750 	spllo();
6751 
6752 	thread_t new_thread = processor_idle(THREAD_NULL, processor);
6753 	/* returns with interrupts disabled */
6754 
6755 	enable_preemption();
6756 
6757 	if (new_thread != THREAD_NULL) {
6758 		thread_run(processor->idle_thread,
6759 		    idle_thread, NULL, new_thread);
6760 		/*NOTREACHED*/
6761 	}
6762 
6763 	thread_block(idle_thread);
6764 	/*NOTREACHED*/
6765 }
6766 
6767 kern_return_t
idle_thread_create(processor_t processor)6768 idle_thread_create(
6769 	processor_t             processor)
6770 {
6771 	kern_return_t   result;
6772 	thread_t                thread;
6773 	spl_t                   s;
6774 	char                    name[MAXTHREADNAMESIZE];
6775 
6776 	result = kernel_thread_create(idle_thread, NULL, MAXPRI_KERNEL, &thread);
6777 	if (result != KERN_SUCCESS) {
6778 		return result;
6779 	}
6780 
6781 	snprintf(name, sizeof(name), "idle #%d", processor->cpu_id);
6782 	thread_set_thread_name(thread, name);
6783 
6784 	s = splsched();
6785 	thread_lock(thread);
6786 	thread->bound_processor = processor;
6787 	processor->idle_thread = thread;
6788 	thread->sched_pri = thread->base_pri = IDLEPRI;
6789 	thread->state = (TH_RUN | TH_IDLE);
6790 	thread->options |= TH_OPT_IDLE_THREAD;
6791 	thread->last_made_runnable_time = thread->last_basepri_change_time = mach_absolute_time();
6792 	thread_unlock(thread);
6793 	splx(s);
6794 
6795 	thread_deallocate(thread);
6796 
6797 	return KERN_SUCCESS;
6798 }
6799 
/* Forward declaration: sched_startup() starts a kernel thread on this routine. */
static void sched_update_powered_cores_continue(void);
6801 
6802 /*
6803  * sched_startup:
6804  *
6805  * Kicks off scheduler services.
6806  *
6807  * Called at splsched.
6808  */
6809 void
sched_startup(void)6810 sched_startup(void)
6811 {
6812 	kern_return_t   result;
6813 	thread_t                thread;
6814 
6815 	simple_lock_init(&sched_vm_group_list_lock, 0);
6816 
6817 	result = kernel_thread_start_priority((thread_continue_t)sched_init_thread,
6818 	    NULL, MAXPRI_KERNEL, &thread);
6819 	if (result != KERN_SUCCESS) {
6820 		panic("sched_startup");
6821 	}
6822 
6823 	thread_deallocate(thread);
6824 
6825 	assert_thread_magic(thread);
6826 
6827 	/*
6828 	 * Yield to the sched_init_thread once, to
6829 	 * initialize our own thread after being switched
6830 	 * back to.
6831 	 *
6832 	 * The current thread is the only other thread
6833 	 * active at this point.
6834 	 */
6835 	thread_block(THREAD_CONTINUE_NULL);
6836 
6837 	result = kernel_thread_start_priority((thread_continue_t)sched_update_powered_cores_continue,
6838 	    NULL, MAXPRI_KERNEL, &thread);
6839 	if (result != KERN_SUCCESS) {
6840 		panic("sched_startup");
6841 	}
6842 
6843 	thread_deallocate(thread);
6844 
6845 	assert_thread_magic(thread);
6846 }
6847 
#if __arm64__
/*
 * Deadline checked by sched_timeshare_consider_maintenance(): when it is
 * non-zero and has passed, it is atomically reset to 0 and
 * machine_perfcontrol_deadline_passed() is invoked with the old value.
 */
static _Atomic uint64_t sched_perfcontrol_callback_deadline;
#endif /* __arm64__ */
6851 
6852 
6853 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
6854 
/* Time at or after which sched_timeshare_consider_maintenance() wakes the maintenance thread. */
static volatile uint64_t                sched_maintenance_deadline;
/* Timestamp taken at the start of the most recent maintenance pass (0 before the first pass). */
static uint64_t                         sched_tick_last_abstime;
/* Number of sched_tick_interval periods charged to the current pass, clamped to [1, SCHED_TICK_MAX_DELTA]. */
static uint64_t                         sched_tick_delta;
/* Largest sched_tick_delta recorded by a maintenance pass after the first. */
uint64_t                                sched_tick_max_delta;
6859 
6860 
/*
 *	sched_timeshare_maintenance_continue:
 *
 *	Perform periodic bookkeeping functions about ten
 *	times per second.
 */
void
sched_timeshare_maintenance_continue(void)
{
	/*
	 * One maintenance pass: advance sched_tick, recompute averages, scan
	 * the run queues, report runnable latencies, then block until woken
	 * on this function's address with itself as the continuation.
	 */
	uint64_t        sched_tick_ctime, late_time;

	/* UINT64_MAX means "no runnable thread of that class seen yet". */
	struct sched_update_scan_context scan_context = {
		.earliest_bg_make_runnable_time = UINT64_MAX,
		.earliest_normal_make_runnable_time = UINT64_MAX,
		.earliest_rt_make_runnable_time = UINT64_MAX
	};

	sched_tick_ctime = mach_absolute_time();

	if (__improbable(sched_tick_last_abstime == 0)) {
		/* First pass ever: no previous timestamp, so charge exactly one tick. */
		sched_tick_last_abstime = sched_tick_ctime;
		late_time = 0;
		sched_tick_delta = 1;
	} else {
		/* Convert the time since the previous pass into whole tick intervals. */
		late_time = sched_tick_ctime - sched_tick_last_abstime;
		sched_tick_delta = late_time / sched_tick_interval;
		/* Ensure a delta of 1, since the interval could be slightly
		 * smaller than the sched_tick_interval due to dispatch
		 * latencies.
		 */
		sched_tick_delta = MAX(sched_tick_delta, 1);

		/* In the event interrupt latencies or platform
		 * idle events that advanced the timebase resulted
		 * in periods where no threads were dispatched,
		 * cap the maximum "tick delta" at SCHED_TICK_MAX_DELTA
		 * iterations.
		 */
		sched_tick_delta = MIN(sched_tick_delta, SCHED_TICK_MAX_DELTA);

		sched_tick_last_abstime = sched_tick_ctime;
		sched_tick_max_delta = MAX(sched_tick_delta, sched_tick_max_delta);
	}

	scan_context.sched_tick_last_abstime = sched_tick_last_abstime;
	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_MAINTENANCE) | DBG_FUNC_START,
	    sched_tick_delta, late_time, 0, 0, 0);

	/* Add a number of pseudo-ticks corresponding to the elapsed interval
	 * This could be greater than 1 if substantial intervals where
	 * all processors are idle occur, which rarely occurs in practice.
	 */

	sched_tick += sched_tick_delta;

	update_vm_info();

	/*
	 *  Compute various averages.
	 */
	compute_averages(sched_tick_delta);

	/*
	 *  Scan the run queues for threads which
	 *  may need to be updated, and find the earliest runnable thread on the runqueue
	 *  to report its latency.
	 */
	SCHED(thread_update_scan)(&scan_context);

	SCHED(rt_runq_scan)(&scan_context);

	/* Fresh timestamp, taken after the scans, for the latency calculations. */
	uint64_t ctime = mach_absolute_time();

	/*
	 * Latency per class = now - earliest make-runnable time, or 0 when the
	 * recorded time is not in the past (including the UINT64_MAX sentinel).
	 */
	uint64_t bg_max_latency       = (ctime > scan_context.earliest_bg_make_runnable_time) ?
	    ctime - scan_context.earliest_bg_make_runnable_time : 0;

	uint64_t default_max_latency  = (ctime > scan_context.earliest_normal_make_runnable_time) ?
	    ctime - scan_context.earliest_normal_make_runnable_time : 0;

	uint64_t realtime_max_latency = (ctime > scan_context.earliest_rt_make_runnable_time) ?
	    ctime - scan_context.earliest_rt_make_runnable_time : 0;

	machine_max_runnable_latency(bg_max_latency, default_max_latency, realtime_max_latency);

	/*
	 * Check to see if the special sched VM group needs attention.
	 */
	sched_vm_group_maintenance();

#if __arm64__
	/* Check to see if the recommended cores failsafe is active */
	sched_recommended_cores_maintenance();
#endif /* __arm64__ */


#if DEBUG || DEVELOPMENT
#if __x86_64__
	/* NOTE(review): header is included from inside the function body -- presumably for the mp_interrupt_watchdog() prototype; confirm. */
#include <i386/misc_protos.h>
	/* Check for long-duration interrupts */
	mp_interrupt_watchdog();
#endif /* __x86_64__ */
#endif /* DEBUG || DEVELOPMENT */

	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_MAINTENANCE) | DBG_FUNC_END,
	    sched_pri_shifts[TH_BUCKET_SHARE_FG], sched_pri_shifts[TH_BUCKET_SHARE_BG],
	    sched_pri_shifts[TH_BUCKET_SHARE_UT], sched_pri_shifts[TH_BUCKET_SHARE_DF], 0);

	/*
	 * Wait (uninterruptibly) on this function's own address; the matching
	 * thread_wakeup() is in sched_timeshare_consider_maintenance().
	 */
	assert_wait((event_t)sched_timeshare_maintenance_continue, THREAD_UNINT);
	thread_block((thread_continue_t)sched_timeshare_maintenance_continue);
	/*NOTREACHED*/
}
6972 
/* Incremented each time sched_timeshare_consider_maintenance() wakes the maintenance thread. */
static uint64_t sched_maintenance_wakeups;
6974 
6975 /*
6976  * Determine if the set of routines formerly driven by a maintenance timer
6977  * must be invoked, based on a deadline comparison. Signals the scheduler
6978  * maintenance thread on deadline expiration. Must be invoked at an interval
6979  * lower than the "sched_tick_interval", currently accomplished by
6980  * invocation via the quantum expiration timer and at context switch time.
6981  * Performance matters: this routine reuses a timestamp approximating the
6982  * current absolute time received from the caller, and should perform
6983  * no more than a comparison against the deadline in the common case.
6984  */
void
sched_timeshare_consider_maintenance(uint64_t ctime, bool safe_point)
{
	/* Snapshot the deadline; the exchange below only succeeds if it is still this value. */
	uint64_t deadline = sched_maintenance_deadline;

	if (__improbable(ctime >= deadline)) {
		/*
		 * The maintenance thread never wakes itself.  Note that this
		 * early return also skips the SMR tick and the deadline checks
		 * further down.
		 */
		if (__improbable(current_thread() == sched_maintenance_thread)) {
			return;
		}
		OSMemoryBarrier();

		uint64_t ndeadline = ctime + sched_tick_interval;

		/* Only the caller whose exchange installs the new deadline issues the wakeup. */
		if (__probable(os_atomic_cmpxchg(&sched_maintenance_deadline, deadline, ndeadline, seq_cst))) {
			thread_wakeup((event_t)sched_timeshare_maintenance_continue);
			sched_maintenance_wakeups++;
			smr_maintenance(ctime);
		}
	}

	smr_cpu_tick(ctime, safe_point);

#if !CONFIG_SCHED_CLUTCH
	/*
	 * Only non-clutch schedulers use the global load calculation EWMA algorithm. For clutch
	 * scheduler, the load is maintained at the thread group and bucket level.
	 */
	uint64_t load_compute_deadline = os_atomic_load_wide(&sched_load_compute_deadline, relaxed);

	if (__improbable(load_compute_deadline && ctime >= load_compute_deadline)) {
		/*
		 * Swap the deadline to 0 first so that only the caller whose
		 * exchange succeeds computes the load, then publish the next
		 * deadline once the computation is done.
		 */
		uint64_t new_deadline = 0;
		if (os_atomic_cmpxchg(&sched_load_compute_deadline, load_compute_deadline, new_deadline, relaxed)) {
			compute_sched_load();
			new_deadline = ctime + sched_load_compute_interval_abs;
			os_atomic_store_wide(&sched_load_compute_deadline, new_deadline, relaxed);
		}
	}
#endif /* !CONFIG_SCHED_CLUTCH */

#if __arm64__
	uint64_t perf_deadline = os_atomic_load(&sched_perfcontrol_callback_deadline, relaxed);

	if (__improbable(perf_deadline && ctime >= perf_deadline)) {
		/* CAS in 0, if success, make callback. Otherwise let the next context switch check again. */
		if (os_atomic_cmpxchg(&sched_perfcontrol_callback_deadline, perf_deadline, 0, relaxed)) {
			machine_perfcontrol_deadline_passed(perf_deadline);
		}
	}
#endif /* __arm64__ */
}
7035 
7036 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
7037 
7038 void
sched_init_thread(void)7039 sched_init_thread(void)
7040 {
7041 	thread_block(THREAD_CONTINUE_NULL);
7042 
7043 	thread_t thread = current_thread();
7044 
7045 	thread_set_thread_name(thread, "sched_maintenance_thread");
7046 
7047 	sched_maintenance_thread = thread;
7048 
7049 	SCHED(maintenance_continuation)();
7050 
7051 	/*NOTREACHED*/
7052 }
7053 
7054 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
7055 
7056 /*
7057  *	thread_update_scan / runq_scan:
7058  *
7059  *	Scan the run queues to account for timesharing threads
7060  *	which need to be updated.
7061  *
7062  *	Scanner runs in two passes.  Pass one squirrels likely
7063  *	threads away in an array, pass two does the update.
7064  *
7065  *	This is necessary because the run queue is locked for
7066  *	the candidate scan, but	the thread is locked for the update.
7067  *
7068  *	Array should be sized to make forward progress, without
7069  *	disabling preemption for long periods.
7070  */
7071 
/* Capacity of the candidate array filled during a run queue scan. */
#define THREAD_UPDATE_SIZE              128

/* Candidate threads collected by thread_update_add_thread(); each entry holds a thread reference. */
static thread_t thread_update_array[THREAD_UPDATE_SIZE];
/* Number of valid entries in thread_update_array; reset to 0 by thread_update_process_threads(). */
static uint32_t thread_update_count = 0;
7076 
7077 /* Returns TRUE if thread was added, FALSE if thread_update_array is full */
7078 boolean_t
thread_update_add_thread(thread_t thread)7079 thread_update_add_thread(thread_t thread)
7080 {
7081 	if (thread_update_count == THREAD_UPDATE_SIZE) {
7082 		return FALSE;
7083 	}
7084 
7085 	thread_update_array[thread_update_count++] = thread;
7086 	thread_reference(thread);
7087 	return TRUE;
7088 }
7089 
7090 void
thread_update_process_threads(void)7091 thread_update_process_threads(void)
7092 {
7093 	assert(thread_update_count <= THREAD_UPDATE_SIZE);
7094 
7095 	for (uint32_t i = 0; i < thread_update_count; i++) {
7096 		thread_t thread = thread_update_array[i];
7097 		assert_thread_magic(thread);
7098 		thread_update_array[i] = THREAD_NULL;
7099 
7100 		spl_t s = splsched();
7101 		thread_lock(thread);
7102 		if (!(thread->state & (TH_WAIT)) && thread->sched_stamp != sched_tick) {
7103 			SCHED(update_priority)(thread);
7104 		}
7105 		thread_unlock(thread);
7106 		splx(s);
7107 
7108 		thread_deallocate(thread);
7109 	}
7110 
7111 	thread_update_count = 0;
7112 }
7113 
7114 static boolean_t
runq_scan_thread(thread_t thread,sched_update_scan_context_t scan_context)7115 runq_scan_thread(
7116 	thread_t thread,
7117 	sched_update_scan_context_t scan_context)
7118 {
7119 	assert_thread_magic(thread);
7120 
7121 	if (thread->sched_stamp != sched_tick &&
7122 	    thread->sched_mode == TH_MODE_TIMESHARE) {
7123 		if (thread_update_add_thread(thread) == FALSE) {
7124 			return TRUE;
7125 		}
7126 	}
7127 
7128 	if (cpu_throttle_enabled && ((thread->sched_pri <= MAXPRI_THROTTLE) && (thread->base_pri <= MAXPRI_THROTTLE))) {
7129 		if (thread->last_made_runnable_time < scan_context->earliest_bg_make_runnable_time) {
7130 			scan_context->earliest_bg_make_runnable_time = thread->last_made_runnable_time;
7131 		}
7132 	} else {
7133 		if (thread->last_made_runnable_time < scan_context->earliest_normal_make_runnable_time) {
7134 			scan_context->earliest_normal_make_runnable_time = thread->last_made_runnable_time;
7135 		}
7136 	}
7137 
7138 	return FALSE;
7139 }
7140 
7141 /*
7142  *	Scan a runq for candidate threads.
7143  *
7144  *	Returns TRUE if retry is needed.
7145  */
7146 boolean_t
runq_scan(run_queue_t runq,sched_update_scan_context_t scan_context)7147 runq_scan(
7148 	run_queue_t                   runq,
7149 	sched_update_scan_context_t   scan_context)
7150 {
7151 	int count       = runq->count;
7152 	int queue_index;
7153 
7154 	assert(count >= 0);
7155 
7156 	if (count == 0) {
7157 		return FALSE;
7158 	}
7159 
7160 	for (queue_index = bitmap_first(runq->bitmap, NRQS);
7161 	    queue_index >= 0;
7162 	    queue_index = bitmap_next(runq->bitmap, queue_index)) {
7163 		thread_t thread;
7164 		circle_queue_t queue = &runq->queues[queue_index];
7165 
7166 		cqe_foreach_element(thread, queue, runq_links) {
7167 			assert(count > 0);
7168 			if (runq_scan_thread(thread, scan_context) == TRUE) {
7169 				return TRUE;
7170 			}
7171 			count--;
7172 		}
7173 	}
7174 
7175 	return FALSE;
7176 }
7177 
7178 #if CONFIG_SCHED_CLUTCH
7179 
7180 boolean_t
sched_clutch_timeshare_scan(queue_t thread_queue,uint16_t thread_count,sched_update_scan_context_t scan_context)7181 sched_clutch_timeshare_scan(
7182 	queue_t thread_queue,
7183 	uint16_t thread_count,
7184 	sched_update_scan_context_t scan_context)
7185 {
7186 	if (thread_count == 0) {
7187 		return FALSE;
7188 	}
7189 
7190 	thread_t thread;
7191 	qe_foreach_element_safe(thread, thread_queue, th_clutch_timeshare_link) {
7192 		if (runq_scan_thread(thread, scan_context) == TRUE) {
7193 			return TRUE;
7194 		}
7195 		thread_count--;
7196 	}
7197 
7198 	assert(thread_count == 0);
7199 	return FALSE;
7200 }
7201 
7202 
7203 #endif /* CONFIG_SCHED_CLUTCH */
7204 
7205 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
7206 
7207 bool
thread_is_eager_preempt(thread_t thread)7208 thread_is_eager_preempt(thread_t thread)
7209 {
7210 	return thread->sched_flags & TH_SFLAG_EAGERPREEMPT;
7211 }
7212 
7213 void
thread_set_eager_preempt(thread_t thread)7214 thread_set_eager_preempt(thread_t thread)
7215 {
7216 	spl_t s = splsched();
7217 	thread_lock(thread);
7218 
7219 	assert(!thread_is_eager_preempt(thread));
7220 
7221 	thread->sched_flags |= TH_SFLAG_EAGERPREEMPT;
7222 
7223 	if (thread == current_thread()) {
7224 		/* csw_check updates current_is_eagerpreempt on the processor */
7225 		ast_t ast = csw_check(thread, current_processor(), AST_NONE);
7226 
7227 		thread_unlock(thread);
7228 
7229 		if (ast != AST_NONE) {
7230 			thread_block_reason(THREAD_CONTINUE_NULL, NULL, ast);
7231 		}
7232 	} else {
7233 		processor_t last_processor = thread->last_processor;
7234 
7235 		if (last_processor != PROCESSOR_NULL &&
7236 		    last_processor->state == PROCESSOR_RUNNING &&
7237 		    last_processor->active_thread == thread) {
7238 			cause_ast_check(last_processor);
7239 		}
7240 
7241 		thread_unlock(thread);
7242 	}
7243 
7244 	splx(s);
7245 }
7246 
7247 void
thread_clear_eager_preempt(thread_t thread)7248 thread_clear_eager_preempt(thread_t thread)
7249 {
7250 	spl_t s = splsched();
7251 	thread_lock(thread);
7252 
7253 	assert(thread_is_eager_preempt(thread));
7254 
7255 	thread->sched_flags &= ~TH_SFLAG_EAGERPREEMPT;
7256 
7257 	if (thread == current_thread()) {
7258 		current_processor()->current_is_eagerpreempt = false;
7259 	}
7260 
7261 	thread_unlock(thread);
7262 	splx(s);
7263 }
7264 
7265 /*
7266  * Scheduling statistics
7267  */
7268 void
sched_stats_handle_csw(processor_t processor,int reasons,int selfpri,int otherpri)7269 sched_stats_handle_csw(processor_t processor, int reasons, int selfpri, int otherpri)
7270 {
7271 	struct sched_statistics *stats;
7272 	boolean_t to_realtime = FALSE;
7273 
7274 	stats = PERCPU_GET_RELATIVE(sched_stats, processor, processor);
7275 	stats->csw_count++;
7276 
7277 	if (otherpri >= BASEPRI_REALTIME) {
7278 		stats->rt_sched_count++;
7279 		to_realtime = TRUE;
7280 	}
7281 
7282 	if ((reasons & AST_PREEMPT) != 0) {
7283 		stats->preempt_count++;
7284 
7285 		if (selfpri >= BASEPRI_REALTIME) {
7286 			stats->preempted_rt_count++;
7287 		}
7288 
7289 		if (to_realtime) {
7290 			stats->preempted_by_rt_count++;
7291 		}
7292 	}
7293 }
7294 
7295 void
sched_stats_handle_runq_change(struct runq_stats * stats,int old_count)7296 sched_stats_handle_runq_change(struct runq_stats *stats, int old_count)
7297 {
7298 	uint64_t timestamp = mach_absolute_time();
7299 
7300 	stats->count_sum += (timestamp - stats->last_change_timestamp) * old_count;
7301 	stats->last_change_timestamp = timestamp;
7302 }
7303 
7304 /*
7305  *     For calls from assembly code
7306  */
7307 #undef thread_wakeup
7308 void
7309 thread_wakeup(
7310 	event_t         x);
7311 
7312 void
thread_wakeup(event_t x)7313 thread_wakeup(
7314 	event_t         x)
7315 {
7316 	thread_wakeup_with_result(x, THREAD_AWAKENED);
7317 }
7318 
7319 boolean_t
preemption_enabled(void)7320 preemption_enabled(void)
7321 {
7322 	return get_preemption_level() == 0 && ml_get_interrupts_enabled();
7323 }
7324 
7325 static void
sched_timer_deadline_tracking_init(void)7326 sched_timer_deadline_tracking_init(void)
7327 {
7328 	nanoseconds_to_absolutetime(TIMER_DEADLINE_TRACKING_BIN_1_DEFAULT, &timer_deadline_tracking_bin_1);
7329 	nanoseconds_to_absolutetime(TIMER_DEADLINE_TRACKING_BIN_2_DEFAULT, &timer_deadline_tracking_bin_2);
7330 }
7331 
/*
 * Powered-core request state.  The functions below read and write these
 * four variables with sched_available_cores_lock held (sched_is_in_sleep()
 * reads perfcontrol_sleep_override after an acquire fence instead).
 */
/* Most recent powered-core mask requested via sched_perfcontrol_update_powered_cores() */
static uint64_t latest_requested_powered_cores = ALL_CORES_POWERED;
/* Reason that accompanied the most recent request */
processor_reason_t latest_requested_reason = REASON_NONE;
/* Mask last picked up by sched_update_powered_cores_continue() */
static uint64_t current_requested_powered_cores = ALL_CORES_POWERED;
/* Set by sched_override_available_cores_for_sleep(), cleared by sched_restore_available_cores_after_sleep() */
bool perfcontrol_sleep_override = false;

LCK_GRP_DECLARE(cluster_powerdown_grp, "cluster_powerdown");
/* Mutex held by suspend_cluster_powerdown(), resume_cluster_powerdown() and the update continuation */
LCK_MTX_DECLARE(cluster_powerdown_lock, &cluster_powerdown_grp);
/* Outstanding suspend_cluster_powerdown() calls; modified under cluster_powerdown_lock */
int32_t cluster_powerdown_suspend_count = 0;
7340 
7341 bool
sched_is_in_sleep(void)7342 sched_is_in_sleep(void)
7343 {
7344 	os_atomic_thread_fence(acquire);
7345 	return perfcontrol_sleep_override;
7346 }
7347 
7348 static void
sched_update_powered_cores_continue(void)7349 sched_update_powered_cores_continue(void)
7350 {
7351 	lck_mtx_lock(&cluster_powerdown_lock);
7352 
7353 	if (!cluster_powerdown_suspend_count) {
7354 		spl_t s = splsched();
7355 		simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
7356 
7357 		uint64_t latest = latest_requested_powered_cores;
7358 		processor_reason_t reason = latest_requested_reason;
7359 		uint64_t current = current_requested_powered_cores;
7360 		current_requested_powered_cores = latest;
7361 		bool in_sleep = perfcontrol_sleep_override;
7362 
7363 		simple_unlock(&sched_available_cores_lock);
7364 		splx(s);
7365 
7366 		while (latest != current) {
7367 			if (!in_sleep) {
7368 				assert((reason == REASON_CLPC_SYSTEM) || (reason == REASON_CLPC_USER));
7369 				sched_update_powered_cores(latest, reason, SHUTDOWN_TEMPORARY | WAIT_FOR_LAST_START);
7370 			}
7371 
7372 			s = splsched();
7373 			simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
7374 
7375 			latest = latest_requested_powered_cores;
7376 			reason = latest_requested_reason;
7377 			current = current_requested_powered_cores;
7378 			current_requested_powered_cores = latest;
7379 			in_sleep = perfcontrol_sleep_override;
7380 
7381 			simple_unlock(&sched_available_cores_lock);
7382 			splx(s);
7383 		}
7384 
7385 		assert_wait((event_t)sched_update_powered_cores_continue, THREAD_UNINT);
7386 
7387 		s = splsched();
7388 		simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
7389 		if (latest_requested_powered_cores != current_requested_powered_cores) {
7390 			clear_wait(current_thread(), THREAD_AWAKENED);
7391 		}
7392 		simple_unlock(&sched_available_cores_lock);
7393 		splx(s);
7394 	}
7395 
7396 	lck_mtx_unlock(&cluster_powerdown_lock);
7397 
7398 	thread_block((thread_continue_t)sched_update_powered_cores_continue);
7399 	/*NOTREACHED*/
7400 }
7401 
7402 void
sched_perfcontrol_update_powered_cores(uint64_t requested_powered_cores,processor_reason_t reason,__unused uint32_t flags)7403 sched_perfcontrol_update_powered_cores(uint64_t requested_powered_cores, processor_reason_t reason, __unused uint32_t flags)
7404 {
7405 	assert((reason == REASON_CLPC_SYSTEM) || (reason == REASON_CLPC_USER));
7406 
7407 #if DEVELOPMENT || DEBUG
7408 	if (flags & (ASSERT_IN_SLEEP | ASSERT_POWERDOWN_SUSPENDED)) {
7409 		if (flags & ASSERT_POWERDOWN_SUSPENDED) {
7410 			assert(cluster_powerdown_suspend_count > 0);
7411 		}
7412 		if (flags & ASSERT_IN_SLEEP) {
7413 			assert(perfcontrol_sleep_override == true);
7414 		}
7415 		return;
7416 	}
7417 #endif
7418 
7419 	spl_t s = splsched();
7420 	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
7421 
7422 	bool should_wakeup = !cluster_powerdown_suspend_count;
7423 	if (should_wakeup) {
7424 		latest_requested_powered_cores = requested_powered_cores;
7425 		latest_requested_reason = reason;
7426 	}
7427 
7428 	simple_unlock(&sched_available_cores_lock);
7429 	splx(s);
7430 
7431 	if (should_wakeup) {
7432 		thread_wakeup((event_t)sched_update_powered_cores_continue);
7433 	}
7434 }
7435 
7436 void
suspend_cluster_powerdown(void)7437 suspend_cluster_powerdown(void)
7438 {
7439 	lck_mtx_lock(&cluster_powerdown_lock);
7440 
7441 	assert(cluster_powerdown_suspend_count >= 0);
7442 
7443 	bool first_suspend = (cluster_powerdown_suspend_count == 0);
7444 	if (first_suspend) {
7445 		spl_t s = splsched();
7446 		simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
7447 		latest_requested_powered_cores = ALL_CORES_POWERED;
7448 		current_requested_powered_cores = ALL_CORES_POWERED;
7449 		latest_requested_reason = REASON_SYSTEM;
7450 		simple_unlock(&sched_available_cores_lock);
7451 		splx(s);
7452 	}
7453 
7454 	cluster_powerdown_suspend_count++;
7455 
7456 	if (first_suspend) {
7457 		kprintf("%s>calling sched_update_powered_cores(ALL_CORES_POWERED, REASON_SYSTEM, LOCK_STATE | WAIT_FOR_START)\n", __FUNCTION__);
7458 		sched_update_powered_cores(ALL_CORES_POWERED, REASON_SYSTEM, LOCK_STATE | WAIT_FOR_START);
7459 	}
7460 
7461 	lck_mtx_unlock(&cluster_powerdown_lock);
7462 }
7463 
7464 void
resume_cluster_powerdown(void)7465 resume_cluster_powerdown(void)
7466 {
7467 	lck_mtx_lock(&cluster_powerdown_lock);
7468 
7469 	if (cluster_powerdown_suspend_count <= 0) {
7470 		panic("resume_cluster_powerdown() called with cluster_powerdown_suspend_count=%d\n", cluster_powerdown_suspend_count);
7471 	}
7472 
7473 	cluster_powerdown_suspend_count--;
7474 
7475 	bool last_resume = (cluster_powerdown_suspend_count == 0);
7476 
7477 	if (last_resume) {
7478 		spl_t s = splsched();
7479 		simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
7480 		latest_requested_powered_cores = ALL_CORES_POWERED;
7481 		current_requested_powered_cores = ALL_CORES_POWERED;
7482 		latest_requested_reason = REASON_SYSTEM;
7483 		simple_unlock(&sched_available_cores_lock);
7484 		splx(s);
7485 
7486 		kprintf("%s>calling sched_update_powered_cores(ALL_CORES_POWERED, REASON_SYSTEM, UNLOCK_STATE)\n", __FUNCTION__);
7487 		sched_update_powered_cores(ALL_CORES_POWERED, REASON_SYSTEM, UNLOCK_STATE);
7488 	}
7489 
7490 	lck_mtx_unlock(&cluster_powerdown_lock);
7491 }
7492 
/* Mutex held by the *_from_user() routines and get_cluster_powerdown_user_suspended() */
LCK_MTX_DECLARE(user_cluster_powerdown_lock, &cluster_powerdown_grp);
/* true while a suspend_cluster_powerdown_from_user() has not yet been resumed */
static bool user_suspended_cluster_powerdown = false;
7495 
7496 kern_return_t
suspend_cluster_powerdown_from_user(void)7497 suspend_cluster_powerdown_from_user(void)
7498 {
7499 	kern_return_t ret = KERN_FAILURE;
7500 
7501 	lck_mtx_lock(&user_cluster_powerdown_lock);
7502 
7503 	if (!user_suspended_cluster_powerdown) {
7504 		suspend_cluster_powerdown();
7505 		user_suspended_cluster_powerdown = true;
7506 		ret = KERN_SUCCESS;
7507 	}
7508 
7509 	lck_mtx_unlock(&user_cluster_powerdown_lock);
7510 
7511 	return ret;
7512 }
7513 
7514 kern_return_t
resume_cluster_powerdown_from_user(void)7515 resume_cluster_powerdown_from_user(void)
7516 {
7517 	kern_return_t ret = KERN_FAILURE;
7518 
7519 	lck_mtx_lock(&user_cluster_powerdown_lock);
7520 
7521 	if (user_suspended_cluster_powerdown) {
7522 		resume_cluster_powerdown();
7523 		user_suspended_cluster_powerdown = false;
7524 		ret = KERN_SUCCESS;
7525 	}
7526 
7527 	lck_mtx_unlock(&user_cluster_powerdown_lock);
7528 
7529 	return ret;
7530 }
7531 
7532 int
get_cluster_powerdown_user_suspended(void)7533 get_cluster_powerdown_user_suspended(void)
7534 {
7535 	lck_mtx_lock(&user_cluster_powerdown_lock);
7536 
7537 	int ret = (int)user_suspended_cluster_powerdown;
7538 
7539 	lck_mtx_unlock(&user_cluster_powerdown_lock);
7540 
7541 	return ret;
7542 }
7543 
7544 #if DEVELOPMENT || DEBUG
7545 /* Functions to support the temporary sysctl */
7546 static uint64_t saved_requested_powered_cores = ALL_CORES_POWERED;
7547 void
sched_set_powered_cores(int requested_powered_cores)7548 sched_set_powered_cores(int requested_powered_cores)
7549 {
7550 	processor_reason_t reason = bit_test(requested_powered_cores, 31) ? REASON_CLPC_USER : REASON_CLPC_SYSTEM;
7551 	uint32_t flags = requested_powered_cores & 0x30000000;
7552 
7553 	saved_requested_powered_cores = requested_powered_cores;
7554 
7555 	requested_powered_cores = bits(requested_powered_cores, 28, 0);
7556 
7557 	sched_perfcontrol_update_powered_cores(requested_powered_cores, reason, flags);
7558 }
7559 int
sched_get_powered_cores(void)7560 sched_get_powered_cores(void)
7561 {
7562 	return (int)saved_requested_powered_cores;
7563 }
7564 #endif
7565 
7566 /*
7567  * Ensure that all cores are powered and recommended before sleep
7568  */
7569 void
sched_override_available_cores_for_sleep(void)7570 sched_override_available_cores_for_sleep(void)
7571 {
7572 	spl_t s = splsched();
7573 	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
7574 
7575 	if (perfcontrol_sleep_override == false) {
7576 		perfcontrol_sleep_override = true;
7577 #if __arm__ || __arm64__
7578 		sched_update_recommended_cores(ALL_CORES_RECOMMENDED, REASON_SYSTEM, 0);
7579 #endif
7580 	}
7581 
7582 	simple_unlock(&sched_available_cores_lock);
7583 	splx(s);
7584 
7585 	suspend_cluster_powerdown();
7586 }
7587 
7588 /*
7589  * Restore the previously recommended cores, but leave all cores powered
7590  * after sleep
7591  */
7592 void
sched_restore_available_cores_after_sleep(void)7593 sched_restore_available_cores_after_sleep(void)
7594 {
7595 	spl_t s = splsched();
7596 	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
7597 
7598 	if (perfcontrol_sleep_override == true) {
7599 		perfcontrol_sleep_override = false;
7600 #if __arm__ || __arm64__
7601 		sched_update_recommended_cores(perfcontrol_requested_recommended_cores & usercontrol_requested_recommended_cores,
7602 		    REASON_NONE, 0);
7603 #endif
7604 	}
7605 
7606 	simple_unlock(&sched_available_cores_lock);
7607 	splx(s);
7608 
7609 	resume_cluster_powerdown();
7610 }
7611 
7612 #if __arm__ || __arm64__
7613 
/* Number of bits set in perfcontrol_requested_recommended_cores */
uint32_t    perfcontrol_requested_recommended_core_count = MAX_CPUS;
/* Set by sched_consider_recommended_cores(), cleared by sched_recommended_cores_maintenance() */
bool        perfcontrol_failsafe_active = false;

/* Maintenance thread's last_made_runnable_time captured when the failsafe triggered */
uint64_t    perfcontrol_failsafe_maintenance_runnable_time;
/* mach_absolute_time() at failsafe activation */
uint64_t    perfcontrol_failsafe_activation_time;
/* Timestamp at which the failsafe was last deactivated */
uint64_t    perfcontrol_failsafe_deactivation_time;

/* data covering who likely caused it and how long they ran */
#define FAILSAFE_NAME_LEN       33 /* (2*MAXCOMLEN)+1 from size of p_name */
char        perfcontrol_failsafe_name[FAILSAFE_NAME_LEN];
int         perfcontrol_failsafe_pid;
uint64_t    perfcontrol_failsafe_tid;
/* Blamed thread's on-core time at the estimated start of the episode */
uint64_t    perfcontrol_failsafe_thread_timer_at_start;
/* Blamed thread's on-core time when it was last observed */
uint64_t    perfcontrol_failsafe_thread_timer_last_seen;
/* perfcontrol_requested_recommended_cores at the moment the failsafe triggered */
uint64_t    perfcontrol_failsafe_recommended_at_trigger;
7629 
7630 /*
7631  * Perf controller calls here to update the recommended core bitmask.
7632  * If the failsafe is active, we don't immediately apply the new value.
7633  * Instead, we store the new request and use it after the failsafe deactivates.
7634  *
7635  * If the failsafe is not active, immediately apply the update.
7636  *
7637  * No scheduler locks are held, no other locks are held that scheduler might depend on,
7638  * interrupts are enabled
7639  *
7640  * currently prototype is in osfmk/arm/machine_routines.h
7641  */
7642 void
sched_perfcontrol_update_recommended_cores_reason(uint64_t recommended_cores,processor_reason_t reason,uint32_t flags)7643 sched_perfcontrol_update_recommended_cores_reason(uint64_t recommended_cores, processor_reason_t reason, uint32_t flags)
7644 {
7645 	assert(preemption_enabled());
7646 
7647 	spl_t s = splsched();
7648 	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
7649 
7650 	if (reason == REASON_CLPC_SYSTEM) {
7651 		perfcontrol_system_requested_recommended_cores = recommended_cores;
7652 	} else {
7653 		assert(reason == REASON_CLPC_USER);
7654 		perfcontrol_user_requested_recommended_cores = recommended_cores;
7655 	}
7656 
7657 	perfcontrol_requested_recommended_cores = perfcontrol_system_requested_recommended_cores & perfcontrol_user_requested_recommended_cores;
7658 	perfcontrol_requested_recommended_core_count = __builtin_popcountll(perfcontrol_requested_recommended_cores);
7659 
7660 	if ((perfcontrol_failsafe_active == false) && (perfcontrol_sleep_override == false)) {
7661 		sched_update_recommended_cores(perfcontrol_requested_recommended_cores & usercontrol_requested_recommended_cores, reason, flags);
7662 	} else {
7663 		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
7664 		    MACHDBG_CODE(DBG_MACH_SCHED, MACH_REC_CORES_FAILSAFE) | DBG_FUNC_NONE,
7665 		    perfcontrol_requested_recommended_cores,
7666 		    sched_maintenance_thread->last_made_runnable_time, 0, 0, 0);
7667 	}
7668 
7669 	simple_unlock(&sched_available_cores_lock);
7670 	splx(s);
7671 }
7672 
7673 void
sched_perfcontrol_update_recommended_cores(uint32_t recommended_cores)7674 sched_perfcontrol_update_recommended_cores(uint32_t recommended_cores)
7675 {
7676 	sched_perfcontrol_update_recommended_cores_reason(recommended_cores, REASON_CLPC_USER, 0);
7677 }
7678 
7679 /*
7680  * Consider whether we need to activate the recommended cores failsafe
7681  *
7682  * Called from quantum timer interrupt context of a realtime thread
7683  * No scheduler locks are held, interrupts are disabled
7684  */
7685 void
sched_consider_recommended_cores(uint64_t ctime,thread_t cur_thread)7686 sched_consider_recommended_cores(uint64_t ctime, thread_t cur_thread)
7687 {
7688 	/*
7689 	 * Check if a realtime thread is starving the system
7690 	 * and bringing up non-recommended cores would help
7691 	 *
7692 	 * TODO: Is this the correct check for recommended == possible cores?
7693 	 * TODO: Validate the checks without the relevant lock are OK.
7694 	 */
7695 
7696 	if (__improbable(perfcontrol_failsafe_active == TRUE)) {
7697 		/* keep track of how long the responsible thread runs */
7698 		uint64_t cur_th_time = recount_current_thread_time_mach();
7699 
7700 		simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
7701 
7702 		if (perfcontrol_failsafe_active == TRUE &&
7703 		    cur_thread->thread_id == perfcontrol_failsafe_tid) {
7704 			perfcontrol_failsafe_thread_timer_last_seen = cur_th_time;
7705 		}
7706 
7707 		simple_unlock(&sched_available_cores_lock);
7708 
7709 		/* we're already trying to solve the problem, so bail */
7710 		return;
7711 	}
7712 
7713 	/* The failsafe won't help if there are no more processors to enable */
7714 	if (__probable(perfcontrol_requested_recommended_core_count >= processor_count)) {
7715 		return;
7716 	}
7717 
7718 	uint64_t too_long_ago = ctime - perfcontrol_failsafe_starvation_threshold;
7719 
7720 	/* Use the maintenance thread as our canary in the coal mine */
7721 	thread_t m_thread = sched_maintenance_thread;
7722 
7723 	/* If it doesn't look bad, nothing to see here */
7724 	if (__probable(m_thread->last_made_runnable_time >= too_long_ago)) {
7725 		return;
7726 	}
7727 
7728 	/* It looks bad, take the lock to be sure */
7729 	thread_lock(m_thread);
7730 
7731 	if (thread_get_runq(m_thread) == PROCESSOR_NULL ||
7732 	    (m_thread->state & (TH_RUN | TH_WAIT)) != TH_RUN ||
7733 	    m_thread->last_made_runnable_time >= too_long_ago) {
7734 		/*
7735 		 * Maintenance thread is either on cpu or blocked, and
7736 		 * therefore wouldn't benefit from more cores
7737 		 */
7738 		thread_unlock(m_thread);
7739 		return;
7740 	}
7741 
7742 	uint64_t maintenance_runnable_time = m_thread->last_made_runnable_time;
7743 
7744 	thread_unlock(m_thread);
7745 
7746 	/*
7747 	 * There are cores disabled at perfcontrol's recommendation, but the
7748 	 * system is so overloaded that the maintenance thread can't run.
7749 	 * That likely means that perfcontrol can't run either, so it can't fix
7750 	 * the recommendation.  We have to kick in a failsafe to keep from starving.
7751 	 *
7752 	 * When the maintenance thread has been starved for too long,
7753 	 * ignore the recommendation from perfcontrol and light up all the cores.
7754 	 *
7755 	 * TODO: Consider weird states like boot, sleep, or debugger
7756 	 */
7757 
7758 	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
7759 
7760 	if (perfcontrol_failsafe_active == TRUE) {
7761 		simple_unlock(&sched_available_cores_lock);
7762 		return;
7763 	}
7764 
7765 	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
7766 	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_REC_CORES_FAILSAFE) | DBG_FUNC_START,
7767 	    perfcontrol_requested_recommended_cores, maintenance_runnable_time, 0, 0, 0);
7768 
7769 	perfcontrol_failsafe_active = TRUE;
7770 	perfcontrol_failsafe_activation_time = mach_absolute_time();
7771 	perfcontrol_failsafe_maintenance_runnable_time = maintenance_runnable_time;
7772 	perfcontrol_failsafe_recommended_at_trigger = perfcontrol_requested_recommended_cores;
7773 
7774 	/* Capture some data about who screwed up (assuming that the thread on core is at fault) */
7775 	task_t task = get_threadtask(cur_thread);
7776 	perfcontrol_failsafe_pid = task_pid(task);
7777 	strlcpy(perfcontrol_failsafe_name, proc_name_address(get_bsdtask_info(task)), sizeof(perfcontrol_failsafe_name));
7778 
7779 	perfcontrol_failsafe_tid = cur_thread->thread_id;
7780 
7781 	/* Blame the thread for time it has run recently */
7782 	uint64_t recent_computation = (ctime - cur_thread->computation_epoch) + cur_thread->computation_metered;
7783 
7784 	uint64_t last_seen = recount_current_thread_time_mach();
7785 
7786 	/* Compute the start time of the bad behavior in terms of the thread's on core time */
7787 	perfcontrol_failsafe_thread_timer_at_start  = last_seen - recent_computation;
7788 	perfcontrol_failsafe_thread_timer_last_seen = last_seen;
7789 
7790 	/* Ignore the previously recommended core configuration */
7791 	sched_update_recommended_cores(ALL_CORES_RECOMMENDED, REASON_SYSTEM, 0);
7792 
7793 	simple_unlock(&sched_available_cores_lock);
7794 }
7795 
7796 /*
7797  * Now that our bacon has been saved by the failsafe, consider whether to turn it off
7798  *
7799  * Runs in the context of the maintenance thread, no locks held
7800  */
7801 static void
sched_recommended_cores_maintenance(void)7802 sched_recommended_cores_maintenance(void)
7803 {
7804 	/* Common case - no failsafe, nothing to be done here */
7805 	if (__probable(perfcontrol_failsafe_active == FALSE)) {
7806 		return;
7807 	}
7808 
7809 	uint64_t ctime = mach_absolute_time();
7810 
7811 	boolean_t print_diagnostic = FALSE;
7812 	char p_name[FAILSAFE_NAME_LEN] = "";
7813 
7814 	spl_t s = splsched();
7815 	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
7816 
7817 	/* Check again, under the lock, to avoid races */
7818 	if (perfcontrol_failsafe_active == FALSE) {
7819 		goto out;
7820 	}
7821 
7822 	/*
7823 	 * Ensure that the other cores get another few ticks to run some threads
7824 	 * If we don't have this hysteresis, the maintenance thread is the first
7825 	 * to run, and then it immediately kills the other cores
7826 	 */
7827 	if ((ctime - perfcontrol_failsafe_activation_time) < perfcontrol_failsafe_starvation_threshold) {
7828 		goto out;
7829 	}
7830 
7831 	/* Capture some diagnostic state under the lock so we can print it out later */
7832 
7833 	int      pid = perfcontrol_failsafe_pid;
7834 	uint64_t tid = perfcontrol_failsafe_tid;
7835 
7836 	uint64_t thread_usage       = perfcontrol_failsafe_thread_timer_last_seen -
7837 	    perfcontrol_failsafe_thread_timer_at_start;
7838 	uint64_t rec_cores_before   = perfcontrol_failsafe_recommended_at_trigger;
7839 	uint64_t rec_cores_after    = perfcontrol_requested_recommended_cores;
7840 	uint64_t failsafe_duration  = ctime - perfcontrol_failsafe_activation_time;
7841 	strlcpy(p_name, perfcontrol_failsafe_name, sizeof(p_name));
7842 
7843 	print_diagnostic = TRUE;
7844 
7845 	/* Deactivate the failsafe and reinstate the requested recommendation settings */
7846 
7847 	perfcontrol_failsafe_deactivation_time = ctime;
7848 	perfcontrol_failsafe_active = FALSE;
7849 
7850 	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
7851 	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_REC_CORES_FAILSAFE) | DBG_FUNC_END,
7852 	    perfcontrol_requested_recommended_cores, failsafe_duration, 0, 0, 0);
7853 
7854 	sched_update_recommended_cores(perfcontrol_requested_recommended_cores & usercontrol_requested_recommended_cores,
7855 	    REASON_NONE, 0);
7856 
7857 out:
7858 	simple_unlock(&sched_available_cores_lock);
7859 	splx(s);
7860 
7861 	if (print_diagnostic) {
7862 		uint64_t failsafe_duration_ms = 0, thread_usage_ms = 0;
7863 
7864 		absolutetime_to_nanoseconds(failsafe_duration, &failsafe_duration_ms);
7865 		failsafe_duration_ms = failsafe_duration_ms / NSEC_PER_MSEC;
7866 
7867 		absolutetime_to_nanoseconds(thread_usage, &thread_usage_ms);
7868 		thread_usage_ms = thread_usage_ms / NSEC_PER_MSEC;
7869 
7870 		printf("recommended core failsafe kicked in for %lld ms "
7871 		    "likely due to %s[%d] thread 0x%llx spending "
7872 		    "%lld ms on cpu at realtime priority - "
7873 		    "new recommendation: 0x%llx -> 0x%llx\n",
7874 		    failsafe_duration_ms, p_name, pid, tid, thread_usage_ms,
7875 		    rec_cores_before, rec_cores_after);
7876 	}
7877 }
7878 
#endif /* __arm__ || __arm64__ */
7880 
7881 kern_return_t
sched_processor_enable(processor_t processor,boolean_t enable)7882 sched_processor_enable(processor_t processor, boolean_t enable)
7883 {
7884 	assert(preemption_enabled());
7885 
7886 	if (processor == master_processor) {
7887 		/* The system can hang if this is allowed */
7888 		return KERN_NOT_SUPPORTED;
7889 	}
7890 
7891 	spl_t s = splsched();
7892 	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
7893 
7894 	if (enable) {
7895 		bit_set(usercontrol_requested_recommended_cores, processor->cpu_id);
7896 	} else {
7897 		bit_clear(usercontrol_requested_recommended_cores, processor->cpu_id);
7898 	}
7899 
7900 #if __arm64__
7901 	if ((perfcontrol_failsafe_active == false) && (perfcontrol_sleep_override == false)) {
7902 		sched_update_recommended_cores(perfcontrol_requested_recommended_cores & usercontrol_requested_recommended_cores,
7903 		    REASON_USER, 0);
7904 	} else {
7905 		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
7906 		    MACHDBG_CODE(DBG_MACH_SCHED, MACH_REC_CORES_FAILSAFE) | DBG_FUNC_NONE,
7907 		    perfcontrol_requested_recommended_cores,
7908 		    sched_maintenance_thread->last_made_runnable_time, 0, 0, 0);
7909 	}
7910 #else /* __arm64__ */
7911 	sched_update_recommended_cores(usercontrol_requested_recommended_cores, REASON_USER, 0);
7912 #endif /* ! __arm64__ */
7913 
7914 	simple_unlock(&sched_available_cores_lock);
7915 	splx(s);
7916 
7917 	return KERN_SUCCESS;
7918 }
7919 
7920 void
sched_mark_processor_online_locked(processor_t processor,__assert_only processor_reason_t reason)7921 sched_mark_processor_online_locked(processor_t processor, __assert_only processor_reason_t reason)
7922 {
7923 	assert((processor != master_processor) || (reason == REASON_SYSTEM));
7924 
7925 	bit_set(sched_online_processors, processor->cpu_id);
7926 }
7927 
7928 kern_return_t
sched_mark_processor_offline(processor_t processor,processor_reason_t reason)7929 sched_mark_processor_offline(processor_t processor, processor_reason_t reason)
7930 {
7931 	assert((processor != master_processor) || (reason == REASON_SYSTEM));
7932 	kern_return_t ret = KERN_SUCCESS;
7933 
7934 	spl_t s = splsched();
7935 	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
7936 
7937 	if (reason == REASON_SYSTEM) {
7938 		bit_clear(sched_online_processors, processor->cpu_id);
7939 		simple_unlock(&sched_available_cores_lock);
7940 		splx(s);
7941 		return ret;
7942 	}
7943 
7944 	uint64_t available_cores = sched_online_processors & perfcontrol_requested_recommended_cores & usercontrol_requested_recommended_cores;
7945 
7946 	if (!bit_test(sched_online_processors, processor->cpu_id)) {
7947 		/* Processor is already offline */
7948 		ret = KERN_NOT_IN_SET;
7949 	} else if (available_cores == BIT(processor->cpu_id)) {
7950 		ret = KERN_RESOURCE_SHORTAGE;
7951 	} else {
7952 		bit_clear(sched_online_processors, processor->cpu_id);
7953 		ret = KERN_SUCCESS;
7954 	}
7955 
7956 	simple_unlock(&sched_available_cores_lock);
7957 	splx(s);
7958 
7959 	return ret;
7960 }
7961 
/*
 * Apply a new recommended cores mask to the processors it affects
 * Runs after considering failsafes and such
 *
 * Iterate over processors and update their ->is_recommended field.
 *
 * Newly recommended processors are handled first: an idle one (other than
 * the current processor) may have more work to do, so it is signalled
 * once all pset locks have been dropped.
 *
 * Newly de-recommended processors are handled second: their run queues
 * are shut down, and a processor that is running or dispatching gets an
 * immediate IPI (or AST_PREEMPT when it is the current processor).
 *
 * If no online processor would remain recommended, the master processor
 * is added to the mask.
 *
 * interrupts disabled, sched_available_cores_lock is held
 */
static void
sched_update_recommended_cores(uint64_t recommended_cores, processor_reason_t reason, __unused uint32_t flags)
{
	/* Idle processors that became recommended; signalled at the end of this function */
	uint64_t        needs_exit_idle_mask = 0x0;

	KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_UPDATE_REC_CORES) | DBG_FUNC_START,
	    recommended_cores,
#if __arm64__
	    perfcontrol_failsafe_active, 0, 0);
#else /* __arm64__ */
	    0, 0, 0);
#endif /* ! __arm64__ */

	if (__builtin_popcountll(recommended_cores & sched_online_processors) == 0) {
		bit_set(recommended_cores, master_processor->cpu_id); /* add boot processor or we hang */
	}

	/* First set recommended cores */
	for (pset_node_t node = &pset_node0; node != NULL; node = node->node_list) {
		for (int pset_id = lsb_first(node->pset_map); pset_id >= 0; pset_id = lsb_next(node->pset_map, pset_id)) {
			processor_set_t pset = pset_array[pset_id];

			/* Bits of this pset whose recommendation differs from the new mask */
			cpumap_t changed_recommendations = (recommended_cores & pset->cpu_bitmask) ^ pset->recommended_bitmask;
			cpumap_t newly_recommended = changed_recommendations & recommended_cores;

			if (newly_recommended == 0) {
				/* Nothing to do */
				continue;
			}

			pset_lock(pset);

			for (int cpu_id = lsb_first(newly_recommended); cpu_id >= 0; cpu_id = lsb_next(newly_recommended, cpu_id)) {
				processor_t processor = processor_array[cpu_id];
				processor->is_recommended = TRUE;
				processor->last_recommend_reason = reason;
				bit_set(pset->recommended_bitmask, processor->cpu_id);

				if (processor->state == PROCESSOR_IDLE) {
					if (processor != current_processor()) {
						bit_set(needs_exit_idle_mask, processor->cpu_id);
					}
				}
				/* Availability counters only track processors that are not (going) offline */
				if ((processor->state != PROCESSOR_OFF_LINE) && (processor->state != PROCESSOR_PENDING_OFFLINE)) {
					os_atomic_inc(&processor_avail_count_user, relaxed);
					if (processor->processor_primary == processor) {
						os_atomic_inc(&primary_processor_avail_count_user, relaxed);
					}
					SCHED(pset_made_schedulable)(processor, pset, false);
				}
			}
			pset_update_rt_stealable_state(pset);

			pset_unlock(pset);

			/* SMR notification happens after the pset lock has been released */
			for (int cpu_id = lsb_first(newly_recommended); cpu_id >= 0;
			    cpu_id = lsb_next(newly_recommended, cpu_id)) {
				smr_cpu_up(processor_array[cpu_id],
				    SMR_CPU_REASON_IGNORED);
			}
		}
	}

	/* Now shutdown not recommended cores */
	for (pset_node_t node = &pset_node0; node != NULL; node = node->node_list) {
		for (int pset_id = lsb_first(node->pset_map); pset_id >= 0; pset_id = lsb_next(node->pset_map, pset_id)) {
			processor_set_t pset = pset_array[pset_id];

			/* Bits of this pset whose recommendation differs from the new mask */
			cpumap_t changed_recommendations = (recommended_cores & pset->cpu_bitmask) ^ pset->recommended_bitmask;
			cpumap_t newly_unrecommended = changed_recommendations & ~recommended_cores;

			if (newly_unrecommended == 0) {
				/* Nothing to do */
				continue;
			}

			pset_lock(pset);

			for (int cpu_id = lsb_first(newly_unrecommended); cpu_id >= 0; cpu_id = lsb_next(newly_unrecommended, cpu_id)) {
				processor_t processor = processor_array[cpu_id];
				sched_ipi_type_t ipi_type = SCHED_IPI_NONE;

				processor->is_recommended = FALSE;
				/* REASON_NONE leaves the previously recorded de-recommend reason in place */
				if (reason != REASON_NONE) {
					processor->last_derecommend_reason = reason;
				}
				bit_clear(pset->recommended_bitmask, processor->cpu_id);
				if ((processor->state != PROCESSOR_OFF_LINE) && (processor->state != PROCESSOR_PENDING_OFFLINE)) {
					os_atomic_dec(&processor_avail_count_user, relaxed);
					if (processor->processor_primary == processor) {
						os_atomic_dec(&primary_processor_avail_count_user, relaxed);
					}
				}
				pset_update_rt_stealable_state(pset);

				/* Decide on the IPI while the pset lock is still held */
				if ((processor->state == PROCESSOR_RUNNING) || (processor->state == PROCESSOR_DISPATCHING)) {
					ipi_type = SCHED_IPI_IMMEDIATE;
				}
				SCHED(processor_queue_shutdown)(processor);
				/* pset unlocked */

				SCHED(rt_queue_shutdown)(processor);

				if (ipi_type == SCHED_IPI_NONE) {
					/*
					 * If the core is idle,
					 * we can directly mark the processor
					 * as "Ignored"
					 *
					 * Otherwise, smr will detect this
					 * during smr_cpu_leave() when the
					 * processor actually idles.
					 */
					smr_cpu_down(processor, SMR_CPU_REASON_IGNORED);
				} else if (processor == current_processor()) {
					ast_on(AST_PREEMPT);
				} else {
					sched_ipi_perform(processor, ipi_type);
				}

				/* Retake the lock for the next iteration (or for the unlock after the loop) */
				pset_lock(pset);
			}
			pset_unlock(pset);
		}
	}

#if defined(__x86_64__)
	commpage_update_active_cpus();
#endif
	/* Issue all pending IPIs now that the pset lock has been dropped */
	for (int cpuid = lsb_first(needs_exit_idle_mask); cpuid >= 0; cpuid = lsb_next(needs_exit_idle_mask, cpuid)) {
		processor_t processor = processor_array[cpuid];
		machine_signal_idle(processor);
	}

	KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_UPDATE_REC_CORES) | DBG_FUNC_END,
	    needs_exit_idle_mask, 0, 0, 0);
}
8111 
8112 static void
sched_update_powered_cores(uint64_t requested_powered_cores,processor_reason_t reason,uint32_t flags)8113 sched_update_powered_cores(uint64_t requested_powered_cores, processor_reason_t reason, uint32_t flags)
8114 {
8115 	KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_UPDATE_POWERED_CORES) | DBG_FUNC_START,
8116 	    requested_powered_cores, reason, flags, 0);
8117 
8118 	assert((flags & (LOCK_STATE | UNLOCK_STATE)) ? (reason == REASON_SYSTEM) && (requested_powered_cores == ALL_CORES_POWERED) : 1);
8119 
8120 	/*
8121 	 * Loop through newly set requested_powered_cores and start them.
8122 	 * Loop through newly cleared requested_powered_cores and shut them down.
8123 	 */
8124 
8125 	if ((reason == REASON_CLPC_SYSTEM) || (reason == REASON_CLPC_USER)) {
8126 		flags |= SHUTDOWN_TEMPORARY;
8127 	}
8128 
8129 	/* First set powered cores */
8130 	cpumap_t started_cores = 0ull;
8131 	for (pset_node_t node = &pset_node0; node != NULL; node = node->node_list) {
8132 		for (int pset_id = lsb_first(node->pset_map); pset_id >= 0; pset_id = lsb_next(node->pset_map, pset_id)) {
8133 			processor_set_t pset = pset_array[pset_id];
8134 
8135 			spl_t s = splsched();
8136 			pset_lock(pset);
8137 			cpumap_t pset_requested_powered_cores = requested_powered_cores & pset->cpu_bitmask;
8138 			cpumap_t powered_cores = (pset->cpu_state_map[PROCESSOR_START] | pset->cpu_state_map[PROCESSOR_IDLE] | pset->cpu_state_map[PROCESSOR_DISPATCHING] | pset->cpu_state_map[PROCESSOR_RUNNING]);
8139 			cpumap_t requested_changes = pset_requested_powered_cores ^ powered_cores;
8140 			pset_unlock(pset);
8141 			splx(s);
8142 
8143 			cpumap_t newly_powered = requested_changes & requested_powered_cores;
8144 
8145 			cpumap_t cpu_map = newly_powered;
8146 
8147 			if (flags & (LOCK_STATE | UNLOCK_STATE)) {
8148 				/*
8149 				 * We need to change the lock state even if
8150 				 * we don't need to change the actual state.
8151 				 */
8152 				cpu_map = pset_requested_powered_cores;
8153 				/* But not the master_processor, which is always implicitly locked */
8154 				bit_clear(cpu_map, master_processor->cpu_id);
8155 			}
8156 
8157 			if (cpu_map == 0) {
8158 				/* Nothing to do */
8159 				continue;
8160 			}
8161 
8162 			for (int cpu_id = lsb_first(cpu_map); cpu_id >= 0; cpu_id = lsb_next(cpu_map, cpu_id)) {
8163 				processor_t processor = processor_array[cpu_id];
8164 				processor_start_reason(processor, reason, flags);
8165 				bit_set(started_cores, cpu_id);
8166 			}
8167 		}
8168 	}
8169 	if (flags & WAIT_FOR_LAST_START) {
8170 		for (int cpu_id = lsb_first(started_cores); cpu_id >= 0; cpu_id = lsb_next(started_cores, cpu_id)) {
8171 			processor_t processor = processor_array[cpu_id];
8172 			processor_wait_for_start(processor);
8173 		}
8174 	}
8175 
8176 	/* Now shutdown not powered cores */
8177 	for (pset_node_t node = &pset_node0; node != NULL; node = node->node_list) {
8178 		for (int pset_id = lsb_first(node->pset_map); pset_id >= 0; pset_id = lsb_next(node->pset_map, pset_id)) {
8179 			processor_set_t pset = pset_array[pset_id];
8180 
8181 			spl_t s = splsched();
8182 			pset_lock(pset);
8183 			cpumap_t powered_cores = (pset->cpu_state_map[PROCESSOR_START] | pset->cpu_state_map[PROCESSOR_IDLE] | pset->cpu_state_map[PROCESSOR_DISPATCHING] | pset->cpu_state_map[PROCESSOR_RUNNING]);
8184 			cpumap_t requested_changes = (requested_powered_cores & pset->cpu_bitmask) ^ powered_cores;
8185 			pset_unlock(pset);
8186 			splx(s);
8187 
8188 			cpumap_t newly_unpowered = requested_changes & ~requested_powered_cores;
8189 
8190 			if (newly_unpowered == 0) {
8191 				/* Nothing to do */
8192 				continue;
8193 			}
8194 
8195 			for (int cpu_id = lsb_first(newly_unpowered); cpu_id >= 0; cpu_id = lsb_next(newly_unpowered, cpu_id)) {
8196 				processor_t processor = processor_array[cpu_id];
8197 
8198 				processor_exit_reason(processor, reason, flags);
8199 			}
8200 		}
8201 	}
8202 
8203 	KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_UPDATE_POWERED_CORES) | DBG_FUNC_END, 0, 0, 0, 0);
8204 }
8205 
8206 void
thread_set_options(uint32_t thopt)8207 thread_set_options(uint32_t thopt)
8208 {
8209 	spl_t x;
8210 	thread_t t = current_thread();
8211 
8212 	x = splsched();
8213 	thread_lock(t);
8214 
8215 	t->options |= thopt;
8216 
8217 	thread_unlock(t);
8218 	splx(x);
8219 }
8220 
8221 void
thread_set_pending_block_hint(thread_t thread,block_hint_t block_hint)8222 thread_set_pending_block_hint(thread_t thread, block_hint_t block_hint)
8223 {
8224 	thread->pending_block_hint = block_hint;
8225 }
8226 
8227 uint32_t
qos_max_parallelism(int qos,uint64_t options)8228 qos_max_parallelism(int qos, uint64_t options)
8229 {
8230 	return SCHED(qos_max_parallelism)(qos, options);
8231 }
8232 
8233 uint32_t
sched_qos_max_parallelism(__unused int qos,uint64_t options)8234 sched_qos_max_parallelism(__unused int qos, uint64_t options)
8235 {
8236 	host_basic_info_data_t hinfo;
8237 	mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT;
8238 
8239 
8240 	/*
8241 	 * The QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE should be used on AMP platforms only which
8242 	 * implement their own qos_max_parallelism() interfaces.
8243 	 */
8244 	assert((options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) == 0);
8245 
8246 	/* Query the machine layer for core information */
8247 	__assert_only kern_return_t kret = host_info(host_self(), HOST_BASIC_INFO,
8248 	    (host_info_t)&hinfo, &count);
8249 	assert(kret == KERN_SUCCESS);
8250 
8251 	if (options & QOS_PARALLELISM_COUNT_LOGICAL) {
8252 		return hinfo.logical_cpu;
8253 	} else {
8254 		return hinfo.physical_cpu;
8255 	}
8256 }
8257 
8258 int sched_allow_NO_SMT_threads = 1;
8259 bool
thread_no_smt(thread_t thread)8260 thread_no_smt(thread_t thread)
8261 {
8262 	return sched_allow_NO_SMT_threads &&
8263 	       (thread->bound_processor == PROCESSOR_NULL) &&
8264 	       ((thread->sched_flags & TH_SFLAG_NO_SMT) || (get_threadtask(thread)->t_flags & TF_NO_SMT));
8265 }
8266 
8267 bool
processor_active_thread_no_smt(processor_t processor)8268 processor_active_thread_no_smt(processor_t processor)
8269 {
8270 	return sched_allow_NO_SMT_threads && !processor->current_is_bound && processor->current_is_NO_SMT;
8271 }
8272 
8273 #if __arm64__
8274 
8275 /*
8276  * Set up or replace old timer with new timer
8277  *
8278  * Returns true if canceled old timer, false if it did not
8279  */
8280 boolean_t
sched_perfcontrol_update_callback_deadline(uint64_t new_deadline)8281 sched_perfcontrol_update_callback_deadline(uint64_t new_deadline)
8282 {
8283 	/*
8284 	 * Exchange deadline for new deadline, if old deadline was nonzero,
8285 	 * then I cancelled the callback, otherwise I didn't
8286 	 */
8287 
8288 	return os_atomic_xchg(&sched_perfcontrol_callback_deadline, new_deadline,
8289 	           relaxed) != 0;
8290 }
8291 
8292 /*
8293  * Set global SFI window (in usec)
8294  */
8295 kern_return_t
sched_perfcontrol_sfi_set_window(uint64_t window_usecs)8296 sched_perfcontrol_sfi_set_window(uint64_t window_usecs)
8297 {
8298 	kern_return_t ret = KERN_NOT_SUPPORTED;
8299 #if CONFIG_THREAD_GROUPS
8300 	if (window_usecs == 0ULL) {
8301 		ret = sfi_window_cancel();
8302 	} else {
8303 		ret = sfi_set_window(window_usecs);
8304 	}
8305 #endif // CONFIG_THREAD_GROUPS
8306 	return ret;
8307 }
8308 
8309 /*
8310  * Set background and maintenance SFI class offtimes
8311  */
8312 kern_return_t
sched_perfcontrol_sfi_set_bg_offtime(uint64_t offtime_usecs)8313 sched_perfcontrol_sfi_set_bg_offtime(uint64_t offtime_usecs)
8314 {
8315 	kern_return_t ret = KERN_NOT_SUPPORTED;
8316 #if CONFIG_THREAD_GROUPS
8317 	if (offtime_usecs == 0ULL) {
8318 		ret = sfi_class_offtime_cancel(SFI_CLASS_MAINTENANCE);
8319 		ret |= sfi_class_offtime_cancel(SFI_CLASS_DARWIN_BG);
8320 	} else {
8321 		ret = sfi_set_class_offtime(SFI_CLASS_MAINTENANCE, offtime_usecs);
8322 		ret |= sfi_set_class_offtime(SFI_CLASS_DARWIN_BG, offtime_usecs);
8323 	}
8324 #endif // CONFIG_THREAD_GROUPS
8325 	return ret;
8326 }
8327 
8328 /*
8329  * Set utility SFI class offtime
8330  */
8331 kern_return_t
sched_perfcontrol_sfi_set_utility_offtime(uint64_t offtime_usecs)8332 sched_perfcontrol_sfi_set_utility_offtime(uint64_t offtime_usecs)
8333 {
8334 	kern_return_t ret = KERN_NOT_SUPPORTED;
8335 #if CONFIG_THREAD_GROUPS
8336 	if (offtime_usecs == 0ULL) {
8337 		ret = sfi_class_offtime_cancel(SFI_CLASS_UTILITY);
8338 	} else {
8339 		ret = sfi_set_class_offtime(SFI_CLASS_UTILITY, offtime_usecs);
8340 	}
8341 #endif // CONFIG_THREAD_GROUPS
8342 	return ret;
8343 }
8344 
8345 #endif /* __arm64__ */
8346 
8347 #if CONFIG_SCHED_EDGE
8348 
8349 #define SCHED_PSET_LOAD_EWMA_TC_NSECS 10000000u
8350 
8351 /*
8352  * sched_edge_pset_running_higher_bucket()
8353  *
8354  * Routine to calculate cumulative running counts for each scheduling
8355  * bucket. This effectively lets the load calculation calculate if a
8356  * cluster is running any threads at a QoS lower than the thread being
8357  * migrated etc.
8358  */
8359 
8360 static void
sched_edge_pset_running_higher_bucket(processor_set_t pset,uint32_t * running_higher)8361 sched_edge_pset_running_higher_bucket(processor_set_t pset, uint32_t *running_higher)
8362 {
8363 	bitmap_t *active_map = &pset->cpu_state_map[PROCESSOR_RUNNING];
8364 
8365 	/* Edge Scheduler Optimization */
8366 	for (int cpu = bitmap_first(active_map, MAX_CPUS); cpu >= 0; cpu = bitmap_next(active_map, cpu)) {
8367 		sched_bucket_t cpu_bucket = os_atomic_load(&pset->cpu_running_buckets[cpu], relaxed);
8368 		for (sched_bucket_t bucket = cpu_bucket; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
8369 			running_higher[bucket]++;
8370 		}
8371 	}
8372 }
8373 
8374 /*
8375  * sched_update_pset_load_average()
8376  *
8377  * Updates the load average for each sched bucket for a cluster.
8378  * This routine must be called with the pset lock held.
8379  */
8380 void
sched_update_pset_load_average(processor_set_t pset,uint64_t curtime)8381 sched_update_pset_load_average(processor_set_t pset, uint64_t curtime)
8382 {
8383 	int avail_cpu_count = pset_available_cpu_count(pset);
8384 	if (avail_cpu_count == 0) {
8385 		/* Looks like the pset is not runnable any more; nothing to do here */
8386 		return;
8387 	}
8388 
8389 	/*
8390 	 * Edge Scheduler Optimization
8391 	 *
8392 	 * See if more callers of this routine can pass in timestamps to avoid the
8393 	 * mach_absolute_time() call here.
8394 	 */
8395 
8396 	if (!curtime) {
8397 		curtime = mach_absolute_time();
8398 	}
8399 	uint64_t last_update = os_atomic_load(&pset->pset_load_last_update, relaxed);
8400 	int64_t delta_ticks = curtime - last_update;
8401 	if (delta_ticks < 0) {
8402 		return;
8403 	}
8404 
8405 	uint64_t delta_nsecs = 0;
8406 	absolutetime_to_nanoseconds(delta_ticks, &delta_nsecs);
8407 
8408 	if (__improbable(delta_nsecs > UINT32_MAX)) {
8409 		delta_nsecs = UINT32_MAX;
8410 	}
8411 
8412 #if CONFIG_SCHED_EDGE
8413 	/* Update the shared resource load on the pset */
8414 	for (cluster_shared_rsrc_type_t shared_rsrc_type = CLUSTER_SHARED_RSRC_TYPE_MIN; shared_rsrc_type < CLUSTER_SHARED_RSRC_TYPE_COUNT; shared_rsrc_type++) {
8415 		uint64_t shared_rsrc_runnable_load = sched_edge_shared_rsrc_runnable_load(&pset->pset_clutch_root, shared_rsrc_type);
8416 		uint64_t shared_rsrc_running_load = bit_count(pset->cpu_running_cluster_shared_rsrc_thread[shared_rsrc_type]);
8417 		uint64_t new_shared_load = shared_rsrc_runnable_load + shared_rsrc_running_load;
8418 		uint64_t old_shared_load = os_atomic_xchg(&pset->pset_cluster_shared_rsrc_load[shared_rsrc_type], new_shared_load, relaxed);
8419 		if (old_shared_load != new_shared_load) {
8420 			KTRC(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_CLUSTER_SHARED_LOAD) | DBG_FUNC_NONE, pset->pset_cluster_id, shared_rsrc_type, new_shared_load, shared_rsrc_running_load);
8421 		}
8422 	}
8423 #endif /* CONFIG_SCHED_EDGE */
8424 
8425 	uint32_t running_higher[TH_BUCKET_SCHED_MAX] = {0};
8426 	sched_edge_pset_running_higher_bucket(pset, running_higher);
8427 
8428 	for (sched_bucket_t sched_bucket = TH_BUCKET_FIXPRI; sched_bucket < TH_BUCKET_SCHED_MAX; sched_bucket++) {
8429 		uint64_t old_load_average = os_atomic_load(&pset->pset_load_average[sched_bucket], relaxed);
8430 		uint64_t old_load_average_factor = old_load_average * SCHED_PSET_LOAD_EWMA_TC_NSECS;
8431 		uint32_t current_runq_depth = (sched_edge_cluster_cumulative_count(&pset->pset_clutch_root, sched_bucket) +  rt_runq_count(pset) + running_higher[sched_bucket]) / avail_cpu_count;
8432 
8433 		/*
8434 		 * For the new load average multiply current_runq_depth by delta_nsecs (which resuts in a 32.0 value).
8435 		 * Since we want to maintain the load average as a 24.8 fixed arithmetic value for precision, the
8436 		 * new load averga needs to be shifted before it can be added to the old load average.
8437 		 */
8438 		uint64_t new_load_average_factor = (current_runq_depth * delta_nsecs) << SCHED_PSET_LOAD_EWMA_FRACTION_BITS;
8439 
8440 		/*
8441 		 * For extremely parallel workloads, it is important that the load average on a cluster moves zero to non-zero
8442 		 * instantly to allow threads to be migrated to other (potentially idle) clusters quickly. Hence use the EWMA
8443 		 * when the system is already loaded; otherwise for an idle system use the latest load average immediately.
8444 		 */
8445 		int old_load_shifted = (int)((old_load_average + SCHED_PSET_LOAD_EWMA_ROUND_BIT) >> SCHED_PSET_LOAD_EWMA_FRACTION_BITS);
8446 		boolean_t load_uptick = (old_load_shifted == 0) && (current_runq_depth != 0);
8447 		boolean_t load_downtick = (old_load_shifted != 0) && (current_runq_depth == 0);
8448 		uint64_t load_average;
8449 		if (load_uptick || load_downtick) {
8450 			load_average = (current_runq_depth << SCHED_PSET_LOAD_EWMA_FRACTION_BITS);
8451 		} else {
8452 			/* Indicates a loaded system; use EWMA for load average calculation */
8453 			load_average = (old_load_average_factor + new_load_average_factor) / (delta_nsecs + SCHED_PSET_LOAD_EWMA_TC_NSECS);
8454 		}
8455 		os_atomic_store(&pset->pset_load_average[sched_bucket], load_average, relaxed);
8456 		if (load_average != old_load_average) {
8457 			KTRC(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_LOAD_AVG) | DBG_FUNC_NONE, pset->pset_cluster_id, (load_average >> SCHED_PSET_LOAD_EWMA_FRACTION_BITS), load_average & SCHED_PSET_LOAD_EWMA_FRACTION_MASK, sched_bucket);
8458 		}
8459 	}
8460 	os_atomic_store(&pset->pset_load_last_update, curtime, relaxed);
8461 }
8462 
8463 void
sched_update_pset_avg_execution_time(processor_set_t pset,uint64_t execution_time,uint64_t curtime,sched_bucket_t sched_bucket)8464 sched_update_pset_avg_execution_time(processor_set_t pset, uint64_t execution_time, uint64_t curtime, sched_bucket_t sched_bucket)
8465 {
8466 	pset_execution_time_t old_execution_time_packed, new_execution_time_packed;
8467 	uint64_t avg_thread_execution_time = 0;
8468 
8469 	os_atomic_rmw_loop(&pset->pset_execution_time[sched_bucket].pset_execution_time_packed,
8470 	    old_execution_time_packed.pset_execution_time_packed,
8471 	    new_execution_time_packed.pset_execution_time_packed, relaxed, {
8472 		uint64_t last_update = old_execution_time_packed.pset_execution_time_last_update;
8473 		int64_t delta_ticks = curtime - last_update;
8474 		if (delta_ticks < 0) {
8475 		        /*
8476 		         * Its possible that another CPU came in and updated the pset_execution_time
8477 		         * before this CPU could do it. Since the average execution time is meant to
8478 		         * be an approximate measure per cluster, ignore the older update.
8479 		         */
8480 		        os_atomic_rmw_loop_give_up(return );
8481 		}
8482 		uint64_t delta_nsecs = 0;
8483 		absolutetime_to_nanoseconds(delta_ticks, &delta_nsecs);
8484 
8485 		uint64_t nanotime = 0;
8486 		absolutetime_to_nanoseconds(execution_time, &nanotime);
8487 		uint64_t execution_time_us = nanotime / NSEC_PER_USEC;
8488 
8489 		uint64_t old_execution_time = (old_execution_time_packed.pset_avg_thread_execution_time * SCHED_PSET_LOAD_EWMA_TC_NSECS);
8490 		uint64_t new_execution_time = (execution_time_us * delta_nsecs);
8491 
8492 		avg_thread_execution_time = (old_execution_time + new_execution_time) / (delta_nsecs + SCHED_PSET_LOAD_EWMA_TC_NSECS);
8493 		new_execution_time_packed.pset_avg_thread_execution_time = avg_thread_execution_time;
8494 		new_execution_time_packed.pset_execution_time_last_update = curtime;
8495 	});
8496 	if (new_execution_time_packed.pset_avg_thread_execution_time != old_execution_time_packed.pset_execution_time_packed) {
8497 		KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PSET_AVG_EXEC_TIME) | DBG_FUNC_NONE, pset->pset_cluster_id, avg_thread_execution_time, sched_bucket);
8498 	}
8499 }
8500 
8501 uint64_t
sched_pset_cluster_shared_rsrc_load(processor_set_t pset,cluster_shared_rsrc_type_t shared_rsrc_type)8502 sched_pset_cluster_shared_rsrc_load(processor_set_t pset, cluster_shared_rsrc_type_t shared_rsrc_type)
8503 {
8504 	return os_atomic_load(&pset->pset_cluster_shared_rsrc_load[shared_rsrc_type], relaxed);
8505 }
8506 
8507 #else /* CONFIG_SCHED_EDGE */
8508 
8509 void
sched_update_pset_load_average(processor_set_t pset,__unused uint64_t curtime)8510 sched_update_pset_load_average(processor_set_t pset, __unused uint64_t curtime)
8511 {
8512 	int non_rt_load = pset->pset_runq.count;
8513 	int load = ((bit_count(pset->cpu_state_map[PROCESSOR_RUNNING]) + non_rt_load + rt_runq_count(pset)) << PSET_LOAD_NUMERATOR_SHIFT);
8514 	int new_load_average = ((int)pset->load_average + load) >> 1;
8515 
8516 	pset->load_average = new_load_average;
8517 #if (DEVELOPMENT || DEBUG)
8518 #if __AMP__
8519 	if (pset->pset_cluster_type == PSET_AMP_P) {
8520 		KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PSET_LOAD_AVERAGE) | DBG_FUNC_NONE, sched_get_pset_load_average(pset, 0), (bit_count(pset->cpu_state_map[PROCESSOR_RUNNING]) + pset->pset_runq.count + rt_runq_count(pset)));
8521 	}
8522 #endif
8523 #endif
8524 }
8525 
8526 void
sched_update_pset_avg_execution_time(__unused processor_set_t pset,__unused uint64_t execution_time,__unused uint64_t curtime,__unused sched_bucket_t sched_bucket)8527 sched_update_pset_avg_execution_time(__unused processor_set_t pset, __unused uint64_t execution_time, __unused uint64_t curtime, __unused sched_bucket_t sched_bucket)
8528 {
8529 }
8530 
8531 #endif /* CONFIG_SCHED_EDGE */
8532 
8533 /* pset is locked */
8534 static bool
processor_is_fast_track_candidate_for_realtime_thread(processor_set_t pset,processor_t processor)8535 processor_is_fast_track_candidate_for_realtime_thread(processor_set_t pset, processor_t processor)
8536 {
8537 	int cpuid = processor->cpu_id;
8538 #if defined(__x86_64__)
8539 	if (sched_avoid_cpu0 && (cpuid == 0)) {
8540 		return false;
8541 	}
8542 #endif
8543 
8544 	cpumap_t fasttrack_map = pset_available_cpumap(pset) & ~pset->pending_AST_URGENT_cpu_mask & ~pset->realtime_map;
8545 
8546 	return bit_test(fasttrack_map, cpuid);
8547 }
8548 
8549 /* pset is locked */
8550 static processor_t
choose_processor_for_realtime_thread(processor_set_t pset,processor_t skip_processor,bool consider_secondaries,bool skip_spills)8551 choose_processor_for_realtime_thread(processor_set_t pset, processor_t skip_processor, bool consider_secondaries, bool skip_spills)
8552 {
8553 #if defined(__x86_64__)
8554 	bool avoid_cpu0 = sched_avoid_cpu0 && bit_test(pset->cpu_bitmask, 0);
8555 #else
8556 	const bool avoid_cpu0 = false;
8557 #endif
8558 	cpumap_t cpu_map;
8559 
8560 try_again:
8561 	cpu_map = pset_available_cpumap(pset) & ~pset->pending_AST_URGENT_cpu_mask & ~pset->realtime_map;
8562 	if (skip_processor) {
8563 		bit_clear(cpu_map, skip_processor->cpu_id);
8564 	}
8565 	if (skip_spills) {
8566 		cpu_map &= ~pset->rt_pending_spill_cpu_mask;
8567 	}
8568 
8569 	if (avoid_cpu0 && (sched_avoid_cpu0 == 2)) {
8570 		bit_clear(cpu_map, 0);
8571 	}
8572 
8573 	cpumap_t primary_map = cpu_map & pset->primary_map;
8574 	if (avoid_cpu0) {
8575 		primary_map = bit_ror64(primary_map, 1);
8576 	}
8577 
8578 	int rotid = lsb_first(primary_map);
8579 	if (rotid >= 0) {
8580 		int cpuid = avoid_cpu0 ? ((rotid + 1) & 63) : rotid;
8581 
8582 		processor_t processor = processor_array[cpuid];
8583 
8584 		return processor;
8585 	}
8586 
8587 	if (!pset->is_SMT || !sched_allow_rt_smt || !consider_secondaries) {
8588 		goto out;
8589 	}
8590 
8591 	if (avoid_cpu0 && (sched_avoid_cpu0 == 2)) {
8592 		/* Also avoid cpu1 */
8593 		bit_clear(cpu_map, 1);
8594 	}
8595 
8596 	/* Consider secondary processors whose primary is actually running a realtime thread */
8597 	cpumap_t secondary_map = cpu_map & ~pset->primary_map & (pset->realtime_map << 1);
8598 	if (avoid_cpu0) {
8599 		/* Also avoid cpu1 */
8600 		secondary_map = bit_ror64(secondary_map, 2);
8601 	}
8602 	rotid = lsb_first(secondary_map);
8603 	if (rotid >= 0) {
8604 		int cpuid = avoid_cpu0 ?  ((rotid + 2) & 63) : rotid;
8605 
8606 		processor_t processor = processor_array[cpuid];
8607 
8608 		return processor;
8609 	}
8610 
8611 	/* Consider secondary processors */
8612 	secondary_map = cpu_map & ~pset->primary_map;
8613 	if (avoid_cpu0) {
8614 		/* Also avoid cpu1 */
8615 		secondary_map = bit_ror64(secondary_map, 2);
8616 	}
8617 	rotid = lsb_first(secondary_map);
8618 	if (rotid >= 0) {
8619 		int cpuid = avoid_cpu0 ?  ((rotid + 2) & 63) : rotid;
8620 
8621 		processor_t processor = processor_array[cpuid];
8622 
8623 		return processor;
8624 	}
8625 
8626 	/*
8627 	 * I was hoping the compiler would optimize
8628 	 * this away when avoid_cpu0 is const bool false
8629 	 * but it still complains about the assignmnent
8630 	 * in that case.
8631 	 */
8632 	if (avoid_cpu0 && (sched_avoid_cpu0 == 2)) {
8633 #if defined(__x86_64__)
8634 		avoid_cpu0 = false;
8635 #else
8636 		assert(0);
8637 #endif
8638 		goto try_again;
8639 	}
8640 
8641 out:
8642 	if (skip_processor) {
8643 		return PROCESSOR_NULL;
8644 	}
8645 
8646 	/*
8647 	 * If we didn't find an obvious processor to choose, but there are still more CPUs
8648 	 * not already running realtime threads than realtime threads in the realtime run queue,
8649 	 * this thread belongs in this pset, so choose some other processor in this pset
8650 	 * to ensure the thread is enqueued here.
8651 	 */
8652 	cpumap_t non_realtime_map = pset_available_cpumap(pset) & pset->primary_map & ~pset->realtime_map;
8653 	if (bit_count(non_realtime_map) > rt_runq_count(pset)) {
8654 		cpu_map = non_realtime_map;
8655 		assert(cpu_map != 0);
8656 		int cpuid = bit_first(cpu_map);
8657 		assert(cpuid >= 0);
8658 		return processor_array[cpuid];
8659 	}
8660 
8661 	if (!pset->is_SMT || !sched_allow_rt_smt || !consider_secondaries) {
8662 		goto skip_secondaries;
8663 	}
8664 
8665 	non_realtime_map = pset_available_cpumap(pset) & ~pset->realtime_map;
8666 	if (bit_count(non_realtime_map) > rt_runq_count(pset)) {
8667 		cpu_map = non_realtime_map;
8668 		assert(cpu_map != 0);
8669 		int cpuid = bit_first(cpu_map);
8670 		assert(cpuid >= 0);
8671 		return processor_array[cpuid];
8672 	}
8673 
8674 skip_secondaries:
8675 	return PROCESSOR_NULL;
8676 }
8677 
8678 /*
8679  * Choose the processor with (1) the lowest priority less than max_pri and (2) the furthest deadline for that priority.
8680  * If all available processors are at max_pri, choose the furthest deadline that is greater than minimum_deadline.
8681  *
8682  * pset is locked.
8683  */
8684 static processor_t
choose_furthest_deadline_processor_for_realtime_thread(processor_set_t pset,int max_pri,uint64_t minimum_deadline,processor_t skip_processor,bool skip_spills,bool include_ast_urgent_pending_cpus)8685 choose_furthest_deadline_processor_for_realtime_thread(processor_set_t pset, int max_pri, uint64_t minimum_deadline, processor_t skip_processor, bool skip_spills, bool include_ast_urgent_pending_cpus)
8686 {
8687 	uint64_t  furthest_deadline = deadline_add(minimum_deadline, rt_deadline_epsilon);
8688 	processor_t fd_processor = PROCESSOR_NULL;
8689 	int lowest_priority = max_pri;
8690 
8691 	cpumap_t cpu_map = pset_available_cpumap(pset) & ~pset->pending_AST_URGENT_cpu_mask;
8692 	if (skip_processor) {
8693 		bit_clear(cpu_map, skip_processor->cpu_id);
8694 	}
8695 	if (skip_spills) {
8696 		cpu_map &= ~pset->rt_pending_spill_cpu_mask;
8697 	}
8698 
8699 	for (int cpuid = bit_first(cpu_map); cpuid >= 0; cpuid = bit_next(cpu_map, cpuid)) {
8700 		processor_t processor = processor_array[cpuid];
8701 
8702 		if (processor->current_pri > lowest_priority) {
8703 			continue;
8704 		}
8705 
8706 		if (processor->current_pri < lowest_priority) {
8707 			lowest_priority = processor->current_pri;
8708 			furthest_deadline = processor->deadline;
8709 			fd_processor = processor;
8710 			continue;
8711 		}
8712 
8713 		if (processor->deadline > furthest_deadline) {
8714 			furthest_deadline = processor->deadline;
8715 			fd_processor = processor;
8716 		}
8717 	}
8718 
8719 	if (fd_processor) {
8720 		return fd_processor;
8721 	}
8722 
8723 	/*
8724 	 * There is a race condition possible when there are multiple processor sets.
8725 	 * choose_processor() takes pset lock A, sees the pending_AST_URGENT_cpu_mask set for a processor in that set and finds no suitable candiate CPU,
8726 	 * so it drops pset lock A and tries to take pset lock B.  Meanwhile the pending_AST_URGENT_cpu_mask CPU is looking for a thread to run and holds
8727 	 * pset lock B. It doesn't find any threads (because the candidate thread isn't yet on any run queue), so drops lock B, takes lock A again to clear
8728 	 * the pending_AST_URGENT_cpu_mask bit, and keeps running the current (far deadline) thread. choose_processor() now has lock B and can only find
8729 	 * the lowest count processor in set B so enqueues it on set B's run queue but doesn't IPI anyone. (The lowest count includes all threads,
8730 	 * near and far deadlines, so will prefer a low count of earlier deadlines to a high count of far deadlines, which is suboptimal for EDF scheduling.
8731 	 * To make a better choice we would need to know how many threads with earlier deadlines than the candidate thread exist on each pset's run queue.
8732 	 * But even if we chose the better run queue, we still wouldn't send an IPI in this case.)
8733 	 *
8734 	 * The migitation is to also look for suitable CPUs that have their pending_AST_URGENT_cpu_mask bit set where there are no earlier deadline threads
8735 	 * on the run queue of that pset.
8736 	 */
8737 	if (include_ast_urgent_pending_cpus && (rt_runq_earliest_deadline(pset) > furthest_deadline)) {
8738 		cpu_map = pset_available_cpumap(pset) & pset->pending_AST_URGENT_cpu_mask;
8739 		assert(skip_processor == PROCESSOR_NULL);
8740 		assert(skip_spills == false);
8741 
8742 		for (int cpuid = bit_first(cpu_map); cpuid >= 0; cpuid = bit_next(cpu_map, cpuid)) {
8743 			processor_t processor = processor_array[cpuid];
8744 
8745 			if (processor->current_pri > lowest_priority) {
8746 				continue;
8747 			}
8748 
8749 			if (processor->current_pri < lowest_priority) {
8750 				lowest_priority = processor->current_pri;
8751 				furthest_deadline = processor->deadline;
8752 				fd_processor = processor;
8753 				continue;
8754 			}
8755 
8756 			if (processor->deadline > furthest_deadline) {
8757 				furthest_deadline = processor->deadline;
8758 				fd_processor = processor;
8759 			}
8760 		}
8761 	}
8762 
8763 	return fd_processor;
8764 }
8765 
8766 /* pset is locked */
8767 static processor_t
choose_next_processor_for_realtime_thread(processor_set_t pset,int max_pri,uint64_t minimum_deadline,processor_t skip_processor,bool consider_secondaries)8768 choose_next_processor_for_realtime_thread(processor_set_t pset, int max_pri, uint64_t minimum_deadline, processor_t skip_processor, bool consider_secondaries)
8769 {
8770 	bool skip_spills = true;
8771 	bool include_ast_urgent_pending_cpus = false;
8772 
8773 	processor_t next_processor = choose_processor_for_realtime_thread(pset, skip_processor, consider_secondaries, skip_spills);
8774 	if (next_processor != PROCESSOR_NULL) {
8775 		return next_processor;
8776 	}
8777 
8778 	next_processor = choose_furthest_deadline_processor_for_realtime_thread(pset, max_pri, minimum_deadline, skip_processor, skip_spills, include_ast_urgent_pending_cpus);
8779 	return next_processor;
8780 }
8781 
8782 #if defined(__x86_64__)
8783 /* pset is locked */
8784 static bool
all_available_primaries_are_running_realtime_threads(processor_set_t pset,bool include_backups)8785 all_available_primaries_are_running_realtime_threads(processor_set_t pset, bool include_backups)
8786 {
8787 	bool avoid_cpu0 = sched_avoid_cpu0 && bit_test(pset->cpu_bitmask, 0);
8788 	int nbackup_cpus = 0;
8789 
8790 	if (include_backups && rt_runq_is_low_latency(pset)) {
8791 		nbackup_cpus = sched_rt_n_backup_processors;
8792 	}
8793 
8794 	cpumap_t cpu_map = pset_available_cpumap(pset) & pset->primary_map & ~pset->realtime_map;
8795 	if (avoid_cpu0 && (sched_avoid_cpu0 == 2)) {
8796 		bit_clear(cpu_map, 0);
8797 	}
8798 	return (rt_runq_count(pset) + nbackup_cpus) > bit_count(cpu_map);
8799 }
8800 
8801 /* pset is locked */
8802 static bool
these_processors_are_running_realtime_threads(processor_set_t pset,uint64_t these_map,bool include_backups)8803 these_processors_are_running_realtime_threads(processor_set_t pset, uint64_t these_map, bool include_backups)
8804 {
8805 	int nbackup_cpus = 0;
8806 
8807 	if (include_backups && rt_runq_is_low_latency(pset)) {
8808 		nbackup_cpus = sched_rt_n_backup_processors;
8809 	}
8810 
8811 	cpumap_t cpu_map = pset_available_cpumap(pset) & these_map & ~pset->realtime_map;
8812 	return (rt_runq_count(pset) + nbackup_cpus) > bit_count(cpu_map);
8813 }
8814 #endif
8815 
8816 static bool
sched_ok_to_run_realtime_thread(processor_set_t pset,processor_t processor,bool as_backup)8817 sched_ok_to_run_realtime_thread(processor_set_t pset, processor_t processor, bool as_backup)
8818 {
8819 	if (!processor->is_recommended) {
8820 		return false;
8821 	}
8822 	bool ok_to_run_realtime_thread = true;
8823 #if defined(__x86_64__)
8824 	bool spill_pending = bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id);
8825 	if (spill_pending) {
8826 		return true;
8827 	}
8828 	if (processor->cpu_id == 0) {
8829 		if (sched_avoid_cpu0 == 1) {
8830 			ok_to_run_realtime_thread = these_processors_are_running_realtime_threads(pset, pset->primary_map & ~0x1, as_backup);
8831 		} else if (sched_avoid_cpu0 == 2) {
8832 			ok_to_run_realtime_thread = these_processors_are_running_realtime_threads(pset, ~0x3, as_backup);
8833 		}
8834 	} else if (sched_avoid_cpu0 && (processor->cpu_id == 1) && processor->is_SMT) {
8835 		ok_to_run_realtime_thread = sched_allow_rt_smt && these_processors_are_running_realtime_threads(pset, ~0x2, as_backup);
8836 	} else if (processor->processor_primary != processor) {
8837 		ok_to_run_realtime_thread = (sched_allow_rt_smt && all_available_primaries_are_running_realtime_threads(pset, as_backup));
8838 	}
8839 #else
8840 	(void)pset;
8841 	(void)processor;
8842 	(void)as_backup;
8843 #endif
8844 	return ok_to_run_realtime_thread;
8845 }
8846 
8847 void
sched_pset_made_schedulable(__unused processor_t processor,processor_set_t pset,boolean_t drop_lock)8848 sched_pset_made_schedulable(__unused processor_t processor, processor_set_t pset, boolean_t drop_lock)
8849 {
8850 	if (drop_lock) {
8851 		pset_unlock(pset);
8852 	}
8853 }
8854 
8855 void
thread_set_no_smt(bool set)8856 thread_set_no_smt(bool set)
8857 {
8858 	if (!system_is_SMT) {
8859 		/* Not a machine that supports SMT */
8860 		return;
8861 	}
8862 
8863 	thread_t thread = current_thread();
8864 
8865 	spl_t s = splsched();
8866 	thread_lock(thread);
8867 	if (set) {
8868 		thread->sched_flags |= TH_SFLAG_NO_SMT;
8869 	}
8870 	thread_unlock(thread);
8871 	splx(s);
8872 }
8873 
8874 bool
thread_get_no_smt(void)8875 thread_get_no_smt(void)
8876 {
8877 	return current_thread()->sched_flags & TH_SFLAG_NO_SMT;
8878 }
8879 
8880 extern void task_set_no_smt(task_t);
8881 void
task_set_no_smt(task_t task)8882 task_set_no_smt(task_t task)
8883 {
8884 	if (!system_is_SMT) {
8885 		/* Not a machine that supports SMT */
8886 		return;
8887 	}
8888 
8889 	if (task == TASK_NULL) {
8890 		task = current_task();
8891 	}
8892 
8893 	task_lock(task);
8894 	task->t_flags |= TF_NO_SMT;
8895 	task_unlock(task);
8896 }
8897 
8898 #if DEBUG || DEVELOPMENT
8899 extern void sysctl_task_set_no_smt(char no_smt);
8900 void
sysctl_task_set_no_smt(char no_smt)8901 sysctl_task_set_no_smt(char no_smt)
8902 {
8903 	if (!system_is_SMT) {
8904 		/* Not a machine that supports SMT */
8905 		return;
8906 	}
8907 
8908 	task_t task = current_task();
8909 
8910 	task_lock(task);
8911 	if (no_smt == '1') {
8912 		task->t_flags |= TF_NO_SMT;
8913 	}
8914 	task_unlock(task);
8915 }
8916 
8917 extern char sysctl_task_get_no_smt(void);
8918 char
sysctl_task_get_no_smt(void)8919 sysctl_task_get_no_smt(void)
8920 {
8921 	task_t task = current_task();
8922 
8923 	if (task->t_flags & TF_NO_SMT) {
8924 		return '1';
8925 	}
8926 	return '0';
8927 }
8928 #endif /* DEVELOPMENT || DEBUG */
8929 
8930 
8931 __private_extern__ void
thread_bind_cluster_type(thread_t thread,char cluster_type,bool soft_bound)8932 thread_bind_cluster_type(thread_t thread, char cluster_type, bool soft_bound)
8933 {
8934 #if __AMP__
8935 	spl_t s = splsched();
8936 	thread_lock(thread);
8937 	thread->sched_flags &= ~(TH_SFLAG_BOUND_SOFT);
8938 	thread->th_bound_cluster_id = THREAD_BOUND_CLUSTER_NONE;
8939 	if (soft_bound) {
8940 		thread->sched_flags |= TH_SFLAG_BOUND_SOFT;
8941 	}
8942 	switch (cluster_type) {
8943 	case 'e':
8944 	case 'E':
8945 		if (pset0.pset_cluster_type == PSET_AMP_E) {
8946 			thread->th_bound_cluster_id = pset0.pset_id;
8947 		} else if (pset_node1.psets != PROCESSOR_SET_NULL) {
8948 			thread->th_bound_cluster_id = pset_node1.psets->pset_id;
8949 		}
8950 		break;
8951 	case 'p':
8952 	case 'P':
8953 		if (pset0.pset_cluster_type == PSET_AMP_P) {
8954 			thread->th_bound_cluster_id = pset0.pset_id;
8955 		} else if (pset_node1.psets != PROCESSOR_SET_NULL) {
8956 			thread->th_bound_cluster_id = pset_node1.psets->pset_id;
8957 		}
8958 		break;
8959 	default:
8960 		break;
8961 	}
8962 	thread_unlock(thread);
8963 	splx(s);
8964 
8965 	if (thread == current_thread()) {
8966 		thread_block(THREAD_CONTINUE_NULL);
8967 	}
8968 #else /* __AMP__ */
8969 	(void)thread;
8970 	(void)cluster_type;
8971 	(void)soft_bound;
8972 #endif /* __AMP__ */
8973 }
8974 
8975 extern uint32_t thread_bound_cluster_id(thread_t thread);
8976 uint32_t
thread_bound_cluster_id(thread_t thread)8977 thread_bound_cluster_id(thread_t thread)
8978 {
8979 	return thread->th_bound_cluster_id;
8980 }
8981 
8982 __private_extern__ kern_return_t
thread_bind_cluster_id(thread_t thread,uint32_t cluster_id,thread_bind_option_t options)8983 thread_bind_cluster_id(thread_t thread, uint32_t cluster_id, thread_bind_option_t options)
8984 {
8985 #if __AMP__
8986 
8987 	processor_set_t pset = NULL;
8988 
8989 	/* Treat binding to THREAD_BOUND_CLUSTER_NONE as a request to unbind. */
8990 	if ((options & THREAD_UNBIND) || cluster_id == THREAD_BOUND_CLUSTER_NONE) {
8991 		/* If the thread was actually not bound to some cluster, nothing to do here */
8992 		if (thread_bound_cluster_id(thread) == THREAD_BOUND_CLUSTER_NONE) {
8993 			return KERN_SUCCESS;
8994 		}
8995 	} else {
8996 		/* Validate the inputs for the bind case */
8997 		int max_clusters = ml_get_cluster_count();
8998 		if (cluster_id >= max_clusters) {
8999 			/* Invalid cluster id */
9000 			return KERN_INVALID_VALUE;
9001 		}
9002 		pset = pset_array[cluster_id];
9003 		if (pset == NULL) {
9004 			/* Cluster has not been initialized yet */
9005 			return KERN_INVALID_VALUE;
9006 		}
9007 		if (options & THREAD_BIND_ELIGIBLE_ONLY) {
9008 			if (SCHED(thread_eligible_for_pset(thread, pset)) == false) {
9009 				/* Thread is not recommended for the cluster type */
9010 				return KERN_INVALID_POLICY;
9011 			}
9012 		}
9013 	}
9014 
9015 	spl_t s = splsched();
9016 	thread_lock(thread);
9017 
9018 	/* Unbind the thread from its previous bound state */
9019 	thread->sched_flags &= ~(TH_SFLAG_BOUND_SOFT);
9020 	thread->th_bound_cluster_id = THREAD_BOUND_CLUSTER_NONE;
9021 
9022 	if (options & THREAD_UNBIND) {
9023 		/* Nothing more to do here */
9024 		goto thread_bind_cluster_complete;
9025 	}
9026 
9027 	if (options & THREAD_BIND_SOFT) {
9028 		thread->sched_flags |= TH_SFLAG_BOUND_SOFT;
9029 	}
9030 	thread->th_bound_cluster_id = cluster_id;
9031 
9032 thread_bind_cluster_complete:
9033 	thread_unlock(thread);
9034 	splx(s);
9035 
9036 	if (thread == current_thread()) {
9037 		thread_block(THREAD_CONTINUE_NULL);
9038 	}
9039 #else /* __AMP__ */
9040 	(void)thread;
9041 	(void)cluster_id;
9042 	(void)options;
9043 #endif /* __AMP__ */
9044 	return KERN_SUCCESS;
9045 }
9046 
9047 #if DEVELOPMENT || DEBUG
9048 extern int32_t sysctl_get_bound_cpuid(void);
9049 int32_t
sysctl_get_bound_cpuid(void)9050 sysctl_get_bound_cpuid(void)
9051 {
9052 	int32_t cpuid = -1;
9053 	thread_t self = current_thread();
9054 
9055 	processor_t processor = self->bound_processor;
9056 	if (processor == NULL) {
9057 		cpuid = -1;
9058 	} else {
9059 		cpuid = processor->cpu_id;
9060 	}
9061 
9062 	return cpuid;
9063 }
9064 
9065 extern kern_return_t sysctl_thread_bind_cpuid(int32_t cpuid);
9066 kern_return_t
sysctl_thread_bind_cpuid(int32_t cpuid)9067 sysctl_thread_bind_cpuid(int32_t cpuid)
9068 {
9069 	processor_t processor = PROCESSOR_NULL;
9070 
9071 	if (cpuid == -1) {
9072 		goto unbind;
9073 	}
9074 
9075 	if (cpuid < 0 || cpuid >= MAX_SCHED_CPUS) {
9076 		return KERN_INVALID_VALUE;
9077 	}
9078 
9079 	processor = processor_array[cpuid];
9080 	if (processor == PROCESSOR_NULL) {
9081 		return KERN_INVALID_VALUE;
9082 	}
9083 
9084 #if __AMP__
9085 
9086 	thread_t thread = current_thread();
9087 
9088 	if (thread->th_bound_cluster_id != THREAD_BOUND_CLUSTER_NONE) {
9089 		if ((thread->sched_flags & TH_SFLAG_BOUND_SOFT) == 0) {
9090 			/* Cannot hard-bind an already hard-cluster-bound thread */
9091 			return KERN_NOT_SUPPORTED;
9092 		}
9093 	}
9094 
9095 #endif /* __AMP__ */
9096 
9097 unbind:
9098 	thread_bind(processor);
9099 
9100 	thread_block(THREAD_CONTINUE_NULL);
9101 	return KERN_SUCCESS;
9102 }
9103 
9104 extern char sysctl_get_task_cluster_type(void);
9105 char
sysctl_get_task_cluster_type(void)9106 sysctl_get_task_cluster_type(void)
9107 {
9108 	task_t task = current_task();
9109 	processor_set_t pset_hint = task->pset_hint;
9110 
9111 	if (!pset_hint) {
9112 		return '0';
9113 	}
9114 
9115 #if __AMP__
9116 	if (pset_hint->pset_cluster_type == PSET_AMP_E) {
9117 		return 'E';
9118 	} else if (pset_hint->pset_cluster_type == PSET_AMP_P) {
9119 		return 'P';
9120 	}
9121 #endif
9122 
9123 	return '0';
9124 }
9125 
9126 #if __AMP__
9127 static processor_set_t
find_pset_of_type(pset_cluster_type_t t)9128 find_pset_of_type(pset_cluster_type_t t)
9129 {
9130 	for (pset_node_t node = &pset_node0; node != NULL; node = node->node_list) {
9131 		if (node->pset_cluster_type != t) {
9132 			continue;
9133 		}
9134 
9135 		processor_set_t pset = PROCESSOR_SET_NULL;
9136 		for (int pset_id = lsb_first(node->pset_map); pset_id >= 0; pset_id = lsb_next(node->pset_map, pset_id)) {
9137 			pset = pset_array[pset_id];
9138 			/* Prefer one with recommended processsors */
9139 			if (pset->recommended_bitmask != 0) {
9140 				assert(pset->pset_cluster_type == t);
9141 				return pset;
9142 			}
9143 		}
9144 		/* Otherwise return whatever was found last */
9145 		return pset;
9146 	}
9147 
9148 	return PROCESSOR_SET_NULL;
9149 }
9150 #endif
9151 
9152 extern void sysctl_task_set_cluster_type(char cluster_type);
9153 void
sysctl_task_set_cluster_type(char cluster_type)9154 sysctl_task_set_cluster_type(char cluster_type)
9155 {
9156 	task_t task = current_task();
9157 	processor_set_t pset_hint = PROCESSOR_SET_NULL;
9158 
9159 #if __AMP__
9160 	switch (cluster_type) {
9161 	case 'e':
9162 	case 'E':
9163 		pset_hint = find_pset_of_type(PSET_AMP_E);
9164 		break;
9165 	case 'p':
9166 	case 'P':
9167 		pset_hint = find_pset_of_type(PSET_AMP_P);
9168 		break;
9169 	default:
9170 		break;
9171 	}
9172 
9173 	if (pset_hint) {
9174 		task_lock(task);
9175 		task->t_flags |= TF_USE_PSET_HINT_CLUSTER_TYPE;
9176 		task->pset_hint = pset_hint;
9177 		task_unlock(task);
9178 
9179 		thread_block(THREAD_CONTINUE_NULL);
9180 	}
9181 #else
9182 	(void)cluster_type;
9183 	(void)task;
9184 	(void)pset_hint;
9185 #endif
9186 }
9187 
9188 /*
9189  * The quantum length used for Fixed and RT sched modes. In general the quantum
9190  * can vary - for example for background or QOS.
9191  */
9192 extern uint64_t sysctl_get_quantum_us(void);
9193 uint64_t
sysctl_get_quantum_us(void)9194 sysctl_get_quantum_us(void)
9195 {
9196 	uint32_t quantum;
9197 	uint64_t quantum_ns;
9198 
9199 	quantum = SCHED(initial_quantum_size)(THREAD_NULL);
9200 	absolutetime_to_nanoseconds(quantum, &quantum_ns);
9201 
9202 	return quantum_ns / 1000;
9203 }
9204 
9205 #endif /* DEVELOPMENT || DEBUG */
9206