/* xref: /xnu-8020.140.41/osfmk/kern/sched_prim.c (revision 27b03b360a988dfd3dfdf34262bb0042026747cc) */
/*
 * Copyright (c) 2000-2016 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * @OSF_FREE_COPYRIGHT@
 */
/*
 * Mach Operating System
 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
 * All Rights Reserved.
 *
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie Mellon
 * the rights to redistribute these changes.
 */
/*
 */
/*
 *	File:	sched_prim.c
 *	Author:	Avadis Tevanian, Jr.
 *	Date:	1986
 *
 *	Scheduling primitives
 *
 */

#include <debug.h>

#include <mach/mach_types.h>
#include <mach/machine.h>
#include <mach/policy.h>
#include <mach/sync_policy.h>
#include <mach/thread_act.h>

#include <machine/machine_routines.h>
#include <machine/sched_param.h>
#include <machine/machine_cpu.h>
#include <machine/limits.h>
#include <machine/atomic.h>

#include <machine/commpage.h>

#include <kern/kern_types.h>
#include <kern/backtrace.h>
#include <kern/clock.h>
#include <kern/cpu_number.h>
#include <kern/cpu_data.h>
#include <kern/smp.h>
#include <kern/debug.h>
#include <kern/macro_help.h>
#include <kern/machine.h>
#include <kern/misc_protos.h>
#if MONOTONIC
#include <kern/monotonic.h>
#endif /* MONOTONIC */
#include <kern/processor.h>
#include <kern/queue.h>
#include <kern/restartable.h>
#include <kern/sched.h>
#include <kern/sched_prim.h>
#include <kern/sfi.h>
#include <kern/syscall_subr.h>
#include <kern/task.h>
#include <kern/thread.h>
#include <kern/thread_group.h>
#include <kern/ledger.h>
#include <kern/timer_queue.h>
#include <kern/waitq.h>
#include <kern/policy_internal.h>
#include <kern/cpu_quiesce.h>

#include <vm/pmap.h>
#include <vm/vm_kern.h>
#include <vm/vm_map.h>
#include <vm/vm_pageout.h>

#include <mach/sdt.h>
#include <mach/mach_host.h>
#include <mach/host_info.h>

#include <sys/kdebug.h>
#include <kperf/kperf.h>
#include <kern/kpc.h>
#include <san/kasan.h>
#include <kern/pms.h>
#include <kern/host.h>
#include <stdatomic.h>

struct sched_statistics PERCPU_DATA(sched_stats);
bool sched_stats_active;

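/*
 * deadline_add() is a saturating add: on overflow it clamps to UINT64_MAX
 * rather than wrapping, e.g. deadline_add(UINT64_MAX - 10, 100) == UINT64_MAX.
 */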
static uint64_t
deadline_add(uint64_t d, uint64_t e)
{
	uint64_t sum;
	return os_add_overflow(d, e, &sum) ? UINT64_MAX : sum;
}

int
rt_runq_count(processor_set_t pset)
{
	return os_atomic_load(&SCHED(rt_runq)(pset)->count, relaxed);
}

uint64_t
rt_runq_earliest_deadline(processor_set_t pset)
{
	return os_atomic_load_wide(&SCHED(rt_runq)(pset)->earliest_deadline, relaxed);
}

static int
rt_runq_priority(processor_set_t pset)
{
	pset_assert_locked(pset);
	rt_queue_t rt_run_queue = SCHED(rt_runq)(pset);

	bitmap_t *map = rt_run_queue->bitmap;
	int i = bitmap_first(map, NRTQS);
	assert(i < NRTQS);

	if (i >= 0) {
		return i + BASEPRI_RTQUEUES;
	}

	return i;
}
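
/*
 * Example: if bitmap_first() picks priority-index 0 (only threads at
 * sched_pri == BASEPRI_RTQUEUES are queued), rt_runq_priority() returns
 * BASEPRI_RTQUEUES; with an empty bitmap it returns -1 (no RT thread queued).
 */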

static thread_t rt_runq_first(rt_queue_t rt_runq);

#if DEBUG
static void
check_rt_runq_consistency(rt_queue_t rt_run_queue, thread_t thread)
{
	bitmap_t *map = rt_run_queue->bitmap;

	uint64_t earliest_deadline = RT_DEADLINE_NONE;
	uint32_t constraint = RT_CONSTRAINT_NONE;
	int ed_index = NOPRI;
	int count = 0;
	bool found_thread = false;

	for (int pri = BASEPRI_RTQUEUES; pri <= MAXPRI; pri++) {
		int i = pri - BASEPRI_RTQUEUES;
		rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i];
		queue_t queue = &rt_runq->pri_queue;
		queue_entry_t iter;
		int n = 0;
		uint64_t previous_deadline = 0;
		qe_foreach(iter, queue) {
			thread_t iter_thread = qe_element(iter, struct thread, runq_links);
			assert_thread_magic(iter_thread);
			if (iter_thread == thread) {
				found_thread = true;
			}
			assert(iter_thread->sched_pri == (i + BASEPRI_RTQUEUES));
			assert(iter_thread->realtime.deadline < RT_DEADLINE_NONE);
			assert(iter_thread->realtime.constraint < RT_CONSTRAINT_NONE);
			assert(previous_deadline <= iter_thread->realtime.deadline);
			n++;
			if (iter == queue_first(queue)) {
				assert(rt_runq->pri_earliest_deadline == iter_thread->realtime.deadline);
				assert(rt_runq->pri_constraint == iter_thread->realtime.constraint);
			}
			previous_deadline = iter_thread->realtime.deadline;
		}
		assert(n == rt_runq->pri_count);
		if (n == 0) {
			assert(bitmap_test(map, i) == false);
			assert(rt_runq->pri_earliest_deadline == RT_DEADLINE_NONE);
			assert(rt_runq->pri_constraint == RT_CONSTRAINT_NONE);
		} else {
			assert(bitmap_test(map, i) == true);
		}
		if (rt_runq->pri_earliest_deadline < earliest_deadline) {
			earliest_deadline = rt_runq->pri_earliest_deadline;
			constraint = rt_runq->pri_constraint;
			ed_index = i;
		}
		count += n;
	}
	assert(os_atomic_load_wide(&rt_run_queue->earliest_deadline, relaxed) == earliest_deadline);
	assert(os_atomic_load(&rt_run_queue->count, relaxed) == count);
	assert(os_atomic_load(&rt_run_queue->constraint, relaxed) == constraint);
	assert(os_atomic_load(&rt_run_queue->ed_index, relaxed) == ed_index);
	if (thread) {
		assert(found_thread);
	}
}
#define CHECK_RT_RUNQ_CONSISTENCY(q, th)    check_rt_runq_consistency(q, th)
#else
#define CHECK_RT_RUNQ_CONSISTENCY(q, th)    do {} while (0)
#endif

uint32_t rt_constraint_threshold;

static bool
rt_runq_is_low_latency(processor_set_t pset)
{
	return os_atomic_load(&SCHED(rt_runq)(pset)->constraint, relaxed) <= rt_constraint_threshold;
}
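
/*
 * Example: rt_constraint_threshold defaults to 4 ms (set in
 * sched_realtime_timebase_init() below), so a pset whose earliest-deadline
 * RT thread was admitted with, say, a 1 ms constraint counts as low
 * latency here.
 */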

#define         DEFAULT_PREEMPTION_RATE         100             /* (1/s) */
TUNABLE(int, default_preemption_rate, "preempt", DEFAULT_PREEMPTION_RATE);

#define         DEFAULT_BG_PREEMPTION_RATE      400             /* (1/s) */
TUNABLE(int, default_bg_preemption_rate, "bg_preempt", DEFAULT_BG_PREEMPTION_RATE);

#define         MAX_UNSAFE_QUANTA               800
TUNABLE(int, max_unsafe_quanta, "unsafe", MAX_UNSAFE_QUANTA);

#define         MAX_POLL_QUANTA                 2
TUNABLE(int, max_poll_quanta, "poll", MAX_POLL_QUANTA);

#define         SCHED_POLL_YIELD_SHIFT          4               /* 1/16 */
int             sched_poll_yield_shift = SCHED_POLL_YIELD_SHIFT;

uint64_t        max_poll_computation;

uint64_t        max_unsafe_computation;
uint64_t        sched_safe_duration;

#if defined(CONFIG_SCHED_TIMESHARE_CORE)

uint32_t        std_quantum;
uint32_t        min_std_quantum;
uint32_t        bg_quantum;

uint32_t        std_quantum_us;
uint32_t        bg_quantum_us;

#endif /* CONFIG_SCHED_TIMESHARE_CORE */

uint32_t        thread_depress_time;
uint32_t        default_timeshare_computation;
uint32_t        default_timeshare_constraint;

uint32_t        max_rt_quantum;
uint32_t        min_rt_quantum;

uint32_t        rt_deadline_epsilon;

uint32_t        rt_constraint_threshold;
uint32_t        rt_constraint_ll;

#if defined(CONFIG_SCHED_TIMESHARE_CORE)

unsigned                sched_tick;
uint32_t                sched_tick_interval;

/* Timeshare load calculation interval (15ms) */
uint32_t                sched_load_compute_interval_us = 15000;
uint64_t                sched_load_compute_interval_abs;
static _Atomic uint64_t sched_load_compute_deadline;

uint32_t        sched_pri_shifts[TH_BUCKET_MAX];
uint32_t        sched_fixed_shift;

uint32_t        sched_decay_usage_age_factor = 1; /* accelerate 5/8^n usage aging */

/* Allow foreground to decay past default to resolve inversions */
#define DEFAULT_DECAY_BAND_LIMIT ((BASEPRI_FOREGROUND - BASEPRI_DEFAULT) + 2)
int             sched_pri_decay_band_limit = DEFAULT_DECAY_BAND_LIMIT;

/* Defaults for timer deadline profiling */
#define TIMER_DEADLINE_TRACKING_BIN_1_DEFAULT 2000000 /* Timers with deadlines <= 2ms */
#define TIMER_DEADLINE_TRACKING_BIN_2_DEFAULT 5000000 /* Timers with deadlines <= 5ms */

uint64_t timer_deadline_tracking_bin_1;
uint64_t timer_deadline_tracking_bin_2;

#endif /* CONFIG_SCHED_TIMESHARE_CORE */

thread_t sched_maintenance_thread;

/* interrupts disabled lock to guard recommended cores state */
decl_simple_lock_data(static, sched_recommended_cores_lock);
static uint64_t    usercontrol_requested_recommended_cores = ALL_CORES_RECOMMENDED;
static void sched_update_recommended_cores(uint64_t recommended_cores);

#if __arm__ || __arm64__
static void sched_recommended_cores_maintenance(void);
uint64_t    perfcontrol_failsafe_starvation_threshold;
extern char *proc_name_address(struct proc *p);
#endif /* __arm__ || __arm64__ */

uint64_t        sched_one_second_interval;
boolean_t       allow_direct_handoff = TRUE;

/* Forwards */

#if defined(CONFIG_SCHED_TIMESHARE_CORE)

static void load_shift_init(void);
static void preempt_pri_init(void);

#endif /* CONFIG_SCHED_TIMESHARE_CORE */

thread_t        processor_idle(
	thread_t                        thread,
	processor_t                     processor);

static ast_t
csw_check_locked(
	thread_t        thread,
	processor_t     processor,
	processor_set_t pset,
	ast_t           check_reason);

static void processor_setrun(
	processor_t                    processor,
	thread_t                       thread,
	integer_t                      options);

static void
sched_realtime_timebase_init(void);

static void
sched_timer_deadline_tracking_init(void);

#if     DEBUG
extern int debug_task;
#define TLOG(a, fmt, args...) if(debug_task & a) kprintf(fmt, ## args)
#else
#define TLOG(a, fmt, args...) do {} while (0)
#endif

static processor_t
thread_bind_internal(
	thread_t                thread,
	processor_t             processor);

static void
sched_vm_group_maintenance(void);

#if defined(CONFIG_SCHED_TIMESHARE_CORE)
int8_t          sched_load_shifts[NRQS];
bitmap_t        sched_preempt_pri[BITMAP_LEN(NRQS_MAX)];
#endif /* CONFIG_SCHED_TIMESHARE_CORE */

/*
 * Statically allocate a buffer to hold the longest possible
 * scheduler description string, as currently implemented.
 * bsd/kern/kern_sysctl.c has a corresponding definition used to
 * export it to userspace via sysctl(3). If either version
 * changes, update the other.
 *
 * Note that in addition to being an upper bound on the strings
 * in the kernel, it's also an exact parameter to PE_get_default(),
 * which interrogates the device tree on some platforms. That
 * API requires that the caller know the exact size of the device tree
 * property, so we need both a legacy size (32) and the current size
 * (48) to deal with old and new device trees. The device tree property
 * is similarly padded to a fixed size so that the same kernel image
 * can run on multiple devices with different schedulers configured
 * in the device tree.
 */
char sched_string[SCHED_STRING_MAX_LENGTH];

uint32_t sched_debug_flags = SCHED_DEBUG_FLAG_CHOOSE_PROCESSOR_TRACEPOINTS;

/* Global flag which indicates whether Background Stepper Context is enabled */
static int cpu_throttle_enabled = 1;

#if DEVELOPMENT || DEBUG
int enable_task_set_cluster_type = 0;
bool system_ecore_only = false;
#endif /* DEVELOPMENT || DEBUG */

void
sched_init(void)
{
	boolean_t direct_handoff = FALSE;
	kprintf("Scheduler: Default of %s\n", SCHED(sched_name));

	if (!PE_parse_boot_argn("sched_pri_decay_limit", &sched_pri_decay_band_limit, sizeof(sched_pri_decay_band_limit))) {
		/* No boot-args, check in device tree */
		if (!PE_get_default("kern.sched_pri_decay_limit",
		    &sched_pri_decay_band_limit,
		    sizeof(sched_pri_decay_band_limit))) {
			/* Allow decay all the way to normal limits */
			sched_pri_decay_band_limit = DEFAULT_DECAY_BAND_LIMIT;
		}
	}

	kprintf("Setting scheduler priority decay band limit %d\n", sched_pri_decay_band_limit);

	if (PE_parse_boot_argn("sched_debug", &sched_debug_flags, sizeof(sched_debug_flags))) {
		kprintf("Scheduler: Debug flags 0x%08x\n", sched_debug_flags);
	}
	strlcpy(sched_string, SCHED(sched_name), sizeof(sched_string));

	cpu_quiescent_counter_init();

	SCHED(init)();
	SCHED(rt_init)(&pset0);
	sched_timer_deadline_tracking_init();

	SCHED(pset_init)(&pset0);
	SCHED(processor_init)(master_processor);

	if (PE_parse_boot_argn("direct_handoff", &direct_handoff, sizeof(direct_handoff))) {
		allow_direct_handoff = direct_handoff;
	}

#if DEVELOPMENT || DEBUG
	if (PE_parse_boot_argn("enable_skstsct", &enable_task_set_cluster_type, sizeof(enable_task_set_cluster_type))) {
		system_ecore_only = (enable_task_set_cluster_type == 2);
	}
#endif /* DEVELOPMENT || DEBUG */
}

void
sched_timebase_init(void)
{
	uint64_t        abstime;

	clock_interval_to_absolutetime_interval(1, NSEC_PER_SEC, &abstime);
	sched_one_second_interval = abstime;

	SCHED(timebase_init)();
	sched_realtime_timebase_init();
}

#if defined(CONFIG_SCHED_TIMESHARE_CORE)

void
sched_timeshare_init(void)
{
	/*
	 * Calculate the timeslicing quantum
	 * in us.
	 */
	if (default_preemption_rate < 1) {
		default_preemption_rate = DEFAULT_PREEMPTION_RATE;
	}
	std_quantum_us = (1000 * 1000) / default_preemption_rate;

	printf("standard timeslicing quantum is %d us\n", std_quantum_us);

	if (default_bg_preemption_rate < 1) {
		default_bg_preemption_rate = DEFAULT_BG_PREEMPTION_RATE;
	}
	bg_quantum_us = (1000 * 1000) / default_bg_preemption_rate;

	printf("standard background quantum is %d us\n", bg_quantum_us);
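
	/*
	 * Worked example with the defaults: default_preemption_rate == 100
	 * gives std_quantum_us == 10000 (a 10 ms quantum), and
	 * default_bg_preemption_rate == 400 gives bg_quantum_us == 2500
	 * (a 2.5 ms quantum for background threads).
	 */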

	load_shift_init();
	preempt_pri_init();
	sched_tick = 0;
}

void
sched_timeshare_timebase_init(void)
{
	uint64_t        abstime;
	uint32_t        shift;

	/* standard timeslicing quantum */
	clock_interval_to_absolutetime_interval(
		std_quantum_us, NSEC_PER_USEC, &abstime);
	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	std_quantum = (uint32_t)abstime;

	/* smallest remaining quantum (250 us) */
	clock_interval_to_absolutetime_interval(250, NSEC_PER_USEC, &abstime);
	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	min_std_quantum = (uint32_t)abstime;

	/* quantum for background tasks */
	clock_interval_to_absolutetime_interval(
		bg_quantum_us, NSEC_PER_USEC, &abstime);
	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	bg_quantum = (uint32_t)abstime;

	/* scheduler tick interval */
	clock_interval_to_absolutetime_interval(USEC_PER_SEC >> SCHED_TICK_SHIFT,
	    NSEC_PER_USEC, &abstime);
	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	sched_tick_interval = (uint32_t)abstime;

	/* timeshare load calculation interval & deadline initialization */
	clock_interval_to_absolutetime_interval(sched_load_compute_interval_us, NSEC_PER_USEC, &sched_load_compute_interval_abs);
	os_atomic_init(&sched_load_compute_deadline, sched_load_compute_interval_abs);

	/*
	 * Compute conversion factor from usage to
	 * timesharing priorities with 5/8 ** n aging.
	 */
	abstime = (abstime * 5) / 3;
	for (shift = 0; abstime > BASEPRI_DEFAULT; ++shift) {
		abstime >>= 1;
	}
	sched_fixed_shift = shift;
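
	/*
	 * Illustration (an assumed configuration, not a guarantee): with a
	 * timebase where one abstime unit is 1 ns and SCHED_TICK_SHIFT == 3,
	 * the tick interval is 125 ms, so abstime starts near 208,333,333
	 * here; halving until the value falls to BASEPRI_DEFAULT (31) or
	 * below takes 23 iterations, making sched_fixed_shift == 23.
	 */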

	for (uint32_t i = 0; i < TH_BUCKET_MAX; i++) {
		sched_pri_shifts[i] = INT8_MAX;
	}

	max_unsafe_computation = ((uint64_t)max_unsafe_quanta) * std_quantum;
	sched_safe_duration = 2 * ((uint64_t)max_unsafe_quanta) * std_quantum;

	max_poll_computation = ((uint64_t)max_poll_quanta) * std_quantum;
	thread_depress_time = 1 * std_quantum;
	default_timeshare_computation = std_quantum / 2;
	default_timeshare_constraint = std_quantum;

#if __arm__ || __arm64__
	perfcontrol_failsafe_starvation_threshold = (2 * sched_tick_interval);
#endif /* __arm__ || __arm64__ */
}

#endif /* CONFIG_SCHED_TIMESHARE_CORE */

void
pset_rt_init(processor_set_t pset)
{
	for (int pri = BASEPRI_RTQUEUES; pri <= MAXPRI; pri++) {
		int i = pri - BASEPRI_RTQUEUES;
		rt_queue_pri_t *rqi = &pset->rt_runq.rt_queue_pri[i];
		queue_init(&rqi->pri_queue);
		rqi->pri_count = 0;
		rqi->pri_earliest_deadline = RT_DEADLINE_NONE;
		rqi->pri_constraint = RT_CONSTRAINT_NONE;
	}
	os_atomic_init(&pset->rt_runq.count, 0);
	os_atomic_init(&pset->rt_runq.earliest_deadline, RT_DEADLINE_NONE);
	os_atomic_init(&pset->rt_runq.constraint, RT_CONSTRAINT_NONE);
	os_atomic_init(&pset->rt_runq.ed_index, NOPRI);
	memset(&pset->rt_runq.runq_stats, 0, sizeof pset->rt_runq.runq_stats);
}

/* constraint limit for low latency RT threads */
int rt_constraint_ll_us = 0;

int
sched_get_rt_constraint_ll(void)
{
	return rt_constraint_ll_us;
}

void
sched_set_rt_constraint_ll(int new_constraint_us)
{
	rt_constraint_ll_us = new_constraint_us;

	uint64_t abstime;
	clock_interval_to_absolutetime_interval(rt_constraint_ll_us, NSEC_PER_USEC, &abstime);
	assert((abstime >> 32) == 0 && ((rt_constraint_ll_us == 0) || (uint32_t)abstime != 0));
	rt_constraint_ll = (uint32_t)abstime;
}

/* epsilon for comparing RT deadlines */
int rt_deadline_epsilon_us = 100;

int
sched_get_rt_deadline_epsilon(void)
{
	return rt_deadline_epsilon_us;
}

void
sched_set_rt_deadline_epsilon(int new_epsilon_us)
{
	rt_deadline_epsilon_us = new_epsilon_us;

	uint64_t abstime;
	clock_interval_to_absolutetime_interval(rt_deadline_epsilon_us, NSEC_PER_USEC, &abstime);
	assert((abstime >> 32) == 0 && ((rt_deadline_epsilon_us == 0) || (uint32_t)abstime != 0));
	rt_deadline_epsilon = (uint32_t)abstime;
}

static void
sched_realtime_timebase_init(void)
{
	uint64_t abstime;

	/* smallest rt computation (50 us) */
	clock_interval_to_absolutetime_interval(50, NSEC_PER_USEC, &abstime);
	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	min_rt_quantum = (uint32_t)abstime;

	/* maximum rt computation (50 ms) */
	clock_interval_to_absolutetime_interval(
		50, 1000 * NSEC_PER_USEC, &abstime);
	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	max_rt_quantum = (uint32_t)abstime;

	/* constraint threshold for sending backup IPIs (4 ms) */
	clock_interval_to_absolutetime_interval(4, NSEC_PER_MSEC, &abstime);
	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	rt_constraint_threshold = (uint32_t)abstime;

	/* constraint limit for low latency RT threads */
	sched_set_rt_constraint_ll(rt_constraint_ll_us);

	/* epsilon for comparing deadlines */
	sched_set_rt_deadline_epsilon(rt_deadline_epsilon_us);
}

void
sched_check_spill(processor_set_t pset, thread_t thread)
{
	(void)pset;
	(void)thread;

	return;
}

bool
sched_thread_should_yield(processor_t processor, thread_t thread)
{
	(void)thread;

	return !SCHED(processor_queue_empty)(processor) || rt_runq_count(processor->processor_set) > 0;
}

/* Default implementations of .steal_thread_enabled */
bool
sched_steal_thread_DISABLED(processor_set_t pset)
{
	(void)pset;
	return false;
}

bool
sched_steal_thread_enabled(processor_set_t pset)
{
	return bit_count(pset->node->pset_map) > 1;
}

#if defined(CONFIG_SCHED_TIMESHARE_CORE)

/*
 * Set up values for timeshare
 * loading factors.
 */
static void
load_shift_init(void)
{
	int8_t          k, *p = sched_load_shifts;
	uint32_t        i, j;

	uint32_t        sched_decay_penalty = 1;

	if (PE_parse_boot_argn("sched_decay_penalty", &sched_decay_penalty, sizeof(sched_decay_penalty))) {
		kprintf("Overriding scheduler decay penalty %u\n", sched_decay_penalty);
	}

	if (PE_parse_boot_argn("sched_decay_usage_age_factor", &sched_decay_usage_age_factor, sizeof(sched_decay_usage_age_factor))) {
		kprintf("Overriding scheduler decay usage age factor %u\n", sched_decay_usage_age_factor);
	}

	if (sched_decay_penalty == 0) {
		/*
		 * There is no penalty for timeshare threads for using too much
		 * CPU, so set all load shifts to INT8_MIN. Even under high load,
		 * sched_pri_shift will be >INT8_MAX, and there will be no
		 * penalty applied to threads (nor will sched_usage be updated per
		 * thread).
		 */
		for (i = 0; i < NRQS; i++) {
			sched_load_shifts[i] = INT8_MIN;
		}

		return;
	}

	*p++ = INT8_MIN; *p++ = 0;

	/*
	 * For a given system load "i", the per-thread priority
	 * penalty per quantum of CPU usage is ~2^k priority
	 * levels. "sched_decay_penalty" can cause more
	 * array entries to be filled with smaller "k" values
	 */
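	/*
	 * Worked example with the default sched_decay_penalty == 1: the loop
	 * below fills indices 2-3 with k == 1, 4-7 with k == 2, 8-15 with
	 * k == 3, and so on, i.e. sched_load_shifts[i] is roughly log2(i).
	 */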
	for (i = 2, j = 1 << sched_decay_penalty, k = 1; i < NRQS; ++k) {
		for (j <<= 1; (i < j) && (i < NRQS); ++i) {
			*p++ = k;
		}
	}
}

static void
preempt_pri_init(void)
{
	bitmap_t *p = sched_preempt_pri;
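
	/*
	 * Mark the priority bands that warrant urgent preemption. On typical
	 * xnu configurations this covers BASEPRI_FOREGROUND (47) up to, but
	 * not including, MINPRI_KERNEL (80), plus BASEPRI_PREEMPT (92)
	 * through MAXPRI (127); the exact numbers depend on the platform's
	 * priority layout.
	 */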

	for (int i = BASEPRI_FOREGROUND; i < MINPRI_KERNEL; ++i) {
		bitmap_set(p, i);
	}

	for (int i = BASEPRI_PREEMPT; i <= MAXPRI; ++i) {
		bitmap_set(p, i);
	}
}

#endif /* CONFIG_SCHED_TIMESHARE_CORE */

void
check_monotonic_time(uint64_t ctime)
{
	processor_t processor = current_processor();
	uint64_t last_dispatch = processor->last_dispatch;

	if (last_dispatch > ctime) {
		panic("Non-monotonic time: last_dispatch at 0x%llx, ctime 0x%llx",
		    last_dispatch, ctime);
	}
}


/*
 *	Thread wait timer expiration.
 */
void
thread_timer_expire(
	void                    *p0,
	__unused void   *p1)
{
	thread_t                thread = p0;
	spl_t                   s;

	assert_thread_magic(thread);

	s = splsched();
	thread_lock(thread);
	if (--thread->wait_timer_active == 0) {
		if (thread->wait_timer_is_set) {
			thread->wait_timer_is_set = FALSE;
			clear_wait_internal(thread, THREAD_TIMED_OUT);
		}
	}
	thread_unlock(thread);
	splx(s);
}

/*
 *	thread_unblock:
 *
 *	Unblock thread on wake up.
 *
 *	Returns TRUE if the thread should now be placed on the runqueue.
 *
 *	Thread must be locked.
 *
 *	Called at splsched().
 */
boolean_t
thread_unblock(
	thread_t                thread,
	wait_result_t   wresult)
{
	boolean_t               ready_for_runq = FALSE;
	thread_t                cthread = current_thread();
	uint32_t                new_run_count;
	int                             old_thread_state;

	/*
	 *	Set wait_result.
	 */
	thread->wait_result = wresult;

	/*
	 *	Cancel pending wait timer.
	 */
	if (thread->wait_timer_is_set) {
		if (timer_call_cancel(thread->wait_timer)) {
			thread->wait_timer_active--;
		}
		thread->wait_timer_is_set = FALSE;
	}

	boolean_t aticontext, pidle;
	ml_get_power_state(&aticontext, &pidle);

	/*
	 *	Update scheduling state: not waiting,
	 *	set running.
	 */
	old_thread_state = thread->state;
	thread->state = (old_thread_state | TH_RUN) &
	    ~(TH_WAIT | TH_UNINT | TH_WAIT_REPORT);

	if ((old_thread_state & TH_RUN) == 0) {
		uint64_t ctime = mach_approximate_time();

		check_monotonic_time(ctime);

		thread->last_made_runnable_time = thread->last_basepri_change_time = ctime;
		timer_start(&thread->runnable_timer, ctime);

		ready_for_runq = TRUE;

		if (old_thread_state & TH_WAIT_REPORT) {
			(*thread->sched_call)(SCHED_CALL_UNBLOCK, thread);
		}

		/* Update the runnable thread count */
		new_run_count = SCHED(run_count_incr)(thread);

#if CONFIG_SCHED_AUTO_JOIN
		if (aticontext == FALSE && work_interval_should_propagate(cthread, thread)) {
			work_interval_auto_join_propagate(cthread, thread);
		}
#endif /* CONFIG_SCHED_AUTO_JOIN */
	} else {
		/*
		 * Either the thread is idling in place on another processor,
		 * or it hasn't finished context switching yet.
		 */
		assert((thread->state & TH_IDLE) == 0);
		/*
		 * The run count is only dropped after the context switch completes
		 * and the thread is still waiting, so we should not run_incr here
		 */
		new_run_count = os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
	}

	/*
	 * Calculate deadline for real-time threads.
	 */
	if (thread->sched_mode == TH_MODE_REALTIME) {
		uint64_t ctime = mach_absolute_time();
		thread->realtime.deadline = thread->realtime.constraint + ctime;
		KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SET_RT_DEADLINE) | DBG_FUNC_NONE,
		    (uintptr_t)thread_tid(thread), thread->realtime.deadline, thread->realtime.computation, 0);
	}

	/*
	 * Clear old quantum, fail-safe computation, etc.
	 */
	thread->quantum_remaining = 0;
	thread->computation_metered = 0;
	thread->reason = AST_NONE;
	thread->block_hint = kThreadWaitNone;

	/* Obtain power-relevant interrupt and "platform-idle exit" statistics.
	 * We also account for "double hop" thread signaling via
	 * the thread callout infrastructure.
	 * DRK: consider removing the callout wakeup counters in the future;
	 * they're present for verification at the moment.
	 */

	if (__improbable(aticontext && !(thread_get_tag_internal(thread) & THREAD_TAG_CALLOUT))) {
		DTRACE_SCHED2(iwakeup, struct thread *, thread, struct proc *, current_proc());

		uint64_t ttd = current_processor()->timer_call_ttd;

		if (ttd) {
			if (ttd <= timer_deadline_tracking_bin_1) {
				thread->thread_timer_wakeups_bin_1++;
			} else if (ttd <= timer_deadline_tracking_bin_2) {
				thread->thread_timer_wakeups_bin_2++;
			}
		}

		ledger_credit_thread(thread, thread->t_ledger,
		    task_ledgers.interrupt_wakeups, 1);
		if (pidle) {
			ledger_credit_thread(thread, thread->t_ledger,
			    task_ledgers.platform_idle_wakeups, 1);
		}
	} else if (thread_get_tag_internal(cthread) & THREAD_TAG_CALLOUT) {
		/* TODO: what about an interrupt that does a wake taken on a callout thread? */
		if (cthread->callout_woken_from_icontext) {
			ledger_credit_thread(thread, thread->t_ledger,
			    task_ledgers.interrupt_wakeups, 1);
			thread->thread_callout_interrupt_wakeups++;

			if (cthread->callout_woken_from_platform_idle) {
				ledger_credit_thread(thread, thread->t_ledger,
				    task_ledgers.platform_idle_wakeups, 1);
				thread->thread_callout_platform_idle_wakeups++;
			}

			cthread->callout_woke_thread = TRUE;
		}
	}

	if (thread_get_tag_internal(thread) & THREAD_TAG_CALLOUT) {
		thread->callout_woken_from_icontext = !!aticontext;
		thread->callout_woken_from_platform_idle = !!pidle;
		thread->callout_woke_thread = FALSE;
	}

#if KPERF
	if (ready_for_runq) {
		kperf_make_runnable(thread, aticontext);
	}
#endif /* KPERF */

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_MAKE_RUNNABLE) | DBG_FUNC_NONE,
	    (uintptr_t)thread_tid(thread), thread->sched_pri, thread->wait_result,
	    sched_run_buckets[TH_BUCKET_RUN], 0);

	DTRACE_SCHED2(wakeup, struct thread *, thread, struct proc *, current_proc());

	return ready_for_runq;
}

/*
 *	Routine:	thread_allowed_for_handoff
 *	Purpose:
 *		Check if the thread is allowed for handoff operation
 *	Conditions:
 *		thread lock held, IPC locks may be held.
 *	TODO: In future, do not allow handoff if threads have different cluster
 *	recommendations.
 */
boolean_t
thread_allowed_for_handoff(
	thread_t         thread)
{
	thread_t self = current_thread();

	if (allow_direct_handoff &&
	    thread->sched_mode == TH_MODE_REALTIME &&
	    self->sched_mode == TH_MODE_REALTIME) {
		return TRUE;
	}

	return FALSE;
}

/*
 *	Routine:	thread_go
 *	Purpose:
 *		Unblock and dispatch thread.
 *	Conditions:
 *		thread lock held, IPC locks may be held.
 *		thread must have been pulled from wait queue under same lock hold.
 *		thread must have been waiting
 *	Returns:
 *		KERN_SUCCESS - Thread was set running
 *
 * TODO: This should return void
 */
kern_return_t
thread_go(
	thread_t        thread,
	wait_result_t   wresult,
	waitq_options_t option)
{
	thread_t self = current_thread();

	assert_thread_magic(thread);

	assert(thread->at_safe_point == FALSE);
	assert(thread->wait_event == NO_EVENT64);
	assert(waitq_wait_possible(thread));

	assert(!(thread->state & (TH_TERMINATE | TH_TERMINATE2)));
	assert(thread->state & TH_WAIT);


	if (thread_unblock(thread, wresult)) {
#if SCHED_TRACE_THREAD_WAKEUPS
		backtrace(&thread->thread_wakeup_bt[0],
		    (sizeof(thread->thread_wakeup_bt) / sizeof(uintptr_t)), NULL,
		    NULL);
#endif /* SCHED_TRACE_THREAD_WAKEUPS */
		if ((option & WQ_OPTION_HANDOFF) &&
		    thread_allowed_for_handoff(thread)) {
			thread_reference(thread);
			assert(self->handoff_thread == NULL);
			self->handoff_thread = thread;
		} else {
			thread_setrun(thread, SCHED_PREEMPT | SCHED_TAILQ);
		}
	}

	return KERN_SUCCESS;
}

/*
 *	Routine:	thread_mark_wait_locked
 *	Purpose:
 *		Mark a thread as waiting.  If, given the circumstances,
 *		it doesn't want to wait (i.e. already aborted), then
 *		indicate that in the return value.
 *	Conditions:
 *		at splsched() and thread is locked.
 */
__private_extern__
wait_result_t
thread_mark_wait_locked(
	thread_t                        thread,
	wait_interrupt_t        interruptible_orig)
{
	boolean_t                       at_safe_point;
	wait_interrupt_t        interruptible = interruptible_orig;

	if (thread->state & TH_IDLE) {
		panic("Invalid attempt to wait while running the idle thread");
	}

	assert(!(thread->state & (TH_WAIT | TH_IDLE | TH_UNINT | TH_TERMINATE2 | TH_WAIT_REPORT)));

	/*
	 *	The thread may have certain types of interrupts/aborts masked
	 *	off.  Even if the wait location says these types of interrupts
	 *	are OK, we have to honor mask settings (outer-scoped code may
	 *	not be able to handle aborts at the moment).
	 */
	interruptible &= TH_OPT_INTMASK;
	if (interruptible > (thread->options & TH_OPT_INTMASK)) {
		interruptible = thread->options & TH_OPT_INTMASK;
	}
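
	/*
	 * Example: if a caller asserts a THREAD_ABORTSAFE wait while the
	 * thread has restricted its own interruptibility to THREAD_UNINT
	 * (via thread_interrupt_level(), below), the effective wait is
	 * downgraded to THREAD_UNINT here.
	 */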

	at_safe_point = (interruptible == THREAD_ABORTSAFE);

	if (interruptible == THREAD_UNINT ||
	    !(thread->sched_flags & TH_SFLAG_ABORT) ||
	    (!at_safe_point &&
	    (thread->sched_flags & TH_SFLAG_ABORTSAFELY))) {
		if (!(thread->state & TH_TERMINATE)) {
			DTRACE_SCHED(sleep);
		}

		int state_bits = TH_WAIT;
		if (!interruptible) {
			state_bits |= TH_UNINT;
		}
		if (thread->sched_call) {
			wait_interrupt_t mask = THREAD_WAIT_NOREPORT_USER;
			if (is_kerneltask(get_threadtask(thread))) {
				mask = THREAD_WAIT_NOREPORT_KERNEL;
			}
			if ((interruptible_orig & mask) == 0) {
				state_bits |= TH_WAIT_REPORT;
			}
		}
		thread->state |= state_bits;
		thread->at_safe_point = at_safe_point;

		/* TODO: pass this through assert_wait instead, have
		 * assert_wait just take a struct as an argument */
		assert(!thread->block_hint);
		thread->block_hint = thread->pending_block_hint;
		thread->pending_block_hint = kThreadWaitNone;

		return thread->wait_result = THREAD_WAITING;
	} else {
		if (thread->sched_flags & TH_SFLAG_ABORTSAFELY) {
			thread->sched_flags &= ~TH_SFLAG_ABORTED_MASK;
		}
	}
	thread->pending_block_hint = kThreadWaitNone;

	return thread->wait_result = THREAD_INTERRUPTED;
}

/*
 *	Routine:	thread_interrupt_level
 *	Purpose:
 *	        Set the maximum interruptible state for the
 *		current thread.  The effective value of any
 *		interruptible flag passed into assert_wait
 *		will never exceed this.
 *
 *		Useful for code that must not be interrupted,
 *		but which calls code that doesn't know that.
 *	Returns:
 *		The old interrupt level for the thread.
 */
__private_extern__
wait_interrupt_t
thread_interrupt_level(
	wait_interrupt_t new_level)
{
	thread_t thread = current_thread();
	wait_interrupt_t result = thread->options & TH_OPT_INTMASK;

	thread->options = (thread->options & ~TH_OPT_INTMASK) | (new_level & TH_OPT_INTMASK);

	return result;
}

/*
 *	assert_wait:
 *
 *	Assert that the current thread is about to go to
 *	sleep until the specified event occurs.
 */
wait_result_t
assert_wait(
	event_t                         event,
	wait_interrupt_t        interruptible)
{
	if (__improbable(event == NO_EVENT)) {
		panic("%s() called with NO_EVENT", __func__);
	}

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT) | DBG_FUNC_NONE,
	    VM_KERNEL_UNSLIDE_OR_PERM(event), 0, 0, 0, 0);

	struct waitq *waitq;
	waitq = global_eventq(event);
	return waitq_assert_wait64(waitq, CAST_EVENT64_T(event), interruptible, TIMEOUT_WAIT_FOREVER);
}

/*
 *	assert_wait_queue:
 *
 *	Return the global waitq for the specified event
 */
struct waitq *
assert_wait_queue(
	event_t                         event)
{
	return global_eventq(event);
}

wait_result_t
assert_wait_timeout(
	event_t                         event,
	wait_interrupt_t        interruptible,
	uint32_t                        interval,
	uint32_t                        scale_factor)
{
	thread_t                        thread = current_thread();
	wait_result_t           wresult;
	uint64_t                        deadline;
	spl_t                           s;

	if (__improbable(event == NO_EVENT)) {
		panic("%s() called with NO_EVENT", __func__);
	}

	struct waitq *waitq;
	waitq = global_eventq(event);

	s = splsched();
	waitq_lock(waitq);

	clock_interval_to_deadline(interval, scale_factor, &deadline);

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT) | DBG_FUNC_NONE,
	    VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);

	wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
	    interruptible,
	    TIMEOUT_URGENCY_SYS_NORMAL,
	    deadline, TIMEOUT_NO_LEEWAY,
	    thread);

	waitq_unlock(waitq);
	splx(s);
	return wresult;
}

wait_result_t
assert_wait_timeout_with_leeway(
	event_t                         event,
	wait_interrupt_t        interruptible,
	wait_timeout_urgency_t  urgency,
	uint32_t                        interval,
	uint32_t                        leeway,
	uint32_t                        scale_factor)
{
	thread_t                        thread = current_thread();
	wait_result_t           wresult;
	uint64_t                        deadline;
	uint64_t                        abstime;
	uint64_t                        slop;
	uint64_t                        now;
	spl_t                           s;

	if (__improbable(event == NO_EVENT)) {
		panic("%s() called with NO_EVENT", __func__);
	}

	now = mach_absolute_time();
	clock_interval_to_absolutetime_interval(interval, scale_factor, &abstime);
	deadline = now + abstime;

	clock_interval_to_absolutetime_interval(leeway, scale_factor, &slop);

	struct waitq *waitq;
	waitq = global_eventq(event);

	s = splsched();
	waitq_lock(waitq);

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT) | DBG_FUNC_NONE,
	    VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);

	wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
	    interruptible,
	    urgency, deadline, slop,
	    thread);

	waitq_unlock(waitq);
	splx(s);
	return wresult;
}

wait_result_t
assert_wait_deadline(
	event_t                         event,
	wait_interrupt_t        interruptible,
	uint64_t                        deadline)
{
	thread_t                        thread = current_thread();
	wait_result_t           wresult;
	spl_t                           s;

	if (__improbable(event == NO_EVENT)) {
		panic("%s() called with NO_EVENT", __func__);
	}

	struct waitq *waitq;
	waitq = global_eventq(event);

	s = splsched();
	waitq_lock(waitq);

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT) | DBG_FUNC_NONE,
	    VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);

	wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
	    interruptible,
	    TIMEOUT_URGENCY_SYS_NORMAL, deadline,
	    TIMEOUT_NO_LEEWAY, thread);
	waitq_unlock(waitq);
	splx(s);
	return wresult;
}

wait_result_t
assert_wait_deadline_with_leeway(
	event_t                         event,
	wait_interrupt_t        interruptible,
	wait_timeout_urgency_t  urgency,
	uint64_t                        deadline,
	uint64_t                        leeway)
{
	thread_t                        thread = current_thread();
	wait_result_t           wresult;
	spl_t                           s;

	if (__improbable(event == NO_EVENT)) {
		panic("%s() called with NO_EVENT", __func__);
	}

	struct waitq *waitq;
	waitq = global_eventq(event);

	s = splsched();
	waitq_lock(waitq);

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT) | DBG_FUNC_NONE,
	    VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);

	wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
	    interruptible,
	    urgency, deadline, leeway,
	    thread);
	waitq_unlock(waitq);
	splx(s);
	return wresult;
}

/*
 * thread_isoncpu:
 *
 * Return TRUE if a thread is running on a processor such that an AST
 * is needed to pull it out of userspace execution, or if executing in
 * the kernel, bring to a context switch boundary that would cause
 * thread state to be serialized in the thread PCB.
 *
 * Thread locked, returns the same way. While locked, fields
 * like "state" cannot change. "runq" can change only from set to unset.
 */
static inline boolean_t
thread_isoncpu(thread_t thread)
{
	/* Not running or runnable */
	if (!(thread->state & TH_RUN)) {
		return FALSE;
	}

	/* Waiting on a runqueue, not currently running */
	/* TODO: This is invalid - it can get dequeued without thread lock, but not context switched. */
	if (thread->runq != PROCESSOR_NULL) {
		return FALSE;
	}

	/*
	 * Thread does not have a stack yet
	 * It could be on the stack alloc queue or preparing to be invoked
	 */
	if (!thread->kernel_stack) {
		return FALSE;
	}

	/*
	 * Thread must be running on a processor, or
	 * about to run, or just did run. In all these
	 * cases, an AST to the processor is needed
	 * to guarantee that the thread is kicked out
	 * of userspace and the processor has
	 * context switched (and saved register state).
	 */
	return TRUE;
}

/*
 * thread_stop:
 *
 * Force a preemption point for a thread and wait
 * for it to stop running on a CPU. If a stronger
 * guarantee is requested, wait until no longer
 * runnable. Arbitrates access among
 * multiple stop requests. (released by unstop)
 *
 * The thread must enter a wait state and stop via a
 * separate means.
 *
 * Returns FALSE if interrupted.
 */
boolean_t
thread_stop(
	thread_t                thread,
	boolean_t       until_not_runnable)
{
	wait_result_t   wresult;
	spl_t                   s = splsched();
	boolean_t               oncpu;

	wake_lock(thread);
	thread_lock(thread);

	while (thread->state & TH_SUSP) {
		thread->wake_active = TRUE;
		thread_unlock(thread);

		wresult = assert_wait(&thread->wake_active, THREAD_ABORTSAFE);
		wake_unlock(thread);
		splx(s);

		if (wresult == THREAD_WAITING) {
			wresult = thread_block(THREAD_CONTINUE_NULL);
		}

		if (wresult != THREAD_AWAKENED) {
			return FALSE;
		}

		s = splsched();
		wake_lock(thread);
		thread_lock(thread);
	}

	thread->state |= TH_SUSP;

	while ((oncpu = thread_isoncpu(thread)) ||
	    (until_not_runnable && (thread->state & TH_RUN))) {
		processor_t             processor;

		if (oncpu) {
			assert(thread->state & TH_RUN);
			processor = thread->chosen_processor;
			cause_ast_check(processor);
		}

		thread->wake_active = TRUE;
		thread_unlock(thread);

		wresult = assert_wait(&thread->wake_active, THREAD_ABORTSAFE);
		wake_unlock(thread);
		splx(s);

		if (wresult == THREAD_WAITING) {
			wresult = thread_block(THREAD_CONTINUE_NULL);
		}

		if (wresult != THREAD_AWAKENED) {
			thread_unstop(thread);
			return FALSE;
		}

		s = splsched();
		wake_lock(thread);
		thread_lock(thread);
	}

	thread_unlock(thread);
	wake_unlock(thread);
	splx(s);

	/*
	 * We return with the thread unlocked. To prevent it from
	 * transitioning to a runnable state (or from TH_RUN to
	 * being on the CPU), the caller must ensure the thread
	 * is stopped via an external means (such as an AST)
	 */

	return TRUE;
}

/*
 * thread_unstop:
 *
 * Release a previous stop request and set
 * the thread running if appropriate.
 *
 * Use only after a successful stop operation.
 */
void
thread_unstop(
	thread_t        thread)
{
	spl_t           s = splsched();

	wake_lock(thread);
	thread_lock(thread);

	assert((thread->state & (TH_RUN | TH_WAIT | TH_SUSP)) != TH_SUSP);

	if (thread->state & TH_SUSP) {
		thread->state &= ~TH_SUSP;

		if (thread->wake_active) {
			thread->wake_active = FALSE;
			thread_unlock(thread);

			thread_wakeup(&thread->wake_active);
			wake_unlock(thread);
			splx(s);

			return;
		}
	}

	thread_unlock(thread);
	wake_unlock(thread);
	splx(s);
}

/*
 * thread_wait:
 *
 * Wait for a thread to stop running. (non-interruptible)
 *
 */
void
thread_wait(
	thread_t        thread,
	boolean_t       until_not_runnable)
{
	wait_result_t   wresult;
	boolean_t       oncpu;
	processor_t     processor;
	spl_t           s = splsched();

	wake_lock(thread);
	thread_lock(thread);

	/*
	 * Wait until not running on a CPU.  If stronger requirement
	 * desired, wait until not runnable.  Assumption: if thread is
	 * on CPU, then TH_RUN is set, so we're not waiting in any case
	 * where the original, pure "TH_RUN" check would have let us
	 * finish.
	 */
	while ((oncpu = thread_isoncpu(thread)) ||
	    (until_not_runnable && (thread->state & TH_RUN))) {
		if (oncpu) {
			assert(thread->state & TH_RUN);
			processor = thread->chosen_processor;
			cause_ast_check(processor);
		}

		thread->wake_active = TRUE;
		thread_unlock(thread);

		wresult = assert_wait(&thread->wake_active, THREAD_UNINT);
		wake_unlock(thread);
		splx(s);

		if (wresult == THREAD_WAITING) {
			thread_block(THREAD_CONTINUE_NULL);
		}

		s = splsched();
		wake_lock(thread);
		thread_lock(thread);
	}

	thread_unlock(thread);
	wake_unlock(thread);
	splx(s);
}

/*
 *	Routine: clear_wait_internal
 *
 *		Clear the wait condition for the specified thread.
 *		Start the thread executing if that is appropriate.
 *	Arguments:
 *		thread		thread to awaken
 *		result		Wakeup result the thread should see
 *	Conditions:
 *		At splsched
 *		the thread is locked.
 *	Returns:
 *		KERN_SUCCESS		thread was rousted out a wait
 *		KERN_FAILURE		thread was waiting but could not be rousted
 *		KERN_NOT_WAITING	thread was not waiting
 */
__private_extern__ kern_return_t
clear_wait_internal(
	thread_t        thread,
	wait_result_t   wresult)
{
	waitq_t waitq = thread->waitq;

	if (wresult == THREAD_INTERRUPTED && (thread->state & TH_UNINT)) {
		return KERN_FAILURE;
	}

	if (!waitq_is_null(waitq) && !waitq_pull_thread_locked(waitq, thread)) {
		return KERN_NOT_WAITING;
	}

	/* TODO: Can we instead assert TH_TERMINATE is not set?  */
	if ((thread->state & (TH_WAIT | TH_TERMINATE)) != TH_WAIT) {
		return KERN_NOT_WAITING;
	}

	return thread_go(thread, wresult, WQ_OPTION_NONE);
}


/*
 *	clear_wait:
 *
 *	Clear the wait condition for the specified thread.  Start the thread
 *	executing if that is appropriate.
 *
 *	parameters:
 *	  thread		thread to awaken
 *	  result		Wakeup result the thread should see
 */
kern_return_t
clear_wait(
	thread_t                thread,
	wait_result_t   result)
{
	kern_return_t ret;
	spl_t           s;

	s = splsched();
	thread_lock(thread);
	ret = clear_wait_internal(thread, result);
	thread_unlock(thread);
	splx(s);
	return ret;
}


/*
 *	thread_wakeup_prim:
 *
 *	Common routine for thread_wakeup, thread_wakeup_with_result,
 *	and thread_wakeup_one.
 *
 */
kern_return_t
thread_wakeup_prim(
	event_t          event,
	boolean_t        one_thread,
	wait_result_t    result)
{
	if (__improbable(event == NO_EVENT)) {
		panic("%s() called with NO_EVENT", __func__);
	}

	struct waitq *wq = global_eventq(event);

	if (one_thread) {
		return waitq_wakeup64_one(wq, CAST_EVENT64_T(event), result, WAITQ_ALL_PRIORITIES);
	} else {
		return waitq_wakeup64_all(wq, CAST_EVENT64_T(event), result, WAITQ_ALL_PRIORITIES);
	}
}

/*
 * Wakeup a specified thread if and only if it's waiting for this event
 */
kern_return_t
thread_wakeup_thread(
	event_t         event,
	thread_t        thread)
{
	if (__improbable(event == NO_EVENT)) {
		panic("%s() called with NO_EVENT", __func__);
	}

	if (__improbable(thread == THREAD_NULL)) {
		panic("%s() called with THREAD_NULL", __func__);
	}

	struct waitq *wq = global_eventq(event);

	return waitq_wakeup64_thread(wq, CAST_EVENT64_T(event), thread, THREAD_AWAKENED);
}

/*
 * Wakeup a thread waiting on an event and promote it to a priority.
 *
 * Requires woken thread to un-promote itself when done.
 */
kern_return_t
thread_wakeup_one_with_pri(
	event_t      event,
	int          priority)
{
	if (__improbable(event == NO_EVENT)) {
		panic("%s() called with NO_EVENT", __func__);
	}

	struct waitq *wq = global_eventq(event);

	return waitq_wakeup64_one(wq, CAST_EVENT64_T(event), THREAD_AWAKENED, priority);
}

/*
 * Wakeup a thread waiting on an event,
 * promote it to a priority,
 * and return a reference to the woken thread.
 *
 * Requires woken thread to un-promote itself when done.
 */
thread_t
thread_wakeup_identify(event_t  event,
    int      priority)
{
	if (__improbable(event == NO_EVENT)) {
		panic("%s() called with NO_EVENT", __func__);
	}

	struct waitq *wq = global_eventq(event);

	return waitq_wakeup64_identify(wq, CAST_EVENT64_T(event), THREAD_AWAKENED, priority);
}

/*
 *	thread_bind:
 *
 *	Force the current thread to execute on the specified processor.
 *	Takes effect after the next thread_block().
 *
 *	Returns the previous binding.  PROCESSOR_NULL means
 *	not bound.
 *
 *	XXX - DO NOT export this to users - XXX
 */
processor_t
thread_bind(
	processor_t             processor)
{
	thread_t                self = current_thread();
	processor_t             prev;
	spl_t                   s;

	s = splsched();
	thread_lock(self);

	prev = thread_bind_internal(self, processor);

	thread_unlock(self);
	splx(s);

	return prev;
}

/*
 * thread_bind_internal:
 *
 * If the specified thread is not the current thread, and it is currently
 * running on another CPU, a remote AST must be sent to that CPU to cause
 * the thread to migrate to its bound processor. Otherwise, the migration
 * will occur at the next quantum expiration or blocking point.
 *
 * When the thread is the current thread, an explicit thread_block() should
 * be used to force the current processor to context switch away and
 * let the thread migrate to the bound processor.
 *
 * Thread must be locked, and at splsched.
 */
1741 
1742 static processor_t
1743 thread_bind_internal(
1744 	thread_t                thread,
1745 	processor_t             processor)
1746 {
1747 	processor_t             prev;
1748 
1749 	/* <rdar://problem/15102234> */
1750 	assert(thread->sched_pri < BASEPRI_RTQUEUES);
1751 	/* A thread can't be bound if it's sitting on a (potentially incorrect) runqueue */
1752 	assert(thread->runq == PROCESSOR_NULL);
1753 
1754 	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_THREAD_BIND), thread_tid(thread), processor ? (uintptr_t)processor->cpu_id : (uintptr_t)-1, 0, 0, 0);
1755 
1756 	prev = thread->bound_processor;
1757 	thread->bound_processor = processor;
1758 
1759 	return prev;
1760 }
1761 
1762 /*
1763  * thread_vm_bind_group_add:
1764  *
1765  * The "VM bind group" is a special mechanism to mark a collection
1766  * of threads from the VM subsystem that, in general, should be scheduled
1767  * with only one CPU of parallelism. To accomplish this, we initially
1768  * bind all the threads to the master processor, which has the effect
1769  * that only one of the threads in the group can execute at once, including
1770  * preempting lower-priority threads in the group. Future
1771  * implementations may use more dynamic mechanisms to prevent the collection
1772  * of VM threads from using more CPU time than desired.
1773  *
1774  * The current implementation can result in priority inversions where
1775  * compute-bound priority 95 or realtime threads that happen to have
1776  * landed on the master processor prevent the VM threads from running.
1777  * When this situation is detected, we unbind the threads for one
1778  * scheduler tick to allow the scheduler to run the threads on
1779  * additional CPUs, before restoring the binding (assuming high latency
1780  * is no longer a problem).
1781  */
1782 
1783 /*
1784  * The current max is provisioned for:
1785  * vm_compressor_swap_trigger_thread (92)
1786  * 2 x vm_pageout_iothread_internal (92) when vm_restricted_to_single_processor==TRUE
1787  * vm_pageout_continue (92)
1788  * memorystatus_thread (95)
1789  */
1790 #define MAX_VM_BIND_GROUP_COUNT (5)
1791 decl_simple_lock_data(static, sched_vm_group_list_lock);
1792 static thread_t sched_vm_group_thread_list[MAX_VM_BIND_GROUP_COUNT];
1793 static int sched_vm_group_thread_count;
1794 static boolean_t sched_vm_group_temporarily_unbound = FALSE;
1795 
1796 void
1797 thread_vm_bind_group_add(void)
1798 {
1799 	thread_t self = current_thread();
1800 
1801 	thread_reference(self);
1802 	self->options |= TH_OPT_SCHED_VM_GROUP;
1803 
1804 	simple_lock(&sched_vm_group_list_lock, LCK_GRP_NULL);
1805 	assert(sched_vm_group_thread_count < MAX_VM_BIND_GROUP_COUNT);
1806 	sched_vm_group_thread_list[sched_vm_group_thread_count++] = self;
1807 	simple_unlock(&sched_vm_group_list_lock);
1808 
1809 	thread_bind(master_processor);
1810 
1811 	/* Switch to bound processor if not already there */
1812 	thread_block(THREAD_CONTINUE_NULL);
1813 }
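
/*
 * Illustrative sketch (hypothetical, never built): how a VM helper thread
 * would join the bind group at startup. After the call it runs on the
 * master processor until sched_vm_group_maintenance() temporarily unbinds
 * the group.
 */
#if 0	/* example only */
static void
example_vm_helper_continuation(void)
{
	thread_vm_bind_group_add();

	for (;;) {
		/* ... do VM work, block, repeat ... */
	}
}
#endif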
1814 
1815 static void
1816 sched_vm_group_maintenance(void)
1817 {
1818 	uint64_t ctime = mach_absolute_time();
1819 	uint64_t longtime = ctime - sched_tick_interval;
1820 	int i;
1821 	spl_t s;
1822 	boolean_t high_latency_observed = FALSE;
1823 	boolean_t runnable_and_not_on_runq_observed = FALSE;
1824 	boolean_t bind_target_changed = FALSE;
1825 	processor_t bind_target = PROCESSOR_NULL;
1826 
1827 	/* Make sure nobody attempts to add new threads while we are enumerating them */
1828 	simple_lock(&sched_vm_group_list_lock, LCK_GRP_NULL);
1829 
1830 	s = splsched();
1831 
1832 	for (i = 0; i < sched_vm_group_thread_count; i++) {
1833 		thread_t thread = sched_vm_group_thread_list[i];
1834 		assert(thread != THREAD_NULL);
1835 		thread_lock(thread);
1836 		if ((thread->state & (TH_RUN | TH_WAIT)) == TH_RUN) {
1837 			if (thread->runq != PROCESSOR_NULL && thread->last_made_runnable_time < longtime) {
1838 				high_latency_observed = TRUE;
1839 			} else if (thread->runq == PROCESSOR_NULL) {
1840 				/* There are some cases where a thread may be transitioning that also fall into this case */
1841 				runnable_and_not_on_runq_observed = TRUE;
1842 			}
1843 		}
1844 		thread_unlock(thread);
1845 
1846 		if (high_latency_observed && runnable_and_not_on_runq_observed) {
1847 			/* All the things we are looking for are true, stop looking */
1848 			break;
1849 		}
1850 	}
1851 
1852 	splx(s);
1853 
1854 	if (sched_vm_group_temporarily_unbound) {
1855 		/* If we turned off binding, make sure everything is OK before rebinding */
1856 		if (!high_latency_observed) {
1857 			/* rebind */
1858 			bind_target_changed = TRUE;
1859 			bind_target = master_processor;
1860 			sched_vm_group_temporarily_unbound = FALSE; /* might be reset to TRUE if change cannot be completed */
1861 		}
1862 	} else {
1863 		/*
1864 		 * Check if we're in a bad state, which is defined by high
1865 		 * latency with no core currently executing a thread. If a
1866 		 * single thread is making progress on a CPU, that means the
1867 		 * binding concept to reduce parallelism is working as
1868 		 * designed.
1869 		 */
1870 		if (high_latency_observed && !runnable_and_not_on_runq_observed) {
1871 			/* unbind */
1872 			bind_target_changed = TRUE;
1873 			bind_target = PROCESSOR_NULL;
1874 			sched_vm_group_temporarily_unbound = TRUE;
1875 		}
1876 	}
1877 
1878 	if (bind_target_changed) {
1879 		s = splsched();
1880 		for (i = 0; i < sched_vm_group_thread_count; i++) {
1881 			thread_t thread = sched_vm_group_thread_list[i];
1882 			boolean_t removed;
1883 			assert(thread != THREAD_NULL);
1884 
1885 			thread_lock(thread);
1886 			removed = thread_run_queue_remove(thread);
1887 			if (removed || ((thread->state & (TH_RUN | TH_WAIT)) == TH_WAIT)) {
1888 				thread_bind_internal(thread, bind_target);
1889 			} else {
1890 				/*
1891 				 * Thread was in the middle of being context-switched-to,
1892 				 * or was in the process of blocking. To avoid switching the bind
1893 				 * state out mid-flight, defer the change if possible.
1894 				 */
1895 				if (bind_target == PROCESSOR_NULL) {
1896 					thread_bind_internal(thread, bind_target);
1897 				} else {
1898 					sched_vm_group_temporarily_unbound = TRUE; /* next pass will try again */
1899 				}
1900 			}
1901 
1902 			if (removed) {
1903 				thread_run_queue_reinsert(thread, SCHED_PREEMPT | SCHED_TAILQ);
1904 			}
1905 			thread_unlock(thread);
1906 		}
1907 		splx(s);
1908 	}
1909 
1910 	simple_unlock(&sched_vm_group_list_lock);
1911 }
1912 
1913 #if defined(__x86_64__)
1914 #define SCHED_AVOID_CPU0 1
1915 #else
1916 #define SCHED_AVOID_CPU0 0
1917 #endif
1918 
1919 int sched_allow_rt_smt = 1;
1920 int sched_avoid_cpu0 = SCHED_AVOID_CPU0;
1921 int sched_choose_first_fd_processor = 1;
1922 int sched_allow_rt_steal = 1;
1923 int sched_backup_cpu_timeout_count = 5; /* The maximum number of 10us delays to wait before using a backup cpu */
1924 
1925 int sched_rt_n_backup_processors = SCHED_DEFAULT_BACKUP_PROCESSORS;
1926 
1927 int
1928 sched_get_rt_n_backup_processors(void)
1929 {
1930 	return sched_rt_n_backup_processors;
1931 }
1932 
1933 void
1934 sched_set_rt_n_backup_processors(int n)
1935 {
1936 	if (n < 0) {
1937 		n = 0;
1938 	} else if (n > SCHED_MAX_BACKUP_PROCESSORS) {
1939 		n = SCHED_MAX_BACKUP_PROCESSORS;
1940 	}
1941 
1942 	sched_rt_n_backup_processors = n;
1943 }
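
/*
 * Illustrative sketch (hypothetical, never built): the setter clamps to
 * [0, SCHED_MAX_BACKUP_PROCESSORS], so e.g. a sysctl handler can pass
 * user-supplied values through unchecked.
 */
#if 0	/* example only */
static void
example_tune_rt_backups(int requested)
{
	sched_set_rt_n_backup_processors(requested);
	assert(sched_get_rt_n_backup_processors() >= 0);
	assert(sched_get_rt_n_backup_processors() <= SCHED_MAX_BACKUP_PROCESSORS);
}
#endif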
1944 
1945 int sched_rt_runq_strict_priority = false;
1946 
1947 inline static processor_set_t
1948 change_locked_pset(processor_set_t current_pset, processor_set_t new_pset)
1949 {
1950 	if (current_pset != new_pset) {
1951 		pset_unlock(current_pset);
1952 		pset_lock(new_pset);
1953 	}
1954 
1955 	return new_pset;
1956 }
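
/*
 * Illustrative sketch (hypothetical, never built): lock-handoff iteration
 * over every pset in a node using change_locked_pset(), so at most one
 * pset lock is ever held. Re-locking the starting pset on the first
 * iteration is harmless: change_locked_pset() is a no-op for the same pset.
 */
#if 0	/* example only */
static void
example_walk_psets(pset_node_t node)
{
	processor_set_t pset = pset_array[lsb_first(node->pset_map)];

	pset_lock(pset);
	for (int id = lsb_first(node->pset_map); id >= 0; id = lsb_next(node->pset_map, id)) {
		pset = change_locked_pset(pset, pset_array[id]);
		/* ... inspect pset under its lock ... */
	}
	pset_unlock(pset);
}
#endif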
1957 
1958 /*
1959  * Invoked prior to idle entry to determine if, on SMT capable processors, an SMT
1960  * rebalancing opportunity exists when a core is (instantaneously) idle, but
1961  * other SMT-capable cores may be over-committed. TODO: some possible negatives:
1962  * - IPI thrash if this core does not remain idle following the load balancing ASTs;
1963  * - idle "thrash", when IPI issue is followed by idle entry/core power down
1964  *   followed by a wakeup shortly thereafter.
1965  */
1966 
1967 #if (DEVELOPMENT || DEBUG)
1968 int sched_smt_balance = 1;
1969 #endif
1970 
1971 /* Invoked with pset locked, returns with pset unlocked */
1972 void
1973 sched_SMT_balance(processor_t cprocessor, processor_set_t cpset)
1974 {
1975 	processor_t ast_processor = NULL;
1976 
1977 #if (DEVELOPMENT || DEBUG)
1978 	if (__improbable(sched_smt_balance == 0)) {
1979 		goto smt_balance_exit;
1980 	}
1981 #endif
1982 
1983 	assert(cprocessor == current_processor());
1984 	if (cprocessor->is_SMT == FALSE) {
1985 		goto smt_balance_exit;
1986 	}
1987 
1988 	processor_t sib_processor = cprocessor->processor_secondary ? cprocessor->processor_secondary : cprocessor->processor_primary;
1989 
1990 	/* Determine if both this processor and its sibling are idle,
1991 	 * indicating an SMT rebalancing opportunity.
1992 	 */
1993 	if (sib_processor->state != PROCESSOR_IDLE) {
1994 		goto smt_balance_exit;
1995 	}
1996 
1997 	processor_t sprocessor;
1998 
1999 	sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
2000 	uint64_t running_secondary_map = (cpset->cpu_state_map[PROCESSOR_RUNNING] &
2001 	    ~cpset->primary_map);
2002 	for (int cpuid = lsb_first(running_secondary_map); cpuid >= 0; cpuid = lsb_next(running_secondary_map, cpuid)) {
2003 		sprocessor = processor_array[cpuid];
2004 		if ((sprocessor->processor_primary->state == PROCESSOR_RUNNING) &&
2005 		    (sprocessor->current_pri < BASEPRI_RTQUEUES)) {
2006 			ipi_type = sched_ipi_action(sprocessor, NULL, SCHED_IPI_EVENT_SMT_REBAL);
2007 			if (ipi_type != SCHED_IPI_NONE) {
2008 				assert(sprocessor != cprocessor);
2009 				ast_processor = sprocessor;
2010 				break;
2011 			}
2012 		}
2013 	}
2014 
2015 smt_balance_exit:
2016 	pset_unlock(cpset);
2017 
2018 	if (ast_processor) {
2019 		KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_SMT_BALANCE), ast_processor->cpu_id, ast_processor->state, ast_processor->processor_primary->state, 0, 0);
2020 		sched_ipi_perform(ast_processor, ipi_type);
2021 	}
2022 }
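
/*
 * Illustrative sketch (hypothetical, never built): the calling convention
 * for the balance callout on the way into idle. The pset is locked across
 * the call and is always unlocked on return, as noted above.
 */
#if 0	/* example only */
static void
example_idle_entry(processor_t processor)
{
	processor_set_t pset = processor->processor_set;

	pset_lock(pset);
	SCHED(processor_balance)(processor, pset);	/* e.g. sched_SMT_balance() */
	/* pset is unlocked here */
}
#endif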
2023 
2024 static cpumap_t
2025 pset_available_cpumap(processor_set_t pset)
2026 {
2027 	return (pset->cpu_state_map[PROCESSOR_IDLE] | pset->cpu_state_map[PROCESSOR_DISPATCHING] | pset->cpu_state_map[PROCESSOR_RUNNING]) &
2028 	       pset->recommended_bitmask;
2029 }
2030 
2031 int
2032 pset_available_cpu_count(processor_set_t pset)
2033 {
2034 	return bit_count(pset_available_cpumap(pset));
2035 }
2036 
2037 static cpumap_t
2038 pset_available_but_not_running_cpumap(processor_set_t pset)
2039 {
2040 	return (pset->cpu_state_map[PROCESSOR_IDLE] | pset->cpu_state_map[PROCESSOR_DISPATCHING]) &
2041 	       pset->recommended_bitmask;
2042 }
2043 
2044 bool
2045 pset_has_stealable_threads(processor_set_t pset)
2046 {
2047 	pset_assert_locked(pset);
2048 
2049 	cpumap_t avail_map = pset_available_but_not_running_cpumap(pset);
2050 	/*
2051 	 * Secondary CPUs never steal, so allow stealing of threads if there are more threads than
2052 	 * available primary CPUs
2053 	 */
2054 	avail_map &= pset->primary_map;
2055 
2056 	return (pset->pset_runq.count > 0) && ((pset->pset_runq.count + rt_runq_count(pset)) > bit_count(avail_map));
2057 }
2058 
2059 static cpumap_t
2060 pset_available_but_not_running_rt_threads_cpumap(processor_set_t pset)
2061 {
2062 	cpumap_t avail_map = pset_available_cpumap(pset);
2063 	if (!sched_allow_rt_smt) {
2064 		/*
2065 		 * Secondary CPUs are not allowed to run RT threads, so
2066 		 * only primary CPUs should be included
2067 		 */
2068 		avail_map &= pset->primary_map;
2069 	}
2070 
2071 	return avail_map & ~pset->realtime_map;
2072 }
2073 
2074 static bool
2075 pset_needs_a_followup_IPI(processor_set_t pset)
2076 {
2077 	int nbackup_cpus = 0;
2078 
2079 	if (rt_runq_is_low_latency(pset)) {
2080 		nbackup_cpus = sched_rt_n_backup_processors;
2081 	}
2082 
2083 	int rt_rq_count = rt_runq_count(pset);
2084 
2085 	return (rt_rq_count > 0) && ((rt_rq_count + nbackup_cpus - bit_count(pset->pending_AST_URGENT_cpu_mask)) > 0);
2086 }
2087 
2088 bool
2089 pset_has_stealable_rt_threads(processor_set_t pset)
2090 {
2091 	pset_node_t node = pset->node;
2092 	if (bit_count(node->pset_map) == 1) {
2093 		return false;
2094 	}
2095 
2096 	cpumap_t avail_map = pset_available_but_not_running_rt_threads_cpumap(pset);
2097 
2098 	return rt_runq_count(pset) > bit_count(avail_map);
2099 }
2100 
2101 static void
2102 pset_update_rt_stealable_state(processor_set_t pset)
2103 {
2104 	if (pset_has_stealable_rt_threads(pset)) {
2105 		pset->stealable_rt_threads_earliest_deadline = rt_runq_earliest_deadline(pset);
2106 	} else {
2107 		pset->stealable_rt_threads_earliest_deadline = RT_DEADLINE_NONE;
2108 	}
2109 }
2110 
2111 static void
2112 clear_pending_AST_bits(processor_set_t pset, processor_t processor, __kdebug_only const int trace_point_number)
2113 {
2114 	/* Acknowledge any pending IPIs here with pset lock held */
2115 	pset_assert_locked(pset);
2116 	if (bit_clear_if_set(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
2117 		KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_END,
2118 		    processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, 0, trace_point_number);
2119 	}
2120 	bit_clear(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
2121 
2122 #if defined(CONFIG_SCHED_DEFERRED_AST)
2123 	bit_clear(pset->pending_deferred_AST_cpu_mask, processor->cpu_id);
2124 #endif
2125 }
2126 
2127 /*
2128  * Called with pset locked, on a processor that is committing to run a new thread
2129  * Will transition an idle or dispatching processor to running as it picks up
2130  * the first new thread from the idle thread.
2131  */
2132 static void
2133 pset_commit_processor_to_new_thread(processor_set_t pset, processor_t processor, thread_t new_thread)
2134 {
2135 	pset_assert_locked(pset);
2136 
2137 	if (processor->state == PROCESSOR_DISPATCHING || processor->state == PROCESSOR_IDLE) {
2138 		assert(current_thread() == processor->idle_thread);
2139 
2140 		/*
2141 		 * Dispatching processor is now committed to running new_thread,
2142 		 * so change its state to PROCESSOR_RUNNING.
2143 		 */
2144 		pset_update_processor_state(pset, processor, PROCESSOR_RUNNING);
2145 	} else {
2146 		assert((processor->state == PROCESSOR_RUNNING) || (processor->state == PROCESSOR_SHUTDOWN));
2147 	}
2148 
2149 	processor_state_update_from_thread(processor, new_thread, true);
2150 
2151 	if (new_thread->sched_pri >= BASEPRI_RTQUEUES) {
2152 		bit_set(pset->realtime_map, processor->cpu_id);
2153 	} else {
2154 		bit_clear(pset->realtime_map, processor->cpu_id);
2155 	}
2156 	pset_update_rt_stealable_state(pset);
2157 
2158 	pset_node_t node = pset->node;
2159 
2160 	if (bit_count(node->pset_map) == 1) {
2161 		/* Node has only a single pset, so skip node pset map updates */
2162 		return;
2163 	}
2164 
2165 	cpumap_t avail_map = pset_available_cpumap(pset);
2166 
2167 	if (new_thread->sched_pri >= BASEPRI_RTQUEUES) {
2168 		if ((avail_map & pset->realtime_map) == avail_map) {
2169 			/* No more non-RT CPUs in this pset */
2170 			atomic_bit_clear(&node->pset_non_rt_map, pset->pset_id, memory_order_relaxed);
2171 		}
2172 		avail_map &= pset->primary_map;
2173 		if ((avail_map & pset->realtime_map) == avail_map) {
2174 			/* No more non-RT primary CPUs in this pset */
2175 			atomic_bit_clear(&node->pset_non_rt_primary_map, pset->pset_id, memory_order_relaxed);
2176 		}
2177 	} else {
2178 		if ((avail_map & pset->realtime_map) != avail_map) {
2179 			if (!bit_test(atomic_load(&node->pset_non_rt_map), pset->pset_id)) {
2180 				atomic_bit_set(&node->pset_non_rt_map, pset->pset_id, memory_order_relaxed);
2181 			}
2182 		}
2183 		avail_map &= pset->primary_map;
2184 		if ((avail_map & pset->realtime_map) != avail_map) {
2185 			if (!bit_test(atomic_load(&node->pset_non_rt_primary_map), pset->pset_id)) {
2186 				atomic_bit_set(&node->pset_non_rt_primary_map, pset->pset_id, memory_order_relaxed);
2187 			}
2188 		}
2189 	}
2190 }
2191 
2192 static processor_t choose_processor_for_realtime_thread(processor_set_t pset, processor_t skip_processor, bool consider_secondaries, bool skip_spills);
2193 static processor_t choose_furthest_deadline_processor_for_realtime_thread(processor_set_t pset, int max_pri, uint64_t minimum_deadline,
2194     processor_t skip_processor, bool skip_spills, bool include_ast_urgent_pending_cpus);
2195 static processor_t choose_next_processor_for_realtime_thread(processor_set_t pset, int max_pri, uint64_t minimum_deadline, processor_t skip_processor, bool consider_secondaries);
2196 #if defined(__x86_64__)
2197 static bool all_available_primaries_are_running_realtime_threads(processor_set_t pset, bool include_backups);
2198 static bool these_processors_are_running_realtime_threads(processor_set_t pset, uint64_t these_map, bool include_backups);
2199 #endif
2200 static bool sched_ok_to_run_realtime_thread(processor_set_t pset, processor_t processor, bool as_backup);
2201 static bool processor_is_fast_track_candidate_for_realtime_thread(processor_set_t pset, processor_t processor);
2202 
2203 static bool
2204 other_psets_have_earlier_rt_threads_pending(processor_set_t stealing_pset, uint64_t earliest_deadline)
2205 {
2206 	pset_map_t pset_map = stealing_pset->node->pset_map;
2207 
2208 	bit_clear(pset_map, stealing_pset->pset_id);
2209 
2210 	for (int pset_id = lsb_first(pset_map); pset_id >= 0; pset_id = lsb_next(pset_map, pset_id)) {
2211 		processor_set_t nset = pset_array[pset_id];
2212 
2213 		if (deadline_add(nset->stealable_rt_threads_earliest_deadline, rt_deadline_epsilon) < earliest_deadline) {
2214 			return true;
2215 		}
2216 	}
2217 
2218 	return false;
2219 }
2220 
2221 /*
2222  * Called with starting_pset locked; returns true if starting_pset was unlocked before return.
2223  */
2224 static bool
2225 choose_next_rt_processor_for_IPI(processor_set_t starting_pset, processor_t chosen_processor, bool spill_ipi,
2226     processor_t *result_processor, sched_ipi_type_t *result_ipi_type)
2227 {
2228 	bool starting_pset_is_unlocked = false;
2229 	uint64_t earliest_deadline = rt_runq_earliest_deadline(starting_pset);
2230 	int max_pri = rt_runq_priority(starting_pset);
2231 	__kdebug_only uint64_t spill_tid = thread_tid(rt_runq_first(&starting_pset->rt_runq));
2232 	if (rt_constraint_ll != 0) {
2233 		uint64_t ctime = mach_absolute_time();
2234 		if (earliest_deadline < rt_constraint_ll + ctime) {
2235 			earliest_deadline = rt_constraint_ll + ctime;
2236 		}
2237 	}
2238 	processor_set_t pset = starting_pset;
2239 	processor_t next_rt_processor = PROCESSOR_NULL;
2240 	if (spill_ipi) {
2241 		processor_set_t nset = next_pset(pset);
2242 		assert(nset != starting_pset);
2243 		pset = change_locked_pset(pset, nset);
2244 		starting_pset_is_unlocked = true;
2245 	}
2246 	do {
2247 		const bool consider_secondaries = true;
2248 		next_rt_processor = choose_next_processor_for_realtime_thread(pset, max_pri, earliest_deadline, chosen_processor, consider_secondaries);
2249 		if (next_rt_processor == PROCESSOR_NULL) {
2250 			if (!spill_ipi) {
2251 				break;
2252 			}
2253 			processor_set_t nset = next_pset(pset);
2254 			if (nset == starting_pset) {
2255 				break;
2256 			}
2257 			pset = change_locked_pset(pset, nset);
2258 			starting_pset_is_unlocked = true;
2259 		}
2260 	} while (next_rt_processor == PROCESSOR_NULL);
2261 	if (next_rt_processor) {
2262 		if (pset != starting_pset) {
2263 			if (bit_set_if_clear(pset->rt_pending_spill_cpu_mask, next_rt_processor->cpu_id)) {
2264 				KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_SIGNAL_SPILL) | DBG_FUNC_START,
2265 				    next_rt_processor->cpu_id, pset->rt_pending_spill_cpu_mask, starting_pset->cpu_set_low, (uintptr_t)spill_tid);
2266 			}
2267 		}
2268 		*result_ipi_type = sched_ipi_action(next_rt_processor, NULL, SCHED_IPI_EVENT_RT_PREEMPT);
2269 		*result_processor = next_rt_processor;
2270 	}
2271 	if (pset != starting_pset) {
2272 		pset_unlock(pset);
2273 	}
2274 
2275 	return starting_pset_is_unlocked;
2276 }
2277 
2278 /*
2279  * backup processor - the target of a backup IPI sent by choose_processor in case the preferred processor can't immediately respond
2280  * followup processor - used in thread_select when there are still threads on the run queue and available processors
2281  * spill processor - a processor in a different processor set that is signalled to steal a thread from this run queue
2282  */
2283 typedef enum {
2284 	none,
2285 	backup,
2286 	followup,
2287 	spill
2288 } next_processor_type_t;
2289 
2290 #undef LOOP_COUNT
2291 #ifdef LOOP_COUNT
2292 int max_loop_count[MAX_SCHED_CPUS] = { 0 };
2293 #endif
2294 
2295 /*
2296  *	thread_select:
2297  *
2298  *	Select a new thread for the current processor to execute.
2299  *
2300  *	May select the current thread, which must be locked.
2301  */
2302 static thread_t
2303 thread_select(thread_t          thread,
2304     processor_t       processor,
2305     ast_t            *reason)
2306 {
2307 	processor_set_t         pset = processor->processor_set;
2308 	thread_t                        new_thread = THREAD_NULL;
2309 
2310 	assert(processor == current_processor());
2311 	assert((thread->state & (TH_RUN | TH_TERMINATE2)) == TH_RUN);
2312 
2313 	KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_START,
2314 	    0, pset->pending_AST_URGENT_cpu_mask, 0, 0);
2315 
2316 	__kdebug_only int idle_reason = 0;
2317 	__kdebug_only int delay_count = 0;
2318 
2319 #if defined(__x86_64__)
2320 	int timeout_count = sched_backup_cpu_timeout_count;
2321 	if ((sched_avoid_cpu0 == 1) && (processor->cpu_id == 0)) {
2322 		/* Prefer cpu0 as backup */
2323 		timeout_count--;
2324 	} else if ((sched_avoid_cpu0 == 2) && (processor->processor_primary != processor)) {
2325 		/* Prefer secondary cpu as backup */
2326 		timeout_count--;
2327 	}
2328 #endif
2329 	bool pending_AST_URGENT = false;
2330 	bool pending_AST_PREEMPT = false;
2331 
2332 #ifdef LOOP_COUNT
2333 	int loop_count = -1;
2334 #endif
2335 
2336 	do {
2337 		/*
2338 		 *	Update the priority.
2339 		 */
2340 		if (SCHED(can_update_priority)(thread)) {
2341 			SCHED(update_priority)(thread);
2342 		}
2343 
2344 		pset_lock(pset);
2345 
2346 restart:
2347 #ifdef LOOP_COUNT
2348 		loop_count++;
2349 		if (loop_count > max_loop_count[processor->cpu_id]) {
2350 			max_loop_count[processor->cpu_id] = loop_count;
2351 			if (bit_count(loop_count) == 1) {
2352 				kprintf("[%d]%s>max_loop_count = %d\n", processor->cpu_id, __FUNCTION__, loop_count);
2353 			}
2354 		}
2355 #endif
2356 		pending_AST_URGENT = bit_test(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id);
2357 		pending_AST_PREEMPT = bit_test(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
2358 
2359 		processor_state_update_from_thread(processor, thread, true);
2360 
2361 		idle_reason = 0;
2362 
2363 		processor_t ast_processor = PROCESSOR_NULL;
2364 		processor_t next_rt_processor = PROCESSOR_NULL;
2365 		sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
2366 		sched_ipi_type_t next_rt_ipi_type = SCHED_IPI_NONE;
2367 
2368 		assert(processor->state != PROCESSOR_OFF_LINE);
2369 
2370 		/*
2371 		 * Bound threads are dispatched to a processor without going through
2372 		 * choose_processor(), so in those cases we must continue trying to dequeue work
2373 		 * as we are the only option.
2374 		 */
2375 		if (!SCHED(processor_bound_count)(processor)) {
2376 			if (!processor->is_recommended) {
2377 				/*
2378 				 * The performance controller has provided a hint not to dispatch more threads.
2379 				 */
2380 				idle_reason = 1;
2381 				goto send_followup_ipi_before_idle;
2382 			} else if (rt_runq_count(pset)) {
2383 				bool ok_to_run_realtime_thread = sched_ok_to_run_realtime_thread(pset, processor, false);
2384 				/* Give the current RT thread a chance to complete */
2385 				ok_to_run_realtime_thread |= (thread->sched_pri >= BASEPRI_RTQUEUES && processor->first_timeslice);
2386 #if defined(__x86_64__)
2387 				/*
2388 				 * On Intel we want to avoid SMT secondary processors and processor 0
2389 				 * but allow them to be used as backup processors in case the preferred chosen
2390 				 * processor is delayed by interrupts or processor stalls.  So if it is
2391 				 * not ok_to_run_realtime_thread as preferred (sched_ok_to_run_realtime_thread(pset, processor, as_backup=false))
2392 				 * but ok_to_run_realtime_thread as backup (sched_ok_to_run_realtime_thread(pset, processor, as_backup=true))
2393 				 * we delay up to (timeout_count * 10us) to give the preferred processor chance
2394 				 * to grab the thread before the (current) backup processor does.
2395 				 *
2396 				 * timeout_count defaults to 5 but can be tuned using sysctl kern.sched_backup_cpu_timeout_count
2397 				 * on DEVELOPMENT || DEBUG kernels.  It is also adjusted (see above) depending on whether we want to use
2398 				 * cpu0 before secondary cpus or not.
2399 				 */
2400 				if (!ok_to_run_realtime_thread) {
2401 					if (sched_ok_to_run_realtime_thread(pset, processor, true)) {
2402 						if (timeout_count-- > 0) {
2403 							pset_unlock(pset);
2404 							thread_unlock(thread);
2405 							delay(10);
2406 							delay_count++;
2407 							thread_lock(thread);
2408 							pset_lock(pset);
2409 							goto restart;
2410 						}
2411 						ok_to_run_realtime_thread = true;
2412 					}
2413 				}
2414 #endif
2415 				if (!ok_to_run_realtime_thread) {
2416 					idle_reason = 2;
2417 					goto send_followup_ipi_before_idle;
2418 				}
2419 			} else if (processor->processor_primary != processor) {
2420 				/*
2421 				 * Should this secondary SMT processor attempt to find work? For pset runqueue systems,
2422 				 * we should look for work only under the same conditions that choose_processor()
2423 				 * would have assigned work, which is when all primary processors have been assigned work.
2424 				 */
2425 				if ((pset->recommended_bitmask & pset->primary_map & pset->cpu_state_map[PROCESSOR_IDLE]) != 0) {
2426 					/* There are idle primaries */
2427 					idle_reason = 3;
2428 					goto idle;
2429 				}
2430 			}
2431 		}
2432 
2433 		/*
2434 		 *	Test to see if the current thread should continue
2435 		 *	to run on this processor.  Must not be attempting to wait, and not
2436 		 *	bound to a different processor, nor be in the wrong
2437 		 *	processor set, nor be forced to context switch by TH_SUSP.
2438 		 *
2439 		 *	Note that there are never any RT threads in the regular runqueue.
2440 		 *
2441 		 *	This code is insanely tricky.
2442 		 */
2443 
2444 		/* i.e. not waiting, not TH_SUSP'ed */
2445 		bool still_running = ((thread->state & (TH_TERMINATE | TH_IDLE | TH_WAIT | TH_RUN | TH_SUSP)) == TH_RUN);
2446 
2447 		/*
2448 		 * Threads running on SMT processors are forced to context switch. Don't rebalance realtime threads.
2449 		 * TODO: This should check if it's worth it to rebalance, i.e. 'are there any idle primary processors'
2450 		 *       <rdar://problem/47907700>
2451 		 *
2452 		 * A yielding thread shouldn't be forced to context switch.
2453 		 */
2454 
2455 		bool is_yielding         = (*reason & AST_YIELD) == AST_YIELD;
2456 
2457 		bool needs_smt_rebalance = !is_yielding && thread->sched_pri < BASEPRI_RTQUEUES && processor->processor_primary != processor;
2458 
2459 		bool affinity_mismatch   = thread->affinity_set != AFFINITY_SET_NULL && thread->affinity_set->aset_pset != pset;
2460 
2461 		bool bound_elsewhere     = thread->bound_processor != PROCESSOR_NULL && thread->bound_processor != processor;
2462 
2463 		bool avoid_processor     = !is_yielding && SCHED(avoid_processor_enabled) && SCHED(thread_avoid_processor)(processor, thread);
2464 
2465 		bool ok_to_run_realtime_thread = sched_ok_to_run_realtime_thread(pset, processor, true);
2466 
2467 		bool current_thread_can_keep_running = (still_running && !needs_smt_rebalance && !affinity_mismatch && !bound_elsewhere && !avoid_processor);
2468 		if (current_thread_can_keep_running) {
2469 			/*
2470 			 * This thread is eligible to keep running on this processor.
2471 			 *
2472 			 * RT threads with un-expired quantum stay on processor,
2473 			 * unless there's a valid RT thread with an earlier deadline
2474 			 * and it is still ok_to_run_realtime_thread.
2475 			 */
2476 			if (thread->sched_pri >= BASEPRI_RTQUEUES && processor->first_timeslice) {
2477 				/*
2478 				 * Allow low latency realtime threads to keep running.
2479 				 * Pick a new RT thread only if ok_to_run_realtime_thread
2480 				 * (but the current thread is allowed to complete).
2481 				 */
2482 				if ((thread->realtime.constraint > rt_constraint_ll) && ok_to_run_realtime_thread) {
2483 					if (bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) {
2484 						goto pick_new_rt_thread;
2485 					}
2486 					if (rt_runq_priority(pset) > thread->sched_pri) {
2487 						if (sched_rt_runq_strict_priority) {
2488 							/* The next RT thread is better, so pick it off the runqueue. */
2489 							goto pick_new_rt_thread;
2490 						}
2491 
2492 						/*
2493 						 * See if the current lower priority thread can continue to run without causing
2494 						 * the higher priority thread on the runq queue to miss its deadline.
2495 						 */
2496 						thread_t hi_thread = rt_runq_first(SCHED(rt_runq)(pset));
2497 						if (thread->realtime.computation + hi_thread->realtime.computation + rt_deadline_epsilon >= hi_thread->realtime.constraint) {
2498 							/* The next RT thread is better, so pick it off the runqueue. */
2499 							goto pick_new_rt_thread;
2500 						}
2501 					} else if ((rt_runq_count(pset) > 0) && (deadline_add(rt_runq_earliest_deadline(pset), rt_deadline_epsilon) < thread->realtime.deadline)) {
2502 						/* The next RT thread is better, so pick it off the runqueue. */
2503 						goto pick_new_rt_thread;
2504 					}
2505 					if (other_psets_have_earlier_rt_threads_pending(pset, thread->realtime.deadline)) {
2506 						goto pick_new_rt_thread;
2507 					}
2508 				}
2509 
2510 				/* This is still the best RT thread to run. */
2511 				processor->deadline = thread->realtime.deadline;
2512 
2513 				sched_update_pset_load_average(pset, 0);
2514 
2515 				clear_pending_AST_bits(pset, processor, 1);
2516 
2517 				next_rt_processor = PROCESSOR_NULL;
2518 				next_rt_ipi_type = SCHED_IPI_NONE;
2519 
2520 				bool pset_unlocked = false;
2521 				__kdebug_only next_processor_type_t nptype = none;
2522 				if (sched_allow_rt_steal && pset_has_stealable_rt_threads(pset)) {
2523 					nptype = spill;
2524 					pset_unlocked = choose_next_rt_processor_for_IPI(pset, processor, true, &next_rt_processor, &next_rt_ipi_type);
2525 				} else if (pset_needs_a_followup_IPI(pset)) {
2526 					nptype = followup;
2527 					pset_unlocked = choose_next_rt_processor_for_IPI(pset, processor, false, &next_rt_processor, &next_rt_ipi_type);
2528 				}
2529 				if (!pset_unlocked) {
2530 					pset_unlock(pset);
2531 				}
2532 
2533 				if (next_rt_processor) {
2534 					KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_NEXT_PROCESSOR) | DBG_FUNC_NONE,
2535 					    next_rt_processor->cpu_id, next_rt_processor->state, nptype, 2);
2536 					sched_ipi_perform(next_rt_processor, next_rt_ipi_type);
2537 				}
2538 
2539 				KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
2540 				    (uintptr_t)thread_tid(thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 1);
2541 				return thread;
2542 			}
2543 
2544 			if ((rt_runq_count(pset) == 0) &&
2545 			    SCHED(processor_queue_has_priority)(processor, thread->sched_pri, TRUE) == FALSE) {
2546 				/* This thread is still the highest priority runnable (non-idle) thread */
2547 				processor->deadline = RT_DEADLINE_NONE;
2548 
2549 				sched_update_pset_load_average(pset, 0);
2550 
2551 				clear_pending_AST_bits(pset, processor, 2);
2552 
2553 				pset_unlock(pset);
2554 
2555 				KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
2556 				    (uintptr_t)thread_tid(thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 2);
2557 				return thread;
2558 			}
2559 		} else {
2560 			/*
2561 			 * This processor must context switch.
2562 			 * If it's due to a rebalance, we should aggressively find this thread a new home.
2563 			 */
2564 			if (needs_smt_rebalance || affinity_mismatch || bound_elsewhere || avoid_processor) {
2565 				*reason |= AST_REBALANCE;
2566 			}
2567 		}
2568 
2569 		bool secondary_forced_idle = ((processor->processor_secondary != PROCESSOR_NULL) &&
2570 		    (thread_no_smt(thread) || (thread->sched_pri >= BASEPRI_RTQUEUES)) &&
2571 		    (processor->processor_secondary->state == PROCESSOR_IDLE));
2572 
2573 		/* OK, so we're not going to run the current thread. Look at the RT queue. */
2574 		if (ok_to_run_realtime_thread) {
2575 pick_new_rt_thread:
2576 			new_thread = sched_rt_choose_thread(pset);
2577 			if (new_thread != THREAD_NULL) {
2578 				processor->deadline = new_thread->realtime.deadline;
2579 				pset_commit_processor_to_new_thread(pset, processor, new_thread);
2580 
2581 				clear_pending_AST_bits(pset, processor, 3);
2582 
2583 				if (processor->processor_secondary != NULL) {
2584 					processor_t sprocessor = processor->processor_secondary;
2585 					if ((sprocessor->state == PROCESSOR_RUNNING) || (sprocessor->state == PROCESSOR_DISPATCHING)) {
2586 						ipi_type = sched_ipi_action(sprocessor, NULL, SCHED_IPI_EVENT_SMT_REBAL);
2587 						ast_processor = sprocessor;
2588 					}
2589 				}
2590 			}
2591 		}
2592 
2593 send_followup_ipi_before_idle:
2594 		/* This might not have been cleared if we didn't call sched_rt_choose_thread() */
2595 		if (bit_clear_if_set(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) {
2596 			KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_SIGNAL_SPILL) | DBG_FUNC_END, processor->cpu_id, pset->rt_pending_spill_cpu_mask, 0, 5);
2597 		}
2598 		__kdebug_only next_processor_type_t nptype = none;
2599 		bool pset_unlocked = false;
2600 		if (sched_allow_rt_steal && pset_has_stealable_rt_threads(pset)) {
2601 			nptype = spill;
2602 			pset_unlocked = choose_next_rt_processor_for_IPI(pset, processor, true, &next_rt_processor, &next_rt_ipi_type);
2603 		} else if (pset_needs_a_followup_IPI(pset)) {
2604 			nptype = followup;
2605 			pset_unlocked = choose_next_rt_processor_for_IPI(pset, processor, false, &next_rt_processor, &next_rt_ipi_type);
2606 		}
2607 
2608 		assert(new_thread || !ast_processor);
2609 		if (new_thread || next_rt_processor) {
2610 			if (!pset_unlocked) {
2611 				pset_unlock(pset);
2612 				pset_unlocked = true;
2613 			}
2614 			if (ast_processor == next_rt_processor) {
2615 				ast_processor = PROCESSOR_NULL;
2616 				ipi_type = SCHED_IPI_NONE;
2617 			}
2618 
2619 			if (ast_processor) {
2620 				sched_ipi_perform(ast_processor, ipi_type);
2621 			}
2622 
2623 			if (next_rt_processor) {
2624 				KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_NEXT_PROCESSOR) | DBG_FUNC_NONE,
2625 				    next_rt_processor->cpu_id, next_rt_processor->state, nptype, 3);
2626 				sched_ipi_perform(next_rt_processor, next_rt_ipi_type);
2627 			}
2628 
2629 			if (new_thread) {
2630 				KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
2631 				    (uintptr_t)thread_tid(new_thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 3);
2632 				return new_thread;
2633 			}
2634 		}
2635 
2636 		if (pset_unlocked) {
2637 			pset_lock(pset);
2638 		}
2639 
2640 		if (!pending_AST_URGENT && bit_test(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
2641 			/* Things changed while we dropped the lock */
2642 			goto restart;
2643 		}
2644 
2645 		if (processor->is_recommended) {
2646 			bool spill_pending = bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id);
2647 			if (sched_ok_to_run_realtime_thread(pset, processor, true) && (spill_pending || rt_runq_count(pset))) {
2648 				/* Things changed while we dropped the lock */
2649 				goto restart;
2650 			}
2651 
2652 			if ((processor->processor_primary != processor) && (processor->processor_primary->current_pri >= BASEPRI_RTQUEUES)) {
2653 				/* secondary can only run realtime thread */
2654 				if (idle_reason == 0) {
2655 					idle_reason = 4;
2656 				}
2657 				goto idle;
2658 			}
2659 		} else if (!SCHED(processor_bound_count)(processor)) {
2660 			/* processor not recommended and no bound threads */
2661 			if (idle_reason == 0) {
2662 				idle_reason = 5;
2663 			}
2664 			goto idle;
2665 		}
2666 
2667 		processor->deadline = RT_DEADLINE_NONE;
2668 
2669 		/* No RT threads, so let's look at the regular threads. */
2670 		if ((new_thread = SCHED(choose_thread)(processor, MINPRI, *reason)) != THREAD_NULL) {
2671 			pset_commit_processor_to_new_thread(pset, processor, new_thread);
2672 
2673 			clear_pending_AST_bits(pset, processor, 4);
2674 
2675 			ast_processor = PROCESSOR_NULL;
2676 			ipi_type = SCHED_IPI_NONE;
2677 
2678 			processor_t sprocessor = processor->processor_secondary;
2679 			if (sprocessor != NULL) {
2680 				if (sprocessor->state == PROCESSOR_RUNNING) {
2681 					if (thread_no_smt(new_thread)) {
2682 						ipi_type = sched_ipi_action(sprocessor, NULL, SCHED_IPI_EVENT_SMT_REBAL);
2683 						ast_processor = sprocessor;
2684 					}
2685 				} else if (secondary_forced_idle && !thread_no_smt(new_thread) && pset_has_stealable_threads(pset)) {
2686 					ipi_type = sched_ipi_action(sprocessor, NULL, SCHED_IPI_EVENT_PREEMPT);
2687 					ast_processor = sprocessor;
2688 				}
2689 			}
2690 			pset_unlock(pset);
2691 
2692 			if (ast_processor) {
2693 				sched_ipi_perform(ast_processor, ipi_type);
2694 			}
2695 			KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
2696 			    (uintptr_t)thread_tid(new_thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 4);
2697 			return new_thread;
2698 		}
2699 
2700 		if (processor->must_idle) {
2701 			processor->must_idle = false;
2702 			*reason |= AST_REBALANCE;
2703 			idle_reason = 6;
2704 			goto idle;
2705 		}
2706 
2707 		if (SCHED(steal_thread_enabled)(pset) && (processor->processor_primary == processor)) {
2708 			/*
2709 			 * No runnable threads, attempt to steal
2710 			 * from other processors. Returns with pset lock dropped.
2711 			 */
2712 
2713 			if ((new_thread = SCHED(steal_thread)(pset)) != THREAD_NULL) {
2714 				pset_lock(pset);
2715 				pset_commit_processor_to_new_thread(pset, processor, new_thread);
2716 				if (!pending_AST_URGENT && bit_test(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
2717 					/*
2718 					 * A realtime thread chose this processor while it was DISPATCHING
2719 					 * and the pset lock was dropped
2720 					 */
2721 					ast_on(AST_URGENT | AST_PREEMPT);
2722 				}
2723 
2724 				clear_pending_AST_bits(pset, processor, 5);
2725 
2726 				pset_unlock(pset);
2727 
2728 				KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
2729 				    (uintptr_t)thread_tid(new_thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 5);
2730 				return new_thread;
2731 			}
2732 
2733 			/*
2734 			 * If other threads have appeared, shortcut
2735 			 * around again.
2736 			 */
2737 			if (SCHED(processor_bound_count)(processor)) {
2738 				continue;
2739 			}
2740 			if (processor->is_recommended) {
2741 				if (!SCHED(processor_queue_empty)(processor) || (sched_ok_to_run_realtime_thread(pset, processor, true) && (rt_runq_count(pset) > 0))) {
2742 					continue;
2743 				}
2744 			}
2745 
2746 			pset_lock(pset);
2747 		}
2748 
2749 idle:
2750 		/* Someone selected this processor while we had dropped the lock */
2751 		if ((!pending_AST_URGENT && bit_test(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) ||
2752 		    (!pending_AST_PREEMPT && bit_test(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id))) {
2753 			goto restart;
2754 		}
2755 
2756 		if ((idle_reason == 0) && current_thread_can_keep_running) {
2757 			/* This thread is the only runnable (non-idle) thread */
2758 			if (thread->sched_pri >= BASEPRI_RTQUEUES) {
2759 				processor->deadline = thread->realtime.deadline;
2760 			} else {
2761 				processor->deadline = RT_DEADLINE_NONE;
2762 			}
2763 
2764 			sched_update_pset_load_average(pset, 0);
2765 
2766 			clear_pending_AST_bits(pset, processor, 6);
2767 
2768 			KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
2769 			    (uintptr_t)thread_tid(thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 6);
2770 			pset_unlock(pset);
2771 			return thread;
2772 		}
2773 
2774 		/*
2775 		 *	Nothing is runnable, or this processor must be forced idle,
2776 		 *	so set this processor idle if it was running.
2777 		 */
2778 		if ((processor->state == PROCESSOR_RUNNING) || (processor->state == PROCESSOR_DISPATCHING)) {
2779 			pset_update_processor_state(pset, processor, PROCESSOR_IDLE);
2780 			processor_state_update_idle(processor);
2781 		}
2782 		pset_update_rt_stealable_state(pset);
2783 
2784 		clear_pending_AST_bits(pset, processor, 7);
2785 
2786 		/* Invoked with pset locked, returns with pset unlocked */
2787 		SCHED(processor_balance)(processor, pset);
2788 
2789 		new_thread = processor->idle_thread;
2790 	} while (new_thread == THREAD_NULL);
2791 
2792 	KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
2793 	    (uintptr_t)thread_tid(new_thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 10 + idle_reason);
2794 	return new_thread;
2795 }
2796 
2797 /*
2798  * thread_invoke
2799  *
2800  * Called at splsched with neither thread locked.
2801  *
2802  * Perform a context switch and start executing the new thread.
2803  *
2804  * Returns FALSE when the context switch didn't happen.
2805  * The reference to the new thread is still consumed.
2806  *
2807  * "self" is what is currently running on the processor,
2808  * "thread" is the new thread to context switch to
2809  * (which may be the same thread in some cases)
2810  */
2811 static boolean_t
2812 thread_invoke(
2813 	thread_t                        self,
2814 	thread_t                        thread,
2815 	ast_t                           reason)
2816 {
2817 	if (__improbable(get_preemption_level() != 0)) {
2818 		int pl = get_preemption_level();
2819 		panic("thread_invoke: preemption_level %d, possible cause: %s",
2820 		    pl, (pl < 0 ? "unlocking an unlocked mutex or spinlock" :
2821 		    "blocking while holding a spinlock, or within interrupt context"));
2822 	}
2823 
2824 	thread_continue_t       continuation = self->continuation;
2825 	void                    *parameter   = self->parameter;
2826 
2827 	uint64_t                ctime = mach_absolute_time();
2828 
2829 	check_monotonic_time(ctime);
2830 
2831 #ifdef CONFIG_MACH_APPROXIMATE_TIME
2832 	commpage_update_mach_approximate_time(ctime);
2833 #endif
2834 
2835 	if (ctime < thread->last_made_runnable_time) {
2836 		panic("Non-monotonic time: invoke at 0x%llx, runnable at 0x%llx",
2837 		    ctime, thread->last_made_runnable_time);
2838 	}
2839 
2840 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
2841 	if (!((thread->state & TH_IDLE) != 0 ||
2842 	    ((reason & AST_HANDOFF) && self->sched_mode == TH_MODE_REALTIME))) {
2843 		sched_timeshare_consider_maintenance(ctime);
2844 	}
2845 #endif
2846 
2847 #if MONOTONIC
2848 	mt_sched_update(self);
2849 #endif /* MONOTONIC */
2850 
2851 	assert_thread_magic(self);
2852 	assert(self == current_thread());
2853 	assert(self->runq == PROCESSOR_NULL);
2854 	assert((self->state & (TH_RUN | TH_TERMINATE2)) == TH_RUN);
2855 
2856 	thread_lock(thread);
2857 
2858 	assert_thread_magic(thread);
2859 	assert((thread->state & (TH_RUN | TH_WAIT | TH_UNINT | TH_TERMINATE | TH_TERMINATE2)) == TH_RUN);
2860 	assert(thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == current_processor());
2861 	assert(thread->runq == PROCESSOR_NULL);
2862 
2863 	/* Reload precise timing global policy to thread-local policy */
2864 	thread->precise_user_kernel_time = use_precise_user_kernel_time(thread);
2865 
2866 	/* Update SFI class based on other factors */
2867 	thread->sfi_class = sfi_thread_classify(thread);
2868 
2869 	/* Update the same_pri_latency for the thread (used by perfcontrol callouts) */
2870 	thread->same_pri_latency = ctime - thread->last_basepri_change_time;
2871 	/*
2872 	 * In case a base_pri update happened between the timestamp and
2873 	 * taking the thread lock
2874 	 */
2875 	if (ctime <= thread->last_basepri_change_time) {
2876 		thread->same_pri_latency = ctime - thread->last_made_runnable_time;
2877 	}
2878 
2879 	/* Allow realtime threads to hang onto a stack. */
2880 	if ((self->sched_mode == TH_MODE_REALTIME) && !self->reserved_stack) {
2881 		self->reserved_stack = self->kernel_stack;
2882 	}
2883 
2884 	/* Prepare for spin debugging */
2885 #if INTERRUPT_MASKED_DEBUG
2886 	ml_spin_debug_clear(thread);
2887 #endif
2888 
2889 	if (continuation != NULL) {
2890 		if (!thread->kernel_stack) {
2891 			/*
2892 			 * If we are using a privileged stack,
2893 			 * check to see whether we can exchange it with
2894 			 * that of the other thread.
2895 			 */
2896 			if (self->kernel_stack == self->reserved_stack && !thread->reserved_stack) {
2897 				goto need_stack;
2898 			}
2899 
2900 			/*
2901 			 * Context switch by performing a stack handoff.
2902 			 * Requires both threads to be parked in a continuation.
2903 			 */
2904 			continuation = thread->continuation;
2905 			parameter = thread->parameter;
2906 
2907 			processor_t processor = current_processor();
2908 			processor->active_thread = thread;
2909 			processor_state_update_from_thread(processor, thread, false);
2910 
2911 			if (thread->last_processor != processor && thread->last_processor != NULL) {
2912 				if (thread->last_processor->processor_set != processor->processor_set) {
2913 					thread->ps_switch++;
2914 				}
2915 				thread->p_switch++;
2916 			}
2917 			thread->last_processor = processor;
2918 			thread->c_switch++;
2919 			ast_context(thread);
2920 
2921 			thread_unlock(thread);
2922 
2923 			self->reason = reason;
2924 
2925 			processor->last_dispatch = ctime;
2926 			self->last_run_time = ctime;
2927 			processor_timer_switch_thread(ctime, &thread->system_timer);
2928 			timer_update(&thread->runnable_timer, ctime);
2929 			processor->kernel_timer = &thread->system_timer;
2930 
2931 			/*
2932 			 * Since non-precise user/kernel time doesn't update the state timer
2933 			 * during privilege transitions, synthesize an event now.
2934 			 */
2935 			if (!thread->precise_user_kernel_time) {
2936 				timer_update(processor->current_state, ctime);
2937 			}
2938 
2939 			KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2940 			    MACHDBG_CODE(DBG_MACH_SCHED, MACH_STACK_HANDOFF) | DBG_FUNC_NONE,
2941 			    self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);
2942 
2943 			if ((thread->chosen_processor != processor) && (thread->chosen_processor != PROCESSOR_NULL)) {
2944 				SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_MOVED) | DBG_FUNC_NONE,
2945 				    (uintptr_t)thread_tid(thread), (uintptr_t)thread->chosen_processor->cpu_id, 0, 0, 0);
2946 			}
2947 
2948 			DTRACE_SCHED2(off__cpu, struct thread *, thread, struct proc *, current_proc());
2949 
2950 			SCHED_STATS_CSW(processor, self->reason, self->sched_pri, thread->sched_pri);
2951 
2952 #if KPERF
2953 			kperf_off_cpu(self);
2954 #endif /* KPERF */
2955 
2956 			/*
2957 			 * This is where we actually switch thread identity,
2958 			 * and address space if required.  However, register
2959 			 * state is not switched - this routine leaves the
2960 			 * stack and register state active on the current CPU.
2961 			 */
2962 			TLOG(1, "thread_invoke: calling stack_handoff\n");
2963 			stack_handoff(self, thread);
2964 
2965 			/* 'self' is now off core */
2966 			assert(thread == current_thread_volatile());
2967 
2968 			DTRACE_SCHED(on__cpu);
2969 
2970 #if KPERF
2971 			kperf_on_cpu(thread, continuation, NULL);
2972 #endif /* KPERF */
2973 
2974 			thread_dispatch(self, thread);
2975 
2976 #if KASAN
2977 			/* Old thread's stack has been moved to the new thread, so explicitly
2978 			 * unpoison it. */
2979 			kasan_unpoison_stack(thread->kernel_stack, kernel_stack_size);
2980 #endif
2981 
2982 			thread->continuation = thread->parameter = NULL;
2983 
2984 			boolean_t enable_interrupts = TRUE;
2985 
2986 			/* idle thread needs to stay interrupts-disabled */
2987 			if ((thread->state & TH_IDLE)) {
2988 				enable_interrupts = FALSE;
2989 			}
2990 
2991 			assert(continuation);
2992 			call_continuation(continuation, parameter,
2993 			    thread->wait_result, enable_interrupts);
2994 			/*NOTREACHED*/
2995 		} else if (thread == self) {
2996 			/* same thread but with continuation */
2997 			ast_context(self);
2998 
2999 			thread_unlock(self);
3000 
3001 #if KPERF
3002 			kperf_on_cpu(thread, continuation, NULL);
3003 #endif /* KPERF */
3004 
3005 			KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3006 			    MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED) | DBG_FUNC_NONE,
3007 			    self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);
3008 
3009 #if KASAN
3010 			/* stack handoff to self - no thread_dispatch(), so clear the stack
3011 			 * and free the fakestack directly */
3012 			kasan_fakestack_drop(self);
3013 			kasan_fakestack_gc(self);
3014 			kasan_unpoison_stack(self->kernel_stack, kernel_stack_size);
3015 #endif
3016 
3017 			self->continuation = self->parameter = NULL;
3018 
3019 			boolean_t enable_interrupts = TRUE;
3020 
3021 			/* idle thread needs to stay interrupts-disabled */
3022 			if ((self->state & TH_IDLE)) {
3023 				enable_interrupts = FALSE;
3024 			}
3025 
3026 			call_continuation(continuation, parameter,
3027 			    self->wait_result, enable_interrupts);
3028 			/*NOTREACHED*/
3029 		}
3030 	} else {
3031 		/*
3032 		 * Check that the other thread has a stack
3033 		 */
3034 		if (!thread->kernel_stack) {
3035 need_stack:
3036 			if (!stack_alloc_try(thread)) {
3037 				thread_unlock(thread);
3038 				thread_stack_enqueue(thread);
3039 				return FALSE;
3040 			}
3041 		} else if (thread == self) {
3042 			ast_context(self);
3043 			thread_unlock(self);
3044 
3045 			KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3046 			    MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED) | DBG_FUNC_NONE,
3047 			    self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);
3048 
3049 			return TRUE;
3050 		}
3051 	}
3052 
3053 	/*
3054 	 * Context switch by full context save.
3055 	 */
3056 	processor_t processor = current_processor();
3057 	processor->active_thread = thread;
3058 	processor_state_update_from_thread(processor, thread, false);
3059 
3060 	if (thread->last_processor != processor && thread->last_processor != NULL) {
3061 		if (thread->last_processor->processor_set != processor->processor_set) {
3062 			thread->ps_switch++;
3063 		}
3064 		thread->p_switch++;
3065 	}
3066 	thread->last_processor = processor;
3067 	thread->c_switch++;
3068 	ast_context(thread);
3069 
3070 	thread_unlock(thread);
3071 
3072 	self->reason = reason;
3073 
3074 	processor->last_dispatch = ctime;
3075 	self->last_run_time = ctime;
3076 	processor_timer_switch_thread(ctime, &thread->system_timer);
3077 	timer_update(&thread->runnable_timer, ctime);
3078 	processor->kernel_timer = &thread->system_timer;
3079 
3080 	/*
3081 	 * Since non-precise user/kernel time doesn't update the state timer
3082 	 * during privilege transitions, synthesize an event now.
3083 	 */
3084 	if (!thread->precise_user_kernel_time) {
3085 		timer_update(processor->current_state, ctime);
3086 	}
3087 
3088 	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3089 	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED) | DBG_FUNC_NONE,
3090 	    self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);
3091 
3092 	if ((thread->chosen_processor != processor) && (thread->chosen_processor != NULL)) {
3093 		SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_MOVED) | DBG_FUNC_NONE,
3094 		    (uintptr_t)thread_tid(thread), (uintptr_t)thread->chosen_processor->cpu_id, 0, 0, 0);
3095 	}
3096 
3097 	DTRACE_SCHED2(off__cpu, struct thread *, thread, struct proc *, current_proc());
3098 
3099 	SCHED_STATS_CSW(processor, self->reason, self->sched_pri, thread->sched_pri);
3100 
3101 #if KPERF
3102 	kperf_off_cpu(self);
3103 #endif /* KPERF */
3104 
3105 	/*
3106 	 * This is where we actually switch register context,
3107 	 * and address space if required.  We will next run
3108 	 * as a result of a subsequent context switch.
3109 	 *
3110 	 * Once registers are switched and the processor is running "thread",
3111 	 * the stack variables and non-volatile registers will contain whatever
3112 	 * was there the last time that thread blocked. No local variables should
3113 	 * be used after this point, except for the special case of "thread", which
3114 	 * the platform layer returns as the previous thread running on the processor
3115 	 * via the function call ABI as a return register, and "self", which may have
3116 	 * been stored on the stack or a non-volatile register, but a stale idea of
3117 	 * what was on the CPU is newly-accurate because that thread is again
3118 	 * running on the CPU.
3119 	 *
3120 	 * If one of the threads is using a continuation, thread_continue
3121 	 * is used to stitch up its context.
3122 	 *
3123 	 * If we are invoking a thread which is resuming from a continuation,
3124 	 * the CPU will invoke thread_continue next.
3125 	 *
3126 	 * If the current thread is parking in a continuation, then its state
3127 	 * won't be saved and the stack will be discarded. When the stack is
3128 	 * re-allocated, it will be configured to resume from thread_continue.
3129 	 */
3130 
3131 	assert(continuation == self->continuation);
3132 	thread = machine_switch_context(self, continuation, thread);
3133 	assert(self == current_thread_volatile());
3134 	TLOG(1, "thread_invoke: returning machine_switch_context: self %p continuation %p thread %p\n", self, continuation, thread);
3135 
3136 	assert(continuation == NULL && self->continuation == NULL);
3137 
3138 	DTRACE_SCHED(on__cpu);
3139 
3140 #if KPERF
3141 	kperf_on_cpu(self, NULL, __builtin_frame_address(0));
3142 #endif /* KPERF */
3143 
3144 	/* We have been resumed and are set to run. */
3145 	thread_dispatch(thread, self);
3146 
3147 	return TRUE;
3148 }
3149 
3150 #if defined(CONFIG_SCHED_DEFERRED_AST)
3151 /*
3152  *	pset_cancel_deferred_dispatch:
3153  *
3154  *	Cancels all ASTs that we can cancel for the given processor set
3155  *	if the current processor is running the last runnable thread in the
3156  *	system.
3157  *
3158  *	This function assumes the current thread is runnable.  This must
3159  *	be called with the pset unlocked.
3160  */
3161 static void
3162 pset_cancel_deferred_dispatch(
3163 	processor_set_t         pset,
3164 	processor_t             processor)
3165 {
3166 	processor_t             active_processor = NULL;
3167 	uint32_t                sampled_sched_run_count;
3168 
3169 	pset_lock(pset);
3170 	sampled_sched_run_count = os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
3171 
3172 	/*
3173 	 * If we have emptied the run queue, and our current thread is runnable, we
3174 	 * should tell any processors that are still DISPATCHING that they will
3175 	 * probably not have any work to do.  In the event that there are no
3176 	 * pending signals that we can cancel, this is also uninteresting.
3177 	 *
3178 	 * In the unlikely event that another thread becomes runnable while we are
3179 	 * doing this (sched_run_count is atomically updated, not guarded), the
3180 	 * codepath making it runnable SHOULD (a dangerous word) need the pset lock
3181 	 * in order to dispatch it to a processor in our pset.  So, the other
3182 	 * codepath will wait while we squash all cancelable ASTs, get the pset
3183 	 * lock, and then dispatch the freshly runnable thread.  So this should be
3184 	 * correct (we won't accidentally have a runnable thread that hasn't been
3185 	 * dispatched to an idle processor), if not ideal (we may be restarting the
3186 	 * dispatch process, which could have some overhead).
3187 	 */
3188 
3189 	if ((sampled_sched_run_count == 1) && (pset->pending_deferred_AST_cpu_mask)) {
3190 		uint64_t dispatching_map = (pset->cpu_state_map[PROCESSOR_DISPATCHING] &
3191 		    pset->pending_deferred_AST_cpu_mask &
3192 		    ~pset->pending_AST_URGENT_cpu_mask);
3193 		for (int cpuid = lsb_first(dispatching_map); cpuid >= 0; cpuid = lsb_next(dispatching_map, cpuid)) {
3194 			active_processor = processor_array[cpuid];
3195 			/*
3196 			 * If a processor is DISPATCHING, it could be because of
3197 			 * a cancelable signal.
3198 			 *
3199 			 * IF the processor is not our
3200 			 * current processor (the current processor should not
3201 			 * be DISPATCHING, so this is a bit paranoid), AND there
3202 			 * is a cancelable signal pending on the processor, AND
3203 			 * there is no non-cancelable signal pending (as there is
3204 			 * no point trying to backtrack on bringing the processor
3205 			 * up if a signal we cannot cancel is outstanding), THEN
3206 			 * it should make sense to roll back the processor state
3207 			 * to the IDLE state.
3208 			 *
3209 			 * If the racy nature of this approach (as the signal
3210 			 * will be arbitrated by hardware, and can fire as we
3211 			 * roll back state) results in the core responding
3212 			 * despite being pushed back to the IDLE state, it
3213 			 * should be no different than if the core took some
3214 			 * interrupt while IDLE.
3215 			 */
3216 			if (active_processor != processor) {
3217 				/*
3218 				 * Squash all of the processor state back to some
3219 				 * reasonable facsimile of PROCESSOR_IDLE.
3220 				 */
3221 
3222 				processor_state_update_idle(active_processor);
3223 				active_processor->deadline = RT_DEADLINE_NONE;
3224 				pset_update_processor_state(pset, active_processor, PROCESSOR_IDLE);
3225 				bit_clear(pset->pending_deferred_AST_cpu_mask, active_processor->cpu_id);
3226 				machine_signal_idle_cancel(active_processor);
3227 			}
3228 		}
3229 	}
3230 
3231 	pset_unlock(pset);
3232 }
3233 #else
3234 /* We don't support deferred ASTs; everything is candycanes and sunshine. */
3235 #endif
3236 
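/*
 *	thread_csw_callout:
 *
 *	Notify the performance-control subsystem of a context switch,
 *	reporting an IDLE event when the incoming thread is the idle
 *	thread.
 */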
3237 static void
3238 thread_csw_callout(
3239 	thread_t            old,
3240 	thread_t            new,
3241 	uint64_t            timestamp)
3242 {
3243 	perfcontrol_event event = (new->state & TH_IDLE) ? IDLE : CONTEXT_SWITCH;
3244 	uint64_t same_pri_latency = (new->state & TH_IDLE) ? 0 : new->same_pri_latency;
3245 	machine_switch_perfcontrol_context(event, timestamp, 0,
3246 	    same_pri_latency, old, new);
3247 }
3248 
3249 
3250 /*
3251  *	thread_dispatch:
3252  *
3253  *	Handle threads at context switch.  Re-dispatch other thread
3254  *	if still running, otherwise update run state and perform
3255  *	special actions.  Update quantum for other thread and begin
3256  *	the quantum for ourselves.
3257  *
3258  *      "thread" is the old thread that we have switched away from.
3259  *      "self" is the new current thread that we have context switched to
3260  *
3261  *	Called at splsched.
3262  *
3263  */
3264 void
3265 thread_dispatch(
3266 	thread_t                thread,
3267 	thread_t                self)
3268 {
3269 	processor_t             processor = self->last_processor;
3270 	bool was_idle = false;
3271 
3272 	assert(processor == current_processor());
3273 	assert(self == current_thread_volatile());
3274 	assert(thread != self);
3275 
3276 	if (thread != THREAD_NULL) {
3277 		/*
3278 		 * Do the perfcontrol callout for context switch.
3279 		 * The reason we do this here is:
3280 		 * - thread_dispatch() is called from various places that are not
3281 		 *   the direct context switch path, e.g. processor shutdown.
3282 		 *   Adding the callout here covers all those cases.
3283 		 * - We want this callout as early as possible to be close
3284 		 *   to the timestamp taken in thread_invoke()
3285 		 * - We want to avoid holding the thread lock while doing the
3286 		 *   callout
3287 		 * - We do not want to callout if "thread" is NULL.
3288 		 */
3289 		thread_csw_callout(thread, self, processor->last_dispatch);
3290 
3291 #if KASAN
3292 		if (thread->continuation != NULL) {
3293 			/*
3294 			 * Thread has a continuation and the normal stack is going away.
3295 			 * Unpoison the stack and mark all fakestack objects as unused.
3296 			 */
3297 			kasan_fakestack_drop(thread);
3298 			if (thread->kernel_stack) {
3299 				kasan_unpoison_stack(thread->kernel_stack, kernel_stack_size);
3300 			}
3301 		}
3302 
3303 		/*
3304 		 * Free all unused fakestack objects.
3305 		 */
3306 		kasan_fakestack_gc(thread);
3307 #endif
3308 
3309 		/*
3310 		 *	If blocked at a continuation, discard
3311 		 *	the stack.
3312 		 */
3313 		if (thread->continuation != NULL && thread->kernel_stack != 0) {
3314 			stack_free(thread);
3315 		}
3316 
3317 		if (thread->state & TH_IDLE) {
3318 			was_idle = true;
3319 			KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3320 			    MACHDBG_CODE(DBG_MACH_SCHED, MACH_DISPATCH) | DBG_FUNC_NONE,
3321 			    (uintptr_t)thread_tid(thread), 0, thread->state,
3322 			    sched_run_buckets[TH_BUCKET_RUN], 0);
3323 		} else {
3324 			int64_t consumed;
3325 			int64_t remainder = 0;
3326 
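			/*
			 * Any unused quantum remains only if the quantum deadline
			 * lies beyond this dispatch; the CPU time consumed is the
			 * allotted quantum minus that remainder.
			 */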
3327 			if (processor->quantum_end > processor->last_dispatch) {
3328 				remainder = processor->quantum_end -
3329 				    processor->last_dispatch;
3330 			}
3331 
3332 			consumed = thread->quantum_remaining - remainder;
3333 
3334 			if ((thread->reason & AST_LEDGER) == 0) {
3335 				/*
3336 				 * Bill CPU time to both the task and
3337 				 * the individual thread.
3338 				 */
3339 				ledger_credit_thread(thread, thread->t_ledger,
3340 				    task_ledgers.cpu_time, consumed);
3341 				ledger_credit_thread(thread, thread->t_threadledger,
3342 				    thread_ledgers.cpu_time, consumed);
3343 				if (thread->t_bankledger) {
3344 					ledger_credit_thread(thread, thread->t_bankledger,
3345 					    bank_ledgers.cpu_time,
3346 					    (consumed - thread->t_deduct_bank_ledger_time));
3347 				}
3348 				thread->t_deduct_bank_ledger_time = 0;
3349 				if (consumed > 0) {
3350 					/*
3351 					 * This should never be negative, but in traces we are seeing some instances
3352 					 * of consumed being negative.
3353 					 * <rdar://problem/57782596> thread_dispatch() thread CPU consumed calculation sometimes results in negative value
3354 					 */
3355 					sched_update_pset_avg_execution_time(current_processor()->processor_set, consumed, processor->last_dispatch, thread->th_sched_bucket);
3356 				}
3357 			}
3358 
3359 			/* For the thread we just context switched away from, check
3360 			 * whether it has expired its workqueue quantum, and set the AST if so.
3361 			 */
3362 			if (thread_get_tag(thread) & THREAD_TAG_WORKQUEUE) {
3363 				thread_evaluate_workqueue_quantum_expiry(thread);
3364 			}
3365 
3366 			/*
3367 			 * Pairs with task_restartable_ranges_synchronize
3368 			 */
3369 			wake_lock(thread);
3370 			thread_lock(thread);
3371 
3372 			/*
3373 			 * Same as ast_check(), in case we missed the IPI
3374 			 */
3375 			thread_reset_pcs_ack_IPI(thread);
3376 
3377 			/*
3378 			 * Apply a priority floor if the thread holds a kernel resource
3379 			 * or explicitly requested it.
3380 			 * Do this before checking starting_pri to avoid overpenalizing
3381 			 * repeated rwlock blockers.
3382 			 */
3383 			if (__improbable(thread->rwlock_count != 0)) {
3384 				lck_rw_set_promotion_locked(thread);
3385 			}
3386 			if (__improbable(thread->priority_floor_count != 0)) {
3387 				thread_floor_boost_set_promotion_locked(thread);
3388 			}
3389 
3390 			boolean_t keep_quantum = processor->first_timeslice;
3391 
3392 			/*
3393 			 * Treat a thread which has dropped priority since it got on core
3394 			 * as having expired its quantum.
3395 			 */
3396 			if (processor->starting_pri > thread->sched_pri) {
3397 				keep_quantum = FALSE;
3398 			}
3399 
3400 			/* Compute remainder of current quantum. */
3401 			if (keep_quantum &&
3402 			    processor->quantum_end > processor->last_dispatch) {
3403 				thread->quantum_remaining = (uint32_t)remainder;
3404 			} else {
3405 				thread->quantum_remaining = 0;
3406 			}
3407 
3408 			if (thread->sched_mode == TH_MODE_REALTIME) {
3409 				/*
3410 				 *	Cancel the deadline if the thread has
3411 				 *	consumed the entire quantum.
3412 				 */
3413 				if (thread->quantum_remaining == 0) {
3414 					KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_CANCEL_RT_DEADLINE) | DBG_FUNC_NONE,
3415 					    (uintptr_t)thread_tid(thread), thread->realtime.deadline, thread->realtime.computation, 0);
3416 					thread->realtime.deadline = RT_DEADLINE_QUANTUM_EXPIRED;
3417 				}
3418 			} else {
3419 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
3420 				/*
3421 				 *	For non-realtime threads treat a tiny
3422 				 *	remaining quantum as an expired quantum
3423 				 *	but include what's left next time.
3424 				 */
3425 				if (thread->quantum_remaining < min_std_quantum) {
3426 					thread->reason |= AST_QUANTUM;
3427 					thread->quantum_remaining += SCHED(initial_quantum_size)(thread);
3428 				}
3429 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
3430 			}
3431 
3432 			/*
3433 			 *	If we are doing a direct handoff then
3434 			 *	take the remainder of the quantum.
3435 			 */
3436 			if ((thread->reason & (AST_HANDOFF | AST_QUANTUM)) == AST_HANDOFF) {
3437 				self->quantum_remaining = thread->quantum_remaining;
3438 				thread->reason |= AST_QUANTUM;
3439 				thread->quantum_remaining = 0;
3440 			} else {
3441 #if defined(CONFIG_SCHED_MULTIQ)
3442 				if (SCHED(sched_groups_enabled) &&
3443 				    thread->sched_group == self->sched_group) {
3444 					KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3445 					    MACHDBG_CODE(DBG_MACH_SCHED, MACH_QUANTUM_HANDOFF),
3446 					    self->reason, (uintptr_t)thread_tid(thread),
3447 					    self->quantum_remaining, thread->quantum_remaining, 0);
3448 
3449 					self->quantum_remaining = thread->quantum_remaining;
3450 					thread->quantum_remaining = 0;
3451 					/* Don't set AST_QUANTUM here - old thread might still want to preempt someone else */
3452 				}
3453 #endif /* defined(CONFIG_SCHED_MULTIQ) */
3454 			}
3455 
3456 			thread->computation_metered += (processor->last_dispatch - thread->computation_epoch);
3457 
3458 			if (!(thread->state & TH_WAIT)) {
3459 				/*
3460 				 *	Still runnable.
3461 				 */
3462 				thread->last_made_runnable_time = thread->last_basepri_change_time = processor->last_dispatch;
3463 
3464 				machine_thread_going_off_core(thread, FALSE, processor->last_dispatch, TRUE);
3465 
3466 				ast_t reason = thread->reason;
3467 				sched_options_t options = SCHED_NONE;
3468 
3469 				if (reason & AST_REBALANCE) {
3470 					options |= SCHED_REBALANCE;
3471 					if (reason & AST_QUANTUM) {
3472 						/*
3473 						 * Having gone to the trouble of forcing this thread off a less preferred core,
3474 						 * we should force the preferable core to reschedule immediately to give this
3475 						 * thread a chance to run instead of just sitting on the run queue where
3476 						 * it may just be stolen back by the idle core we just forced it off.
3477 						 * But only do this at the end of a quantum to prevent cascading effects.
3478 						 */
3479 						options |= SCHED_PREEMPT;
3480 					}
3481 				}
3482 
3483 				if (reason & AST_QUANTUM) {
3484 					options |= SCHED_TAILQ;
3485 				} else if (reason & AST_PREEMPT) {
3486 					options |= SCHED_HEADQ;
3487 				} else {
3488 					options |= (SCHED_PREEMPT | SCHED_TAILQ);
3489 				}
3490 
3491 				thread_setrun(thread, options);
3492 
3493 				KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3494 				    MACHDBG_CODE(DBG_MACH_SCHED, MACH_DISPATCH) | DBG_FUNC_NONE,
3495 				    (uintptr_t)thread_tid(thread), thread->reason, thread->state,
3496 				    sched_run_buckets[TH_BUCKET_RUN], 0);
3497 
3498 				if (thread->wake_active) {
3499 					thread->wake_active = FALSE;
3500 					thread_unlock(thread);
3501 
3502 					thread_wakeup(&thread->wake_active);
3503 				} else {
3504 					thread_unlock(thread);
3505 				}
3506 
3507 				wake_unlock(thread);
3508 			} else {
3509 				/*
3510 				 *	Waiting.
3511 				 */
3512 				boolean_t should_terminate = FALSE;
3513 				uint32_t new_run_count;
3514 				int thread_state = thread->state;
3515 
3516 				/* Only the first call to thread_dispatch
3517 				 * after explicit termination should add
3518 				 * the thread to the termination queue
3519 				 */
3520 				if ((thread_state & (TH_TERMINATE | TH_TERMINATE2)) == TH_TERMINATE) {
3521 					should_terminate = TRUE;
3522 					thread_state |= TH_TERMINATE2;
3523 				}
3524 
3525 				timer_stop(&thread->runnable_timer, processor->last_dispatch);
3526 
3527 				thread_state &= ~TH_RUN;
3528 				thread->state = thread_state;
3529 
3530 				thread->last_made_runnable_time = thread->last_basepri_change_time = THREAD_NOT_RUNNABLE;
3531 				thread->chosen_processor = PROCESSOR_NULL;
3532 
3533 				new_run_count = SCHED(run_count_decr)(thread);
3534 
3535 #if CONFIG_SCHED_AUTO_JOIN
3536 				if ((thread->sched_flags & TH_SFLAG_THREAD_GROUP_AUTO_JOIN) != 0) {
3537 					work_interval_auto_join_unwind(thread);
3538 				}
3539 #endif /* CONFIG_SCHED_AUTO_JOIN */
3540 
3541 #if CONFIG_SCHED_SFI
3542 				if (thread->reason & AST_SFI) {
3543 					thread->wait_sfi_begin_time = processor->last_dispatch;
3544 				}
3545 #endif
3546 				machine_thread_going_off_core(thread, should_terminate, processor->last_dispatch, FALSE);
3547 
3548 				KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3549 				    MACHDBG_CODE(DBG_MACH_SCHED, MACH_DISPATCH) | DBG_FUNC_NONE,
3550 				    (uintptr_t)thread_tid(thread), thread->reason, thread_state,
3551 				    new_run_count, 0);
3552 
3553 				if (thread_state & TH_WAIT_REPORT) {
3554 					(*thread->sched_call)(SCHED_CALL_BLOCK, thread);
3555 				}
3556 
3557 				if (thread->wake_active) {
3558 					thread->wake_active = FALSE;
3559 					thread_unlock(thread);
3560 
3561 					thread_wakeup(&thread->wake_active);
3562 				} else {
3563 					thread_unlock(thread);
3564 				}
3565 
3566 				wake_unlock(thread);
3567 
3568 				if (should_terminate) {
3569 					thread_terminate_enqueue(thread);
3570 				}
3571 			}
3572 		}
3573 		/*
3574 		 * The thread could have been added to the termination queue, so it's
3575 		 * unsafe to use after this point.
3576 		 */
3577 		thread = THREAD_NULL;
3578 	}
3579 
3580 	int urgency = THREAD_URGENCY_NONE;
3581 	uint64_t latency = 0;
3582 
3583 	/* Update (new) current thread and reprogram running timers */
3584 	thread_lock(self);
3585 
3586 	if (!(self->state & TH_IDLE)) {
3587 		uint64_t        arg1, arg2;
3588 
3589 #if CONFIG_SCHED_SFI
3590 		ast_t                   new_ast;
3591 
3592 		new_ast = sfi_thread_needs_ast(self, NULL);
3593 
3594 		if (new_ast != AST_NONE) {
3595 			ast_on(new_ast);
3596 		}
3597 #endif
3598 
3599 		if (processor->last_dispatch < self->last_made_runnable_time) {
3600 			panic("Non-monotonic time: dispatch at 0x%llx, runnable at 0x%llx",
3601 			    processor->last_dispatch, self->last_made_runnable_time);
3602 		}
3603 
3604 		assert(self->last_made_runnable_time <= self->last_basepri_change_time);
3605 
3606 		latency = processor->last_dispatch - self->last_made_runnable_time;
3607 		assert(latency >= self->same_pri_latency);
3608 
3609 		urgency = thread_get_urgency(self, &arg1, &arg2);
3610 
3611 		thread_tell_urgency(urgency, arg1, arg2, latency, self);
3612 
3613 		/*
3614 		 *	Get a new quantum if none remaining.
3615 		 */
3616 		if (self->quantum_remaining == 0) {
3617 			thread_quantum_init(self);
3618 		}
3619 
3620 		/*
3621 		 *	Set up quantum timer and timeslice.
3622 		 */
3623 		processor->quantum_end = processor->last_dispatch +
3624 		    self->quantum_remaining;
3625 
3626 		running_timer_setup(processor, RUNNING_TIMER_QUANTUM, self,
3627 		    processor->quantum_end, processor->last_dispatch);
3628 		if (was_idle) {
3629 			/*
3630 			 * kperf's running timer is active whenever the idle thread for a
3631 			 * CPU is not running.
3632 			 */
3633 			kperf_running_setup(processor, processor->last_dispatch);
3634 		}
3635 		running_timers_activate(processor);
3636 		processor->first_timeslice = TRUE;
3637 	} else {
3638 		running_timers_deactivate(processor);
3639 		processor->first_timeslice = FALSE;
3640 		thread_tell_urgency(THREAD_URGENCY_NONE, 0, 0, 0, self);
3641 	}
3642 
3643 	assert(self->block_hint == kThreadWaitNone);
3644 	self->computation_epoch = processor->last_dispatch;
3645 	self->reason = AST_NONE;
3646 	processor->starting_pri = self->sched_pri;
3647 
3648 	thread_unlock(self);
3649 
3650 	machine_thread_going_on_core(self, urgency, latency, self->same_pri_latency,
3651 	    processor->last_dispatch);
3652 
3653 #if defined(CONFIG_SCHED_DEFERRED_AST)
3654 	/*
3655 	 * TODO: Can we state that redispatching our old thread is also
3656 	 * uninteresting?
3657 	 */
3658 	if ((os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed) == 1) && !(self->state & TH_IDLE)) {
3659 		pset_cancel_deferred_dispatch(processor->processor_set, processor);
3660 	}
3661 #endif
3662 }
3663 
3664 /*
3665  *	thread_block_reason:
3666  *
3667  *	Forces a reschedule, blocking the caller if a wait
3668  *	has been asserted.
3669  *
3670  *	If a continuation is specified, then thread_invoke will
3671  *	attempt to discard the thread's kernel stack.  When the
3672  *	thread resumes, it will execute the continuation function
3673  *	on a new kernel stack.
3674  */
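/*
 * Typical usage (sketch): a waiter asserts a wait on an event and then
 * blocks, e.g.
 *
 *	assert_wait(event, THREAD_UNINT);
 *	wait_result_t wr = thread_block(THREAD_CONTINUE_NULL);
 *
 * When a continuation is supplied instead of THREAD_CONTINUE_NULL,
 * control resumes in the continuation on a fresh kernel stack rather
 * than after the call.
 */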
3675 wait_result_t
3676 thread_block_reason(
3677 	thread_continue_t       continuation,
3678 	void                            *parameter,
3679 	ast_t                           reason)
3680 {
3681 	thread_t        self = current_thread();
3682 	processor_t     processor;
3683 	thread_t        new_thread;
3684 	spl_t           s;
3685 
3686 	s = splsched();
3687 
3688 	processor = current_processor();
3689 
3690 	/* If we're explicitly yielding, force a subsequent quantum */
3691 	if (reason & AST_YIELD) {
3692 		processor->first_timeslice = FALSE;
3693 	}
3694 
3695 	/* We're handling all scheduling ASTs */
3696 	ast_off(AST_SCHEDULING);
3697 
3698 #if PROC_REF_DEBUG
3699 	if ((continuation != NULL) && (get_threadtask(self) != kernel_task)) {
3700 		uthread_assert_zero_proc_refcount(get_bsdthread_info(self));
3701 	}
3702 #endif
3703 
3704 	self->continuation = continuation;
3705 	self->parameter = parameter;
3706 
3707 	if (self->state & ~(TH_RUN | TH_IDLE)) {
3708 		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3709 		    MACHDBG_CODE(DBG_MACH_SCHED, MACH_BLOCK),
3710 		    reason, VM_KERNEL_UNSLIDE(continuation), 0, 0, 0);
3711 	}
3712 
3713 	do {
3714 		thread_lock(self);
3715 		new_thread = thread_select(self, processor, &reason);
3716 		thread_unlock(self);
3717 	} while (!thread_invoke(self, new_thread, reason));
3718 
3719 	splx(s);
3720 
3721 	return self->wait_result;
3722 }
3723 
3724 /*
3725  *	thread_block:
3726  *
3727  *	Block the current thread if a wait has been asserted.
3728  */
3729 wait_result_t
3730 thread_block(
3731 	thread_continue_t       continuation)
3732 {
3733 	return thread_block_reason(continuation, NULL, AST_NONE);
3734 }
3735 
3736 wait_result_t
3737 thread_block_parameter(
3738 	thread_continue_t       continuation,
3739 	void                            *parameter)
3740 {
3741 	return thread_block_reason(continuation, parameter, AST_NONE);
3742 }
3743 
3744 /*
3745  *	thread_run:
3746  *
3747  *	Switch directly from the current thread to the
3748  *	new thread, handing off our quantum if appropriate.
3749  *
3750  *	New thread must be runnable, and not on a run queue.
3751  *
3752  *	Called at splsched.
3753  */
3754 int
3755 thread_run(
3756 	thread_t                        self,
3757 	thread_continue_t       continuation,
3758 	void                            *parameter,
3759 	thread_t                        new_thread)
3760 {
3761 	ast_t reason = AST_NONE;
3762 
3763 	if ((self->state & TH_IDLE) == 0) {
3764 		reason = AST_HANDOFF;
3765 	}
3766 
3767 	/*
3768 	 * If this thread hadn't been setrun'ed, it
3769 	 * might not have a chosen processor, so give it one
3770 	 */
3771 	if (new_thread->chosen_processor == NULL) {
3772 		new_thread->chosen_processor = current_processor();
3773 	}
3774 
3775 	self->continuation = continuation;
3776 	self->parameter = parameter;
3777 
3778 	while (!thread_invoke(self, new_thread, reason)) {
3779 		/* the handoff failed, so we have to fall back to the normal block path */
3780 		processor_t processor = current_processor();
3781 
3782 		reason = AST_NONE;
3783 
3784 		thread_lock(self);
3785 		new_thread = thread_select(self, processor, &reason);
3786 		thread_unlock(self);
3787 	}
3788 
3789 	return self->wait_result;
3790 }
3791 
3792 /*
3793  *	thread_continue:
3794  *
3795  *	Called at splsched when a thread first receives
3796  *	a new stack after a continuation.
3797  *
3798  *	Called with THREAD_NULL as the old thread when
3799  *	invoked by machine_load_context.
3800  */
3801 void
3802 thread_continue(
3803 	thread_t        thread)
3804 {
3805 	thread_t                self = current_thread();
3806 	thread_continue_t       continuation;
3807 	void                    *parameter;
3808 
3809 	DTRACE_SCHED(on__cpu);
3810 
3811 	continuation = self->continuation;
3812 	parameter = self->parameter;
3813 
3814 	assert(continuation != NULL);
3815 
3816 #if KPERF
3817 	kperf_on_cpu(self, continuation, NULL);
3818 #endif
3819 
3820 	thread_dispatch(thread, self);
3821 
3822 	self->continuation = self->parameter = NULL;
3823 
3824 #if INTERRUPT_MASKED_DEBUG
3825 	/* Reset interrupt-masked spin debugging timeout */
3826 	ml_spin_debug_clear(self);
3827 #endif
3828 
3829 	TLOG(1, "thread_continue: calling call_continuation\n");
3830 
3831 	boolean_t enable_interrupts = TRUE;
3832 
3833 	/* The bootstrap thread and the idle thread must stay interrupts-disabled */
3834 	if (thread == THREAD_NULL || (self->state & TH_IDLE)) {
3835 		enable_interrupts = FALSE;
3836 	}
3837 
3838 	call_continuation(continuation, parameter, self->wait_result, enable_interrupts);
3839 	/*NOTREACHED*/
3840 }
3841 
3842 void
3843 thread_quantum_init(thread_t thread)
3844 {
3845 	if (thread->sched_mode == TH_MODE_REALTIME) {
3846 		thread->quantum_remaining = thread->realtime.computation;
3847 	} else {
3848 		thread->quantum_remaining = SCHED(initial_quantum_size)(thread);
3849 	}
3850 }
3851 
3852 uint32_t
3853 sched_timeshare_initial_quantum_size(thread_t thread)
3854 {
3855 	if ((thread != THREAD_NULL) && thread->th_sched_bucket == TH_BUCKET_SHARE_BG) {
3856 		return bg_quantum;
3857 	} else {
3858 		return std_quantum;
3859 	}
3860 }
3861 
3862 /*
3863  *	run_queue_init:
3864  *
3865  *	Initialize a run queue before first use.
3866  */
3867 void
3868 run_queue_init(
3869 	run_queue_t             rq)
3870 {
3871 	rq->highq = NOPRI;
3872 	for (u_int i = 0; i < BITMAP_LEN(NRQS); i++) {
3873 		rq->bitmap[i] = 0;
3874 	}
3875 	rq->urgency = rq->count = 0;
3876 	for (int i = 0; i < NRQS; i++) {
3877 		circle_queue_init(&rq->queues[i]);
3878 	}
3879 }
3880 
3881 /*
3882  *	run_queue_dequeue:
3883  *
3884  *	Perform a dequeue operation on a run queue,
3885  *	and return the resulting thread.
3886  *
3887  *	The run queue must be locked (see thread_run_queue_remove()
3888  *	for more info), and not empty.
3889  */
3890 thread_t
3891 run_queue_dequeue(
3892 	run_queue_t     rq,
3893 	sched_options_t options)
3894 {
3895 	thread_t        thread;
3896 	circle_queue_t  queue = &rq->queues[rq->highq];
3897 
3898 	if (options & SCHED_HEADQ) {
3899 		thread = cqe_dequeue_head(queue, struct thread, runq_links);
3900 	} else {
3901 		thread = cqe_dequeue_tail(queue, struct thread, runq_links);
3902 	}
3903 
3904 	assert(thread != THREAD_NULL);
3905 	assert_thread_magic(thread);
3906 
3907 	thread->runq = PROCESSOR_NULL;
3908 	SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
3909 	rq->count--;
3910 	if (SCHED(priority_is_urgent)(rq->highq)) {
3911 		rq->urgency--; assert(rq->urgency >= 0);
3912 	}
3913 	if (circle_queue_empty(queue)) {
3914 		bitmap_clear(rq->bitmap, rq->highq);
3915 		rq->highq = bitmap_first(rq->bitmap, NRQS);
3916 	}
3917 
3918 	return thread;
3919 }
3920 
3921 /*
3922  *	run_queue_enqueue:
3923  *
3924  *	Perform an enqueue operation on a run queue.
3925  *
3926  *	The run queue must be locked (see thread_run_queue_remove()
3927  *	for more info).
 *
 *	Returns TRUE if the enqueued thread established a new highest
 *	priority for the run queue.
3928  */
3929 boolean_t
3930 run_queue_enqueue(
3931 	run_queue_t      rq,
3932 	thread_t         thread,
3933 	sched_options_t  options)
3934 {
3935 	circle_queue_t  queue = &rq->queues[thread->sched_pri];
3936 	boolean_t       result = FALSE;
3937 
3938 	assert_thread_magic(thread);
3939 
3940 	if (circle_queue_empty(queue)) {
3941 		circle_enqueue_tail(queue, &thread->runq_links);
3942 
3943 		rq_bitmap_set(rq->bitmap, thread->sched_pri);
3944 		if (thread->sched_pri > rq->highq) {
3945 			rq->highq = thread->sched_pri;
3946 			result = TRUE;
3947 		}
3948 	} else {
3949 		if (options & SCHED_TAILQ) {
3950 			circle_enqueue_tail(queue, &thread->runq_links);
3951 		} else {
3952 			circle_enqueue_head(queue, &thread->runq_links);
3953 		}
3954 	}
3955 	if (SCHED(priority_is_urgent)(thread->sched_pri)) {
3956 		rq->urgency++;
3957 	}
3958 	SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
3959 	rq->count++;
3960 
3961 	return result;
3962 }
3963 
3964 /*
3965  *	run_queue_remove:
3966  *
3967  *	Remove a specific thread from a runqueue.
3968  *
3969  *	The run queue must be locked.
3970  */
3971 void
3972 run_queue_remove(
3973 	run_queue_t    rq,
3974 	thread_t       thread)
3975 {
3976 	circle_queue_t  queue = &rq->queues[thread->sched_pri];
3977 
3978 	assert(thread->runq != PROCESSOR_NULL);
3979 	assert_thread_magic(thread);
3980 
3981 	circle_dequeue(queue, &thread->runq_links);
3982 	SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
3983 	rq->count--;
3984 	if (SCHED(priority_is_urgent)(thread->sched_pri)) {
3985 		rq->urgency--; assert(rq->urgency >= 0);
3986 	}
3987 
3988 	if (circle_queue_empty(queue)) {
3989 		/* update run queue status */
3990 		bitmap_clear(rq->bitmap, thread->sched_pri);
3991 		rq->highq = bitmap_first(rq->bitmap, NRQS);
3992 	}
3993 
3994 	thread->runq = PROCESSOR_NULL;
3995 }
3996 
3997 /*
3998  *      run_queue_peek
3999  *
4000  *      Peek at the runq and return the highest
4001  *      priority thread from the runq.
4002  *
4003  *	The run queue must be locked.
4004  */
4005 thread_t
4006 run_queue_peek(
4007 	run_queue_t    rq)
4008 {
4009 	if (rq->count > 0) {
4010 		circle_queue_t queue = &rq->queues[rq->highq];
4011 		thread_t thread = cqe_queue_first(queue, struct thread, runq_links);
4012 		assert_thread_magic(thread);
4013 		return thread;
4014 	} else {
4015 		return THREAD_NULL;
4016 	}
4017 }
4018 
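/*
 *	rt_runq_enqueue:
 *
 *	Insert a realtime thread into its per-priority queue in
 *	earliest-deadline-first order, updating the queue's cached
 *	earliest deadline and constraint.  Returns true if the thread
 *	became the head of its priority queue and should preempt.
 */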
4019 static bool
4020 rt_runq_enqueue(rt_queue_t rt_run_queue, thread_t thread, processor_t processor)
4021 {
4022 	int pri = thread->sched_pri;
4023 	assert((pri >= BASEPRI_RTQUEUES) && (pri <= MAXPRI));
4024 	int i = pri - BASEPRI_RTQUEUES;
4025 	rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i];
4026 	bitmap_t *map = rt_run_queue->bitmap;
4027 
4028 	bitmap_set(map, i);
4029 
4030 	queue_t     queue       = &rt_runq->pri_queue;
4031 	uint64_t    deadline    = thread->realtime.deadline;
4032 	bool        preempt     = false;
4033 	bool        earliest    = false;
4034 
4035 	if (queue_empty(queue)) {
4036 		enqueue_tail(queue, &thread->runq_links);
4037 		preempt = true;
4038 		earliest = true;
4039 		rt_runq->pri_earliest_deadline = deadline;
4040 		rt_runq->pri_constraint = thread->realtime.constraint;
4041 	} else {
4042 		/* Insert into rt_runq in thread deadline order */
4043 		queue_entry_t iter;
4044 		qe_foreach(iter, queue) {
4045 			thread_t iter_thread = qe_element(iter, struct thread, runq_links);
4046 			assert_thread_magic(iter_thread);
4047 
4048 			if (deadline < iter_thread->realtime.deadline) {
4049 				if (iter == queue_first(queue)) {
4050 					preempt = true;
4051 					earliest = true;
4052 					rt_runq->pri_earliest_deadline = deadline;
4053 					rt_runq->pri_constraint = thread->realtime.constraint;
4054 				}
4055 				insque(&thread->runq_links, queue_prev(iter));
4056 				break;
4057 			} else if (iter == queue_last(queue)) {
4058 				enqueue_tail(queue, &thread->runq_links);
4059 				break;
4060 			}
4061 		}
4062 	}
4063 	if (earliest && (deadline < os_atomic_load_wide(&rt_run_queue->earliest_deadline, relaxed))) {
4064 		os_atomic_store_wide(&rt_run_queue->earliest_deadline, deadline, relaxed);
4065 		os_atomic_store(&rt_run_queue->constraint, thread->realtime.constraint, relaxed);
4066 		os_atomic_store(&rt_run_queue->ed_index, pri - BASEPRI_RTQUEUES, relaxed);
4067 	}
4068 
4069 	SCHED_STATS_RUNQ_CHANGE(&rt_run_queue->runq_stats, os_atomic_load(&rt_run_queue->count, relaxed));
4070 	rt_runq->pri_count++;
4071 	os_atomic_inc(&rt_run_queue->count, relaxed);
4072 
4073 	thread->runq = processor;
4074 
4075 	CHECK_RT_RUNQ_CONSISTENCY(rt_run_queue, thread);
4076 
4077 	return preempt;
4078 }
4079 
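/*
 *	rt_runq_dequeue:
 *
 *	Remove and return the next realtime thread to run.  Unless
 *	sched_rt_runq_strict_priority is set, the earliest-deadline
 *	thread is preferred over the highest-priority one when the
 *	latter can still meet its constraint afterwards.  Recomputes
 *	the cached earliest deadline across all priority levels.
 */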
4080 static thread_t
4081 rt_runq_dequeue(rt_queue_t rt_run_queue)
4082 {
4083 	bitmap_t *map = rt_run_queue->bitmap;
4084 	int i = bitmap_first(map, NRTQS);
4085 	assert((i >= 0) && (i < NRTQS));
4086 
4087 	rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i];
4088 
4089 	if (!sched_rt_runq_strict_priority) {
4090 		int ed_index = os_atomic_load(&rt_run_queue->ed_index, relaxed);
4091 		if (ed_index != i) {
4092 			assert((ed_index >= 0) && (ed_index < NRTQS));
4093 			rt_queue_pri_t *ed_runq = &rt_run_queue->rt_queue_pri[ed_index];
4094 
4095 			thread_t ed_thread = qe_queue_first(&ed_runq->pri_queue, struct thread, runq_links);
4096 			thread_t hi_thread = qe_queue_first(&rt_runq->pri_queue, struct thread, runq_links);
4097 
4098 			if (ed_thread->realtime.computation + hi_thread->realtime.computation + rt_deadline_epsilon < hi_thread->realtime.constraint) {
4099 				/* choose the earliest deadline thread */
4100 				rt_runq = ed_runq;
4101 				i = ed_index;
4102 			}
4103 		}
4104 	}
4105 
4106 	assert(rt_runq->pri_count > 0);
4107 	uint64_t earliest_deadline = RT_DEADLINE_NONE;
4108 	uint32_t constraint = RT_CONSTRAINT_NONE;
4109 	int ed_index = NOPRI;
4110 	thread_t new_thread = qe_dequeue_head(&rt_runq->pri_queue, struct thread, runq_links);
4111 	SCHED_STATS_RUNQ_CHANGE(&rt_run_queue->runq_stats, os_atomic_load(&rt_run_queue->count, relaxed));
4112 	if (--rt_runq->pri_count > 0) {
4113 		thread_t next_rt = qe_queue_first(&rt_runq->pri_queue, struct thread, runq_links);
4114 		assert(next_rt != THREAD_NULL);
4115 		earliest_deadline = next_rt->realtime.deadline;
4116 		constraint = next_rt->realtime.constraint;
4117 		ed_index = i;
4118 	} else {
4119 		bitmap_clear(map, i);
4120 	}
4121 	rt_runq->pri_earliest_deadline = earliest_deadline;
4122 	rt_runq->pri_constraint = constraint;
4123 
4124 	for (i = bitmap_first(map, NRTQS); i >= 0; i = bitmap_next(map, i)) {
4125 		rt_runq = &rt_run_queue->rt_queue_pri[i];
4126 		if (rt_runq->pri_earliest_deadline < earliest_deadline) {
4127 			earliest_deadline = rt_runq->pri_earliest_deadline;
4128 			constraint = rt_runq->pri_constraint;
4129 			ed_index = i;
4130 		}
4131 	}
4132 	os_atomic_store_wide(&rt_run_queue->earliest_deadline, earliest_deadline, relaxed);
4133 	os_atomic_store(&rt_run_queue->constraint, constraint, relaxed);
4134 	os_atomic_store(&rt_run_queue->ed_index, ed_index, relaxed);
4135 	os_atomic_dec(&rt_run_queue->count, relaxed);
4136 
4137 	new_thread->runq = PROCESSOR_NULL;
4138 
4139 	CHECK_RT_RUNQ_CONSISTENCY(rt_run_queue, THREAD_NULL);
4140 
4141 	return new_thread;
4142 }
4143 
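/*
 *	rt_runq_first:
 *
 *	Return, without dequeueing, the highest-priority realtime
 *	thread, or THREAD_NULL if the run queue is empty.
 */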
4144 static thread_t
4145 rt_runq_first(rt_queue_t rt_run_queue)
4146 {
4147 	bitmap_t *map = rt_run_queue->bitmap;
4148 	int i = bitmap_first(map, NRTQS);
4149 	if (i < 0) {
4150 		return THREAD_NULL;
4151 	}
4152 	rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i];
4153 	thread_t next_rt = qe_queue_first(&rt_runq->pri_queue, struct thread, runq_links);
4154 
4155 	return next_rt;
4156 }
4157 
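/*
 *	rt_runq_remove:
 *
 *	Remove a specific realtime thread from its priority queue and
 *	recompute the cached earliest deadline and constraint.
 */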
4158 static void
4159 rt_runq_remove(rt_queue_t rt_run_queue, thread_t thread)
4160 {
4161 	CHECK_RT_RUNQ_CONSISTENCY(rt_run_queue, thread);
4162 
4163 	int pri = thread->sched_pri;
4164 	assert((pri >= BASEPRI_RTQUEUES) && (pri <= MAXPRI));
4165 	int i = pri - BASEPRI_RTQUEUES;
4166 	rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i];
4167 	bitmap_t *map = rt_run_queue->bitmap;
4168 
4169 	assert(rt_runq->pri_count > 0);
4170 	uint64_t earliest_deadline = RT_DEADLINE_NONE;
4171 	uint32_t constraint = RT_CONSTRAINT_NONE;
4172 	int ed_index = NOPRI;
4173 	remqueue(&thread->runq_links);
4174 	SCHED_STATS_RUNQ_CHANGE(&rt_run_queue->runq_stats, os_atomic_load(&rt_run_queue->count, relaxed));
4175 	if (--rt_runq->pri_count > 0) {
4176 		thread_t next_rt = qe_queue_first(&rt_runq->pri_queue, struct thread, runq_links);
4177 		earliest_deadline = next_rt->realtime.deadline;
4178 		constraint = next_rt->realtime.constraint;
4179 		ed_index = i;
4180 	} else {
4181 		bitmap_clear(map, i);
4182 	}
4183 	rt_runq->pri_earliest_deadline = earliest_deadline;
4184 	rt_runq->pri_constraint = constraint;
4185 
4186 	for (i = bitmap_first(map, NRTQS); i >= 0; i = bitmap_next(map, i)) {
4187 		rt_runq = &rt_run_queue->rt_queue_pri[i];
4188 		if (rt_runq->pri_earliest_deadline < earliest_deadline) {
4189 			earliest_deadline = rt_runq->pri_earliest_deadline;
4190 			constraint = rt_runq->pri_constraint;
4191 			ed_index = i;
4192 		}
4193 	}
4194 	os_atomic_store_wide(&rt_run_queue->earliest_deadline, earliest_deadline, relaxed);
4195 	os_atomic_store(&rt_run_queue->constraint, constraint, relaxed);
4196 	os_atomic_store(&rt_run_queue->ed_index, ed_index, relaxed);
4197 	os_atomic_dec(&rt_run_queue->count, relaxed);
4198 
4199 	thread->runq = PROCESSOR_NULL;
4200 
4201 	CHECK_RT_RUNQ_CONSISTENCY(rt_run_queue, THREAD_NULL);
4202 }
4203 
4204 rt_queue_t
4205 sched_rtlocal_runq(processor_set_t pset)
4206 {
4207 	return &pset->rt_runq;
4208 }
4209 
4210 void
4211 sched_rtlocal_init(processor_set_t pset)
4212 {
4213 	pset_rt_init(pset);
4214 }
4215 
4216 void
4217 sched_rtlocal_queue_shutdown(processor_t processor)
4218 {
4219 	processor_set_t pset = processor->processor_set;
4220 	thread_t        thread;
4221 	queue_head_t    tqueue;
4222 
4223 	pset_lock(pset);
4224 
4225 	/* We only need to migrate threads if this is the last active or last recommended processor in the pset */
4226 	if (bit_count(pset_available_cpumap(pset)) > 0) {
4227 		pset_unlock(pset);
4228 		return;
4229 	}
4230 
4231 	queue_init(&tqueue);
4232 
4233 	while (rt_runq_count(pset) > 0) {
4234 		thread = rt_runq_dequeue(&pset->rt_runq);
4235 		enqueue_tail(&tqueue, &thread->runq_links);
4236 	}
4237 	sched_update_pset_load_average(pset, 0);
4238 	pset_update_rt_stealable_state(pset);
4239 	pset_unlock(pset);
4240 
4241 	qe_foreach_element_safe(thread, &tqueue, runq_links) {
4242 		remqueue(&thread->runq_links);
4243 
4244 		thread_lock(thread);
4245 
4246 		thread_setrun(thread, SCHED_TAILQ);
4247 
4248 		thread_unlock(thread);
4249 	}
4250 }
4251 
4252 /* Assumes RT lock is not held, and acquires splsched/rt_lock itself */
4253 void
4254 sched_rtlocal_runq_scan(sched_update_scan_context_t scan_context)
4255 {
4256 	thread_t        thread;
4257 
4258 	pset_node_t node = &pset_node0;
4259 	processor_set_t pset = node->psets;
4260 
4261 	spl_t s = splsched();
4262 	do {
4263 		while (pset != NULL) {
4264 			pset_lock(pset);
4265 
4266 			bitmap_t *map = pset->rt_runq.bitmap;
4267 			for (int i = bitmap_first(map, NRTQS); i >= 0; i = bitmap_next(map, i)) {
4268 				rt_queue_pri_t *rt_runq = &pset->rt_runq.rt_queue_pri[i];
4269 
4270 				qe_foreach_element_safe(thread, &rt_runq->pri_queue, runq_links) {
4271 					if (thread->last_made_runnable_time < scan_context->earliest_rt_make_runnable_time) {
4272 						scan_context->earliest_rt_make_runnable_time = thread->last_made_runnable_time;
4273 					}
4274 				}
4275 			}
4276 
4277 			pset_unlock(pset);
4278 
4279 			pset = pset->pset_list;
4280 		}
4281 	} while (((node = node->node_list) != NULL) && ((pset = node->psets) != NULL));
4282 	splx(s);
4283 }
4284 
4285 int64_t
4286 sched_rtlocal_runq_count_sum(void)
4287 {
4288 	pset_node_t node = &pset_node0;
4289 	processor_set_t pset = node->psets;
4290 	int64_t count = 0;
4291 
4292 	do {
4293 		while (pset != NULL) {
4294 			count += pset->rt_runq.runq_stats.count_sum;
4295 
4296 			pset = pset->pset_list;
4297 		}
4298 	} while (((node = node->node_list) != NULL) && ((pset = node->psets) != NULL));
4299 
4300 	return count;
4301 }
4302 
4303 /*
4304  * Called with stealing_pset locked and
4305  * returns with stealing_pset locked,
4306  * but the lock will have been dropped
4307  * and retaken if a thread is returned.
4308  */
4309 thread_t
4310 sched_rtlocal_steal_thread(processor_set_t stealing_pset, uint64_t earliest_deadline)
4311 {
4312 	if (!sched_allow_rt_steal) {
4313 		return THREAD_NULL;
4314 	}
4315 	pset_map_t pset_map = stealing_pset->node->pset_map;
4316 
4317 	bit_clear(pset_map, stealing_pset->pset_id);
4318 
4319 	processor_set_t pset = stealing_pset;
4320 
4321 	processor_set_t target_pset;
4322 	uint64_t target_deadline;
4323 
4324 retry:
4325 	target_pset = NULL;
4326 	target_deadline = earliest_deadline - rt_deadline_epsilon;
4327 
4328 	for (int pset_id = lsb_first(pset_map); pset_id >= 0; pset_id = lsb_next(pset_map, pset_id)) {
4329 		processor_set_t nset = pset_array[pset_id];
4330 
4331 		if (nset->stealable_rt_threads_earliest_deadline < target_deadline) {
4332 			target_deadline = nset->stealable_rt_threads_earliest_deadline;
4333 			target_pset = nset;
4334 		}
4335 	}
4336 
4337 	if (target_pset != NULL) {
4338 		pset = change_locked_pset(pset, target_pset);
4339 		if (pset->stealable_rt_threads_earliest_deadline <= target_deadline) {
4340 			thread_t new_thread = rt_runq_dequeue(&pset->rt_runq);
4341 			pset_update_rt_stealable_state(pset);
4342 			KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_STEAL) | DBG_FUNC_NONE, (uintptr_t)thread_tid(new_thread), pset->pset_id, pset->cpu_set_low, 0);
4343 
4344 			pset = change_locked_pset(pset, stealing_pset);
4345 			return new_thread;
4346 		}
4347 		pset = change_locked_pset(pset, stealing_pset);
4348 		earliest_deadline = rt_runq_earliest_deadline(pset);
4349 		goto retry;
4350 	}
4351 
4352 	pset = change_locked_pset(pset, stealing_pset);
4353 	return THREAD_NULL;
4354 }
4355 
4356 /*
4357  * sched_rt_choose_thread:
 *
 * Choose the next realtime thread for this processor: prefer a local
 * thread whose deadline falls within the low-latency window, then try
 * to steal from another pset, then fall back to the local run queue.
 *
 * pset is locked
4358  */
4359 thread_t
4360 sched_rt_choose_thread(processor_set_t pset)
4361 {
4362 	processor_t processor = current_processor();
4363 	uint64_t rt_ll_deadline = 0;
4364 	if (rt_constraint_ll != 0) {
4365 		rt_ll_deadline = rt_constraint_ll + mach_absolute_time();
4366 	}
4367 
4368 	if (rt_runq_earliest_deadline(pset) < rt_ll_deadline) {
4369 		thread_t new_thread = rt_runq_dequeue(SCHED(rt_runq)(pset));
4370 		pset_update_rt_stealable_state(pset);
4371 		assert(new_thread != THREAD_NULL);
4372 		if (bit_clear_if_set(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) {
4373 			KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_SIGNAL_SPILL) | DBG_FUNC_END, processor->cpu_id, pset->rt_pending_spill_cpu_mask, 0, 1);
4374 		}
4375 		return new_thread;
4376 	}
4377 
4378 	if (SCHED(steal_thread_enabled)(pset)) {
4379 		do {
4380 			bool spill_pending = bit_clear_if_set(pset->rt_pending_spill_cpu_mask, processor->cpu_id);
4381 			if (spill_pending) {
4382 				KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_SIGNAL_SPILL) | DBG_FUNC_END, processor->cpu_id, pset->rt_pending_spill_cpu_mask, 0, 2);
4383 			}
4384 			thread_t new_thread = SCHED(rt_steal_thread)(pset, rt_runq_earliest_deadline(pset));
4385 			if (new_thread != THREAD_NULL) {
4386 				if (bit_clear_if_set(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) {
4387 					KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_SIGNAL_SPILL) | DBG_FUNC_END, processor->cpu_id, pset->rt_pending_spill_cpu_mask, 0, 3);
4388 				}
4389 				return new_thread;
4390 			}
4391 		} while (bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id));
4392 	}
4393 
4394 	if (bit_clear_if_set(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) {
4395 		KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_SIGNAL_SPILL) | DBG_FUNC_END, processor->cpu_id, pset->rt_pending_spill_cpu_mask, 0, 4);
4396 	}
4397 
4398 	if (rt_runq_count(pset) > 0) {
4399 		thread_t new_thread = rt_runq_dequeue(SCHED(rt_runq)(pset));
4400 		assert(new_thread != THREAD_NULL);
4401 		pset_update_rt_stealable_state(pset);
4402 		return new_thread;
4403 	}
4404 
4405 	return THREAD_NULL;
4406 }
4407 
4408 /*
4409  *	realtime_queue_insert:
4410  *
4411  *	Enqueue a thread for realtime execution.
4412  */
4413 static bool
4414 realtime_queue_insert(processor_t processor, processor_set_t pset, thread_t thread)
4415 {
4416 	pset_assert_locked(pset);
4417 
4418 	bool preempt = rt_runq_enqueue(SCHED(rt_runq)(pset), thread, processor);
4419 	pset_update_rt_stealable_state(pset);
4420 
4421 	return preempt;
4422 }
4423 
4424 /*
4425  *	realtime_setrun:
4426  *
4427  *	Dispatch a thread for realtime execution.
4428  *
4429  *	Thread must be locked.  Associated pset must
4430  *	be locked, and is returned unlocked.
4431  */
4432 static void
4433 realtime_setrun(
4434 	processor_t                     chosen_processor,
4435 	thread_t                        thread)
4436 {
4437 	processor_set_t pset = chosen_processor->processor_set;
4438 	pset_assert_locked(pset);
4439 	bool pset_is_locked = true;
4440 
4441 	int n_backup = 0;
4442 
4443 	if (thread->realtime.constraint <= rt_constraint_threshold) {
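	/*
	 * Threads with tight constraints also get backup processors
	 * signalled, so that whichever core reaches the run queue first
	 * can take the thread; the accounting below avoids signalling
	 * more backups than are still needed.
	 */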
4444 		n_backup = sched_rt_n_backup_processors;
4445 	}
4446 	assert((n_backup >= 0) && (n_backup <= SCHED_MAX_BACKUP_PROCESSORS));
4447 
4448 	int existing_backups = bit_count(pset->pending_AST_URGENT_cpu_mask) - rt_runq_count(pset);
4449 	if (existing_backups > 0) {
4450 		n_backup = n_backup - existing_backups;
4451 		if (n_backup < 0) {
4452 			n_backup = 0;
4453 		}
4454 	}
4455 
4456 	sched_ipi_type_t ipi_type[SCHED_MAX_BACKUP_PROCESSORS + 1] = {};
4457 	processor_t ipi_processor[SCHED_MAX_BACKUP_PROCESSORS + 1] = {};
4458 
4459 	thread->chosen_processor = chosen_processor;
4460 
4461 	/* <rdar://problem/15102234> */
4462 	assert(thread->bound_processor == PROCESSOR_NULL);
4463 
4464 	realtime_queue_insert(chosen_processor, pset, thread);
4465 
4466 	processor_t processor = chosen_processor;
4467 
4468 	int count = 0;
4469 	for (int i = 0; i <= n_backup; i++) {
4470 		if (i == 0) {
4471 			ipi_type[i] = SCHED_IPI_NONE;
4472 			ipi_processor[i] = processor;
4473 			count++;
4474 
4475 			ast_t preempt = AST_NONE;
4476 			if (thread->sched_pri > processor->current_pri) {
4477 				preempt = (AST_PREEMPT | AST_URGENT);
4478 			} else if (thread->sched_pri == processor->current_pri) {
4479 				if (thread->realtime.constraint <= rt_constraint_ll) {
4480 					preempt = (AST_PREEMPT | AST_URGENT);
4481 				} else if (deadline_add(thread->realtime.deadline, rt_deadline_epsilon) < processor->deadline) {
4482 					preempt = (AST_PREEMPT | AST_URGENT);
4483 				}
4484 			}
4485 
4486 			if (preempt != AST_NONE) {
4487 				if (processor->state == PROCESSOR_IDLE) {
4488 					if (processor == current_processor()) {
4489 						pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING);
4490 						ast_on(preempt);
4491 
4492 						if ((preempt & AST_URGENT) == AST_URGENT) {
4493 							if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
4494 								KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
4495 								    processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 1);
4496 							}
4497 						}
4498 
4499 						if ((preempt & AST_PREEMPT) == AST_PREEMPT) {
4500 							bit_set(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
4501 						}
4502 					} else {
4503 						ipi_type[i] = sched_ipi_action(processor, thread, SCHED_IPI_EVENT_RT_PREEMPT);
4504 					}
4505 				} else if (processor->state == PROCESSOR_DISPATCHING) {
4506 					if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
4507 						KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
4508 						    processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 2);
4509 					}
4510 				} else {
4511 					if (processor == current_processor()) {
4512 						ast_on(preempt);
4513 
4514 						if ((preempt & AST_URGENT) == AST_URGENT) {
4515 							if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
4516 								KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
4517 								    processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 3);
4518 							}
4519 						}
4520 
4521 						if ((preempt & AST_PREEMPT) == AST_PREEMPT) {
4522 							bit_set(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
4523 						}
4524 					} else {
4525 						ipi_type[i] = sched_ipi_action(processor, thread, SCHED_IPI_EVENT_RT_PREEMPT);
4526 					}
4527 				}
4528 			} else {
4529 				/* Selected processor was too busy, just keep thread enqueued and let other processors drain it naturally. */
4530 			}
4531 		} else {
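			/*
			 * Choose an additional processor to signal as a
			 * backup; stop early if no further candidate exists.
			 */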
4532 			if (!pset_is_locked) {
4533 				pset_lock(pset);
4534 			}
4535 			ipi_type[i] = SCHED_IPI_NONE;
4536 			ipi_processor[i] = PROCESSOR_NULL;
4537 			pset_is_locked = !choose_next_rt_processor_for_IPI(pset, chosen_processor, false, &ipi_processor[i], &ipi_type[i]);
4538 			if (ipi_processor[i] == PROCESSOR_NULL) {
4539 				break;
4540 			}
4541 			count++;
4542 
4543 			KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_NEXT_PROCESSOR) | DBG_FUNC_NONE,
4544 			    ipi_processor[i]->cpu_id, ipi_processor[i]->state, true /* backup */, 1);
4545 #if defined(__x86_64__)
4546 #define p_is_good(p) (((p)->processor_primary == (p)) && ((sched_avoid_cpu0 != 1) || ((p)->cpu_id != 0)))
4547 			if (n_backup == SCHED_DEFAULT_BACKUP_PROCESSORS_SMT) {
4548 				processor_t p0 = ipi_processor[0];
4549 				processor_t p1 = ipi_processor[1];
4550 				assert(p0 && p1);
4551 				if (p_is_good(p0) && p_is_good(p1)) {
4552 					/*
4553 					 * Both the chosen processor and the first backup are non-cpu0 primaries,
4554 					 * so there is no need for a 2nd backup processor.
4555 					 */
4556 					break;
4557 				}
4558 			}
4559 #endif
4560 		}
4561 	}
4562 
4563 	if (pset_is_locked) {
4564 		pset_unlock(pset);
4565 	}
4566 
4567 	assert((count > 0) && (count <= (n_backup + 1)));
4568 	for (int i = 0; i < count; i++) {
4569 		assert(ipi_processor[i] != PROCESSOR_NULL);
4570 		sched_ipi_perform(ipi_processor[i], ipi_type[i]);
4571 	}
4572 }
4573 
4574 
4575 sched_ipi_type_t
4576 sched_ipi_deferred_policy(processor_set_t pset, processor_t dst,
4577     thread_t thread, __unused sched_ipi_event_t event)
4578 {
4579 #if defined(CONFIG_SCHED_DEFERRED_AST)
4580 #if CONFIG_THREAD_GROUPS
4581 	if (thread) {
4582 		struct thread_group *tg = thread_group_get(thread);
4583 		if (thread_group_uses_immediate_ipi(tg)) {
4584 			return SCHED_IPI_IMMEDIATE;
4585 		}
4586 	}
4587 #endif /* CONFIG_THREAD_GROUPS */
4588 	if (!bit_test(pset->pending_deferred_AST_cpu_mask, dst->cpu_id)) {
4589 		return SCHED_IPI_DEFERRED;
4590 	}
4591 #else /* CONFIG_SCHED_DEFERRED_AST */
4592 	(void) thread;
4593 	panic("Request for deferred IPI on an unsupported platform; pset: %p CPU: %d", pset, dst->cpu_id);
4594 #endif /* CONFIG_SCHED_DEFERRED_AST */
4595 	return SCHED_IPI_NONE;
4596 }
4597 
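/*
 *	sched_ipi_action:
 *
 *	Decide which type of IPI (if any) the destination processor
 *	should receive for the given event, transitioning it from IDLE
 *	to DISPATCHING and setting the pending-AST bits as needed.  The
 *	signal itself is sent later via sched_ipi_perform().
 */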
4598 sched_ipi_type_t
4599 sched_ipi_action(processor_t dst, thread_t thread, sched_ipi_event_t event)
4600 {
4601 	sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
4602 	assert(dst != NULL);
4603 
4604 	processor_set_t pset = dst->processor_set;
4605 	if (current_processor() == dst) {
4606 		return SCHED_IPI_NONE;
4607 	}
4608 
4609 	bool dst_idle = (dst->state == PROCESSOR_IDLE);
4610 	if (dst_idle) {
4611 		pset_update_processor_state(pset, dst, PROCESSOR_DISPATCHING);
4612 	}
4613 
4614 	ipi_type = SCHED(ipi_policy)(dst, thread, dst_idle, event);
4615 	switch (ipi_type) {
4616 	case SCHED_IPI_NONE:
4617 		return SCHED_IPI_NONE;
4618 #if defined(CONFIG_SCHED_DEFERRED_AST)
4619 	case SCHED_IPI_DEFERRED:
4620 		bit_set(pset->pending_deferred_AST_cpu_mask, dst->cpu_id);
4621 		break;
4622 #endif /* CONFIG_SCHED_DEFERRED_AST */
4623 	default:
4624 		if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, dst->cpu_id)) {
4625 			KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
4626 			    dst->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 4);
4627 		}
4628 		bit_set(pset->pending_AST_PREEMPT_cpu_mask, dst->cpu_id);
4629 		break;
4630 	}
4631 	return ipi_type;
4632 }
4633 
4634 sched_ipi_type_t
4635 sched_ipi_policy(processor_t dst, thread_t thread, boolean_t dst_idle, sched_ipi_event_t event)
4636 {
4637 	sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
4638 	boolean_t deferred_ipi_supported = false;
4639 	processor_set_t pset = dst->processor_set;
4640 
4641 #if defined(CONFIG_SCHED_DEFERRED_AST)
4642 	deferred_ipi_supported = true;
4643 #endif /* CONFIG_SCHED_DEFERRED_AST */
4644 
4645 	switch (event) {
4646 	case SCHED_IPI_EVENT_SPILL:
4647 	case SCHED_IPI_EVENT_SMT_REBAL:
4648 	case SCHED_IPI_EVENT_REBALANCE:
4649 	case SCHED_IPI_EVENT_BOUND_THR:
4650 	case SCHED_IPI_EVENT_RT_PREEMPT:
4651 		/*
4652 		 * The RT preempt, spill, SMT rebalance, rebalance and the bound thread
4653 		 * scenarios use immediate IPIs always.
4654 		 */
4655 		ipi_type = dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
4656 		break;
4657 	case SCHED_IPI_EVENT_PREEMPT:
4658 		/* In the preemption case, use immediate IPIs for RT threads */
4659 		if (thread && (thread->sched_pri >= BASEPRI_RTQUEUES)) {
4660 			ipi_type = dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
4661 			break;
4662 		}
4663 
4664 		/*
4665 		 * For non-RT thread preemption:
4666 		 * if the core is active, use an immediate IPI;
4667 		 * if the core is idle, use a deferred IPI if supported, otherwise an immediate IPI.
4668 		 */
4669 		if (deferred_ipi_supported && dst_idle) {
4670 			return sched_ipi_deferred_policy(pset, dst, thread, event);
4671 		}
4672 		ipi_type = dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
4673 		break;
4674 	default:
4675 		panic("Unrecognized scheduler IPI event type %d", event);
4676 	}
4677 	assert(ipi_type != SCHED_IPI_NONE);
4678 	return ipi_type;
4679 }
4680 
4681 void
4682 sched_ipi_perform(processor_t dst, sched_ipi_type_t ipi)
4683 {
4684 	switch (ipi) {
4685 	case SCHED_IPI_NONE:
4686 		break;
4687 	case SCHED_IPI_IDLE:
4688 		machine_signal_idle(dst);
4689 		break;
4690 	case SCHED_IPI_IMMEDIATE:
4691 		cause_ast_check(dst);
4692 		break;
4693 	case SCHED_IPI_DEFERRED:
4694 		machine_signal_idle_deferred(dst);
4695 		break;
4696 	default:
4697 		panic("Unrecognized scheduler IPI type: %d", ipi);
4698 	}
4699 }
4700 
4701 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
4702 
4703 boolean_t
4704 priority_is_urgent(int priority)
4705 {
4706 	return bitmap_test(sched_preempt_pri, priority) ? TRUE : FALSE;
4707 }
4708 
4709 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
4710 
4711 /*
4712  *	processor_setrun:
4713  *
4714  *	Dispatch a thread for execution on a
4715  *	processor.
4716  *
4717  *	Thread must be locked.  Associated pset must
4718  *	be locked, and is returned unlocked.
4719  */
4720 static void
4721 processor_setrun(
4722 	processor_t                     processor,
4723 	thread_t                        thread,
4724 	integer_t                       options)
4725 {
4726 	processor_set_t pset = processor->processor_set;
4727 	pset_assert_locked(pset);
4728 	ast_t preempt;
4729 	enum { eExitIdle, eInterruptRunning, eDoNothing } ipi_action = eDoNothing;
4730 
4731 	sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
4732 
4733 	thread->chosen_processor = processor;
4734 
4735 	/*
4736 	 *	Set preemption mode.
4737 	 */
4738 #if defined(CONFIG_SCHED_DEFERRED_AST)
4739 	/* TODO: Do we need to care about urgency (see rdar://problem/20136239)? */
4740 #endif
4741 	if (SCHED(priority_is_urgent)(thread->sched_pri) && thread->sched_pri > processor->current_pri) {
4742 		preempt = (AST_PREEMPT | AST_URGENT);
4743 	} else if (processor->current_is_eagerpreempt) {
4744 		preempt = (AST_PREEMPT | AST_URGENT);
4745 	} else if ((thread->sched_mode == TH_MODE_TIMESHARE) && (thread->sched_pri < thread->base_pri)) {
4746 		if (SCHED(priority_is_urgent)(thread->base_pri) && thread->sched_pri > processor->current_pri) {
4747 			preempt = (options & SCHED_PREEMPT)? AST_PREEMPT: AST_NONE;
4748 		} else {
4749 			preempt = AST_NONE;
4750 		}
4751 	} else {
4752 		preempt = (options & SCHED_PREEMPT)? AST_PREEMPT: AST_NONE;
4753 	}
4754 
4755 	if ((options & (SCHED_PREEMPT | SCHED_REBALANCE)) == (SCHED_PREEMPT | SCHED_REBALANCE)) {
4756 		/*
4757 		 * Having gone to the trouble of forcing this thread off a less preferred core,
4758 		 * we should force the preferable core to reschedule immediately to give this
4759 		 * thread a chance to run instead of just sitting on the run queue where
4760 		 * it may just be stolen back by the idle core we just forced it off.
4761 		 */
4762 		preempt |= AST_PREEMPT;
4763 	}
4764 
4765 	SCHED(processor_enqueue)(processor, thread, options);
4766 	sched_update_pset_load_average(pset, 0);
4767 
4768 	if (preempt != AST_NONE) {
4769 		if (processor->state == PROCESSOR_IDLE) {
4770 			ipi_action = eExitIdle;
4771 		} else if (processor->state == PROCESSOR_DISPATCHING) {
4772 			if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
4773 				KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
4774 				    processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 5);
4775 			}
4776 		} else if ((processor->state == PROCESSOR_RUNNING ||
4777 		    processor->state == PROCESSOR_SHUTDOWN) &&
4778 		    (thread->sched_pri >= processor->current_pri)) {
4779 			ipi_action = eInterruptRunning;
4780 		}
4781 	} else {
4782 		/*
4783 		 * New thread is not important enough to preempt what is running, but
4784 		 * special processor states may need special handling
4785 		 */
4786 		if (processor->state == PROCESSOR_SHUTDOWN &&
4787 		    thread->sched_pri >= processor->current_pri) {
4788 			ipi_action = eInterruptRunning;
4789 		} else if (processor->state == PROCESSOR_IDLE) {
4790 			ipi_action = eExitIdle;
4791 		} else if (processor->state == PROCESSOR_DISPATCHING) {
4792 			if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
4793 				KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
4794 				    processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 6);
4795 			}
4796 		}
4797 	}
4798 
4799 	if (ipi_action != eDoNothing) {
4800 		if (processor == current_processor()) {
4801 			if (ipi_action == eExitIdle) {
4802 				pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING);
4803 			}
4804 			if ((preempt = csw_check_locked(processor->active_thread, processor, pset, AST_NONE)) != AST_NONE) {
4805 				ast_on(preempt);
4806 			}
4807 
4808 			if ((preempt & AST_URGENT) == AST_URGENT) {
4809 				if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
4810 					KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
4811 					    processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 7);
4812 				}
4813 			} else {
4814 				if (bit_clear_if_set(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
4815 					KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_END, processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, 0, 7);
4816 				}
4817 			}
4818 
4819 			if ((preempt & AST_PREEMPT) == AST_PREEMPT) {
4820 				bit_set(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
4821 			} else {
4822 				bit_clear(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
4823 			}
4824 		} else {
4825 			sched_ipi_event_t event = (options & SCHED_REBALANCE) ? SCHED_IPI_EVENT_REBALANCE : SCHED_IPI_EVENT_PREEMPT;
4826 			ipi_type = sched_ipi_action(processor, thread, event);
4827 		}
4828 	}
4829 	pset_unlock(pset);
4830 	sched_ipi_perform(processor, ipi_type);
4831 }
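/*
 * Annotation: once processor_setrun() has picked an AST, delivery takes one
 * of three forms: (1) the local processor re-runs csw_check_locked() and
 * sets the AST on itself; (2) a remote idle or dispatching processor has
 * its pending_AST_URGENT_cpu_mask bit set; (3) a remote running processor
 * gets an IPI chosen by sched_ipi_action() and sent by sched_ipi_perform()
 * after the pset lock is dropped.
 */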
4832 
4833 /*
4834  *	choose_next_pset:
4835  *
4836  *	Return the next sibling pset containing
4837  *	available processors.
4838  *
4839  *	Returns the original pset if none other is
4840  *	suitable.
4841  */
4842 static processor_set_t
4843 choose_next_pset(
4844 	processor_set_t         pset)
4845 {
4846 	processor_set_t         nset = pset;
4847 
4848 	do {
4849 		nset = next_pset(nset);
4850 	} while (nset->online_processor_count < 1 && nset != pset);
4851 
4852 	return nset;
4853 }
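/*
 * Annotation: an illustrative caller sketch (cf. choose_starting_pset()
 * below), which round-robins work across sibling psets:
 *
 *	pset = get_threadtask(thread)->pset_hint;
 *	if (pset == PROCESSOR_SET_NULL) {
 *		pset = current_processor()->processor_set;
 *	}
 *	pset = choose_next_pset(pset);	// skips psets with no online CPUs
 */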
4854 
4855 /*
4856  *	choose_processor:
4857  *
4858  *	Choose a processor for the thread, beginning at
4859  *	the pset.  Accepts an optional processor hint in
4860  *	the pset.
4861  *
4862  *	Returns a processor, possibly from a different pset.
4863  *
4864  *	The thread must be locked.  The pset must be locked,
4865  *	and the resulting pset is locked on return.
4866  */
4867 processor_t
4868 choose_processor(
4869 	processor_set_t         starting_pset,
4870 	processor_t             processor,
4871 	thread_t                thread)
4872 {
4873 	processor_set_t pset = starting_pset;
4874 	processor_set_t nset;
4875 
4876 	assert(thread->sched_pri <= MAXPRI);
4877 
4878 	/*
4879 	 * Prefer the hinted processor, when appropriate.
4880 	 */
4881 
4882 	/* Fold last processor hint from secondary processor to its primary */
4883 	if (processor != PROCESSOR_NULL) {
4884 		processor = processor->processor_primary;
4885 	}
4886 
4887 	/*
4888 	 * Only consult platform layer if pset is active, which
4889 	 * it may not be in some cases when a multi-set system
4890 	 * is going to sleep.
4891 	 */
4892 	if (pset->online_processor_count) {
4893 		if ((processor == PROCESSOR_NULL) || (processor->processor_set == pset && processor->state == PROCESSOR_IDLE)) {
4894 			processor_t mc_processor = machine_choose_processor(pset, processor);
4895 			if (mc_processor != PROCESSOR_NULL) {
4896 				processor = mc_processor->processor_primary;
4897 			}
4898 		}
4899 	}
4900 
4901 	/*
4902 	 * At this point, we may have a processor hint, and we may have
4903 	 * an initial starting pset. If the hint is not in the pset, or
4904 	 * if the hint is for a processor in an invalid state, discard
4905 	 * the hint.
4906 	 */
4907 	if (processor != PROCESSOR_NULL) {
4908 		if (processor->processor_set != pset) {
4909 			processor = PROCESSOR_NULL;
4910 		} else if (!processor->is_recommended) {
4911 			processor = PROCESSOR_NULL;
4912 		} else {
4913 			switch (processor->state) {
4914 			case PROCESSOR_START:
4915 			case PROCESSOR_SHUTDOWN:
4916 			case PROCESSOR_OFF_LINE:
4917 				/*
4918 				 * Hint is for a processor that cannot support running new threads.
4919 				 */
4920 				processor = PROCESSOR_NULL;
4921 				break;
4922 			case PROCESSOR_IDLE:
4923 				/*
4924 				 * Hint is for an idle processor. Assume it is no worse than any other
4925 				 * idle processor. The platform layer had an opportunity to provide
4926 				 * the "least cost idle" processor above.
4927 				 */
4928 				if ((thread->sched_pri < BASEPRI_RTQUEUES) || processor_is_fast_track_candidate_for_realtime_thread(pset, processor)) {
4929 					return processor;
4930 				}
4931 				processor = PROCESSOR_NULL;
4932 				break;
4933 			case PROCESSOR_RUNNING:
4934 			case PROCESSOR_DISPATCHING:
4935 				/*
4936 				 * Hint is for an active CPU. This fast-path allows
4937 				 * realtime threads to preempt non-realtime threads
4938 				 * to regain their previous executing processor.
4939 				 */
4940 				if (thread->sched_pri >= BASEPRI_RTQUEUES) {
4941 					if (processor_is_fast_track_candidate_for_realtime_thread(pset, processor)) {
4942 						return processor;
4943 					}
4944 					processor = PROCESSOR_NULL;
4945 				}
4946 
4947 				/* Otherwise, use hint as part of search below */
4948 				break;
4949 			default:
4950 				processor = PROCESSOR_NULL;
4951 				break;
4952 			}
4953 		}
4954 	}
4955 
4956 	/*
4957 	 * Iterate through the processor sets to locate
4958 	 * an appropriate processor. Seed results with
4959 	 * a last-processor hint, if available, so that
4960 	 * a search must find something strictly better
4961 	 * to replace it.
4962 	 *
4963 	 * A primary/secondary pair of SMT processors are
4964 	 * "unpaired" if the primary is busy but its
4965 	 * corresponding secondary is idle (so the physical
4966 	 * core has full use of its resources).
4967 	 */
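	/*
	 * Annotation, "unpaired" in pictures (P = primary, S = its SMT secondary):
	 *   P busy, S idle -> P is an "unpaired primary": preempting P is preferred
	 *                     over waking S, since the core keeps all its resources
	 *   P busy, S busy -> S can only serve as a "paired secondary" fallback
	 *   P idle         -> idle primary, taken first by the fast path below
	 */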
4968 
4969 	integer_t lowest_priority = MAXPRI + 1;
4970 	integer_t lowest_secondary_priority = MAXPRI + 1;
4971 	integer_t lowest_unpaired_primary_priority = MAXPRI + 1;
4972 	integer_t lowest_idle_secondary_priority = MAXPRI + 1;
4973 	integer_t lowest_count = INT_MAX;
4974 	uint64_t  furthest_deadline = 1;
4975 	processor_t lp_processor = PROCESSOR_NULL;
4976 	processor_t lp_unpaired_primary_processor = PROCESSOR_NULL;
4977 	processor_t lp_idle_secondary_processor = PROCESSOR_NULL;
4978 	processor_t lp_paired_secondary_processor = PROCESSOR_NULL;
4979 	processor_t lc_processor = PROCESSOR_NULL;
4980 	processor_t fd_processor = PROCESSOR_NULL;
4981 
4982 	if (processor != PROCESSOR_NULL) {
4983 		/* All other states should be enumerated above. */
4984 		assert(processor->state == PROCESSOR_RUNNING || processor->state == PROCESSOR_DISPATCHING);
4985 		assert(thread->sched_pri < BASEPRI_RTQUEUES);
4986 
4987 		lowest_priority = processor->current_pri;
4988 		lp_processor = processor;
4989 
4990 		lowest_count = SCHED(processor_runq_count)(processor);
4991 		lc_processor = processor;
4992 	}
4993 
4994 	if (thread->sched_pri >= BASEPRI_RTQUEUES) {
4995 		pset_node_t node = pset->node;
4996 		bool include_ast_urgent_pending_cpus = false;
4997 		cpumap_t ast_urgent_pending;
4998 try_again:
4999 		ast_urgent_pending = 0;
5000 		int consider_secondaries = (!pset->is_SMT) || (bit_count(node->pset_map) == 1) || (node->pset_non_rt_primary_map == 0) || include_ast_urgent_pending_cpus;
5001 		for (; consider_secondaries < 2; consider_secondaries++) {
5002 			pset = change_locked_pset(pset, starting_pset);
5003 			do {
5004 				cpumap_t available_map = pset_available_cpumap(pset);
5005 				if (available_map == 0) {
5006 					goto no_available_cpus;
5007 				}
5008 
5009 				processor = choose_processor_for_realtime_thread(pset, PROCESSOR_NULL, consider_secondaries, false);
5010 				if (processor) {
5011 					return processor;
5012 				}
5013 
5014 				if (consider_secondaries) {
5015 					processor = choose_furthest_deadline_processor_for_realtime_thread(pset, thread->sched_pri, thread->realtime.deadline, PROCESSOR_NULL, false, include_ast_urgent_pending_cpus);
5016 					if (processor && (processor->deadline > furthest_deadline)) {
5017 						fd_processor = processor;
5018 						furthest_deadline = processor->deadline;
5019 						if (sched_choose_first_fd_processor && ((rt_constraint_ll == 0) || (furthest_deadline > rt_constraint_ll + mach_absolute_time()))) {
5020 							/*
5021 							 * Instead of looping through all the psets to find the global
5022 							 * furthest deadline processor, preempt the first candidate found.
5023 							 * The preempted thread will then find any other available far deadline
5024 							 * processors to preempt.
5025 							 */
5026 							return fd_processor;
5027 						}
5028 					}
5029 
5030 					ast_urgent_pending |= pset->pending_AST_URGENT_cpu_mask;
5031 
5032 					if (rt_runq_count(pset) < lowest_count) {
5033 						int cpuid = bit_first(available_map);
5034 						assert(cpuid >= 0);
5035 						lc_processor = processor_array[cpuid];
5036 						lowest_count = rt_runq_count(pset);
5037 					}
5038 				}
5039 
5040 no_available_cpus:
5041 				nset = next_pset(pset);
5042 
5043 				if (nset != starting_pset) {
5044 					pset = change_locked_pset(pset, nset);
5045 				}
5046 			} while (nset != starting_pset);
5047 		}
5048 
5049 		/* Short cut for single pset nodes */
5050 		if (bit_count(node->pset_map) == 1) {
5051 			if (fd_processor) {
5052 				pset_assert_locked(fd_processor->processor_set);
5053 				return fd_processor;
5054 			} else if (lc_processor) {
5055 				pset_assert_locked(lc_processor->processor_set);
5056 				return lc_processor;
5057 			}
5058 		} else {
5059 			if ((fd_processor == PROCESSOR_NULL) && ast_urgent_pending && !include_ast_urgent_pending_cpus) {
5060 				/* See the comment in choose_furthest_deadline_processor_for_realtime_thread() */
5061 				include_ast_urgent_pending_cpus = true;
5062 				goto try_again;
5063 			}
5064 		}
5065 
5066 		processor = PROCESSOR_NULL;
5067 		if (fd_processor) {
5068 			processor = fd_processor;
5069 		} else if (lc_processor) {
5070 			processor = lc_processor;
5071 		}
5072 
5073 		if (processor) {
5074 			pset = change_locked_pset(pset, processor->processor_set);
5075 			/* Check that chosen processor is still usable */
5076 			cpumap_t available_map = pset_available_cpumap(pset);
5077 			if (bit_test(available_map, processor->cpu_id)) {
5078 				return processor;
5079 			}
5080 
5081 			/* processor is no longer usable */
5082 			processor = PROCESSOR_NULL;
5083 		}
5084 
5085 		pset_assert_locked(pset);
5086 		pset_unlock(pset);
5087 		return PROCESSOR_NULL;
5088 	}
5089 
5090 	/* No realtime threads from this point on */
5091 	assert(thread->sched_pri < BASEPRI_RTQUEUES);
5092 
5093 	do {
5094 		/*
5095 		 * Choose an idle processor, in pset traversal order
5096 		 */
5097 
5098 		uint64_t idle_primary_map = (pset->cpu_state_map[PROCESSOR_IDLE] &
5099 		    pset->primary_map &
5100 		    pset->recommended_bitmask);
5101 
5102 		/* there shouldn't be a pending AST if the processor is idle */
5103 		assert((idle_primary_map & pset->pending_AST_URGENT_cpu_mask) == 0);
5104 
5105 		int cpuid = lsb_first(idle_primary_map);
5106 		if (cpuid >= 0) {
5107 			processor = processor_array[cpuid];
5108 			return processor;
5109 		}
5110 
5111 		/*
5112 		 * Otherwise, enumerate active and idle processors to find primary candidates
5113 		 * with lower priority/etc.
5114 		 */
5115 
5116 		uint64_t active_map = ((pset->cpu_state_map[PROCESSOR_RUNNING] | pset->cpu_state_map[PROCESSOR_DISPATCHING]) &
5117 		    pset->recommended_bitmask &
5118 		    ~pset->pending_AST_URGENT_cpu_mask);
5119 
5120 		if (SCHED(priority_is_urgent)(thread->sched_pri) == FALSE) {
5121 			active_map &= ~pset->pending_AST_PREEMPT_cpu_mask;
5122 		}
5123 
5124 		active_map = bit_ror64(active_map, (pset->last_chosen + 1));
5125 		for (int rotid = lsb_first(active_map); rotid >= 0; rotid = lsb_next(active_map, rotid)) {
5126 			cpuid = ((rotid + pset->last_chosen + 1) & 63);
5127 			processor = processor_array[cpuid];
5128 
5129 			integer_t cpri = processor->current_pri;
5130 			processor_t primary = processor->processor_primary;
5131 			if (primary != processor) {
5132 				/* If primary is running a NO_SMT thread, don't choose its secondary */
5133 				if (!((primary->state == PROCESSOR_RUNNING) && processor_active_thread_no_smt(primary))) {
5134 					if (cpri < lowest_secondary_priority) {
5135 						lowest_secondary_priority = cpri;
5136 						lp_paired_secondary_processor = processor;
5137 					}
5138 				}
5139 			} else {
5140 				if (cpri < lowest_priority) {
5141 					lowest_priority = cpri;
5142 					lp_processor = processor;
5143 				}
5144 			}
5145 
5146 			integer_t ccount = SCHED(processor_runq_count)(processor);
5147 			if (ccount < lowest_count) {
5148 				lowest_count = ccount;
5149 				lc_processor = processor;
5150 			}
5151 		}
5152 
5153 		/*
5154 		 * For SMT configs, these idle secondary processors must have an active primary. Otherwise
5155 		 * the idle primary would have short-circuited the loop above.
5156 		 */
5157 		uint64_t idle_secondary_map = (pset->cpu_state_map[PROCESSOR_IDLE] &
5158 		    ~pset->primary_map &
5159 		    pset->recommended_bitmask);
5160 
5161 		/* there shouldn't be a pending AST if the processor is idle */
5162 		assert((idle_secondary_map & pset->pending_AST_URGENT_cpu_mask) == 0);
5163 		assert((idle_secondary_map & pset->pending_AST_PREEMPT_cpu_mask) == 0);
5164 
5165 		for (cpuid = lsb_first(idle_secondary_map); cpuid >= 0; cpuid = lsb_next(idle_secondary_map, cpuid)) {
5166 			processor = processor_array[cpuid];
5167 
5168 			processor_t cprimary = processor->processor_primary;
5169 
5170 			integer_t primary_pri = cprimary->current_pri;
5171 
5172 			/*
5173 			 * TODO: This should also make the same decisions
5174 			 * as secondary_can_run_realtime_thread
5175 			 *
5176 			 * TODO: Keep track of the pending preemption priority
5177 			 * of the primary to make this more accurate.
5178 			 */
5179 
5180 			/* If the primary is running a no-smt thread, then don't choose its secondary */
5181 			if (cprimary->state == PROCESSOR_RUNNING &&
5182 			    processor_active_thread_no_smt(cprimary)) {
5183 				continue;
5184 			}
5185 
5186 			/*
5187 			 * Find the idle secondary processor with the lowest priority primary
5188 			 *
5189 			 * We will choose this processor as a fallback if we find no better
5190 			 * primary to preempt.
5191 			 */
5192 			if (primary_pri < lowest_idle_secondary_priority) {
5193 				lp_idle_secondary_processor = processor;
5194 				lowest_idle_secondary_priority = primary_pri;
5195 			}
5196 
5197 		/* Find the lowest priority active primary with an idle secondary */
5198 			if (primary_pri < lowest_unpaired_primary_priority) {
5199 				/* If the primary processor is offline or starting up, it's not a candidate for this path */
5200 				if (cprimary->state != PROCESSOR_RUNNING &&
5201 				    cprimary->state != PROCESSOR_DISPATCHING) {
5202 					continue;
5203 				}
5204 
5205 				if (!cprimary->is_recommended) {
5206 					continue;
5207 				}
5208 
5209 				/* if the primary is pending preemption, don't try to re-preempt it */
5210 				if (bit_test(pset->pending_AST_URGENT_cpu_mask, cprimary->cpu_id)) {
5211 					continue;
5212 				}
5213 
5214 				if (SCHED(priority_is_urgent)(thread->sched_pri) == FALSE &&
5215 				    bit_test(pset->pending_AST_PREEMPT_cpu_mask, cprimary->cpu_id)) {
5216 					continue;
5217 				}
5218 
5219 				lowest_unpaired_primary_priority = primary_pri;
5220 				lp_unpaired_primary_processor = cprimary;
5221 			}
5222 		}
5223 
5224 		/*
5225 		 * We prefer preempting a primary processor over waking up its secondary.
5226 		 * The secondary will then be woken up by the preempted thread.
5227 		 */
5228 		if (thread->sched_pri > lowest_unpaired_primary_priority) {
5229 			pset->last_chosen = lp_unpaired_primary_processor->cpu_id;
5230 			return lp_unpaired_primary_processor;
5231 		}
5232 
5233 		/*
5234 		 * We prefer preempting a lower priority active processor over directly
5235 		 * waking up an idle secondary.
5236 		 * The preempted thread will then find the idle secondary.
5237 		 */
5238 		if (thread->sched_pri > lowest_priority) {
5239 			pset->last_chosen = lp_processor->cpu_id;
5240 			return lp_processor;
5241 		}
5242 
5243 		/*
5244 		 * lc_processor is used to indicate the best processor set run queue
5245 		 * on which to enqueue a thread when all available CPUs are busy with
5246 		 * higher priority threads, so try to make sure it is initialized.
5247 		 */
5248 		if (lc_processor == PROCESSOR_NULL) {
5249 			cpumap_t available_map = pset_available_cpumap(pset);
5250 			cpuid = lsb_first(available_map);
5251 			if (cpuid >= 0) {
5252 				lc_processor = processor_array[cpuid];
5253 				lowest_count = SCHED(processor_runq_count)(lc_processor);
5254 			}
5255 		}
5256 
5257 		/*
5258 		 * Move onto the next processor set.
5259 		 *
5260 		 * If all primary processors in this pset are running a higher
5261 		 * priority thread, move on to next pset. Only when we have
5262 		 * exhausted the search for primary processors do we
5263 		 * fall back to secondaries.
5264 		 */
5265 #if CONFIG_SCHED_EDGE
5266 		/*
5267 		 * The edge scheduler expects a CPU to be selected from the pset it passed in
5268 		 * as the starting pset for non-RT workloads. The edge migration algorithm
5269 		 * should already have considered idle CPUs and loads to decide the starting_pset;
5270 		 * which means that this loop can be short-circuted.
5271  * which means that this loop can be short-circuited.
5272 		nset = starting_pset;
5273 #else /* CONFIG_SCHED_EDGE */
5274 		nset = next_pset(pset);
5275 #endif /* CONFIG_SCHED_EDGE */
5276 
5277 		if (nset != starting_pset) {
5278 			pset = change_locked_pset(pset, nset);
5279 		}
5280 	} while (nset != starting_pset);
5281 
5282 	/*
5283 	 * Make sure that we pick a running processor,
5284 	 * and that the correct processor set is locked.
5285 	 * Since we may have unlocked the candidate processor's
5286 	 * pset, it may have changed state.
5287 	 *
5288 	 * All primary processors are running a higher priority
5289 	 * thread, so the only options left are enqueuing on
5290  * the secondary processor that would perturb the lowest-priority
5291 	 * primary, or the least busy primary.
5292 	 */
5293 
5294 	/* lowest_priority is evaluated in the main loops above */
5295 	if (lp_idle_secondary_processor != PROCESSOR_NULL) {
5296 		processor = lp_idle_secondary_processor;
5297 	} else if (lp_paired_secondary_processor != PROCESSOR_NULL) {
5298 		processor = lp_paired_secondary_processor;
5299 	} else if (lc_processor != PROCESSOR_NULL) {
5300 		processor = lc_processor;
5301 	} else {
5302 		processor = PROCESSOR_NULL;
5303 	}
5304 
5305 	if (processor) {
5306 		pset = change_locked_pset(pset, processor->processor_set);
5307 		/* Check that chosen processor is still usable */
5308 		cpumap_t available_map = pset_available_cpumap(pset);
5309 		if (bit_test(available_map, processor->cpu_id)) {
5310 			pset->last_chosen = processor->cpu_id;
5311 			return processor;
5312 		}
5313 
5314 		/* processor is no longer usable */
5315 		processor = PROCESSOR_NULL;
5316 	}
5317 
5318 	pset_assert_locked(pset);
5319 	pset_unlock(pset);
5320 	return PROCESSOR_NULL;
5321 }
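/*
 * Annotation: for non-realtime threads, choose_processor() settles on a CPU
 * in this preference order:
 *   1. an idle primary, in pset traversal order
 *   2. the lowest-priority unpaired primary the thread outranks
 *   3. the lowest-priority active processor the thread outranks
 *   4. the idle secondary whose primary has the lowest priority
 *   5. the paired secondary whose primary has the lowest priority
 *   6. the least-loaded run queue (lc_processor)
 *   7. PROCESSOR_NULL, leaving thread_setrun() to retry or fall back
 */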
5322 
5323 /*
5324  * Default implementation of SCHED(choose_node)()
5325  * for single node systems
5326  */
5327 pset_node_t
5328 sched_choose_node(__unused thread_t thread)
5329 {
5330 	return &pset_node0;
5331 }
5332 
5333 /*
5334  *	choose_starting_pset:
5335  *
5336  *	Choose a starting processor set for the thread.
5337  *	May return a processor hint within the pset.
5338  *
5339  *	Returns a starting processor set, to be used by
5340  *      choose_processor.
5341  *
5342  *	The thread must be locked.  The resulting pset is unlocked on return,
5343  *      and is chosen without taking any pset locks.
5344  */
5345 processor_set_t
5346 choose_starting_pset(pset_node_t node, thread_t thread, processor_t *processor_hint)
5347 {
5348 	processor_set_t pset;
5349 	processor_t processor = PROCESSOR_NULL;
5350 
5351 	if (thread->affinity_set != AFFINITY_SET_NULL) {
5352 		/*
5353 		 * Use affinity set policy hint.
5354 		 */
5355 		pset = thread->affinity_set->aset_pset;
5356 	} else if (thread->last_processor != PROCESSOR_NULL) {
5357 		/*
5358 		 *	Simple (last processor) affinity case.
5359 		 */
5360 		processor = thread->last_processor;
5361 		pset = processor->processor_set;
5362 	} else {
5363 		/*
5364 		 *	No Affinity case:
5365 		 *
5366 		 *	Utilize a per-task hint to spread threads
5367 		 *	among the available processor sets.
5368 		 * NRG this seems like the wrong thing to do.
5369 		 * See also task->pset_hint = pset in thread_setrun()
5370 		 */
5371 		pset = get_threadtask(thread)->pset_hint;
5372 		if (pset == PROCESSOR_SET_NULL) {
5373 			pset = current_processor()->processor_set;
5374 		}
5375 
5376 		pset = choose_next_pset(pset);
5377 	}
5378 
5379 	if (!bit_test(node->pset_map, pset->pset_id)) {
5380 		/* pset is not from this node so choose one that is */
5381 		int id = lsb_first(node->pset_map);
5382 		if (id < 0) {
5383 			/* startup race, so check again under the node lock */
5384 			lck_spin_lock(&pset_node_lock);
5385 			if (bit_test(node->pset_map, pset->pset_id)) {
5386 				id = pset->pset_id;
5387 			} else {
5388 				id = lsb_first(node->pset_map);
5389 			}
5390 			lck_spin_unlock(&pset_node_lock);
5391 		}
5392 		assert(id >= 0);
5393 		pset = pset_array[id];
5394 	}
5395 
5396 	if (bit_count(node->pset_map) == 1) {
5397 		/* Only a single pset in this node */
5398 		goto out;
5399 	}
5400 
5401 	bool avoid_cpu0 = false;
5402 
5403 #if defined(__x86_64__)
5404 	if ((thread->sched_pri >= BASEPRI_RTQUEUES) && sched_avoid_cpu0) {
5405 		/* Avoid the pset containing cpu0 */
5406 		avoid_cpu0 = true;
5407 		/* Assert that cpu0 is in pset0.  I expect this to be true on __x86_64__ */
5408 		assert(bit_test(pset_array[0]->cpu_bitmask, 0));
5409 	}
5410 #endif
5411 
5412 	if (thread->sched_pri >= BASEPRI_RTQUEUES) {
5413 		pset_map_t rt_target_map = atomic_load(&node->pset_non_rt_primary_map);
5414 		if ((avoid_cpu0 && pset->pset_id == 0) || !bit_test(rt_target_map, pset->pset_id)) {
5415 			if (avoid_cpu0) {
5416 				rt_target_map = bit_ror64(rt_target_map, 1);
5417 			}
5418 			int rotid = lsb_first(rt_target_map);
5419 			if (rotid >= 0) {
5420 				int id = avoid_cpu0 ? ((rotid + 1) & 63) : rotid;
5421 				pset = pset_array[id];
5422 				goto out;
5423 			}
5424 		}
5425 		if (!pset->is_SMT || !sched_allow_rt_smt) {
5426 			/* All psets are full of RT threads - fall back to choose processor to find the furthest deadline RT thread */
5427 			goto out;
5428 		}
5429 		rt_target_map = atomic_load(&node->pset_non_rt_map);
5430 		if ((avoid_cpu0 && pset->pset_id == 0) || !bit_test(rt_target_map, pset->pset_id)) {
5431 			if (avoid_cpu0) {
5432 				rt_target_map = bit_ror64(rt_target_map, 1);
5433 			}
5434 			int rotid = lsb_first(rt_target_map);
5435 			if (rotid >= 0) {
5436 				int id = avoid_cpu0 ? ((rotid + 1) & 63) : rotid;
5437 				pset = pset_array[id];
5438 				goto out;
5439 			}
5440 		}
5441 		/* All psets are full of RT threads - fall back to choose processor to find the furthest deadline RT thread */
5442 	} else {
5443 		pset_map_t idle_map = atomic_load(&node->pset_idle_map);
5444 		if (!bit_test(idle_map, pset->pset_id)) {
5445 			int next_idle_pset_id = lsb_first(idle_map);
5446 			if (next_idle_pset_id >= 0) {
5447 				pset = pset_array[next_idle_pset_id];
5448 			}
5449 		}
5450 	}
5451 
5452 out:
5453 	if ((processor != PROCESSOR_NULL) && (processor->processor_set != pset)) {
5454 		processor = PROCESSOR_NULL;
5455 	}
5456 	if (processor != PROCESSOR_NULL) {
5457 		*processor_hint = processor;
5458 	}
5459 
5460 	assert(pset != NULL);
5461 	return pset;
5462 }
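/*
 * Annotation: the avoid_cpu0 rotation above is a bit trick.  A worked
 * example with rt_target_map = 0b011 (psets 0 and 1 eligible):
 *
 *	bit_ror64(0b011, 1)	// pset 0's bit moves to bit 63, pset 1's to bit 0
 *	rotid = lsb_first(...)	// 0
 *	id = (rotid + 1) & 63	// 1, so pset 1 is chosen ahead of pset 0
 *
 * The pset containing cpu0 stays usable, but becomes the last choice
 * instead of the first.
 */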
5463 
5464 /*
5465  *	thread_setrun:
5466  *
5467  *	Dispatch thread for execution, onto an idle
5468  *	processor or run queue, and signal a preemption
5469  *	as appropriate.
5470  *
5471  *	Thread must be locked.
5472  */
5473 void
5474 thread_setrun(
5475 	thread_t                        thread,
5476 	sched_options_t                 options)
5477 {
5478 	processor_t                     processor = PROCESSOR_NULL;
5479 	processor_set_t         pset;
5480 
5481 	assert((thread->state & (TH_RUN | TH_WAIT | TH_UNINT | TH_TERMINATE | TH_TERMINATE2)) == TH_RUN);
5482 	assert(thread->runq == PROCESSOR_NULL);
5483 
5484 #if CONFIG_PREADOPT_TG
5485 	/* We know that the thread is not in the runq by virtue of being in this
5486 	 * function and the thread is not self since we are running. We can safely
5487 	 * resolve the thread group hierarchy and modify the thread's thread group
5488 	 * here. */
5489 	thread_resolve_and_enforce_thread_group_hierarchy_if_needed(thread);
5490 #endif
5491 
5492 	/*
5493 	 *	Update priority if needed.
5494 	 */
5495 	if (SCHED(can_update_priority)(thread)) {
5496 		SCHED(update_priority)(thread);
5497 	}
5498 	thread->sfi_class = sfi_thread_classify(thread);
5499 
5500 	if (thread->bound_processor == PROCESSOR_NULL) {
5501 		/*
5502 		 * Unbound case.
5503 		 *
5504 		 * Usually, this loop will only be executed once,
5505 		 * but if CLPC derecommends a processor after it has been chosen,
5506 		 * or if a processor is shut down after it is chosen,
5507 		 * choose_processor() may return NULL, so a retry
5508 		 * may be necessary.  A single retry will usually
5509 		 * be enough, and we can't afford to retry too many times
5510 		 * because interrupts are disabled.
5511 		 */
5512 #define CHOOSE_PROCESSOR_MAX_RETRIES 3
5513 		for (int retry = 0; retry <= CHOOSE_PROCESSOR_MAX_RETRIES; retry++) {
5514 			processor_t processor_hint = PROCESSOR_NULL;
5515 			pset_node_t node = SCHED(choose_node)(thread);
5516 			processor_set_t starting_pset = choose_starting_pset(node, thread, &processor_hint);
5517 
5518 			pset_lock(starting_pset);
5519 
5520 			processor = SCHED(choose_processor)(starting_pset, processor_hint, thread);
5521 			if (processor != PROCESSOR_NULL) {
5522 				pset = processor->processor_set;
5523 				pset_assert_locked(pset);
5524 				break;
5525 			}
5526 		}
5527 		/*
5528 		 * If choose_processor() still returns NULL,
5529 		 * which is very unlikely,
5530 		 * choose the master_processor, which is always
5531 		 * safe to choose.
5532 		 */
5533 		if (processor == PROCESSOR_NULL) {
5534 			/* Choose fallback processor */
5535 			processor = master_processor;
5536 			pset = processor->processor_set;
5537 			pset_lock(pset);
5538 		}
5539 		task_t task = get_threadtask(thread);
5540 		if (!(task->t_flags & TF_USE_PSET_HINT_CLUSTER_TYPE)) {
5541 			task->pset_hint = pset; /* NRG this is done without holding the task lock */
5542 		}
5543 		SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR) | DBG_FUNC_NONE,
5544 		    (uintptr_t)thread_tid(thread), (uintptr_t)-1, processor->cpu_id, processor->state, 0);
5545 	} else {
5546 		/*
5547 		 *	Bound case:
5548 		 *
5549 		 *	Unconditionally dispatch on the processor.
5550 		 */
5551 		processor = thread->bound_processor;
5552 		pset = processor->processor_set;
5553 		pset_lock(pset);
5554 
5555 		SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR) | DBG_FUNC_NONE,
5556 		    (uintptr_t)thread_tid(thread), (uintptr_t)-2, processor->cpu_id, processor->state, 0);
5557 	}
5558 
5559 	/*
5560 	 *	Dispatch the thread on the chosen processor.
5561 	 *	TODO: This should be based on sched_mode, not sched_pri
5562 	 */
5563 	if (thread->sched_pri >= BASEPRI_RTQUEUES) {
5564 		realtime_setrun(processor, thread);
5565 	} else {
5566 		processor_setrun(processor, thread, options);
5567 	}
5568 	/* pset is now unlocked */
5569 	if (thread->bound_processor == PROCESSOR_NULL) {
5570 		SCHED(check_spill)(pset, thread);
5571 	}
5572 }
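/*
 * Annotation: the unbound path in thread_setrun() is "choose, then lock,
 * then verify".  choose_starting_pset() runs without pset locks, so the
 * chosen processor can be derecommended or shut down before the pset lock
 * is taken; that is why choose_processor() may return PROCESSOR_NULL and
 * why the loop retries a bounded number of times (interrupts are disabled)
 * before falling back to master_processor.
 */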
5573 
5574 processor_set_t
5575 task_choose_pset(
5576 	task_t          task)
5577 {
5578 	processor_set_t         pset = task->pset_hint;
5579 
5580 	if (pset != PROCESSOR_SET_NULL) {
5581 		pset = choose_next_pset(pset);
5582 	}
5583 
5584 	return pset;
5585 }
5586 
5587 /*
5588  *	Check for a preemption point in
5589  *	the current context.
5590  *
5591  *	Called at splsched with thread locked.
5592  */
5593 ast_t
5594 csw_check(
5595 	thread_t                thread,
5596 	processor_t             processor,
5597 	ast_t                   check_reason)
5598 {
5599 	processor_set_t pset = processor->processor_set;
5600 
5601 	assert(thread == processor->active_thread);
5602 
5603 	pset_lock(pset);
5604 
5605 	processor_state_update_from_thread(processor, thread, true);
5606 
5607 	ast_t preempt = csw_check_locked(thread, processor, pset, check_reason);
5608 
5609 	/* Acknowledge the IPI if we decided not to preempt */
5610 
5611 	if ((preempt & AST_URGENT) == 0) {
5612 		if (bit_clear_if_set(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
5613 			KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_END, processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, 0, 8);
5614 		}
5615 	}
5616 
5617 	if ((preempt & AST_PREEMPT) == 0) {
5618 		bit_clear(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
5619 	}
5620 
5621 	pset_unlock(pset);
5622 
5623 	return preempt;
5624 }
5625 
5626 /*
5627  * Check for preemption at splsched with
5628  * pset and thread locked
5629  */
5630 ast_t
5631 csw_check_locked(
5632 	thread_t                thread,
5633 	processor_t             processor,
5634 	processor_set_t         pset,
5635 	ast_t                   check_reason)
5636 {
5637 	/*
5638 	 * If the current thread is running on a processor that is no longer recommended,
5639 	 * urgently preempt it, at which point thread_select() should
5640 	 * try to idle the processor and re-dispatch the thread to a recommended processor.
5641 	 */
5642 	if (!processor->is_recommended) {
5643 		return check_reason | AST_PREEMPT | AST_URGENT;
5644 	}
5645 
5646 	if (bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) {
5647 		return check_reason | AST_PREEMPT | AST_URGENT;
5648 	}
5649 
5650 	if (rt_runq_count(pset) > 0) {
5651 		if ((rt_runq_priority(pset) > processor->current_pri) || !processor->first_timeslice) {
5652 			return check_reason | AST_PREEMPT | AST_URGENT;
5653 		} else if (deadline_add(rt_runq_earliest_deadline(pset), rt_deadline_epsilon) < processor->deadline) {
5654 			return check_reason | AST_PREEMPT | AST_URGENT;
5655 		} else {
5656 			return check_reason | AST_PREEMPT;
5657 		}
5658 	}
5659 
5660 	ast_t result = SCHED(processor_csw_check)(processor);
5661 	if (result != AST_NONE) {
5662 		return check_reason | result | (thread_is_eager_preempt(thread) ? AST_URGENT : AST_NONE);
5663 	}
5664 
5665 	/*
5666 	 * Same for avoid-processor
5667 	 *
5668 	 * TODO: Should these set AST_REBALANCE?
5669 	 */
5670 	if (SCHED(avoid_processor_enabled) && SCHED(thread_avoid_processor)(processor, thread)) {
5671 		return check_reason | AST_PREEMPT;
5672 	}
5673 
5674 	/*
5675 	 * Even though we could continue executing on this processor, a
5676 	 * secondary SMT core should try to shed load to another primary core.
5677 	 *
5678 	 * TODO: Should this do the same check that thread_select does? i.e.
5679 	 * if no bound threads target this processor, and idle primaries exist, preempt
5680 	 * The case of RT threads existing is already taken care of above
5681 	 */
5682 
5683 	if (processor->current_pri < BASEPRI_RTQUEUES &&
5684 	    processor->processor_primary != processor) {
5685 		return check_reason | AST_PREEMPT;
5686 	}
5687 
5688 	if (thread->state & TH_SUSP) {
5689 		return check_reason | AST_PREEMPT;
5690 	}
5691 
5692 #if CONFIG_SCHED_SFI
5693 	/*
5694 	 * Current thread may not need to be preempted, but maybe needs
5695 	 * an SFI wait?
5696 	 */
5697 	result = sfi_thread_needs_ast(thread, NULL);
5698 	if (result != AST_NONE) {
5699 		return check_reason | result;
5700 	}
5701 #endif
5702 
5703 	return AST_NONE;
5704 }
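/*
 * Annotation: csw_check_locked() is a decision ladder; the first matching
 * rule wins:
 *   derecommended processor          -> AST_PREEMPT | AST_URGENT
 *   pending RT spill for this CPU    -> AST_PREEMPT | AST_URGENT
 *   runnable RT work                 -> urgent if higher priority, quantum
 *                                       expired, or earlier deadline
 *   per-scheduler runq check         -> result of processor_csw_check()
 *   avoid-processor, SMT secondary,
 *   or suspended thread              -> AST_PREEMPT
 *   SFI wait needed                  -> result of sfi_thread_needs_ast()
 *   otherwise                        -> AST_NONE
 */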
5705 
5706 /*
5707  * Handle preemption IPI or IPI in response to setting an AST flag
5708  * Triggered by cause_ast_check
5709  * Called at splsched
5710  */
5711 void
5712 ast_check(processor_t processor)
5713 {
5714 	if (processor->state != PROCESSOR_RUNNING &&
5715 	    processor->state != PROCESSOR_SHUTDOWN) {
5716 		return;
5717 	}
5718 
5719 	thread_t thread = processor->active_thread;
5720 
5721 	assert(thread == current_thread());
5722 
5723 	/*
5724 	 * Pairs with task_restartable_ranges_synchronize
5725 	 */
5726 	thread_lock(thread);
5727 
5728 	thread_reset_pcs_ack_IPI(thread);
5729 
5730 	/*
5731 	 * Propagate thread ast to processor.
5732 	 * (handles IPI in response to setting AST flag)
5733 	 */
5734 	ast_propagate(thread);
5735 
5736 	/*
5737 	 * Stash the old urgency and perfctl values to find out if
5738 	 * csw_check updates them.
5739 	 */
5740 	thread_urgency_t old_urgency = processor->current_urgency;
5741 	perfcontrol_class_t old_perfctl_class = processor->current_perfctl_class;
5742 
5743 	ast_t preempt;
5744 
5745 	if ((preempt = csw_check(thread, processor, AST_NONE)) != AST_NONE) {
5746 		ast_on(preempt);
5747 	}
5748 
5749 	if (old_urgency != processor->current_urgency) {
5750 		/*
5751 		 * Urgency updates happen with the thread lock held (ugh).
5752 		 * TODO: This doesn't notice QoS changes...
5753 		 */
5754 		uint64_t urgency_param1, urgency_param2;
5755 
5756 		thread_urgency_t urgency = thread_get_urgency(thread, &urgency_param1, &urgency_param2);
5757 		thread_tell_urgency(urgency, urgency_param1, urgency_param2, 0, thread);
5758 	}
5759 
5760 	thread_unlock(thread);
5761 
5762 	if (old_perfctl_class != processor->current_perfctl_class) {
5763 		/*
5764 		 * We updated the perfctl class of this thread from another core.
5765 		 * Let CLPC know that the currently running thread has a new
5766 		 * class.
5767 		 */
5768 
5769 		machine_switch_perfcontrol_state_update(PERFCONTROL_ATTR_UPDATE,
5770 		    mach_approximate_time(), 0, thread);
5771 	}
5772 }
5773 
5774 
5775 /*
5776  *	set_sched_pri:
5777  *
5778  *	Set the scheduled priority of the specified thread.
5779  *
5780  *	This may cause the thread to change queues.
5781  *
5782  *	Thread must be locked.
5783  */
5784 void
5785 set_sched_pri(
5786 	thread_t        thread,
5787 	int16_t         new_priority,
5788 	set_sched_pri_options_t options)
5789 {
5790 	bool is_current_thread = (thread == current_thread());
5791 	bool removed_from_runq = false;
5792 	bool lazy_update = ((options & SETPRI_LAZY) == SETPRI_LAZY);
5793 
5794 	int16_t old_priority = thread->sched_pri;
5795 
5796 	/* If we're already at this priority, no need to mess with the runqueue */
5797 	if (new_priority == old_priority) {
5798 #if CONFIG_SCHED_CLUTCH
5799 		/* For the first thread in the system, the priority is correct but
5800 		 * th_sched_bucket is still TH_BUCKET_RUN. Since the clutch
5801 		 * scheduler relies on the bucket being set for all threads, update
5802 		 * its bucket here.
5803 		 */
5804 		if (thread->th_sched_bucket == TH_BUCKET_RUN) {
5805 			assert(thread == vm_pageout_scan_thread);
5806 			SCHED(update_thread_bucket)(thread);
5807 		}
5808 #endif /* CONFIG_SCHED_CLUTCH */
5809 
5810 		return;
5811 	}
5812 
5813 	if (is_current_thread) {
5814 		assert(thread->state & TH_RUN);
5815 		assert(thread->runq == PROCESSOR_NULL);
5816 	} else {
5817 		removed_from_runq = thread_run_queue_remove(thread);
5818 	}
5819 
5820 	thread->sched_pri = new_priority;
5821 
5822 #if CONFIG_SCHED_CLUTCH
5823 	/*
5824 	 * Since for the clutch scheduler, the thread's bucket determines its runq
5825 	 * in the hierarchy it is important to update the bucket when the thread
5826 	 * lock is held and the thread has been removed from the runq hierarchy.
5827 	 */
5828 	SCHED(update_thread_bucket)(thread);
5829 
5830 #endif /* CONFIG_SCHED_CLUTCH */
5831 
5832 	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHANGE_PRIORITY),
5833 	    (uintptr_t)thread_tid(thread),
5834 	    thread->base_pri,
5835 	    thread->sched_pri,
5836 	    thread->sched_usage,
5837 	    0);
5838 
5839 	if (removed_from_runq) {
5840 		thread_run_queue_reinsert(thread, SCHED_PREEMPT | SCHED_TAILQ);
5841 	} else if (is_current_thread) {
5842 		processor_t processor = thread->last_processor;
5843 		assert(processor == current_processor());
5844 
5845 		thread_urgency_t old_urgency = processor->current_urgency;
5846 
5847 		/*
5848 		 * When dropping in priority, check if the thread no longer belongs on core.
5849 		 * If a thread raises its own priority, don't aggressively rebalance it.
5850 		 * <rdar://problem/31699165>
5851 		 *
5852 		 * csw_check does a processor_state_update_from_thread, but
5853 		 * we should do our own if we're being lazy.
5854 		 */
5855 		if (!lazy_update && new_priority < old_priority) {
5856 			ast_t preempt;
5857 
5858 			if ((preempt = csw_check(thread, processor, AST_NONE)) != AST_NONE) {
5859 				ast_on(preempt);
5860 			}
5861 		} else {
5862 			processor_state_update_from_thread(processor, thread, false);
5863 		}
5864 
5865 		/*
5866 		 * set_sched_pri doesn't alter RT params. We expect direct base priority/QoS
5867 		 * class alterations from user space to occur relatively infrequently, hence
5868 		 * those are lazily handled. QoS classes have distinct priority bands, and QoS
5869 		 * inheritance is expected to involve priority changes.
5870 		 */
5871 		if (processor->current_urgency != old_urgency) {
5872 			uint64_t urgency_param1, urgency_param2;
5873 
5874 			thread_urgency_t new_urgency = thread_get_urgency(thread,
5875 			    &urgency_param1, &urgency_param2);
5876 
5877 			thread_tell_urgency(new_urgency, urgency_param1,
5878 			    urgency_param2, 0, thread);
5879 		}
5880 
5881 		/* TODO: only call this if current_perfctl_class changed */
5882 		uint64_t ctime = mach_approximate_time();
5883 		machine_thread_going_on_core(thread, processor->current_urgency, 0, 0, ctime);
5884 	} else if (thread->state & TH_RUN) {
5885 		processor_t processor = thread->last_processor;
5886 
5887 		if (!lazy_update &&
5888 		    processor != PROCESSOR_NULL &&
5889 		    processor != current_processor() &&
5890 		    processor->active_thread == thread) {
5891 			cause_ast_check(processor);
5892 		}
5893 	}
5894 }
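/*
 * Annotation: the remove/modify/reinsert pattern above is the standard way
 * to change any field that determines run-queue position.  Condensed
 * (illustrative):
 *
 *	removed = thread_run_queue_remove(thread);	// pull off runq if queued
 *	thread->sched_pri = new_priority;		// safe: not on any runq
 *	if (removed) {
 *		thread_run_queue_reinsert(thread, SCHED_PREEMPT | SCHED_TAILQ);
 *	}
 */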
5895 
5896 /*
5897  * thread_run_queue_remove_for_handoff
5898  *
5899  * Pull a thread or its (recursive) push target out of the runqueue
5900  * so that it is ready for thread_run()
5901  *
5902  * Called at splsched
5903  *
5904  * Returns the thread that was pulled or THREAD_NULL if no thread could be pulled.
5905  * This may be different than the thread that was passed in.
5906  */
5907 thread_t
5908 thread_run_queue_remove_for_handoff(thread_t thread)
5909 {
5910 	thread_t pulled_thread = THREAD_NULL;
5911 
5912 	thread_lock(thread);
5913 
5914 	/*
5915 	 * Check that the thread is not bound to a different processor,
5916 	 * that the NO_SMT flag is not set on the thread, that the cluster type of
5917 	 * the processor matches the thread's if the thread is pinned to a
5918 	 * particular cluster, and that realtime is not involved.
5919 	 *
5920 	 * Next, pull it off its run queue.  If it doesn't come, it's not eligible.
5921 	 */
5922 	processor_t processor = current_processor();
5923 	if ((thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == processor)
5924 	    && (!thread_no_smt(thread))
5925 	    && (processor->current_pri < BASEPRI_RTQUEUES)
5926 	    && (thread->sched_pri < BASEPRI_RTQUEUES)
5927 #if __AMP__
5928 	    && ((thread->th_bound_cluster_id == THREAD_BOUND_CLUSTER_NONE) ||
5929 	    processor->processor_set->pset_id == thread->th_bound_cluster_id)
5930 #endif /* __AMP__ */
5931 	    ) {
5932 		if (thread_run_queue_remove(thread)) {
5933 			pulled_thread = thread;
5934 		}
5935 	}
5936 
5937 	thread_unlock(thread);
5938 
5939 	return pulled_thread;
5940 }
5941 
5942 /*
5943  * thread_prepare_for_handoff
5944  *
5945  * Make the thread ready for handoff.
5946  * If the thread was runnable then pull it off the runq, if the thread could
5947  * not be pulled, return NULL.
5948  *
5949  * If the thread was woken up from wait for handoff, make sure it is not bound to
5950  * different processor.
5951  *
5952  * Called at splsched
5953  *
5954  * Returns the thread that was pulled or THREAD_NULL if no thread could be pulled.
5955  * This may be different than the thread that was passed in.
5956  */
5957 thread_t
5958 thread_prepare_for_handoff(thread_t thread, thread_handoff_option_t option)
5959 {
5960 	thread_t pulled_thread = THREAD_NULL;
5961 
5962 	if (option & THREAD_HANDOFF_SETRUN_NEEDED) {
5963 		processor_t processor = current_processor();
5964 		thread_lock(thread);
5965 
5966 		/*
5967 		 * Check that the thread is not bound to a different processor,
5968 		 * that the NO_SMT flag is not set on the thread, and that the cluster
5969 		 * type of the processor matches the thread's if the thread is pinned to
5970 		 * a particular cluster. Call setrun instead if the above conditions
5971 		 * are not satisfied.
5972 		 */
5973 		if ((thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == processor)
5974 		    && (!thread_no_smt(thread))
5975 #if __AMP__
5976 		    && ((thread->th_bound_cluster_id == THREAD_BOUND_CLUSTER_NONE) ||
5977 		    processor->processor_set->pset_id == thread->th_bound_cluster_id)
5978 #endif /* __AMP__ */
5979 		    ) {
5980 			pulled_thread = thread;
5981 		} else {
5982 			thread_setrun(thread, SCHED_PREEMPT | SCHED_TAILQ);
5983 		}
5984 		thread_unlock(thread);
5985 	} else {
5986 		pulled_thread = thread_run_queue_remove_for_handoff(thread);
5987 	}
5988 
5989 	return pulled_thread;
5990 }
5991 
5992 /*
5993  *	thread_run_queue_remove:
5994  *
5995  *	Remove a thread from its current run queue and
5996  *	return TRUE if successful.
5997  *
5998  *	Thread must be locked.
5999  *
6000  *	If thread->runq is PROCESSOR_NULL, the thread will not re-enter the
6001  *	run queues because the caller locked the thread.  Otherwise
6002  *	the thread is on a run queue, but could be chosen for dispatch
6003  *	and removed by another processor under a different lock, which
6004  *	will set thread->runq to PROCESSOR_NULL.
6005  *
6006  *	Hence the thread select path must not rely on anything that could
6007  *	be changed under the thread lock after calling this function,
6008  *	most importantly thread->sched_pri.
6009  */
6010 boolean_t
6011 thread_run_queue_remove(
6012 	thread_t        thread)
6013 {
6014 	boolean_t removed = FALSE;
6015 	processor_t processor = thread->runq;
6016 
6017 	if ((thread->state & (TH_RUN | TH_WAIT)) == TH_WAIT) {
6018 		/* Thread isn't runnable */
6019 		assert(thread->runq == PROCESSOR_NULL);
6020 		return FALSE;
6021 	}
6022 
6023 	if (processor == PROCESSOR_NULL) {
6024 		/*
6025 		 * The thread is either not on the runq,
6026 		 * or is in the midst of being removed from the runq.
6027 		 *
6028 		 * runq is set to NULL under the pset lock, not the thread
6029 		 * lock, so the thread may still be in the process of being dequeued
6030 		 * from the runq. It will wait in invoke for the thread lock to be
6031 		 * from the runq. It will wait in thread_invoke() for the thread lock to be
6032 		 */
6033 
6034 		return FALSE;
6035 	}
6036 
6037 	if (thread->sched_pri < BASEPRI_RTQUEUES) {
6038 		return SCHED(processor_queue_remove)(processor, thread);
6039 	}
6040 
6041 	processor_set_t pset = processor->processor_set;
6042 
6043 	pset_lock(pset);
6044 
6045 	if (thread->runq != PROCESSOR_NULL) {
6046 		/*
6047 		 *	Thread is on the RT run queue and we have a lock on
6048 		 *	that run queue.
6049 		 */
6050 		rt_runq_remove(SCHED(rt_runq)(pset), thread);
6051 		pset_update_rt_stealable_state(pset);
6052 
6053 		removed = TRUE;
6054 	}
6055 
6056 	pset_unlock(pset);
6057 
6058 	return removed;
6059 }
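/*
 * Annotation: thread->runq is cleared under the pset lock, not the thread
 * lock, so a remote dequeue can race with this function; re-checking
 * thread->runq after taking the pset lock (the RT case above) is what
 * makes the removal decisive.  Because the caller holds the thread lock,
 * a thread observed off the run queues cannot be re-enqueued until that
 * lock is dropped.
 */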
6060 
6061 /*
6062  * Put the thread back where it goes after a thread_run_queue_remove
6063  *
6064  * Thread must have been removed under the same thread lock hold
6065  * The thread must have been removed under the same thread-lock hold
6066  * thread locked, at splsched
6067  */
6068 void
6069 thread_run_queue_reinsert(thread_t thread, sched_options_t options)
6070 {
6071 	assert(thread->runq == PROCESSOR_NULL);
6072 	assert(thread->state & (TH_RUN));
6073 
6074 	thread_setrun(thread, options);
6075 }
6076 
6077 void
6078 sys_override_cpu_throttle(boolean_t enable_override)
6079 {
6080 	if (enable_override) {
6081 		cpu_throttle_enabled = 0;
6082 	} else {
6083 		cpu_throttle_enabled = 1;
6084 	}
6085 }
6086 
6087 thread_urgency_t
6088 thread_get_urgency(thread_t thread, uint64_t *arg1, uint64_t *arg2)
6089 {
6090 	uint64_t urgency_param1 = 0, urgency_param2 = 0;
6091 	task_t task = get_threadtask_early(thread);
6092 
6093 	thread_urgency_t urgency;
6094 
6095 	if (thread == NULL || task == TASK_NULL || (thread->state & TH_IDLE)) {
6096 		urgency_param1 = 0;
6097 		urgency_param2 = 0;
6098 
6099 		urgency = THREAD_URGENCY_NONE;
6100 	} else if (thread->sched_mode == TH_MODE_REALTIME) {
6101 		urgency_param1 = thread->realtime.period;
6102 		urgency_param2 = thread->realtime.deadline;
6103 
6104 		urgency = THREAD_URGENCY_REAL_TIME;
6105 	} else if (cpu_throttle_enabled &&
6106 	    (thread->sched_pri <= MAXPRI_THROTTLE) &&
6107 	    (thread->base_pri <= MAXPRI_THROTTLE)) {
6108 		/*
6109 		 * Threads that are running at low priority but are not
6110 		 * tagged with a specific QoS are separated out from
6111 		 * the "background" urgency. Performance management
6112 		 * the "background" urgency. The performance management
6113 		 * subsystem can decide either to treat these threads
6114 		 * as normal threads or to look at other signals like thermal
6115 		 */
6116 		boolean_t thread_lacks_qos = (proc_get_effective_thread_policy(thread, TASK_POLICY_QOS) == THREAD_QOS_UNSPECIFIED); //thread_has_qos_policy(thread);
6117 		boolean_t task_is_suppressed = (proc_get_effective_task_policy(task, TASK_POLICY_SUP_ACTIVE) == 0x1);
6118 
6119 		/*
6120 		 * Background urgency applied when thread priority is
6121 		 * MAXPRI_THROTTLE or lower and thread is not promoted
6122 		 * and thread has a QoS specified
6123 		 */
6124 		urgency_param1 = thread->sched_pri;
6125 		urgency_param2 = thread->base_pri;
6126 
6127 		if (thread_lacks_qos && !task_is_suppressed) {
6128 			urgency = THREAD_URGENCY_LOWPRI;
6129 		} else {
6130 			urgency = THREAD_URGENCY_BACKGROUND;
6131 		}
6132 	} else {
6133 		/* For otherwise unclassified threads, report throughput QoS parameters */
6134 		urgency_param1 = proc_get_effective_thread_policy(thread, TASK_POLICY_THROUGH_QOS);
6135 		urgency_param2 = proc_get_effective_task_policy(task, TASK_POLICY_THROUGH_QOS);
6136 		urgency = THREAD_URGENCY_NORMAL;
6137 	}
6138 
6139 	if (arg1 != NULL) {
6140 		*arg1 = urgency_param1;
6141 	}
6142 	if (arg2 != NULL) {
6143 		*arg2 = urgency_param2;
6144 	}
6145 
6146 	return urgency;
6147 }
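/*
 * Annotation: summary of the urgency classes reported above and the
 * meaning of the two out-parameters in each case:
 *   THREAD_URGENCY_NONE        idle thread or no task   (0, 0)
 *   THREAD_URGENCY_REAL_TIME   TH_MODE_REALTIME         (period, deadline)
 *   THREAD_URGENCY_LOWPRI      throttled, no QoS        (sched_pri, base_pri)
 *   THREAD_URGENCY_BACKGROUND  throttled, QoS/suppress  (sched_pri, base_pri)
 *   THREAD_URGENCY_NORMAL      everything else          (thread and task
 *                                                        throughput QoS)
 */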
6148 
6149 perfcontrol_class_t
6150 thread_get_perfcontrol_class(thread_t thread)
6151 {
6152 	/* Special case handling */
6153 	if (thread->state & TH_IDLE) {
6154 		return PERFCONTROL_CLASS_IDLE;
6155 	}
6156 
6157 	if (thread->sched_mode == TH_MODE_REALTIME) {
6158 		return PERFCONTROL_CLASS_REALTIME;
6159 	}
6160 
6161 	/* perfcontrol_class based on base_pri */
6162 	if (thread->base_pri <= MAXPRI_THROTTLE) {
6163 		return PERFCONTROL_CLASS_BACKGROUND;
6164 	} else if (thread->base_pri <= BASEPRI_UTILITY) {
6165 		return PERFCONTROL_CLASS_UTILITY;
6166 	} else if (thread->base_pri <= BASEPRI_DEFAULT) {
6167 		return PERFCONTROL_CLASS_NONUI;
6168 	} else if (thread->base_pri <= BASEPRI_FOREGROUND) {
6169 		return PERFCONTROL_CLASS_UI;
6170 	} else {
6171 		if (get_threadtask(thread) == kernel_task) {
6172 			/*
6173 			 * Classify Above UI kernel threads as PERFCONTROL_CLASS_KERNEL.
6174 			 * All other lower priority kernel threads should be treated
6175 			 * as regular threads for performance control purposes.
6176 			 */
6177 			return PERFCONTROL_CLASS_KERNEL;
6178 		}
6179 		return PERFCONTROL_CLASS_ABOVEUI;
6180 	}
6181 }
6182 
6183 /*
6184  *	This is the processor idle loop, which just looks for other threads
6185  *	to execute.  Processor idle threads invoke this without supplying a
6186  *	current thread, in order to idle without an asserted wait state.
6187  *
6188  *	Returns the next thread to execute, if one was dispatched directly.
6189  */
6190 
6191 #if 0
6192 #define IDLE_KERNEL_DEBUG_CONSTANT(...) KERNEL_DEBUG_CONSTANT(__VA_ARGS__)
6193 #else
6194 #define IDLE_KERNEL_DEBUG_CONSTANT(...) do { } while(0)
6195 #endif
6196 
6197 #if (DEVELOPMENT || DEBUG)
6198 int sched_idle_delay_cpuid = -1;
6199 #endif
6200 
6201 thread_t
6202 processor_idle(
6203 	thread_t                        thread,
6204 	processor_t                     processor)
6205 {
6206 	processor_set_t         pset = processor->processor_set;
6207 
6208 	(void)splsched();
6209 
6210 	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
6211 	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_IDLE) | DBG_FUNC_START,
6212 	    (uintptr_t)thread_tid(thread), 0, 0, 0, 0);
6213 
6214 	SCHED_STATS_INC(idle_transitions);
6215 	assert(processor->running_timers_active == false);
6216 
6217 	uint64_t ctime = mach_absolute_time();
6218 
6219 	timer_switch(&processor->system_state, ctime, &processor->idle_state);
6220 	processor->current_state = &processor->idle_state;
6221 
6222 	cpu_quiescent_counter_leave(ctime);
6223 
6224 	while (1) {
6225 		/*
6226 		 * Ensure that updates to my processor and pset state,
6227 		 * made by the IPI source processor before sending the IPI,
6228 		 * are visible on this processor now (even though we don't
6229 		 * take the pset lock yet).
6230 		 */
6231 		atomic_thread_fence(memory_order_acquire);
6232 
6233 		if (processor->state != PROCESSOR_IDLE) {
6234 			break;
6235 		}
6236 		if (bit_test(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
6237 			break;
6238 		}
6239 #if defined(CONFIG_SCHED_DEFERRED_AST)
6240 		if (bit_test(pset->pending_deferred_AST_cpu_mask, processor->cpu_id)) {
6241 			break;
6242 		}
6243 #endif
6244 		if (bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) {
6245 			break;
6246 		}
6247 
6248 		if (processor->is_recommended && (processor->processor_primary == processor)) {
6249 			if (rt_runq_count(pset)) {
6250 				break;
6251 			}
6252 		} else {
6253 			if (SCHED(processor_bound_count)(processor)) {
6254 				break;
6255 			}
6256 		}
6257 
6258 		IDLE_KERNEL_DEBUG_CONSTANT(
6259 			MACHDBG_CODE(DBG_MACH_SCHED, MACH_IDLE) | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), rt_runq_count(pset), SCHED(processor_runq_count)(processor), -1, 0);
6260 
6261 		machine_track_platform_idle(TRUE);
6262 
6263 		machine_idle();
6264 		/* returns with interrupts enabled */
6265 
6266 		machine_track_platform_idle(FALSE);
6267 
6268 #if (DEVELOPMENT || DEBUG)
6269 		if (processor->cpu_id == sched_idle_delay_cpuid) {
6270 			delay(500);
6271 		}
6272 #endif
6273 
6274 		(void)splsched();
6275 
6276 		atomic_thread_fence(memory_order_acquire);
6277 
6278 		IDLE_KERNEL_DEBUG_CONSTANT(
6279 			MACHDBG_CODE(DBG_MACH_SCHED, MACH_IDLE) | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), rt_runq_count(pset), SCHED(processor_runq_count)(processor), -2, 0);
6280 
6281 		/*
6282 		 * Check if we should call sched_timeshare_consider_maintenance() here.
6283 		 * The CPU was woken out of idle due to an interrupt and we should do the
6284 		 * call only if the processor is still idle. If the processor is non-idle,
6285 		 * the threads running on the processor would do the call as part of
6286 		 * context switching.
6287 		 */
6288 		if (processor->state == PROCESSOR_IDLE) {
6289 			sched_timeshare_consider_maintenance(mach_absolute_time());
6290 		}
6291 
6292 		if (!SCHED(processor_queue_empty)(processor)) {
6293 			/* Secondary SMT processors respond to directed wakeups
6294 			 * exclusively. Some platforms induce 'spurious' SMT wakeups.
6295 			 */
6296 			if (processor->processor_primary == processor) {
6297 				break;
6298 			}
6299 		}
6300 	}
6301 
6302 	ctime = mach_absolute_time();
6303 
6304 	timer_switch(&processor->idle_state, ctime, &processor->system_state);
6305 	processor->current_state = &processor->system_state;
6306 
6307 	cpu_quiescent_counter_join(ctime);
6308 
6309 	ast_t reason = AST_NONE;
6310 
6311 	/* We're handling all scheduling AST's */
6312 	ast_off(AST_SCHEDULING);
6313 
6314 	/*
6315 	 * thread_select will move the processor from dispatching to running,
6316 	 * or put it in idle if there's nothing to do.
6317 	 */
6318 	thread_t cur_thread = current_thread();
6319 
6320 	thread_lock(cur_thread);
6321 	thread_t new_thread = thread_select(cur_thread, processor, &reason);
6322 	thread_unlock(cur_thread);
6323 
6324 	assert(processor->running_timers_active == false);
6325 
6326 	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
6327 	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_IDLE) | DBG_FUNC_END,
6328 	    (uintptr_t)thread_tid(thread), processor->state, (uintptr_t)thread_tid(new_thread), reason, 0);
6329 
6330 	return new_thread;
6331 }
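/*
 * Annotation: shape of the idle loop above -- enter at splsched, switch
 * the processor to idle-time accounting, then loop in machine_idle()
 * until a wakeup condition breaks out: a state change, a pending urgent
 * or deferred AST, a pending RT spill, RT or bound work, or (for
 * primaries) a nonempty run queue.  On exit, accounting is switched back
 * and thread_select() picks the next thread to run.
 */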
6332 
6333 /*
6334  *	Each processor has a dedicated thread which
6335  *	executes the idle loop when there is no suitable
6336  *	previous context.
6337  *
6338  *	This continuation is entered with interrupts disabled.
6339  */
6340 void
6341 idle_thread(__assert_only void* parameter,
6342     __unused wait_result_t result)
6343 {
6344 	assert(ml_get_interrupts_enabled() == FALSE);
6345 	assert(parameter == NULL);
6346 
6347 	processor_t processor = current_processor();
6348 
6349 	/*
6350 	 * Ensure that anything running in idle context triggers
6351 	 * preemption-disabled checks.
6352 	 */
6353 	disable_preemption_without_measurements();
6354 
6355 	/*
6356 	 * Enable interrupts temporarily to handle any pending interrupts
6357 	 * or IPIs before deciding to sleep
6358 	 */
6359 	spllo();
6360 
6361 	thread_t new_thread = processor_idle(THREAD_NULL, processor);
6362 	/* returns with interrupts disabled */
6363 
6364 	enable_preemption();
6365 
6366 	if (new_thread != THREAD_NULL) {
6367 		thread_run(processor->idle_thread,
6368 		    idle_thread, NULL, new_thread);
6369 		/*NOTREACHED*/
6370 	}
6371 
6372 	thread_block(idle_thread);
6373 	/*NOTREACHED*/
6374 }
6375 
6376 kern_return_t
6377 idle_thread_create(
6378 	processor_t             processor)
6379 {
6380 	kern_return_t   result;
6381 	thread_t                thread;
6382 	spl_t                   s;
6383 	char                    name[MAXTHREADNAMESIZE];
6384 
6385 	result = kernel_thread_create(idle_thread, NULL, MAXPRI_KERNEL, &thread);
6386 	if (result != KERN_SUCCESS) {
6387 		return result;
6388 	}
6389 
6390 	snprintf(name, sizeof(name), "idle #%d", processor->cpu_id);
6391 	thread_set_thread_name(thread, name);
6392 
6393 	s = splsched();
6394 	thread_lock(thread);
6395 	thread->bound_processor = processor;
6396 	processor->idle_thread = thread;
6397 	thread->sched_pri = thread->base_pri = IDLEPRI;
6398 	thread->state = (TH_RUN | TH_IDLE);
6399 	thread->options |= TH_OPT_IDLE_THREAD;
6400 	thread->last_made_runnable_time = thread->last_basepri_change_time = mach_absolute_time();
6401 	thread_unlock(thread);
6402 	splx(s);
6403 
6404 	thread_deallocate(thread);
6405 
6406 	return KERN_SUCCESS;
6407 }
6408 
6409 /*
6410  * sched_startup:
6411  *
6412  * Kicks off scheduler services.
6413  *
6414  * Called at splsched.
6415  */
6416 void
6417 sched_startup(void)
6418 {
6419 	kern_return_t   result;
6420 	thread_t                thread;
6421 
6422 	simple_lock_init(&sched_vm_group_list_lock, 0);
6423 
6424 #if __arm__ || __arm64__
6425 	simple_lock_init(&sched_recommended_cores_lock, 0);
6426 #endif /* __arm__ || __arm64__ */
6427 
6428 	result = kernel_thread_start_priority((thread_continue_t)sched_init_thread,
6429 	    NULL, MAXPRI_KERNEL, &thread);
6430 	if (result != KERN_SUCCESS) {
6431 		panic("sched_startup");
6432 	}
6433 
6434 	thread_deallocate(thread);
6435 
6436 	assert_thread_magic(thread);
6437 
6438 	/*
6439 	 * Yield to the sched_init_thread once, to
6440 	 * initialize our own thread after being switched
6441 	 * back to.
6442 	 *
6443 	 * The current thread is the only other thread
6444 	 * active at this point.
6445 	 */
6446 	thread_block(THREAD_CONTINUE_NULL);
6447 }
6448 
6449 #if __arm64__
6450 static _Atomic uint64_t sched_perfcontrol_callback_deadline;
6451 #endif /* __arm64__ */
6452 
6453 
6454 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
6455 
6456 static volatile uint64_t                sched_maintenance_deadline;
6457 static uint64_t                         sched_tick_last_abstime;
6458 static uint64_t                         sched_tick_delta;
6459 uint64_t                                sched_tick_max_delta;
6460 
6461 
6462 /*
6463  *	sched_init_thread:
6464  *
6465  *	Perform periodic bookkeeping functions about ten
6466  *	times per second.
6467  */
6468 void
6469 sched_timeshare_maintenance_continue(void)
6470 {
6471 	uint64_t        sched_tick_ctime, late_time;
6472 
6473 	struct sched_update_scan_context scan_context = {
6474 		.earliest_bg_make_runnable_time = UINT64_MAX,
6475 		.earliest_normal_make_runnable_time = UINT64_MAX,
6476 		.earliest_rt_make_runnable_time = UINT64_MAX
6477 	};
6478 
6479 	sched_tick_ctime = mach_absolute_time();
6480 
6481 	if (__improbable(sched_tick_last_abstime == 0)) {
6482 		sched_tick_last_abstime = sched_tick_ctime;
6483 		late_time = 0;
6484 		sched_tick_delta = 1;
6485 	} else {
6486 		late_time = sched_tick_ctime - sched_tick_last_abstime;
6487 		sched_tick_delta = late_time / sched_tick_interval;
6488 		/* Ensure a delta of at least 1, since the interval could be slightly
6489 		 * smaller than the sched_tick_interval due to dispatch
6490 		 * latencies.
6491 		 */
6492 		sched_tick_delta = MAX(sched_tick_delta, 1);
6493 
6494 		/* In the event that interrupt latencies or platform
6495 		 * idle events that advanced the timebase resulted
6496 		 * in periods where no threads were dispatched,
6497 		 * cap the maximum "tick delta" at SCHED_TICK_MAX_DELTA
6498 		 * iterations.
6499 		 */
6500 		sched_tick_delta = MIN(sched_tick_delta, SCHED_TICK_MAX_DELTA);
6501 
6502 		sched_tick_last_abstime = sched_tick_ctime;
6503 		sched_tick_max_delta = MAX(sched_tick_delta, sched_tick_max_delta);
6504 	}
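	/*
	 * Worked example (illustrative): with the nominal ~100ms tick implied
	 * by the "ten times per second" cadence above, waking 1 second late
	 * gives late_time / sched_tick_interval = 10 pseudo-ticks, which is
	 * then clamped to the [1, SCHED_TICK_MAX_DELTA] range.
	 */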
6505 
6506 	scan_context.sched_tick_last_abstime = sched_tick_last_abstime;
6507 	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_MAINTENANCE) | DBG_FUNC_START,
6508 	    sched_tick_delta, late_time, 0, 0, 0);
6509 
6510 	/* Add a number of pseudo-ticks corresponding to the elapsed interval.
6511 	 * This could be greater than 1 if substantial intervals during which
6512 	 * all processors are idle occur, which is rare in practice.
6513 	 */
6514 
6515 	sched_tick += sched_tick_delta;
6516 
6517 	update_vm_info();
6518 
6519 	/*
6520 	 *  Compute various averages.
6521 	 */
6522 	compute_averages(sched_tick_delta);
6523 
6524 	/*
6525 	 *  Scan the run queues for threads that may need to be updated,
6526 	 *  and find the earliest runnable thread on the run queue
6527 	 *  in order to report its latency.
6528 	 */
6529 	SCHED(thread_update_scan)(&scan_context);
6530 
6531 	SCHED(rt_runq_scan)(&scan_context);
6532 
6533 	uint64_t ctime = mach_absolute_time();
6534 
6535 	uint64_t bg_max_latency       = (ctime > scan_context.earliest_bg_make_runnable_time) ?
6536 	    ctime - scan_context.earliest_bg_make_runnable_time : 0;
6537 
6538 	uint64_t default_max_latency  = (ctime > scan_context.earliest_normal_make_runnable_time) ?
6539 	    ctime - scan_context.earliest_normal_make_runnable_time : 0;
6540 
6541 	uint64_t realtime_max_latency = (ctime > scan_context.earliest_rt_make_runnable_time) ?
6542 	    ctime - scan_context.earliest_rt_make_runnable_time : 0;
6543 
6544 	machine_max_runnable_latency(bg_max_latency, default_max_latency, realtime_max_latency);
6545 
6546 	/*
6547 	 * Check to see if the special sched VM group needs attention.
6548 	 */
6549 	sched_vm_group_maintenance();
6550 
6551 #if __arm__ || __arm64__
6552 	/* Check to see if the recommended cores failsafe is active */
6553 	sched_recommended_cores_maintenance();
6554 #endif /* __arm__ || __arm64__ */
6555 
6556 
6557 #if DEBUG || DEVELOPMENT
6558 #if __x86_64__
6559 #include <i386/misc_protos.h>
6560 	/* Check for long-duration interrupts */
6561 	mp_interrupt_watchdog();
6562 #endif /* __x86_64__ */
6563 #endif /* DEBUG || DEVELOPMENT */
6564 
6565 	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_MAINTENANCE) | DBG_FUNC_END,
6566 	    sched_pri_shifts[TH_BUCKET_SHARE_FG], sched_pri_shifts[TH_BUCKET_SHARE_BG],
6567 	    sched_pri_shifts[TH_BUCKET_SHARE_UT], sched_pri_shifts[TH_BUCKET_SHARE_DF], 0);
6568 
6569 	assert_wait((event_t)sched_timeshare_maintenance_continue, THREAD_UNINT);
6570 	thread_block((thread_continue_t)sched_timeshare_maintenance_continue);
6571 	/*NOTREACHED*/
6572 }
6573 
6574 static uint64_t sched_maintenance_wakeups;
6575 
6576 /*
6577  * Determine if the set of routines formerly driven by a maintenance timer
6578  * must be invoked, based on a deadline comparison. Signals the scheduler
6579  * maintenance thread on deadline expiration. Must be invoked at an interval
6580  * lower than the "sched_tick_interval", currently accomplished by
6581  * invocation via the quantum expiration timer and at context switch time.
6582  * Performance matters: this routine reuses a timestamp approximating the
6583  * current absolute time received from the caller, and should perform
6584  * no more than a comparison against the deadline in the common case.
6585  */
6586 void
6587 sched_timeshare_consider_maintenance(uint64_t ctime)
6588 {
6589 	cpu_quiescent_counter_checkin(ctime);
6590 
6591 	uint64_t deadline = sched_maintenance_deadline;
6592 
6593 	if (__improbable(ctime >= deadline)) {
6594 		if (__improbable(current_thread() == sched_maintenance_thread)) {
6595 			return;
6596 		}
6597 		OSMemoryBarrier();
6598 
6599 		uint64_t ndeadline = ctime + sched_tick_interval;
6600 
6601 		if (__probable(os_atomic_cmpxchg(&sched_maintenance_deadline, deadline, ndeadline, seq_cst))) {
6602 			thread_wakeup((event_t)sched_timeshare_maintenance_continue);
6603 			sched_maintenance_wakeups++;
6604 		}
6605 	}
6606 
6607 #if !CONFIG_SCHED_CLUTCH
6608 	/*
6609 	 * Only non-clutch schedulers use the global EWMA load calculation algorithm. For the
6610 	 * clutch scheduler, the load is maintained at the thread group and bucket level.
6611 	 */
6612 	uint64_t load_compute_deadline = os_atomic_load_wide(&sched_load_compute_deadline, relaxed);
6613 
6614 	if (__improbable(load_compute_deadline && ctime >= load_compute_deadline)) {
6615 		uint64_t new_deadline = 0;
6616 		if (os_atomic_cmpxchg(&sched_load_compute_deadline, load_compute_deadline, new_deadline, relaxed)) {
6617 			compute_sched_load();
6618 			new_deadline = ctime + sched_load_compute_interval_abs;
6619 			os_atomic_store_wide(&sched_load_compute_deadline, new_deadline, relaxed);
6620 		}
6621 	}
6622 #endif /* CONFIG_SCHED_CLUTCH */
6623 
6624 #if __arm64__
6625 	uint64_t perf_deadline = os_atomic_load(&sched_perfcontrol_callback_deadline, relaxed);
6626 
6627 	if (__improbable(perf_deadline && ctime >= perf_deadline)) {
6628 		/* CAS in 0; on success, make the callback. Otherwise let the next context switch check again. */
6629 		if (os_atomic_cmpxchg(&sched_perfcontrol_callback_deadline, perf_deadline, 0, relaxed)) {
6630 			machine_perfcontrol_deadline_passed(perf_deadline);
6631 		}
6632 	}
6633 #endif /* __arm64__ */
6634 }
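/*
 * Illustrative sketch of the lock-free deadline-rearm pattern used above:
 * many CPUs check a shared one-shot deadline, and whichever CPU first
 * observes expiry wins the compare-exchange and performs the work; losers
 * do nothing and re-check at their next call. Assuming a hypothetical
 * do_periodic_work() helper:
 *
 *	uint64_t d = os_atomic_load(&deadline, relaxed);
 *	if (now >= d &&
 *	    os_atomic_cmpxchg(&deadline, d, now + interval, seq_cst)) {
 *		do_periodic_work();
 *	}
 */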
6635 
6636 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
6637 
6638 void
6639 sched_init_thread(void)
6640 {
6641 	thread_block(THREAD_CONTINUE_NULL);
6642 
6643 	thread_t thread = current_thread();
6644 
6645 	thread_set_thread_name(thread, "sched_maintenance_thread");
6646 
6647 	sched_maintenance_thread = thread;
6648 
6649 	SCHED(maintenance_continuation)();
6650 
6651 	/*NOTREACHED*/
6652 }
6653 
6654 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
6655 
6656 /*
6657  *	thread_update_scan / runq_scan:
6658  *
6659  *	Scan the run queues to account for timesharing threads
6660  *	which need to be updated.
6661  *
6662  *	Scanner runs in two passes.  Pass one squirrels likely
6663  *	threads away in an array, pass two does the update.
6664  *
6665  *	This is necessary because the run queue is locked for
6666  *	the candidate scan, but	the thread is locked for the update.
6667  *	the candidate scan, but the thread is locked for the update.
6668  *	Array should be sized to make forward progress, without
6669  *	disabling preemption for long periods.
6670  */
6671 
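/*
 * Illustrative sketch (hypothetical shape, not the actual implementation):
 * a thread_update_scan() built on these primitives alternates the two
 * passes, retrying while pass one reports a full array:
 *
 *	boolean_t restart;
 *	do {
 *		spl_t s = splsched();
 *		runq_lock();                              // hypothetical lock
 *		restart = runq_scan(runq, &scan_context); // pass one: collect
 *		runq_unlock();
 *		splx(s);
 *		thread_update_process_threads();          // pass two: update
 *	} while (restart);
 */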
6672 #define THREAD_UPDATE_SIZE              128
6673 
6674 static thread_t thread_update_array[THREAD_UPDATE_SIZE];
6675 static uint32_t thread_update_count = 0;
6676 
6677 /* Returns TRUE if thread was added, FALSE if thread_update_array is full */
6678 boolean_t
6679 thread_update_add_thread(thread_t thread)
6680 {
6681 	if (thread_update_count == THREAD_UPDATE_SIZE) {
6682 		return FALSE;
6683 	}
6684 
6685 	thread_update_array[thread_update_count++] = thread;
6686 	thread_reference(thread);
6687 	return TRUE;
6688 }
6689 
6690 void
6691 thread_update_process_threads(void)
6692 {
6693 	assert(thread_update_count <= THREAD_UPDATE_SIZE);
6694 
6695 	for (uint32_t i = 0; i < thread_update_count; i++) {
6696 		thread_t thread = thread_update_array[i];
6697 		assert_thread_magic(thread);
6698 		thread_update_array[i] = THREAD_NULL;
6699 
6700 		spl_t s = splsched();
6701 		thread_lock(thread);
6702 		if (!(thread->state & (TH_WAIT)) && thread->sched_stamp != sched_tick) {
6703 			SCHED(update_priority)(thread);
6704 		}
6705 		thread_unlock(thread);
6706 		splx(s);
6707 
6708 		thread_deallocate(thread);
6709 	}
6710 
6711 	thread_update_count = 0;
6712 }
6713 
6714 static boolean_t
6715 runq_scan_thread(
6716 	thread_t thread,
6717 	sched_update_scan_context_t scan_context)
6718 {
6719 	assert_thread_magic(thread);
6720 
6721 	if (thread->sched_stamp != sched_tick &&
6722 	    thread->sched_mode == TH_MODE_TIMESHARE) {
6723 		if (thread_update_add_thread(thread) == FALSE) {
6724 			return TRUE;
6725 		}
6726 	}
6727 
6728 	if (cpu_throttle_enabled && ((thread->sched_pri <= MAXPRI_THROTTLE) && (thread->base_pri <= MAXPRI_THROTTLE))) {
6729 		if (thread->last_made_runnable_time < scan_context->earliest_bg_make_runnable_time) {
6730 			scan_context->earliest_bg_make_runnable_time = thread->last_made_runnable_time;
6731 		}
6732 	} else {
6733 		if (thread->last_made_runnable_time < scan_context->earliest_normal_make_runnable_time) {
6734 			scan_context->earliest_normal_make_runnable_time = thread->last_made_runnable_time;
6735 		}
6736 	}
6737 
6738 	return FALSE;
6739 }
6740 
6741 /*
6742  *	Scan a runq for candidate threads.
6743  *
6744  *	Returns TRUE if retry is needed.
6745  */
6746 boolean_t
6747 runq_scan(
6748 	run_queue_t                   runq,
6749 	sched_update_scan_context_t   scan_context)
6750 {
6751 	int count       = runq->count;
6752 	int queue_index;
6753 
6754 	assert(count >= 0);
6755 
6756 	if (count == 0) {
6757 		return FALSE;
6758 	}
6759 
6760 	for (queue_index = bitmap_first(runq->bitmap, NRQS);
6761 	    queue_index >= 0;
6762 	    queue_index = bitmap_next(runq->bitmap, queue_index)) {
6763 		thread_t thread;
6764 		circle_queue_t queue = &runq->queues[queue_index];
6765 
6766 		cqe_foreach_element(thread, queue, runq_links) {
6767 			assert(count > 0);
6768 			if (runq_scan_thread(thread, scan_context) == TRUE) {
6769 				return TRUE;
6770 			}
6771 			count--;
6772 		}
6773 	}
6774 
6775 	return FALSE;
6776 }
6777 
6778 #if CONFIG_SCHED_CLUTCH
6779 
6780 boolean_t
6781 sched_clutch_timeshare_scan(
6782 	queue_t thread_queue,
6783 	uint16_t thread_count,
6784 	sched_update_scan_context_t scan_context)
6785 {
6786 	if (thread_count == 0) {
6787 		return FALSE;
6788 	}
6789 
6790 	thread_t thread;
6791 	qe_foreach_element_safe(thread, thread_queue, th_clutch_timeshare_link) {
6792 		if (runq_scan_thread(thread, scan_context) == TRUE) {
6793 			return TRUE;
6794 		}
6795 		thread_count--;
6796 	}
6797 
6798 	assert(thread_count == 0);
6799 	return FALSE;
6800 }
6801 
6802 
6803 #endif /* CONFIG_SCHED_CLUTCH */
6804 
6805 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
6806 
6807 bool
6808 thread_is_eager_preempt(thread_t thread)
6809 {
6810 	return thread->sched_flags & TH_SFLAG_EAGERPREEMPT;
6811 }
6812 
6813 void
6814 thread_set_eager_preempt(thread_t thread)
6815 {
6816 	spl_t s = splsched();
6817 	thread_lock(thread);
6818 
6819 	assert(!thread_is_eager_preempt(thread));
6820 
6821 	thread->sched_flags |= TH_SFLAG_EAGERPREEMPT;
6822 
6823 	if (thread == current_thread()) {
6824 		/* csw_check updates current_is_eagerpreempt on the processor */
6825 		ast_t ast = csw_check(thread, current_processor(), AST_NONE);
6826 
6827 		thread_unlock(thread);
6828 
6829 		if (ast != AST_NONE) {
6830 			thread_block_reason(THREAD_CONTINUE_NULL, NULL, ast);
6831 		}
6832 	} else {
6833 		processor_t last_processor = thread->last_processor;
6834 
6835 		if (last_processor != PROCESSOR_NULL &&
6836 		    last_processor->state == PROCESSOR_RUNNING &&
6837 		    last_processor->active_thread == thread) {
6838 			cause_ast_check(last_processor);
6839 		}
6840 
6841 		thread_unlock(thread);
6842 	}
6843 
6844 	splx(s);
6845 }
6846 
6847 void
6848 thread_clear_eager_preempt(thread_t thread)
6849 {
6850 	spl_t s = splsched();
6851 	thread_lock(thread);
6852 
6853 	assert(thread_is_eager_preempt(thread));
6854 
6855 	thread->sched_flags &= ~TH_SFLAG_EAGERPREEMPT;
6856 
6857 	if (thread == current_thread()) {
6858 		current_processor()->current_is_eagerpreempt = false;
6859 	}
6860 
6861 	thread_unlock(thread);
6862 	splx(s);
6863 }
6864 
6865 /*
6866  * Scheduling statistics
6867  */
6868 void
6869 sched_stats_handle_csw(processor_t processor, int reasons, int selfpri, int otherpri)
6870 {
6871 	struct sched_statistics *stats;
6872 	boolean_t to_realtime = FALSE;
6873 
6874 	stats = PERCPU_GET_RELATIVE(sched_stats, processor, processor);
6875 	stats->csw_count++;
6876 
6877 	if (otherpri >= BASEPRI_REALTIME) {
6878 		stats->rt_sched_count++;
6879 		to_realtime = TRUE;
6880 	}
6881 
6882 	if ((reasons & AST_PREEMPT) != 0) {
6883 		stats->preempt_count++;
6884 
6885 		if (selfpri >= BASEPRI_REALTIME) {
6886 			stats->preempted_rt_count++;
6887 		}
6888 
6889 		if (to_realtime) {
6890 			stats->preempted_by_rt_count++;
6891 		}
6892 	}
6893 }
6894 
6895 void
6896 sched_stats_handle_runq_change(struct runq_stats *stats, int old_count)
6897 {
6898 	uint64_t timestamp = mach_absolute_time();
6899 
6900 	stats->count_sum += (timestamp - stats->last_change_timestamp) * old_count;
6901 	stats->last_change_timestamp = timestamp;
6902 }
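/*
 * Note (illustrative): count_sum accumulates the time-weighted integral of
 * run-queue depth. A consumer can therefore recover the mean depth over an
 * interval [t0, t1] as:
 *
 *	avg_depth = (count_sum(t1) - count_sum(t0)) / (t1 - t0);
 */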
6903 
6904 /*
6905  *     For calls from assembly code
6906  */
6907 #undef thread_wakeup
6908 void
6909 thread_wakeup(
6910 	event_t         x);
6911 
6912 void
6913 thread_wakeup(
6914 	event_t         x)
6915 {
6916 	thread_wakeup_with_result(x, THREAD_AWAKENED);
6917 }
6918 
6919 boolean_t
6920 preemption_enabled(void)
6921 {
6922 	return get_preemption_level() == 0 && ml_get_interrupts_enabled();
6923 }
6924 
6925 static void
6926 sched_timer_deadline_tracking_init(void)
6927 {
6928 	nanoseconds_to_absolutetime(TIMER_DEADLINE_TRACKING_BIN_1_DEFAULT, &timer_deadline_tracking_bin_1);
6929 	nanoseconds_to_absolutetime(TIMER_DEADLINE_TRACKING_BIN_2_DEFAULT, &timer_deadline_tracking_bin_2);
6930 }
6931 
6932 #if __arm__ || __arm64__
6933 
6934 uint32_t    perfcontrol_requested_recommended_cores = ALL_CORES_RECOMMENDED;
6935 uint32_t    perfcontrol_requested_recommended_core_count = MAX_CPUS;
6936 bool        perfcontrol_failsafe_active = false;
6937 bool        perfcontrol_sleep_override = false;
6938 
6939 uint64_t    perfcontrol_failsafe_maintenance_runnable_time;
6940 uint64_t    perfcontrol_failsafe_activation_time;
6941 uint64_t    perfcontrol_failsafe_deactivation_time;
6942 
6943 /* data covering who likely caused it and how long they ran */
6944 #define FAILSAFE_NAME_LEN       33 /* (2*MAXCOMLEN)+1 from size of p_name */
6945 char        perfcontrol_failsafe_name[FAILSAFE_NAME_LEN];
6946 int         perfcontrol_failsafe_pid;
6947 uint64_t    perfcontrol_failsafe_tid;
6948 uint64_t    perfcontrol_failsafe_thread_timer_at_start;
6949 uint64_t    perfcontrol_failsafe_thread_timer_last_seen;
6950 uint32_t    perfcontrol_failsafe_recommended_at_trigger;
6951 
6952 /*
6953  * Perf controller calls here to update the recommended core bitmask.
6954  * If the failsafe is active, we don't immediately apply the new value.
6955  * Instead, we store the new request and use it after the failsafe deactivates.
6956  *
6957  * If the failsafe is not active, immediately apply the update.
6958  *
6959  * No scheduler locks are held, no other locks are held that scheduler might depend on,
6960  * interrupts are enabled
6961  *
6962  * currently prototype is in osfmk/arm/machine_routines.h
6963  */
6964 void
6965 sched_perfcontrol_update_recommended_cores(uint32_t recommended_cores)
6966 {
6967 	assert(preemption_enabled());
6968 
6969 	spl_t s = splsched();
6970 	simple_lock(&sched_recommended_cores_lock, LCK_GRP_NULL);
6971 
6972 	perfcontrol_requested_recommended_cores = recommended_cores;
6973 	perfcontrol_requested_recommended_core_count = __builtin_popcountll(recommended_cores);
6974 
6975 	if ((perfcontrol_failsafe_active == false) && (perfcontrol_sleep_override == false)) {
6976 		sched_update_recommended_cores(perfcontrol_requested_recommended_cores & usercontrol_requested_recommended_cores);
6977 	} else {
6978 		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
6979 		    MACHDBG_CODE(DBG_MACH_SCHED, MACH_REC_CORES_FAILSAFE) | DBG_FUNC_NONE,
6980 		    perfcontrol_requested_recommended_cores,
6981 		    sched_maintenance_thread->last_made_runnable_time, 0, 0, 0);
6982 	}
6983 
6984 	simple_unlock(&sched_recommended_cores_lock);
6985 	splx(s);
6986 }
6987 
6988 void
6989 sched_override_recommended_cores_for_sleep(void)
6990 {
6991 	spl_t s = splsched();
6992 	simple_lock(&sched_recommended_cores_lock, LCK_GRP_NULL);
6993 
6994 	if (perfcontrol_sleep_override == false) {
6995 		perfcontrol_sleep_override = true;
6996 		sched_update_recommended_cores(ALL_CORES_RECOMMENDED);
6997 	}
6998 
6999 	simple_unlock(&sched_recommended_cores_lock);
7000 	splx(s);
7001 }
7002 
7003 void
7004 sched_restore_recommended_cores_after_sleep(void)
7005 {
7006 	spl_t s = splsched();
7007 	simple_lock(&sched_recommended_cores_lock, LCK_GRP_NULL);
7008 
7009 	if (perfcontrol_sleep_override == true) {
7010 		perfcontrol_sleep_override = false;
7011 		sched_update_recommended_cores(perfcontrol_requested_recommended_cores & usercontrol_requested_recommended_cores);
7012 	}
7013 
7014 	simple_unlock(&sched_recommended_cores_lock);
7015 	splx(s);
7016 }
7017 
7018 /*
7019  * Consider whether we need to activate the recommended cores failsafe
7020  *
7021  * Called from quantum timer interrupt context of a realtime thread
7022  * No scheduler locks are held, interrupts are disabled
7023  */
7024 void
7025 sched_consider_recommended_cores(uint64_t ctime, thread_t cur_thread)
7026 {
7027 	/*
7028 	 * Check if a realtime thread is starving the system
7029 	 * and bringing up non-recommended cores would help
7030 	 *
7031 	 * TODO: Is this the correct check for recommended == possible cores?
7032 	 * TODO: Validate the checks without the relevant lock are OK.
7033 	 */
7034 
7035 	if (__improbable(perfcontrol_failsafe_active == TRUE)) {
7036 		/* keep track of how long the responsible thread runs */
7037 
7038 		simple_lock(&sched_recommended_cores_lock, LCK_GRP_NULL);
7039 
7040 		if (perfcontrol_failsafe_active == TRUE &&
7041 		    cur_thread->thread_id == perfcontrol_failsafe_tid) {
7042 			perfcontrol_failsafe_thread_timer_last_seen = timer_grab(&cur_thread->user_timer) +
7043 			    timer_grab(&cur_thread->system_timer);
7044 		}
7045 
7046 		simple_unlock(&sched_recommended_cores_lock);
7047 
7048 		/* we're already trying to solve the problem, so bail */
7049 		return;
7050 	}
7051 
7052 	/* The failsafe won't help if there are no more processors to enable */
7053 	if (__probable(perfcontrol_requested_recommended_core_count >= processor_count)) {
7054 		return;
7055 	}
7056 
7057 	uint64_t too_long_ago = ctime - perfcontrol_failsafe_starvation_threshold;
7058 
7059 	/* Use the maintenance thread as our canary in the coal mine */
7060 	thread_t m_thread = sched_maintenance_thread;
7061 
7062 	/* If it doesn't look bad, nothing to see here */
7063 	if (__probable(m_thread->last_made_runnable_time >= too_long_ago)) {
7064 		return;
7065 	}
7066 
7067 	/* It looks bad, take the lock to be sure */
7068 	thread_lock(m_thread);
7069 
7070 	if (m_thread->runq == PROCESSOR_NULL ||
7071 	    (m_thread->state & (TH_RUN | TH_WAIT)) != TH_RUN ||
7072 	    m_thread->last_made_runnable_time >= too_long_ago) {
7073 		/*
7074 		 * Maintenance thread is either on cpu or blocked, and
7075 		 * therefore wouldn't benefit from more cores
7076 		 */
7077 		thread_unlock(m_thread);
7078 		return;
7079 	}
7080 
7081 	uint64_t maintenance_runnable_time = m_thread->last_made_runnable_time;
7082 
7083 	thread_unlock(m_thread);
7084 
7085 	/*
7086 	 * There are cores disabled at perfcontrol's recommendation, but the
7087 	 * system is so overloaded that the maintenance thread can't run.
7088 	 * That likely means that perfcontrol can't run either, so it can't fix
7089 	 * the recommendation.  We have to kick in a failsafe to keep from starving.
7090 	 *
7091 	 * When the maintenance thread has been starved for too long,
7092 	 * ignore the recommendation from perfcontrol and light up all the cores.
7093 	 *
7094 	 * TODO: Consider weird states like boot, sleep, or debugger
7095 	 */
7096 
7097 	simple_lock(&sched_recommended_cores_lock, LCK_GRP_NULL);
7098 
7099 	if (perfcontrol_failsafe_active == TRUE) {
7100 		simple_unlock(&sched_recommended_cores_lock);
7101 		return;
7102 	}
7103 
7104 	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
7105 	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_REC_CORES_FAILSAFE) | DBG_FUNC_START,
7106 	    perfcontrol_requested_recommended_cores, maintenance_runnable_time, 0, 0, 0);
7107 
7108 	perfcontrol_failsafe_active = TRUE;
7109 	perfcontrol_failsafe_activation_time = mach_absolute_time();
7110 	perfcontrol_failsafe_maintenance_runnable_time = maintenance_runnable_time;
7111 	perfcontrol_failsafe_recommended_at_trigger = perfcontrol_requested_recommended_cores;
7112 
7113 	/* Capture some data about who screwed up (assuming that the thread on core is at fault) */
7114 	task_t task = get_threadtask(cur_thread);
7115 	perfcontrol_failsafe_pid = task_pid(task);
7116 	strlcpy(perfcontrol_failsafe_name, proc_name_address(task->bsd_info), sizeof(perfcontrol_failsafe_name));
7117 
7118 	perfcontrol_failsafe_tid = cur_thread->thread_id;
7119 
7120 	/* Blame the thread for time it has run recently */
7121 	uint64_t recent_computation = (ctime - cur_thread->computation_epoch) + cur_thread->computation_metered;
7122 
7123 	uint64_t last_seen = timer_grab(&cur_thread->user_timer) + timer_grab(&cur_thread->system_timer);
7124 
7125 	/* Compute the start time of the bad behavior in terms of the thread's on core time */
7126 	perfcontrol_failsafe_thread_timer_at_start  = last_seen - recent_computation;
7127 	perfcontrol_failsafe_thread_timer_last_seen = last_seen;
7128 
7129 	/* Ignore the previously recommended core configuration */
7130 	sched_update_recommended_cores(ALL_CORES_RECOMMENDED);
7131 
7132 	simple_unlock(&sched_recommended_cores_lock);
7133 }
7134 
7135 /*
7136  * Now that our bacon has been saved by the failsafe, consider whether to turn it off
7137  *
7138  * Runs in the context of the maintenance thread, no locks held
7139  */
7140 static void
7141 sched_recommended_cores_maintenance(void)
7142 {
7143 	/* Common case - no failsafe, nothing to be done here */
7144 	if (__probable(perfcontrol_failsafe_active == FALSE)) {
7145 		return;
7146 	}
7147 
7148 	uint64_t ctime = mach_absolute_time();
7149 
7150 	boolean_t print_diagnostic = FALSE;
7151 	char p_name[FAILSAFE_NAME_LEN] = "";
7152 
7153 	spl_t s = splsched();
7154 	simple_lock(&sched_recommended_cores_lock, LCK_GRP_NULL);
7155 
7156 	/* Check again, under the lock, to avoid races */
7157 	if (perfcontrol_failsafe_active == FALSE) {
7158 		goto out;
7159 	}
7160 
7161 	/*
7162 	 * Ensure that the other cores get another few ticks to run some threads.
7163 	 * If we don't have this hysteresis, the maintenance thread is the first
7164 	 * to run, and then it immediately kills the other cores.
7165 	 */
7166 	if ((ctime - perfcontrol_failsafe_activation_time) < perfcontrol_failsafe_starvation_threshold) {
7167 		goto out;
7168 	}
7169 
7170 	/* Capture some diagnostic state under the lock so we can print it out later */
7171 
7172 	int      pid = perfcontrol_failsafe_pid;
7173 	uint64_t tid = perfcontrol_failsafe_tid;
7174 
7175 	uint64_t thread_usage       = perfcontrol_failsafe_thread_timer_last_seen -
7176 	    perfcontrol_failsafe_thread_timer_at_start;
7177 	uint32_t rec_cores_before   = perfcontrol_failsafe_recommended_at_trigger;
7178 	uint32_t rec_cores_after    = perfcontrol_requested_recommended_cores;
7179 	uint64_t failsafe_duration  = ctime - perfcontrol_failsafe_activation_time;
7180 	strlcpy(p_name, perfcontrol_failsafe_name, sizeof(p_name));
7181 
7182 	print_diagnostic = TRUE;
7183 
7184 	/* Deactivate the failsafe and reinstate the requested recommendation settings */
7185 
7186 	perfcontrol_failsafe_deactivation_time = ctime;
7187 	perfcontrol_failsafe_active = FALSE;
7188 
7189 	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
7190 	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_REC_CORES_FAILSAFE) | DBG_FUNC_END,
7191 	    perfcontrol_requested_recommended_cores, failsafe_duration, 0, 0, 0);
7192 
7193 	sched_update_recommended_cores(perfcontrol_requested_recommended_cores & usercontrol_requested_recommended_cores);
7194 
7195 out:
7196 	simple_unlock(&sched_recommended_cores_lock);
7197 	splx(s);
7198 
7199 	if (print_diagnostic) {
7200 		uint64_t failsafe_duration_ms = 0, thread_usage_ms = 0;
7201 
7202 		absolutetime_to_nanoseconds(failsafe_duration, &failsafe_duration_ms);
7203 		failsafe_duration_ms = failsafe_duration_ms / NSEC_PER_MSEC;
7204 
7205 		absolutetime_to_nanoseconds(thread_usage, &thread_usage_ms);
7206 		thread_usage_ms = thread_usage_ms / NSEC_PER_MSEC;
7207 
7208 		printf("recommended core failsafe kicked in for %lld ms "
7209 		    "likely due to %s[%d] thread 0x%llx spending "
7210 		    "%lld ms on cpu at realtime priority - "
7211 		    "new recommendation: 0x%x -> 0x%x\n",
7212 		    failsafe_duration_ms, p_name, pid, tid, thread_usage_ms,
7213 		    rec_cores_before, rec_cores_after);
7214 	}
7215 }
7216 
7217 #endif /* __arm__ || __arm64__ */
7218 
7219 kern_return_t
7220 sched_processor_enable(processor_t processor, boolean_t enable)
7221 {
7222 	assert(preemption_enabled());
7223 
7224 	spl_t s = splsched();
7225 	simple_lock(&sched_recommended_cores_lock, LCK_GRP_NULL);
7226 
7227 	if (enable) {
7228 		bit_set(usercontrol_requested_recommended_cores, processor->cpu_id);
7229 	} else {
7230 		bit_clear(usercontrol_requested_recommended_cores, processor->cpu_id);
7231 	}
7232 
7233 #if __arm__ || __arm64__
7234 	if ((perfcontrol_failsafe_active == false) && (perfcontrol_sleep_override == false)) {
7235 		sched_update_recommended_cores(perfcontrol_requested_recommended_cores & usercontrol_requested_recommended_cores);
7236 	} else {
7237 		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
7238 		    MACHDBG_CODE(DBG_MACH_SCHED, MACH_REC_CORES_FAILSAFE) | DBG_FUNC_NONE,
7239 		    perfcontrol_requested_recommended_cores,
7240 		    sched_maintenance_thread->last_made_runnable_time, 0, 0, 0);
7241 	}
7242 #else /* __arm__ || __arm64__ */
7243 	sched_update_recommended_cores(usercontrol_requested_recommended_cores);
7244 #endif /* !__arm__ || __arm64__ */
7245 
7246 	simple_unlock(&sched_recommended_cores_lock);
7247 	splx(s);
7248 
7249 	return KERN_SUCCESS;
7250 }
7251 
7252 
7253 /*
7254  * Apply a new recommended cores mask to the processors it affects
7255  * Runs after considering failsafes and such
7256  *
7257  * Iterate over processors and update their ->is_recommended field.
7258  * If a processor is running, we let it drain out at its next
7259  * quantum expiration or blocking point. If a processor is idle, there
7260  * may be more work for it to do, so IPI it.
7261  *
7262  * interrupts disabled, sched_recommended_cores_lock is held
7263  */
7264 static void
7265 sched_update_recommended_cores(uint64_t recommended_cores)
7266 {
7267 	uint64_t        needs_exit_idle_mask = 0x0;
7268 
7269 	KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_UPDATE_REC_CORES) | DBG_FUNC_START,
7270 	    recommended_cores,
7271 #if __arm__ || __arm64__
7272 	    perfcontrol_failsafe_active, 0, 0);
7273 #else /* __arm__ || __arm64__ */
7274 	    0, 0, 0);
7275 #endif /* ! __arm__ || __arm64__ */
7276 
7277 	if (__builtin_popcountll(recommended_cores) == 0) {
7278 		bit_set(recommended_cores, master_processor->cpu_id); /* add boot processor or we hang */
7279 	}
7280 
7281 	/* First set recommended cores */
7282 	for (pset_node_t node = &pset_node0; node != NULL; node = node->node_list) {
7283 		for (int pset_id = lsb_first(node->pset_map); pset_id >= 0; pset_id = lsb_next(node->pset_map, pset_id)) {
7284 			processor_set_t pset = pset_array[pset_id];
7285 
7286 			cpumap_t changed_recommendations = (recommended_cores & pset->cpu_bitmask) ^ pset->recommended_bitmask;
7287 			cpumap_t newly_recommended = changed_recommendations & recommended_cores;
7288 
7289 			if (newly_recommended == 0) {
7290 				/* Nothing to do */
7291 				continue;
7292 			}
7293 
7294 			pset_lock(pset);
7295 
7296 			for (int cpu_id = lsb_first(newly_recommended); cpu_id >= 0; cpu_id = lsb_next(newly_recommended, cpu_id)) {
7297 				processor_t processor = processor_array[cpu_id];
7298 				processor->is_recommended = TRUE;
7299 				bit_set(pset->recommended_bitmask, processor->cpu_id);
7300 
7301 				if (processor->state == PROCESSOR_IDLE) {
7302 					if (processor != current_processor()) {
7303 						bit_set(needs_exit_idle_mask, processor->cpu_id);
7304 					}
7305 				}
7306 				if (processor->state != PROCESSOR_OFF_LINE) {
7307 					os_atomic_inc(&processor_avail_count_user, relaxed);
7308 					if (processor->processor_primary == processor) {
7309 						os_atomic_inc(&primary_processor_avail_count_user, relaxed);
7310 					}
7311 					SCHED(pset_made_schedulable)(processor, pset, false);
7312 				}
7313 			}
7314 			pset_update_rt_stealable_state(pset);
7315 
7316 			pset_unlock(pset);
7317 		}
7318 	}
7319 
7320 	/* Now shutdown not recommended cores */
7321 	for (pset_node_t node = &pset_node0; node != NULL; node = node->node_list) {
7322 		for (int pset_id = lsb_first(node->pset_map); pset_id >= 0; pset_id = lsb_next(node->pset_map, pset_id)) {
7323 			processor_set_t pset = pset_array[pset_id];
7324 
7325 			cpumap_t changed_recommendations = (recommended_cores & pset->cpu_bitmask) ^ pset->recommended_bitmask;
7326 			cpumap_t newly_unrecommended = changed_recommendations & ~recommended_cores;
7327 
7328 			if (newly_unrecommended == 0) {
7329 				/* Nothing to do */
7330 				continue;
7331 			}
7332 
7333 			pset_lock(pset);
7334 
7335 			for (int cpu_id = lsb_first(newly_unrecommended); cpu_id >= 0; cpu_id = lsb_next(newly_unrecommended, cpu_id)) {
7336 				processor_t processor = processor_array[cpu_id];
7337 				sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
7338 
7339 				processor->is_recommended = FALSE;
7340 				bit_clear(pset->recommended_bitmask, processor->cpu_id);
7341 				if (processor->state != PROCESSOR_OFF_LINE) {
7342 					os_atomic_dec(&processor_avail_count_user, relaxed);
7343 					if (processor->processor_primary == processor) {
7344 						os_atomic_dec(&primary_processor_avail_count_user, relaxed);
7345 					}
7346 				}
7347 				pset_update_rt_stealable_state(pset);
7348 
7349 				if ((processor->state == PROCESSOR_RUNNING) || (processor->state == PROCESSOR_DISPATCHING)) {
7350 					ipi_type = SCHED_IPI_IMMEDIATE;
7351 				}
7352 				SCHED(processor_queue_shutdown)(processor);
7353 				/* pset unlocked */
7354 
7355 				SCHED(rt_queue_shutdown)(processor);
7356 
7357 				if (ipi_type != SCHED_IPI_NONE) {
7358 					if (processor == current_processor()) {
7359 						ast_on(AST_PREEMPT);
7360 					} else {
7361 						sched_ipi_perform(processor, ipi_type);
7362 					}
7363 				}
7364 
7365 				pset_lock(pset);
7366 			}
7367 			pset_unlock(pset);
7368 		}
7369 	}
7370 
7371 #if defined(__x86_64__)
7372 	commpage_update_active_cpus();
7373 #endif
7374 	/* Issue all pending IPIs now that the pset lock has been dropped */
7375 	for (int cpuid = lsb_first(needs_exit_idle_mask); cpuid >= 0; cpuid = lsb_next(needs_exit_idle_mask, cpuid)) {
7376 		processor_t processor = processor_array[cpuid];
7377 		machine_signal_idle(processor);
7378 	}
7379 
7380 	KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_UPDATE_REC_CORES) | DBG_FUNC_END,
7381 	    needs_exit_idle_mask, 0, 0, 0);
7382 }
7383 
7384 void
7385 thread_set_options(uint32_t thopt)
7386 {
7387 	spl_t x;
7388 	thread_t t = current_thread();
7389 
7390 	x = splsched();
7391 	thread_lock(t);
7392 
7393 	t->options |= thopt;
7394 
7395 	thread_unlock(t);
7396 	splx(x);
7397 }
7398 
7399 void
7400 thread_set_pending_block_hint(thread_t thread, block_hint_t block_hint)
7401 {
7402 	thread->pending_block_hint = block_hint;
7403 }
7404 
7405 uint32_t
7406 qos_max_parallelism(int qos, uint64_t options)
7407 {
7408 	return SCHED(qos_max_parallelism)(qos, options);
7409 }
7410 
7411 uint32_t
7412 sched_qos_max_parallelism(__unused int qos, uint64_t options)
7413 {
7414 	host_basic_info_data_t hinfo;
7415 	mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT;
7416 
7417 
7418 	/*
7419 	 * QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE should be used only on AMP platforms, which
7420 	 * implement their own qos_max_parallelism() interfaces.
7421 	 */
7422 	assert((options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) == 0);
7423 
7424 	/* Query the machine layer for core information */
7425 	__assert_only kern_return_t kret = host_info(host_self(), HOST_BASIC_INFO,
7426 	    (host_info_t)&hinfo, &count);
7427 	assert(kret == KERN_SUCCESS);
7428 
7429 	if (options & QOS_PARALLELISM_COUNT_LOGICAL) {
7430 		return hinfo.logical_cpu;
7431 	} else {
7432 		return hinfo.physical_cpu;
7433 	}
7434 }
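/*
 * Illustrative usage (sketch): size a worker pool to the core count
 * recommended for a given QoS class, e.g.
 *
 *	uint32_t width = qos_max_parallelism(THREAD_QOS_USER_INITIATED,
 *	    QOS_PARALLELISM_COUNT_LOGICAL);
 *
 * With QOS_PARALLELISM_COUNT_LOGICAL this counts SMT logical CPUs;
 * without it, physical cores.
 */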
7435 
7436 int sched_allow_NO_SMT_threads = 1;
7437 bool
7438 thread_no_smt(thread_t thread)
7439 {
7440 	return sched_allow_NO_SMT_threads &&
7441 	       (thread->bound_processor == PROCESSOR_NULL) &&
7442 	       ((thread->sched_flags & TH_SFLAG_NO_SMT) || (get_threadtask(thread)->t_flags & TF_NO_SMT));
7443 }
7444 
7445 bool
7446 processor_active_thread_no_smt(processor_t processor)
7447 {
7448 	return sched_allow_NO_SMT_threads && !processor->current_is_bound && processor->current_is_NO_SMT;
7449 }
7450 
7451 #if __arm64__
7452 
7453 /*
7454  * Set up or replace old timer with new timer
7455  *
7456  * Returns true if canceled old timer, false if it did not
7457  */
7458 boolean_t
7459 sched_perfcontrol_update_callback_deadline(uint64_t new_deadline)
7460 {
7461 	/*
7462 	 * Exchange the old deadline for the new deadline; if the old deadline was
7463 	 * nonzero, we cancelled the pending callback, otherwise we didn't.
7464 	 */
7465 
7466 	return os_atomic_xchg(&sched_perfcontrol_callback_deadline, new_deadline,
7467 	           relaxed) != 0;
7468 }
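/*
 * Illustrative usage (sketch): arm the one-shot callback for some delta in
 * the future, or cancel it by passing 0. The boolean result reports whether
 * a previously armed deadline was replaced or cancelled (delta_abs is an
 * assumed caller-supplied value in absolute-time units):
 *
 *	(void)sched_perfcontrol_update_callback_deadline(
 *	    mach_absolute_time() + delta_abs);              // arm
 *	...
 *	boolean_t was_armed =
 *	    sched_perfcontrol_update_callback_deadline(0);  // cancel
 */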
7469 
7470 #endif /* __arm64__ */
7471 
7472 #if CONFIG_SCHED_EDGE
7473 
7474 #define SCHED_PSET_LOAD_EWMA_TC_NSECS 10000000u
7475 
7476 /*
7477  * sched_edge_pset_running_higher_bucket()
7478  *
7479  * Routine to calculate cumulative running counts for each scheduling
7480  * bucket. This effectively lets the load calculation determine whether a
7481  * cluster is running any threads at a QoS lower than the thread being
7482  * migrated, etc.
7483  */
7484 
7485 static void
7486 sched_edge_pset_running_higher_bucket(processor_set_t pset, uint32_t *running_higher)
7487 {
7488 	bitmap_t *active_map = &pset->cpu_state_map[PROCESSOR_RUNNING];
7489 
7490 	/* Edge Scheduler Optimization */
7491 	for (int cpu = bitmap_first(active_map, MAX_CPUS); cpu >= 0; cpu = bitmap_next(active_map, cpu)) {
7492 		sched_bucket_t cpu_bucket = os_atomic_load(&pset->cpu_running_buckets[cpu], relaxed);
7493 		for (sched_bucket_t bucket = cpu_bucket; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
7494 			running_higher[bucket]++;
7495 		}
7496 	}
7497 }
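/*
 * Worked example (illustrative): with two running CPUs, one executing a
 * TH_BUCKET_FIXPRI thread and one a TH_BUCKET_SHARE_UT thread, the FIXPRI
 * CPU increments every bucket from FIXPRI downward in importance, and the
 * UT CPU every bucket from UT downward. running_higher[] therefore reads 1
 * for buckets above UT and 2 for UT and below: each entry counts CPUs
 * running at that QoS or higher.
 */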
7498 
7499 /*
7500  * sched_update_pset_load_average()
7501  *
7502  * Updates the load average for each sched bucket for a cluster.
7503  * This routine must be called with the pset lock held.
7504  */
7505 void
7506 sched_update_pset_load_average(processor_set_t pset, uint64_t curtime)
7507 {
7508 	int avail_cpu_count = pset_available_cpu_count(pset);
7509 	if (avail_cpu_count == 0) {
7510 		/* Looks like the pset is not runnable any more; nothing to do here */
7511 		return;
7512 	}
7513 
7514 	/*
7515 	 * Edge Scheduler Optimization
7516 	 *
7517 	 * See if more callers of this routine can pass in timestamps to avoid the
7518 	 * mach_absolute_time() call here.
7519 	 */
7520 
7521 	if (!curtime) {
7522 		curtime = mach_absolute_time();
7523 	}
7524 	uint64_t last_update = os_atomic_load(&pset->pset_load_last_update, relaxed);
7525 	int64_t delta_ticks = curtime - last_update;
7526 	if (delta_ticks < 0) {
7527 		return;
7528 	}
7529 
7530 	uint64_t delta_nsecs = 0;
7531 	absolutetime_to_nanoseconds(delta_ticks, &delta_nsecs);
7532 
7533 	if (__improbable(delta_nsecs > UINT32_MAX)) {
7534 		delta_nsecs = UINT32_MAX;
7535 	}
7536 
7537 #if CONFIG_SCHED_EDGE
7538 	/* Update the shared resource load on the pset */
7539 	for (cluster_shared_rsrc_type_t shared_rsrc_type = CLUSTER_SHARED_RSRC_TYPE_MIN; shared_rsrc_type < CLUSTER_SHARED_RSRC_TYPE_COUNT; shared_rsrc_type++) {
7540 		uint64_t shared_rsrc_runnable_load = sched_edge_shared_rsrc_runnable_load(&pset->pset_clutch_root, shared_rsrc_type);
7541 		uint64_t shared_rsrc_running_load = bit_count(pset->cpu_running_cluster_shared_rsrc_thread[shared_rsrc_type]);
7542 		uint64_t new_shared_load = shared_rsrc_runnable_load + shared_rsrc_running_load;
7543 		uint64_t old_shared_load = os_atomic_xchg(&pset->pset_cluster_shared_rsrc_load[shared_rsrc_type], new_shared_load, relaxed);
7544 		if (old_shared_load != new_shared_load) {
7545 			KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_CLUSTER_SHARED_LOAD) | DBG_FUNC_NONE, pset->pset_cluster_id, shared_rsrc_type, new_shared_load, shared_rsrc_running_load);
7546 		}
7547 	}
7548 #endif /* CONFIG_SCHED_EDGE */
7549 
7550 	uint32_t running_higher[TH_BUCKET_SCHED_MAX] = {0};
7551 	sched_edge_pset_running_higher_bucket(pset, running_higher);
7552 
7553 	for (sched_bucket_t sched_bucket = TH_BUCKET_FIXPRI; sched_bucket < TH_BUCKET_SCHED_MAX; sched_bucket++) {
7554 		uint64_t old_load_average = os_atomic_load(&pset->pset_load_average[sched_bucket], relaxed);
7555 		uint64_t old_load_average_factor = old_load_average * SCHED_PSET_LOAD_EWMA_TC_NSECS;
7556 		uint32_t current_runq_depth = (sched_edge_cluster_cumulative_count(&pset->pset_clutch_root, sched_bucket) +  rt_runq_count(pset) + running_higher[sched_bucket]) / avail_cpu_count;
7557 
7558 		/*
7559 		 * For the new load average, multiply current_runq_depth by delta_nsecs (which results in a 32.0 value).
7560 		 * Since we want to maintain the load average as a 24.8 fixed-point arithmetic value for precision, the
7561 		 * new load average needs to be shifted before it can be added to the old load average.
7562 		 */
7563 		uint64_t new_load_average_factor = (current_runq_depth * delta_nsecs) << SCHED_PSET_LOAD_EWMA_FRACTION_BITS;
7564 
7565 		/*
7566 		 * For extremely parallel workloads, it is important that the load average on a cluster moves from zero to non-zero
7567 		 * instantly to allow threads to be migrated to other (potentially idle) clusters quickly. Hence use the EWMA
7568 		 * when the system is already loaded; otherwise for an idle system use the latest load average immediately.
7569 		 */
7570 		int old_load_shifted = (int)((old_load_average + SCHED_PSET_LOAD_EWMA_ROUND_BIT) >> SCHED_PSET_LOAD_EWMA_FRACTION_BITS);
7571 		boolean_t load_uptick = (old_load_shifted == 0) && (current_runq_depth != 0);
7572 		boolean_t load_downtick = (old_load_shifted != 0) && (current_runq_depth == 0);
7573 		uint64_t load_average;
7574 		if (load_uptick || load_downtick) {
7575 			load_average = (current_runq_depth << SCHED_PSET_LOAD_EWMA_FRACTION_BITS);
7576 		} else {
7577 			/* Indicates a loaded system; use EWMA for load average calculation */
7578 			load_average = (old_load_average_factor + new_load_average_factor) / (delta_nsecs + SCHED_PSET_LOAD_EWMA_TC_NSECS);
7579 		}
7580 		os_atomic_store(&pset->pset_load_average[sched_bucket], load_average, relaxed);
7581 		if (load_average != old_load_average) {
7582 			KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_LOAD_AVG) | DBG_FUNC_NONE, pset->pset_cluster_id, (load_average >> SCHED_PSET_LOAD_EWMA_FRACTION_BITS), load_average & SCHED_PSET_LOAD_EWMA_FRACTION_MASK, sched_bucket);
7583 		}
7584 	}
7585 	os_atomic_store(&pset->pset_load_last_update, curtime, relaxed);
7586 }
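/*
 * Worked example (illustrative) of the 24.8 fixed-point EWMA above: take an
 * old load average of 2.0 (0x200), current_runq_depth = 4, and a delta equal
 * to the time constant (delta_nsecs == SCHED_PSET_LOAD_EWMA_TC_NSECS == TC):
 *
 *	old_factor = 0x200 * TC
 *	new_factor = (4 * TC) << 8          == 0x400 * TC
 *	load_avg   = (old_factor + new_factor) / (TC + TC)
 *	           = (0x200 + 0x400) / 2    == 0x300  (i.e. 3.0 in 24.8)
 */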
7587 
7588 void
7589 sched_update_pset_avg_execution_time(processor_set_t pset, uint64_t execution_time, uint64_t curtime, sched_bucket_t sched_bucket)
7590 {
7591 	pset_execution_time_t old_execution_time_packed, new_execution_time_packed;
7592 	uint64_t avg_thread_execution_time = 0;
7593 
7594 	os_atomic_rmw_loop(&pset->pset_execution_time[sched_bucket].pset_execution_time_packed,
7595 	    old_execution_time_packed.pset_execution_time_packed,
7596 	    new_execution_time_packed.pset_execution_time_packed, relaxed, {
7597 		uint64_t last_update = old_execution_time_packed.pset_execution_time_last_update;
7598 		int64_t delta_ticks = curtime - last_update;
7599 		if (delta_ticks < 0) {
7600 		        /*
7601 		         * It's possible that another CPU came in and updated the pset_execution_time
7602 		         * before this CPU could do it. Since the average execution time is meant to
7603 		         * be an approximate measure per cluster, ignore the older update.
7604 		         */
7605 		        os_atomic_rmw_loop_give_up(return );
7606 		}
7607 		uint64_t delta_nsecs = 0;
7608 		absolutetime_to_nanoseconds(delta_ticks, &delta_nsecs);
7609 
7610 		uint64_t nanotime = 0;
7611 		absolutetime_to_nanoseconds(execution_time, &nanotime);
7612 		uint64_t execution_time_us = nanotime / NSEC_PER_USEC;
7613 
7614 		uint64_t old_execution_time = (old_execution_time_packed.pset_avg_thread_execution_time * SCHED_PSET_LOAD_EWMA_TC_NSECS);
7615 		uint64_t new_execution_time = (execution_time_us * delta_nsecs);
7616 
7617 		avg_thread_execution_time = (old_execution_time + new_execution_time) / (delta_nsecs + SCHED_PSET_LOAD_EWMA_TC_NSECS);
7618 		new_execution_time_packed.pset_avg_thread_execution_time = avg_thread_execution_time;
7619 		new_execution_time_packed.pset_execution_time_last_update = curtime;
7620 	});
7621 	if (new_execution_time_packed.pset_avg_thread_execution_time != old_execution_time_packed.pset_avg_thread_execution_time) {
7622 		KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PSET_AVG_EXEC_TIME) | DBG_FUNC_NONE, pset->pset_cluster_id, avg_thread_execution_time, sched_bucket);
7623 	}
7624 }
7625 
7626 uint64_t
7627 sched_pset_cluster_shared_rsrc_load(processor_set_t pset, cluster_shared_rsrc_type_t shared_rsrc_type)
7628 {
7629 	return os_atomic_load(&pset->pset_cluster_shared_rsrc_load[shared_rsrc_type], relaxed);
7630 }
7631 
7632 #else /* CONFIG_SCHED_EDGE */
7633 
7634 void
7635 sched_update_pset_load_average(processor_set_t pset, __unused uint64_t curtime)
7636 {
7637 	int non_rt_load = pset->pset_runq.count;
7638 	int load = ((bit_count(pset->cpu_state_map[PROCESSOR_RUNNING]) + non_rt_load + rt_runq_count(pset)) << PSET_LOAD_NUMERATOR_SHIFT);
7639 	int new_load_average = ((int)pset->load_average + load) >> 1;
7640 
7641 	pset->load_average = new_load_average;
7642 #if (DEVELOPMENT || DEBUG)
7643 #if __AMP__
7644 	if (pset->pset_cluster_type == PSET_AMP_P) {
7645 		KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PSET_LOAD_AVERAGE) | DBG_FUNC_NONE, sched_get_pset_load_average(pset, 0), (bit_count(pset->cpu_state_map[PROCESSOR_RUNNING]) + pset->pset_runq.count + rt_runq_count(pset)));
7646 	}
7647 #endif
7648 #endif
7649 }
7650 
7651 void
7652 sched_update_pset_avg_execution_time(__unused processor_set_t pset, __unused uint64_t execution_time, __unused uint64_t curtime, __unused sched_bucket_t sched_bucket)
7653 {
7654 }
7655 
7656 #endif /* CONFIG_SCHED_EDGE */
7657 
7658 /* pset is locked */
7659 static bool
7660 processor_is_fast_track_candidate_for_realtime_thread(processor_set_t pset, processor_t processor)
7661 {
7662 	int cpuid = processor->cpu_id;
7663 #if defined(__x86_64__)
7664 	if (sched_avoid_cpu0 && (cpuid == 0)) {
7665 		return false;
7666 	}
7667 #endif
7668 
7669 	cpumap_t fasttrack_map = pset_available_cpumap(pset) & ~pset->pending_AST_URGENT_cpu_mask & ~pset->realtime_map;
7670 
7671 	return bit_test(fasttrack_map, cpuid);
7672 }
7673 
7674 /* pset is locked */
7675 static processor_t
7676 choose_processor_for_realtime_thread(processor_set_t pset, processor_t skip_processor, bool consider_secondaries, bool skip_spills)
7677 {
7678 #if defined(__x86_64__)
7679 	bool avoid_cpu0 = sched_avoid_cpu0 && bit_test(pset->cpu_bitmask, 0);
7680 #else
7681 	const bool avoid_cpu0 = false;
7682 #endif
7683 	cpumap_t cpu_map;
7684 
7685 try_again:
7686 	cpu_map = pset_available_cpumap(pset) & ~pset->pending_AST_URGENT_cpu_mask & ~pset->realtime_map;
7687 	if (skip_processor) {
7688 		bit_clear(cpu_map, skip_processor->cpu_id);
7689 	}
7690 	if (skip_spills) {
7691 		cpu_map &= ~pset->rt_pending_spill_cpu_mask;
7692 	}
7693 
7694 	if (avoid_cpu0 && (sched_avoid_cpu0 == 2)) {
7695 		bit_clear(cpu_map, 0);
7696 	}
7697 
7698 	cpumap_t primary_map = cpu_map & pset->primary_map;
7699 	if (avoid_cpu0) {
7700 		primary_map = bit_ror64(primary_map, 1);
7701 	}
7702 
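	/*
	 * Worked example (illustrative) of the rotation trick: rotating the
	 * map right by 1 moves CPU 0's bit to bit 63, so lsb_first() prefers
	 * any other primary CPU. E.g. primary_map = 0b0011 (CPUs 0 and 1)
	 * rotates to bit 63 + bit 0; lsb_first() returns 0, and the
	 * ((rotid + 1) & 63) below maps that back to CPU 1.
	 */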
7703 	int rotid = lsb_first(primary_map);
7704 	if (rotid >= 0) {
7705 		int cpuid = avoid_cpu0 ? ((rotid + 1) & 63) : rotid;
7706 
7707 		processor_t processor = processor_array[cpuid];
7708 
7709 		return processor;
7710 	}
7711 
7712 	if (!pset->is_SMT || !sched_allow_rt_smt || !consider_secondaries) {
7713 		goto out;
7714 	}
7715 
7716 	if (avoid_cpu0 && (sched_avoid_cpu0 == 2)) {
7717 		/* Also avoid cpu1 */
7718 		bit_clear(cpu_map, 1);
7719 	}
7720 
7721 	/* Consider secondary processors whose primary is actually running a realtime thread */
7722 	cpumap_t secondary_map = cpu_map & ~pset->primary_map & (pset->realtime_map << 1);
7723 	if (avoid_cpu0) {
7724 		/* Also avoid cpu1 */
7725 		secondary_map = bit_ror64(secondary_map, 2);
7726 	}
7727 	rotid = lsb_first(secondary_map);
7728 	if (rotid >= 0) {
7729 		int cpuid = avoid_cpu0 ?  ((rotid + 2) & 63) : rotid;
7730 
7731 		processor_t processor = processor_array[cpuid];
7732 
7733 		return processor;
7734 	}
7735 
7736 	/* Consider secondary processors */
7737 	secondary_map = cpu_map & ~pset->primary_map;
7738 	if (avoid_cpu0) {
7739 		/* Also avoid cpu1 */
7740 		secondary_map = bit_ror64(secondary_map, 2);
7741 	}
7742 	rotid = lsb_first(secondary_map);
7743 	if (rotid >= 0) {
7744 		int cpuid = avoid_cpu0 ?  ((rotid + 2) & 63) : rotid;
7745 
7746 		processor_t processor = processor_array[cpuid];
7747 
7748 		return processor;
7749 	}
7750 
7751 	/*
7752 	 * I was hoping the compiler would optimize
7753 	 * this away when avoid_cpu0 is const bool false
7754 	 * but it still complains about the assignment
7755 	 * in that case.
7756 	 */
7757 	if (avoid_cpu0 && (sched_avoid_cpu0 == 2)) {
7758 #if defined(__x86_64__)
7759 		avoid_cpu0 = false;
7760 #else
7761 		assert(0);
7762 #endif
7763 		goto try_again;
7764 	}
7765 
7766 out:
7767 	if (skip_processor) {
7768 		return PROCESSOR_NULL;
7769 	}
7770 
7771 	/*
7772 	 * If we didn't find an obvious processor to choose, but there are still more CPUs
7773 	 * not already running realtime threads than realtime threads in the realtime run queue,
7774 	 * this thread belongs in this pset, so choose some other processor in this pset
7775 	 * to ensure the thread is enqueued here.
7776 	 */
7777 	cpumap_t non_realtime_map = pset_available_cpumap(pset) & pset->primary_map & ~pset->realtime_map;
7778 	if (bit_count(non_realtime_map) > rt_runq_count(pset)) {
7779 		cpu_map = non_realtime_map;
7780 		assert(cpu_map != 0);
7781 		int cpuid = bit_first(cpu_map);
7782 		assert(cpuid >= 0);
7783 		return processor_array[cpuid];
7784 	}
7785 
7786 	if (!pset->is_SMT || !sched_allow_rt_smt || !consider_secondaries) {
7787 		goto skip_secondaries;
7788 	}
7789 
7790 	non_realtime_map = pset_available_cpumap(pset) & ~pset->realtime_map;
7791 	if (bit_count(non_realtime_map) > rt_runq_count(pset)) {
7792 		cpu_map = non_realtime_map;
7793 		assert(cpu_map != 0);
7794 		int cpuid = bit_first(cpu_map);
7795 		assert(cpuid >= 0);
7796 		return processor_array[cpuid];
7797 	}
7798 
7799 skip_secondaries:
7800 	return PROCESSOR_NULL;
7801 }
7802 
7803 /*
7804  * Choose the processor with (1) the lowest priority less than max_pri and (2) the furthest deadline for that priority.
7805  * If all available processors are at max_pri, choose the furthest deadline that is greater than minimum_deadline.
7806  *
7807  * pset is locked.
7808  */
7809 static processor_t
7810 choose_furthest_deadline_processor_for_realtime_thread(processor_set_t pset, int max_pri, uint64_t minimum_deadline, processor_t skip_processor, bool skip_spills, bool include_ast_urgent_pending_cpus)
7811 {
7812 	uint64_t  furthest_deadline = deadline_add(minimum_deadline, rt_deadline_epsilon);
7813 	processor_t fd_processor = PROCESSOR_NULL;
7814 	int lowest_priority = max_pri;
7815 
7816 	cpumap_t cpu_map = pset_available_cpumap(pset) & ~pset->pending_AST_URGENT_cpu_mask;
7817 	if (skip_processor) {
7818 		bit_clear(cpu_map, skip_processor->cpu_id);
7819 	}
7820 	if (skip_spills) {
7821 		cpu_map &= ~pset->rt_pending_spill_cpu_mask;
7822 	}
7823 
7824 	for (int cpuid = bit_first(cpu_map); cpuid >= 0; cpuid = bit_next(cpu_map, cpuid)) {
7825 		processor_t processor = processor_array[cpuid];
7826 
7827 		if (processor->current_pri > lowest_priority) {
7828 			continue;
7829 		}
7830 
7831 		if (processor->current_pri < lowest_priority) {
7832 			lowest_priority = processor->current_pri;
7833 			furthest_deadline = processor->deadline;
7834 			fd_processor = processor;
7835 			continue;
7836 		}
7837 
7838 		if (processor->deadline > furthest_deadline) {
7839 			furthest_deadline = processor->deadline;
7840 			fd_processor = processor;
7841 		}
7842 	}
7843 
7844 	if (fd_processor) {
7845 		return fd_processor;
7846 	}
7847 
7848 	/*
7849 	 * There is a race condition possible when there are multiple processor sets.
7850 	 * choose_processor() takes pset lock A, sees the pending_AST_URGENT_cpu_mask set for a processor in that set and finds no suitable candidate CPU,
7851 	 * so it drops pset lock A and tries to take pset lock B.  Meanwhile the pending_AST_URGENT_cpu_mask CPU is looking for a thread to run and holds
7852 	 * pset lock B. It doesn't find any threads (because the candidate thread isn't yet on any run queue), so drops lock B, takes lock A again to clear
7853 	 * the pending_AST_URGENT_cpu_mask bit, and keeps running the current (far deadline) thread. choose_processor() now has lock B and can only find
7854 	 * the lowest count processor in set B so enqueues it on set B's run queue but doesn't IPI anyone. (The lowest count includes all threads,
7855 	 * near and far deadlines, so will prefer a low count of earlier deadlines to a high count of far deadlines, which is suboptimal for EDF scheduling.
7856 	 * To make a better choice we would need to know how many threads with earlier deadlines than the candidate thread exist on each pset's run queue.
7857 	 * But even if we chose the better run queue, we still wouldn't send an IPI in this case.)
7858 	 *
7859 	 * The mitigation is to also look for suitable CPUs that have their pending_AST_URGENT_cpu_mask bit set where there are no earlier deadline threads
7860 	 * on the run queue of that pset.
7861 	 */
	if (include_ast_urgent_pending_cpus && (rt_runq_earliest_deadline(pset) > furthest_deadline)) {
		cpu_map = pset_available_cpumap(pset) & pset->pending_AST_URGENT_cpu_mask;
		assert(skip_processor == PROCESSOR_NULL);
		assert(skip_spills == false);

		for (int cpuid = bit_first(cpu_map); cpuid >= 0; cpuid = bit_next(cpu_map, cpuid)) {
			processor_t processor = processor_array[cpuid];

			if (processor->current_pri > lowest_priority) {
				continue;
			}

			if (processor->current_pri < lowest_priority) {
				lowest_priority = processor->current_pri;
				furthest_deadline = processor->deadline;
				fd_processor = processor;
				continue;
			}

			if (processor->deadline > furthest_deadline) {
				furthest_deadline = processor->deadline;
				fd_processor = processor;
			}
		}
	}

	return fd_processor;
}

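/*
 * Choose the processor the next realtime thread should run on: first try a
 * suitable processor via choose_processor_for_realtime_thread(), skipping
 * processors with pending spills; failing that, fall back to the
 * lowest-priority, furthest-deadline processor as a preemption target.
 */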
/* pset is locked */
static processor_t
choose_next_processor_for_realtime_thread(processor_set_t pset, int max_pri, uint64_t minimum_deadline, processor_t skip_processor, bool consider_secondaries)
{
	bool skip_spills = true;
	bool include_ast_urgent_pending_cpus = false;

	processor_t next_processor = choose_processor_for_realtime_thread(pset, skip_processor, consider_secondaries, skip_spills);
	if (next_processor != PROCESSOR_NULL) {
		return next_processor;
	}

	next_processor = choose_furthest_deadline_processor_for_realtime_thread(pset, max_pri, minimum_deadline, skip_processor, skip_spills, include_ast_urgent_pending_cpus);
	return next_processor;
}

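/*
 * Helpers that estimate whether realtime demand has saturated a pset's
 * processors: the realtime run-queue count (plus sched_rt_n_backup_processors
 * when the queue is low latency and backups are requested) is compared
 * against the available processors not already running realtime threads.
 */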
#if defined(__x86_64__)
/* pset is locked */
static bool
all_available_primaries_are_running_realtime_threads(processor_set_t pset, bool include_backups)
{
	bool avoid_cpu0 = sched_avoid_cpu0 && bit_test(pset->cpu_bitmask, 0);
	int nbackup_cpus = 0;

	if (include_backups && rt_runq_is_low_latency(pset)) {
		nbackup_cpus = sched_rt_n_backup_processors;
	}

	cpumap_t cpu_map = pset_available_cpumap(pset) & pset->primary_map & ~pset->realtime_map;
	if (avoid_cpu0 && (sched_avoid_cpu0 == 2)) {
		bit_clear(cpu_map, 0);
	}
	return (rt_runq_count(pset) + nbackup_cpus) > bit_count(cpu_map);
}

/* pset is locked */
static bool
these_processors_are_running_realtime_threads(processor_set_t pset, uint64_t these_map, bool include_backups)
{
	int nbackup_cpus = 0;

	if (include_backups && rt_runq_is_low_latency(pset)) {
		nbackup_cpus = sched_rt_n_backup_processors;
	}

	cpumap_t cpu_map = pset_available_cpumap(pset) & these_map & ~pset->realtime_map;
	return (rt_runq_count(pset) + nbackup_cpus) > bit_count(cpu_map);
}
#endif /* defined(__x86_64__) */

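/*
 * Decide whether this processor may pick up a realtime thread. On x86 this
 * enforces the avoid-cpu0 policy: with sched_avoid_cpu0 == 1, cpu0 runs
 * realtime only once the other primaries are saturated; with
 * sched_avoid_cpu0 == 2, cpus 0 and 1 are both avoided until the remaining
 * CPUs are saturated. Secondary (SMT) processors additionally require
 * sched_allow_rt_smt.
 */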
static bool
sched_ok_to_run_realtime_thread(processor_set_t pset, processor_t processor, bool as_backup)
{
	if (!processor->is_recommended) {
		return false;
	}
	bool ok_to_run_realtime_thread = true;
#if defined(__x86_64__)
	bool spill_pending = bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id);
	if (spill_pending) {
		return true;
	}
	if (processor->cpu_id == 0) {
		if (sched_avoid_cpu0 == 1) {
			ok_to_run_realtime_thread = these_processors_are_running_realtime_threads(pset, pset->primary_map & ~0x1, as_backup);
		} else if (sched_avoid_cpu0 == 2) {
			ok_to_run_realtime_thread = these_processors_are_running_realtime_threads(pset, ~0x3, as_backup);
		}
	} else if (sched_avoid_cpu0 && (processor->cpu_id == 1) && processor->is_SMT) {
		ok_to_run_realtime_thread = sched_allow_rt_smt && these_processors_are_running_realtime_threads(pset, ~0x2, as_backup);
	} else if (processor->processor_primary != processor) {
		ok_to_run_realtime_thread = (sched_allow_rt_smt && all_available_primaries_are_running_realtime_threads(pset, as_backup));
	}
#else
	(void)pset;
	(void)processor;
	(void)as_backup;
#endif /* defined(__x86_64__) */
	return ok_to_run_realtime_thread;
}

void
sched_pset_made_schedulable(__unused processor_t processor, processor_set_t pset, boolean_t drop_lock)
{
	if (drop_lock) {
		pset_unlock(pset);
	}
}

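/*
 * Mark the current thread no-SMT. Note the flag is one-way here: passing
 * false does not clear an existing TH_SFLAG_NO_SMT.
 */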
void
thread_set_no_smt(bool set)
{
	if (!system_is_SMT) {
		/* Not a machine that supports SMT */
		return;
	}

	thread_t thread = current_thread();

	spl_t s = splsched();
	thread_lock(thread);
	if (set) {
		thread->sched_flags |= TH_SFLAG_NO_SMT;
	}
	thread_unlock(thread);
	splx(s);
}

bool
thread_get_no_smt(void)
{
	return current_thread()->sched_flags & TH_SFLAG_NO_SMT;
}

extern void task_set_no_smt(task_t);
void
task_set_no_smt(task_t task)
{
	if (!system_is_SMT) {
		/* Not a machine that supports SMT */
		return;
	}

	if (task == TASK_NULL) {
		task = current_task();
	}

	task_lock(task);
	task->t_flags |= TF_NO_SMT;
	task_unlock(task);
}

#if DEBUG || DEVELOPMENT
extern void sysctl_task_set_no_smt(char no_smt);
void
sysctl_task_set_no_smt(char no_smt)
{
	if (!system_is_SMT) {
		/* Not a machine that supports SMT */
		return;
	}

	task_t task = current_task();

	task_lock(task);
	if (no_smt == '1') {
		task->t_flags |= TF_NO_SMT;
	}
	task_unlock(task);
}

extern char sysctl_task_get_no_smt(void);
char
sysctl_task_get_no_smt(void)
{
	task_t task = current_task();

	if (task->t_flags & TF_NO_SMT) {
		return '1';
	}
	return '0';
}
#endif /* DEBUG || DEVELOPMENT */


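/*
 * Bind (or soft-bind) a thread to the first cluster of the requested type:
 * 'e'/'E' selects the E-cluster, 'p'/'P' the P-cluster; any other value just
 * clears the existing binding. A soft binding sets TH_SFLAG_BOUND_SOFT,
 * which allows the binding to be superseded later (e.g. by a CPU bind, see
 * sysctl_thread_bind_cpuid below).
 */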
__private_extern__ void
thread_bind_cluster_type(thread_t thread, char cluster_type, bool soft_bound)
{
#if __AMP__
	spl_t s = splsched();
	thread_lock(thread);
	thread->sched_flags &= ~(TH_SFLAG_BOUND_SOFT);
	thread->th_bound_cluster_id = THREAD_BOUND_CLUSTER_NONE;
	if (soft_bound) {
		thread->sched_flags |= TH_SFLAG_BOUND_SOFT;
	}
	switch (cluster_type) {
	case 'e':
	case 'E':
		if (pset0.pset_cluster_type == PSET_AMP_E) {
			thread->th_bound_cluster_id = pset0.pset_id;
		} else if (pset_node1.psets != PROCESSOR_SET_NULL) {
			thread->th_bound_cluster_id = pset_node1.psets->pset_id;
		}
		break;
	case 'p':
	case 'P':
		if (pset0.pset_cluster_type == PSET_AMP_P) {
			thread->th_bound_cluster_id = pset0.pset_id;
		} else if (pset_node1.psets != PROCESSOR_SET_NULL) {
			thread->th_bound_cluster_id = pset_node1.psets->pset_id;
		}
		break;
	default:
		break;
	}
	thread_unlock(thread);
	splx(s);

	if (thread == current_thread()) {
		thread_block(THREAD_CONTINUE_NULL);
	}
#else /* __AMP__ */
	(void)thread;
	(void)cluster_type;
	(void)soft_bound;
#endif /* __AMP__ */
}

extern uint32_t thread_bound_cluster_id(thread_t thread);
uint32_t
thread_bound_cluster_id(thread_t thread)
{
	return thread->th_bound_cluster_id;
}

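/*
 * Bind a thread to a specific cluster id. THREAD_BIND_SOFT requests a soft
 * binding, THREAD_BIND_ELIGIBLE_ONLY fails with KERN_INVALID_POLICY if the
 * thread is not eligible for the target pset, and THREAD_UNBIND clears any
 * existing binding. Binding the current thread forces a reschedule via
 * thread_block().
 */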
__private_extern__ kern_return_t
thread_bind_cluster_id(thread_t thread, uint32_t cluster_id, thread_bind_option_t options)
{
#if __AMP__

	processor_set_t pset = NULL;
	if (options & (THREAD_BIND_SOFT | THREAD_BIND_ELIGIBLE_ONLY)) {
		/* Validate the inputs for the bind case */
		int max_clusters = ml_get_cluster_count();
		if (cluster_id >= max_clusters) {
			/* Invalid cluster id */
			return KERN_INVALID_ARGUMENT;
		}
		pset = pset_array[cluster_id];
		if (pset == NULL) {
			/* Cluster has not been initialized yet */
			return KERN_INVALID_ARGUMENT;
		}
		if (options & THREAD_BIND_ELIGIBLE_ONLY) {
			if (SCHED(thread_eligible_for_pset(thread, pset)) == false) {
				/* Thread is not recommended for the cluster type */
				return KERN_INVALID_POLICY;
			}
		}
	}

	if (options & THREAD_UNBIND) {
		/* If the thread was actually not bound to some cluster, nothing to do here */
		if (thread_bound_cluster_id(thread) == THREAD_BOUND_CLUSTER_NONE) {
			return KERN_SUCCESS;
		}
	}

	spl_t s = splsched();
	thread_lock(thread);

	/* Unbind the thread from its previous bound state */
	thread->sched_flags &= ~(TH_SFLAG_BOUND_SOFT);
	thread->th_bound_cluster_id = THREAD_BOUND_CLUSTER_NONE;

	if (options & THREAD_UNBIND) {
		/* Nothing more to do here */
		goto thread_bind_cluster_complete;
	}

	if (options & THREAD_BIND_SOFT) {
		thread->sched_flags |= TH_SFLAG_BOUND_SOFT;
	}
	thread->th_bound_cluster_id = cluster_id;

thread_bind_cluster_complete:
	thread_unlock(thread);
	splx(s);

	if (thread == current_thread()) {
		thread_block(THREAD_CONTINUE_NULL);
	}
#else /* __AMP__ */
	(void)thread;
	(void)cluster_id;
	(void)options;
#endif /* __AMP__ */
	return KERN_SUCCESS;
}

#if DEVELOPMENT || DEBUG
extern int32_t sysctl_get_bound_cpuid(void);
int32_t
sysctl_get_bound_cpuid(void)
{
	int32_t cpuid = -1;
	thread_t self = current_thread();

	processor_t processor = self->bound_processor;
	if (processor == NULL) {
		cpuid = -1;
	} else {
		cpuid = processor->cpu_id;
	}

	return cpuid;
}

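/*
 * Bind the current thread to the given cpuid, or unbind it when cpuid == -1.
 * On AMP systems a thread that is hard-bound to a cluster cannot also be
 * bound to a specific CPU.
 */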
extern kern_return_t sysctl_thread_bind_cpuid(int32_t cpuid);
kern_return_t
sysctl_thread_bind_cpuid(int32_t cpuid)
{
	processor_t processor = PROCESSOR_NULL;

	if (cpuid == -1) {
		goto unbind;
	}

	if (cpuid < 0 || cpuid >= MAX_SCHED_CPUS) {
		return KERN_INVALID_VALUE;
	}

	processor = processor_array[cpuid];
	if (processor == PROCESSOR_NULL) {
		return KERN_INVALID_VALUE;
	}

#if __AMP__

	thread_t thread = current_thread();

	if (thread->th_bound_cluster_id != THREAD_BOUND_CLUSTER_NONE) {
		if ((thread->sched_flags & TH_SFLAG_BOUND_SOFT) == 0) {
			/* Cannot hard-bind an already hard-cluster-bound thread */
			return KERN_NOT_SUPPORTED;
		}
	}

#endif /* __AMP__ */

unbind:
	thread_bind(processor);

	thread_block(THREAD_CONTINUE_NULL);
	return KERN_SUCCESS;
}

extern char sysctl_get_task_cluster_type(void);
char
sysctl_get_task_cluster_type(void)
{
	task_t task = current_task();
	processor_set_t pset_hint = task->pset_hint;

	if (!pset_hint) {
		return '0';
	}

#if __AMP__
	if (pset_hint->pset_cluster_type == PSET_AMP_E) {
		return 'E';
	} else if (pset_hint->pset_cluster_type == PSET_AMP_P) {
		return 'P';
	}
#endif /* __AMP__ */

	return '0';
}

#if __AMP__
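/*
 * Find a pset of the requested cluster type, preferring one that currently
 * has recommended processors; fall back to the last pset found on a matching
 * node, or PROCESSOR_SET_NULL if no node matches.
 */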
static processor_set_t
find_pset_of_type(pset_cluster_type_t t)
{
	for (pset_node_t node = &pset_node0; node != NULL; node = node->node_list) {
		if (node->pset_cluster_type != t) {
			continue;
		}

		processor_set_t pset = PROCESSOR_SET_NULL;
		for (int pset_id = lsb_first(node->pset_map); pset_id >= 0; pset_id = lsb_next(node->pset_map, pset_id)) {
			pset = pset_array[pset_id];
			/* Prefer one with recommended processors */
			if (pset->recommended_bitmask != 0) {
				assert(pset->pset_cluster_type == t);
				return pset;
			}
		}
		/* Otherwise return whatever was found last */
		return pset;
	}

	return PROCESSOR_SET_NULL;
}
#endif /* __AMP__ */

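/*
 * Point the current task's pset hint at a cluster of the requested type and
 * set TF_USE_PSET_HINT_CLUSTER_TYPE; the thread_block() presumably lets the
 * scheduler re-evaluate placement of the current thread right away.
 */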
extern void sysctl_task_set_cluster_type(char cluster_type);
void
sysctl_task_set_cluster_type(char cluster_type)
{
	task_t task = current_task();
	processor_set_t pset_hint = PROCESSOR_SET_NULL;

#if __AMP__
	switch (cluster_type) {
	case 'e':
	case 'E':
		pset_hint = find_pset_of_type(PSET_AMP_E);
		break;
	case 'p':
	case 'P':
		pset_hint = find_pset_of_type(PSET_AMP_P);
		break;
	default:
		break;
	}

	if (pset_hint) {
		task_lock(task);
		task->t_flags |= TF_USE_PSET_HINT_CLUSTER_TYPE;
		task->pset_hint = pset_hint;
		task_unlock(task);

		thread_block(THREAD_CONTINUE_NULL);
	}
#else /* __AMP__ */
	(void)cluster_type;
	(void)task;
	(void)pset_hint;
#endif /* __AMP__ */
}

#endif /* DEVELOPMENT || DEBUG */